diff --git a/OpenCL/inc_hash_sm3.cl b/OpenCL/inc_hash_sm3.cl new file mode 100644 index 000000000..b7950352d --- /dev/null +++ b/OpenCL/inc_hash_sm3.cl @@ -0,0 +1,1828 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#include "inc_vendor.h" +#include "inc_types.h" +#include "inc_platform.h" +#include "inc_common.h" +#include "inc_hash_sm3.h" + +#define LOG_BUF_16(msg) \ + printf("%s : %.08x %.08x %.08x %.08x %.08x %.08x %.08x %.08x" \ + " %.08x %.08x %.08x %.08x %.08x %.08x %.08x %.08x\n", \ + msg, \ + w0[0], w0[1], w0[2], w0[3], \ + w1[0], w1[1], w1[2], w1[3], \ + w2[0], w2[1], w2[2], w2[3], \ + w3[0], w3[1], w3[2], w3[3] \ + ) + +#define LOG_TMP_BUF_16(msg) \ + printf("%s : %.08x %.08x %.08x %.08x %.08x %.08x %.08x %.08x" \ + " %.08x %.08x %.08x %.08x %.08x %.08x %.08x %.08x\n", \ + msg, \ + w0_t, w1_t, w2_t, w3_t, w4_t, w5_t, w6_t, w7_t, \ + w8_t, w9_t, wa_t, wb_t, wc_t, wd_t, we_t, wf_t \ + ) +/* + " %.08x %.08x %.08x %.08x %.08x %.08x %.08x %.08x" \ + " %.08x %.08x %.08x %.08x %.08x %.08x %.08x %.08x\n" \ + w0_t, w1_t, w2_t, w3_t, w4_t, w5_t, w6_t, w7_t, \ + w8_t, w9_t, wa_t, wb_t, wc_t, wd_t, we_t, wf_t, \ +*/ +#define LOG_LOOP(msg, i) \ + printf("%s (%d) :" \ + " Intermediate digest values :" \ + " %.08x %.08x %.08x %.08x %.08x %.08x %.08x %.08x\n", \ + msg, i, \ + a, b, c, d, e, f, g, h \ + ) + +#define LOG_CTX_BUF_16(msg) \ + printf("%s : %.08x %.08x %.08x %.08x %.08x %.08x %.08x %.08x" \ + " %.08x %.08x %.08x %.08x %.08x %.08x %.08x %.08x\n", \ + msg, \ + ctx->w0[0], ctx->w0[1], ctx->w0[2], ctx->w0[3], \ + ctx->w1[0], ctx->w1[1], ctx->w1[2], ctx->w1[3], \ + ctx->w2[0], ctx->w2[1], ctx->w2[2], ctx->w2[3], \ + ctx->w3[0], ctx->w3[1], ctx->w3[2], ctx->w3[3] \ + ) + +#define LOG_DIGEST(msg) \ + printf("%s : %.08x %.08x %.08x %.08x %.08x %.08x %.08x %.08x\n", \ + msg, digest[0], digest[1], digest[2], digest[3], \ + digest[4], digest[5], digest[6], digest[7] \ + ) + +#define LOG_CTX_DIGEST(msg) \ + printf("%s : %.08x %.08x %.08x 
%.08x %.08x %.08x %.08x %.08x\n", \ + msg, ctx->h[0], ctx->h[1], ctx->h[2], ctx->h[3], \ + ctx->h[4], ctx->h[5], ctx->h[6], ctx->h[7] \ + ) + +// important notes on this: +// input buf unused bytes needs to be set to zero +// input buf needs to be in algorithm native byte order (md5 = LE, sm3 = BE, etc) +// input buf needs to be 64 byte aligned when using sm3_update() + +DECLSPEC void sm3_transform (PRIVATE_AS const u32 *w0, PRIVATE_AS const u32 *w1, PRIVATE_AS const u32 *w2, PRIVATE_AS const u32 *w3, PRIVATE_AS u32 *digest) +{ + // printf("sm3_transform\n"); + u32 a = digest[0]; + u32 b = digest[1]; + u32 c = digest[2]; + u32 d = digest[3]; + u32 e = digest[4]; + u32 f = digest[5]; + u32 g = digest[6]; + u32 h = digest[7]; + + // LOG_DIGEST("Digest buffer before transform"); + + u32 w0_t = w0[0]; + u32 w1_t = w0[1]; + u32 w2_t = w0[2]; + u32 w3_t = w0[3]; + u32 w4_t = w1[0]; + u32 w5_t = w1[1]; + u32 w6_t = w1[2]; + u32 w7_t = w1[3]; + u32 w8_t = w2[0]; + u32 w9_t = w2[1]; + u32 wa_t = w2[2]; + u32 wb_t = w2[3]; + u32 wc_t = w3[0]; + u32 wd_t = w3[1]; + u32 we_t = w3[2]; + u32 wf_t = w3[3]; + + // LOG_TMP_BUF_16("Buffer before rounds"); + + int i = 0; + // printf("Main loop execution :\n\n"); + // SM3 main loop, the Compression Function (CF) and Message Expansion (ME) are executed + // step-by-step. SM3_R1_S use SM3_FF0 and SM3_GG0 functions for index 0 to 15 and SM3_R2_S + // use SM3_FF1 and SM3_GG1 functions for index 16 to 63. 
+ SM3_R1_S(a, b, c, d, e, f, g, h, SM3_T00, w0_t, w0_t ^ w4_t); + w0_t = SM3_EXPAND_S(w0_t, w7_t, wd_t, w3_t, wa_t); // printf("w0_t : %.08x\n", w0_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(d, a, b, c, h, e, f, g, SM3_T01, w1_t, w1_t ^ w5_t); + w1_t = SM3_EXPAND_S(w1_t, w8_t, we_t, w4_t, wb_t); // printf("w1_t : %.08x\n", w1_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(c, d, a, b, g, h, e, f, SM3_T02, w2_t, w2_t ^ w6_t); + w2_t = SM3_EXPAND_S(w2_t, w9_t, wf_t, w5_t, wc_t);// printf("w2_t : %.08x\n", w2_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(b, c, d, a, f, g, h, e, SM3_T03, w3_t, w3_t ^ w7_t); + w3_t = SM3_EXPAND_S(w3_t, wa_t, w0_t, w6_t, wd_t);// printf("w3_t : %.08x\n", w3_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(a, b, c, d, e, f, g, h, SM3_T04, w4_t, w4_t ^ w8_t); + w4_t = SM3_EXPAND_S(w4_t, wb_t, w1_t, w7_t, we_t);// printf("w4_t : %.08x\n", w4_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(d, a, b, c, h, e, f, g, SM3_T05, w5_t, w5_t ^ w9_t); + w5_t = SM3_EXPAND_S(w5_t, wc_t, w2_t, w8_t, wf_t);// printf("w5_t : %.08x\n", w5_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(c, d, a, b, g, h, e, f, SM3_T06, w6_t, w6_t ^ wa_t); + w6_t = SM3_EXPAND_S(w6_t, wd_t, w3_t, w9_t, w0_t);// printf("w6_t : %.08x\n", w6_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(b, c, d, a, f, g, h, e, SM3_T07, w7_t, w7_t ^ wb_t); + w7_t = SM3_EXPAND_S(w7_t, we_t, w4_t, wa_t, w1_t);// printf("w7_t : %.08x\n", w7_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(a, b, c, d, e, f, g, h, SM3_T08, w8_t, w8_t ^ wc_t); + w8_t = SM3_EXPAND_S(w8_t, wf_t, w5_t, wb_t, w2_t);// printf("w8_t : %.08x\n", w8_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(d, a, b, c, h, e, f, g, SM3_T09, w9_t, w9_t ^ wd_t); + w9_t = SM3_EXPAND_S(w9_t, w0_t, w6_t, wc_t, w3_t);// printf("w9_t : %.08x\n", w9_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(c, d, a, b, g, h, e, f, SM3_T10, wa_t, wa_t ^ we_t); + wa_t = SM3_EXPAND_S(wa_t, w1_t, w7_t, wd_t, w4_t);// printf("wa_t : 
%.08x\n", wa_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(b, c, d, a, f, g, h, e, SM3_T11, wb_t, wb_t ^ wf_t); + wb_t = SM3_EXPAND_S(wb_t, w2_t, w8_t, we_t, w5_t);// printf("wb_t : %.08x\n", wb_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(a, b, c, d, e, f, g, h, SM3_T12, wc_t, wc_t ^ w0_t); + wc_t = SM3_EXPAND_S(wc_t, w3_t, w9_t, wf_t, w6_t);// printf("wc_t : %.08x\n", wc_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(d, a, b, c, h, e, f, g, SM3_T13, wd_t, wd_t ^ w1_t); + wd_t = SM3_EXPAND_S(wd_t, w4_t, wa_t, w0_t, w7_t);// printf("wd_t : %.08x\n", wd_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(c, d, a, b, g, h, e, f, SM3_T14, we_t, we_t ^ w2_t); + we_t = SM3_EXPAND_S(we_t, w5_t, wb_t, w1_t, w8_t);// printf("we_t : %.08x\n", we_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R1_S(b, c, d, a, f, g, h, e, SM3_T15, wf_t, wf_t ^ w3_t); + wf_t = SM3_EXPAND_S(wf_t, w6_t, wc_t, w2_t, w9_t);// printf("wf_t : %.08x\n", wf_t); + //LOG_LOOP("Main loop", i); i++; + // Index = 16, switch to SM3_R2_S + SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T16, w0_t, w0_t ^ w4_t); + w0_t = SM3_EXPAND_S(w0_t, w7_t, wd_t, w3_t, wa_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T17, w1_t, w1_t ^ w5_t); + w1_t = SM3_EXPAND_S(w1_t, w8_t, we_t, w4_t, wb_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T18, w2_t, w2_t ^ w6_t); + w2_t = SM3_EXPAND_S(w2_t, w9_t, wf_t, w5_t, wc_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T19, w3_t, w3_t ^ w7_t); + w3_t = SM3_EXPAND_S(w3_t, wa_t, w0_t, w6_t, wd_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T20, w4_t, w4_t ^ w8_t); + w4_t = SM3_EXPAND_S(w4_t, wb_t, w1_t, w7_t, we_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T21, w5_t, w5_t ^ w9_t); + w5_t = SM3_EXPAND_S(w5_t, wc_t, w2_t, w8_t, wf_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T22, w6_t, w6_t ^ wa_t); + 
w6_t = SM3_EXPAND_S(w6_t, wd_t, w3_t, w9_t, w0_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T23, w7_t, w7_t ^ wb_t); + w7_t = SM3_EXPAND_S(w7_t, we_t, w4_t, wa_t, w1_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T24, w8_t, w8_t ^ wc_t); + w8_t = SM3_EXPAND_S(w8_t, wf_t, w5_t, wb_t, w2_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T25, w9_t, w9_t ^ wd_t); + w9_t = SM3_EXPAND_S(w9_t, w0_t, w6_t, wc_t, w3_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T26, wa_t, wa_t ^ we_t); + wa_t = SM3_EXPAND_S(wa_t, w1_t, w7_t, wd_t, w4_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T27, wb_t, wb_t ^ wf_t); + wb_t = SM3_EXPAND_S(wb_t, w2_t, w8_t, we_t, w5_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T28, wc_t, wc_t ^ w0_t); + wc_t = SM3_EXPAND_S(wc_t, w3_t, w9_t, wf_t, w6_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T29, wd_t, wd_t ^ w1_t); + wd_t = SM3_EXPAND_S(wd_t, w4_t, wa_t, w0_t, w7_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T30, we_t, we_t ^ w2_t); + we_t = SM3_EXPAND_S(we_t, w5_t, wb_t, w1_t, w8_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T31, wf_t, wf_t ^ w3_t); + wf_t = SM3_EXPAND_S(wf_t, w6_t, wc_t, w2_t, w9_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T32, w0_t, w0_t ^ w4_t); + w0_t = SM3_EXPAND_S(w0_t, w7_t, wd_t, w3_t, wa_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T33, w1_t, w1_t ^ w5_t); + w1_t = SM3_EXPAND_S(w1_t, w8_t, we_t, w4_t, wb_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T34, w2_t, w2_t ^ w6_t); + w2_t = SM3_EXPAND_S(w2_t, w9_t, wf_t, w5_t, wc_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T35, w3_t, w3_t ^ w7_t); + w3_t = 
SM3_EXPAND_S(w3_t, wa_t, w0_t, w6_t, wd_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T36, w4_t, w4_t ^ w8_t); + w4_t = SM3_EXPAND_S(w4_t, wb_t, w1_t, w7_t, we_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T37, w5_t, w5_t ^ w9_t); + w5_t = SM3_EXPAND_S(w5_t, wc_t, w2_t, w8_t, wf_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T38, w6_t, w6_t ^ wa_t); + w6_t = SM3_EXPAND_S(w6_t, wd_t, w3_t, w9_t, w0_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T39, w7_t, w7_t ^ wb_t); + w7_t = SM3_EXPAND_S(w7_t, we_t, w4_t, wa_t, w1_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T40, w8_t, w8_t ^ wc_t); + w8_t = SM3_EXPAND_S(w8_t, wf_t, w5_t, wb_t, w2_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T41, w9_t, w9_t ^ wd_t); + w9_t = SM3_EXPAND_S(w9_t, w0_t, w6_t, wc_t, w3_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T42, wa_t, wa_t ^ we_t); + wa_t = SM3_EXPAND_S(wa_t, w1_t, w7_t, wd_t, w4_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T43, wb_t, wb_t ^ wf_t); + wb_t = SM3_EXPAND_S(wb_t, w2_t, w8_t, we_t, w5_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T44, wc_t, wc_t ^ w0_t); + wc_t = SM3_EXPAND_S(wc_t, w3_t, w9_t, wf_t, w6_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T45, wd_t, wd_t ^ w1_t); + wd_t = SM3_EXPAND_S(wd_t, w4_t, wa_t, w0_t, w7_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T46, we_t, we_t ^ w2_t); + we_t = SM3_EXPAND_S(we_t, w5_t, wb_t, w1_t, w8_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T47, wf_t, wf_t ^ w3_t); + wf_t = SM3_EXPAND_S(wf_t, w6_t, wc_t, w2_t, w9_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T48, w0_t, w0_t ^ w4_t); + w0_t = SM3_EXPAND_S(w0_t, 
w7_t, wd_t, w3_t, wa_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T49, w1_t, w1_t ^ w5_t); + w1_t = SM3_EXPAND_S(w1_t, w8_t, we_t, w4_t, wb_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T50, w2_t, w2_t ^ w6_t); + w2_t = SM3_EXPAND_S(w2_t, w9_t, wf_t, w5_t, wc_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T51, w3_t, w3_t ^ w7_t); + w3_t = SM3_EXPAND_S(w3_t, wa_t, w0_t, w6_t, wd_t); + //LOG_LOOP("Main loop", i); i++; + // No more ME for index 52 to 63. + SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T52, w4_t, w4_t ^ w8_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T53, w5_t, w5_t ^ w9_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T54, w6_t, w6_t ^ wa_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T55, w7_t, w7_t ^ wb_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T56, w8_t, w8_t ^ wc_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T57, w9_t, w9_t ^ wd_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T58, wa_t, wa_t ^ we_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T59, wb_t, wb_t ^ wf_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T60, wc_t, wc_t ^ w0_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T61, wd_t, wd_t ^ w1_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T62, we_t, we_t ^ w2_t); + //LOG_LOOP("Main loop", i); i++; + SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T63, wf_t, wf_t ^ w3_t); + //LOG_LOOP("Main loop", i); i++; + + // LOG_BUF_16("\nBuffer after rounds of SM3"); + + digest[0] ^= a; + digest[1] ^= b; + digest[2] ^= c; + digest[3] ^= d; + digest[4] ^= e; + digest[5] ^= f; + digest[6] ^= g; + digest[7] ^= h; + + // LOG_DIGEST("Digest buffer after transform"); +} + 
/**
 * Resets a scalar SM3 context: loads the eight chaining words with the
 * SM3_IV_A..SM3_IV_H constants, clears the 64-byte block buffer (w0..w3)
 * and sets the processed byte count to zero.
 */
DECLSPEC void sm3_init (PRIVATE_AS sm3_ctx_t *ctx)
{
  ctx->h[0] = SM3_IV_A;
  ctx->h[1] = SM3_IV_B;
  ctx->h[2] = SM3_IV_C;
  ctx->h[3] = SM3_IV_D;
  ctx->h[4] = SM3_IV_E;
  ctx->h[5] = SM3_IV_F;
  ctx->h[6] = SM3_IV_G;
  ctx->h[7] = SM3_IV_H;

  ctx->w0[0] = 0;
  ctx->w0[1] = 0;
  ctx->w0[2] = 0;
  ctx->w0[3] = 0;
  ctx->w1[0] = 0;
  ctx->w1[1] = 0;
  ctx->w1[2] = 0;
  ctx->w1[3] = 0;
  ctx->w2[0] = 0;
  ctx->w2[1] = 0;
  ctx->w2[2] = 0;
  ctx->w2[3] = 0;
  ctx->w3[0] = 0;
  ctx->w3[1] = 0;
  ctx->w3[2] = 0;
  ctx->w3[3] = 0;

  ctx->len = 0;
}

/**
 * Appends up to 64 bytes, supplied as sixteen words in w0..w3, to the context.
 *
 * ctx      context to update; ctx->len grows by len.
 * w0..w3   input words. They are passed to the switch_buffer_* helpers as
 *          non-const pointers when the context buffer is not empty, so the
 *          caller should treat their contents as clobbered afterwards.
 * len      number of valid input bytes; 0 is a no-op. Because the input is
 *          merged with |=, bytes beyond len are expected to be zero.
 *
 * A compression is run whenever the 64-byte context buffer becomes full.
 */
DECLSPEC void sm3_update_64 (PRIVATE_AS sm3_ctx_t *ctx, PRIVATE_AS u32 *w0, PRIVATE_AS u32 *w1, PRIVATE_AS u32 *w2, PRIVATE_AS u32 *w3, const int len)
{
  if (len == 0) return;

  // fill level of the context buffer before this call
  const int pos = ctx->len & 63;

  ctx->len += len;

  if (pos == 0)
  {
    // buffer is empty: the input can be copied over without shifting

    ctx->w0[0] = w0[0];
    ctx->w0[1] = w0[1];
    ctx->w0[2] = w0[2];
    ctx->w0[3] = w0[3];
    ctx->w1[0] = w1[0];
    ctx->w1[1] = w1[1];
    ctx->w1[2] = w1[2];
    ctx->w1[3] = w1[3];
    ctx->w2[0] = w2[0];
    ctx->w2[1] = w2[1];
    ctx->w2[2] = w2[2];
    ctx->w2[3] = w2[3];
    ctx->w3[0] = w3[0];
    ctx->w3[1] = w3[1];
    ctx->w3[2] = w3[2];
    ctx->w3[3] = w3[3];

    if (len == 64)
    {
      // a full block arrived: compress it and leave an empty, zeroed buffer

      sm3_transform (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h);

      ctx->w0[0] = 0;
      ctx->w0[1] = 0;
      ctx->w0[2] = 0;
      ctx->w0[3] = 0;
      ctx->w1[0] = 0;
      ctx->w1[1] = 0;
      ctx->w1[2] = 0;
      ctx->w1[3] = 0;
      ctx->w2[0] = 0;
      ctx->w2[1] = 0;
      ctx->w2[2] = 0;
      ctx->w2[3] = 0;
      ctx->w3[0] = 0;
      ctx->w3[1] = 0;
      ctx->w3[2] = 0;
      ctx->w3[3] = 0;
    }
  }
  else
  {
    if ((pos + len) < 64)
    {
      // input fits behind the buffered bytes: move it to offset pos and merge
      // NOTE(review): switch_buffer_by_offset_be_S is defined elsewhere;
      // presumably it shifts the big-endian word buffer right by pos bytes.

      switch_buffer_by_offset_be_S (w0, w1, w2, w3, pos);

      ctx->w0[0] |= w0[0];
      ctx->w0[1] |= w0[1];
      ctx->w0[2] |= w0[2];
      ctx->w0[3] |= w0[3];
      ctx->w1[0] |= w1[0];
      ctx->w1[1] |= w1[1];
      ctx->w1[2] |= w1[2];
      ctx->w1[3] |= w1[3];
      ctx->w2[0] |= w2[0];
      ctx->w2[1] |= w2[1];
      ctx->w2[2] |= w2[2];
      ctx->w2[3] |= w2[3];
      ctx->w3[0] |= w3[0];
      ctx->w3[1] |= w3[1];
      ctx->w3[2] |= w3[2];
      ctx->w3[3] |= w3[3];
    }
    else
    {
      // input completes the block: the part that does not fit is collected in
      // c0..c3 and becomes the content of the next buffer
      // NOTE(review): behaviour of the carry helper is assumed from its use here.

      u32 c0[4] = { 0 };
      u32 c1[4] = { 0 };
      u32 c2[4] = { 0 };
      u32 c3[4] = { 0 };

      switch_buffer_by_offset_carry_be_S (w0, w1, w2, w3, c0, c1, c2, c3, pos);

      ctx->w0[0] |= w0[0];
      ctx->w0[1] |= w0[1];
      ctx->w0[2] |= w0[2];
      ctx->w0[3] |= w0[3];
      ctx->w1[0] |= w1[0];
      ctx->w1[1] |= w1[1];
      ctx->w1[2] |= w1[2];
      ctx->w1[3] |= w1[3];
      ctx->w2[0] |= w2[0];
      ctx->w2[1] |= w2[1];
      ctx->w2[2] |= w2[2];
      ctx->w2[3] |= w2[3];
      ctx->w3[0] |= w3[0];
      ctx->w3[1] |= w3[1];
      ctx->w3[2] |= w3[2];
      ctx->w3[3] |= w3[3];

      sm3_transform (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h);

      ctx->w0[0] = c0[0];
      ctx->w0[1] = c0[1];
      ctx->w0[2] = c0[2];
      ctx->w0[3] = c0[3];
      ctx->w1[0] = c1[0];
      ctx->w1[1] = c1[1];
      ctx->w1[2] = c1[2];
      ctx->w1[3] = c1[3];
      ctx->w2[0] = c2[0];
      ctx->w2[1] = c2[1];
      ctx->w2[2] = c2[2];
      ctx->w2[3] = c2[3];
      ctx->w3[0] = c3[0];
      ctx->w3[1] = c3[1];
      ctx->w3[2] = c3[2];
      ctx->w3[3] = c3[3];
    }
  }
}

/**
 * Feeds len bytes from the private word buffer w into the context, 64 bytes
 * at a time, without changing the byte order of the words.
 *
 * The block after the loop always loads sixteen words starting at the current
 * position, whatever the remaining length is, so w must be readable (and
 * zero-filled) up to the next 64-byte boundary.
 */
DECLSPEC void sm3_update (PRIVATE_AS sm3_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len)
{
  u32 w0[4];
  u32 w1[4];
  u32 w2[4];
  u32 w3[4];

  int pos1; // byte offset into the input
  int pos4; // matching word offset (pos1 / 4)

  // all complete blocks except the last one; "< len - 64" keeps between 1 and
  // 64 bytes (or 0 if len is 0) for the final call below
  for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16)
  {
    w0[0] = w[pos4 + 0];
    w0[1] = w[pos4 + 1];
    w0[2] = w[pos4 + 2];
    w0[3] = w[pos4 + 3];
    w1[0] = w[pos4 + 4];
    w1[1] = w[pos4 + 5];
    w1[2] = w[pos4 + 6];
    w1[3] = w[pos4 + 7];
    w2[0] = w[pos4 + 8];
    w2[1] = w[pos4 + 9];
    w2[2] = w[pos4 + 10];
    w2[3] = w[pos4 + 11];
    w3[0] = w[pos4 + 12];
    w3[1] = w[pos4 + 13];
    w3[2] = w[pos4 + 14];
    w3[3] = w[pos4 + 15];

    sm3_update_64 (ctx, w0, w1, w2, w3, 64);
  }

  // final (possibly partial) block
  w0[0] = w[pos4 + 0];
  w0[1] = w[pos4 + 1];
  w0[2] = w[pos4 + 2];
  w0[3] = w[pos4 + 3];
  w1[0] = w[pos4 + 4];
  w1[1] = w[pos4 + 5];
  w1[2] = w[pos4 + 6];
  w1[3] = w[pos4 + 7];
  w2[0] = w[pos4 + 8];
  w2[1] = w[pos4 + 9];
  w2[2] = w[pos4 + 10];
  w2[3] = w[pos4 + 11];
  w3[0] = w[pos4 + 12];
  w3[1] = w[pos4 + 13];
  w3[2] = w[pos4 + 14];
  w3[3] = w[pos4 + 15];

  sm3_update_64 (ctx, w0, w1, w2, w3, len - pos1);
}

/**
 * Same as sm3_update(), but every 32-bit word is passed through
 * hc_swap32_S() before it is handed to sm3_update_64(). Intended for input
 * whose words are not yet in the byte order SM3 works on.
 */
DECLSPEC void sm3_update_swap (PRIVATE_AS sm3_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len)
{
  u32 w0[4];
  u32 w1[4];
  u32 w2[4];
  u32 w3[4];

  int pos1; // byte offset into the input
  int pos4; // matching word offset (pos1 / 4)

  for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16)
  {
    w0[0] = w[pos4 + 0];
    w0[1] = w[pos4 + 1];
    w0[2] = w[pos4 + 2];
    w0[3] = w[pos4 + 3];
    w1[0] = w[pos4 + 4];
    w1[1] = w[pos4 + 5];
    w1[2] = w[pos4 + 6];
    w1[3] = w[pos4 + 7];
    w2[0] = w[pos4 + 8];
    w2[1] = w[pos4 + 9];
    w2[2] = w[pos4 + 10];
    w2[3] = w[pos4 + 11];
    w3[0] = w[pos4 + 12];
    w3[1] = w[pos4 + 13];
    w3[2] = w[pos4 + 14];
    w3[3] = w[pos4 + 15];

    w0[0] = hc_swap32_S (w0[0]);
    w0[1] = hc_swap32_S (w0[1]);
    w0[2] = hc_swap32_S (w0[2]);
    w0[3] = hc_swap32_S (w0[3]);
    w1[0] = hc_swap32_S (w1[0]);
    w1[1] = hc_swap32_S (w1[1]);
    w1[2] = hc_swap32_S (w1[2]);
    w1[3] = hc_swap32_S (w1[3]);
    w2[0] = hc_swap32_S (w2[0]);
    w2[1] = hc_swap32_S (w2[1]);
    w2[2] = hc_swap32_S (w2[2]);
    w2[3] = hc_swap32_S (w2[3]);
    w3[0] = hc_swap32_S (w3[0]);
    w3[1] = hc_swap32_S (w3[1]);
    w3[2] = hc_swap32_S (w3[2]);
    w3[3] = hc_swap32_S (w3[3]);

    sm3_update_64 (ctx, w0, w1, w2, w3, 64);
  }

  // final (possibly partial) block
  w0[0] = w[pos4 + 0];
  w0[1] = w[pos4 + 1];
  w0[2] = w[pos4 + 2];
  w0[3] = w[pos4 + 3];
  w1[0] = w[pos4 + 4];
  w1[1] = w[pos4 + 5];
  w1[2] = w[pos4 + 6];
  w1[3] = w[pos4 + 7];
  w2[0] = w[pos4 + 8];
  w2[1] = w[pos4 + 9];
  w2[2] = w[pos4 + 10];
  w2[3] = w[pos4 + 11];
  w3[0] = w[pos4 + 12];
  w3[1] = w[pos4 + 13];
  w3[2] = w[pos4 + 14];
  w3[3] = w[pos4 + 15];

  w0[0] = hc_swap32_S (w0[0]);
  w0[1] = hc_swap32_S (w0[1]);
  w0[2] = hc_swap32_S (w0[2]);
  w0[3] = hc_swap32_S (w0[3]);
  w1[0] = hc_swap32_S (w1[0]);
  w1[1] = hc_swap32_S (w1[1]);
  w1[2] = hc_swap32_S (w1[2]);
  w1[3] = hc_swap32_S (w1[3]);
  w2[0] = hc_swap32_S (w2[0]);
  w2[1] = hc_swap32_S (w2[1]);
  w2[2] = hc_swap32_S (w2[2]);
  w2[3] = hc_swap32_S (w2[3]);
  w3[0] = hc_swap32_S (w3[0]);
  w3[1] = hc_swap32_S (w3[1]);
  w3[2] = hc_swap32_S (w3[2]);
  w3[3] = hc_swap32_S (w3[3]);

  sm3_update_64 (ctx, w0, w1, w2, w3, len - pos1);
}

/**
 * Feeds the UTF-16LE form of len input bytes from the private buffer w into
 * the context, without swapping the resulting words.
 *
 * Two paths exist:
 *  - if hc_enc_scan() returns non-zero, the input is converted chunk by chunk
 *    with the hc_enc_* iterator (presumably a real UTF-8 to UTF-16LE
 *    conversion - confirm against inc_common). When hc_enc_next() returns -1,
 *    ctx->len is set to -1 and the function returns early.
 *  - otherwise 32 input bytes at a time are widened with make_utf16le_S(),
 *    which yields 64 bytes per step (hence the "* 2" on the lengths).
 */
DECLSPEC void sm3_update_utf16le (PRIVATE_AS sm3_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len)
{
  if (hc_enc_scan (w, len))
  {
    hc_enc_t hc_enc;

    hc_enc_init (&hc_enc);

    while (hc_enc_has_next (&hc_enc, len))
    {
      u32 enc_buf[16] = { 0 };

      const int enc_len = hc_enc_next (&hc_enc, w, len, 256, enc_buf, sizeof (enc_buf));

      if (enc_len == -1)
      {
        // conversion failed: flag the context as invalid through its length
        ctx->len = -1;

        return;
      }

      sm3_update_64 (ctx, enc_buf + 0, enc_buf + 4, enc_buf + 8, enc_buf + 12, enc_len);
    }

    return;
  }

  u32 w0[4];
  u32 w1[4];
  u32 w2[4];
  u32 w3[4];

  int pos1; // byte offset into the (narrow) input
  int pos4; // matching word offset (pos1 / 4)

  for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8)
  {
    w0[0] = w[pos4 + 0];
    w0[1] = w[pos4 + 1];
    w0[2] = w[pos4 + 2];
    w0[3] = w[pos4 + 3];
    w1[0] = w[pos4 + 4];
    w1[1] = w[pos4 + 5];
    w1[2] = w[pos4 + 6];
    w1[3] = w[pos4 + 7];

    // order matters: w1 has to be expanded into w2/w3 first, because the
    // second call writes its result into w0 and w1
    make_utf16le_S (w1, w2, w3);
    make_utf16le_S (w0, w0, w1);

    sm3_update_64 (ctx, w0, w1, w2, w3, 32 * 2);
  }

  // final (possibly partial) chunk of up to 32 input bytes
  w0[0] = w[pos4 + 0];
  w0[1] = w[pos4 + 1];
  w0[2] = w[pos4 + 2];
  w0[3] = w[pos4 + 3];
  w1[0] = w[pos4 + 4];
  w1[1] = w[pos4 + 5];
  w1[2] = w[pos4 + 6];
  w1[3] = w[pos4 + 7];

  make_utf16le_S (w1, w2, w3);
  make_utf16le_S (w0, w0, w1);

  sm3_update_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2);
}

/**
 * Same as sm3_update_utf16le(), but every produced 32-bit word is passed
 * through hc_swap32_S() after the UTF-16LE conversion and before it is fed to
 * sm3_update_64().
 */
DECLSPEC void sm3_update_utf16le_swap (PRIVATE_AS sm3_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len)
{
  if (hc_enc_scan (w, len))
  {
    hc_enc_t hc_enc;

    hc_enc_init (&hc_enc);

    while (hc_enc_has_next (&hc_enc, len))
    {
      u32 enc_buf[16] = { 0 };

      const int enc_len = hc_enc_next (&hc_enc, w, len, 256, enc_buf, sizeof (enc_buf));

      if (enc_len == -1)
      {
        // conversion failed: flag the context as invalid through its length
        ctx->len = -1;

        return;
      }

      enc_buf[ 0] = hc_swap32_S (enc_buf[ 0]);
      enc_buf[ 1] = hc_swap32_S (enc_buf[ 1]);
      enc_buf[ 2] = hc_swap32_S (enc_buf[ 2]);
      enc_buf[ 3] = hc_swap32_S (enc_buf[ 3]);
      enc_buf[ 4] = hc_swap32_S (enc_buf[ 4]);
      enc_buf[ 5] = hc_swap32_S (enc_buf[ 5]);
      enc_buf[ 6] = hc_swap32_S (enc_buf[ 6]);
      enc_buf[ 7] = hc_swap32_S (enc_buf[ 7]);
      enc_buf[ 8] = hc_swap32_S (enc_buf[ 8]);
      enc_buf[ 9] = hc_swap32_S (enc_buf[ 9]);
      enc_buf[10] = hc_swap32_S (enc_buf[10]);
      enc_buf[11] = hc_swap32_S (enc_buf[11]);
      enc_buf[12] = hc_swap32_S (enc_buf[12]);
      enc_buf[13] = hc_swap32_S (enc_buf[13]);
      enc_buf[14] = hc_swap32_S (enc_buf[14]);
      enc_buf[15] = hc_swap32_S (enc_buf[15]);

      sm3_update_64 (ctx, enc_buf + 0, enc_buf + 4, enc_buf + 8, enc_buf + 12, enc_len);
    }

    return;
  }

  u32 w0[4];
  u32 w1[4];
  u32 w2[4];
  u32 w3[4];

  int pos1; // byte offset into the (narrow) input
  int pos4; // matching word offset (pos1 / 4)

  for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8)
  {
    w0[0] = w[pos4 + 0];
    w0[1] = w[pos4 + 1];
    w0[2] = w[pos4 + 2];
    w0[3] = w[pos4 + 3];
    w1[0] = w[pos4 + 4];
    w1[1] = w[pos4 + 5];
    w1[2] = w[pos4 + 6];
    w1[3] = w[pos4 + 7];

    // order matters: w1 has to be expanded into w2/w3 first, because the
    // second call writes its result into w0 and w1
    make_utf16le_S (w1, w2, w3);
    make_utf16le_S (w0, w0, w1);

    w0[0] = hc_swap32_S (w0[0]);
    w0[1] = hc_swap32_S (w0[1]);
    w0[2] = hc_swap32_S (w0[2]);
    w0[3] = hc_swap32_S (w0[3]);
    w1[0] = hc_swap32_S (w1[0]);
    w1[1] = hc_swap32_S (w1[1]);
    w1[2] = hc_swap32_S (w1[2]);
    w1[3] = hc_swap32_S (w1[3]);
    w2[0] = hc_swap32_S (w2[0]);
    w2[1] = hc_swap32_S (w2[1]);
    w2[2] = hc_swap32_S (w2[2]);
    w2[3] = hc_swap32_S (w2[3]);
    w3[0] = hc_swap32_S (w3[0]);
    w3[1] = hc_swap32_S (w3[1]);
    w3[2] = hc_swap32_S (w3[2]);
    w3[3] = hc_swap32_S (w3[3]);

    sm3_update_64 (ctx, w0, w1, w2, w3, 32 * 2);
  }

  // final (possibly partial) chunk of up to 32 input bytes
  w0[0] = w[pos4 + 0];
  w0[1] = w[pos4 + 1];
  w0[2] = w[pos4 + 2];
  w0[3] = w[pos4 + 3];
  w1[0] = w[pos4 + 4];
  w1[1] = w[pos4 + 5];
  w1[2] = w[pos4 + 6];
  w1[3] = w[pos4 + 7];

  make_utf16le_S (w1, w2, w3);
  make_utf16le_S (w0, w0, w1);

  w0[0] = hc_swap32_S (w0[0]);
  w0[1] = hc_swap32_S (w0[1]);
  w0[2] = hc_swap32_S (w0[2]);
  w0[3] = hc_swap32_S (w0[3]);
  w1[0] = hc_swap32_S (w1[0]);
  w1[1] = hc_swap32_S (w1[1]);
  w1[2] = hc_swap32_S (w1[2]);
  w1[3] = hc_swap32_S (w1[3]);
  w2[0] = hc_swap32_S (w2[0]);
  w2[1] = hc_swap32_S (w2[1]);
  w2[2] = hc_swap32_S (w2[2]);
  w2[3] = hc_swap32_S (w2[3]);
  w3[0] = hc_swap32_S (w3[0]);
  w3[1] = hc_swap32_S (w3[1]);
  w3[2] = hc_swap32_S (w3[2]);
  w3[3] = hc_swap32_S (w3[3]);

  sm3_update_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2);
}

/**
 * Variant of sm3_update() whose input pointer is qualified GLOBAL_AS instead
 * of PRIVATE_AS. The words are copied into private arrays block by block and
 * fed to sm3_update_64() unchanged.
 */
DECLSPEC void sm3_update_global (PRIVATE_AS sm3_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len)
{
  u32 w0[4];
  u32 w1[4];
  u32 w2[4];
  u32 w3[4];

  int pos1; // byte offset into the input
  int pos4; // matching word offset (pos1 / 4)

  for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16)
  {
    w0[0] = w[pos4 + 0];
    w0[1] = w[pos4 + 1];
    w0[2] = w[pos4 + 2];
    w0[3] = w[pos4 + 3];
    w1[0] = w[pos4 + 4];
    w1[1] = w[pos4 + 5];
    w1[2] = w[pos4 + 6];
    w1[3] = w[pos4 + 7];
    w2[0] = w[pos4 + 8];
    w2[1] = w[pos4 + 9];
    w2[2] = w[pos4 + 10];
    w2[3] = w[pos4 + 11];
    w3[0] = w[pos4 + 12];
    w3[1] = w[pos4 + 13];
    w3[2] = w[pos4 + 14];
    w3[3] = w[pos4 + 15];

    sm3_update_64 (ctx, w0, w1, w2, w3, 64);
  }

  // final (possibly partial) block; always reads sixteen words
  w0[0] = w[pos4 + 0];
  w0[1] = w[pos4 + 1];
  w0[2] = w[pos4 + 2];
  w0[3] = w[pos4 + 3];
  w1[0] = w[pos4 + 4];
  w1[1] = w[pos4 + 5];
  w1[2] = w[pos4 + 6];
  w1[3] = w[pos4 + 7];
  w2[0] = w[pos4 + 8];
  w2[1] = w[pos4 + 9];
  w2[2] = w[pos4 + 10];
  w2[3] = w[pos4 + 11];
  w3[0] = w[pos4 + 12];
  w3[1] = w[pos4 + 13];
  w3[2] = w[pos4 + 14];
  w3[3] = w[pos4 + 15];

  sm3_update_64 (ctx, w0, w1, w2, w3, len - pos1);
}

/**
 * Variant of sm3_update_swap() whose input pointer is qualified GLOBAL_AS:
 * each word is copied into private arrays, passed through hc_swap32_S() and
 * then fed to sm3_update_64().
 */
DECLSPEC void sm3_update_global_swap (PRIVATE_AS sm3_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len)
{
  u32 w0[4];
  u32 w1[4];
  u32 w2[4];
  u32 w3[4];

  int pos1; // byte offset into the input
  int pos4; // matching word offset (pos1 / 4)

  for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16)
  {
    w0[0] = w[pos4 + 0];
    w0[1] = w[pos4 + 1];
    w0[2] = w[pos4 + 2];
    w0[3] = w[pos4 + 3];
    w1[0] = w[pos4 + 4];
    w1[1] = w[pos4 + 5];
    w1[2] = w[pos4 + 6];
    w1[3] = w[pos4 + 7];
    w2[0] = w[pos4 + 8];
    w2[1] = w[pos4 + 9];
    w2[2] = w[pos4 + 10];
    w2[3] = w[pos4 + 11];
    w3[0] = w[pos4 + 12];
    w3[1] = w[pos4 + 13];
    w3[2] = w[pos4 + 14];
    w3[3] = w[pos4 + 15];

    w0[0] = hc_swap32_S (w0[0]);
    w0[1] = hc_swap32_S (w0[1]);
    w0[2] = hc_swap32_S (w0[2]);
    w0[3] = hc_swap32_S (w0[3]);
    w1[0] = hc_swap32_S (w1[0]);
    w1[1] = hc_swap32_S (w1[1]);
    w1[2] = hc_swap32_S (w1[2]);
    w1[3] = hc_swap32_S (w1[3]);
    w2[0] = hc_swap32_S (w2[0]);
    w2[1] = hc_swap32_S (w2[1]);
    w2[2] = hc_swap32_S (w2[2]);
    w2[3] = hc_swap32_S (w2[3]);
    w3[0] = hc_swap32_S (w3[0]);
    w3[1] = hc_swap32_S (w3[1]);
    w3[2] = hc_swap32_S (w3[2]);
    w3[3] = hc_swap32_S (w3[3]);

    sm3_update_64 (ctx, w0, w1, w2, w3, 64);
  }

  // final (possibly partial) block; always reads sixteen words
  w0[0] = w[pos4 + 0];
  w0[1] = w[pos4 + 1];
  w0[2] = w[pos4 + 2];
  w0[3] = w[pos4 + 3];
  w1[0] = w[pos4 + 4];
  w1[1] = w[pos4 + 5];
  w1[2] = w[pos4 + 6];
  w1[3] = w[pos4 + 7];
  w2[0] = w[pos4 + 8];
  w2[1] = w[pos4 + 9];
  w2[2] = w[pos4 + 10];
  w2[3] = w[pos4 + 11];
  w3[0] = w[pos4 + 12];
  w3[1] = w[pos4 + 13];
  w3[2] = w[pos4 + 14];
  w3[3] = w[pos4 + 15];

  w0[0] = hc_swap32_S (w0[0]);
  w0[1] = hc_swap32_S (w0[1]);
  w0[2] = hc_swap32_S (w0[2]);
  w0[3] = hc_swap32_S (w0[3]);
  w1[0] = hc_swap32_S (w1[0]);
  w1[1] = hc_swap32_S (w1[1]);
  w1[2] = hc_swap32_S (w1[2]);
  w1[3] = hc_swap32_S (w1[3]);
  w2[0] = hc_swap32_S (w2[0]);
  w2[1] = hc_swap32_S (w2[1]);
  w2[2] = hc_swap32_S (w2[2]);
  w2[3] = hc_swap32_S (w2[3]);
  w3[0] = hc_swap32_S (w3[0]);
  w3[1] = hc_swap32_S (w3[1]);
  w3[2] = hc_swap32_S (w3[2]);
  w3[3] = hc_swap32_S (w3[3]);

  sm3_update_64 (ctx, w0, w1, w2, w3, len - pos1);
}

/**
 * Variant of sm3_update_utf16le() whose input pointer is qualified GLOBAL_AS;
 * it uses the *_global flavours of the hc_enc scan/next helpers. On a
 * conversion failure (hc_enc_next_global() returns -1) ctx->len is set to -1
 * and the function returns early.
 */
DECLSPEC void sm3_update_global_utf16le (PRIVATE_AS sm3_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len)
{
  if (hc_enc_scan_global (w, len))
  {
    hc_enc_t hc_enc;

    hc_enc_init (&hc_enc);

    while (hc_enc_has_next (&hc_enc, len))
    {
      u32 enc_buf[16] = { 0 };

      const int enc_len = hc_enc_next_global (&hc_enc, w, len, 256, enc_buf, sizeof (enc_buf));

      if (enc_len == -1)
      {
        // conversion failed: flag the context as invalid through its length
        ctx->len = -1;

        return;
      }

      sm3_update_64 (ctx, enc_buf + 0, enc_buf + 4, enc_buf + 8, enc_buf + 12, enc_len);
    }

    return;
  }

  u32 w0[4];
  u32 w1[4];
  u32 w2[4];
  u32 w3[4];

  int pos1; // byte offset into the (narrow) input
  int pos4; // matching word offset (pos1 / 4)

  for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8)
  {
    w0[0] = w[pos4 + 0];
    w0[1] = w[pos4 + 1];
    w0[2] = w[pos4 + 2];
    w0[3] = w[pos4 + 3];
    w1[0] = w[pos4 + 4];
    w1[1] = w[pos4 + 5];
    w1[2] = w[pos4 + 6];
    w1[3] = w[pos4 + 7];

    // order matters: w1 has to be expanded into w2/w3 first, because the
    // second call writes its result into w0 and w1
    make_utf16le_S (w1, w2, w3);
    make_utf16le_S (w0, w0, w1);

    sm3_update_64 (ctx, w0, w1, w2, w3, 32 * 2);
  }

  // final (possibly partial) chunk of up to 32 input bytes
  w0[0] = w[pos4 + 0];
  w0[1] = w[pos4 + 1];
  w0[2] = w[pos4 + 2];
  w0[3] = w[pos4 + 3];
  w1[0] = w[pos4 + 4];
  w1[1] = w[pos4 + 5];
  w1[2] = w[pos4 + 6];
  w1[3] = w[pos4 + 7];

  make_utf16le_S (w1, w2, w3);
  make_utf16le_S (w0, w0, w1);

  sm3_update_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2);
}

/**
 * Variant of sm3_update_utf16le_swap() whose input pointer is qualified
 * GLOBAL_AS: UTF-16LE conversion first, then hc_swap32_S() on every produced
 * word, then sm3_update_64().
 */
DECLSPEC void sm3_update_global_utf16le_swap (PRIVATE_AS sm3_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len)
{
  if (hc_enc_scan_global (w, len))
  {
    hc_enc_t hc_enc;

    hc_enc_init (&hc_enc);

    while (hc_enc_has_next (&hc_enc, len))
    {
      u32 enc_buf[16] = { 0 };

      const int enc_len = hc_enc_next_global (&hc_enc, w, len, 256, enc_buf, sizeof (enc_buf));

      if (enc_len == -1)
      {
        // conversion failed: flag the context as invalid through its length
        ctx->len = -1;

        return;
      }

      enc_buf[ 0] = hc_swap32_S (enc_buf[ 0]);
      enc_buf[ 1] = hc_swap32_S (enc_buf[ 1]);
      enc_buf[ 2] = hc_swap32_S (enc_buf[ 2]);
      enc_buf[ 3] = hc_swap32_S (enc_buf[ 3]);
      enc_buf[ 4] = hc_swap32_S (enc_buf[ 4]);
      enc_buf[ 5] = hc_swap32_S (enc_buf[ 5]);
      enc_buf[ 6] = hc_swap32_S (enc_buf[ 6]);
      enc_buf[ 7] = hc_swap32_S (enc_buf[ 7]);
      enc_buf[ 8] = hc_swap32_S (enc_buf[ 8]);
      enc_buf[ 9] = hc_swap32_S (enc_buf[ 9]);
      enc_buf[10] = hc_swap32_S (enc_buf[10]);
      enc_buf[11] = hc_swap32_S (enc_buf[11]);
      enc_buf[12] = hc_swap32_S (enc_buf[12]);
      enc_buf[13] = hc_swap32_S (enc_buf[13]);
      enc_buf[14] = hc_swap32_S (enc_buf[14]);
      enc_buf[15] = hc_swap32_S (enc_buf[15]);

      sm3_update_64 (ctx, enc_buf + 0, enc_buf + 4, enc_buf + 8, enc_buf + 12, enc_len);
    }

    return;
  }

  u32 w0[4];
  u32 w1[4];
  u32 w2[4];
  u32 w3[4];

  int pos1; // byte offset into the (narrow) input
  int pos4; // matching word offset (pos1 / 4)

  for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8)
  {
    w0[0] = w[pos4 + 0];
    w0[1] = w[pos4 + 1];
    w0[2] = w[pos4 + 2];
    w0[3] = w[pos4 + 3];
    w1[0] = w[pos4 + 4];
    w1[1] = w[pos4 + 5];
    w1[2] = w[pos4 + 6];
    w1[3] = w[pos4 + 7];

    // order matters: w1 has to be expanded into w2/w3 first, because the
    // second call writes its result into w0 and w1
    make_utf16le_S (w1, w2, w3);
    make_utf16le_S (w0, w0, w1);

    w0[0] = hc_swap32_S (w0[0]);
    w0[1] = hc_swap32_S (w0[1]);
    w0[2] = hc_swap32_S (w0[2]);
    w0[3] = hc_swap32_S (w0[3]);
    w1[0] = hc_swap32_S (w1[0]);
    w1[1] = hc_swap32_S (w1[1]);
    w1[2] = hc_swap32_S (w1[2]);
    w1[3] = hc_swap32_S (w1[3]);
    w2[0] = hc_swap32_S (w2[0]);
    w2[1] = hc_swap32_S (w2[1]);
    w2[2] = hc_swap32_S (w2[2]);
    w2[3] = hc_swap32_S (w2[3]);
    w3[0] = hc_swap32_S (w3[0]);
    w3[1] = hc_swap32_S (w3[1]);
    w3[2] = hc_swap32_S (w3[2]);
    w3[3] = hc_swap32_S (w3[3]);

    sm3_update_64 (ctx, w0, w1, w2, w3, 32 * 2);
  }

  // final (possibly partial) chunk of up to 32 input bytes
  w0[0] = w[pos4 + 0];
  w0[1] = w[pos4 + 1];
  w0[2] = w[pos4 + 2];
  w0[3] = w[pos4 + 3];
  w1[0] = w[pos4 + 4];
  w1[1] = w[pos4 + 5];
  w1[2] = w[pos4 + 6];
  w1[3] = w[pos4 + 7];

  make_utf16le_S (w1, w2, w3);
  make_utf16le_S (w0, w0, w1);

  w0[0] = hc_swap32_S (w0[0]);
  w0[1] = hc_swap32_S (w0[1]);
  w0[2] = hc_swap32_S (w0[2]);
  w0[3] = hc_swap32_S (w0[3]);
  w1[0] = hc_swap32_S (w1[0]);
  w1[1] = hc_swap32_S (w1[1]);
  w1[2] = hc_swap32_S (w1[2]);
  w1[3] = hc_swap32_S (w1[3]);
  w2[0] = hc_swap32_S (w2[0]);
  w2[1] = hc_swap32_S (w2[1]);
  w2[2] = hc_swap32_S (w2[2]);
  w2[3] = hc_swap32_S (w2[3]);
  w3[0] = hc_swap32_S (w3[0]);
  w3[1] = hc_swap32_S (w3[1]);
  w3[2] = hc_swap32_S (w3[2]);
  w3[3] = hc_swap32_S (w3[3]);

  sm3_update_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2);
}

/**
 * Finishes the hash: pads the buffered data, appends the message length and
 * runs the last compression(s). Afterwards ctx->h holds the final state.
 */
DECLSPEC void sm3_final (PRIVATE_AS sm3_ctx_t *ctx)
{
  // number of bytes currently buffered
  const int pos = ctx->len & 63;

  // padding marker right behind the data
  // NOTE(review): "pos ^ 3" presumably maps the byte offset to the matching
  // byte lane of the big-endian words, as the helper is defined elsewhere - confirm.
  append_0x80_4x4_S (ctx->w0, ctx->w1, ctx->w2, ctx->w3, pos ^ 3);

  if (pos >= 56)
  {
    // the last two words are reserved for the length; with 56 or more bytes
    // buffered the marker reaches into them, so this block is compressed as
    // is and the length goes into an extra, otherwise empty block

    sm3_transform (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h);

    ctx->w0[0] = 0;
    ctx->w0[1] = 0;
    ctx->w0[2] = 0;
    ctx->w0[3] = 0;
    ctx->w1[0] = 0;
    ctx->w1[1] = 0;
    ctx->w1[2] = 0;
    ctx->w1[3] = 0;
    ctx->w2[0] = 0;
    ctx->w2[1] = 0;
    ctx->w2[2] = 0;
    ctx->w2[3] = 0;
    ctx->w3[0] = 0;
    ctx->w3[1] = 0;
    ctx->w3[2] = 0;
    ctx->w3[3] = 0;
  }

  // message length in bits; only the low word is filled, the high word is
  // always written as zero
  ctx->w3[2] = 0;
  ctx->w3[3] = ctx->len * 8;

  sm3_transform (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h);
}

// while input buf can be a vector datatype, the length of the different elements can not

DECLSPEC void sm3_transform_vector (PRIVATE_AS const u32x *w0, PRIVATE_AS const u32x *w1, PRIVATE_AS const u32x *w2, PRIVATE_AS const u32x *w3, PRIVATE_AS u32x *digest)
{
  u32x a = digest[0];
  u32x b = digest[1];
  u32x c = digest[2];
  u32x d = digest[3];
  u32x e = digest[4];
  u32x f = digest[5];
  u32x g = digest[6];
  u32x h = digest[7];

  u32x w0_t = w0[0];
  u32x w1_t = w0[1];
  u32x w2_t = w0[2];
  u32x w3_t = w0[3];
  u32x w4_t = w1[0];
  u32x w5_t = w1[1];
  u32x w6_t = w1[2];
  u32x w7_t = w1[3];
  u32x w8_t = w2[0];
  u32x w9_t = w2[1];
  u32x wa_t = w2[2];
  u32x wb_t = w2[3];
  u32x wc_t = w3[0];
  u32x wd_t = w3[1];
  u32x
// while input buf can be a vector datatype, the length of the different elements can not

/**
 * SM3 compression function, vector flavour.
 *
 * Processes one 64-byte block, given as four groups of four words
 * (w0..w3), and folds the result into digest[0..7] with XOR.
 *
 * The sixteen wX_t variables act as a rolling window over the expanded
 * message: right after round j has consumed slot (j mod 16), that slot is
 * overwritten with SM3_EXPAND output so it is ready for round j + 16.
 * Each round receives W[j] and W[j] ^ W[j + 4].
 */
DECLSPEC void sm3_transform_vector (PRIVATE_AS const u32x *w0, PRIVATE_AS const u32x *w1, PRIVATE_AS const u32x *w2, PRIVATE_AS const u32x *w3, PRIVATE_AS u32x *digest)
{
  // working copy of the chaining value
  u32x a = digest[0];
  u32x b = digest[1];
  u32x c = digest[2];
  u32x d = digest[3];
  u32x e = digest[4];
  u32x f = digest[5];
  u32x g = digest[6];
  u32x h = digest[7];

  // W[0..15]: the message block itself
  u32x w0_t = w0[0];
  u32x w1_t = w0[1];
  u32x w2_t = w0[2];
  u32x w3_t = w0[3];
  u32x w4_t = w1[0];
  u32x w5_t = w1[1];
  u32x w6_t = w1[2];
  u32x w7_t = w1[3];
  u32x w8_t = w2[0];
  u32x w9_t = w2[1];
  u32x wa_t = w2[2];
  u32x wb_t = w2[3];
  u32x wc_t = w3[0];
  u32x wd_t = w3[1];
  u32x we_t = w3[2];
  u32x wf_t = w3[3];

  // SM3 main loop, the Compression Function (CF) and Message Expansion (ME) are executed
  // step-by-step. SM3_R1 use SM3_FF0 and SM3_GG0 functions for index 0 to 15 and SM3_R2
  // use SM3_FF1 and SM3_GG1 functions for index 16 to 63.
  // The register arguments rotate by one position per round instead of moving values around.
  SM3_R1(a, b, c, d, e, f, g, h, SM3_T00, w0_t, w0_t ^ w4_t);
  w0_t = SM3_EXPAND(w0_t, w7_t, wd_t, w3_t, wa_t);
  SM3_R1(d, a, b, c, h, e, f, g, SM3_T01, w1_t, w1_t ^ w5_t);
  w1_t = SM3_EXPAND(w1_t, w8_t, we_t, w4_t, wb_t);
  SM3_R1(c, d, a, b, g, h, e, f, SM3_T02, w2_t, w2_t ^ w6_t);
  w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t);
  SM3_R1(b, c, d, a, f, g, h, e, SM3_T03, w3_t, w3_t ^ w7_t);
  w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t);
  SM3_R1(a, b, c, d, e, f, g, h, SM3_T04, w4_t, w4_t ^ w8_t);
  w4_t = SM3_EXPAND(w4_t, wb_t, w1_t, w7_t, we_t);
  SM3_R1(d, a, b, c, h, e, f, g, SM3_T05, w5_t, w5_t ^ w9_t);
  w5_t = SM3_EXPAND(w5_t, wc_t, w2_t, w8_t, wf_t);
  SM3_R1(c, d, a, b, g, h, e, f, SM3_T06, w6_t, w6_t ^ wa_t);
  w6_t = SM3_EXPAND(w6_t, wd_t, w3_t, w9_t, w0_t);
  SM3_R1(b, c, d, a, f, g, h, e, SM3_T07, w7_t, w7_t ^ wb_t);
  w7_t = SM3_EXPAND(w7_t, we_t, w4_t, wa_t, w1_t);
  SM3_R1(a, b, c, d, e, f, g, h, SM3_T08, w8_t, w8_t ^ wc_t);
  w8_t = SM3_EXPAND(w8_t, wf_t, w5_t, wb_t, w2_t);
  SM3_R1(d, a, b, c, h, e, f, g, SM3_T09, w9_t, w9_t ^ wd_t);
  w9_t = SM3_EXPAND(w9_t, w0_t, w6_t, wc_t, w3_t);
  SM3_R1(c, d, a, b, g, h, e, f, SM3_T10, wa_t, wa_t ^ we_t);
  wa_t = SM3_EXPAND(wa_t, w1_t, w7_t, wd_t, w4_t);
  SM3_R1(b, c, d, a, f, g, h, e, SM3_T11, wb_t, wb_t ^ wf_t);
  wb_t = SM3_EXPAND(wb_t, w2_t, w8_t, we_t, w5_t);
  SM3_R1(a, b, c, d, e, f, g, h, SM3_T12, wc_t, wc_t ^ w0_t);
  wc_t = SM3_EXPAND(wc_t, w3_t, w9_t, wf_t, w6_t);
  SM3_R1(d, a, b, c, h, e, f, g, SM3_T13, wd_t, wd_t ^ w1_t);
  wd_t = SM3_EXPAND(wd_t, w4_t, wa_t, w0_t, w7_t);
  SM3_R1(c, d, a, b, g, h, e, f, SM3_T14, we_t, we_t ^ w2_t);
  we_t = SM3_EXPAND(we_t, w5_t, wb_t, w1_t, w8_t);
  SM3_R1(b, c, d, a, f, g, h, e, SM3_T15, wf_t, wf_t ^ w3_t);
  wf_t = SM3_EXPAND(wf_t, w6_t, wc_t, w2_t, w9_t);
  // Index = 16, switch to SM3_R2
  SM3_R2(a, b, c, d, e, f, g, h, SM3_T16, w0_t, w0_t ^ w4_t);
  w0_t = SM3_EXPAND(w0_t, w7_t, wd_t, w3_t, wa_t);
  SM3_R2(d, a, b, c, h, e, f, g, SM3_T17, w1_t, w1_t ^ w5_t);
  w1_t = SM3_EXPAND(w1_t, w8_t, we_t, w4_t, wb_t);
  SM3_R2(c, d, a, b, g, h, e, f, SM3_T18, w2_t, w2_t ^ w6_t);
  w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t);
  SM3_R2(b, c, d, a, f, g, h, e, SM3_T19, w3_t, w3_t ^ w7_t);
  w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t);
  SM3_R2(a, b, c, d, e, f, g, h, SM3_T20, w4_t, w4_t ^ w8_t);
  w4_t = SM3_EXPAND(w4_t, wb_t, w1_t, w7_t, we_t);
  SM3_R2(d, a, b, c, h, e, f, g, SM3_T21, w5_t, w5_t ^ w9_t);
  w5_t = SM3_EXPAND(w5_t, wc_t, w2_t, w8_t, wf_t);
  SM3_R2(c, d, a, b, g, h, e, f, SM3_T22, w6_t, w6_t ^ wa_t);
  w6_t = SM3_EXPAND(w6_t, wd_t, w3_t, w9_t, w0_t);
  SM3_R2(b, c, d, a, f, g, h, e, SM3_T23, w7_t, w7_t ^ wb_t);
  w7_t = SM3_EXPAND(w7_t, we_t, w4_t, wa_t, w1_t);
  SM3_R2(a, b, c, d, e, f, g, h, SM3_T24, w8_t, w8_t ^ wc_t);
  w8_t = SM3_EXPAND(w8_t, wf_t, w5_t, wb_t, w2_t);
  SM3_R2(d, a, b, c, h, e, f, g, SM3_T25, w9_t, w9_t ^ wd_t);
  w9_t = SM3_EXPAND(w9_t, w0_t, w6_t, wc_t, w3_t);
  SM3_R2(c, d, a, b, g, h, e, f, SM3_T26, wa_t, wa_t ^ we_t);
  wa_t = SM3_EXPAND(wa_t, w1_t, w7_t, wd_t, w4_t);
  SM3_R2(b, c, d, a, f, g, h, e, SM3_T27, wb_t, wb_t ^ wf_t);
  wb_t = SM3_EXPAND(wb_t, w2_t, w8_t, we_t, w5_t);
  SM3_R2(a, b, c, d, e, f, g, h, SM3_T28, wc_t, wc_t ^ w0_t);
  wc_t = SM3_EXPAND(wc_t, w3_t, w9_t, wf_t, w6_t);
  SM3_R2(d, a, b, c, h, e, f, g, SM3_T29, wd_t, wd_t ^ w1_t);
  wd_t = SM3_EXPAND(wd_t, w4_t, wa_t, w0_t, w7_t);
  SM3_R2(c, d, a, b, g, h, e, f, SM3_T30, we_t, we_t ^ w2_t);
  we_t = SM3_EXPAND(we_t, w5_t, wb_t, w1_t, w8_t);
  SM3_R2(b, c, d, a, f, g, h, e, SM3_T31, wf_t, wf_t ^ w3_t);
  wf_t = SM3_EXPAND(wf_t, w6_t, wc_t, w2_t, w9_t);
  SM3_R2(a, b, c, d, e, f, g, h, SM3_T32, w0_t, w0_t ^ w4_t);
  w0_t = SM3_EXPAND(w0_t, w7_t, wd_t, w3_t, wa_t);
  SM3_R2(d, a, b, c, h, e, f, g, SM3_T33, w1_t, w1_t ^ w5_t);
  w1_t = SM3_EXPAND(w1_t, w8_t, we_t, w4_t, wb_t);
  SM3_R2(c, d, a, b, g, h, e, f, SM3_T34, w2_t, w2_t ^ w6_t);
  w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t);
  SM3_R2(b, c, d, a, f, g, h, e, SM3_T35, w3_t, w3_t ^ w7_t);
  w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t);
  SM3_R2(a, b, c, d, e, f, g, h, SM3_T36, w4_t, w4_t ^ w8_t);
  w4_t = SM3_EXPAND(w4_t, wb_t, w1_t, w7_t, we_t);
  SM3_R2(d, a, b, c, h, e, f, g, SM3_T37, w5_t, w5_t ^ w9_t);
  w5_t = SM3_EXPAND(w5_t, wc_t, w2_t, w8_t, wf_t);
  SM3_R2(c, d, a, b, g, h, e, f, SM3_T38, w6_t, w6_t ^ wa_t);
  w6_t = SM3_EXPAND(w6_t, wd_t, w3_t, w9_t, w0_t);
  SM3_R2(b, c, d, a, f, g, h, e, SM3_T39, w7_t, w7_t ^ wb_t);
  w7_t = SM3_EXPAND(w7_t, we_t, w4_t, wa_t, w1_t);
  SM3_R2(a, b, c, d, e, f, g, h, SM3_T40, w8_t, w8_t ^ wc_t);
  w8_t = SM3_EXPAND(w8_t, wf_t, w5_t, wb_t, w2_t);
  SM3_R2(d, a, b, c, h, e, f, g, SM3_T41, w9_t, w9_t ^ wd_t);
  w9_t = SM3_EXPAND(w9_t, w0_t, w6_t, wc_t, w3_t);
  SM3_R2(c, d, a, b, g, h, e, f, SM3_T42, wa_t, wa_t ^ we_t);
  wa_t = SM3_EXPAND(wa_t, w1_t, w7_t, wd_t, w4_t);
  SM3_R2(b, c, d, a, f, g, h, e, SM3_T43, wb_t, wb_t ^ wf_t);
  wb_t = SM3_EXPAND(wb_t, w2_t, w8_t, we_t, w5_t);
  SM3_R2(a, b, c, d, e, f, g, h, SM3_T44, wc_t, wc_t ^ w0_t);
  wc_t = SM3_EXPAND(wc_t, w3_t, w9_t, wf_t, w6_t);
  SM3_R2(d, a, b, c, h, e, f, g, SM3_T45, wd_t, wd_t ^ w1_t);
  wd_t = SM3_EXPAND(wd_t, w4_t, wa_t, w0_t, w7_t);
  SM3_R2(c, d, a, b, g, h, e, f, SM3_T46, we_t, we_t ^ w2_t);
  we_t = SM3_EXPAND(we_t, w5_t, wb_t, w1_t, w8_t);
  SM3_R2(b, c, d, a, f, g, h, e, SM3_T47, wf_t, wf_t ^ w3_t);
  wf_t = SM3_EXPAND(wf_t, w6_t, wc_t, w2_t, w9_t);
  SM3_R2(a, b, c, d, e, f, g, h, SM3_T48, w0_t, w0_t ^ w4_t);
  w0_t = SM3_EXPAND(w0_t, w7_t, wd_t, w3_t, wa_t);
  SM3_R2(d, a, b, c, h, e, f, g, SM3_T49, w1_t, w1_t ^ w5_t);
  w1_t = SM3_EXPAND(w1_t, w8_t, we_t, w4_t, wb_t);
  SM3_R2(c, d, a, b, g, h, e, f, SM3_T50, w2_t, w2_t ^ w6_t);
  w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t);
  SM3_R2(b, c, d, a, f, g, h, e, SM3_T51, w3_t, w3_t ^ w7_t);
  w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t);
  // No more ME for index 52 to 63.
  // (every word these rounds read has already been produced above)
  SM3_R2(a, b, c, d, e, f, g, h, SM3_T52, w4_t, w4_t ^ w8_t);
  SM3_R2(d, a, b, c, h, e, f, g, SM3_T53, w5_t, w5_t ^ w9_t);
  SM3_R2(c, d, a, b, g, h, e, f, SM3_T54, w6_t, w6_t ^ wa_t);
  SM3_R2(b, c, d, a, f, g, h, e, SM3_T55, w7_t, w7_t ^ wb_t);
  SM3_R2(a, b, c, d, e, f, g, h, SM3_T56, w8_t, w8_t ^ wc_t);
  SM3_R2(d, a, b, c, h, e, f, g, SM3_T57, w9_t, w9_t ^ wd_t);
  SM3_R2(c, d, a, b, g, h, e, f, SM3_T58, wa_t, wa_t ^ we_t);
  SM3_R2(b, c, d, a, f, g, h, e, SM3_T59, wb_t, wb_t ^ wf_t);
  SM3_R2(a, b, c, d, e, f, g, h, SM3_T60, wc_t, wc_t ^ w0_t);
  SM3_R2(d, a, b, c, h, e, f, g, SM3_T61, wd_t, wd_t ^ w1_t);
  SM3_R2(c, d, a, b, g, h, e, f, SM3_T62, we_t, we_t ^ w2_t);
  SM3_R2(b, c, d, a, f, g, h, e, SM3_T63, wf_t, wf_t ^ w3_t);

  // feed-forward: the new chaining value is the XOR of old state and round output
  digest[0] ^= a;
  digest[1] ^= b;
  digest[2] ^= c;
  digest[3] ^= d;
  digest[4] ^= e;
  digest[5] ^= f;
  digest[6] ^= g;
  digest[7] ^= h;
}
SM3_T50, w2_t, w2_t ^ w6_t); + w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t); + SM3_R2(b, c, d, a, f, g, h, e, SM3_T51, w3_t, w3_t ^ w7_t); + w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t); + // No more ME for index 52 to 63. + SM3_R2(a, b, c, d, e, f, g, h, SM3_T52, w4_t, w4_t ^ w8_t); + SM3_R2(d, a, b, c, h, e, f, g, SM3_T53, w5_t, w5_t ^ w9_t); + SM3_R2(c, d, a, b, g, h, e, f, SM3_T54, w6_t, w6_t ^ wa_t); + SM3_R2(b, c, d, a, f, g, h, e, SM3_T55, w7_t, w7_t ^ wb_t); + SM3_R2(a, b, c, d, e, f, g, h, SM3_T56, w8_t, w8_t ^ wc_t); + SM3_R2(d, a, b, c, h, e, f, g, SM3_T57, w9_t, w9_t ^ wd_t); + SM3_R2(c, d, a, b, g, h, e, f, SM3_T58, wa_t, wa_t ^ we_t); + SM3_R2(b, c, d, a, f, g, h, e, SM3_T59, wb_t, wb_t ^ wf_t); + SM3_R2(a, b, c, d, e, f, g, h, SM3_T60, wc_t, wc_t ^ w0_t); + SM3_R2(d, a, b, c, h, e, f, g, SM3_T61, wd_t, wd_t ^ w1_t); + SM3_R2(c, d, a, b, g, h, e, f, SM3_T62, we_t, we_t ^ w2_t); + SM3_R2(b, c, d, a, f, g, h, e, SM3_T63, wf_t, wf_t ^ w3_t); + + digest[0] ^= a; + digest[1] ^= b; + digest[2] ^= c; + digest[3] ^= d; + digest[4] ^= e; + digest[5] ^= f; + digest[6] ^= g; + digest[7] ^= h; +} + +DECLSPEC void sm3_init_vector (PRIVATE_AS sm3_ctx_vector_t *ctx) +{ + ctx->h[0] = SM3_IV_A; + ctx->h[1] = SM3_IV_B; + ctx->h[2] = SM3_IV_C; + ctx->h[3] = SM3_IV_D; + ctx->h[4] = SM3_IV_E; + ctx->h[5] = SM3_IV_F; + ctx->h[6] = SM3_IV_G; + ctx->h[7] = SM3_IV_H; + + ctx->w0[0] = 0; + ctx->w0[1] = 0; + ctx->w0[2] = 0; + ctx->w0[3] = 0; + ctx->w1[0] = 0; + ctx->w1[1] = 0; + ctx->w1[2] = 0; + ctx->w1[3] = 0; + ctx->w2[0] = 0; + ctx->w2[1] = 0; + ctx->w2[2] = 0; + ctx->w2[3] = 0; + ctx->w3[0] = 0; + ctx->w3[1] = 0; + ctx->w3[2] = 0; + ctx->w3[3] = 0; + + ctx->len = 0; +} + +DECLSPEC void sm3_init_vector_from_scalar (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS sm3_ctx_t *ctx0) +{ + ctx->h[0] = ctx0->h[0]; + ctx->h[1] = ctx0->h[1]; + ctx->h[2] = ctx0->h[2]; + ctx->h[3] = ctx0->h[3]; + ctx->h[4] = ctx0->h[4]; + ctx->h[5] = ctx0->h[5]; + ctx->h[6] = ctx0->h[6]; + 
ctx->h[7] = ctx0->h[7]; + + ctx->w0[0] = ctx0->w0[0]; + ctx->w0[1] = ctx0->w0[1]; + ctx->w0[2] = ctx0->w0[2]; + ctx->w0[3] = ctx0->w0[3]; + ctx->w1[0] = ctx0->w1[0]; + ctx->w1[1] = ctx0->w1[1]; + ctx->w1[2] = ctx0->w1[2]; + ctx->w1[3] = ctx0->w1[3]; + ctx->w2[0] = ctx0->w2[0]; + ctx->w2[1] = ctx0->w2[1]; + ctx->w2[2] = ctx0->w2[2]; + ctx->w2[3] = ctx0->w2[3]; + ctx->w3[0] = ctx0->w3[0]; + ctx->w3[1] = ctx0->w3[1]; + ctx->w3[2] = ctx0->w3[2]; + ctx->w3[3] = ctx0->w3[3]; + + ctx->len = ctx0->len; +} + +DECLSPEC void sm3_update_vector_64 (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS u32x *w0, PRIVATE_AS u32x *w1, PRIVATE_AS u32x *w2, PRIVATE_AS u32x *w3, const int len) +{ + if (len == 0) return; + + const int pos = ctx->len & 63; + + ctx->len += len; + + if (pos == 0) + { + ctx->w0[0] = w0[0]; + ctx->w0[1] = w0[1]; + ctx->w0[2] = w0[2]; + ctx->w0[3] = w0[3]; + ctx->w1[0] = w1[0]; + ctx->w1[1] = w1[1]; + ctx->w1[2] = w1[2]; + ctx->w1[3] = w1[3]; + ctx->w2[0] = w2[0]; + ctx->w2[1] = w2[1]; + ctx->w2[2] = w2[2]; + ctx->w2[3] = w2[3]; + ctx->w3[0] = w3[0]; + ctx->w3[1] = w3[1]; + ctx->w3[2] = w3[2]; + ctx->w3[3] = w3[3]; + + if (len == 64) + { + sm3_transform_vector (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h); + + ctx->w0[0] = 0; + ctx->w0[1] = 0; + ctx->w0[2] = 0; + ctx->w0[3] = 0; + ctx->w1[0] = 0; + ctx->w1[1] = 0; + ctx->w1[2] = 0; + ctx->w1[3] = 0; + ctx->w2[0] = 0; + ctx->w2[1] = 0; + ctx->w2[2] = 0; + ctx->w2[3] = 0; + ctx->w3[0] = 0; + ctx->w3[1] = 0; + ctx->w3[2] = 0; + ctx->w3[3] = 0; + } + } + else + { + if ((pos + len) < 64) + { + switch_buffer_by_offset_be (w0, w1, w2, w3, pos); + + ctx->w0[0] |= w0[0]; + ctx->w0[1] |= w0[1]; + ctx->w0[2] |= w0[2]; + ctx->w0[3] |= w0[3]; + ctx->w1[0] |= w1[0]; + ctx->w1[1] |= w1[1]; + ctx->w1[2] |= w1[2]; + ctx->w1[3] |= w1[3]; + ctx->w2[0] |= w2[0]; + ctx->w2[1] |= w2[1]; + ctx->w2[2] |= w2[2]; + ctx->w2[3] |= w2[3]; + ctx->w3[0] |= w3[0]; + ctx->w3[1] |= w3[1]; + ctx->w3[2] |= w3[2]; + ctx->w3[3] |= w3[3]; + } + else + { + 
u32x c0[4] = { 0 }; + u32x c1[4] = { 0 }; + u32x c2[4] = { 0 }; + u32x c3[4] = { 0 }; + + switch_buffer_by_offset_carry_be (w0, w1, w2, w3, c0, c1, c2, c3, pos); + + ctx->w0[0] |= w0[0]; + ctx->w0[1] |= w0[1]; + ctx->w0[2] |= w0[2]; + ctx->w0[3] |= w0[3]; + ctx->w1[0] |= w1[0]; + ctx->w1[1] |= w1[1]; + ctx->w1[2] |= w1[2]; + ctx->w1[3] |= w1[3]; + ctx->w2[0] |= w2[0]; + ctx->w2[1] |= w2[1]; + ctx->w2[2] |= w2[2]; + ctx->w2[3] |= w2[3]; + ctx->w3[0] |= w3[0]; + ctx->w3[1] |= w3[1]; + ctx->w3[2] |= w3[2]; + ctx->w3[3] |= w3[3]; + + sm3_transform_vector (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h); + + ctx->w0[0] = c0[0]; + ctx->w0[1] = c0[1]; + ctx->w0[2] = c0[2]; + ctx->w0[3] = c0[3]; + ctx->w1[0] = c1[0]; + ctx->w1[1] = c1[1]; + ctx->w1[2] = c1[2]; + ctx->w1[3] = c1[3]; + ctx->w2[0] = c2[0]; + ctx->w2[1] = c2[1]; + ctx->w2[2] = c2[2]; + ctx->w2[3] = c2[3]; + ctx->w3[0] = c3[0]; + ctx->w3[1] = c3[1]; + ctx->w3[2] = c3[2]; + ctx->w3[3] = c3[3]; + } + } +} + +DECLSPEC void sm3_update_vector (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + sm3_update_vector_64 (ctx, w0, w1, w2, w3, 64); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + 
w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + sm3_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1); +} + +DECLSPEC void sm3_update_vector_swap (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + w0[0] = hc_swap32 (w0[0]); + w0[1] = hc_swap32 (w0[1]); + w0[2] = hc_swap32 (w0[2]); + w0[3] = hc_swap32 (w0[3]); + w1[0] = hc_swap32 (w1[0]); + w1[1] = hc_swap32 (w1[1]); + w1[2] = hc_swap32 (w1[2]); + w1[3] = hc_swap32 (w1[3]); + w2[0] = hc_swap32 (w2[0]); + w2[1] = hc_swap32 (w2[1]); + w2[2] = hc_swap32 (w2[2]); + w2[3] = hc_swap32 (w2[3]); + w3[0] = hc_swap32 (w3[0]); + w3[1] = hc_swap32 (w3[1]); + w3[2] = hc_swap32 (w3[2]); + w3[3] = hc_swap32 (w3[3]); + + sm3_update_vector_64 (ctx, w0, w1, w2, w3, 64); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + w0[0] = hc_swap32 (w0[0]); + w0[1] = hc_swap32 (w0[1]); + w0[2] = hc_swap32 (w0[2]); + w0[3] = hc_swap32 (w0[3]); + w1[0] = hc_swap32 (w1[0]); + w1[1] = hc_swap32 (w1[1]); + w1[2] = hc_swap32 (w1[2]); + w1[3] = hc_swap32 (w1[3]); + w2[0] = hc_swap32 (w2[0]); + w2[1] = hc_swap32 (w2[1]); + w2[2] = hc_swap32 (w2[2]); + w2[3] = hc_swap32 
(w2[3]); + w3[0] = hc_swap32 (w3[0]); + w3[1] = hc_swap32 (w3[1]); + w3[2] = hc_swap32 (w3[2]); + w3[3] = hc_swap32 (w3[3]); + + sm3_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1); +} + +DECLSPEC void sm3_update_vector_utf16le (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + sm3_update_vector_64 (ctx, w0, w1, w2, w3, 32 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + sm3_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); +} + +DECLSPEC void sm3_update_vector_utf16le_swap (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = hc_swap32 (w0[0]); + w0[1] = hc_swap32 (w0[1]); + w0[2] = hc_swap32 (w0[2]); + w0[3] = hc_swap32 (w0[3]); + w1[0] = hc_swap32 (w1[0]); + w1[1] = hc_swap32 (w1[1]); + w1[2] = hc_swap32 (w1[2]); + w1[3] = hc_swap32 (w1[3]); + w2[0] = hc_swap32 (w2[0]); + w2[1] = hc_swap32 (w2[1]); + w2[2] = hc_swap32 (w2[2]); + w2[3] = hc_swap32 (w2[3]); + w3[0] = hc_swap32 (w3[0]); + w3[1] = 
hc_swap32 (w3[1]); + w3[2] = hc_swap32 (w3[2]); + w3[3] = hc_swap32 (w3[3]); + + sm3_update_vector_64 (ctx, w0, w1, w2, w3, 32 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = hc_swap32 (w0[0]); + w0[1] = hc_swap32 (w0[1]); + w0[2] = hc_swap32 (w0[2]); + w0[3] = hc_swap32 (w0[3]); + w1[0] = hc_swap32 (w1[0]); + w1[1] = hc_swap32 (w1[1]); + w1[2] = hc_swap32 (w1[2]); + w1[3] = hc_swap32 (w1[3]); + w2[0] = hc_swap32 (w2[0]); + w2[1] = hc_swap32 (w2[1]); + w2[2] = hc_swap32 (w2[2]); + w2[3] = hc_swap32 (w2[3]); + w3[0] = hc_swap32 (w3[0]); + w3[1] = hc_swap32 (w3[1]); + w3[2] = hc_swap32 (w3[2]); + w3[3] = hc_swap32 (w3[3]); + + sm3_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); +} + +DECLSPEC void sm3_update_vector_utf16beN (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16beN (w1, w2, w3); + make_utf16beN (w0, w0, w1); + + sm3_update_vector_64 (ctx, w0, w1, w2, w3, 32 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16beN (w1, w2, w3); + make_utf16beN (w0, w0, w1); + + sm3_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); +} + +DECLSPEC void sm3_final_vector (PRIVATE_AS sm3_ctx_vector_t *ctx) +{ + const int pos = ctx->len & 63; + + append_0x80_4x4 (ctx->w0, ctx->w1, ctx->w2, ctx->w3, pos ^ 3); + + if 
(pos >= 56) + { + sm3_transform_vector (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h); + + ctx->w0[0] = 0; + ctx->w0[1] = 0; + ctx->w0[2] = 0; + ctx->w0[3] = 0; + ctx->w1[0] = 0; + ctx->w1[1] = 0; + ctx->w1[2] = 0; + ctx->w1[3] = 0; + ctx->w2[0] = 0; + ctx->w2[1] = 0; + ctx->w2[2] = 0; + ctx->w2[3] = 0; + ctx->w3[0] = 0; + ctx->w3[1] = 0; + ctx->w3[2] = 0; + ctx->w3[3] = 0; + } + + ctx->w3[2] = 0; + ctx->w3[3] = ctx->len * 8; + + sm3_transform_vector (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h); +} diff --git a/OpenCL/inc_hash_sm3.h b/OpenCL/inc_hash_sm3.h new file mode 100644 index 000000000..466818a78 --- /dev/null +++ b/OpenCL/inc_hash_sm3.h @@ -0,0 +1,126 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#ifndef INC_HASH_SM3_H +#define INC_HASH_SM3_H + +#define SM3_P0_S(x) ((x) ^ hc_rotl32_S((x), 9) ^ hc_rotl32_S((x), 17)) +#define SM3_P1_S(x) ((x) ^ hc_rotl32_S((x), 15) ^ hc_rotl32_S((x), 23)) + +#define SM3_P0(x) ((x) ^ hc_rotl32((x), 9) ^ hc_rotl32((x), 17)) +#define SM3_P1(x) ((x) ^ hc_rotl32((x), 15) ^ hc_rotl32((x), 23)) + +#define SM3_FF0(x, y, z) ((x) ^ (y) ^ (z)) +#define SM3_GG0(x, y, z) ((x) ^ (y) ^ (z)) + +#define SM3_FF1(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) +#define SM3_GG1(x, y, z) (((z) ^ ((x) & ((y) ^ (z))))) + +#define SM3_EXPAND_S(a, b, c, d, e) \ + (SM3_P1_S(a ^ b ^ hc_rotl32_S(c, 15)) ^ hc_rotl32_S(d, 7) ^ e) +#define SM3_EXPAND(a, b, c, d, e) \ + (SM3_P1(a ^ b ^ (c, 15)) ^ hc_rotl32(d, 7) ^ e) + +// Only Wj need to be parenthesis because of operator priority +// (Wj = Wi ^ Wi+4) +#define SM3_R1_S(a, b, c, d, e, f, g, h, Tj, Wi, Wj) \ +{ \ + const u32 A_ROTL12 = hc_rotl32_S(a, 12); \ + const u32 SS1 = hc_rotl32_S(A_ROTL12 + e + Tj, 7); \ + const u32 TT1 = SM3_FF0(a, b, c) + d + (SS1 ^ A_ROTL12) + (Wj); \ + const u32 TT2 = SM3_GG0(e, f, g) + h + SS1 + Wi; \ + b = hc_rotl32_S(b, 9); \ + d = TT1; \ + f = hc_rotl32_S(f, 19); \ + h = SM3_P0_S(TT2); \ +} + +#define SM3_R1(a, b, c, d, e, f, g, h, Tj, Wi, Wj) \ +{ \ 
+ const u32 A_ROTL12 = hc_rotl32(a, 12); \ + const u32 SS1 = hc_rotl32(A_ROTL12 + e + Tj, 7); \ + const u32 TT1 = SM3_FF0(a, b, c) + d + (SS1 ^ A_ROTL12) + (Wj); \ + const u32 TT2 = SM3_GG0(e, f, g) + h + SS1 + Wi; \ + b = hc_rotl32(b, 9); \ + d = TT1; \ + f = hc_rotl32(f, 19); \ + h = SM3_P0(TT2); \ +} + +#define SM3_R2_S(a, b, c, d, e, f, g, h, Tj, Wi, Wj) \ +{ \ + const u32 A_ROTL12 = hc_rotl32_S(a, 12); \ + const u32 SS1 = hc_rotl32_S(A_ROTL12 + e + Tj, 7); \ + const u32 TT1 = SM3_FF1(a, b, c) + d + (SS1 ^ A_ROTL12) + (Wj); \ + const u32 TT2 = SM3_GG1(e, f, g) + h + SS1 + Wi; \ + b = hc_rotl32_S(b, 9); \ + d = TT1; \ + f = hc_rotl32_S(f, 19); \ + h = SM3_P0_S(TT2); \ +} + +#define SM3_R2(a, b, c, d, e, f, g, h, Tj, Wi, Wj) \ +{ \ + const u32 A_ROTL12 = hc_rotl32(a, 12); \ + const u32 SS1 = hc_rotl32(A_ROTL12 + e + Tj, 7); \ + const u32 TT1 = SM3_FF1(a, b, c) + d + (SS1 ^ A_ROTL12) + (Wj); \ + const u32 TT2 = SM3_GG1(e, f, g) + h + SS1 + Wi; \ + b = hc_rotl32(b, 9); \ + d = TT1; \ + f = hc_rotl32(f, 19); \ + h = SM3_P0(TT2); \ +} + +typedef struct sm3_ctx +{ + u32 h[8]; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + int len; + +} sm3_ctx_t; + +typedef struct sm3_ctx_vector +{ + u32x h[8]; + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int len; + +} sm3_ctx_vector_t; + +DECLSPEC void sm3_transform (PRIVATE_AS const u32 *w0, PRIVATE_AS const u32 *w1, PRIVATE_AS const u32 *w2, PRIVATE_AS const u32 *w3, PRIVATE_AS u32 *digest); +DECLSPEC void sm3_init (PRIVATE_AS sm3_ctx_t *ctx); +DECLSPEC void sm3_update_64 (PRIVATE_AS sm3_ctx_t *ctx, PRIVATE_AS u32 *w0, PRIVATE_AS u32 *w1, PRIVATE_AS u32 *w2, PRIVATE_AS u32 *w3, const int len); +DECLSPEC void sm3_update (PRIVATE_AS sm3_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len); +DECLSPEC void sm3_update_swap (PRIVATE_AS sm3_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len); +DECLSPEC void sm3_update_utf16le (PRIVATE_AS sm3_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len); +DECLSPEC void 
sm3_update_utf16le_swap (PRIVATE_AS sm3_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len); +DECLSPEC void sm3_update_global (PRIVATE_AS sm3_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len); +DECLSPEC void sm3_update_global_swap (PRIVATE_AS sm3_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len); +DECLSPEC void sm3_update_global_utf16le (PRIVATE_AS sm3_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len); +DECLSPEC void sm3_update_global_utf16le_swap (PRIVATE_AS sm3_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len); +DECLSPEC void sm3_final (PRIVATE_AS sm3_ctx_t *ctx); + +DECLSPEC void sm3_transform_vector (PRIVATE_AS const u32x *w0, PRIVATE_AS const u32x *w1, PRIVATE_AS const u32x *w2, PRIVATE_AS const u32x *w3, PRIVATE_AS u32x *digest); +DECLSPEC void sm3_init_vector (PRIVATE_AS sm3_ctx_vector_t *ctx); +DECLSPEC void sm3_init_vector_from_scalar (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS sm3_ctx_t *ctx0); +DECLSPEC void sm3_update_vector_64 (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS u32x *w0, PRIVATE_AS u32x *w1, PRIVATE_AS u32x *w2, PRIVATE_AS u32x *w3, const int len); +DECLSPEC void sm3_update_vector (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len); +DECLSPEC void sm3_update_vector_swap (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len); +DECLSPEC void sm3_update_vector_utf16le (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len); +DECLSPEC void sm3_update_vector_utf16le_swap (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len); +DECLSPEC void sm3_update_vector_utf16beN (PRIVATE_AS sm3_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len); +DECLSPEC void sm3_final_vector (PRIVATE_AS sm3_ctx_vector_t *ctx); + +#endif diff --git a/OpenCL/inc_types.h b/OpenCL/inc_types.h index 575b1538f..63dc5815d 100644 --- a/OpenCL/inc_types.h +++ b/OpenCL/inc_types.h @@ -1679,6 +1679,86 @@ typedef enum blake2s_constants } blake2s_constants_t; +typedef enum sm3_constants 
// Constants of the SM3 hash function: the eight words of the initial
// chaining value and one pre-rotated round constant per round, so that no
// rotation of Tj has to be done at run time.
typedef enum sm3_constants
{
  // SM3 Initial Hash Values
  SM3_IV_A=0x7380166fUL,
  SM3_IV_B=0x4914b2b9UL,
  SM3_IV_C=0x172442d7UL,
  SM3_IV_D=0xda8a0600UL,
  SM3_IV_E=0xa96f30bcUL,
  SM3_IV_F=0x163138aaUL,
  SM3_IV_G=0xe38dee4dUL,
  SM3_IV_H=0xb0fb0e4eUL,

  // SM3 Tj round constants
  // SM3_T00..SM3_T15: 0x79CC4519 rotated left by the round index
  SM3_T00=0x79CC4519UL,
  SM3_T01=0xF3988A32UL,
  SM3_T02=0xE7311465UL,
  SM3_T03=0xCE6228CBUL,
  SM3_T04=0x9CC45197UL,
  SM3_T05=0x3988A32FUL,
  SM3_T06=0x7311465EUL,
  SM3_T07=0xE6228CBCUL,
  SM3_T08=0xCC451979UL,
  SM3_T09=0x988A32F3UL,
  SM3_T10=0x311465E7UL,
  SM3_T11=0x6228CBCEUL,
  SM3_T12=0xC451979CUL,
  SM3_T13=0x88A32F39UL,
  SM3_T14=0x11465E73UL,
  SM3_T15=0x228CBCE6UL,
  // SM3_T16..SM3_T63: 0x7A879D8A rotated left by (round index mod 32);
  // the sequence therefore repeats from SM3_T48 on
  SM3_T16=0x9D8A7A87UL,
  SM3_T17=0x3B14F50FUL,
  SM3_T18=0x7629EA1EUL,
  SM3_T19=0xEC53D43CUL,
  SM3_T20=0xD8A7A879UL,
  SM3_T21=0xB14F50F3UL,
  SM3_T22=0x629EA1E7UL,
  SM3_T23=0xC53D43CEUL,
  SM3_T24=0x8A7A879DUL,
  SM3_T25=0x14F50F3BUL,
  SM3_T26=0x29EA1E76UL,
  SM3_T27=0x53D43CECUL,
  SM3_T28=0xA7A879D8UL,
  SM3_T29=0x4F50F3B1UL,
  SM3_T30=0x9EA1E762UL,
  SM3_T31=0x3D43CEC5UL,
  SM3_T32=0x7A879D8AUL,
  SM3_T33=0xF50F3B14UL,
  SM3_T34=0xEA1E7629UL,
  SM3_T35=0xD43CEC53UL,
  SM3_T36=0xA879D8A7UL,
  SM3_T37=0x50F3B14FUL,
  SM3_T38=0xA1E7629EUL,
  SM3_T39=0x43CEC53DUL,
  SM3_T40=0x879D8A7AUL,
  SM3_T41=0x0F3B14F5UL,
  SM3_T42=0x1E7629EAUL,
  SM3_T43=0x3CEC53D4UL,
  SM3_T44=0x79D8A7A8UL,
  SM3_T45=0xF3B14F50UL,
  SM3_T46=0xE7629EA1UL,
  SM3_T47=0xCEC53D43UL,
  SM3_T48=0x9D8A7A87UL,
  SM3_T49=0x3B14F50FUL,
  SM3_T50=0x7629EA1EUL,
  SM3_T51=0xEC53D43CUL,
  SM3_T52=0xD8A7A879UL,
  SM3_T53=0xB14F50F3UL,
  SM3_T54=0x629EA1E7UL,
  SM3_T55=0xC53D43CEUL,
  SM3_T56=0x8A7A879DUL,
  SM3_T57=0x14F50F3BUL,
  SM3_T58=0x29EA1E76UL,
  SM3_T59=0x53D43CECUL,
  SM3_T60=0xA7A879D8UL,
  SM3_T61=0x4F50F3B1UL,
  SM3_T62=0x9EA1E762UL,
  SM3_T63=0x3D43CEC5UL

} sm3_constants_t;
/**
 * Author......: See docs/credits.txt
 * License.....: MIT
 */

//#define NEW_SIMD_CODE

#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_rp.h)
#include M2S(INCLUDE_PATH/inc_rp.cl)
#include M2S(INCLUDE_PATH/inc_scalar.cl)
#include M2S(INCLUDE_PATH/inc_hash_sm3.cl)
#endif

// Rule-based kernel, comparison done through COMPARE_M_SCALAR
// (presumably the multi-hash variant - confirm against inc_common):
// for every rule, the rule is applied to a copy of the base word, the
// result is hashed with SM3 and four digest words are handed to the
// comparison macro.
KERNEL_FQ void m36000_mxx (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  // work items beyond the number of loaded candidates have nothing to do
  if (gid >= GID_CNT) return;

  /**
   * base
   */

  COPY_PW (pws[gid]);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    // fresh copy of the base word for every rule
    pw_t tmp = PASTE_PW;

    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);

    sm3_ctx_t ctx;

    sm3_init (&ctx);

    // _swap variant: the candidate words are byte-swapped on the way in
    sm3_update_swap (&ctx, tmp.i, tmp.pw_len);

    sm3_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}

// Rule-based kernel, comparison done through COMPARE_S_SCALAR
// (presumably the single-hash variant - confirm against inc_common):
// same candidate generation and hashing as m36000_mxx; the four words of
// the target digest are additionally loaded into search[] up front.
KERNEL_FQ void m36000_sxx (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  // work items beyond the number of loaded candidates have nothing to do
  if (gid >= GID_CNT) return;

  /**
   * digest
   */

  // not referenced by name below; presumably consumed by COMPARE_S_SCALAR - confirm
  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  COPY_PW (pws[gid]);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    // fresh copy of the base word for every rule
    pw_t tmp = PASTE_PW;

    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);

    sm3_ctx_t ctx;

    sm3_init (&ctx);

    // _swap variant: the candidate words are byte-swapped on the way in
    sm3_update_swap (&ctx, tmp.i, tmp.pw_len);

    sm3_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}
/**
 * Author......: See docs/credits.txt
 * License.....: MIT
 */

#include "common.h"
#include "types.h"
#include "modules.h"
#include "bitops.h"
#include "convert.h"
#include "shared.h"

// Static description of hash-mode 36000 (raw SM3). Each value is exposed
// to the core through the one-line getter of the same name below.
static const u32   ATTACK_EXEC    = ATTACK_EXEC_INSIDE_KERNEL;
// indexes of the four digest words reported as DGST_R0..DGST_R3
static const u32   DGST_POS0      = 0;
static const u32   DGST_POS1      = 1;
static const u32   DGST_POS2      = 2;
static const u32   DGST_POS3      = 3;
static const u32   DGST_SIZE      = DGST_SIZE_4_8;
static const u32   HASH_CATEGORY  = HASH_CATEGORY_RAW_HASH;
static const char *HASH_NAME      = "SM3";
static const u64   KERN_TYPE      = 36000;
static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
                                  | OPTI_TYPE_PRECOMPUTE_INIT
                                  | OPTI_TYPE_EARLY_SKIP
                                  | OPTI_TYPE_NOT_ITERATED
                                  | OPTI_TYPE_NOT_SALTED
                                  | OPTI_TYPE_RAW_HASH;
static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                  | OPTS_TYPE_PT_GENERATE_BE
                                  | OPTS_TYPE_PT_ADD80
                                  | OPTS_TYPE_PT_ADDBITS15;
static const u32   SALT_TYPE      = SALT_TYPE_NONE;
// self-test pair: ST_HASH is expected to be the SM3 digest of ST_PASS
static const char *ST_PASS        = "hashcat";
static const char *ST_HASH        = "51227e48ea74827b77fc142c3ec21d25cc42c794e6ac422825cd47ad4ac7913d";

// Getters: every function ignores its arguments and returns the constant above.
u32         module_attack_exec    (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ATTACK_EXEC;     }
u32         module_dgst_pos0      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS0;       }
u32         module_dgst_pos1      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS1;       }
u32         module_dgst_pos2      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS2;       }
u32         module_dgst_pos3      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS3;       }
u32         module_dgst_size      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_SIZE;       }
u32         module_hash_category  (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return HASH_CATEGORY;   }
const char *module_hash_name      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return HASH_NAME;       }
u64         module_kern_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return KERN_TYPE;       }
u32         module_opti_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTI_TYPE;       }
u64         module_opts_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTS_TYPE;       }
u32         module_salt_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return SALT_TYPE;       }
const char *module_st_hash        (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH;         }
const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS;         }
return SALT_TYPE; } +const char *module_st_hash (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH; } +const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS; } + +int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len) +{ + u32 *digest = (u32 *) digest_buf; + + hc_token_t token; + + token.token_cnt = 1; + + token.len_min[0] = 64; + token.len_max[0] = 64; + token.attr[0] = TOKEN_ATTR_VERIFY_LENGTH + | TOKEN_ATTR_VERIFY_HEX; + + const int rc_tokenizer = input_tokenizer ((const u8 *) line_buf, line_len, &token); + + if (rc_tokenizer != PARSER_OK) return (rc_tokenizer); + + const u8 *hash_pos = token.buf[0]; + + digest[0] = hex_to_u32 (hash_pos + 0); + digest[1] = hex_to_u32 (hash_pos + 8); + digest[2] = hex_to_u32 (hash_pos + 16); + digest[3] = hex_to_u32 (hash_pos + 24); + digest[4] = hex_to_u32 (hash_pos + 32); + digest[5] = hex_to_u32 (hash_pos + 40); + digest[6] = hex_to_u32 (hash_pos + 48); + digest[7] = hex_to_u32 (hash_pos + 56); + + digest[0] = byte_swap_32 (digest[0]); + digest[1] = byte_swap_32 (digest[1]); + digest[2] = byte_swap_32 (digest[2]); + digest[3] = byte_swap_32 (digest[3]); + digest[4] = byte_swap_32 (digest[4]); + digest[5] = byte_swap_32 (digest[5]); + digest[6] = byte_swap_32 (digest[6]); + digest[7] = byte_swap_32 (digest[7]); + /* + if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) + { + digest[0] -= SHA256M_A; + digest[1] -= SHA256M_B; + digest[2] -= SHA256M_C; + digest[3] -= SHA256M_D; + digest[4] -= SHA256M_E; + digest[5] -= SHA256M_F; + digest[6] 
-= SHA256M_G; + digest[7] -= SHA256M_H; + } + */ + return (PARSER_OK); +} + +int module_hash_encode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const void *digest_buf, MAYBE_UNUSED const salt_t *salt, MAYBE_UNUSED const void *esalt_buf, MAYBE_UNUSED const void *hook_salt_buf, MAYBE_UNUSED const hashinfo_t *hash_info, char *line_buf, MAYBE_UNUSED const int line_size) +{ + const u32 *digest = (const u32 *) digest_buf; + + // we can not change anything in the original buffer, otherwise destroying sorting + // therefore create some local buffer + + u32 tmp[8]; + + tmp[0] = digest[0]; + tmp[1] = digest[1]; + tmp[2] = digest[2]; + tmp[3] = digest[3]; + tmp[4] = digest[4]; + tmp[5] = digest[5]; + tmp[6] = digest[6]; + tmp[7] = digest[7]; + /* + if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) + { + tmp[0] += SHA256M_A; + tmp[1] += SHA256M_B; + tmp[2] += SHA256M_C; + tmp[3] += SHA256M_D; + tmp[4] += SHA256M_E; + tmp[5] += SHA256M_F; + tmp[6] += SHA256M_G; + tmp[7] += SHA256M_H; + } + */ + tmp[0] = byte_swap_32 (tmp[0]); + tmp[1] = byte_swap_32 (tmp[1]); + tmp[2] = byte_swap_32 (tmp[2]); + tmp[3] = byte_swap_32 (tmp[3]); + tmp[4] = byte_swap_32 (tmp[4]); + tmp[5] = byte_swap_32 (tmp[5]); + tmp[6] = byte_swap_32 (tmp[6]); + tmp[7] = byte_swap_32 (tmp[7]); + + u8 *out_buf = (u8 *) line_buf; + + u32_to_hex (tmp[0], out_buf + 0); + u32_to_hex (tmp[1], out_buf + 8); + u32_to_hex (tmp[2], out_buf + 16); + u32_to_hex (tmp[3], out_buf + 24); + u32_to_hex (tmp[4], out_buf + 32); + u32_to_hex (tmp[5], out_buf + 40); + u32_to_hex (tmp[6], out_buf + 48); + u32_to_hex (tmp[7], out_buf + 56); + + const int out_len = 64; + + return out_len; +} + +void module_init (module_ctx_t *module_ctx) +{ + module_ctx->module_context_size = MODULE_CONTEXT_SIZE_CURRENT; + module_ctx->module_interface_version = MODULE_INTERFACE_VERSION_CURRENT; + + module_ctx->module_attack_exec = module_attack_exec; + module_ctx->module_benchmark_esalt = MODULE_DEFAULT; + 
module_ctx->module_benchmark_hook_salt = MODULE_DEFAULT; + module_ctx->module_benchmark_mask = MODULE_DEFAULT; + module_ctx->module_benchmark_charset = MODULE_DEFAULT; + module_ctx->module_benchmark_salt = MODULE_DEFAULT; + module_ctx->module_build_plain_postprocess = MODULE_DEFAULT; + module_ctx->module_deep_comp_kernel = MODULE_DEFAULT; + module_ctx->module_deprecated_notice = MODULE_DEFAULT; + module_ctx->module_dgst_pos0 = module_dgst_pos0; + module_ctx->module_dgst_pos1 = module_dgst_pos1; + module_ctx->module_dgst_pos2 = module_dgst_pos2; + module_ctx->module_dgst_pos3 = module_dgst_pos3; + module_ctx->module_dgst_size = module_dgst_size; + module_ctx->module_dictstat_disable = MODULE_DEFAULT; + module_ctx->module_esalt_size = MODULE_DEFAULT; + module_ctx->module_extra_buffer_size = MODULE_DEFAULT; + module_ctx->module_extra_tmp_size = MODULE_DEFAULT; + module_ctx->module_extra_tuningdb_block = MODULE_DEFAULT; + module_ctx->module_forced_outfile_format = MODULE_DEFAULT; + module_ctx->module_hash_binary_count = MODULE_DEFAULT; + module_ctx->module_hash_binary_parse = MODULE_DEFAULT; + module_ctx->module_hash_binary_save = MODULE_DEFAULT; + module_ctx->module_hash_decode_postprocess = MODULE_DEFAULT; + module_ctx->module_hash_decode_potfile = MODULE_DEFAULT; + module_ctx->module_hash_decode_zero_hash = MODULE_DEFAULT; + module_ctx->module_hash_decode = module_hash_decode; + module_ctx->module_hash_encode_status = MODULE_DEFAULT; + module_ctx->module_hash_encode_potfile = MODULE_DEFAULT; + module_ctx->module_hash_encode = module_hash_encode; + module_ctx->module_hash_init_selftest = MODULE_DEFAULT; + module_ctx->module_hash_mode = MODULE_DEFAULT; + module_ctx->module_hash_category = module_hash_category; + module_ctx->module_hash_name = module_hash_name; + module_ctx->module_hashes_count_min = MODULE_DEFAULT; + module_ctx->module_hashes_count_max = MODULE_DEFAULT; + module_ctx->module_hlfmt_disable = MODULE_DEFAULT; + module_ctx->module_hook_extra_param_size = 
MODULE_DEFAULT; + module_ctx->module_hook_extra_param_init = MODULE_DEFAULT; + module_ctx->module_hook_extra_param_term = MODULE_DEFAULT; + module_ctx->module_hook12 = MODULE_DEFAULT; + module_ctx->module_hook23 = MODULE_DEFAULT; + module_ctx->module_hook_salt_size = MODULE_DEFAULT; + module_ctx->module_hook_size = MODULE_DEFAULT; + module_ctx->module_jit_build_options = MODULE_DEFAULT; + module_ctx->module_jit_cache_disable = MODULE_DEFAULT; + module_ctx->module_kernel_accel_max = MODULE_DEFAULT; + module_ctx->module_kernel_accel_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_max = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; + module_ctx->module_kernel_threads_min = MODULE_DEFAULT; + module_ctx->module_kern_type = module_kern_type; + module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; + module_ctx->module_opti_type = module_opti_type; + module_ctx->module_opts_type = module_opts_type; + module_ctx->module_outfile_check_disable = MODULE_DEFAULT; + module_ctx->module_outfile_check_nocomp = MODULE_DEFAULT; + module_ctx->module_potfile_custom_check = MODULE_DEFAULT; + module_ctx->module_potfile_disable = MODULE_DEFAULT; + module_ctx->module_potfile_keep_all_hashes = MODULE_DEFAULT; + module_ctx->module_pwdump_column = MODULE_DEFAULT; + module_ctx->module_pw_max = MODULE_DEFAULT; + module_ctx->module_pw_min = MODULE_DEFAULT; + module_ctx->module_salt_max = MODULE_DEFAULT; + module_ctx->module_salt_min = MODULE_DEFAULT; + module_ctx->module_salt_type = module_salt_type; + module_ctx->module_separator = MODULE_DEFAULT; + module_ctx->module_st_hash = module_st_hash; + module_ctx->module_st_pass = module_st_pass; + module_ctx->module_tmp_size = MODULE_DEFAULT; + module_ctx->module_unstable_warning = MODULE_DEFAULT; + module_ctx->module_warmup_disable = MODULE_DEFAULT; +}