Test convert for -m 1000 with -a 0 for SIMD, speed is now on par or faster than hashcat

2024-12-27 05:13:45 +01:00 · 2016-01-21 16:47:38 +01:00 · 2016-01-21 16:47:38 +01:00 · 4c0e520fd8
commit 4c0e520fd8
parent add18eaa6d
5 changed files with 183 additions and 95 deletions
--- a/OpenCL/common.c
+++ b/OpenCL/common.c
@ -2657,7 +2657,7 @@ static void append_0x80_1x4 (u32 w0[4], const u32 offset)
  }
 }

-static void append_0x80_2x4 (u32 w0[4], u32 w1[4], const u32 offset)
+static void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
 {
  switch (offset)
  {
--- a/OpenCL/m01000_a0.cl
+++ b/OpenCL/m01000_a0.cl
@ -5,6 +5,8 @@

 #define _MD4_

+#define NEW_SIMD_CODE
+
 #include "include/constants.h"
 #include "include/kernel_vendor.h"

@ -18,9 +20,7 @@
 #include "OpenCL/common.c"
 #include "include/rp_kernel.h"
 #include "OpenCL/rp.c"
-
-#define COMPARE_S "OpenCL/check_single_comp4.c"
-#define COMPARE_M "OpenCL/check_multi_comp4.c"
+#include "OpenCL/simd.c"

 __kernel void m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
 {
@ -58,56 +58,33 @@ __kernel void m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
   * loop
   */

-  for (u32 il_pos = 0; il_pos < rules_cnt; il_pos++)
+  for (u32 il_pos = 0; il_pos < rules_cnt; il_pos += VECT_SIZE)
  {
-    u32 w0[4];
+    u32x w0[4] = { 0 };
+    u32x w1[4] = { 0 };
+    u32x w2[4] = { 0 };
+    u32x w3[4] = { 0 };

-    w0[0] = pw_buf0[0];
-    w0[1] = pw_buf0[1];
-    w0[2] = pw_buf0[2];
-    w0[3] = pw_buf0[3];
-
-    u32 w1[4];
-
-    w1[0] = pw_buf1[0];
-    w1[1] = pw_buf1[1];
-    w1[2] = pw_buf1[2];
-    w1[3] = pw_buf1[3];
-
-    u32 w2[4];
-
-    w2[0] = 0;
-    w2[1] = 0;
-    w2[2] = 0;
-    w2[3] = 0;
-
-    u32 w3[4];
-
-    w3[0] = 0;
-    w3[1] = 0;
-    w3[2] = 0;
-    w3[3] = 0;
-
-    const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len);
+    const u32 out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);

    append_0x80_2x4 (w0, w1, out_len);

-    u32 w0_t[4];
-    u32 w1_t[4];
-    u32 w2_t[4];
-    u32 w3_t[4];
+    u32x w0_t[4];
+    u32x w1_t[4];
+    u32x w2_t[4];
+    u32x w3_t[4];

    make_unicode (w0, w0_t, w1_t);
    make_unicode (w1, w2_t, w3_t);

    w3_t[2] = out_len * 8 * 2;

-    u32 tmp2;
+    u32x tmp2;

-    u32 a = MD4M_A;
-    u32 b = MD4M_B;
-    u32 c = MD4M_C;
-    u32 d = MD4M_D;
+    u32x a = MD4M_A;
+    u32x b = MD4M_B;
+    u32x c = MD4M_C;
+    u32x d = MD4M_D;

    MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00);
    MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01);
@ -160,12 +137,7 @@ __kernel void m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
    MD4_STEP (MD4_H1, c, d, a, b, w1_t[3], MD4C02, MD4S22);
    MD4_STEP (MD4_H2, b, c, d, a, w3_t[3], MD4C02, MD4S23);

-    const u32 r0 = a;
-    const u32 r1 = d;
-    const u32 r2 = c;
-    const u32 r3 = b;
-
-    #include COMPARE_M
+    COMPARE_M_SIMD (a, d, c, b);
  }
 }

@ -225,56 +197,33 @@ __kernel void m01000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
   * loop
   */

-  for (u32 il_pos = 0; il_pos < rules_cnt; il_pos++)
+  for (u32 il_pos = 0; il_pos < rules_cnt; il_pos += VECT_SIZE)
  {
-    u32 w0[4];
+    u32x w0[4] = { 0 };
+    u32x w1[4] = { 0 };
+    u32x w2[4] = { 0 };
+    u32x w3[4] = { 0 };

-    w0[0] = pw_buf0[0];
-    w0[1] = pw_buf0[1];
-    w0[2] = pw_buf0[2];
-    w0[3] = pw_buf0[3];
-
-    u32 w1[4];
-
-    w1[0] = pw_buf1[0];
-    w1[1] = pw_buf1[1];
-    w1[2] = pw_buf1[2];
-    w1[3] = pw_buf1[3];
-
-    u32 w2[4];
-
-    w2[0] = 0;
-    w2[1] = 0;
-    w2[2] = 0;
-    w2[3] = 0;
-
-    u32 w3[4];
-
-    w3[0] = 0;
-    w3[1] = 0;
-    w3[2] = 0;
-    w3[3] = 0;
-
-    const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len);
+    const u32 out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);

    append_0x80_2x4 (w0, w1, out_len);

-    u32 w0_t[4];
-    u32 w1_t[4];
-    u32 w2_t[4];
-    u32 w3_t[4];
+    u32x w0_t[4];
+    u32x w1_t[4];
+    u32x w2_t[4];
+    u32x w3_t[4];

    make_unicode (w0, w0_t, w1_t);
    make_unicode (w1, w2_t, w3_t);

    w3_t[2] = out_len * 8 * 2;

-    u32 tmp2;
+    u32x tmp2;

-    u32 a = MD4M_A;
-    u32 b = MD4M_B;
-    u32 c = MD4M_C;
-    u32 d = MD4M_D;
+    u32x a = MD4M_A;
+    u32x b = MD4M_B;
+    u32x c = MD4M_C;
+    u32x d = MD4M_D;

    MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00);
    MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01);
@ -323,16 +272,14 @@ __kernel void m01000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
    MD4_STEP (MD4_H1, c, d, a, b, w1_t[1], MD4C02, MD4S22);
    MD4_STEP (MD4_H2, b, c, d, a, w3_t[1], MD4C02, MD4S23);
    MD4_STEP (MD4_H1, a, b, c, d, w0_t[3], MD4C02, MD4S20);
+
+    if (MATCHES_NONE_VS (a, search[0])) continue;
+
    MD4_STEP (MD4_H2, d, a, b, c, w2_t[3], MD4C02, MD4S21);
    MD4_STEP (MD4_H1, c, d, a, b, w1_t[3], MD4C02, MD4S22);
    MD4_STEP (MD4_H2, b, c, d, a, w3_t[3], MD4C02, MD4S23);

-    const u32 r0 = a;
-    const u32 r1 = d;
-    const u32 r2 = c;
-    const u32 r3 = b;
-
-    #include COMPARE_S
+    COMPARE_S_SIMD (a, d, c, b);
  }
 }

--- a/OpenCL/rp.c
+++ b/OpenCL/rp.c
@ -4083,7 +4083,7 @@ u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf
  return out_len;
 }

-u32 apply_rules (__global u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len)
+u32 apply_rules (const __global u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len)
 {
  u32 out_len = len;

@ -4100,3 +4100,144 @@ u32 apply_rules (__global u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len)

  return out_len;
 }
+
+u32 apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, const __global kernel_rule_t *rules_buf, const u32 il_pos, u32x w0[4], u32x w1[4])
+{
+  #if VECT_SIZE == 1
+
+  w0[0] = pw_buf0[0];
+  w0[1] = pw_buf0[1];
+  w0[2] = pw_buf0[2];
+  w0[3] = pw_buf0[3];
+  w1[0] = pw_buf1[0];
+  w1[1] = pw_buf1[1];
+  w1[2] = pw_buf1[2];
+  w1[3] = pw_buf1[3];
+
+  return apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len);
+
+  #else
+
+  u32 out_len = 0;
+
+  #pragma unroll
+  for (int i = 0; i < VECT_SIZE; i++)
+  {
+    u32 tmp0[4];
+    u32 tmp1[4];
+
+    tmp0[0] = pw_buf0[0];
+    tmp0[1] = pw_buf0[1];
+    tmp0[2] = pw_buf0[2];
+    tmp0[3] = pw_buf0[3];
+    tmp1[0] = pw_buf1[0];
+    tmp1[1] = pw_buf1[1];
+    tmp1[2] = pw_buf1[2];
+    tmp1[3] = pw_buf1[3];
+
+    out_len = apply_rules (rules_buf[il_pos + i].cmds, tmp0, tmp1, pw_len);
+
+    // it's guaranteed to have out_len always the same for each call in the loop
+
+    switch (i)
+    {
+      #if VECT_SIZE >= 2
+      case 0:
+        w0[0].s0 = tmp0[0];
+        w0[1].s0 = tmp0[1];
+        w0[2].s0 = tmp0[2];
+        w0[3].s0 = tmp0[3];
+        w1[0].s0 = tmp1[0];
+        w1[1].s0 = tmp1[1];
+        w1[2].s0 = tmp1[2];
+        w1[3].s0 = tmp1[3];
+        break;
+
+      case 1:
+        w0[0].s1 = tmp0[0];
+        w0[1].s1 = tmp0[1];
+        w0[2].s1 = tmp0[2];
+        w0[3].s1 = tmp0[3];
+        w1[0].s1 = tmp1[0];
+        w1[1].s1 = tmp1[1];
+        w1[2].s1 = tmp1[2];
+        w1[3].s1 = tmp1[3];
+        break;
+      #endif
+
+      #if VECT_SIZE >= 4
+      case 2:
+        w0[0].s2 = tmp0[0];
+        w0[1].s2 = tmp0[1];
+        w0[2].s2 = tmp0[2];
+        w0[3].s2 = tmp0[3];
+        w1[0].s2 = tmp1[0];
+        w1[1].s2 = tmp1[1];
+        w1[2].s2 = tmp1[2];
+        w1[3].s2 = tmp1[3];
+        break;
+
+      case 3:
+        w0[0].s3 = tmp0[0];
+        w0[1].s3 = tmp0[1];
+        w0[2].s3 = tmp0[2];
+        w0[3].s3 = tmp0[3];
+        w1[0].s3 = tmp1[0];
+        w1[1].s3 = tmp1[1];
+        w1[2].s3 = tmp1[2];
+        w1[3].s3 = tmp1[3];
+        break;
+      #endif
+
+      #if VECT_SIZE >= 8
+      case 4:
+        w0[0].s4 = tmp0[0];
+        w0[1].s4 = tmp0[1];
+        w0[2].s4 = tmp0[2];
+        w0[3].s4 = tmp0[3];
+        w1[0].s4 = tmp1[0];
+        w1[1].s4 = tmp1[1];
+        w1[2].s4 = tmp1[2];
+        w1[3].s4 = tmp1[3];
+        break;
+
+      case 5:
+        w0[0].s5 = tmp0[0];
+        w0[1].s5 = tmp0[1];
+        w0[2].s5 = tmp0[2];
+        w0[3].s5 = tmp0[3];
+        w1[0].s5 = tmp1[0];
+        w1[1].s5 = tmp1[1];
+        w1[2].s5 = tmp1[2];
+        w1[3].s5 = tmp1[3];
+        break;
+
+      case 6:
+        w0[0].s6 = tmp0[0];
+        w0[1].s6 = tmp0[1];
+        w0[2].s6 = tmp0[2];
+        w0[3].s6 = tmp0[3];
+        w1[0].s6 = tmp1[0];
+        w1[1].s6 = tmp1[1];
+        w1[2].s6 = tmp1[2];
+        w1[3].s6 = tmp1[3];
+        break;
+
+      case 7:
+        w0[0].s7 = tmp0[0];
+        w0[1].s7 = tmp0[1];
+        w0[2].s7 = tmp0[2];
+        w0[3].s7 = tmp0[3];
+        w1[0].s7 = tmp1[0];
+        w1[1].s7 = tmp1[1];
+        w1[2].s7 = tmp1[2];
+        w1[3].s7 = tmp1[3];
+        break;
+      #endif
+    }
+  }
+
+  return out_len;
+
+  #endif
+}
--- a/src/oclHashcat.c
+++ b/src/oclHashcat.c
@ -12583,7 +12583,7 @@ int main (int argc, char **argv)

        cl_uint vector_width;

-        if (attack_mode == ATTACK_MODE_BF)
+        if (1) // can be removed as soon as all kernel are migrated; if (attack_mode == ATTACK_MODE_BF)
        {
          if (opencl_vector_width == OPENCL_VECTOR_WIDTH)
          {
--- a/tools/test.sh
+++ b/tools/test.sh
@ -19,7 +19,7 @@ NEVER_CRACK="11600"

 SLOW_ALGOS="400 500 501 1600 1800 2100 2500 3200 5200 5800 6211 6221 6231 6241 6251 6261 6271 6281 6300 6400 6500 6600 6700 6800 7100 7200 7400 7900 8200 8800 8900 9000 9100 9200 9300 9400 9500 9600 10000 10300 10500 10700 10900 11300 11600 11900 12000 12100 12200 12300 12400 12500 12800 12900 13000"

-OPTS="--quiet --force --potfile-disable --runtime 200 --gpu-temp-disable --weak-hash-threshold=0 --opencl-device-types 1,2"
+OPTS="--quiet --force --potfile-disable --runtime 200 --gpu-temp-disable --weak-hash-threshold=0 --opencl-device-types 2"

 OUTD="test_$(date +%s)"