Workaround for -m 22100 on NVIDIA

2025-03-04 18:23:02 +01:00 · 2020-03-18 16:13:57 +01:00 · 2020-03-18 16:13:57 +01:00 · 24094793da
commit 24094793da
parent a6cf7caf4a
1 changed files with 27 additions and 3 deletions
--- a/OpenCL/m22100-pure.cl
+++ b/OpenCL/m22100-pure.cl
@ -265,11 +265,35 @@ KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
  {
    #ifdef REAL_SHM

-    for (u32 i = lid; i < FIXED_ITER_INCR; i += lsz)
+    /**
+     * On NVIDIA, the __sync_threads() is not working as expected if called from inside a loop.
+     * This is a unique situation across all hashcat kernels so far.
+     * From CUDA manual:
+     *   __syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block,
+     *   otherwise the code execution is likely to hang or produce unintended side effects.
+     * NVIDIA OpenCL runtime is also affected, but other OpenCL runtimes work as they should.
+     * An workaround exists by disabling shared memory access. Speed drop is around 4%.
+     * Another workaround is to let only a single thread do all the work while all other threads wait for it to finish. Speed drop is around 0.05%.
+
+       // original code
+       for (int i = lid; i < FIXED_ITER_INCR; i += lsz)
+       {
+         for (int j = 0; j < 48; j++) // first 16 set to register
+         {
+           s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + t + i][j];
+         }
+       }
+
+     */
+
+    if (lid == 0)
    {
-      for (u32 j = 0; j < 48; j++) // first 16 set to register
+      for (int i = 0; i < FIXED_ITER_INCR; i++)
      {
-        s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + t + i][j];
+        for (int j = 0; j < 48; j++) // first 16 set to register
+        {
+          s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + t + i][j];
+        }
      }
    }