Manually unroll sha2 hashes

2024-11-02 20:39:22 +01:00 · 2019-05-11 23:15:58 +02:00 · 2019-05-11 23:15:58 +02:00 · fa9d073f9a
commit fa9d073f9a
parent 3ca3d1cc60
29 changed files with 225 additions and 0 deletions
--- a/OpenCL/inc_hash_sha224.cl
+++ b/OpenCL/inc_hash_sha224.cl
@ -104,6 +104,11 @@ DECLSPEC void sha224_transform (const u32 *w0, const u32 *w1, const u32 *w2, con

  ROUND_STEP_S (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND_S (); ROUND_STEP_S (16);
+  ROUND_EXPAND_S (); ROUND_STEP_S (32);
+  ROUND_EXPAND_S (); ROUND_STEP_S (48);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -111,6 +116,7 @@ DECLSPEC void sha224_transform (const u32 *w0, const u32 *w1, const u32 *w2, con
  {
    ROUND_EXPAND_S (); ROUND_STEP_S (i);
  }
+  #endif

  #undef ROUND_EXPAND_S
  #undef ROUND_STEP_S
--- a/OpenCL/inc_hash_sha256.cl
+++ b/OpenCL/inc_hash_sha256.cl
@ -104,6 +104,11 @@ DECLSPEC void sha256_transform (const u32 *w0, const u32 *w1, const u32 *w2, con

  ROUND_STEP_S (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND_S (); ROUND_STEP_S (16);
+  ROUND_EXPAND_S (); ROUND_STEP_S (32);
+  ROUND_EXPAND_S (); ROUND_STEP_S (48);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -111,6 +116,7 @@ DECLSPEC void sha256_transform (const u32 *w0, const u32 *w1, const u32 *w2, con
  {
    ROUND_EXPAND_S (); ROUND_STEP_S (i);
  }
+  #endif

  #undef ROUND_EXPAND_S
  #undef ROUND_STEP_S
--- a/OpenCL/inc_hash_sha384.cl
+++ b/OpenCL/inc_hash_sha384.cl
@ -108,6 +108,12 @@ DECLSPEC void sha384_transform (const u32 *w0, const u32 *w1, const u32 *w2, con

  ROUND_STEP_S (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND_S (); ROUND_STEP_S (16);
+  ROUND_EXPAND_S (); ROUND_STEP_S (32);
+  ROUND_EXPAND_S (); ROUND_STEP_S (48);
+  ROUND_EXPAND_S (); ROUND_STEP_S (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -115,6 +121,7 @@ DECLSPEC void sha384_transform (const u32 *w0, const u32 *w1, const u32 *w2, con
  {
    ROUND_EXPAND_S (); ROUND_STEP_S (i);
  }
+  #endif

  #undef ROUND_EXPAND_S
  #undef ROUND_STEP_S
--- a/OpenCL/inc_hash_sha512.cl
+++ b/OpenCL/inc_hash_sha512.cl
@ -108,6 +108,12 @@ DECLSPEC void sha512_transform (const u32 *w0, const u32 *w1, const u32 *w2, con

  ROUND_STEP_S (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND_S (); ROUND_STEP_S (16);
+  ROUND_EXPAND_S (); ROUND_STEP_S (32);
+  ROUND_EXPAND_S (); ROUND_STEP_S (48);
+  ROUND_EXPAND_S (); ROUND_STEP_S (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -115,6 +121,7 @@ DECLSPEC void sha512_transform (const u32 *w0, const u32 *w1, const u32 *w2, con
  {
    ROUND_EXPAND_S (); ROUND_STEP_S (i);
  }
+  #endif

  #undef ROUND_EXPAND_S
  #undef ROUND_STEP_S
--- a/OpenCL/m01700_a0-optimized.cl
+++ b/OpenCL/m01700_a0-optimized.cl
@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01700_a1-optimized.cl
+++ b/OpenCL/m01700_a1-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01700_a3-optimized.cl
+++ b/OpenCL/m01700_a3-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01710_a0-optimized.cl
+++ b/OpenCL/m01710_a0-optimized.cl
@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01710_a1-optimized.cl
+++ b/OpenCL/m01710_a1-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01710_a3-optimized.cl
+++ b/OpenCL/m01710_a3-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01720_a0-optimized.cl
+++ b/OpenCL/m01720_a0-optimized.cl
@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01720_a1-optimized.cl
+++ b/OpenCL/m01720_a1-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01720_a3-optimized.cl
+++ b/OpenCL/m01720_a3-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01730_a0-optimized.cl
+++ b/OpenCL/m01730_a0-optimized.cl
@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01730_a1-optimized.cl
+++ b/OpenCL/m01730_a1-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01730_a3-optimized.cl
+++ b/OpenCL/m01730_a3-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01740_a0-optimized.cl
+++ b/OpenCL/m01740_a0-optimized.cl
@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01740_a1-optimized.cl
+++ b/OpenCL/m01740_a1-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m01740_a3-optimized.cl
+++ b/OpenCL/m01740_a3-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m08000_a0-optimized.cl
+++ b/OpenCL/m08000_a0-optimized.cl
@ -86,6 +86,11 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -93,6 +98,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  digest[0] += a;
  digest[1] += b;
@ -137,6 +143,11 @@ DECLSPEC void sha256_transform_z (u32x *digest)

  ROUND_STEP_Z (0);

+  #ifdef IS_CUDA
+  ROUND_STEP_Z (16);
+  ROUND_STEP_Z (32);
+  ROUND_STEP_Z (48);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -144,6 +155,7 @@ DECLSPEC void sha256_transform_z (u32x *digest)
  {
    ROUND_STEP_Z (i);
  }
+  #endif

  digest[0] += a;
  digest[1] += b;
--- a/OpenCL/m08000_a1-optimized.cl
+++ b/OpenCL/m08000_a1-optimized.cl
@ -84,6 +84,11 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +96,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  digest[0] += a;
  digest[1] += b;
@ -135,6 +141,11 @@ DECLSPEC void sha256_transform_z (u32x *digest)

  ROUND_STEP_Z (0);

+  #ifdef IS_CUDA
+  ROUND_STEP_Z (16);
+  ROUND_STEP_Z (32);
+  ROUND_STEP_Z (48);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -142,6 +153,7 @@ DECLSPEC void sha256_transform_z (u32x *digest)
  {
    ROUND_STEP_Z (i);
  }
+  #endif

  digest[0] += a;
  digest[1] += b;
--- a/OpenCL/m08000_a3-optimized.cl
+++ b/OpenCL/m08000_a3-optimized.cl
@ -84,6 +84,11 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +96,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  digest[0] += a;
  digest[1] += b;
@ -135,6 +141,11 @@ DECLSPEC void sha256_transform_z (u32x *digest)

  ROUND_STEP_Z (0);

+  #ifdef IS_CUDA
+  ROUND_STEP_Z (16);
+  ROUND_STEP_Z (32);
+  ROUND_STEP_Z (48);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -142,6 +153,7 @@ DECLSPEC void sha256_transform_z (u32x *digest)
  {
    ROUND_STEP_Z (i);
  }
+  #endif

  digest[0] += a;
  digest[1] += b;
--- a/OpenCL/m10800_a0-optimized.cl
+++ b/OpenCL/m10800_a0-optimized.cl
@ -86,6 +86,12 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -93,6 +99,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m10800_a1-optimized.cl
+++ b/OpenCL/m10800_a1-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m10800_a3-optimized.cl
+++ b/OpenCL/m10800_a3-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m15000_a0-optimized.cl
+++ b/OpenCL/m15000_a0-optimized.cl
@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m15000_a1-optimized.cl
+++ b/OpenCL/m15000_a1-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/OpenCL/m15000_a3-optimized.cl
+++ b/OpenCL/m15000_a3-optimized.cl
@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32

  ROUND_STEP (0);

+  #ifdef IS_CUDA
+  ROUND_EXPAND (); ROUND_STEP (16);
+  ROUND_EXPAND (); ROUND_STEP (32);
+  ROUND_EXPAND (); ROUND_STEP (48);
+  ROUND_EXPAND (); ROUND_STEP (64);
+  #else
  #ifdef _unroll
  #pragma unroll
  #endif
@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
  {
    ROUND_EXPAND (); ROUND_STEP (i);
  }
+  #endif

  /* rev
  digest[0] += a;
--- a/src/autotune.c
+++ b/src/autotune.c
@ -47,6 +47,7 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
  return exec_msec_prev;
 }

+/*
 static double try_run_preferred (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops)
 {
  hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
@ -93,6 +94,7 @@ static double try_run_preferred (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *

  return exec_msec_prev;
 }
+*/

 static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 {
@ -261,6 +263,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
    const u32 kernel_accel_orig = kernel_accel;
    const u32 kernel_loops_orig = kernel_loops;

+    double exec_msec_prev = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
+
    for (int i = 1; i < STEPS_CNT; i++)
    {
      const u32 kernel_accel_try = kernel_accel_orig * (1u << i);
@ -272,6 +276,16 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
      if (kernel_loops_try > kernel_loops_max) continue;
      if (kernel_loops_try < kernel_loops_min) break;

+      // do a real test
+
+      const double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_try, kernel_loops_try);
+
+      if (exec_msec_prev < exec_msec) break;
+
+      exec_msec_prev = exec_msec;
+
+      // so far, so good! save
+
      kernel_accel = kernel_accel_try;
      kernel_loops = kernel_loops_try;

@ -299,6 +313,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
  // start finding best thread count is easier.
  // it's either the preferred or the maximum thread count

+  /*
  const u32 kernel_threads_min = device_param->kernel_threads_min;
  const u32 kernel_threads_max = device_param->kernel_threads_max;

@ -334,6 +349,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
      }
    }
  }
+  */

  if (device_param->is_cuda == true)
  {