From 15f35fa68c1d598eb9e56e700716b444a944b074 Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Wed, 21 Apr 2021 15:59:14 +0200
Subject: [PATCH] Scrypt Kernels: Reduced kernel wait times by making it a true
 split kernel where iteration count = N value

---
 OpenCL/inc_common.h        |   2 +
 OpenCL/inc_types.h         |   1 +
 OpenCL/m08900-pure.cl      | 132 ++++++++--
 OpenCL/m15700-pure.cl      | 130 ++++++++--
 OpenCL/m22700-pure.cl      | 130 ++++++++--
 docs/changes.txt           |   1 +
 hashcat.hctune             |  32 +--
 include/types.h            |  63 +++--
 src/backend.c              | 487 +++++++++++++++++++++++--------------
 src/modules/module_02500.c |   1 +
 src/modules/module_02501.c |   1 +
 src/modules/module_03200.c |   1 +
 src/modules/module_08900.c |  13 +-
 src/modules/module_09300.c |  16 +-
 src/modules/module_15700.c |  12 +-
 src/modules/module_16800.c |   1 +
 src/modules/module_16801.c |   1 +
 src/modules/module_22000.c |   1 +
 src/modules/module_22001.c |   1 +
 src/modules/module_22700.c |  11 +-
 20 files changed, 724 insertions(+), 313 deletions(-)

diff --git a/OpenCL/inc_common.h b/OpenCL/inc_common.h
index 9b3437326..86e3b7e7f 100644
--- a/OpenCL/inc_common.h
+++ b/OpenCL/inc_common.h
@@ -62,6 +62,7 @@
   MAYBE_UNUSED           const u32            digests_cnt,          \
   MAYBE_UNUSED           const u32            digests_offset_host,  \
   MAYBE_UNUSED           const u32            combs_mode,           \
+  MAYBE_UNUSED           const u32            salt_repeat,          \
   MAYBE_UNUSED           const u64            pws_pos,              \
   MAYBE_UNUSED           const u64            gid_max
 #else
@@ -100,6 +101,7 @@
   MAYBE_UNUSED           const u32            digests_cnt,          \
   MAYBE_UNUSED           const u32            digests_offset_host,  \
   MAYBE_UNUSED           const u32            combs_mode,           \
+  MAYBE_UNUSED           const u32            salt_repeat,          \
   MAYBE_UNUSED           const u64            pws_pos,              \
   MAYBE_UNUSED           const u64            gid_max
 #endif
diff --git a/OpenCL/inc_types.h b/OpenCL/inc_types.h
index 9a5173c54..a6b9ea85e 100644
--- a/OpenCL/inc_types.h
+++ b/OpenCL/inc_types.h
@@ -1642,6 +1642,7 @@ typedef struct salt
   u32 salt_iter;
   u32 salt_iter2;
   u32 salt_sign[2];
+  u32 salt_repeats;
 
   u32 orig_pos;
 
diff --git a/OpenCL/m08900-pure.cl b/OpenCL/m08900-pure.cl
index cb0077e17..ccae9bda7 100644
--- a/OpenCL/m08900-pure.cl
+++ b/OpenCL/m08900-pure.cl
@@ -170,14 +170,16 @@ DECLSPEC void salsa_r (uint4 *TI)
     TO[idx_r2++] = R3;
   }
 
+  #ifdef _unroll
   #pragma unroll
+  #endif
   for (int i = 0; i < STATE_CNT4; i++)
   {
     TI[i] = TO[i];
   }
 }
 
-DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
+DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
 {
   #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
   #define CO Coord(xd4,y,z)
@@ -200,9 +202,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
     case 3: V = V3; break;
   }
 
-  #ifdef _unroll
-  #pragma unroll
-  #endif
   for (u32 i = 0; i < STATE_CNT4; i += 4)
   {
     #ifdef IS_CUDA
@@ -230,7 +229,71 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
     for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X);
   }
 
-  for (u32 i = 0; i < SCRYPT_N; i++)
+  for (u32 i = 0; i < STATE_CNT4; i += 4)
+  {
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
+    T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
+    T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
+    T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
+    #else
+    T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
+    T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
+    T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
+    T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
+    #endif
+
+    X[i + 0] = T[0];
+    X[i + 1] = T[1];
+    X[i + 2] = T[2];
+    X[i + 3] = T[3];
+  }
+}
+
+DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
+{
+  #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
+  #define CO Coord(xd4,y,z)
+
+  const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO;
+  const u32 zSIZE = STATE_CNT4;
+
+  const u32 x = get_global_id (0);
+
+  const u32 xd4 = x / 4;
+  const u32 xm4 = x & 3;
+
+  GLOBAL_AS uint4 *V;
+
+  switch (xm4)
+  {
+    case 0: V = V0; break;
+    case 1: V = V1; break;
+    case 2: V = V2; break;
+    case 3: V = V3; break;
+  }
+
+  for (u32 i = 0; i < STATE_CNT4; i += 4)
+  {
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
+    T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
+    T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
+    T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
+    #else
+    T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
+    T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
+    T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
+    T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
+    #endif
+
+    X[i + 0] = T[0];
+    X[i + 1] = T[1];
+    X[i + 2] = T[2];
+    X[i + 3] = T[3];
+  }
+
+  for (u32 N_pos = 0; N_pos < 1024; N_pos++)
   {
     const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
 
@@ -247,9 +310,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
     salsa_r (X);
   }
 
-  #ifdef _unroll
-  #pragma unroll
-  #endif
   for (u32 i = 0; i < STATE_CNT4; i += 4)
   {
     #ifdef IS_CUDA
@@ -341,6 +401,41 @@ KERNEL_FQ void m08900_init (KERN_ATTR_TMPS (scrypt_tmp_t))
   }
 }
 
+KERNEL_FQ void m08900_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t))
+{
+  /**
+   * base: one work-item per gid; runs scrypt_smix_init on the P block chosen by salt_repeat
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= gid_max) return; // work-items at or past gid_max do nothing
+
+  // SCRYPT part, init V
+  // NOTE(review): V is presumably populated inside scrypt_smix_init through the four d_scrypt*_buf pointers - confirm there
+  GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf;
+  GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf;
+  GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf;
+  GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;
+  // X holds one STATE_CNT4-sized block of P; T is scratch space handed to scrypt_smix_init
+  uint4 X[STATE_CNT4];
+  uint4 T[STATE_CNT4];
+  // salt_repeat (kernel argument) indexes STATE_CNT4-sized blocks within tmps[gid].P
+  const u32 P_offset = salt_repeat * STATE_CNT4;
+  // load the selected block into X, passing each uint4 through hc_swap32_4
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
+
+  scrypt_smix_init (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
+  // write the updated block back to the same slot of P, again through hc_swap32_4
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
+}
+
 KERNEL_FQ void m08900_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
 {
   const u64 gid = get_global_id (0);
@@ -355,28 +450,19 @@ KERNEL_FQ void m08900_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
   uint4 X[STATE_CNT4];
   uint4 T[STATE_CNT4];
 
-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[z]);
-
-  scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
+  const u32 P_offset = salt_repeat * STATE_CNT4;
 
   #ifdef _unroll
   #pragma unroll
   #endif
-  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = hc_swap32_4 (X[z]);
+  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
 
-  #if SCRYPT_P >= 1
-  for (int i = STATE_CNT4; i < SCRYPT_CNT4; i += STATE_CNT4)
-  {
-    for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[i + z]);
+  scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
 
-    scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
-
-    for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[i + z] = hc_swap32_4 (X[z]);
-  }
+  #ifdef _unroll
+  #pragma unroll
   #endif
+  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
 }
 
 KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
diff --git a/OpenCL/m15700-pure.cl b/OpenCL/m15700-pure.cl
index c3e32fae9..d6b5d251f 100644
--- a/OpenCL/m15700-pure.cl
+++ b/OpenCL/m15700-pure.cl
@@ -184,7 +184,7 @@ DECLSPEC void salsa_r (uint4 *TI)
   }
 }
 
-DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
+DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
 {
   #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
   #define CO Coord(xd4,y,z)
@@ -207,9 +207,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
     case 3: V = V3; break;
   }
 
-  #ifdef _unroll
-  #pragma unroll
-  #endif
   for (u32 i = 0; i < STATE_CNT4; i += 4)
   {
     #ifdef IS_CUDA
@@ -237,7 +234,71 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
     for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X);
   }
 
-  for (u32 i = 0; i < SCRYPT_N; i++)
+  for (u32 i = 0; i < STATE_CNT4; i += 4)
+  {
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
+    T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
+    T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
+    T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
+    #else
+    T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
+    T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
+    T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
+    T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
+    #endif
+
+    X[i + 0] = T[0];
+    X[i + 1] = T[1];
+    X[i + 2] = T[2];
+    X[i + 3] = T[3];
+  }
+}
+
+DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
+{
+  #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
+  #define CO Coord(xd4,y,z)
+
+  const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO;
+  const u32 zSIZE = STATE_CNT4;
+
+  const u32 x = get_global_id (0);
+
+  const u32 xd4 = x / 4;
+  const u32 xm4 = x & 3;
+
+  GLOBAL_AS uint4 *V;
+
+  switch (xm4)
+  {
+    case 0: V = V0; break;
+    case 1: V = V1; break;
+    case 2: V = V2; break;
+    case 3: V = V3; break;
+  }
+
+  for (u32 i = 0; i < STATE_CNT4; i += 4)
+  {
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
+    T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
+    T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
+    T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
+    #else
+    T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
+    T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
+    T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
+    T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
+    #endif
+
+    X[i + 0] = T[0];
+    X[i + 1] = T[1];
+    X[i + 2] = T[2];
+    X[i + 3] = T[3];
+  }
+
+  for (u32 N_pos = 0; N_pos < 1024; N_pos++)
   {
     const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
 
@@ -254,9 +315,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
     salsa_r (X);
   }
 
-  #ifdef _unroll
-  #pragma unroll
-  #endif
   for (u32 i = 0; i < STATE_CNT4; i += 4)
   {
     #ifdef IS_CUDA
@@ -477,6 +535,41 @@ KERNEL_FQ void m15700_init (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
   }
 }
 
+KERNEL_FQ void m15700_loop_prepare (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t))
+{
+  /**
+   * base: one work-item per gid; runs scrypt_smix_init on the P block chosen by salt_repeat
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= gid_max) return; // work-items at or past gid_max do nothing
+
+  // SCRYPT part, init V
+  // NOTE(review): V is presumably populated inside scrypt_smix_init through the four d_scrypt*_buf pointers - confirm there
+  GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf;
+  GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf;
+  GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf;
+  GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;
+  // X holds one STATE_CNT4-sized block of P; T is scratch space handed to scrypt_smix_init
+  uint4 X[STATE_CNT4];
+  uint4 T[STATE_CNT4];
+  // salt_repeat (kernel argument) indexes STATE_CNT4-sized blocks within tmps[gid].P
+  const u32 P_offset = salt_repeat * STATE_CNT4;
+  // load the selected block into X, passing each uint4 through hc_swap32_4
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
+
+  scrypt_smix_init (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
+  // write the updated block back to the same slot of P, again through hc_swap32_4
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
+}
+
 KERNEL_FQ void m15700_loop (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t))
 {
   const u64 gid = get_global_id (0);
@@ -491,28 +584,19 @@ KERNEL_FQ void m15700_loop (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
   uint4 X[STATE_CNT4];
   uint4 T[STATE_CNT4];
 
-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[z]);
-
-  scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
+  const u32 P_offset = salt_repeat * STATE_CNT4;
 
   #ifdef _unroll
   #pragma unroll
   #endif
-  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = hc_swap32_4 (X[z]);
+  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
 
-  #if SCRYPT_P >= 1
-  for (int i = STATE_CNT4; i < SCRYPT_CNT4; i += STATE_CNT4)
-  {
-    for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[i + z]);
+  scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
 
-    scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
-
-    for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[i + z] = hc_swap32_4 (X[z]);
-  }
+  #ifdef _unroll
+  #pragma unroll
   #endif
+  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
 }
 
 KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t))
diff --git a/OpenCL/m22700-pure.cl b/OpenCL/m22700-pure.cl
index 0d0b50763..c9fb70d0e 100644
--- a/OpenCL/m22700-pure.cl
+++ b/OpenCL/m22700-pure.cl
@@ -225,7 +225,7 @@ DECLSPEC void salsa_r (uint4 *TI)
   }
 }
 
-DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
+DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
 {
   #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
   #define CO Coord(xd4,y,z)
@@ -248,9 +248,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
     case 3: V = V3; break;
   }
 
-  #ifdef _unroll
-  #pragma unroll
-  #endif
   for (u32 i = 0; i < STATE_CNT4; i += 4)
   {
     #ifdef IS_CUDA
@@ -278,7 +275,71 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
     for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X);
   }
 
-  for (u32 i = 0; i < SCRYPT_N; i++)
+  for (u32 i = 0; i < STATE_CNT4; i += 4)
+  {
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
+    T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
+    T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
+    T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
+    #else
+    T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
+    T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
+    T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
+    T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
+    #endif
+
+    X[i + 0] = T[0];
+    X[i + 1] = T[1];
+    X[i + 2] = T[2];
+    X[i + 3] = T[3];
+  }
+}
+
+DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
+{
+  #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
+  #define CO Coord(xd4,y,z)
+
+  const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO;
+  const u32 zSIZE = STATE_CNT4;
+
+  const u32 x = get_global_id (0);
+
+  const u32 xd4 = x / 4;
+  const u32 xm4 = x & 3;
+
+  GLOBAL_AS uint4 *V;
+
+  switch (xm4)
+  {
+    case 0: V = V0; break;
+    case 1: V = V1; break;
+    case 2: V = V2; break;
+    case 3: V = V3; break;
+  }
+
+  for (u32 i = 0; i < STATE_CNT4; i += 4)
+  {
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
+    T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
+    T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
+    T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
+    #else
+    T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
+    T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
+    T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
+    T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
+    #endif
+
+    X[i + 0] = T[0];
+    X[i + 1] = T[1];
+    X[i + 2] = T[2];
+    X[i + 3] = T[3];
+  }
+
+  for (u32 N_pos = 0; N_pos < 1024; N_pos++)
   {
     const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
 
@@ -295,9 +356,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
     salsa_r (X);
   }
 
-  #ifdef _unroll
-  #pragma unroll
-  #endif
   for (u32 i = 0; i < STATE_CNT4; i += 4)
   {
     #ifdef IS_CUDA
@@ -429,6 +487,41 @@ KERNEL_FQ void m22700_init (KERN_ATTR_TMPS (scrypt_tmp_t))
   }
 }
 
+KERNEL_FQ void m22700_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t))
+{
+  /**
+   * base: one work-item per gid; runs scrypt_smix_init on the P block chosen by salt_repeat
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= gid_max) return; // work-items at or past gid_max do nothing
+
+  // SCRYPT part, init V
+  // NOTE(review): V is presumably populated inside scrypt_smix_init through the four d_scrypt*_buf pointers - confirm there
+  GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf;
+  GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf;
+  GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf;
+  GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;
+  // X holds one STATE_CNT4-sized block of P; T is scratch space handed to scrypt_smix_init
+  uint4 X[STATE_CNT4];
+  uint4 T[STATE_CNT4];
+  // salt_repeat (kernel argument) indexes STATE_CNT4-sized blocks within tmps[gid].P
+  const u32 P_offset = salt_repeat * STATE_CNT4;
+  // load the selected block into X, passing each uint4 through hc_swap32_4
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
+
+  scrypt_smix_init (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
+  // write the updated block back to the same slot of P, again through hc_swap32_4
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
+}
+
 KERNEL_FQ void m22700_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
 {
   const u64 gid = get_global_id (0);
@@ -443,28 +536,19 @@ KERNEL_FQ void m22700_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
   uint4 X[STATE_CNT4];
   uint4 T[STATE_CNT4];
 
-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[z]);
-
-  scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
+  const u32 P_offset = salt_repeat * STATE_CNT4;
 
   #ifdef _unroll
   #pragma unroll
   #endif
-  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = hc_swap32_4 (X[z]);
+  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
 
-  #if SCRYPT_P >= 1
-  for (int i = STATE_CNT4; i < SCRYPT_CNT4; i += STATE_CNT4)
-  {
-    for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[i + z]);
+  scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
 
-    scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
-
-    for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[i + z] = hc_swap32_4 (X[z]);
-  }
+  #ifdef _unroll
+  #pragma unroll
   #endif
+  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
 }
 
 KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
diff --git a/docs/changes.txt b/docs/changes.txt
index f0adef031..f58707fd5 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -62,6 +62,7 @@
 - OpenCL Runtime: Workaround JiT compiler deadlock on NVIDIA driver >= 465.89
 - RAR3 Kernels: Improved loop code, improving performance by 23%
 - Startup time: Improved the startup time by avoiding some time intensive operations for skipped devices
+- Scrypt Kernels: Reduced kernel wait times by making them true split kernels where the iteration count equals the scrypt N value
 
 ##
 ## Technical
diff --git a/hashcat.hctune b/hashcat.hctune
index ee3446be1..4f14768c2 100644
--- a/hashcat.hctune
+++ b/hashcat.hctune
@@ -369,14 +369,14 @@ GeForce_GTX_TITAN                               3       9900    2       A
 ## SCRYPT
 ##
 
-DEVICE_TYPE_CPU                                 *       8900    1       N       1
-DEVICE_TYPE_GPU                                 *       8900    1       N       1
-DEVICE_TYPE_CPU                                 *       9300    1       N       1
-DEVICE_TYPE_GPU                                 *       9300    1       N       1
-DEVICE_TYPE_CPU                                 *       15700   1       N       1
-DEVICE_TYPE_GPU                                 *       15700   1       1       1
-DEVICE_TYPE_CPU                                 *       22700   1       N       1
-DEVICE_TYPE_GPU                                 *       22700   1       N       1
+DEVICE_TYPE_CPU                                 *       8900    1       N       A
+DEVICE_TYPE_GPU                                 *       8900    1       N       A
+DEVICE_TYPE_CPU                                 *       9300    1       N       A
+DEVICE_TYPE_GPU                                 *       9300    1       N       A
+DEVICE_TYPE_CPU                                 *       15700   1       N       A
+DEVICE_TYPE_GPU                                 *       15700   1       1       A
+DEVICE_TYPE_CPU                                 *       22700   1       N       A
+DEVICE_TYPE_GPU                                 *       22700   1       N       A
 
 ## Here's an example of how to manually tune SCRYPT algorithm kernels for your hardware.
 ## Manually tuning the GPU will yield increased performance. There is typically no noticeable change to CPU performance.
@@ -466,12 +466,12 @@ DEVICE_TYPE_GPU                                 *       22700   1       N
 ## Find the ideal -n value, then store it here along with the proper compute device name. 
 ## Formatting guidelines are availabe at the top of this document.
 
-GeForce_GTX_980                                 *       8900    1      28       1
-GeForce_GTX_980                                 *       9300    1     128       1
-GeForce_GTX_980                                 *       15700   1       1       1
-GeForce_GTX_980                                 *       22700   1      28       1
+GeForce_GTX_980                                 *       8900    1      28       A
+GeForce_GTX_980                                 *       9300    1     128       A
+GeForce_GTX_980                                 *       15700   1       1       A
+GeForce_GTX_980                                 *       22700   1      28       A
 
-GeForce_RTX_2080_Ti                             *       8900    1       N       1
-GeForce_RTX_2080_Ti                             *       9300    1     544       1
-GeForce_RTX_2080_Ti                             *       15700   1       4       1
-GeForce_RTX_2080_Ti                             *       22700   1       N       1
+GeForce_RTX_2080_Ti                             *       8900    1       N       A
+GeForce_RTX_2080_Ti                             *       9300    1     544       A
+GeForce_RTX_2080_Ti                             *       15700   1       4       A
+GeForce_RTX_2080_Ti                             *       22700   1       N       A
diff --git a/include/types.h b/include/types.h
index e3a31a643..29215f1f8 100644
--- a/include/types.h
+++ b/include/types.h
@@ -257,12 +257,14 @@ typedef enum kern_run
 {
   KERN_RUN_1      = 1000,
   KERN_RUN_12     = 1500,
+  KERN_RUN_2P     = 1999,
   KERN_RUN_2      = 2000,
   KERN_RUN_2E     = 2001,
   KERN_RUN_23     = 2500,
   KERN_RUN_3      = 3000,
   KERN_RUN_4      = 4000,
   KERN_RUN_INIT2  = 5000,
+  KERN_RUN_LOOP2P = 5999,
   KERN_RUN_LOOP2  = 6000,
   KERN_RUN_AUX1   = 7001,
   KERN_RUN_AUX2   = 7002,
@@ -412,30 +414,33 @@ typedef enum opts_type
   OPTS_TYPE_ST_BASE64         = (1ULL << 26),
   OPTS_TYPE_HASH_COPY         = (1ULL << 28),
   OPTS_TYPE_HASH_SPLIT        = (1ULL << 29),
-  OPTS_TYPE_LOOP_EXTENDED     = (1ULL << 30), // a kernel which is called each time normal _loop kernel finished.
+  OPTS_TYPE_LOOP_PREPARE      = (1ULL << 30), // a kernel which is called each time before _loop kernel started.
+                                              // like a hook12 kernel but without extra buffers.
+  OPTS_TYPE_LOOP_EXTENDED     = (1ULL << 31), // a kernel which is called each time normal _loop kernel finished.
                                               // but unlike a hook kernel this kernel is called for every _loop iteration offset
-  OPTS_TYPE_HOOK12            = (1ULL << 31),
-  OPTS_TYPE_HOOK23            = (1ULL << 32),
-  OPTS_TYPE_INIT2             = (1ULL << 33),
-  OPTS_TYPE_LOOP2             = (1ULL << 34),
-  OPTS_TYPE_AUX1              = (1ULL << 35),
-  OPTS_TYPE_AUX2              = (1ULL << 36),
-  OPTS_TYPE_AUX3              = (1ULL << 37),
-  OPTS_TYPE_AUX4              = (1ULL << 38),
-  OPTS_TYPE_BINARY_HASHFILE   = (1ULL << 39),
+  OPTS_TYPE_HOOK12            = (1ULL << 32),
+  OPTS_TYPE_HOOK23            = (1ULL << 33),
+  OPTS_TYPE_INIT2             = (1ULL << 34),
+  OPTS_TYPE_LOOP2_PREPARE     = (1ULL << 35), // same as OPTS_TYPE_LOOP_PREPARE but for loop2 kernel
+  OPTS_TYPE_LOOP2             = (1ULL << 36),
+  OPTS_TYPE_AUX1              = (1ULL << 37),
+  OPTS_TYPE_AUX2              = (1ULL << 38),
+  OPTS_TYPE_AUX3              = (1ULL << 39),
+  OPTS_TYPE_AUX4              = (1ULL << 40),
+  OPTS_TYPE_BINARY_HASHFILE   = (1ULL << 41),
   OPTS_TYPE_BINARY_HASHFILE_OPTIONAL
-                              = (1ULL << 40), // this allows us to not enforce the use of a binary file. requires OPTS_TYPE_BINARY_HASHFILE set to be effective.
-  OPTS_TYPE_PT_ADD06          = (1ULL << 41),
-  OPTS_TYPE_KEYBOARD_MAPPING  = (1ULL << 42),
-  OPTS_TYPE_DEEP_COMP_KERNEL  = (1ULL << 43), // if we have to iterate through each hash inside the comp kernel, for example if each hash has to be decrypted separately
-  OPTS_TYPE_TM_KERNEL         = (1ULL << 44),
-  OPTS_TYPE_SUGGEST_KG        = (1ULL << 45), // suggest keep guessing for modules the user maybe wants to use --keep-guessing
-  OPTS_TYPE_COPY_TMPS         = (1ULL << 46), // if we want to use data from tmps buffer (for example get the PMK in WPA)
-  OPTS_TYPE_POTFILE_NOPASS    = (1ULL << 47), // sometimes the password should not be printed to potfile
-  OPTS_TYPE_DYNAMIC_SHARED    = (1ULL << 48), // use dynamic shared memory (note: needs special kernel changes)
-  OPTS_TYPE_SELF_TEST_DISABLE = (1ULL << 49), // some algos use JiT in combinations with a salt or create too much startup time
-  OPTS_TYPE_MP_MULTI_DISABLE  = (1ULL << 50), // do not multiply the kernel-accel with the multiprocessor count per device to allow more fine-tuned workload settings
-  OPTS_TYPE_NATIVE_THREADS    = (1ULL << 51), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps)
+                              = (1ULL << 42), // this allows us to not enforce the use of a binary file. requires OPTS_TYPE_BINARY_HASHFILE set to be effective.
+  OPTS_TYPE_PT_ADD06          = (1ULL << 43),
+  OPTS_TYPE_KEYBOARD_MAPPING  = (1ULL << 44),
+  OPTS_TYPE_DEEP_COMP_KERNEL  = (1ULL << 45), // if we have to iterate through each hash inside the comp kernel, for example if each hash has to be decrypted separately
+  OPTS_TYPE_TM_KERNEL         = (1ULL << 46),
+  OPTS_TYPE_SUGGEST_KG        = (1ULL << 47), // suggest keep guessing for modules the user maybe wants to use --keep-guessing
+  OPTS_TYPE_COPY_TMPS         = (1ULL << 48), // if we want to use data from tmps buffer (for example get the PMK in WPA)
+  OPTS_TYPE_POTFILE_NOPASS    = (1ULL << 49), // sometimes the password should not be printed to potfile
+  OPTS_TYPE_DYNAMIC_SHARED    = (1ULL << 50), // use dynamic shared memory (note: needs special kernel changes)
+  OPTS_TYPE_SELF_TEST_DISABLE = (1ULL << 51), // some algos use JiT in combinations with a salt or create too much startup time
+  OPTS_TYPE_MP_MULTI_DISABLE  = (1ULL << 52), // do not multiply the kernel-accel with the multiprocessor count per device to allow more fine-tuned workload settings
+  OPTS_TYPE_NATIVE_THREADS    = (1ULL << 53), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps)
 
 } opts_type_t;
 
@@ -1094,12 +1099,14 @@ typedef struct hc_device_param
 
   u32     kernel_wgs1;
   u32     kernel_wgs12;
+  u32     kernel_wgs2p;
   u32     kernel_wgs2;
   u32     kernel_wgs2e;
   u32     kernel_wgs23;
   u32     kernel_wgs3;
   u32     kernel_wgs4;
   u32     kernel_wgs_init2;
+  u32     kernel_wgs_loop2p;
   u32     kernel_wgs_loop2;
   u32     kernel_wgs_mp;
   u32     kernel_wgs_mp_l;
@@ -1116,12 +1123,14 @@ typedef struct hc_device_param
 
   u32     kernel_preferred_wgs_multiple1;
   u32     kernel_preferred_wgs_multiple12;
+  u32     kernel_preferred_wgs_multiple2p;
   u32     kernel_preferred_wgs_multiple2;
   u32     kernel_preferred_wgs_multiple2e;
   u32     kernel_preferred_wgs_multiple23;
   u32     kernel_preferred_wgs_multiple3;
   u32     kernel_preferred_wgs_multiple4;
   u32     kernel_preferred_wgs_multiple_init2;
+  u32     kernel_preferred_wgs_multiple_loop2p;
   u32     kernel_preferred_wgs_multiple_loop2;
   u32     kernel_preferred_wgs_multiple_mp;
   u32     kernel_preferred_wgs_multiple_mp_l;
@@ -1138,12 +1147,14 @@ typedef struct hc_device_param
 
   u64     kernel_local_mem_size1;
   u64     kernel_local_mem_size12;
+  u64     kernel_local_mem_size2p;
   u64     kernel_local_mem_size2;
   u64     kernel_local_mem_size2e;
   u64     kernel_local_mem_size23;
   u64     kernel_local_mem_size3;
   u64     kernel_local_mem_size4;
   u64     kernel_local_mem_size_init2;
+  u64     kernel_local_mem_size_loop2p;
   u64     kernel_local_mem_size_loop2;
   u64     kernel_local_mem_size_mp;
   u64     kernel_local_mem_size_mp_l;
@@ -1160,12 +1171,14 @@ typedef struct hc_device_param
 
   u64     kernel_dynamic_local_mem_size1;
   u64     kernel_dynamic_local_mem_size12;
+  u64     kernel_dynamic_local_mem_size2p;
   u64     kernel_dynamic_local_mem_size2;
   u64     kernel_dynamic_local_mem_size2e;
   u64     kernel_dynamic_local_mem_size23;
   u64     kernel_dynamic_local_mem_size3;
   u64     kernel_dynamic_local_mem_size4;
   u64     kernel_dynamic_local_mem_size_init2;
+  u64     kernel_dynamic_local_mem_size_loop2p;
   u64     kernel_dynamic_local_mem_size_loop2;
   u64     kernel_dynamic_local_mem_size_mp;
   u64     kernel_dynamic_local_mem_size_mp_l;
@@ -1273,11 +1286,13 @@ typedef struct hc_device_param
   // workaround cpu spinning
 
   double  exec_us_prev1[EXPECTED_ITERATIONS];
+  double  exec_us_prev2p[EXPECTED_ITERATIONS];
   double  exec_us_prev2[EXPECTED_ITERATIONS];
   double  exec_us_prev2e[EXPECTED_ITERATIONS];
   double  exec_us_prev3[EXPECTED_ITERATIONS];
   double  exec_us_prev4[EXPECTED_ITERATIONS];
   double  exec_us_prev_init2[EXPECTED_ITERATIONS];
+  double  exec_us_prev_loop2p[EXPECTED_ITERATIONS];
   double  exec_us_prev_loop2[EXPECTED_ITERATIONS];
   double  exec_us_prev_aux1[EXPECTED_ITERATIONS];
   double  exec_us_prev_aux2[EXPECTED_ITERATIONS];
@@ -1378,12 +1393,14 @@ typedef struct hc_device_param
 
   CUfunction        cuda_function1;
   CUfunction        cuda_function12;
+  CUfunction        cuda_function2p;
   CUfunction        cuda_function2;
   CUfunction        cuda_function2e;
   CUfunction        cuda_function23;
   CUfunction        cuda_function3;
   CUfunction        cuda_function4;
   CUfunction        cuda_function_init2;
+  CUfunction        cuda_function_loop2p;
   CUfunction        cuda_function_loop2;
   CUfunction        cuda_function_mp;
   CUfunction        cuda_function_mp_l;
@@ -1462,12 +1479,14 @@ typedef struct hc_device_param
 
   cl_kernel         opencl_kernel1;
   cl_kernel         opencl_kernel12;
+  cl_kernel         opencl_kernel2p;
   cl_kernel         opencl_kernel2;
   cl_kernel         opencl_kernel2e;
   cl_kernel         opencl_kernel23;
   cl_kernel         opencl_kernel3;
   cl_kernel         opencl_kernel4;
   cl_kernel         opencl_kernel_init2;
+  cl_kernel         opencl_kernel_loop2p;
   cl_kernel         opencl_kernel_loop2;
   cl_kernel         opencl_kernel_mp;
   cl_kernel         opencl_kernel_mp_l;
diff --git a/src/backend.c b/src/backend.c
index 583d0712d..79c4944c9 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -2998,11 +2998,7 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
   }
   else
   {
-    bool run_init = true;
-    bool run_loop = true;
-    bool run_comp = true;
-
-    if (run_init == true)
+    if (true)
     {
       if (device_param->is_cuda == true)
       {
@@ -3089,165 +3085,190 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
       }
     }
 
-    if (run_loop == true)
+    if (true)
     {
-      u32 iter = hashes->salts_buf[salt_pos].salt_iter;
+      const u32 salt_repeats = hashes->salts_buf[salt_pos].salt_repeats;
 
-      u32 loop_step = device_param->kernel_loops;
-
-      for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
+      for (u32 salt_repeat = 0; salt_repeat <= salt_repeats; salt_repeat++)
       {
-        u32 loop_left = iter - loop_pos;
+        device_param->kernel_params_buf32[34] = salt_repeat;
 
-        loop_left = MIN (loop_left, loop_step);
-
-        device_param->kernel_params_buf32[28] = loop_pos;
-        device_param->kernel_params_buf32[29] = loop_left;
-
-        if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1;
-
-        if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED)
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
         {
-          if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2E, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1;
+          if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, pws_pos, pws_cnt, false, 0) == -1) return -1;
         }
 
-        //bug?
-        //while (status_ctx->run_thread_level2 == false) break;
-        if (status_ctx->run_thread_level2 == false) break;
-
-        /**
-         * speed
-         */
-
-        const float iter_part = (float) (loop_pos + loop_left) / iter;
-
-        const u64 perf_sum_all = (u64) (pws_cnt * iter_part);
-
-        double speed_msec = hc_timer_get (device_param->timer_speed);
-
-        const u32 speed_pos = device_param->speed_pos;
-
-        device_param->speed_cnt[speed_pos] = perf_sum_all;
-
-        device_param->speed_msec[speed_pos] = speed_msec;
-
-        if (user_options->speed_only == true)
+        if (true)
         {
-          if (speed_msec > 4000)
+          const u32 iter = hashes->salts_buf[salt_pos].salt_iter;
+
+          const u32 loop_step = device_param->kernel_loops;
+
+          for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
           {
-            device_param->outerloop_multi *= (double) iter / (double) (loop_pos + loop_left);
+            u32 loop_left = iter - loop_pos;
 
-            device_param->speed_pos = 1;
+            loop_left = MIN (loop_left, loop_step);
 
-            device_param->speed_only_finish = true;
+            device_param->kernel_params_buf32[28] = loop_pos;
+            device_param->kernel_params_buf32[29] = loop_left;
 
-            return 0;
+            if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1;
+
+            if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED)
+            {
+              if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2E, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1;
+            }
+
+            //bug?
+            //while (status_ctx->run_thread_level2 == false) break;
+            if (status_ctx->run_thread_level2 == false) break;
+
+            /**
+             * speed
+             */
+
+            const float iter_part = (float) (loop_pos + loop_left) / iter;
+
+            const u64 perf_sum_all = (u64) (pws_cnt * iter_part);
+
+            double speed_msec = hc_timer_get (device_param->timer_speed);
+
+            const u32 speed_pos = device_param->speed_pos;
+
+            device_param->speed_cnt[speed_pos] = perf_sum_all;
+
+            device_param->speed_msec[speed_pos] = speed_msec;
+
+            if (user_options->speed_only == true)
+            {
+              if (speed_msec > 4000)
+              {
+                device_param->outerloop_multi *= (double) iter / (double) (loop_pos + loop_left);
+
+                device_param->speed_pos = 1;
+
+                device_param->speed_only_finish = true;
+
+                return 0;
+              }
+            }
+          }
+
+          if (hashconfig->opts_type & OPTS_TYPE_HOOK23)
+          {
+            if (run_kernel (hashcat_ctx, device_param, KERN_RUN_23, pws_pos, pws_cnt, false, 0) == -1) return -1;
+
+            if (device_param->is_cuda == true)
+            {
+              if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
+            }
+
+            if (device_param->is_opencl == true)
+            {
+              if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
+            }
+
+            const int hook_threads = (int) user_options->hook_threads;
+
+            hook_thread_param_t *hook_threads_param = (hook_thread_param_t *) hccalloc (hook_threads, sizeof (hook_thread_param_t));
+
+            for (int i = 0; i < hook_threads; i++)
+            {
+              hook_thread_param_t *hook_thread_param = hook_threads_param + i;
+
+              hook_thread_param->tid = i;
+              hook_thread_param->tsz = hook_threads;
+
+              hook_thread_param->module_ctx = module_ctx;
+              hook_thread_param->status_ctx = status_ctx;
+
+              hook_thread_param->device_param = device_param;
+
+              hook_thread_param->hook_extra_param = module_ctx->hook_extra_params[i];
+              hook_thread_param->hook_salts_buf = hashes->hook_salts_buf;
+
+              hook_thread_param->salt_pos = salt_pos;
+
+              hook_thread_param->pws_cnt = pws_cnt;
+            }
+
+            hc_thread_t *c_threads = (hc_thread_t *) hccalloc (hook_threads, sizeof (hc_thread_t));
+
+            for (int i = 0; i < hook_threads; i++)
+            {
+              hook_thread_param_t *hook_thread_param = hook_threads_param + i;
+
+              hc_thread_create (c_threads[i], hook23_thread, hook_thread_param);
+            }
+
+            hc_thread_wait (hook_threads, c_threads);
+
+            hcfree (c_threads);
+
+            hcfree (hook_threads_param);
+
+            if (device_param->is_cuda == true)
+            {
+              if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1;
+            }
+
+            if (device_param->is_opencl == true)
+            {
+              if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
+            }
           }
         }
       }
-
-      if (hashconfig->opts_type & OPTS_TYPE_HOOK23)
-      {
-        if (run_kernel (hashcat_ctx, device_param, KERN_RUN_23, pws_pos, pws_cnt, false, 0) == -1) return -1;
-
-        if (device_param->is_cuda == true)
-        {
-          if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
-        }
-
-        if (device_param->is_opencl == true)
-        {
-          if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
-        }
-
-        const int hook_threads = (int) user_options->hook_threads;
-
-        hook_thread_param_t *hook_threads_param = (hook_thread_param_t *) hccalloc (hook_threads, sizeof (hook_thread_param_t));
-
-        for (int i = 0; i < hook_threads; i++)
-        {
-          hook_thread_param_t *hook_thread_param = hook_threads_param + i;
-
-          hook_thread_param->tid = i;
-          hook_thread_param->tsz = hook_threads;
-
-          hook_thread_param->module_ctx = module_ctx;
-          hook_thread_param->status_ctx = status_ctx;
-
-          hook_thread_param->device_param = device_param;
-
-          hook_thread_param->hook_extra_param = module_ctx->hook_extra_params[i];
-          hook_thread_param->hook_salts_buf = hashes->hook_salts_buf;
-
-          hook_thread_param->salt_pos = salt_pos;
-
-          hook_thread_param->pws_cnt = pws_cnt;
-        }
-
-        hc_thread_t *c_threads = (hc_thread_t *) hccalloc (hook_threads, sizeof (hc_thread_t));
-
-        for (int i = 0; i < hook_threads; i++)
-        {
-          hook_thread_param_t *hook_thread_param = hook_threads_param + i;
-
-          hc_thread_create (c_threads[i], hook23_thread, hook_thread_param);
-        }
-
-        hc_thread_wait (hook_threads, c_threads);
-
-        hcfree (c_threads);
-
-        hcfree (hook_threads_param);
-
-        if (device_param->is_cuda == true)
-        {
-          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1;
-        }
-
-        if (device_param->is_opencl == true)
-        {
-          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
-        }
-      }
     }
 
-    // init2 and loop2 are kind of special, we use run_loop for them, too
+    // note: they also do not influence the performance screen
+    // in case you want to use this, this can make sense only if your input data comes out of tmps[]
 
-    if (run_loop == true)
+    if (hashconfig->opts_type & OPTS_TYPE_INIT2)
     {
-      // note: they also do not influence the performance screen
-      // in case you want to use this, this cane make sense only if your input data comes out of tmps[]
+      if (run_kernel (hashcat_ctx, device_param, KERN_RUN_INIT2, pws_pos, pws_cnt, false, 0) == -1) return -1;
+    }
 
-      if (hashconfig->opts_type & OPTS_TYPE_INIT2)
+    if (true)
+    {
+      const u32 salt_repeats = hashes->salts_buf[salt_pos].salt_repeats;
+
+      for (u32 salt_repeat = 0; salt_repeat <= salt_repeats; salt_repeat++)
       {
-        if (run_kernel (hashcat_ctx, device_param, KERN_RUN_INIT2, pws_pos, pws_cnt, false, 0) == -1) return -1;
-      }
+        device_param->kernel_params_buf32[34] = salt_repeat;
 
-      if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
-      {
-        u32 iter = hashes->salts_buf[salt_pos].salt_iter2;
-
-        u32 loop_step = device_param->kernel_loops;
-
-        for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP2_PREPARE)
         {
-          u32 loop_left = iter - loop_pos;
+          if (run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2P, pws_pos, pws_cnt, false, 0) == -1) return -1;
+        }
 
-          loop_left = MIN (loop_left, loop_step);
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
+        {
+          u32 iter = hashes->salts_buf[salt_pos].salt_iter2;
 
-          device_param->kernel_params_buf32[28] = loop_pos;
-          device_param->kernel_params_buf32[29] = loop_left;
+          u32 loop_step = device_param->kernel_loops;
 
-          if (run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1;
+          for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
+          {
+            u32 loop_left = iter - loop_pos;
 
-          //bug?
-          //while (status_ctx->run_thread_level2 == false) break;
-          if (status_ctx->run_thread_level2 == false) break;
+            loop_left = MIN (loop_left, loop_step);
+
+            device_param->kernel_params_buf32[28] = loop_pos;
+            device_param->kernel_params_buf32[29] = loop_left;
+
+            if (run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1;
+
+            //bug?
+            //while (status_ctx->run_thread_level2 == false) break;
+            if (status_ctx->run_thread_level2 == false) break;
+          }
         }
       }
     }
 
-    if (run_comp == true)
+    if (true)
     {
       if (hashconfig->opts_type & OPTS_TYPE_DEEP_COMP_KERNEL)
       {
@@ -3525,6 +3546,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
       kernel_threads     = device_param->kernel_wgs12;
       dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size12;
       break;
+    case KERN_RUN_2P:
+      kernel_threads     = device_param->kernel_wgs2p;
+      dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size2p;
+      break;
     case KERN_RUN_2:
       kernel_threads     = device_param->kernel_wgs2;
       dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size2;
@@ -3549,6 +3574,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
       kernel_threads     = device_param->kernel_wgs_init2;
       dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_init2;
       break;
+    case KERN_RUN_LOOP2P:
+      kernel_threads     = device_param->kernel_wgs_loop2p;
+      dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_loop2p;
+      break;
     case KERN_RUN_LOOP2:
       kernel_threads     = device_param->kernel_wgs_loop2;
       dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_loop2;
@@ -3590,8 +3619,8 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
 
   kernel_threads = MIN (kernel_threads, device_param->kernel_threads);
 
-  device_param->kernel_params_buf64[34] = pws_pos;
-  device_param->kernel_params_buf64[35] = num;
+  device_param->kernel_params_buf64[35] = pws_pos;
+  device_param->kernel_params_buf64[36] = num;
 
   u64 num_elements = num;
 
@@ -3603,19 +3632,21 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
     {
       switch (kern_run)
       {
-        case KERN_RUN_1:      cuda_function = device_param->cuda_function1;      break;
-        case KERN_RUN_12:     cuda_function = device_param->cuda_function12;     break;
-        case KERN_RUN_2:      cuda_function = device_param->cuda_function2;      break;
-        case KERN_RUN_2E:     cuda_function = device_param->cuda_function2e;     break;
-        case KERN_RUN_23:     cuda_function = device_param->cuda_function23;     break;
-        case KERN_RUN_3:      cuda_function = device_param->cuda_function3;      break;
-        case KERN_RUN_4:      cuda_function = device_param->cuda_function4;      break;
-        case KERN_RUN_INIT2:  cuda_function = device_param->cuda_function_init2; break;
-        case KERN_RUN_LOOP2:  cuda_function = device_param->cuda_function_loop2; break;
-        case KERN_RUN_AUX1:   cuda_function = device_param->cuda_function_aux1;  break;
-        case KERN_RUN_AUX2:   cuda_function = device_param->cuda_function_aux2;  break;
-        case KERN_RUN_AUX3:   cuda_function = device_param->cuda_function_aux3;  break;
-        case KERN_RUN_AUX4:   cuda_function = device_param->cuda_function_aux4;  break;
+        case KERN_RUN_1:      cuda_function = device_param->cuda_function1;       break;
+        case KERN_RUN_12:     cuda_function = device_param->cuda_function12;      break;
+        case KERN_RUN_2P:     cuda_function = device_param->cuda_function2p;      break;
+        case KERN_RUN_2:      cuda_function = device_param->cuda_function2;       break;
+        case KERN_RUN_2E:     cuda_function = device_param->cuda_function2e;      break;
+        case KERN_RUN_23:     cuda_function = device_param->cuda_function23;      break;
+        case KERN_RUN_3:      cuda_function = device_param->cuda_function3;       break;
+        case KERN_RUN_4:      cuda_function = device_param->cuda_function4;       break;
+        case KERN_RUN_INIT2:  cuda_function = device_param->cuda_function_init2;  break;
+        case KERN_RUN_LOOP2P: cuda_function = device_param->cuda_function_loop2p; break;
+        case KERN_RUN_LOOP2:  cuda_function = device_param->cuda_function_loop2;  break;
+        case KERN_RUN_AUX1:   cuda_function = device_param->cuda_function_aux1;   break;
+        case KERN_RUN_AUX2:   cuda_function = device_param->cuda_function_aux2;   break;
+        case KERN_RUN_AUX3:   cuda_function = device_param->cuda_function_aux3;   break;
+        case KERN_RUN_AUX4:   cuda_function = device_param->cuda_function_aux4;   break;
       }
 
       if (hc_cuFuncSetAttribute (hashcat_ctx, cuda_function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, dynamic_shared_mem) == -1) return -1;
@@ -3700,19 +3731,21 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
     {
       switch (kern_run)
       {
-        case KERN_RUN_1:      opencl_kernel = device_param->opencl_kernel1;      break;
-        case KERN_RUN_12:     opencl_kernel = device_param->opencl_kernel12;     break;
-        case KERN_RUN_2:      opencl_kernel = device_param->opencl_kernel2;      break;
-        case KERN_RUN_2E:     opencl_kernel = device_param->opencl_kernel2e;     break;
-        case KERN_RUN_23:     opencl_kernel = device_param->opencl_kernel23;     break;
-        case KERN_RUN_3:      opencl_kernel = device_param->opencl_kernel3;      break;
-        case KERN_RUN_4:      opencl_kernel = device_param->opencl_kernel4;      break;
-        case KERN_RUN_INIT2:  opencl_kernel = device_param->opencl_kernel_init2; break;
-        case KERN_RUN_LOOP2:  opencl_kernel = device_param->opencl_kernel_loop2; break;
-        case KERN_RUN_AUX1:   opencl_kernel = device_param->opencl_kernel_aux1;  break;
-        case KERN_RUN_AUX2:   opencl_kernel = device_param->opencl_kernel_aux2;  break;
-        case KERN_RUN_AUX3:   opencl_kernel = device_param->opencl_kernel_aux3;  break;
-        case KERN_RUN_AUX4:   opencl_kernel = device_param->opencl_kernel_aux4;  break;
+        case KERN_RUN_1:      opencl_kernel = device_param->opencl_kernel1;       break;
+        case KERN_RUN_12:     opencl_kernel = device_param->opencl_kernel12;      break;
+        case KERN_RUN_2P:     opencl_kernel = device_param->opencl_kernel2p;      break;
+        case KERN_RUN_2:      opencl_kernel = device_param->opencl_kernel2;       break;
+        case KERN_RUN_2E:     opencl_kernel = device_param->opencl_kernel2e;      break;
+        case KERN_RUN_23:     opencl_kernel = device_param->opencl_kernel23;      break;
+        case KERN_RUN_3:      opencl_kernel = device_param->opencl_kernel3;       break;
+        case KERN_RUN_4:      opencl_kernel = device_param->opencl_kernel4;       break;
+        case KERN_RUN_INIT2:  opencl_kernel = device_param->opencl_kernel_init2;  break;
+        case KERN_RUN_LOOP2P: opencl_kernel = device_param->opencl_kernel_loop2p; break;
+        case KERN_RUN_LOOP2:  opencl_kernel = device_param->opencl_kernel_loop2;  break;
+        case KERN_RUN_AUX1:   opencl_kernel = device_param->opencl_kernel_aux1;   break;
+        case KERN_RUN_AUX2:   opencl_kernel = device_param->opencl_kernel_aux2;   break;
+        case KERN_RUN_AUX3:   opencl_kernel = device_param->opencl_kernel_aux3;   break;
+        case KERN_RUN_AUX4:   opencl_kernel = device_param->opencl_kernel_aux4;   break;
       }
     }
 
@@ -3721,12 +3754,12 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
       if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_mem), device_param->kernel_params[i]) == -1) return -1;
     }
 
-    for (u32 i = 24; i <= 33; i++)
+    for (u32 i = 24; i <= 34; i++)
     {
       if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_uint), device_param->kernel_params[i]) == -1) return -1;
     }
 
-    for (u32 i = 34; i <= 35; i++)
+    for (u32 i = 35; i <= 36; i++)
     {
       if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]) == -1) return -1;
     }
@@ -3786,17 +3819,19 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
         {
           switch (kern_run)
           {
-            case KERN_RUN_1:      if (device_param->exec_us_prev1[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm]      * device_param->spin_damp)); break;
-            case KERN_RUN_2:      if (device_param->exec_us_prev2[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm]      * device_param->spin_damp)); break;
-            case KERN_RUN_2E:     if (device_param->exec_us_prev2e[iterationm]     > 0) usleep ((useconds_t) (device_param->exec_us_prev2e[iterationm]     * device_param->spin_damp)); break;
-            case KERN_RUN_3:      if (device_param->exec_us_prev3[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm]      * device_param->spin_damp)); break;
-            case KERN_RUN_4:      if (device_param->exec_us_prev4[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm]      * device_param->spin_damp)); break;
-            case KERN_RUN_INIT2:  if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break;
-            case KERN_RUN_LOOP2:  if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break;
-            case KERN_RUN_AUX1:   if (device_param->exec_us_prev_aux1[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm]  * device_param->spin_damp)); break;
-            case KERN_RUN_AUX2:   if (device_param->exec_us_prev_aux2[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm]  * device_param->spin_damp)); break;
-            case KERN_RUN_AUX3:   if (device_param->exec_us_prev_aux3[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm]  * device_param->spin_damp)); break;
-            case KERN_RUN_AUX4:   if (device_param->exec_us_prev_aux4[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm]  * device_param->spin_damp)); break;
+            case KERN_RUN_1:      if (device_param->exec_us_prev1[iterationm]       > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm]       * device_param->spin_damp)); break;
+            case KERN_RUN_2P:     if (device_param->exec_us_prev2p[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev2p[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_2:      if (device_param->exec_us_prev2[iterationm]       > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm]       * device_param->spin_damp)); break;
+            case KERN_RUN_2E:     if (device_param->exec_us_prev2e[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev2e[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_3:      if (device_param->exec_us_prev3[iterationm]       > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm]       * device_param->spin_damp)); break;
+            case KERN_RUN_4:      if (device_param->exec_us_prev4[iterationm]       > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm]       * device_param->spin_damp)); break;
+            case KERN_RUN_INIT2:  if (device_param->exec_us_prev_init2[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm]  * device_param->spin_damp)); break;
+            case KERN_RUN_LOOP2P: if (device_param->exec_us_prev_loop2p[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2p[iterationm] * device_param->spin_damp)); break;
+            case KERN_RUN_LOOP2:  if (device_param->exec_us_prev_loop2[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm]  * device_param->spin_damp)); break;
+            case KERN_RUN_AUX1:   if (device_param->exec_us_prev_aux1[iterationm]   > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm]   * device_param->spin_damp)); break;
+            case KERN_RUN_AUX2:   if (device_param->exec_us_prev_aux2[iterationm]   > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm]   * device_param->spin_damp)); break;
+            case KERN_RUN_AUX3:   if (device_param->exec_us_prev_aux3[iterationm]   > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm]   * device_param->spin_damp)); break;
+            case KERN_RUN_AUX4:   if (device_param->exec_us_prev_aux4[iterationm]   > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm]   * device_param->spin_damp)); break;
           }
         }
         else
@@ -3830,17 +3865,19 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
       {
         switch (kern_run)
         {
-          case KERN_RUN_1:      device_param->exec_us_prev1[iterationm]      = exec_us; break;
-          case KERN_RUN_2:      device_param->exec_us_prev2[iterationm]      = exec_us; break;
-          case KERN_RUN_2E:     device_param->exec_us_prev2e[iterationm]     = exec_us; break;
-          case KERN_RUN_3:      device_param->exec_us_prev3[iterationm]      = exec_us; break;
-          case KERN_RUN_4:      device_param->exec_us_prev4[iterationm]      = exec_us; break;
-          case KERN_RUN_INIT2:  device_param->exec_us_prev_init2[iterationm] = exec_us; break;
-          case KERN_RUN_LOOP2:  device_param->exec_us_prev_loop2[iterationm] = exec_us; break;
-          case KERN_RUN_AUX1:   device_param->exec_us_prev_aux1[iterationm]  = exec_us; break;
-          case KERN_RUN_AUX2:   device_param->exec_us_prev_aux2[iterationm]  = exec_us; break;
-          case KERN_RUN_AUX3:   device_param->exec_us_prev_aux3[iterationm]  = exec_us; break;
-          case KERN_RUN_AUX4:   device_param->exec_us_prev_aux4[iterationm]  = exec_us; break;
+          case KERN_RUN_1:      device_param->exec_us_prev1[iterationm]       = exec_us; break;
+          case KERN_RUN_2P:     device_param->exec_us_prev2p[iterationm]      = exec_us; break;
+          case KERN_RUN_2:      device_param->exec_us_prev2[iterationm]       = exec_us; break;
+          case KERN_RUN_2E:     device_param->exec_us_prev2e[iterationm]      = exec_us; break;
+          case KERN_RUN_3:      device_param->exec_us_prev3[iterationm]       = exec_us; break;
+          case KERN_RUN_4:      device_param->exec_us_prev4[iterationm]       = exec_us; break;
+          case KERN_RUN_INIT2:  device_param->exec_us_prev_init2[iterationm]  = exec_us; break;
+          case KERN_RUN_LOOP2P: device_param->exec_us_prev_loop2p[iterationm] = exec_us; break;
+          case KERN_RUN_LOOP2:  device_param->exec_us_prev_loop2[iterationm]  = exec_us; break;
+          case KERN_RUN_AUX1:   device_param->exec_us_prev_aux1[iterationm]   = exec_us; break;
+          case KERN_RUN_AUX2:   device_param->exec_us_prev_aux2[iterationm]   = exec_us; break;
+          case KERN_RUN_AUX3:   device_param->exec_us_prev_aux3[iterationm]   = exec_us; break;
+          case KERN_RUN_AUX4:   device_param->exec_us_prev_aux4[iterationm]   = exec_us; break;
         }
       }
     }
@@ -9086,8 +9123,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     device_param->kernel_params_buf32[31] = 0; // digests_cnt
     device_param->kernel_params_buf32[32] = 0; // digests_offset
     device_param->kernel_params_buf32[33] = 0; // combs_mode
-    device_param->kernel_params_buf64[34] = 0; // pws_pos
-    device_param->kernel_params_buf64[35] = 0; // gid_max
+    device_param->kernel_params_buf32[34] = 0; // salt_repeat
+    device_param->kernel_params_buf64[35] = 0; // pws_pos
+    device_param->kernel_params_buf64[36] = 0; // gid_max
 
     if (device_param->is_cuda == true)
     {
@@ -9155,8 +9193,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     device_param->kernel_params[31] = &device_param->kernel_params_buf32[31];
     device_param->kernel_params[32] = &device_param->kernel_params_buf32[32];
     device_param->kernel_params[33] = &device_param->kernel_params_buf32[33];
-    device_param->kernel_params[34] = &device_param->kernel_params_buf64[34];
+    device_param->kernel_params[34] = &device_param->kernel_params_buf32[34];
     device_param->kernel_params[35] = &device_param->kernel_params_buf64[35];
+    device_param->kernel_params[36] = &device_param->kernel_params_buf64[36];
 
     if (user_options->slow_candidates == true)
     {
@@ -9554,6 +9593,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
         device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size;
 
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
+        {
+          // kernel2p
+
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop_prepare", kern_type);
+
+          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2p, device_param->cuda_module, kernel_name) == -1) return -1;
+
+          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2p, &device_param->kernel_wgs2p) == -1) return -1;
+
+          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2p, &device_param->kernel_local_mem_size2p) == -1) return -1;
+
+          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2p, &device_param->kernel_dynamic_local_mem_size2p) == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple2p = device_param->cuda_warp_size;
+        }
+
         if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED)
         {
           // kernel2e
@@ -9622,6 +9678,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
           device_param->kernel_preferred_wgs_multiple_init2 = device_param->cuda_warp_size;
         }
 
+        // loop2 prepare
+
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP2_PREPARE)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop2_prepare", kern_type);
+
+          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_loop2p, device_param->cuda_module, kernel_name) == -1) return -1;
+
+          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_loop2p, &device_param->kernel_wgs_loop2p) == -1) return -1;
+
+          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2p, &device_param->kernel_local_mem_size_loop2p) == -1) return -1;
+
+          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2p, &device_param->kernel_dynamic_local_mem_size_loop2p) == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_loop2p = device_param->cuda_warp_size;
+        }
+
         // loop2
 
         if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
@@ -10142,6 +10215,21 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
         // aux1
 
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop_prepare", kern_type);
+
+          if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel2p) == -1) return -1;
+
+          if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel2p, &device_param->kernel_wgs2p) == -1) return -1;
+
+          if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2p, &device_param->kernel_local_mem_size2p) == -1) return -1;
+
+          if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2p, &device_param->kernel_dynamic_local_mem_size2p) == -1) return -1;
+
+          if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel2p, &device_param->kernel_preferred_wgs_multiple2p) == -1) return -1;
+        }
+
         if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED)
         {
           snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop_extended", kern_type);
@@ -10208,6 +10296,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
           if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_init2, &device_param->kernel_preferred_wgs_multiple_init2) == -1) return -1;
         }
 
+        // loop2 prepare
+
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP2_PREPARE)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop2_prepare", kern_type);
+
+          if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel_loop2p) == -1) return -1;
+
+          if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_loop2p, &device_param->kernel_wgs_loop2p) == -1) return -1;
+
+          if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_loop2p, &device_param->kernel_local_mem_size_loop2p) == -1) return -1;
+
+          if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_loop2p, &device_param->kernel_dynamic_local_mem_size_loop2p) == -1) return -1;
+
+          if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_loop2p, &device_param->kernel_preferred_wgs_multiple_loop2p) == -1) return -1;
+        }
+
         // loop2
 
         if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
@@ -11071,12 +11176,14 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
 
       device_param->cuda_function1            = NULL;
       device_param->cuda_function12           = NULL;
+      device_param->cuda_function2p           = NULL;
       device_param->cuda_function2            = NULL;
       device_param->cuda_function2e           = NULL;
       device_param->cuda_function23           = NULL;
       device_param->cuda_function3            = NULL;
       device_param->cuda_function4            = NULL;
       device_param->cuda_function_init2       = NULL;
+      device_param->cuda_function_loop2p      = NULL;
       device_param->cuda_function_loop2       = NULL;
       device_param->cuda_function_mp          = NULL;
       device_param->cuda_function_mp_l        = NULL;
@@ -11139,12 +11246,14 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
 
       if (device_param->opencl_kernel1)          hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel1);
       if (device_param->opencl_kernel12)         hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel12);
+      if (device_param->opencl_kernel2p)         hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel2p);
       if (device_param->opencl_kernel2)          hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel2);
       if (device_param->opencl_kernel2e)         hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel2e);
       if (device_param->opencl_kernel23)         hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel23);
       if (device_param->opencl_kernel3)          hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel3);
       if (device_param->opencl_kernel4)          hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel4);
       if (device_param->opencl_kernel_init2)     hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_init2);
+      if (device_param->opencl_kernel_loop2p)    hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_loop2p);
       if (device_param->opencl_kernel_loop2)     hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_loop2);
       if (device_param->opencl_kernel_mp)        hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_mp);
       if (device_param->opencl_kernel_mp_l)      hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_mp_l);
@@ -11205,12 +11314,14 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
       device_param->opencl_d_st_esalts_buf     = NULL;
       device_param->opencl_kernel1             = NULL;
       device_param->opencl_kernel12            = NULL;
+      device_param->opencl_kernel2p            = NULL;
       device_param->opencl_kernel2             = NULL;
       device_param->opencl_kernel2e            = NULL;
       device_param->opencl_kernel23            = NULL;
       device_param->opencl_kernel3             = NULL;
       device_param->opencl_kernel4             = NULL;
       device_param->opencl_kernel_init2        = NULL;
+      device_param->opencl_kernel_loop2p       = NULL;
       device_param->opencl_kernel_loop2        = NULL;
       device_param->opencl_kernel_mp           = NULL;
       device_param->opencl_kernel_mp_l         = NULL;
diff --git a/src/modules/module_02500.c b/src/modules/module_02500.c
index b46ffc4f1..aede25a2f 100644
--- a/src/modules/module_02500.c
+++ b/src/modules/module_02500.c
@@ -579,6 +579,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
     1,                  // digests_cnt
     0,                  // digests_offset
     0,                  // combs_mode
+    0,                  // salt_repeat
     0,                  // pws_pos
     1                   // gid_max
   );
diff --git a/src/modules/module_02501.c b/src/modules/module_02501.c
index 80d41ebba..e29a64065 100644
--- a/src/modules/module_02501.c
+++ b/src/modules/module_02501.c
@@ -554,6 +554,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
     1,                  // digests_cnt
     0,                  // digests_offset
     0,                  // combs_mode
+    0,                  // salt_repeat
     0,                  // pws_pos
     1                   // gid_max
   );
diff --git a/src/modules/module_03200.c b/src/modules/module_03200.c
index 766a11213..5b40e989a 100644
--- a/src/modules/module_03200.c
+++ b/src/modules/module_03200.c
@@ -21,6 +21,7 @@ static const char *HASH_NAME      = "bcrypt $2*$, Blowfish (Unix)";
 static const u64   KERN_TYPE      = 3200;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
 static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
+                                  | OPTS_TYPE_MP_MULTI_DISABLE
                                   | OPTS_TYPE_DYNAMIC_SHARED;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS        = "hashcat";
diff --git a/src/modules/module_08900.c b/src/modules/module_08900.c
index 277b90330..6343543da 100644
--- a/src/modules/module_08900.c
+++ b/src/modules/module_08900.c
@@ -24,6 +24,7 @@ static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
 static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_MP_MULTI_DISABLE
                                   | OPTS_TYPE_NATIVE_THREADS
+                                  | OPTS_TYPE_LOOP_PREPARE
                                   | OPTS_TYPE_SELF_TEST_DISABLE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS        = "hashcat";
@@ -63,14 +64,14 @@ bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
 
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_min = 1;
+  const u32 kernel_loops_min = 1024;
 
   return kernel_loops_min;
 }
 
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_max = 1;
+  const u32 kernel_loops_max = 1024;
 
   return kernel_loops_max;
 }
@@ -330,6 +331,16 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
   salt->scrypt_r = hc_strtoul ((const char *) r_pos, NULL, 10);
   salt->scrypt_p = hc_strtoul ((const char *) p_pos, NULL, 10);
 
+  // loop count is fixed to 1024, so N must be a non-zero multiple of it;
+  // p must be >= 1 because salt_repeats below is (p - 1) and would underflow
+
+  if (salt->scrypt_N == 0)   return (PARSER_SALT_VALUE);
+  if (salt->scrypt_N % 1024) return (PARSER_SALT_VALUE);
+  if (salt->scrypt_p == 0)   return (PARSER_SALT_VALUE);
+
+  salt->salt_iter    = salt->scrypt_N;
+  salt->salt_repeats = salt->scrypt_p - 1;
+
   // salt
 
   const u8 *salt_pos = token.buf[4];
@@ -341,8 +347,7 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
 
   memcpy (salt->salt_buf, tmp_buf, tmp_len);
 
-  salt->salt_len  = tmp_len;
-  salt->salt_iter = 1;
+  salt->salt_len = tmp_len;
 
   // digest - base64 decode
 
diff --git a/src/modules/module_09300.c b/src/modules/module_09300.c
index 73b130663..fbf5a6064 100644
--- a/src/modules/module_09300.c
+++ b/src/modules/module_09300.c
@@ -24,6 +24,7 @@ static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
 static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_MP_MULTI_DISABLE
                                   | OPTS_TYPE_NATIVE_THREADS
+                                  | OPTS_TYPE_LOOP_PREPARE
                                   | OPTS_TYPE_SELF_TEST_DISABLE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS        = "hashcat";
@@ -52,14 +53,14 @@ static const u64 SCRYPT_P = 1;
 
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_min = 1;
+  const u32 kernel_loops_min = 1024;
 
   return kernel_loops_min;
 }
 
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_max = 1;
+  const u32 kernel_loops_max = 1024;
 
   return kernel_loops_max;
 }
@@ -299,11 +300,13 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
   memcpy (salt_buf_ptr, salt_pos, salt_len);
 
   salt->salt_len  = salt_len;
-  salt->salt_iter = 1;
 
-  salt->scrypt_N  = 16384;
-  salt->scrypt_r  = 1;
-  salt->scrypt_p  = 1;
+  salt->scrypt_N  = SCRYPT_N;
+  salt->scrypt_r  = SCRYPT_R;
+  salt->scrypt_p  = SCRYPT_P;
+
+  salt->salt_iter    = salt->scrypt_N;
+  salt->salt_repeats = salt->scrypt_p - 1;
 
   // base64 decode hash
 
diff --git a/src/modules/module_15700.c b/src/modules/module_15700.c
index 4b473410e..e76cc1427 100644
--- a/src/modules/module_15700.c
+++ b/src/modules/module_15700.c
@@ -24,6 +24,7 @@ static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
 static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_MP_MULTI_DISABLE
                                   | OPTS_TYPE_NATIVE_THREADS
+                                  | OPTS_TYPE_LOOP_PREPARE
                                   | OPTS_TYPE_SELF_TEST_DISABLE
                                   | OPTS_TYPE_ST_HEX;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
@@ -60,14 +61,14 @@ static const u64 SCRYPT_P = 1;
 
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_min = 1;
+  const u32 kernel_loops_min = 1024;
 
   return kernel_loops_min;
 }
 
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_max = 1;
+  const u32 kernel_loops_max = 1024;
 
   return kernel_loops_max;
 }
@@ -349,6 +350,16 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
   salt->scrypt_r = scrypt_r;
   salt->scrypt_p = scrypt_p;
 
+  // loop count is fixed to 1024, so N must be a non-zero multiple of it;
+  // p must be >= 1 because salt_repeats below is (p - 1) and would underflow
+
+  if (salt->scrypt_N == 0)   return (PARSER_SALT_VALUE);
+  if (salt->scrypt_N % 1024) return (PARSER_SALT_VALUE);
+  if (salt->scrypt_p == 0)   return (PARSER_SALT_VALUE);
+
+  salt->salt_iter    = salt->scrypt_N;
+  salt->salt_repeats = salt->scrypt_p - 1;
+
   // salt
 
   const u8 *salt_pos = token.buf[4];
@@ -367,8 +373,6 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
   ethereum_scrypt->salt_buf[6] = salt->salt_buf[6];
   ethereum_scrypt->salt_buf[7] = salt->salt_buf[7];
 
-  salt->salt_iter = 1;
-
   // ciphertext
 
   const u8 *ciphertext_pos = token.buf[5];
diff --git a/src/modules/module_16800.c b/src/modules/module_16800.c
index 15c0e343f..625d2fbf0 100644
--- a/src/modules/module_16800.c
+++ b/src/modules/module_16800.c
@@ -290,6 +290,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
     1,                  // digests_cnt
     0,                  // digests_offset
     0,                  // combs_mode
+    0,                  // salt_repeat
     0,                  // pws_pos
     1                   // gid_max
   );
diff --git a/src/modules/module_16801.c b/src/modules/module_16801.c
index 3324fa005..6d237ebf5 100644
--- a/src/modules/module_16801.c
+++ b/src/modules/module_16801.c
@@ -312,6 +312,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
     1,                  // digests_cnt
     0,                  // digests_offset
     0,                  // combs_mode
+    0,                  // salt_repeat
     0,                  // pws_pos
     1                   // gid_max
   );
diff --git a/src/modules/module_22000.c b/src/modules/module_22000.c
index 996f6eda5..2b5f60cfb 100644
--- a/src/modules/module_22000.c
+++ b/src/modules/module_22000.c
@@ -600,6 +600,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
     1,                  // digests_cnt
     0,                  // digests_offset
     0,                  // combs_mode
+    0,                  // salt_repeat
     0,                  // pws_pos
     1                   // gid_max
   );
diff --git a/src/modules/module_22001.c b/src/modules/module_22001.c
index 5b8737c3d..dd45f3bd2 100644
--- a/src/modules/module_22001.c
+++ b/src/modules/module_22001.c
@@ -601,6 +601,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
     1,                  // digests_cnt
     0,                  // digests_offset
     0,                  // combs_mode
+    0,                  // salt_repeat
     0,                  // pws_pos
     1                   // gid_max
   );
diff --git a/src/modules/module_22700.c b/src/modules/module_22700.c
index f866bc235..6a82768d3 100644
--- a/src/modules/module_22700.c
+++ b/src/modules/module_22700.c
@@ -25,6 +25,7 @@ static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_BE
                                   | OPTS_TYPE_PT_UTF16BE
                                   | OPTS_TYPE_MP_MULTI_DISABLE
                                   | OPTS_TYPE_NATIVE_THREADS
+                                  | OPTS_TYPE_LOOP_PREPARE
                                   | OPTS_TYPE_SELF_TEST_DISABLE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS        = "hashcat";
@@ -64,14 +65,14 @@ bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
 
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_min = 1;
+  const u32 kernel_loops_min = 1024;
 
   return kernel_loops_min;
 }
 
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_max = 1;
+  const u32 kernel_loops_max = 1024;
 
   return kernel_loops_max;
 }
@@ -320,6 +321,9 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
   salt->scrypt_r = SCRYPT_R;
   salt->scrypt_p = SCRYPT_P;
 
+  salt->salt_iter    = salt->scrypt_N;
+  salt->salt_repeats = salt->scrypt_p - 1;
+
   // version
 
   const u8 *version_pos = token.buf[1];
@@ -353,8 +357,7 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
   salt->salt_buf[10] = hex_to_u32 (b2_pos + 16);
   salt->salt_buf[11] = hex_to_u32 (b2_pos + 24);
 
-  salt->salt_len  = 48;
-  salt->salt_iter =  1;
+  salt->salt_len = 48;
 
   // fake digest: