From 30d950987801d4ece54c08b6dcdd4f748a85725f Mon Sep 17 00:00:00 2001
From: bacqube2
Date: Thu, 2 Feb 2023 13:22:03 +0100
Subject: [PATCH] Creation blake2s inc files

---
 OpenCL/inc_hash_blake2s.cl | 898 +++++++++++++++++++++++++++++++++++++
 OpenCL/inc_hash_blake2s.h  |  99 ++++
 2 files changed, 997 insertions(+)
 create mode 100644 OpenCL/inc_hash_blake2s.cl
 create mode 100644 OpenCL/inc_hash_blake2s.h

diff --git a/OpenCL/inc_hash_blake2s.cl b/OpenCL/inc_hash_blake2s.cl
new file mode 100644
index 000000000..913ff7ef5
--- /dev/null
+++ b/OpenCL/inc_hash_blake2s.cl
@@ -0,0 +1,898 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#include "inc_vendor.h"
+#include "inc_types.h"
+#include "inc_platform.h"
+#include "inc_common.h"
+#include "inc_hash_blake2s.h"
+
+DECLSPEC u64 blake2b_rot16_S (const u64 a)
+{
+  #if defined IS_NV
+
+  vconv64_t in;
+
+  in.v64 = a;
+
+  vconv64_t out;
+
+  out.v32.a = hc_byte_perm_S (in.v32.b, in.v32.a, 0x1076);
+  out.v32.b = hc_byte_perm_S (in.v32.b, in.v32.a, 0x5432);
+
+  return out.v64;
+
+  #elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1
+
+  vconv64_t in;
+
+  in.v64 = a;
+
+  vconv64_t out;
+
+  out.v32.a = hc_byte_perm_S (in.v32.b, in.v32.a, 0x01000706);
+  out.v32.b = hc_byte_perm_S (in.v32.b, in.v32.a, 0x05040302);
+
+  return out.v64;
+
+  #else
+
+  return hc_rotr64_S (a, 16);
+
+  #endif
+}
+
+DECLSPEC u64x blake2b_rot16 (const u64x a)
+{
+  u64x r;
+
+  #if VECT_SIZE == 1
+  r = blake2b_rot16_S (a);
+  #endif
+
+  #if VECT_SIZE >= 2
+  r.s0 = blake2b_rot16_S (a.s0);
+  r.s1 = blake2b_rot16_S (a.s1);
+  #endif
+
+  #if VECT_SIZE >= 4
+  r.s2 = blake2b_rot16_S (a.s2);
+  r.s3 = blake2b_rot16_S (a.s3);
+  #endif
+
+  #if VECT_SIZE >= 8
+  r.s4 = blake2b_rot16_S (a.s4);
+  r.s5 = blake2b_rot16_S (a.s5);
+  r.s6 = blake2b_rot16_S (a.s6);
+  r.s7 = blake2b_rot16_S (a.s7);
+  #endif
+
+  #if VECT_SIZE >= 16
+  r.s8 = blake2b_rot16_S (a.s8);
+  r.s9 = blake2b_rot16_S (a.s9);
+  r.sa = blake2b_rot16_S (a.sa);
+  r.sb = blake2b_rot16_S (a.sb);
+  r.sc = blake2b_rot16_S (a.sc);
+  r.sd = blake2b_rot16_S (a.sd);
+  r.se = blake2b_rot16_S (a.se);
+  r.sf = blake2b_rot16_S (a.sf);
+  #endif
+
+  return r;
+}
+
+DECLSPEC u64 blake2b_rot24_S (const u64 a)
+{
+  #if defined IS_NV
+
+  vconv64_t in;
+
+  in.v64 = a;
+
+  vconv64_t out;
+
+  out.v32.a = hc_byte_perm_S (in.v32.b, in.v32.a, 0x2107);
+  out.v32.b = hc_byte_perm_S (in.v32.b, in.v32.a, 0x6543);
+
+  return out.v64;
+
+  #elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1
+
+  vconv64_t in;
+
+  in.v64 = a;
+
+  vconv64_t out;
+
+  out.v32.a = hc_byte_perm_S (in.v32.b, in.v32.a, 0x02010007);
+  out.v32.b = hc_byte_perm_S (in.v32.b, in.v32.a, 0x06050403);
+
+  return out.v64;
+
+  #else
+
+  return hc_rotr64_S (a, 24);
+
+  #endif
+}
+
+DECLSPEC u64x blake2b_rot24 (const u64x a)
+{
+  u64x r;
+
+  #if VECT_SIZE == 1
+  r = blake2b_rot24_S (a);
+  #endif
+
+  #if VECT_SIZE >= 2
+  r.s0 = blake2b_rot24_S (a.s0);
+  r.s1 = blake2b_rot24_S (a.s1);
+  #endif
+
+  #if VECT_SIZE >= 4
+  r.s2 = blake2b_rot24_S (a.s2);
+  r.s3 = blake2b_rot24_S (a.s3);
+  #endif
+
+  #if VECT_SIZE >= 8
+  r.s4 = blake2b_rot24_S (a.s4);
+  r.s5 = blake2b_rot24_S (a.s5);
+  r.s6 = blake2b_rot24_S (a.s6);
+  r.s7 = blake2b_rot24_S (a.s7);
+  #endif
+
+  #if VECT_SIZE >= 16
+  r.s8 = blake2b_rot24_S (a.s8);
+  r.s9 = blake2b_rot24_S (a.s9);
+  r.sa = blake2b_rot24_S (a.sa);
+  r.sb = blake2b_rot24_S (a.sb);
+  r.sc = blake2b_rot24_S (a.sc);
+  r.sd = blake2b_rot24_S (a.sd);
+  r.se = blake2b_rot24_S (a.se);
+  r.sf = blake2b_rot24_S (a.sf);
+  #endif
+
+  return r;
+}
+
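+// BLAKE2s note: the file is named inc_hash_blake2s, but the helpers above
+// still operate on 64-bit BLAKE2b words. BLAKE2s itself uses 32-bit words and
+// G rotations of 16, 12, 8 and 7 (RFC 7693), so its scalar helpers reduce to
+// plain 32-bit rotates. A minimal sketch, assuming the generic hc_rotr32_S
+// helper; the names are placeholders and nothing in this file calls them yet:
+
+DECLSPEC u32 blake2s_rot16_S (const u32 a) { return hc_rotr32_S (a, 16); }
+DECLSPEC u32 blake2s_rot12_S (const u32 a) { return hc_rotr32_S (a, 12); }
+DECLSPEC u32 blake2s_rot08_S (const u32 a) { return hc_rotr32_S (a,  8); }
+DECLSPEC u32 blake2s_rot07_S (const u32 a) { return hc_rotr32_S (a,  7); }
+
+DECLSPEC u64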
blake2b_rot32_S (const u64 a) +{ + vconv64_t in; + + in.v64 = a; + + vconv64_t out; + + out.v32.a = in.v32.b; + out.v32.b = in.v32.a; + + return out.v64; +} + +DECLSPEC u64x blake2b_rot32 (const u64x a) +{ + u64x r; + + #if VECT_SIZE == 1 + r = blake2b_rot32_S (a); + #endif + + #if VECT_SIZE >= 2 + r.s0 = blake2b_rot32_S (a.s0); + r.s1 = blake2b_rot32_S (a.s1); + #endif + + #if VECT_SIZE >= 4 + r.s2 = blake2b_rot32_S (a.s2); + r.s3 = blake2b_rot32_S (a.s3); + #endif + + #if VECT_SIZE >= 8 + r.s4 = blake2b_rot32_S (a.s4); + r.s5 = blake2b_rot32_S (a.s5); + r.s6 = blake2b_rot32_S (a.s6); + r.s7 = blake2b_rot32_S (a.s7); + #endif + + #if VECT_SIZE >= 16 + r.s8 = blake2b_rot32_S (a.s8); + r.s9 = blake2b_rot32_S (a.s9); + r.sa = blake2b_rot32_S (a.sa); + r.sb = blake2b_rot32_S (a.sb); + r.sc = blake2b_rot32_S (a.sc); + r.sd = blake2b_rot32_S (a.sd); + r.se = blake2b_rot32_S (a.se); + r.sf = blake2b_rot32_S (a.sf); + #endif + + return r; +} + +DECLSPEC void blake2b_transform (PRIVATE_AS u64 *h, PRIVATE_AS const u64 *m, const int len, const u64 f0) +{ + const u64 t0 = hl32_to_64_S (0, len); + + u64 v[16]; + + v[ 0] = h[0]; + v[ 1] = h[1]; + v[ 2] = h[2]; + v[ 3] = h[3]; + v[ 4] = h[4]; + v[ 5] = h[5]; + v[ 6] = h[6]; + v[ 7] = h[7]; + v[ 8] = BLAKE2B_IV_00; + v[ 9] = BLAKE2B_IV_01; + v[10] = BLAKE2B_IV_02; + v[11] = BLAKE2B_IV_03; + v[12] = BLAKE2B_IV_04 ^ t0; + v[13] = BLAKE2B_IV_05; // ^ t1; + v[14] = BLAKE2B_IV_06 ^ f0; + v[15] = BLAKE2B_IV_07; // ^ f1; + + BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + BLAKE2B_ROUND (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); + BLAKE2B_ROUND ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); + BLAKE2B_ROUND ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); + BLAKE2B_ROUND ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); + BLAKE2B_ROUND (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); + BLAKE2B_ROUND (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); + BLAKE2B_ROUND ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); + BLAKE2B_ROUND (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0); + BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + + h[0] = h[0] ^ v[0] ^ v[ 8]; + h[1] = h[1] ^ v[1] ^ v[ 9]; + h[2] = h[2] ^ v[2] ^ v[10]; + h[3] = h[3] ^ v[3] ^ v[11]; + h[4] = h[4] ^ v[4] ^ v[12]; + h[5] = h[5] ^ v[5] ^ v[13]; + h[6] = h[6] ^ v[6] ^ v[14]; + h[7] = h[7] ^ v[7] ^ v[15]; +} + +DECLSPEC void blake2b_init (PRIVATE_AS blake2b_ctx_t *ctx) +{ + ctx->h[0] = BLAKE2B_IV_00 ^ 0x01010040; // default output length: 0x40 = 64 bytes + ctx->h[1] = BLAKE2B_IV_01; + ctx->h[2] = BLAKE2B_IV_02; + ctx->h[3] = BLAKE2B_IV_03; + ctx->h[4] = BLAKE2B_IV_04; + ctx->h[5] = BLAKE2B_IV_05; + ctx->h[6] = BLAKE2B_IV_06; + ctx->h[7] = BLAKE2B_IV_07; + + ctx->m[ 0] = 0; + ctx->m[ 1] = 0; + ctx->m[ 2] = 0; + ctx->m[ 3] = 0; + ctx->m[ 4] = 0; + ctx->m[ 5] = 0; + ctx->m[ 6] = 0; + ctx->m[ 7] = 0; + ctx->m[ 8] = 0; + ctx->m[ 9] = 0; + ctx->m[10] = 0; + ctx->m[11] = 0; + ctx->m[12] = 0; + ctx->m[13] = 0; + ctx->m[14] = 0; + ctx->m[15] = 0; + + ctx->len = 0; +} + +DECLSPEC void blake2b_update_128 (PRIVATE_AS blake2b_ctx_t *ctx, PRIVATE_AS u32 *w0, PRIVATE_AS u32 *w1, PRIVATE_AS u32 *w2, PRIVATE_AS u32 *w3, PRIVATE_AS u32 *w4, PRIVATE_AS u32 *w5, PRIVATE_AS u32 *w6, PRIVATE_AS u32 *w7, const int len) +{ + if (len == 0) return; + + const int pos = ctx->len 
& 127; + + if (pos == 0) + { + if (ctx->len > 0) // if new block (pos == 0) AND the (old) len is not zero => transform + { + blake2b_transform (ctx->h, ctx->m, ctx->len, BLAKE2B_UPDATE); + } + + ctx->m[ 0] = hl32_to_64_S (w0[1], w0[0]); + ctx->m[ 1] = hl32_to_64_S (w0[3], w0[2]); + ctx->m[ 2] = hl32_to_64_S (w1[1], w1[0]); + ctx->m[ 3] = hl32_to_64_S (w1[3], w1[2]); + ctx->m[ 4] = hl32_to_64_S (w2[1], w2[0]); + ctx->m[ 5] = hl32_to_64_S (w2[3], w2[2]); + ctx->m[ 6] = hl32_to_64_S (w3[1], w3[0]); + ctx->m[ 7] = hl32_to_64_S (w3[3], w3[2]); + ctx->m[ 8] = hl32_to_64_S (w4[1], w4[0]); + ctx->m[ 9] = hl32_to_64_S (w4[3], w4[2]); + ctx->m[10] = hl32_to_64_S (w5[1], w5[0]); + ctx->m[11] = hl32_to_64_S (w5[3], w5[2]); + ctx->m[12] = hl32_to_64_S (w6[1], w6[0]); + ctx->m[13] = hl32_to_64_S (w6[3], w6[2]); + ctx->m[14] = hl32_to_64_S (w7[1], w7[0]); + ctx->m[15] = hl32_to_64_S (w7[3], w7[2]); + } + else + { + if ((pos + len) <= 128) + { + switch_buffer_by_offset_8x4_le_S (w0, w1, w2, w3, w4, w5, w6, w7, pos); + + ctx->m[ 0] |= hl32_to_64_S (w0[1], w0[0]); + ctx->m[ 1] |= hl32_to_64_S (w0[3], w0[2]); + ctx->m[ 2] |= hl32_to_64_S (w1[1], w1[0]); + ctx->m[ 3] |= hl32_to_64_S (w1[3], w1[2]); + ctx->m[ 4] |= hl32_to_64_S (w2[1], w2[0]); + ctx->m[ 5] |= hl32_to_64_S (w2[3], w2[2]); + ctx->m[ 6] |= hl32_to_64_S (w3[1], w3[0]); + ctx->m[ 7] |= hl32_to_64_S (w3[3], w3[2]); + ctx->m[ 8] |= hl32_to_64_S (w4[1], w4[0]); + ctx->m[ 9] |= hl32_to_64_S (w4[3], w4[2]); + ctx->m[10] |= hl32_to_64_S (w5[1], w5[0]); + ctx->m[11] |= hl32_to_64_S (w5[3], w5[2]); + ctx->m[12] |= hl32_to_64_S (w6[1], w6[0]); + ctx->m[13] |= hl32_to_64_S (w6[3], w6[2]); + ctx->m[14] |= hl32_to_64_S (w7[1], w7[0]); + ctx->m[15] |= hl32_to_64_S (w7[3], w7[2]); + } + else + { + u32 c0[4] = { 0 }; + u32 c1[4] = { 0 }; + u32 c2[4] = { 0 }; + u32 c3[4] = { 0 }; + u32 c4[4] = { 0 }; + u32 c5[4] = { 0 }; + u32 c6[4] = { 0 }; + u32 c7[4] = { 0 }; + + switch_buffer_by_offset_8x4_carry_le_S (w0, w1, w2, w3, w4, w5, w6, w7, c0, c1, c2, c3, c4, c5, c6, c7, pos); + + ctx->m[ 0] |= hl32_to_64_S (w0[1], w0[0]); + ctx->m[ 1] |= hl32_to_64_S (w0[3], w0[2]); + ctx->m[ 2] |= hl32_to_64_S (w1[1], w1[0]); + ctx->m[ 3] |= hl32_to_64_S (w1[3], w1[2]); + ctx->m[ 4] |= hl32_to_64_S (w2[1], w2[0]); + ctx->m[ 5] |= hl32_to_64_S (w2[3], w2[2]); + ctx->m[ 6] |= hl32_to_64_S (w3[1], w3[0]); + ctx->m[ 7] |= hl32_to_64_S (w3[3], w3[2]); + ctx->m[ 8] |= hl32_to_64_S (w4[1], w4[0]); + ctx->m[ 9] |= hl32_to_64_S (w4[3], w4[2]); + ctx->m[10] |= hl32_to_64_S (w5[1], w5[0]); + ctx->m[11] |= hl32_to_64_S (w5[3], w5[2]); + ctx->m[12] |= hl32_to_64_S (w6[1], w6[0]); + ctx->m[13] |= hl32_to_64_S (w6[3], w6[2]); + ctx->m[14] |= hl32_to_64_S (w7[1], w7[0]); + ctx->m[15] |= hl32_to_64_S (w7[3], w7[2]); + + // len must be a multiple of 128 (not ctx->len) for BLAKE2B_UPDATE: + + const u32 cur_len = ((ctx->len + len) / 128) * 128; + + blake2b_transform (ctx->h, ctx->m, cur_len, BLAKE2B_UPDATE); + + ctx->m[ 0] = hl32_to_64_S (c0[1], c0[0]); + ctx->m[ 1] = hl32_to_64_S (c0[3], c0[2]); + ctx->m[ 2] = hl32_to_64_S (c1[1], c1[0]); + ctx->m[ 3] = hl32_to_64_S (c1[3], c1[2]); + ctx->m[ 4] = hl32_to_64_S (c2[1], c2[0]); + ctx->m[ 5] = hl32_to_64_S (c2[3], c2[2]); + ctx->m[ 6] = hl32_to_64_S (c3[1], c3[0]); + ctx->m[ 7] = hl32_to_64_S (c3[3], c3[2]); + ctx->m[ 8] = hl32_to_64_S (c4[1], c4[0]); + ctx->m[ 9] = hl32_to_64_S (c4[3], c4[2]); + ctx->m[10] = hl32_to_64_S (c5[1], c5[0]); + ctx->m[11] = hl32_to_64_S (c5[3], c5[2]); + ctx->m[12] = hl32_to_64_S (c6[1], c6[0]); + ctx->m[13] = hl32_to_64_S 
(c6[3], c6[2]); + ctx->m[14] = hl32_to_64_S (c7[1], c7[0]); + ctx->m[15] = hl32_to_64_S (c7[3], c7[2]); + } + } + + ctx->len += len; +} + +DECLSPEC void blake2b_update (PRIVATE_AS blake2b_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len) +{ + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + u32 w4[4]; + u32 w5[4]; + u32 w6[4]; + u32 w7[4]; + + const int limit = (const int) len - 128; // int type needed, could be negative + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < limit; pos1 += 128, pos4 += 32) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - (u32) pos1); +} + +DECLSPEC void blake2b_update_global (PRIVATE_AS blake2b_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len) +{ + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + u32 w4[4]; + u32 w5[4]; + u32 w6[4]; + u32 w7[4]; + + const int limit = (const int) len - 128; // int type needed, could be negative + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < limit; pos1 += 128, pos4 += 32) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = 
w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - (u32) pos1); +} + +DECLSPEC void blake2b_final (PRIVATE_AS blake2b_ctx_t *ctx) +{ + blake2b_transform (ctx->h, ctx->m, ctx->len, BLAKE2B_FINAL); +} + +DECLSPEC void blake2b_transform_vector (PRIVATE_AS u64x *h, PRIVATE_AS const u64x *m, const u32x len, const u64 f0) +{ + const u64x t0 = hl32_to_64 (0, len); + + u64x v[16]; + + v[ 0] = h[0]; + v[ 1] = h[1]; + v[ 2] = h[2]; + v[ 3] = h[3]; + v[ 4] = h[4]; + v[ 5] = h[5]; + v[ 6] = h[6]; + v[ 7] = h[7]; + v[ 8] = BLAKE2B_IV_00; + v[ 9] = BLAKE2B_IV_01; + v[10] = BLAKE2B_IV_02; + v[11] = BLAKE2B_IV_03; + v[12] = make_u64x (BLAKE2B_IV_04) ^ t0; + v[13] = BLAKE2B_IV_05; // ^ t1; + v[14] = make_u64x (BLAKE2B_IV_06) ^ f0; + v[15] = BLAKE2B_IV_07; // ^ f1; + + BLAKE2B_ROUND_VECTOR ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BLAKE2B_ROUND_VECTOR (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + BLAKE2B_ROUND_VECTOR (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); + BLAKE2B_ROUND_VECTOR ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); + BLAKE2B_ROUND_VECTOR ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); + BLAKE2B_ROUND_VECTOR ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); + BLAKE2B_ROUND_VECTOR (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); + BLAKE2B_ROUND_VECTOR (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); + BLAKE2B_ROUND_VECTOR ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); + BLAKE2B_ROUND_VECTOR (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0); + BLAKE2B_ROUND_VECTOR ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BLAKE2B_ROUND_VECTOR (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + + h[0] = h[0] ^ v[0] ^ v[ 8]; + h[1] = h[1] ^ v[1] ^ v[ 9]; + h[2] = h[2] ^ v[2] ^ v[10]; + h[3] = h[3] ^ v[3] ^ v[11]; + h[4] = h[4] ^ v[4] ^ v[12]; + h[5] = h[5] ^ v[5] ^ v[13]; + h[6] = h[6] ^ v[6] ^ v[14]; + h[7] = h[7] ^ v[7] ^ v[15]; +} + +DECLSPEC void blake2b_init_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx) +{ + ctx->h[0] = BLAKE2B_IV_00 ^ 0x01010040; // default output length: 0x40 = 64 bytes + ctx->h[1] = BLAKE2B_IV_01; + ctx->h[2] = BLAKE2B_IV_02; + ctx->h[3] = BLAKE2B_IV_03; + ctx->h[4] = BLAKE2B_IV_04; + ctx->h[5] = BLAKE2B_IV_05; + ctx->h[6] = BLAKE2B_IV_06; + ctx->h[7] = BLAKE2B_IV_07; + + ctx->m[ 0] = 0; + ctx->m[ 1] = 0; + ctx->m[ 2] = 0; + ctx->m[ 3] = 0; + ctx->m[ 4] = 0; + ctx->m[ 5] = 0; + ctx->m[ 6] = 0; + ctx->m[ 7] = 0; + ctx->m[ 8] = 0; + ctx->m[ 9] = 0; + ctx->m[10] = 0; + ctx->m[11] = 0; + ctx->m[12] = 0; + ctx->m[13] = 0; + ctx->m[14] = 0; + ctx->m[15] = 0; + + ctx->len = 0; +} + +DECLSPEC void blake2b_init_vector_from_scalar (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVATE_AS blake2b_ctx_t *ctx0) +{ + ctx->h[0] = ctx0->h[0]; + ctx->h[1] = ctx0->h[1]; + ctx->h[2] = ctx0->h[2]; + ctx->h[3] = ctx0->h[3]; + ctx->h[4] = ctx0->h[4]; + ctx->h[5] = 
ctx0->h[5]; + ctx->h[6] = ctx0->h[6]; + ctx->h[7] = ctx0->h[7]; + + ctx->m[ 0] = ctx0->m[ 0]; + ctx->m[ 1] = ctx0->m[ 1]; + ctx->m[ 2] = ctx0->m[ 2]; + ctx->m[ 3] = ctx0->m[ 3]; + ctx->m[ 4] = ctx0->m[ 4]; + ctx->m[ 5] = ctx0->m[ 5]; + ctx->m[ 6] = ctx0->m[ 6]; + ctx->m[ 7] = ctx0->m[ 7]; + ctx->m[ 8] = ctx0->m[ 8]; + ctx->m[ 9] = ctx0->m[ 9]; + ctx->m[10] = ctx0->m[10]; + ctx->m[11] = ctx0->m[11]; + ctx->m[12] = ctx0->m[12]; + ctx->m[13] = ctx0->m[13]; + ctx->m[14] = ctx0->m[14]; + ctx->m[15] = ctx0->m[15]; + + ctx->len = ctx0->len; +} + +DECLSPEC void blake2b_update_vector_128 (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVATE_AS u32x *w0, PRIVATE_AS u32x *w1, PRIVATE_AS u32x *w2, PRIVATE_AS u32x *w3, PRIVATE_AS u32x *w4, PRIVATE_AS u32x *w5, PRIVATE_AS u32x *w6, PRIVATE_AS u32x *w7, const int len) +{ + if (len == 0) return; + + const int pos = ctx->len & 127; + + if (pos == 0) + { + if (ctx->len > 0) // if new block (pos == 0) AND the (old) len is not zero => transform + { + blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2B_UPDATE); + } + + ctx->m[ 0] = hl32_to_64 (w0[1], w0[0]); + ctx->m[ 1] = hl32_to_64 (w0[3], w0[2]); + ctx->m[ 2] = hl32_to_64 (w1[1], w1[0]); + ctx->m[ 3] = hl32_to_64 (w1[3], w1[2]); + ctx->m[ 4] = hl32_to_64 (w2[1], w2[0]); + ctx->m[ 5] = hl32_to_64 (w2[3], w2[2]); + ctx->m[ 6] = hl32_to_64 (w3[1], w3[0]); + ctx->m[ 7] = hl32_to_64 (w3[3], w3[2]); + ctx->m[ 8] = hl32_to_64 (w4[1], w4[0]); + ctx->m[ 9] = hl32_to_64 (w4[3], w4[2]); + ctx->m[10] = hl32_to_64 (w5[1], w5[0]); + ctx->m[11] = hl32_to_64 (w5[3], w5[2]); + ctx->m[12] = hl32_to_64 (w6[1], w6[0]); + ctx->m[13] = hl32_to_64 (w6[3], w6[2]); + ctx->m[14] = hl32_to_64 (w7[1], w7[0]); + ctx->m[15] = hl32_to_64 (w7[3], w7[2]); + } + else + { + if ((pos + len) <= 128) + { + switch_buffer_by_offset_8x4_le (w0, w1, w2, w3, w4, w5, w6, w7, pos); + + ctx->m[ 0] |= hl32_to_64 (w0[1], w0[0]); + ctx->m[ 1] |= hl32_to_64 (w0[3], w0[2]); + ctx->m[ 2] |= hl32_to_64 (w1[1], w1[0]); + ctx->m[ 3] |= hl32_to_64 (w1[3], w1[2]); + ctx->m[ 4] |= hl32_to_64 (w2[1], w2[0]); + ctx->m[ 5] |= hl32_to_64 (w2[3], w2[2]); + ctx->m[ 6] |= hl32_to_64 (w3[1], w3[0]); + ctx->m[ 7] |= hl32_to_64 (w3[3], w3[2]); + ctx->m[ 8] |= hl32_to_64 (w4[1], w4[0]); + ctx->m[ 9] |= hl32_to_64 (w4[3], w4[2]); + ctx->m[10] |= hl32_to_64 (w5[1], w5[0]); + ctx->m[11] |= hl32_to_64 (w5[3], w5[2]); + ctx->m[12] |= hl32_to_64 (w6[1], w6[0]); + ctx->m[13] |= hl32_to_64 (w6[3], w6[2]); + ctx->m[14] |= hl32_to_64 (w7[1], w7[0]); + ctx->m[15] |= hl32_to_64 (w7[3], w7[2]); + } + else + { + u32x c0[4] = { 0 }; + u32x c1[4] = { 0 }; + u32x c2[4] = { 0 }; + u32x c3[4] = { 0 }; + u32x c4[4] = { 0 }; + u32x c5[4] = { 0 }; + u32x c6[4] = { 0 }; + u32x c7[4] = { 0 }; + + switch_buffer_by_offset_8x4_carry_le (w0, w1, w2, w3, w4, w5, w6, w7, c0, c1, c2, c3, c4, c5, c6, c7, pos); + + ctx->m[ 0] |= hl32_to_64 (w0[1], w0[0]); + ctx->m[ 1] |= hl32_to_64 (w0[3], w0[2]); + ctx->m[ 2] |= hl32_to_64 (w1[1], w1[0]); + ctx->m[ 3] |= hl32_to_64 (w1[3], w1[2]); + ctx->m[ 4] |= hl32_to_64 (w2[1], w2[0]); + ctx->m[ 5] |= hl32_to_64 (w2[3], w2[2]); + ctx->m[ 6] |= hl32_to_64 (w3[1], w3[0]); + ctx->m[ 7] |= hl32_to_64 (w3[3], w3[2]); + ctx->m[ 8] |= hl32_to_64 (w4[1], w4[0]); + ctx->m[ 9] |= hl32_to_64 (w4[3], w4[2]); + ctx->m[10] |= hl32_to_64 (w5[1], w5[0]); + ctx->m[11] |= hl32_to_64 (w5[3], w5[2]); + ctx->m[12] |= hl32_to_64 (w6[1], w6[0]); + ctx->m[13] |= hl32_to_64 (w6[3], w6[2]); + ctx->m[14] |= hl32_to_64 (w7[1], w7[0]); + ctx->m[15] |= hl32_to_64 (w7[3], w7[2]); + + // len 
must be a multiple of 128 (not ctx->len) for BLAKE2B_UPDATE: + + const u32x cur_len = ((ctx->len + len) / 128) * 128; + + blake2b_transform_vector (ctx->h, ctx->m, cur_len, BLAKE2B_UPDATE); + + ctx->m[ 0] = hl32_to_64 (c0[1], c0[0]); + ctx->m[ 1] = hl32_to_64 (c0[3], c0[2]); + ctx->m[ 2] = hl32_to_64 (c1[1], c1[0]); + ctx->m[ 3] = hl32_to_64 (c1[3], c1[2]); + ctx->m[ 4] = hl32_to_64 (c2[1], c2[0]); + ctx->m[ 5] = hl32_to_64 (c2[3], c2[2]); + ctx->m[ 6] = hl32_to_64 (c3[1], c3[0]); + ctx->m[ 7] = hl32_to_64 (c3[3], c3[2]); + ctx->m[ 8] = hl32_to_64 (c4[1], c4[0]); + ctx->m[ 9] = hl32_to_64 (c4[3], c4[2]); + ctx->m[10] = hl32_to_64 (c5[1], c5[0]); + ctx->m[11] = hl32_to_64 (c5[3], c5[2]); + ctx->m[12] = hl32_to_64 (c6[1], c6[0]); + ctx->m[13] = hl32_to_64 (c6[3], c6[2]); + ctx->m[14] = hl32_to_64 (c7[1], c7[0]); + ctx->m[15] = hl32_to_64 (c7[3], c7[2]); + } + } + + ctx->len += len; +} + +DECLSPEC void blake2b_update_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + u32x w4[4]; + u32x w5[4]; + u32x w6[4]; + u32x w7[4]; + + const int limit = (const int) len - 128; // int type needed, could be negative + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < limit; pos1 += 128, pos4 += 32) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - (u32) pos1); +} + +DECLSPEC void blake2b_final_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx) +{ + blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2B_FINAL); +} diff --git a/OpenCL/inc_hash_blake2s.h b/OpenCL/inc_hash_blake2s.h new file mode 100644 index 000000000..425404a3d --- /dev/null +++ b/OpenCL/inc_hash_blake2s.h @@ -0,0 +1,99 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#ifndef INC_HASH_BLAKE2B_H +#define INC_HASH_BLAKE2B_H + +#define BLAKE2B_UPDATE 0 +#define BLAKE2B_FINAL -1 + +DECLSPEC u64 blake2b_rot16_S 
(const u64 a); +DECLSPEC u64x blake2b_rot16 (const u64x a); + +DECLSPEC u64 blake2b_rot24_S (const u64 a); +DECLSPEC u64x blake2b_rot24 (const u64x a); + +DECLSPEC u64 blake2b_rot32_S (const u64 a); +DECLSPEC u64x blake2b_rot32 (const u64x a); + +#define BLAKE2B_G(k0,k1,a,b,c,d) \ +{ \ + a = a + b + m[k0]; \ + d = blake2b_rot32_S (d ^ a); \ + c = c + d; \ + b = blake2b_rot24_S (b ^ c); \ + a = a + b + m[k1]; \ + d = blake2b_rot16_S (d ^ a); \ + c = c + d; \ + b = hc_rotr64_S (b ^ c, 63); \ +} + +#define BLAKE2B_ROUND(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \ +{ \ + BLAKE2B_G (c0, c1, v[0], v[4], v[ 8], v[12]); \ + BLAKE2B_G (c2, c3, v[1], v[5], v[ 9], v[13]); \ + BLAKE2B_G (c4, c5, v[2], v[6], v[10], v[14]); \ + BLAKE2B_G (c6, c7, v[3], v[7], v[11], v[15]); \ + BLAKE2B_G (c8, c9, v[0], v[5], v[10], v[15]); \ + BLAKE2B_G (ca, cb, v[1], v[6], v[11], v[12]); \ + BLAKE2B_G (cc, cd, v[2], v[7], v[ 8], v[13]); \ + BLAKE2B_G (ce, cf, v[3], v[4], v[ 9], v[14]); \ +} + +#define BLAKE2B_G_VECTOR(k0,k1,a,b,c,d) \ +{ \ + a = a + b + m[k0]; \ + d = blake2b_rot32 (d ^ a); \ + c = c + d; \ + b = blake2b_rot24 (b ^ c); \ + a = a + b + m[k1]; \ + d = blake2b_rot16 (d ^ a); \ + c = c + d; \ + b = hc_rotr64 (b ^ c, 63); \ +} + +#define BLAKE2B_ROUND_VECTOR(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \ +{ \ + BLAKE2B_G_VECTOR (c0, c1, v[0], v[4], v[ 8], v[12]); \ + BLAKE2B_G_VECTOR (c2, c3, v[1], v[5], v[ 9], v[13]); \ + BLAKE2B_G_VECTOR (c4, c5, v[2], v[6], v[10], v[14]); \ + BLAKE2B_G_VECTOR (c6, c7, v[3], v[7], v[11], v[15]); \ + BLAKE2B_G_VECTOR (c8, c9, v[0], v[5], v[10], v[15]); \ + BLAKE2B_G_VECTOR (ca, cb, v[1], v[6], v[11], v[12]); \ + BLAKE2B_G_VECTOR (cc, cd, v[2], v[7], v[ 8], v[13]); \ + BLAKE2B_G_VECTOR (ce, cf, v[3], v[4], v[ 9], v[14]); \ +} + +typedef struct blake2b_ctx +{ + u64 m[16]; // buffer + u64 h[ 8]; // digest + + int len; + +} blake2b_ctx_t; + +typedef struct blake2b_ctx_vector +{ + u64x m[16]; // buffer + u64x h[ 8]; // digest + + int len; + +} blake2b_ctx_vector_t; + +DECLSPEC void blake2b_transform (PRIVATE_AS u64 *h, PRIVATE_AS const u64 *m, const int len, const u64 f0); +DECLSPEC void blake2b_init (PRIVATE_AS blake2b_ctx_t *ctx); +DECLSPEC void blake2b_update (PRIVATE_AS blake2b_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len); +DECLSPEC void blake2b_update_global (PRIVATE_AS blake2b_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len); +DECLSPEC void blake2b_final (PRIVATE_AS blake2b_ctx_t *ctx); + +DECLSPEC void blake2b_transform_vector (PRIVATE_AS u64x *h, PRIVATE_AS const u64x *m, const u32x len, const u64 f0); +DECLSPEC void blake2b_init_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx); +DECLSPEC void blake2b_init_vector_from_scalar (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVATE_AS blake2b_ctx_t *ctx0); +DECLSPEC void blake2b_update_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len); +DECLSPEC void blake2b_final_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx); + +#endif // INC_HASH_BLAKE2B_H
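Both new files still carry the 64-bit BLAKE2b state, and the .cl references BLAKE2B_IV_00..BLAKE2B_IV_07 without this header defining them, so those constants have to keep coming from the existing BLAKE2b include. For the BLAKE2s variant the files are named after, the IV is the SHA-256 IV, and the unkeyed parameter word XORed into h[0] encodes digest length 0x20, fanout 1, depth 1. A sketch of the corresponding header pieces, assuming BLAKE2S_* / blake2s_ctx_t names that mirror the BLAKE2B ones above:

#define BLAKE2S_IV_00 0x6a09e667
#define BLAKE2S_IV_01 0xbb67ae85
#define BLAKE2S_IV_02 0x3c6ef372
#define BLAKE2S_IV_03 0xa54ff53a
#define BLAKE2S_IV_04 0x510e527f
#define BLAKE2S_IV_05 0x9b05688c
#define BLAKE2S_IV_06 0x1f83d9ab
#define BLAKE2S_IV_07 0x5be0cd19

typedef struct blake2s_ctx
{
  u32 m[16]; // 64-byte message buffer
  u32 h[ 8]; // digest state

  int len;

} blake2s_ctx_t;

DECLSPEC void blake2s_init (PRIVATE_AS blake2s_ctx_t *ctx)
{
  ctx->h[0] = BLAKE2S_IV_00 ^ 0x01010020; // default output length: 0x20 = 32 bytes
  ctx->h[1] = BLAKE2S_IV_01;
  ctx->h[2] = BLAKE2S_IV_02;
  ctx->h[3] = BLAKE2S_IV_03;
  ctx->h[4] = BLAKE2S_IV_04;
  ctx->h[5] = BLAKE2S_IV_05;
  ctx->h[6] = BLAKE2S_IV_06;
  ctx->h[7] = BLAKE2S_IV_07;

  for (int i = 0; i < 16; i++) ctx->m[i] = 0;

  ctx->len = 0;
}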
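BLAKE2B_G above rotates by 32, 24, 16 and 63 bits; the BLAKE2s G function follows the same column/diagonal pattern but rotates by 16, 12, 8 and 7, and the compression runs 10 rounds instead of 12. A sketch keeping the macro layout, again assuming hc_rotr32_S (targets with byte-permute support could route the 16- and 8-bit rotates through hc_byte_perm_S, as the 64-bit helpers do):

#define BLAKE2S_G(k0,k1,a,b,c,d) \
{                                \
  a = a + b + m[k0];             \
  d = hc_rotr32_S (d ^ a, 16);   \
  c = c + d;                     \
  b = hc_rotr32_S (b ^ c, 12);   \
  a = a + b + m[k1];             \
  d = hc_rotr32_S (d ^ a,  8);   \
  c = c + d;                     \
  b = hc_rotr32_S (b ^ c,  7);   \
}

#define BLAKE2S_ROUND(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \
{                                                \
  BLAKE2S_G (c0, c1, v[0], v[4], v[ 8], v[12]);  \
  BLAKE2S_G (c2, c3, v[1], v[5], v[ 9], v[13]);  \
  BLAKE2S_G (c4, c5, v[2], v[6], v[10], v[14]);  \
  BLAKE2S_G (c6, c7, v[3], v[7], v[11], v[15]);  \
  BLAKE2S_G (c8, c9, v[0], v[5], v[10], v[15]);  \
  BLAKE2S_G (ca, cb, v[1], v[6], v[11], v[12]);  \
  BLAKE2S_G (cc, cd, v[2], v[7], v[ 8], v[13]);  \
  BLAKE2S_G (ce, cf, v[3], v[4], v[ 9], v[14]);  \
}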
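With those pieces, the compression differs from blake2b_transform only in word size, block size (64 bytes, hence a 32-bit low counter t0) and round count: the 10 BLAKE2s rounds use exactly the first ten permutation rows already listed in blake2b_transform, dropping the two repeated rounds. A scalar sketch under the same assumed names, with the final-block flag passed as 32-bit all-ones (mirroring BLAKE2B_FINAL = -1):

DECLSPEC void blake2s_transform (PRIVATE_AS u32 *h, PRIVATE_AS const u32 *m, const int len, const u32 f0)
{
  const u32 t0 = (u32) len; // low 32 bits of the byte counter (t1 stays 0 here)

  u32 v[16];

  v[ 0] = h[0];
  v[ 1] = h[1];
  v[ 2] = h[2];
  v[ 3] = h[3];
  v[ 4] = h[4];
  v[ 5] = h[5];
  v[ 6] = h[6];
  v[ 7] = h[7];
  v[ 8] = BLAKE2S_IV_00;
  v[ 9] = BLAKE2S_IV_01;
  v[10] = BLAKE2S_IV_02;
  v[11] = BLAKE2S_IV_03;
  v[12] = BLAKE2S_IV_04 ^ t0;
  v[13] = BLAKE2S_IV_05; // ^ t1;
  v[14] = BLAKE2S_IV_06 ^ f0;
  v[15] = BLAKE2S_IV_07; // ^ f1;

  BLAKE2S_ROUND ( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
  BLAKE2S_ROUND (14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3);
  BLAKE2S_ROUND (11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4);
  BLAKE2S_ROUND ( 7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8);
  BLAKE2S_ROUND ( 9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13);
  BLAKE2S_ROUND ( 2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9);
  BLAKE2S_ROUND (12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11);
  BLAKE2S_ROUND (13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10);
  BLAKE2S_ROUND ( 6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5);
  BLAKE2S_ROUND (10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0);

  h[0] = h[0] ^ v[0] ^ v[ 8];
  h[1] = h[1] ^ v[1] ^ v[ 9];
  h[2] = h[2] ^ v[2] ^ v[10];
  h[3] = h[3] ^ v[3] ^ v[11];
  h[4] = h[4] ^ v[4] ^ v[12];
  h[5] = h[5] ^ v[5] ^ v[13];
  h[6] = h[6] ^ v[6] ^ v[14];
  h[7] = h[7] ^ v[7] ^ v[15];
}

DECLSPEC void blake2s_final (PRIVATE_AS blake2s_ctx_t *ctx)
{
  blake2s_transform (ctx->h, ctx->m, ctx->len, (u32) -1); // final-block flag
}

The update helpers would then mirror blake2b_update_128 and friends at the 64-byte block size: mask the running length with & 63 and shuttle sixteen 32-bit message words per block instead of sixteen 64-bit ones.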