mirror of https://github.com/hashcat/hashcat (synced 2024-11-02 20:39:22 +01:00)
Fixed race condition resulting in out of memory error on startup if multiple hashcat instances are started at the same time
This commit is contained in:
parent
6d5e1d3e5d
commit
e21463da4b
@@ -12,6 +12,13 @@

- Fixed too early execution of some module functions which could make use of non-final values opts_type and opti_type
- Fixed internal access on module option attribute OPTS_TYPE_SUGGEST_KG with the result that it was unused
- Fixed race condition resulting in out of memory error on startup if multiple hashcat instances are started at the same time

##
## Improvements
##

- Startup time: Improved the startup time by avoiding some time-intensive operations for skipped devices

* changes v6.1.0 -> v6.1.1
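The heart of the fix, visible in the src/backend.c diff below, is structural: the operations that spike GPU memory usage move into a second pass that is skipped outright for devices the user excluded. A minimal self-contained sketch of that pattern, with hypothetical names (device_t, probe_available_mem) standing in for the real hashcat code:

```c
#include <stdbool.h>
#include <stddef.h>

/* Sketch of the guard pattern (not hashcat's exact code): the probe that
   temporarily grabs large amounts of GPU memory runs in a second pass and
   is skipped for excluded devices, so concurrent instances started with
   disjoint device selections no longer race each other for memory. */

typedef struct
{
  bool   skipped;              /* set from the user's device selection */
  size_t device_available_mem;
} device_t;

/* hypothetical stand-in for the clCreateBuffer/clEnqueue* probe in the diff */
static size_t probe_available_mem (const device_t *d) { (void) d; return 0; }

static void second_pass (device_t *devices, int cnt)
{
  for (int i = 0; i < cnt; i++)
  {
    if (devices[i].skipped == true) continue; /* no probe for skipped devices */

    devices[i].device_available_mem = probe_available_mem (&devices[i]);
  }
}
```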
554 src/backend.c
@@ -5540,7 +5540,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
device_param->skipped = true;
}

// some attributes have to be hardcoded because they are used for instance in the build options
// some attributes have to be hardcoded values because they are used for instance in the build options

device_param->device_local_mem_type = CL_LOCAL;
device_param->opencl_device_type = CL_DEVICE_TYPE_GPU;
@@ -5616,11 +5616,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
cuda_devices_active++;
}

CUcontext cuda_context;

if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1;

if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1;
// instruction set

// bcrypt optimization?
//const int rc_cuCtxSetCacheConfig = hc_cuCtxSetCacheConfig (hashcat_ctx, CU_FUNC_CACHE_PREFER_SHARED);
@@ -5638,47 +5634,14 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
device_param->has_mov64 = (sm >= 10) ? true : false;
device_param->has_prmt = (sm >= 20) ? true : false;

/*
#define RUN_INSTRUCTION_CHECKS() \
device_param->has_add = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_addc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_sub = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_subc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_bfe = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_lop3 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \
device_param->has_prmt = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \

if (backend_devices_idx > 0)
{
hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];

if (is_same_device_type (device_param, device_param_prev) == true)
{
device_param->has_add = device_param_prev->has_add;
device_param->has_addc = device_param_prev->has_addc;
device_param->has_sub = device_param_prev->has_sub;
device_param->has_subc = device_param_prev->has_subc;
device_param->has_bfe = device_param_prev->has_bfe;
device_param->has_lop3 = device_param_prev->has_lop3;
device_param->has_mov64 = device_param_prev->has_mov64;
device_param->has_prmt = device_param_prev->has_prmt;
}
else
{
RUN_INSTRUCTION_CHECKS();
}
}
else
{
RUN_INSTRUCTION_CHECKS();
}

#undef RUN_INSTRUCTION_CHECKS
*/

// device_available_mem

CUcontext cuda_context;

if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1;

if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1;

size_t free = 0;
size_t total = 0;
@@ -6269,6 +6232,25 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
}
}

// instruction set

// fixed values work only for nvidia devices
// dynamic values for amd: see the time-intensive section below

if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
{
const int sm = (device_param->sm_major * 10) + device_param->sm_minor;

device_param->has_add = (sm >= 12) ? true : false;
device_param->has_addc = (sm >= 12) ? true : false;
device_param->has_sub = (sm >= 12) ? true : false;
device_param->has_subc = (sm >= 12) ? true : false;
device_param->has_bfe = (sm >= 20) ? true : false;
device_param->has_lop3 = (sm >= 50) ? true : false;
device_param->has_mov64 = (sm >= 10) ? true : false;
device_param->has_prmt = (sm >= 20) ? true : false;
}

// common driver check

if (device_param->skipped == false)
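The hunk above trades runtime PTX instruction tests for fixed flags derived from the compute capability, which is one of the startup-time wins mentioned in the changelog. A standalone illustration of the same gating, with hypothetical type and function names (the sm thresholds are the ones from the diff):

```c
#include <stdbool.h>

/* Standalone illustration of the fixed-value gating above (hypothetical
   names, not hashcat's exact code). The compute capability major.minor is
   folded into one integer, e.g. 70 for a Volta (sm_70) device. */

typedef struct
{
  int  sm_major;
  int  sm_minor;
  bool has_add, has_addc, has_sub, has_subc;
  bool has_bfe, has_lop3, has_mov64, has_prmt;
} nv_caps_t;

static void set_nv_caps (nv_caps_t *p)
{
  const int sm = (p->sm_major * 10) + p->sm_minor;

  p->has_add   = (sm >= 12); /* add.cc   */
  p->has_addc  = (sm >= 12); /* addc.cc  */
  p->has_sub   = (sm >= 12); /* sub.cc   */
  p->has_subc  = (sm >= 12); /* subc.cc  */
  p->has_bfe   = (sm >= 20); /* bfe.u32  */
  p->has_lop3  = (sm >= 50); /* lop3.b32, Maxwell and newer */
  p->has_mov64 = (sm >= 10); /* mov.b64  */
  p->has_prmt  = (sm >= 20); /* prmt.b32 */
}
```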
@@ -6432,215 +6414,6 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
opencl_devices_active++;
}

/**
* create context for each device
*/

cl_context context;

/*
cl_context_properties properties[3];

properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = (cl_context_properties) device_param->opencl_platform;
properties[2] = 0;

CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &context);
*/

if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &context) == -1) return -1;

/**
* create command-queue
*/

cl_command_queue command_queue;

if (hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue) == -1) return -1;
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))
{
#define RUN_INSTRUCTION_CHECKS() \
device_param->has_vadd = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vaddc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vadd_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vsub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vsubb = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vsub_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vadd3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vbfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vperm = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \

if (backend_devices_idx > 0)
{
hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];

if (is_same_device_type (device_param, device_param_prev) == true)
{
device_param->has_vadd = device_param_prev->has_vadd;
device_param->has_vaddc = device_param_prev->has_vaddc;
device_param->has_vadd_co = device_param_prev->has_vadd_co;
device_param->has_vaddc_co = device_param_prev->has_vaddc_co;
device_param->has_vsub = device_param_prev->has_vsub;
device_param->has_vsubb = device_param_prev->has_vsubb;
device_param->has_vsub_co = device_param_prev->has_vsub_co;
device_param->has_vsubb_co = device_param_prev->has_vsubb_co;
device_param->has_vadd3 = device_param_prev->has_vadd3;
device_param->has_vbfe = device_param_prev->has_vbfe;
device_param->has_vperm = device_param_prev->has_vperm;
}
else
{
RUN_INSTRUCTION_CHECKS();
}
}
else
{
RUN_INSTRUCTION_CHECKS();
}

#undef RUN_INSTRUCTION_CHECKS
}

if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
{
const int sm = (device_param->sm_major * 10) + device_param->sm_minor;

device_param->has_add = (sm >= 12) ? true : false;
device_param->has_addc = (sm >= 12) ? true : false;
device_param->has_sub = (sm >= 12) ? true : false;
device_param->has_subc = (sm >= 12) ? true : false;
device_param->has_bfe = (sm >= 20) ? true : false;
device_param->has_lop3 = (sm >= 50) ? true : false;
device_param->has_mov64 = (sm >= 10) ? true : false;
device_param->has_prmt = (sm >= 20) ? true : false;

/*
#define RUN_INSTRUCTION_CHECKS() \
device_param->has_add = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_addc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_sub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_subc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_bfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_lop3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_mov64 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \
device_param->has_prmt = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \

if (backend_devices_idx > 0)
{
hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];

if (is_same_device_type (device_param, device_param_prev) == true)
{
device_param->has_add = device_param_prev->has_add;
device_param->has_addc = device_param_prev->has_addc;
device_param->has_sub = device_param_prev->has_sub;
device_param->has_subc = device_param_prev->has_subc;
device_param->has_bfe = device_param_prev->has_bfe;
device_param->has_lop3 = device_param_prev->has_lop3;
device_param->has_mov64 = device_param_prev->has_mov64;
device_param->has_prmt = device_param_prev->has_prmt;
}
else
{
RUN_INSTRUCTION_CHECKS();
}
}
else
{
RUN_INSTRUCTION_CHECKS();
}

#undef RUN_INSTRUCTION_CHECKS
*/
}

// device_available_mem

#define MAX_ALLOC_CHECKS_CNT 8192
#define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024)

device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE;

#if defined (_WIN)
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
#else
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) || (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)))
#endif
{
// OK, so the problem here is the following:
// There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device,
// but there's no way to ask for the available memory on the device.
// On top of that, most OpenCL runtimes implement clCreateBuffer()
// with so-called lazy memory allocation on the device.
// Now, if the user has X11 (or a game or anything else that takes a lot of GPU memory)
// running on the host, we end up with an error like this:
// clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE
// The error surfaces in clEnqueueNDRangeKernel() because of the lazy allocation.
// The best workaround would be to ask for the available memory directly; since we cannot,
// the idea here is to estimate it by allocating memory until it errors out.

cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem));

u64 c;

for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
{
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;

cl_int CL_err;

OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;

tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err);

if (CL_err != CL_SUCCESS)
{
c--;

break;
}

// transferring only a few bytes should be enough to force the runtime to actually allocate the memory

u8 tmp_host[8];

if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;

if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;

if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;

if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
}

device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE;
if (c > 0)
{
device_param->device_available_mem *= c;
}

// clean up

for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
{
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;

if (tmp_device[c] != NULL)
{
if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1;
}
}

hcfree (tmp_device);
}

hc_clReleaseCommandQueue (hashcat_ctx, command_queue);

hc_clReleaseContext (hashcat_ctx, context);
}
}
}
@@ -6701,6 +6474,279 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
}
}

// time- or resource-intensive operations which we do not run if the corresponding device was skipped by the user

if (backend_ctx->cuda)
{
// instruction test for cuda devices was replaced with fixed values (see above)

/*
CUcontext cuda_context;

if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1;

if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1;

#define RUN_INSTRUCTION_CHECKS() \
device_param->has_add = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_addc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_sub = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_subc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_bfe = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_lop3 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \
device_param->has_prmt = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \

if (backend_devices_idx > 0)
{
hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];

if (is_same_device_type (device_param, device_param_prev) == true)
{
device_param->has_add = device_param_prev->has_add;
device_param->has_addc = device_param_prev->has_addc;
device_param->has_sub = device_param_prev->has_sub;
device_param->has_subc = device_param_prev->has_subc;
device_param->has_bfe = device_param_prev->has_bfe;
device_param->has_lop3 = device_param_prev->has_lop3;
device_param->has_mov64 = device_param_prev->has_mov64;
device_param->has_prmt = device_param_prev->has_prmt;
}
else
{
RUN_INSTRUCTION_CHECKS();
}
}
else
{
RUN_INSTRUCTION_CHECKS();
}

#undef RUN_INSTRUCTION_CHECKS

if (hc_cuCtxDestroy (hashcat_ctx, cuda_context) == -1) return -1;

*/
}

if (backend_ctx->ocl)
{
for (int backend_devices_cnt = 0; backend_devices_cnt < backend_ctx->backend_devices_cnt; backend_devices_cnt++)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_cnt];

if (device_param->is_opencl == false) continue;

if (device_param->skipped == true) continue;

/**
* create context for each device
*/

cl_context context;

/*
cl_context_properties properties[3];

properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = (cl_context_properties) device_param->opencl_platform;
properties[2] = 0;

CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &context);
*/

if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &context) == -1) return -1;

/**
* create command-queue
*/

cl_command_queue command_queue;

if (hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue) == -1) return -1;

// instruction set

if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))
{
#define RUN_INSTRUCTION_CHECKS() \
device_param->has_vadd = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vaddc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vadd_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vsub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vsubb = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vsub_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vadd3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vbfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vperm = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \

if (backend_devices_idx > 0)
{
hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];

if (is_same_device_type (device_param, device_param_prev) == true)
{
device_param->has_vadd = device_param_prev->has_vadd;
device_param->has_vaddc = device_param_prev->has_vaddc;
device_param->has_vadd_co = device_param_prev->has_vadd_co;
device_param->has_vaddc_co = device_param_prev->has_vaddc_co;
device_param->has_vsub = device_param_prev->has_vsub;
device_param->has_vsubb = device_param_prev->has_vsubb;
device_param->has_vsub_co = device_param_prev->has_vsub_co;
device_param->has_vsubb_co = device_param_prev->has_vsubb_co;
device_param->has_vadd3 = device_param_prev->has_vadd3;
device_param->has_vbfe = device_param_prev->has_vbfe;
device_param->has_vperm = device_param_prev->has_vperm;
}
else
{
RUN_INSTRUCTION_CHECKS();
}
}
else
{
RUN_INSTRUCTION_CHECKS();
}

#undef RUN_INSTRUCTION_CHECKS
}

if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
{
// replaced with fixed values, see the non-time-intensive section above

/*
#define RUN_INSTRUCTION_CHECKS() \
device_param->has_add = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_addc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_sub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_subc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_bfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_lop3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_mov64 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \
device_param->has_prmt = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \

if (backend_devices_idx > 0)
{
hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];

if (is_same_device_type (device_param, device_param_prev) == true)
{
device_param->has_add = device_param_prev->has_add;
device_param->has_addc = device_param_prev->has_addc;
device_param->has_sub = device_param_prev->has_sub;
device_param->has_subc = device_param_prev->has_subc;
device_param->has_bfe = device_param_prev->has_bfe;
device_param->has_lop3 = device_param_prev->has_lop3;
device_param->has_mov64 = device_param_prev->has_mov64;
device_param->has_prmt = device_param_prev->has_prmt;
}
else
{
RUN_INSTRUCTION_CHECKS();
}
}
else
{
RUN_INSTRUCTION_CHECKS();
}

#undef RUN_INSTRUCTION_CHECKS
*/
}

// available device memory
// This test causes a GPU memory usage spike.
// If multiple hashcat instances start at the same time, this will cause GPU out-of-memory errors which otherwise would not exist.
// We will simply not run it if that device was skipped by the user.

#define MAX_ALLOC_CHECKS_CNT 8192
#define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024)

device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE;

#if defined (_WIN)
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
#else
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) || (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)))
#endif
{
// OK, so the problem here is the following:
// There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device,
// but there's no way to ask for the available memory on the device.
// On top of that, most OpenCL runtimes implement clCreateBuffer()
// with so-called lazy memory allocation on the device.
// Now, if the user has X11 (or a game or anything else that takes a lot of GPU memory)
// running on the host, we end up with an error like this:
// clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE
// The error surfaces in clEnqueueNDRangeKernel() because of the lazy allocation.
// The best workaround would be to ask for the available memory directly; since we cannot,
// the idea here is to estimate it by allocating memory until it errors out.

cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem));

u64 c;

for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
{
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;

cl_int CL_err;

OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;

tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err);

if (CL_err != CL_SUCCESS)
{
c--;

break;
}

// transferring only a few bytes should be enough to force the runtime to actually allocate the memory

u8 tmp_host[8];

if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;

if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;

if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;

if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
}

device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE;

if (c > 0)
{
device_param->device_available_mem *= c;
}

// clean up

for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
{
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;

if (tmp_device[c] != NULL)
{
if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1;
}
}

hcfree (tmp_device);
}

hc_clReleaseCommandQueue (hashcat_ctx, command_queue);

hc_clReleaseContext (hashcat_ctx, context);
}
}

backend_ctx->target_msec = TARGET_MSEC_PROFILE[user_options->workload_profile - 1];

backend_ctx->need_adl = need_adl;
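For reference, the estimate in the probe above reduces to: allocate 64 MiB buffers until clCreateBuffer fails or the next buffer would reach the device's global memory, touch each buffer with a tiny blocking transfer so lazily allocating runtimes actually commit it, and report roughly successful_chunks * 64 MiB. A condensed sketch against the plain OpenCL API (simplified error handling; the hypothetical estimate_available_mem replaces hashcat's wrapped hc_* calls):

```c
#include <CL/cl.h>

#define CHUNK_CNT  8192
#define CHUNK_SIZE (64 * 1024 * 1024) /* 64 MiB per probe buffer */

/* Condensed sketch of the probe above: keep allocating and touching 64 MiB
   buffers until the runtime errors out, then estimate available memory as
   chunks_ok * 64 MiB. The tiny blocking read forces runtimes that allocate
   lazily to actually commit each buffer. */
static size_t estimate_available_mem (cl_context ctx, cl_command_queue q,
                                      size_t global_mem)
{
  cl_mem bufs[CHUNK_CNT] = { 0 };
  size_t chunks_ok = 0;

  for (size_t c = 0; c < CHUNK_CNT; c++)
  {
    /* leave headroom: never try to consume the whole device */
    if (((c + 2) * (size_t) CHUNK_SIZE) >= global_mem) break;

    cl_int err;

    bufs[c] = clCreateBuffer (ctx, CL_MEM_READ_WRITE, CHUNK_SIZE, NULL, &err);

    if (err != CL_SUCCESS) break;

    unsigned char tmp[8];

    /* force real allocation on lazy runtimes */
    if (clEnqueueReadBuffer (q, bufs[c], CL_TRUE, 0, sizeof (tmp), tmp,
                             0, NULL, NULL) != CL_SUCCESS) break;

    chunks_ok++;
  }

  for (size_t c = 0; c < CHUNK_CNT; c++)
  {
    if (bufs[c]) clReleaseMemObject (bufs[c]);
  }

  /* same floor as the diff: at least one chunk's worth is reported */
  return (chunks_ok ? chunks_ok : 1) * (size_t) CHUNK_SIZE;
}
```

Because every instance that runs this probe briefly holds most of the device's free memory, running it concurrently from several processes is exactly the race the commit removes by skipping it for excluded devices.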