#section init_code_struct /* Why do we need this? */ size_t dim = 2048 * 32; rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, PARAMS, Py_None); if (rand_buf == NULL) { FAIL; } #section support_code_struct PyGpuArrayObject *rand_buf; int gemm16(PyGpuArrayObject *C, float alpha, PyGpuArrayObject *A, PyGpuArrayObject *B, float beta, PyGpuArrayObject **out, PyGpuContextObject *c) { PyGpuArrayObject *_A = NULL; PyGpuArrayObject *_B = NULL; GpuKernel *gk; char *prand, *pA, *pB, *pout; void *params[13]; size_t grid[2]; size_t threads[2]; int res = 0; int flags = 0; int lda, ldb, ldc, n, m, k; int n128, n64; int size = 0; int vec = 0; static unsigned int nprocs = 0; char opA, opB; if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY) && GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) { /* * The nervana kernels do not cover the case where both inputs are * trans so we need to copy one of them. We choose the smallest * one. */ if (PyGpuArray_DIM(A, 0) * PyGpuArray_DIM(A, 1) < PyGpuArray_DIM(B, 0) * PyGpuArray_DIM(B, 1)) { _A = pygpu_copy(A, GA_C_ORDER); if (_A == NULL) { res = 1; goto cleanup; } /* * This is not an extra reference on _A so don't add an INCREF. * Also, we don't lose the ref on A since our caller will deal * with it. */ A = _A; } else { _B = pygpu_copy(B, GA_C_ORDER); if (_B == NULL) { res = 1; goto cleanup; } /* * This is not an extra reference on _B so don't add an INCREF * Also, we don't lose the ref on B since our caller will deal * with it. */ B = _B; } } if (GEMM16_INPLACE && GpuArray_CHKFLAGS(&C->ga, GA_CARRAY)) { Py_XDECREF(*out); *out = C; Py_INCREF(*out); } else { *out = theano_try_copy(*out, C); if (*out == NULL) { res = 1; goto cleanup; } } if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY)) { opA = 't'; lda = PyGpuArray_STRIDE(A, 1); } else { opA = 'n'; lda = PyGpuArray_STRIDE(A, 0); } if (GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) { opB = 't'; ldb = PyGpuArray_STRIDE(B, 1); } else { opB = 'n'; ldb = PyGpuArray_STRIDE(B, 0); } ldc = PyGpuArray_STRIDE(*out, 0); /* lda and friend are in number of elements, not bytes */ lda /= 2; ldb /= 2; ldc /= 2; m = PyGpuArray_DIM(*out, 0); n = PyGpuArray_DIM(*out, 1); k = PyGpuArray_DIM(B, 0); /* Tuning code adapted from the python version */ grid[0] = (m + 127) / 128; if (opA == 'n' && opB == 't') size = 128; else { if (n < 384-16) { n128 = n % 128; if (n128 < 112) { if (48 < n128 && n128 <= 64) { n64 = n / 64; if (nprocs == 0) if (gpucontext_property(A->context->ctx, GA_CTX_PROP_NUMPROCS, &nprocs)) { nprocs = 0; res = 1; goto cleanup; } n64 *= (grid[0] / nprocs); if (n64 > 1 || (opA == 't' && opB == 'n')) size = 64; else size = 32; } else { size = 32; } } else { size = 128; } } else { size = 128; } } grid[1] = (n + (size-1)) / size; if (size == 128) threads[0] = 256; else threads[0] = 128; threads[1] = 1; if ((opA == 't' && opB == 'n' && m % 8 == 0 && n % 8 == 0) || (opA == 'n' && opB == 'n' && k % 16 == 0 && n % 8 == 0) || (opA == 'n' && opB == 't' && k % 16 == 0)) vec = 1; switch (size) { case 128: if (opA == 'n' && opB == 'n') { if (vec) gk = &k_nn_vec_128x128; else gk = &k_nn_128x128; } else if (opA == 'n' && opB == 't') { if (vec) gk = &k_nt_vec_128x128; else gk = &k_nt_128x128; } else if (opA == 't' && opB == 'n') { if (vec) gk = &k_tn_vec_128x128; else gk = &k_tn_128x128; } break; case 64: if (opA == 'n' && opB == 'n') { if (vec) gk = &k_nn_vec_128x64; else gk = &k_nn_128x64; } else if (opA == 't' && opB == 'n') { if (vec) gk = &k_tn_vec_128x64; else gk = &k_tn_128x64; } break; case 32: if (opA == 'n' && opB == 'n') { if (vec) gk = &k_nn_vec_128x32; else gk = &k_nn_128x32; } else if (opA == 't' && opB == 'n') { if (vec) gk = &k_tn_vec_128x32; else gk = &k_tn_128x32; } break; default: PyErr_SetString(PyExc_RuntimeError, "error selecting kernel"); res = 1; goto cleanup; } prand = *((char **)rand_buf->ga.data); prand += rand_buf->ga.offset; pA = *((char **)A->ga.data); pA += A->ga.offset; pB = *((char **)B->ga.data); pB += B->ga.offset; pout = *((char **)(*out)->ga.data); pout += (*out)->ga.offset; params[0] = &prand; params[1] = &pA; params[2] = &pB; params[3] = &pout; params[4] = &lda; params[5] = &ldb; params[6] = &ldc; params[7] = &m; params[8] = &n; params[9] = &k; params[10] = α params[11] = β params[12] = &flags; if (GpuKernel_call(gk, 2, grid, threads, 0, params) != GA_NO_ERROR) { PyErr_SetString(PyExc_RuntimeError, "error in gemm16 kernel call"); res = 1; } cleanup: Py_XDECREF(_A); Py_XDECREF(_B); return res; }