GPUs are the powerhouse of modern computing systems. Chapel’s general-purpose capabilities for parallelism and locality control makes GPU programming as easy as programming a multi-core CPU. These capabilities makes GPU programming intuitive on a laptop with a GPU, on a leadership-class supercomputer, or anything in between. Chapel supports programming NVIDIA and AMD GPUs in a vendor-neutral way; the same code can be used on GPUs from both vendors without any change.
Example: Matrix Multiplication
config const N = 2000, M = 3000, P = 4000;
var A: [1..N,1..M] real = 1.0;
var B: [1..M,1..P] real = 1.0;
var C: [1..N,1..P] real = 1.0;
forall (i,k) in C.domain {
for j in 1..M {
C[i,k] += A[i,j] * B[j,k];
}
}
config const N = 2000, M = 3000, P = 4000;
var hostA: [1..N,1..M] real = 1.0;
var hostB: [1..M,1..P] real = 1.0;
var hostC: [1..N,1..P] real = 1.0;
on here.gpus[0] {
var devA = hostA; // allocate on GPU
var devB = hostB; // and copy from host to device
var devC: hostC.type;
forall (i,k) in devC.domain {
for j in 1..M {
devC[i,k] += devA[i,j] * devB[j,k];
}
}
hostC = devC; // copy from device to host
}
#include <cuda_runtime.h>
#include <iostream>
inline void __checkCudaError(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error at %s:%d: %s\n", file, line, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
#define checkCudaError(call) __checkCudaError((call), __FILE__, __LINE__)
__global__ void matrixMultiplyKernel(const double* A, const double* B, double* C, int N, int M, int P) {
int i = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N && k < P) {
double sum = 0.0f;
for (int j = 0; j < M; ++j) {
sum += A[i * M + j] * B[j * P + k];
}
C[i * P + k] = sum;
}
}
int main() {
// Matrix sizes
int N = 2000, M = 3000, P = 4000;
double *hostA, *hostB, *hostC;
double *devA, *devB, *devC;
// Allocate host memory
hostA = (double*)malloc(N * M * sizeof(double));
hostB = (double*)malloc(M * P * sizeof(double));
hostC = (double*)malloc(N * P * sizeof(double));
// Verify that allocations succeeded
if (hostA == NULL || hostB == NULL || hostC == NULL) {
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
// Data initialization omitted
// Allocate device memory
checkCudaError(cudaMalloc((void**)&devA, N * M * sizeof(double)));
checkCudaError(cudaMalloc((void**)&devB, M * P * sizeof(double)));
checkCudaError(cudaMalloc((void**)&devC, N * P * sizeof(double)));
// Copy matrices from host to device
checkCudaError(cudaMemcpy(devA, hostA, N * M * sizeof(double), cudaMemcpyHostToDevice));
checkCudaError(cudaMemcpy(devB, hostB, M * P * sizeof(double), cudaMemcpyHostToDevice));
// Define block and grid sizes
dim3 blockDim(16, 16);
dim3 gridDim((P + blockDim.x - 1) / blockDim.x, (N + blockDim.y - 1) / blockDim.y);
// Launch the kernel
matrixMultiplyKernel<<<gridDim, blockDim>>>(devA, devB, devC, N, M, P);
checkCudaError(cudaPeekAtLastError());
checkCudaError(cudaDeviceSynchronize());
// Copy result matrix from device to host
checkCudaError(cudaMemcpy(hostC, devC, N * P * sizeof(double), cudaMemcpyDeviceToHost));
// Free device memory
checkCudaError(cudaFree(devA));
checkCudaError(cudaFree(devB));
checkCudaError(cudaFree(devC));
// Free host memory
free(hostA);
free(hostB);
free(hostC);
return 0;
}
/* Implementation based on https://cnugteren.github.io/tutorial/pages/page3.html
This kernel appears in a different file from main */
__kernel void myGEMM1(const int M, const int N, const int P,
const __global float* A,
const __global float* B,
__global float* C) {
// Thread identifiers
const int globalRow = get_global_id(0); // Row ID of C (0..M)
const int globalCol = get_global_id(1); // Col ID of C (0..N)
// Compute a single element (loop over K)
float acc = 0.0f;
for (int k=0; k<P; k++) {
acc += A[k*M + globalRow] * B[globalCol*P + k];
}
// Store the result
C[globalCol*M + globalRow] = acc;
}
void main() {
// Define the matrix dimensions
const int M = 2000;
const int N = 3000;
const int P = 4000;
// Define OpenCL variables
cl_int err;
cl_platform_id platform = 0;
cl_device_id device = 0;
cl_device_id devices[MAX_NUM_DEVICES];
cl_uint numDevices = 0;
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 0, 0};
cl_context context = 0;
cl_command_queue queue = 0;
cl_event event = NULL;
cl_program program = NULL;
char deviceName[MAX_DEVICE_NAME];
// Configure the OpenCL environment
err = clGetPlatformIDs(1, &platform, NULL);
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
device = devices[CURRENT_DEVICE];
props[1] = (cl_context_properties)platform;
context = clCreateContext(props, 1, &device, NULL, NULL, &err);
queue = clCreateCommandQueue(context, device, 0, &err);
err = clGetDeviceInfo(device, CL_DEVICE_NAME, MAX_DEVICE_NAME, deviceName, NULL);
checkError(err,__LINE__);
// Read the kernel file from disk
long sizeHeader, sizeSource;
char* header = readKernelFile(CL_INCLUDE_FILE, &sizeHeader);
char* source = readKernelFile(CL_KERNEL_FILE, &sizeSource);
long size = 2 + sizeHeader + sizeSource;
char* code = (char*)malloc(size*sizeof(char));
for (int c=0; c<size; c++) { code[c] = '\0'; }
strcat(code, header);
strcat(code, source);
const char* constCode = code;
free(header);
free(source);
// Compile the kernel file
program = clCreateProgramWithSource(context, 1, &constCode, NULL, &err);
checkError(err,__LINE__);
err = clBuildProgram(program, 0, NULL, COMPILER_OPTIONS, NULL, NULL);
// Check for compilation errors
size_t logSize;
err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
checkError(err,__LINE__);
char* messages = (char*)malloc((1+logSize)*sizeof(char));
err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, messages, NULL);
checkError(err,__LINE__);
messages[logSize] = '\0';
//if (logSize > 10) { printf("## Compiler message: %s\n", messages); }
free(messages);
// Retrieve the PTX code from the OpenCL compiler and output it to disk
size_t binSize;
err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binSize, NULL);
checkError(err,__LINE__);
unsigned char *bin = (unsigned char *)malloc(binSize);
err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &bin, NULL);
checkError(err,__LINE__);
FILE* file = fopen(CL_PTX_FILE, "wb");
fwrite(bin, sizeof(char), binSize, file);
fclose(file);
free(bin);
// Prepare OpenCL memory objects
cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, M*P*sizeof(*A), NULL, &err);
cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, P*N*sizeof(*B), NULL, &err);
cl_mem bufB_TR = clCreateBuffer(context, CL_MEM_READ_ONLY, N*P*sizeof(*B), NULL, &err);
cl_mem bufC = clCreateBuffer(context, CL_MEM_READ_WRITE, M*N*sizeof(*C), NULL, &err);
checkError(err,__LINE__);
// Copy matrices to the GPU (also C to erase the results of the previous run)
err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M*P*sizeof(*A), A, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, P*N*sizeof(*B), B, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M*N*sizeof(*C), C, 0, NULL, NULL);
checkError(err,__LINE__);
// Configure the myGEMM kernel
char kernelname[100] = "myGEMM1";
cl_kernel kernel1 = clCreateKernel(program, kernelname, &err);
checkError(err,__LINE__);
// Set the kernel arguments
err = clSetKernelArg(kernel1, 0, sizeof(int), (void*)&M);
err = clSetKernelArg(kernel1, 1, sizeof(int), (void*)&N);
err = clSetKernelArg(kernel1, 2, sizeof(int), (void*)&P);
err = clSetKernelArg(kernel1, 3, sizeof(cl_mem), (void*)&bufA);
err = clSetKernelArg(kernel1, 4, sizeof(cl_mem), (void*)&bufB);
err = clSetKernelArg(kernel1, 5, sizeof(cl_mem), (void*)&bufC);
// Configure the thread/work-group dimensions of the myGEMM kernel
const size_t local[2] = { TS, TS };
const size_t global[2] = { (size_t)M, (size_t)N };
// Run the myGEMM kernel
err = clEnqueueNDRangeKernel(queue, kernel1, 2, NULL, global, local, 0, NULL, &event);
// Copy the output matrix C back to the CPU memory
err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M*N*sizeof(*C), C, 0, NULL, NULL);
checkError(err,__LINE__);
// Free the memory objects
free(code);
clReleaseMemObject(bufA);
clReleaseMemObject(bufB);
clReleaseMemObject(bufB_TR);
clReleaseMemObject(bufC);
clReleaseMemObject(bufA_XL);
clReleaseMemObject(bufB_TR_XL);
clReleaseMemObject(bufC_XL);
// Clean-up OpenCL
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseProgram(program);
clReleaseKernel(kernel1);
}
config const N = 2000, M = 3000, P = 4000;
var A: [1..N,1..M] real = 1.0;
var B: [1..M,1..P] real = 1.0;
var C: [1..N,1..P] real = 1.0;
forall (i,k) in C.domain {
for j in 1..M {
C[i,k] += A[i,j] * B[j,k];
}
}
config const N = 2000, M = 3000, P = 4000;
var hostA: [1..N,1..M] real = 1.0;
var hostB: [1..M,1..P] real = 1.0;
var hostC: [1..N,1..P] real = 1.0;
on here.gpus[0] {
var devA = hostA; // allocate on GPU
var devB = hostB; // and copy from host to device
var devC: hostC.type;
forall (i,k) in devC.domain {
for j in 1..M {
devC[i,k] += devA[i,j] * devB[j,k];
}
}
hostC = devC; // copy from device to host
}
#include <cuda_runtime.h>
#include <iostream>
inline void __checkCudaError(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error at %s:%d: %s\n", file, line, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
#define checkCudaError(call) __checkCudaError((call), __FILE__, __LINE__)
__global__ void matrixMultiplyKernel(const double* A, const double* B, double* C, int N, int M, int P) {
int i = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N && k < P) {
double sum = 0.0f;
for (int j = 0; j < M; ++j) {
sum += A[i * M + j] * B[j * P + k];
}
C[i * P + k] = sum;
}
}
int main() {
// Matrix sizes
int N = 2000, M = 3000, P = 4000;
double *hostA, *hostB, *hostC;
double *devA, *devB, *devC;
// Allocate host memory
hostA = (double*)malloc(N * M * sizeof(double));
hostB = (double*)malloc(M * P * sizeof(double));
hostC = (double*)malloc(N * P * sizeof(double));
// Verify that allocations succeeded
if (hostA == NULL || hostB == NULL || hostC == NULL) {
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
// Data initialization omitted
// Allocate device memory
checkCudaError(cudaMalloc((void**)&devA, N * M * sizeof(double)));
checkCudaError(cudaMalloc((void**)&devB, M * P * sizeof(double)));
checkCudaError(cudaMalloc((void**)&devC, N * P * sizeof(double)));
// Copy matrices from host to device
checkCudaError(cudaMemcpy(devA, hostA, N * M * sizeof(double), cudaMemcpyHostToDevice));
checkCudaError(cudaMemcpy(devB, hostB, M * P * sizeof(double), cudaMemcpyHostToDevice));
// Define block and grid sizes
dim3 blockDim(16, 16);
dim3 gridDim((P + blockDim.x - 1) / blockDim.x, (N + blockDim.y - 1) / blockDim.y);
// Launch the kernel
matrixMultiplyKernel<<<gridDim, blockDim>>>(devA, devB, devC, N, M, P);
checkCudaError(cudaPeekAtLastError());
checkCudaError(cudaDeviceSynchronize());
// Copy result matrix from device to host
checkCudaError(cudaMemcpy(hostC, devC, N * P * sizeof(double), cudaMemcpyDeviceToHost));
// Free device memory
checkCudaError(cudaFree(devA));
checkCudaError(cudaFree(devB));
checkCudaError(cudaFree(devC));
// Free host memory
free(hostA);
free(hostB);
free(hostC);
return 0;
}
/* Implementation based on https://cnugteren.github.io/tutorial/pages/page3.html
This kernel appears in a different file from main */
__kernel void myGEMM1(const int M, const int N, const int P,
const __global float* A,
const __global float* B,
__global float* C) {
// Thread identifiers
const int globalRow = get_global_id(0); // Row ID of C (0..M)
const int globalCol = get_global_id(1); // Col ID of C (0..N)
// Compute a single element (loop over K)
float acc = 0.0f;
for (int k=0; k<P; k++) {
acc += A[k*M + globalRow] * B[globalCol*P + k];
}
// Store the result
C[globalCol*M + globalRow] = acc;
}
void main() {
// Define the matrix dimensions
const int M = 2000;
const int N = 3000;
const int P = 4000;
// Define OpenCL variables
cl_int err;
cl_platform_id platform = 0;
cl_device_id device = 0;
cl_device_id devices[MAX_NUM_DEVICES];
cl_uint numDevices = 0;
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 0, 0};
cl_context context = 0;
cl_command_queue queue = 0;
cl_event event = NULL;
cl_program program = NULL;
char deviceName[MAX_DEVICE_NAME];
// Configure the OpenCL environment
err = clGetPlatformIDs(1, &platform, NULL);
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
device = devices[CURRENT_DEVICE];
props[1] = (cl_context_properties)platform;
context = clCreateContext(props, 1, &device, NULL, NULL, &err);
queue = clCreateCommandQueue(context, device, 0, &err);
err = clGetDeviceInfo(device, CL_DEVICE_NAME, MAX_DEVICE_NAME, deviceName, NULL);
checkError(err,__LINE__);
// Read the kernel file from disk
long sizeHeader, sizeSource;
char* header = readKernelFile(CL_INCLUDE_FILE, &sizeHeader);
char* source = readKernelFile(CL_KERNEL_FILE, &sizeSource);
long size = 2 + sizeHeader + sizeSource;
char* code = (char*)malloc(size*sizeof(char));
for (int c=0; c<size; c++) { code[c] = '\0'; }
strcat(code, header);
strcat(code, source);
const char* constCode = code;
free(header);
free(source);
// Compile the kernel file
program = clCreateProgramWithSource(context, 1, &constCode, NULL, &err);
checkError(err,__LINE__);
err = clBuildProgram(program, 0, NULL, COMPILER_OPTIONS, NULL, NULL);
// Check for compilation errors
size_t logSize;
err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
checkError(err,__LINE__);
char* messages = (char*)malloc((1+logSize)*sizeof(char));
err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, messages, NULL);
checkError(err,__LINE__);
messages[logSize] = '\0';
//if (logSize > 10) { printf("## Compiler message: %s\n", messages); }
free(messages);
// Retrieve the PTX code from the OpenCL compiler and output it to disk
size_t binSize;
err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binSize, NULL);
checkError(err,__LINE__);
unsigned char *bin = (unsigned char *)malloc(binSize);
err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &bin, NULL);
checkError(err,__LINE__);
FILE* file = fopen(CL_PTX_FILE, "wb");
fwrite(bin, sizeof(char), binSize, file);
fclose(file);
free(bin);
// Prepare OpenCL memory objects
cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, M*P*sizeof(*A), NULL, &err);
cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, P*N*sizeof(*B), NULL, &err);
cl_mem bufB_TR = clCreateBuffer(context, CL_MEM_READ_ONLY, N*P*sizeof(*B), NULL, &err);
cl_mem bufC = clCreateBuffer(context, CL_MEM_READ_WRITE, M*N*sizeof(*C), NULL, &err);
checkError(err,__LINE__);
// Copy matrices to the GPU (also C to erase the results of the previous run)
err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M*P*sizeof(*A), A, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, P*N*sizeof(*B), B, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M*N*sizeof(*C), C, 0, NULL, NULL);
checkError(err,__LINE__);
// Configure the myGEMM kernel
char kernelname[100] = "myGEMM1";
cl_kernel kernel1 = clCreateKernel(program, kernelname, &err);
checkError(err,__LINE__);
// Set the kernel arguments
err = clSetKernelArg(kernel1, 0, sizeof(int), (void*)&M);
err = clSetKernelArg(kernel1, 1, sizeof(int), (void*)&N);
err = clSetKernelArg(kernel1, 2, sizeof(int), (void*)&P);
err = clSetKernelArg(kernel1, 3, sizeof(cl_mem), (void*)&bufA);
err = clSetKernelArg(kernel1, 4, sizeof(cl_mem), (void*)&bufB);
err = clSetKernelArg(kernel1, 5, sizeof(cl_mem), (void*)&bufC);
// Configure the thread/work-group dimensions of the myGEMM kernel
const size_t local[2] = { TS, TS };
const size_t global[2] = { (size_t)M, (size_t)N };
// Run the myGEMM kernel
err = clEnqueueNDRangeKernel(queue, kernel1, 2, NULL, global, local, 0, NULL, &event);
// Copy the output matrix C back to the CPU memory
err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M*N*sizeof(*C), C, 0, NULL, NULL);
checkError(err,__LINE__);
// Free the memory objects
free(code);
clReleaseMemObject(bufA);
clReleaseMemObject(bufB);
clReleaseMemObject(bufB_TR);
clReleaseMemObject(bufC);
clReleaseMemObject(bufA_XL);
clReleaseMemObject(bufB_TR_XL);
clReleaseMemObject(bufC_XL);
// Clean-up OpenCL
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseProgram(program);
clReleaseKernel(kernel1);
}