Below shows how to bootstrap OpenCL programming in Mac OS X with the boilerplates included.
Compile
OpenCL is supported by clang
compiler, with OpenCL framework:
$ clang -framework opencl -o helloworld helloworld.c
Macros to make your life easier
Error checking macros to guard against failures (opencl-helper.h
)
/* helper.h - OpenCL helper macros
* Copyright 2010 (c) Adrian Sai-wah Tam <adrian.sw.tam@gmail.com>
* Released under GNU LGPL.
*/
#ifndef __OPENCL_HELPER_MACROS__
#define __OPENCL_HELPER_MACROS__
#include <stdio.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>
#define CL_WRAPPER(FUNC) \
{ \
cl_int err = FUNC; \
if (err != CL_SUCCESS) { \
fprintf(stderr, "Error %d executing %s on %s:%d (%s)\n", \
err, #FUNC, __FILE__, __LINE__, cl_error_to_str(err)); \
abort(); \
}; \
}
/* The following macro assumes the assignment will store the error code to err */
int err; // error code returned from api calls
#define CL_ASSIGN(ASSIGNMENT) \
{ \
ASSIGNMENT; \
if (err != CL_SUCCESS) { \
fprintf(stderr, "Error %d executing %s on %s:%d (%s)\n", \
err, #ASSIGNMENT, __FILE__, __LINE__, cl_error_to_str(err)); \
abort(); \
}; \
}
const char *cl_error_to_str(cl_int e)
{
switch (e) {
case CL_SUCCESS: return "success";
case CL_DEVICE_NOT_FOUND: return "device not found";
case CL_DEVICE_NOT_AVAILABLE: return "device not available";
#if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001)
case CL_COMPILER_NOT_AVAILABLE: return "device compiler not available";
#endif
case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "mem object allocation failure";
case CL_OUT_OF_RESOURCES: return "out of resources";
case CL_OUT_OF_HOST_MEMORY: return "out of host memory";
case CL_PROFILING_INFO_NOT_AVAILABLE: return "profiling info not available";
case CL_MEM_COPY_OVERLAP: return "mem copy overlap";
case CL_IMAGE_FORMAT_MISMATCH: return "image format mismatch";
case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "image format not supported";
case CL_BUILD_PROGRAM_FAILURE: return "build program failure";
case CL_MAP_FAILURE: return "map failure";
case CL_INVALID_VALUE: return "invalid value";
case CL_INVALID_DEVICE_TYPE: return "invalid device type";
case CL_INVALID_PLATFORM: return "invalid platform";
case CL_INVALID_DEVICE: return "invalid device";
case CL_INVALID_CONTEXT: return "invalid context";
case CL_INVALID_QUEUE_PROPERTIES: return "invalid queue properties";
case CL_INVALID_COMMAND_QUEUE: return "invalid command queue";
case CL_INVALID_HOST_PTR: return "invalid host ptr";
case CL_INVALID_MEM_OBJECT: return "invalid mem object";
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "invalid image format descriptor";
case CL_INVALID_IMAGE_SIZE: return "invalid image size";
case CL_INVALID_SAMPLER: return "invalid sampler";
case CL_INVALID_BINARY: return "invalid binary";
case CL_INVALID_BUILD_OPTIONS: return "invalid build options";
case CL_INVALID_PROGRAM: return "invalid program";
case CL_INVALID_PROGRAM_EXECUTABLE: return "invalid program executable";
case CL_INVALID_KERNEL_NAME: return "invalid kernel name";
case CL_INVALID_KERNEL_DEFINITION: return "invalid kernel definition";
case CL_INVALID_KERNEL: return "invalid kernel";
case CL_INVALID_ARG_INDEX: return "invalid arg index";
case CL_INVALID_ARG_VALUE: return "invalid arg value";
case CL_INVALID_ARG_SIZE: return "invalid arg size";
case CL_INVALID_KERNEL_ARGS: return "invalid kernel args";
case CL_INVALID_WORK_DIMENSION: return "invalid work dimension";
case CL_INVALID_WORK_GROUP_SIZE: return "invalid work group size";
case CL_INVALID_WORK_ITEM_SIZE: return "invalid work item size";
case CL_INVALID_GLOBAL_OFFSET: return "invalid global offset";
case CL_INVALID_EVENT_WAIT_LIST: return "invalid event wait list";
case CL_INVALID_EVENT: return "invalid event";
case CL_INVALID_OPERATION: return "invalid operation";
case CL_INVALID_GL_OBJECT: return "invalid gl object";
case CL_INVALID_BUFFER_SIZE: return "invalid buffer size";
case CL_INVALID_MIP_LEVEL: return "invalid mip level";
#if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1)
case CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR: return "invalid gl sharegroup reference number";
#endif
#ifdef CL_VERSION_1_1
case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "misaligned sub-buffer offset";
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "exec status error for events in wait list";
case CL_INVALID_GLOBAL_WORK_SIZE: return "invalid global work size";
#endif
default: return "invalid/unknown error code";
}
}
#endif
Device check program
OpenCL provided a series of clGetSomething()
calls to retrieve the device’s
information. The following code does the check (opencl-checkdevice.c
)
/* checkdevice.c - Code to check details of an OpenCL device
* Adrian Sai-wah Tam <adrian.sw.tam@gmail.com>
* Sun, 19 Dec 2010 01:35:33 -0500
*/
#include "helper.h"
int main(int argc, char** argv)
{
cl_uint platform_count; // Number of platforms
cl_uint device_count; // Number of devices
cl_uint totaldevices; // Total number of devices
/*** Detecting platforms and print platform info ***/
CL_WRAPPER( clGetPlatformIDs(0, NULL, &platform_count) );
printf("There are %d platforms found\n", platform_count);
cl_platform_id* platforms = (cl_platform_id*) malloc(platform_count*sizeof(cl_platform_id));
CL_WRAPPER( clGetPlatformIDs(platform_count, platforms, NULL) );
for (cl_uint i = 0; i < platform_count; ++i) {
char buf[1024]; // can't be too small or the list of extensions don't fit
CL_WRAPPER( clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, sizeof(buf), buf, NULL) );
printf(" Platform[%d] profile: %s\n", i, buf);
CL_WRAPPER( clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(buf), buf, NULL) );
printf(" Platform[%d] version: %s\n", i, buf);
CL_WRAPPER( clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(buf), buf, NULL) );
printf(" Platform[%d] name: %s\n", i, buf);
CL_WRAPPER( clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(buf), buf, NULL) );
printf(" Platform[%d] vendor: %s\n", i, buf);
CL_WRAPPER( clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, sizeof(buf), buf, NULL) );
printf(" Platform[%d] extensions: %s\n", i, buf);
/*** Detecting devices ***/
CL_WRAPPER( clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &totaldevices) );
printf("Platform[%d] has %d devices", i, totaldevices);
CL_WRAPPER( clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU, 0, NULL, &device_count) );
printf(" (%d CPU", device_count);
CL_WRAPPER( clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &device_count) );
printf(", %d GPU", device_count);
CL_WRAPPER( clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &device_count) );
printf(", %d accelerator)\n", device_count);
cl_device_id *devices = (cl_device_id*) malloc(totaldevices * sizeof(cl_device_id));
CL_WRAPPER( clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, totaldevices, devices, NULL) );
for (cl_uint j = 0; j < totaldevices; ++j) {
char buf[1024]; // can't be too small or the list of extensions don't fit
long long val;
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(val), &val, NULL) );
printf(" Device %d type: 0x%llx", j, val );
if (val & CL_DEVICE_TYPE_DEFAULT) printf(" default");
if (val & CL_DEVICE_TYPE_CPU) printf(" CPU");
if (val & CL_DEVICE_TYPE_GPU) printf(" GPU");
if (val & CL_DEVICE_TYPE_ACCELERATOR) printf(" accelerator");
printf("\n");
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(buf), buf, NULL) );
printf(" Device %d name: %s by ", j, buf);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(buf), buf, NULL) );
printf("%s (", buf);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR_ID, sizeof(val), &val, NULL) );
printf("0x%llx)\n", val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, sizeof(buf), buf, NULL) );
printf(" Device %d driver version: %s\n", j, buf);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_PROFILE, sizeof(buf), buf, NULL) );
printf(" Device %d profile: %s\n", j, buf);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, sizeof(buf), buf, NULL) );
printf(" Device %d version: %s\n", j, buf);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_EXTENSIONS, sizeof(buf), buf, NULL) );
printf(" Device %d extensions: %s\n", j, buf);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_EXECUTION_CAPABILITIES, sizeof(val), &val, NULL) );
printf(" Device %d execution capabilities: 0x%llx", j, val );
if (val & CL_EXEC_KERNEL) printf(" kernel");
if (val & CL_EXEC_NATIVE_KERNEL) printf(" native");
printf("\n");
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(val), &val, NULL) );
printf(" Device %d global mem cache type: 0x%llx", j, val );
if (val == 0) printf(" None");
if (val == 1) printf(" Read-only");
if (val == 2) printf(" Read-write");
printf("\n");
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_TYPE, sizeof(val), &val, NULL) );
printf(" Device %d local mem type: 0x%llx", j, val );
if (val == 1) printf(" Local");
if (val == 2) printf(" Global");
printf("\n");
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_SINGLE_FP_CONFIG, sizeof(val), &val, NULL) );
printf(" Device %d single FP config: 0x%llx\n", j, val );
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(val), &val, NULL) );
printf(" Device %d double FP config: 0x%llx\n", j, val );
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_QUEUE_PROPERTIES, sizeof(val), &val, NULL) );
printf(" Device %d queue properties: 0x%llx\n", j, val );
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(val), &val, NULL) );
printf(" Device %d max compute units: %lld\n", j, val );
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(val), &val, NULL) );
printf(" Device %d max work group size: %lld\n", j, val );
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(val), &val, NULL) );
printf(" Device %d max work item dimensions: (", j);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(long long)*val, buf, NULL) );
for (int k=0; k<val; ++k) {
if (k > 0) printf(", ");
printf("%lld", ((long long*)buf)[k]);
};
printf(")\n");
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(val), &val, NULL) );
printf(" Device %d preferred vector width: char(%lld), ", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(val), &val, NULL) );
printf("short(%lld), ", val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(val), &val, NULL) );
printf("int(%lld), ", val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(val), &val, NULL) );
printf("long(%lld), ", val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(val), &val, NULL) );
printf("float(%lld), ", val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(val), &val, NULL) );
printf("double(%lld)\n", val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_ADDRESS_BITS, sizeof(val), &val, NULL) );
printf(" Device %d address bits: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE_SUPPORT, sizeof(val), &val, NULL) );
printf(" Device %d image support: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(val), &val, NULL) );
printf(" Device %d max clock frequency: %lld MHz\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(val), &val, NULL) );
printf(" Device %d max malloc size: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(val), &val, NULL) );
printf(" Device %d max read image args: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(val), &val, NULL) );
printf(" Device %d max write image args: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(val), &val, NULL) );
printf(" Device %d max width * height for 2D image: %lld * ", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(val), &val, NULL) );
printf("%lld\n", val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(val), &val, NULL) );
printf(" Device %d max width * height * depth for 3D image: %lld * ", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(val), &val, NULL) );
printf("%lld * ", val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(val), &val, NULL) );
printf("%lld\n", val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MAX_SAMPLERS, sizeof(val), &val, NULL) );
printf(" Device %d max samplers: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MAX_PARAMETER_SIZE, sizeof(val), &val, NULL) );
printf(" Device %d max parameter size: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(val), &val, NULL) );
printf(" Device %d mem base addr align: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, sizeof(val), &val, NULL) );
printf(" Device %d min data type align size: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(val), &val, NULL) );
printf(" Device %d global mem: size %lld bytes, ", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(val), &val, NULL) );
printf("cache size %lld bytes, ", val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(val), &val, NULL) );
printf("cacheline size %lld bytes\n", val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(val), &val, NULL) );
printf(" Device %d max constant buffer size: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(val), &val, NULL) );
printf(" Device %d max constant args: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(val), &val, NULL) );
printf(" Device %d local mem size: %lld bytes\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(val), &val, NULL) );
printf(" Device %d error correction support: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(val), &val, NULL) );
printf(" Device %d profiling timer resolution: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_ENDIAN_LITTLE, sizeof(val), &val, NULL) );
printf(" Device %d little endian: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_COMPILER_AVAILABLE, sizeof(val), &val, NULL) );
printf(" Device %d compiler available: %lld\n", j, val);
CL_WRAPPER( clGetDeviceInfo(devices[j], CL_DEVICE_AVAILABLE, sizeof(val), &val, NULL) );
printf(" Device %d available: %lld\n", j, val);
};
free(devices);
};
free(platforms);
return 0;
}
Hello world program
The following code runs a hello world program on the first GPU
(opencl-helloworld.c
)
/* helloworld.c - Hello World program for OpenCL, modified from an example from the web
* Adrian Sai-wah Tam <adrian.sw.tam@gmail.com>
* Sun, 19 Dec 2010 01:35:33 -0500
*/
#include "helper.h"
const char *KernelSource =
"__kernel square(__global float* input, \n" \
" __global float* output, \n" \
" const unsigned int count) \n" \
"{ \n" \
" int i = get_global_id(0); \n" \
" if (i < count) \n" \
" output[i] = input[i] * input[i]; \n" \
"} \n";
int main(int argc, char** argv)
{
const int count = 4096; // Size of data
float data[count]; // Input data set
float results[count]; // Output data set
cl_device_id device_id; // Compute device
cl_context context; // Compute context
cl_command_queue commands; // Compute command queue
cl_program program; // Compute program
cl_kernel kernel; // Compute kernel
cl_mem input; // Device memory used for the input array
cl_mem output; // Device memory used for the output array
/*** Fill input data with random float values ***/
for(int i = 0; i < count; i++)
data[i] = rand() / (float)RAND_MAX;
/*** Prepare a compute device ***/
const int gpu = 1;
printf("Connecting to the 1st %s device\n", gpu ? "GPU" : "CPU");
CL_WRAPPER( clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL) );
printf("Creating context on the compute device\n");
CL_ASSIGN( context = clCreateContext(0, 1, &device_id, NULL, NULL, &err) );
const int do_profiling = 0;
cl_command_queue_properties qprop = 0;
if (do_profiling) {
qprop |= CL_QUEUE_PROFILING_ENABLE;
}
printf("Creating command queue %s profiling\n", do_profiling ? "with" : "without");
CL_ASSIGN( commands = clCreateCommandQueue(context, device_id, qprop, &err) );
/*** Prepare kernel code ***/
printf("Making program object with one kernel:\n%s", KernelSource);
CL_ASSIGN( program = clCreateProgramWithSource(context, 1, (const char**)&KernelSource, NULL, &err) );
printf("Compiling and linking program executable\n");
char const *options = NULL;
err = clBuildProgram(program, 0, NULL, options, NULL, NULL);
if (err != CL_SUCCESS) {
size_t len;
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
char* buffer = (char*) malloc(len);
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
printf("Error %d: Failed to build program executable (%s)\n", err, cl_error_to_str(err));
free(buffer);
abort();
}
/*** Preparing kernel function and I/O memory ***/
char* kernel_name = "square"; // name of the function in the kernel code
printf("Extracting the __kernel function named `%s'\n", kernel_name);
CL_ASSIGN( kernel = clCreateKernel(program, kernel_name, &err) );
printf("Preparing input and output memory in device\n");
CL_ASSIGN( input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL) );
CL_ASSIGN( output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL) );
printf("Transferring data to input memory in device\n");
CL_WRAPPER( clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL) );
/*** Prepare to run: Set arguments, set workgroup size ***/
printf("Setting arguments to kernel\n");
CL_WRAPPER(clSetKernelArg(kernel, 0, sizeof(cl_mem), &input) |
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output) |
clSetKernelArg(kernel, 2, sizeof(unsigned int), &count)
);
size_t global, local; // Size of context and size of workgroup
CL_WRAPPER( clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL) );
printf("Setting the kernel to run on maximum workgroup size %lu over a global data set of size %u\n", local, count);
/*** Run the kernel ***/
global = count;
CL_WRAPPER( clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL) );
printf("Kernel running\n");
clFinish(commands); // Wait for the command queue finish
/*** Retrieve output from device memory, and verify result ***/
printf("Retrieving output data from device\n");
CL_WRAPPER( clEnqueueReadBuffer(commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ) );
printf("Validating results\n");
int correct = 0;
for (int i = 0; i < count; i++) {
if (results[i] == data[i] * data[i])
correct++;
}
printf("Computed '%d/%d' correct values!\n", correct, count);
/*** Clean up ***/
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
Models
Computation Model
- Each piece of metal is a host
- Each host contains several OpenCL compute devices (e.g. GPU)
- Each compute device contains several compute units (e.g. workgroup)
- Each compute unit contains several processing elements (e.g. workitem, ALU in GPU)
Memory model
- Global memory: Memory on GPU board, accessible by everyone
- Local memory: Local to compute unit, accessible by processing elements in the same compute unit
- Constant memory: A piece of memory expected to be read only. If declared a memory to be constant, it is also supposed to fit into a constant memory at runtime. Otherwise, use global memory which would be slower but not bounded by the local memory size.
- Private memory: Memory local to a work item
Debug
Dump log messages to stdout (otherwise hidden):
export CL_LOG_ERRORS=stdout
When you found the command queue or context destroyed unexpectedly, it is likely that the kernel access to an unallocated memory.