티스토리 뷰
OpenCL을 사용해 배열 A와 배열 B의 각 원소를 더한 값을 배열 C에 저장하는 간단한 예제를 작성해 보았다.
일반적인 C 프로그래밍에 비해 플랫폼, 디바이스, 컨텍스트, 커맨드 큐 등 설정해야 할 것이 많아서 상대적으로 난이도가 높은 것 같다.
아래는 실행코드이다.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <strings.h>
#include <CL/cl.h>
void vec_add(float *A, float *B, float *C, int N);
void verify(float *C1, float *C2, int N);
void vec_add_opencl(float *A, float *B, float *C, int N);
/*
 * Entry point of the vector-addition example.
 *
 * Usage: vec_add [N]
 *   N  number of elements per vector (defaults to 4096, must be positive).
 *
 * Fills A and B with pseudo-random values, computes A + B once through
 * vec_add_opencl() and once through the host loop vec_add(), then compares
 * the two results with verify().
 *
 * Returns EXIT_SUCCESS after the comparison has run, or EXIT_FAILURE when N is
 * not a positive integer or the host buffers cannot be allocated.
 */
int main(int argc, char *argv[])
{
    int i;
    int N = 4096;

    if (argc == 2) N = atoi(argv[1]);

    /* atoi() yields 0 for non-numeric text; negative sizes are meaningless. */
    if (N <= 0)
    {
        fprintf(stderr, "Usage: %s [N]  (N must be a positive integer)\n", argv[0]);
        return EXIT_FAILURE;
    }

    float *A = (float *)malloc(sizeof(float) * N);
    float *B = (float *)malloc(sizeof(float) * N);
    float *C1 = (float *)malloc(sizeof(float) * N);
    float *C2 = (float *)malloc(sizeof(float) * N);

    if (A == NULL || B == NULL || C1 == NULL || C2 == NULL)
    {
        fprintf(stderr, "Failed to allocate host buffers for N = %d\n", N);
        /* free(NULL) is a no-op, so releasing all four is safe. */
        free(A);
        free(B);
        free(C1);
        free(C2);
        return EXIT_FAILURE;
    }

    /* Fixed seed so that every run uses the same input data. */
    srand(0);
    for (i = 0; i < N; i++)
    {
        A[i] = (float)(rand() % 1000) / 100.0f;
        B[i] = (float)(rand() % 1000) / 100.0f;
    }

    printf("== Vector Addition ==\n");
    printf("N : %d\n", N);

    vec_add_opencl(A, B, C1, N);  /* result under test */
    vec_add(A, B, C2, N);         /* host reference result */
    verify(C1, C2, N);

    free(A);
    free(B);
    free(C1);
    free(C2);
    return EXIT_SUCCESS;
}
/*
 * Host-side reference implementation of element-wise vector addition.
 *
 * Writes A[k] + B[k] into C[k] for every k in [0, N). Nothing is written
 * when N is zero or negative.
 */
void vec_add(float *A, float *B, float *C, int N)
{
    const float *lhs = A;
    const float *rhs = B;
    float *out = C;
    int remaining;

    for (remaining = N; remaining > 0; --remaining)
    {
        *out = *lhs + *rhs;
        ++lhs;
        ++rhs;
        ++out;
    }
}
/*
 * Compare two result vectors element by element.
 *
 * Every index whose values differ is reported on stderr together with both
 * values. "PASSED." is printed on stdout only when no index differed.
 */
void verify(float *C1, float *C2, int N)
{
    int mismatches = 0;
    int idx = 0;

    while (idx < N)
    {
        const float got = C1[idx];
        const float want = C2[idx];

        if (got != want)
        {
            fprintf(stderr, "%d: C1=%.2f vs. C2=%.2f\n", idx, got, want);
            ++mismatches;
        }
        ++idx;
    }

    if (mismatches != 0)
        return;

    printf("PASSED.\n");
}
/*
 * Abort the program when an OpenCL call did not return CL_SUCCESS.
 *
 * Prints the source file, the line of the macro invocation and the numeric
 * error code to stderr, then exits with EXIT_FAILURE.
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK_ERROR(x);`
 * behaves as a single statement, including directly before an `else`.
 * The last line of the definition must not end with a backslash, otherwise
 * the line that follows the macro would be spliced into it.
 */
#define CHECK_ERROR(err)                                                      \
    do {                                                                      \
        if ((err) != CL_SUCCESS) {                                            \
            fprintf(stderr, "[%s:%d] ERROR : %d\n", __FILE__, __LINE__,       \
                    (int)(err));                                              \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
void vec_add_opencl(float *A, float *B, float *C, int N)
{
cl_platform_id *platform;
cl_device_type dev_type = CL_DEVICE_TYPE_DEFAULT;
cl_device_id dev;
cl_context context;
cl_command_queue cmd_queue;
cl_program program;
cl_kernel kernel;
cl_mem mem_A, mem_B, mem_C;
cl_int err;
cl_uint num_platforms;
cl_uint num_dev = 0;
int i;
//Get the device type to use from the environmental variable.
char *dtype = getenv("CL_DEV_TYPE");
if(dtype)
{
if(strcasecmp(dtype, "cpu") == 0)
{
dev_type = CL_DEVICE_TYPE_CPU;
} else if(strcasecmp(dtype, "gpu") == 0)
{
dev_type = CL_DEVICE_TYPE_GPU;
}
}
// Platform
err = clGetPlatformIDs(0, NULL, &num_platforms);
CHECK_ERROR(err);
if(num_platforms == 0)
{
fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
printf("Number of platforms : %u\n", num_platforms);
platform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * num_platforms);
err = clGetPlatformIDs(num_platforms, platform, NULL);
CHECK_ERROR(err);
// Device
for( i = 0; i < num_platforms ; i++)
{
err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev);
if(err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err);
if(num_dev == 1) break;
}
if(num_dev < 1)
{
fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
// Print the device name.
size_t name_size;
clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, NULL, &name_size);
char *dev_name = (char *)malloc(name_size + 1);
err = clGetDeviceInfo(dev, CL_DEVICE_NAME, name_size, dev_name, NULL);
CHECK_ERROR(err);
printf("Device : %s\n", dev_name);
free(dev_name);
// Context
context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
CHECK_ERROR(err);
// Command queue
cmd_queue = clCreateCommandQueue(context, dev, 0, &err);
CHECK_ERROR(err);
// Create a program.
char *source_code = "__kernel void vec_add(__global float *A, __global float *B, __global float *C, int N){\n"
" int i = get_global_id(0);\n"
" if (i < N) {\n"
" C[i] = A[i] + B[i];\n"
" }\n"
"}\n";
size_t source_len = strlen(source_code);
program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_len, &err);
CHECK_ERROR(err);
// Build the program.
err = clBuildProgram(program, 1, &dev, NULL, NULL, NULL);
if(err != CL_SUCCESS)
{
//Print the build log.
size_t log_size;
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char*)malloc(log_size + 1);
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
fprintf(stderr, "\n-----------BUILD LOG------------\n%s\n",log);
fprintf(stderr, "-----------------------------------------\n");
free(log);
CHECK_ERROR(err);
}
// kernel
kernel = clCreateKernel(program, "vec_add", &err);
CHECK_ERROR(err);
// Buffers
mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * N, A, &err);
CHECK_ERROR(err);
mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * N, B, &err);
CHECK_ERROR(err);
mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * N, NULL, &err);
CHECK_ERROR(err);
// Set the arguments.
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A);
CHECK_ERROR(err);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B);
CHECK_ERROR(err);
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C);
CHECK_ERROR(err);
err = clSetKernelArg(kernel, 3, sizeof(int), &N);
CHECK_ERROR(err);
// Enqueue the kernel.
size_t lws[1] = {128};
size_t gws[1];
gws[0] = (size_t)ceil((double)N / lws[0]) * lws[0];
err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, gws, lws, 0, NULL, NULL);
CHECK_ERROR(err);
// Read the result.
err = clEnqueueReadBuffer(cmd_queue, mem_C, CL_TRUE, 0, sizeof(float) * N, C, 0, NULL, NULL);
CHECK_ERROR(err);
// Release
clReleaseMemObject(mem_A);
clReleaseMemObject(mem_B);
clReleaseMemObject(mem_C);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);
free(platform);
}
// Makefile도 같이 올린다
# Build rules for the OpenCL vector-addition example.
TARGET = vec_add
OBJS = vec_add.o
CC = gcc
INCLUDES =
CFLAGS = -O2 -Wall $(INCLUDES)
LDFLAGS =
# Libraries are kept in LDLIBS so they come after the object files on the
# link line; the linker resolves symbols left to right, so libraries listed
# before the objects that use them can be dropped.
LDLIBS = -lOpenCL -lm

# These targets do not produce files of the same name.
.PHONY: all clean

all: $(TARGET)

$(TARGET): $(OBJS)
	$(CC) $(LDFLAGS) $(OBJS) -o $@ $(LDLIBS)

.c.o:
	$(CC) $(CFLAGS) -c $< -o $@

clean:
	rm -f $(OBJS) $(TARGET)