Abstract This is a small tutorial about running a simple OpenCL application in an i.MX6Q. It covers a very small introduction to OpenCL, the explanation of the code and how to compile and run it. Requirements Any i.MX6Q board. Linux BSP with the gpu-viv-bin-mx6q package (for instructions on how to build the BSP, check the BSP Users Guide) OpenCL overview OpenCL allows any program to use the GPGPU features of the GC2000 (General-Purpose Computing on Graphics Processing Units) that means to use the i.MX6Q GPU processing power in any program. OpenCL uses kernels which are functions that can be executed in the GPU. These functions must be written in a C99 like code. In our current GPU there is no scheduling so each kernel will execute in a FIFO fashion. iMx6Q GPU is OpenCL 1.1 EP conformant. The Code The example provided here performs a simple addition of arrays in the GPU. The header needed to use openCL is cl.h and is under /usr/include/CL in your BSP rootfs when you install the gpu-viv-bin-mx6q package. The header is typically included like this: #include <CL/cl.h> The libraries needed to link the program are libGAL.so and libOpenCL.so those are under /usr/lib in your BSP rootfs. For details on the OpenCL API check the khronos page: http://www.khronos.org/opencl/ Our kernel source is as follows: __kernel void VectorAdd(__global int* c, __global int* a,__global int* b) { // Index of the elements to add unsigned int n = get_global_id(0); // Sum the nth element of vectors a and b and store in c c[n] = a[n] + b[n]; } The kernel is declared with the signature __kernel void VectorAdd(__global int* c, __global int* a,__global int* b). This takes vectors a and b as arguments adds them and stores the result in the vector c. It looks like a normal C99 method except for the keywords kernel and global. kernel tells the compiler this function is a kernel, global tells the compiler this attributes are of global address space. get_global_id built-in function This function will tell us to which index of the vector this kernel corresponds to. And in the last line the vectors are added. Below is the full source code commented. //************************************************************ // Demo OpenCL application to compute a simple vector addition // computation between 2 arrays on the GPU // ************************************************************ #include <stdio.h> #include <stdlib.h> #include <CL/cl.h> // // OpenCL source code const char* OpenCLSource[] = { "__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)", "{", " // Index of the elements to add \n", " unsigned int n = get_global_id(0);", " // Sum the nth element of vectors a and b and store in c \n", " c[n] = a[n] + b[n];", "}" }; // Some interesting data for the vectors int InitialData1[20] = {37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17}; int InitialData2[20] = {35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15}; // Number of elements in the vectors to be added #define SIZE 100 // Main function // ************************************************************ int main(int argc, char **argv) { // Two integer source vectors in Host memory int HostVector1[SIZE], HostVector2[SIZE]; //Output Vector int HostOutputVector[SIZE]; // Initialize with some interesting repeating data for(int c = 0; c < SIZE; c++) { HostVector1[c] = InitialData1[c%20]; HostVector2[c] = InitialData2[c%20]; HostOutputVector[c] = 0; } //Get an OpenCL platform cl_platform_id cpPlatform; clGetPlatformIDs(1, &cpPlatform, NULL); // Get a GPU device cl_device_id cdDevice; clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); char cBuffer[1024]; clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL); printf("CL_DEVICE_NAME: %s\n", cBuffer); clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL); printf("CL_DRIVER_VERSION: %s\n\n", cBuffer); // Create a context to run OpenCL enabled GPU cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL); // Create a command-queue on the GPU device cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, 0, NULL); // Allocate GPU memory for source vectors AND initialize from CPU memory cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector1, NULL); cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector2, NULL); // Allocate output memory on GPU cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(int) * SIZE, NULL, NULL); // Create OpenCL program with source code cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, OpenCLSource, NULL, NULL); // Build the program (OpenCL JIT compilation) clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL); // Create a handle to the compiled OpenCL function (Kernel) cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "VectorAdd", NULL); // In the next step we associate the GPU memory with the Kernel arguments clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector); clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1); clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2); // Launch the Kernel on the GPU // This kernel only uses global data size_t WorkSize[1] = {SIZE}; // one dimensional Range clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, NULL); // Copy the output in GPU memory back to CPU memory clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(int), HostOutputVector, 0, NULL, NULL); // Cleanup clReleaseKernel(OpenCLVectorAdd); clReleaseProgram(OpenCLProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(GPUContext); clReleaseMemObject(GPUVector1); clReleaseMemObject(GPUVector2); clReleaseMemObject(GPUOutputVector); for( int i =0 ; i < SIZE; i++) printf("[%d + %d = %d]\n",HostVector1[i], HostVector2[i], HostOutputVector[i]); return 0; } How to compile in Host Get to your ltib folder and run $./ltib m shell This way you will be using the cross compiler ltib uses and the default include and lib directories will be the ones in your bsp. Then run LTIB> gcc cl_sample.c -lGAL -lOpenCL -o cl_sample. How to run in the i.MX6Q Insert the GPU module root@freescale/home/user $ modprobe galcore Copy the compiled CL program and then run root@freescale /home/user$ ./cl_sample References [1] ttp://www.khronos.org/opencl/ Original Attachment has been moved to: libOpenCL.so.zip Original Attachment has been moved to: libCLC_Android.so.zip Original Attachment has been moved to: libOpenCL_Android.so.zip Original Attachment has been moved to: libCLC.so.zip
查看全文