Hi all,
I am learning OpenCL on the i.MX6D. I wrote kernel code like this:
/*
 * helloworld - copy one byte per work-item from input1 to output.
 *
 * input1, input2: source images, width * height bytes each, row-major.
 * output:         destination image, width * height bytes, row-major.
 * width, height:  image dimensions in pixels.
 *
 * Work-item mapping: dimension 0 supplies the row (y) and dimension 1 the
 * column (x).
 */
__kernel void helloworld (__global uchar *input1,
                          __global uchar *input2,
                          __global uchar *output,
                          int width,
                          int height)
{
    int y = get_global_id (0);
    int x = get_global_id (1);

    /* Ignore work-items that fall outside the image. */
    if (x >= width || y >= height)
        return;

    int id = (y * width) + x;

    output[id] = input1[id];
    /* Alternative bodies tried by the author; each is valid OpenCL C: */
    // output[id] = input2[id];
    // output[id] = input1[id] + input2[id];
    // output[id] = 0x01;
}
and host code like this:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "../include/CL/cl.h"
/* Status codes returned by FSLCL_LoadKernelSource. */
#define FSLCL_ERROR -1
#define FSLCL_SUCCESS CL_SUCCESS
/* Size limit in bytes (0x100000 = 1 MiB) applied when reading the kernel source file. */
#define MAX_SOURCE_SIZE (0x100000)
/* Device buffers bound in main to kernel arguments 0, 1 and 2 respectively. */
cl_mem buffer_input = NULL;
cl_mem buffer_input2 = NULL;
cl_mem buffer_output = NULL;
/* NOTE(review): buffer_kernel and buffer_size are not referenced by any code visible here. */
cl_kernel buffer_kernel = NULL;
size_t buffer_size = 0;
/* Image dimensions; passed to the kernel as arguments 3 and 4 and used to size the buffers. */
int buffer_width = 1024;
int buffer_height = 1024;
/* OpenCL objects created and released by main. */
cl_platform_id platform_id;
cl_device_id device_id;
cl_context context;
cl_command_queue cq;
cl_program program;
cl_kernel kernel;
/* Text of a kernel source file together with its length. */
typedef struct fsl_kernel_src_str
{
    char *src;  /* buffer holding the source text */
    long size;  /* number of bytes stored in src */
} fsl_kernel_src;
/*
 * Read an OpenCL kernel source file into a newly allocated, NUL-terminated
 * buffer.
 *
 * filename: path of the file to read.
 * kernel:   on success, kernel->src points to the source text and
 *           kernel->size holds its length in bytes (terminator excluded).
 *           On failure, kernel->src is NULL and kernel->size is 0.
 *
 * Returns FSLCL_SUCCESS on success. Returns FSLCL_ERROR if the file cannot
 * be opened, is empty, is larger than MAX_SOURCE_SIZE, cannot be read
 * completely, or if memory cannot be allocated.
 *
 * Ownership: on success the caller must free() kernel->src.
 */
cl_int FSLCL_LoadKernelSource (char *filename, fsl_kernel_src *kernel)
{
    FILE *fp = NULL;
    char *text = NULL;
    long file_size = 0;
    size_t bytes_read = 0;

    kernel->src = NULL;
    kernel->size = 0;

    fp = fopen (filename, "rb");
    if (fp == NULL)
    {
        printf ("\nFailed to open: %s\n", filename);
        return FSLCL_ERROR;
    }

    /* Determine the file length; ftell returns -1 on error. */
    if (fseek (fp, 0, SEEK_END) == 0)
        file_size = ftell (fp);
    else
        file_size = -1;

    if (file_size < 1 || file_size > MAX_SOURCE_SIZE
        || fseek (fp, 0, SEEK_SET) != 0)
    {
        printf ("read file(%s) fail, ret=%ld.\n", filename, file_size);
        fclose (fp);
        return FSLCL_ERROR;
    }

    /* One extra byte for the terminating NUL. */
    text = malloc ((size_t) file_size + 1);
    if (text == NULL)
    {
        printf ("\nError Allocating memory to load CL program");
        fclose (fp);
        return FSLCL_ERROR;
    }

    /* Never read more than was allocated. */
    bytes_read = fread (text, 1, (size_t) file_size, fp);
    fclose (fp);
    if (bytes_read != (size_t) file_size)
    {
        printf ("read file(%s) fail, ret=%ld.\n", filename, (long) bytes_read);
        free (text);
        return FSLCL_ERROR;
    }

    text[bytes_read] = '\0';
    kernel->src = text;
    kernel->size = (long) bytes_read;
    return FSLCL_SUCCESS;
}
int main (int argc, char **argv)
{
int dimension = 2;
size_t global[2] = {buffer_width, buffer_height};
size_t local[2] = {4, 16};
int size_2d = buffer_width * buffer_height;
cl_int ret;
cl_int platforms;
char **data;
char **data2;
char **data0;
int i, j;
data0= (char **) malloc (buffer_width * sizeof (char *));
data = (char **) malloc (buffer_width * sizeof (char *));
data2 = (char **) malloc (buffer_width * sizeof (char *));
if(NULL == data0 || NULL == data || NULL == data2) {
printf("malloc1 err: %x, %x, %x\n", data0, data, data2);
return -1;
}
for (i = 0; i < buffer_width; i++)
{
data0[i] = (char *) malloc (buffer_height * sizeof (char));
data[i] = (char *) malloc (buffer_height * sizeof (char));
data2[i] = (char *) malloc (buffer_height * sizeof (char));
if(NULL == data0[i] || NULL == data[i] || NULL == data2[i]) {
printf("malloc2 err in i%d: %x, %x, %x\n", i, data0[i], data[i], data2[i]);
return -1;
}
}
for (i = 0; i < buffer_width; i++)
{
for (j = 0; j < buffer_height; j++)
{
data0[i][j] = 0;
data[i][j] = 0;
data2[i][j] = 0;
}
}
ret = clGetPlatformIDs (1, &platform_id, &platforms );
printf("clGetPlatformIDs ret = %d\n", ret);
assert (ret == CL_SUCCESS);
cl_int devices;
ret = clGetDeviceIDs (platform_id,CL_DEVICE_TYPE_GPU,1,&device_id, &devices);
assert (ret == CL_SUCCESS);
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0};
context = clCreateContext(properties, devices, &device_id, NULL,NULL,&ret);
assert (ret == CL_SUCCESS);
printf("clCreateContext ret =%d\n", ret);
cq = clCreateCommandQueue(context, device_id, 0, &ret);
assert (ret == CL_SUCCESS);
printf("clCreateCommandQueue ret =%d\n", ret);
fsl_kernel_src app_kernel;
ret = FSLCL_LoadKernelSource ((char *)"helloworld.cl", &app_kernel);
printf("FSLCL_LoadKernelSource ret =%d\n", ret);
// Submit the source code of the kernel to OpenCL
program = clCreateProgramWithSource (context, 1, (const char **)&app_kernel.src, 0,&ret);
printf("clCreateProgramWithSource ret =%d\n", ret);
// and compile it (after this we could extract the compiled version)
if (ret == CL_SUCCESS)
ret = clBuildProgram (program, 1, device_id, NULL, NULL, NULL);
if (ret < 0)
{
printf ("Failed\n");
printf ("\nReturn: %d\n", ret);
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, app_kernel.size, app_kernel.src, NULL);
printf ("\n%s", app_kernel.src);
}
assert(ret == CL_SUCCESS);
buffer_input = clCreateBuffer (context, CL_MEM_READ_ONLY, size_2d, NULL, &ret);
printf("clCreateBuffer for input ret =%d\n", ret);
assert (ret == CL_SUCCESS);
buffer_input2 = clCreateBuffer (context, CL_MEM_READ_ONLY, size_2d, NULL, &ret);
printf("clCreateBuffer for input2 ret =%d\n", ret);
assert (ret == CL_SUCCESS);
buffer_output = clCreateBuffer (context, CL_MEM_WRITE_ONLY , size_2d, NULL, &ret);
assert (ret == CL_SUCCESS);
// get a handle and map parameters for the kernel
kernel = clCreateKernel(program, "helloworld", &ret);
printf("clCreateKernel ret =%d\n", ret);
usleep(100*1000);
assert (ret == CL_SUCCESS);
clSetKernelArg (kernel, 0, sizeof(cl_mem), &buffer_input);
clSetKernelArg (kernel, 1, sizeof(cl_mem), &buffer_input2);
clSetKernelArg (kernel, 2, sizeof(cl_mem), &buffer_output);
clSetKernelArg (kernel, 3, sizeof(int), &buffer_width);
clSetKernelArg (kernel, 4, sizeof(int), &buffer_height);
for (i = 0; i < buffer_width; i++)
{
for (j = 0; j < buffer_height; j++)
{
data0[i][j] = rand () % 10;
data[i][j] = rand () % 10;
}
}
ret = clEnqueueWriteBuffer (cq, buffer_input, CL_TRUE, 0, size_2d, data0, 0, NULL, NULL);
printf("clEnqueueWriteBuffer input1 ret =%d\n", ret);
assert (ret == CL_SUCCESS);
ret = clEnqueueWriteBuffer (cq, buffer_input2, CL_TRUE, 0, size_2d, data, 0, NULL, NULL);
printf("clEnqueueWriteBuffer input2 ret =%d\n", ret);
assert (ret == CL_SUCCESS);
ret = clEnqueueNDRangeKernel(cq, kernel, dimension, NULL, &global, &local, 0, NULL, NULL);
printf("clEnqueueNDRangeKernel ret =%d\n", ret);
assert (ret == CL_SUCCESS);
clFinish(cq);
ret = clEnqueueReadBuffer(cq, buffer_output, CL_TRUE, 0, size_2d, data2, 0, NULL, NULL);
printf("clEnqueueReadBuffer ret =%d\n", ret);
assert (ret == CL_SUCCESS);
printf ("\nResult:\n");
for (i = 0; i < buffer_width; i++)
{
for (j = 0; j < buffer_height; j++)
{
// printf ("\n%d , %d -- %d", data0[i][j], data[i][j], data2[i][j]);
printf("%d, ", data2[i][j]);
}
}
printf ("\n");
clFlush( cq);
clFinish(cq);
clReleaseMemObject (buffer_input);
clReleaseMemObject (buffer_input2);
clReleaseMemObject (buffer_output);
clReleaseContext(context);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cq);
for (i = 0; i < buffer_width; i++)
{
free(data0[i]);
free(data[i]);
free(data2[i]);
}
free(data0);
free(data);
free(data2);
return 0;
}
Question:
If the computation in the kernel code is "output[id] = input1[id];" or "output[id] = input2[id];", the whole program works well.
But if the computation in the kernel code is changed to "output[id] = input1[id] + input2[id];" or "output[id] = 0x01;", the program ends with a segmentation fault. I have confirmed that the line that triggers it is "printf("%d, ", data2[i][j]);" in the host code, but I am not sure what the true cause is. I would really appreciate it if someone could help me.
Hi zheng
please look at hello world example on
OpenCL Hello World | NXP Community
Best regards
igor
-----------------------------------------------------------------------------------------------------------------------
Note: If this post answers your question, please click the Correct Answer button. Thank you!
-----------------------------------------------------------------------------------------------------------------------