use GPU openCL to convert YUY2 to RGB

cancel
Showing results for 
Show  only  | Search instead for 
Did you mean: 

use GPU openCL to convert YUY2 to RGB

2,474 Views
weideding
Contributor II

    I am using MX8MQ GPU to convert YUY2 to RGB for 1080P@30fps camera. clEnqueueNDRangeKernel executes 30ms,  but  clEnqueueReadBuffer takes long time > 70ms to finish reading. I think this buffer is from part of LPDDR4, and is controlled by GPU, why does clEnqueueReadBuffer need so long time?  Normally, CPU's memcpy 1080x1920x3bytes, just takes 4ms.

 

   use sample code from  arch64-mx8m-poky-linux\imx-gpu-sdk\5.3.0-r0\git\DemoApps\OpenCL\SoftISP:

 

void ConvertToRGBA(const Kernel& kernel, const Buffer& inBuffer, Buffer& outBuffer, const CommandQueue& commandQueue, void* ptr)

{

       ...

      clSetKernelArg(kernel.Get(), 0, sizeof(cl_mem), inBuffer.GetPointer());

      clSetKernelArg(kernel.Get(), 1, sizeof(cl_mem), outBuffer.GetPointer());

      RAPIDOPENCL_CHECK(clEnqueueNDRangeKernel(commandQueue.Get(), kernel.Get(), 2, nullptr, globalWorkSize, localWorkSize, 0, nullptr, &event));

      RAPIDOPENCL_CHECK(clWaitForEvents(1, &event));   //GPU need 30ms

 

      RAPIDOPENCL_CHECK(

        clEnqueueReadBuffer(commandQueue.Get(), outBuffer.Get(), CL_FALSE, 0, sizeof(cl_char) * imgSize * 4, ptr, 0, nullptr, nullptr));  //read >70ms

}

Thanks for your attention!

Labels (1)
0 Kudos
3 Replies

1,956 Views
weideding
Contributor II

This thread has been online waiting for more information until now. We cannot throw our projects built on this powerful IMX8MQ CPU into the garbage: it does not support H264 hardware encode, does not support G2D, and a simple color-space conversion takes such a long time.

0 Kudos

1,956 Views
Bio_TICFSL
NXP TechSupport
NXP TechSupport

Hello Weide,

You can avoid the copy from GPU to CPU by using the clEnqueueMapBuffer function instead of clEnqueueReadBuffer. Using the mapping method, the CPU will map the memory region allocated by the GPU into its own address space. Since both the GPU and the CPU use the same RAM, the address translation is fast. Accessing the buffer should only take a few milliseconds using this method.

For this to work, you also need to create the buffer using CL_MEM_ALLOC_HOST_PTR flag. 

Regards

1,956 Views
weideding
Contributor II

Hi Bio,

    Thanks for your kind help! I tried CL_MEM_ALLOC_HOST_PTR + clEnqueueMapBuffer. It takes 11 ms to get the pointer from data=clEnqueueMapBuffer, which is really fast. But when I do memcpy(tmp, data), it takes 78 ms. So the total time is almost the same as with clEnqueueReadBuffer. I want to know why a simple memcpy takes 78 ms? I have to do the memcpy for OpenCV library use.

   Besides 256MB LPDDR4 to GPU use, does GPU gc7000lite still have its own memory inside GPU? How much is the RAM size inside GPU?

   I attached my test code below:

// Returns a millisecond timestamp from the monotonic clock.
// Wraps around roughly every 49 days; intended only for computing short deltas.
unsigned int mxt_get_ms(void)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	const unsigned long long totalMs =
		static_cast<unsigned long long>(now.tv_sec) * 1000ULL + static_cast<unsigned long long>(now.tv_nsec) / 1000000ULL;
	return static_cast<unsigned int>(totalMs);
}

// Returns the number of device ids attached to the given OpenCL context,
// derived from the byte size of the CL_CONTEXT_DEVICES property.
cl_uint GetDeviceCount(const cl_context context)
{
	std::size_t deviceBytes = 0;
	RAPIDOPENCL_CHECK(clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, nullptr, &deviceBytes));
	const std::size_t deviceCount = deviceBytes / sizeof(cl_device_id);
	return static_cast<cl_uint>(deviceCount);
}

// Selects the first device of the requested type on the platform and returns
// its CL_DEVICE_MAX_COMPUTE_UNITS value.
cl_uint GetNumComputeUnits(const cl_platform_id platform, const cl_device_type deviceType)
{
	// Enumerate every device of the requested type.
	LOGD("Get the Device info and select Device...");
	const auto deviceIds = OpenCLHelper::GetDeviceIDs(platform, deviceType);
	LOGD("# of Devices Available = %lu", deviceIds.size());

	// Query the compute-unit count of the first (target) device.
	cl_uint computeUnits = 0;
	RAPIDOPENCL_CHECK(clGetDeviceInfo(deviceIds[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(computeUnits), &computeUnits, nullptr));
	LOGD("# of Compute Units = %u", computeUnits);
	return computeUnits;
}

double GetExecutionTime(const cl_event event)
{
cl_ulong start;
cl_ulong end;
cl_int err;
err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, nullptr);
err |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);
if (err)
{
return 0;
}
return static_cast<double>(1.0e-6 * (end - start)); // convert nanoseconds to ms
}
}

// Constructor: creates an OpenCL context on the available GPU devices, picks the
// first device id from the context and creates a profiling-enabled command queue
// on it. Throws InitFailedException when no matching OpenCL device is found.
SoftISP::SoftISP(const DemoAppConfig& config)
: DemoAppOpenCL(config)
, m_context()
, m_deviceId(0)
, m_commandQueue()
{
LOGD("Initializing device(s)...");
const cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
// create the OpenCL context on available GPU devices
m_context.Reset(deviceType);

if (GetDeviceCount(m_context.Get()) <= 0)
throw InitFailedException("No OpenCL specific devices!");
// Compute-unit count is only logged; it is not used for work-size selection here.
const cl_uint ciComputeUnitsCount = GetNumComputeUnits(m_context.GetPlatformId(), deviceType);
LOGD("# compute units = %u", ciComputeUnitsCount);

// Fetch the first device id from the context (sizeof(cl_device_id) limits this to one).
LOGD("Getting device id...");
RAPIDOPENCL_CHECK(clGetContextInfo(m_context.Get(), CL_CONTEXT_DEVICES, sizeof(cl_device_id), &m_deviceId, nullptr));

// CL_QUEUE_PROFILING_ENABLE is required for GetExecutionTime() to work on events.
LOGD("Creating Command Queue...");
m_commandQueue.Reset(m_context.Get(), m_deviceId, CL_QUEUE_PROFILING_ENABLE);
}


// No explicit cleanup needed: RAII members release their OpenCL resources.
SoftISP::~SoftISP() = default;


// Builds the isp_kernel.cl program, runs the "yuyv2rgba" kernel once over the
// test image and measures how long it takes to get the RGBA result back to the
// host — either via clEnqueueReadBuffer (MY_USE_HOST_PTR) or via
// clEnqueueMapBuffer + memcpy (the default branch).
void SoftISP::Run()
{
  unsigned int start;

  AllocateMemory(m_context.Get(), m_imgSize);

  const std::string strProgram = GetContentManager()->ReadAllText("isp_kernel.cl");
  ProgramEx program(m_context.Get(), m_deviceId, strProgram);

  const int KERNEL_NUM = 2;
  LOGD("Creating kernels...");
  LOGD("Please wait for compiling and building kernels, about one minute...");
  Kernel kernels[KERNEL_NUM];

  kernels[0].Reset(program.Get(), "yuyv2rgba");

  // arg0/arg1: input images (m_imgSize bytes each), arg2: RGBA output (m_imgSize * 4 bytes).
  clSetKernelArg(kernels[0].Get(), 0, sizeof(cl_mem), m_deviceImg[0].GetPointer());
  clSetKernelArg(kernels[0].Get(), 1, sizeof(cl_mem), m_deviceImg[1].GetPointer());
  clSetKernelArg(kernels[0].Get(), 2, sizeof(cl_mem), m_deviceImg[2].GetPointer());

  const std::size_t globalWorkSizeDiv8[2] = {m_imgWid / 4, m_imgHei / 2};
  const std::size_t localWorkSize32[2] = {8, 4};

  GetContentManager()->ReadAllBytes(m_dst0.data(), m_imgSize, "bayer.data");
  GetContentManager()->ReadAllBytes(m_dst1.data(), m_imgSize, "bayer.data");

  LOGD("clEnqueueWriteBuffer m_dst0");
  RAPIDOPENCL_CHECK(
    clEnqueueWriteBuffer(m_commandQueue.Get(), m_deviceImg[0].Get(), CL_FALSE, 0, sizeof(cl_char) * m_imgSize, m_dst0.data(), 0, nullptr, nullptr));

  LOGD("clEnqueueWriteBuffer m_dst1");
  RAPIDOPENCL_CHECK(
    clEnqueueWriteBuffer(m_commandQueue.Get(), m_deviceImg[1].Get(), CL_FALSE, 0, sizeof(cl_char) * m_imgSize, m_dst1.data(), 0, nullptr, nullptr));

  LOGD("Writing data to gpu buffer done");


  cl_event hEvent;
  LOGD("clEnqueueNDRangeKernel start");

  RAPIDOPENCL_CHECK(
    clEnqueueNDRangeKernel(m_commandQueue.Get(), kernels[0].Get(), 2, nullptr, globalWorkSizeDiv8, localWorkSize32, 0, nullptr, &hEvent));
  RAPIDOPENCL_CHECK(clWaitForEvents(1, &hEvent));

  LOGD("clEnqueueNDRangeKernel done");

  double time = GetExecutionTime(hEvent);
  LOGD("Kernel execution time on GPU (kernel: badpixel): %lf ms", time);

#if MY_USE_HOST_PTR
  LOGD("clEnqueueReadBuffer start");
  start = mxt_get_ms();
  // BUG FIX: the original read m_imgSize * 4 bytes out of m_deviceImg[1], which is
  // only m_imgSize bytes — a buffer overrun. The RGBA result is in m_deviceImg[2].
  // Also use a blocking read (CL_TRUE) so the timing below covers the actual
  // transfer and m_dst2 holds valid data afterwards.
  RAPIDOPENCL_CHECK(
    clEnqueueReadBuffer(m_commandQueue.Get(), m_deviceImg[2].Get(), CL_TRUE, 0, sizeof(cl_char) * m_imgSize * 4, m_dst2.data(), 0, nullptr, nullptr));

  // BUG FIX: 'star' was an undefined identifier (typo for 'start'); %u matches the
  // unsigned delta.
  LOGD("clEnqueueReadBuffer done, take %ums", mxt_get_ms() - start);
#else
  {
    cl_uchar* my_hostPtr;
    cl_int errcode_ret;

    LOGD("clEnqueueMapBuffer start");

    start = mxt_get_ms();
    // Blocking map (CL_TRUE): returns a host-visible pointer into the
    // CL_MEM_ALLOC_HOST_PTR buffer without copying its contents.
    my_hostPtr = (cl_uchar*)clEnqueueMapBuffer(m_commandQueue.Get(), m_deviceImg[2].Get(), CL_TRUE, CL_MAP_READ, 0, sizeof(cl_char) * m_imgSize * 4, 0,
                                               NULL, NULL, &errcode_ret);

    LOGD("clEnqueueMapBuffer done, take %ums", mxt_get_ms() - start);

    doTestCopy(my_hostPtr, sizeof(cl_char) * m_imgSize * 4);
  }
#endif

  LOGD("end all");
}

// Debug helper: logs the memory-object type, creation flags and host pointer of
// an OpenCL buffer.
void SoftISP::dumpMemoryInfo(RapidOpenCL1::Buffer& buffer)
{
  size_t sz;
  // BUG FIX: CL_MEM_TYPE returns a cl_mem_object_type (cl_uint) and CL_MEM_FLAGS
  // returns a cl_mem_flags (cl_ulong). The original queried CL_MEM_FLAGS into a
  // 4-byte cl_int, so clGetMemObjectInfo fails with CL_INVALID_VALUE (param size
  // smaller than the returned value) and 'flag' is left uninitialized.
  cl_mem_object_type type;
  cl_mem_flags flag;
  cl_char* host_ptr;

  buffer.GetMemObjectInfo(CL_MEM_TYPE, sizeof(type), &type, &sz);
  buffer.GetMemObjectInfo(CL_MEM_FLAGS, sizeof(flag), &flag, &sz);
  buffer.GetMemObjectInfo(CL_MEM_HOST_PTR, sizeof(host_ptr), &host_ptr, &sz);
  // Casts keep the varargs matched to the %x/%d specifiers (flag is 64-bit).
  LOGD("cl_mem(0x%p):flag=0x%x type=%d host_ptr=0x%p", buffer.Get(), (unsigned int)flag, (int)type, host_ptr);
}

// Copies 'size' bytes from the mapped GPU buffer into a scratch heap buffer,
// logging how long the memcpy takes and the first 8 bytes of the result.
//   hostPtr - host-visible pointer returned by clEnqueueMapBuffer (may be NULL on failure)
//   size    - number of bytes to copy
void SoftISP::doTestCopy(cl_uchar *hostPtr, size_t size)
{
  // BUG FIX: guard against a NULL mapped pointer (clEnqueueMapBuffer can fail)
  // and against sizes too small for the 8-byte dump below.
  if (hostPtr == NULL || size < 8)
  {
    LOGD("doTestCopy: invalid input");
    return;
  }

  unsigned char *tmp = (unsigned char *)malloc(size);
  // BUG FIX: the malloc result was used unchecked; a failed 8MB allocation
  // would crash inside memcpy.
  if (tmp == NULL)
  {
    LOGD("doTestCopy: malloc failed");
    return;
  }

  unsigned int start = mxt_get_ms();

  LOGD("memcpy start");

  memcpy(tmp, hostPtr, size);

  // %u matches the unsigned delta (the original used %d).
  LOGD("memcpy done, take %ums", mxt_get_ms() - start);

  LOGD("%02x %02x %02x %02x", tmp[0], tmp[1], tmp[2], tmp[3]);
  LOGD("%02x %02x %02x %02x", tmp[4], tmp[5], tmp[6], tmp[7]);

  free(tmp);
}


// Allocates the host staging buffers and the three OpenCL device buffers:
//   m_deviceImg[0], m_deviceImg[1] - input images, 'size' bytes each, wrapping
//                                    the host vectors via CL_MEM_USE_HOST_PTR.
//   m_deviceImg[2]                 - RGBA output, 'size' * 4 bytes; either wraps
//                                    m_dst2 (MY_USE_HOST_PTR) or lets the driver
//                                    allocate host-mappable memory
//                                    (CL_MEM_ALLOC_HOST_PTR) for clEnqueueMapBuffer.
void SoftISP::AllocateMemory(const cl_context context, const std::size_t size)
{
m_dst0.resize(size);
m_dst1.resize(size);
m_dst2.resize(size*4);

m_deviceImg.resize(3);

m_deviceImg[0].Reset(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_char) * size, m_dst0.data());
m_deviceImg[1].Reset(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_char) * size, m_dst1.data());

#if MY_USE_HOST_PTR
m_deviceImg[2].Reset(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_char) * size * 4, m_dst2.data());
#else
// Driver-allocated, host-mappable memory — required for the zero-copy
// clEnqueueMapBuffer path suggested in this thread.
m_deviceImg[2].Reset(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_char) * size * 4, NULL);
#endif

//dumpMemoryInfo(m_deviceImg[2]);
}

LOG:

root@imx8mqevk:/opt/imx-gpu-sdk/OpenCL/SoftISP# ./SoftISP
[928.333][MXT/D] "Initializing device(s)..."
[928.511][MXT/D] "Get the Device info and select Device..."
[928.511][MXT/D] "# of Devices Available = 1"
[928.511][MXT/D] "# of Compute Units = 1"
[928.511][MXT/D] "# compute units = 1"
[928.511][MXT/D] "Getting device id..."
[928.511][MXT/D] "Creating Command Queue..."
[928.857][MXT/D] "Creating kernels..."
[928.857][MXT/D] "Please wait for compiling and building kernels, about one minute..."
[928.943][MXT/D] "clEnqueueWriteBuffer m_dst0"
[929.058][MXT/D] "clEnqueueWriteBuffer m_dst1"
[929.059][MXT/D] "Writing data to gpu buffer done"
[929.059][MXT/D] "clEnqueueNDRangeKernel start"
[929.076][MXT/D] "clEnqueueNDRangeKernel done"
[929.077][MXT/D] "Kernel execution time on GPU (kernel: badpixel): 7.294000 ms"
[929.077][MXT/D] "clEnqueueMapBuffer start"
[929.088][MXT/D] "clEnqueueMapBuffer done, take 11ms"
[929.088][MXT/D] "memcpy start"
[929.166][MXT/D] "memcpy done, take 78ms"
[929.166][MXT/D] "82 82 82 ff"
[929.166][MXT/D] "82 9d 4d ff"
[929.166][MXT/D] "end all"

0 Kudos