I don't have physical access to the device over the holiday period, so it will be at least a week or possibly two before I can update the OS. If you happen to have a device to hand, the C code below provides the signed-vs-unsigned tests as images, using the cf4ocl framework, which is portable and very lightweight https://github.com/nunofachada/cf4ocl
Just compile and run checkvec_image.c in the same folder as checkvec_image.cl - it will prompt for which device on the i.MX8MP you wish to use (GC7000UL or VIP8000Nano), and write out 4 .png images demonstrating the 4 test-cases in the .cl file. checkvec_image-0.png should contain only a single upper-diagonal triangle (like images 1 and 2), but instead looks like checkvec_image-3.png which is using the unsigned comparison.
checkvec_image.c:
#define KERNEL_FILENAME "./checkvec_image.cl"
#define KERNEL_NAME "checkvec_image"
#define IMAGE_FILENAME_BASE "./checkvec_image"
/**
* @file
* Testing possible i.MX8MPlus GPU bug
*
* @note
* Developed using cf4ocl (C Framework for OpenCL)
* Based on the cf4ocl image_filter example, this generates an output image
* Requires OpenCL >= 1.1.
*
* @author Shamus Husheer
*/
/*
* Description
* -----------
*
* Calls the checkvec_image kernel, saves each cl_short as 0x00 if ==0 else 0xFF to one of 4 8-bit 1024x1024 PNG files
*
* */
#define _POSIX_C_SOURCE 200112L
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <cf4ocl2.h>
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
/* Error handling macros. */
#define ERROR_MSG_AND_EXIT(msg) \
do { fprintf(stderr, "\n%s\n", msg); exit(EXIT_FAILURE); } while(0)
#define HANDLE_ERROR(err) \
if (err != NULL) { ERROR_MSG_AND_EXIT(err->message); }
#define HANDLE_NONZERO_ERROR(err) \
if (err != 0) { fprintf(stderr, "\nError code: %d\n", err); exit(EXIT_FAILURE); }
#define HANDLE_NULL_ERROR(ptr) \
if (ptr == NULL) { fprintf(stderr, "\nNULL pointer returned\n"); exit(EXIT_FAILURE); }
/**
* Example main function.
* */
int main(int argc, char* argv[]) {
/* Wrappers for OpenCL objects. */
CCLContext* ctx;
CCLDevice* dev;
CCLQueue* queue;
CCLProgram* prg;
CCLKernel* krnl;
/* Device selected specified in the command line. */
int dev_idx = -1;
/* Error handling object (must be initialized to NULL). */
CCLErr* err = NULL;
/* Profiling object. */
CCLProf* prof;
/* Image data in host. */
cl_short * output_image = NULL;
/* Image data on device. */
CCLBuffer * output_image_device = NULL;
/* Image properties. */
int width=1024, height=1024, n_channels=4;
/* Real worksize. */
size_t real_ws[2];
/* Global and local worksizes. */
size_t gws[2];
size_t lws[2];
/* Check arguments. */
if (argc < 2) {
ERROR_MSG_AND_EXIT("Usage: checkvec_image [device_index]");
} else if (argc >= 3) {
/* Check if a device was specified in the command line. */
dev_idx = atoi(argv[2]);
}
/* Allocate space for output image on host, aligned to 64byte boundary to enable zero-copy as per I.MX_GRAPHICS_USERS_GUIDE section 5.5. */
HANDLE_NONZERO_ERROR(posix_memalign((void*)&output_image, 64, n_channels*height*width*2));
/* Real work size (kernel operates on short4 outputs). */
real_ws[0] = width/4; real_ws[1] = height;
/* Create context using device selected from menu. */
ctx = ccl_context_new_from_menu_full(&dev_idx, &err);
HANDLE_ERROR(err);
/* Get first device in context. */
dev = ccl_context_get_device(ctx, 0, &err);
HANDLE_ERROR(err);
/* Create a command queue. */
queue = ccl_queue_new(ctx, dev, CL_QUEUE_PROFILING_ENABLE, &err);
HANDLE_ERROR(err);
/* Instantiate and initialize device buffers. */
output_image_device = ccl_buffer_new(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, n_channels*height*width*2, output_image, &err);
HANDLE_ERROR(err);
/* Create program from kernel source file and compile it. */
prg = ccl_program_new_from_source_file(ctx, KERNEL_FILENAME, &err);
HANDLE_ERROR(err);
ccl_program_build(prg, NULL, &err);
if (err != NULL)
{
CCLErr* err2 = NULL;
fprintf(stderr, "\n%s\nBuild Error: Log Start\n", err->message);
char* log = ccl_program_get_build_info_array(prg, dev, CL_PROGRAM_BUILD_LOG, char*, &err2);
fprintf(stderr, "\n%s\nBuild Log End\n", log);
HANDLE_ERROR(err2);
}
HANDLE_ERROR(err);
/* Get kernel wrapper. */
krnl = ccl_program_get_kernel(prg, KERNEL_NAME, &err);
HANDLE_ERROR(err);
/* Determine nice local and global worksizes. */
ccl_kernel_suggest_worksizes(krnl, dev, 2, real_ws, gws, lws, &err);
HANDLE_ERROR(err);
/* Show information to user. */
printf("\n * Image size: %d x %d, %d channels\n",
width, height, n_channels);
printf(" * Global work-size: (%d, %d)\n", (int) gws[0], (int) gws[1]);
printf(" * Local work-size: (%d, %d)\n", (int) lws[0], (int) lws[1]);
/* Start profiling. */
prof = ccl_prof_new();
ccl_prof_start(prof);
/* Apply kernel. */
ccl_kernel_set_args_and_enqueue_ndrange(
krnl, queue, 2, NULL, gws, lws, NULL, &err,
output_image_device, NULL);
HANDLE_ERROR(err);
/* Prepare to read image data back to host. This is zero-copy on Unified Memory model such as I.MX8. */
ccl_buffer_enqueue_map(output_image_device, queue, CL_TRUE, CL_MAP_WRITE,
0, n_channels*height*width*2, NULL, NULL, &err);
HANDLE_ERROR(err);
/* Stop profiling timer and add queues for analysis. */
ccl_prof_stop(prof);
ccl_prof_add_queue(prof, "Queue1", queue);
/* Write 8-bit images to files. */
cl_uchar * file_image = NULL;
file_image = (cl_uchar *) malloc(height*width);
HANDLE_NULL_ERROR(file_image);
for(int channel=0; channel<n_channels; channel++)
{
for(int idx =0; idx<height*width; idx++) {
cl_short val = output_image[idx+channel*width*height];
// Create black-and-white PNG image, white if val!=0 (because scalar TRUE = 1, vector TRUE = -1)
file_image[idx]=(val==0?0:0xFF);
}
char filename[sizeof(IMAGE_FILENAME_BASE)+8];
sprintf(filename, "%s-%d.png", IMAGE_FILENAME_BASE, channel);
int file_write_status = 0;
file_write_status = stbi_write_png(filename, width, height, 1, file_image, width);
if (file_write_status) {
fprintf(stdout, "\nImage saved in file %s\n",filename);
} else {
ERROR_MSG_AND_EXIT("Unable to save image.");
}
}
free(file_image);
/* Process profiling info. */
ccl_prof_calc(prof, &err);
HANDLE_ERROR(err);
/* Print profiling info. */
ccl_prof_print_summary(prof);
/* Release host images. */
free(output_image);
/* Release wrappers. */
ccl_buffer_destroy(output_image_device);
ccl_program_destroy(prg);
ccl_queue_destroy(queue);
ccl_context_destroy(ctx);
/* Destroy profiler. */
ccl_prof_destroy(prof);
/* Check all wrappers have been destroyed. */
assert(ccl_wrapper_memcheck());
/* Terminate. */
return EXIT_SUCCESS;
}
checkvec_image.cl
__kernel void checkvec_image(__global short4 *output_buff) {
int x = get_global_id(0);
int y = get_global_id(1);
if ((x < 1024/4) && (y < 1024)) {
short xs = ((short)x*4)-512;
short4 xs4 = (short4)(xs,xs+1,xs+2,xs+3);
short ys = (short)(y-512);
short4 s4veccomp = xs4>=ys;
int4 i4veccomp = convert_int4(xs4)>=(int)ys;
short4 scalecmp;
scalecmp.s0 = xs4.s0>ys;
scalecmp.s1 = xs4.s1>ys;
scalecmp.s2 = xs4.s2>ys;
scalecmp.s3 = xs4.s3>ys;
ushort4 xus4 = convert_ushort4(xs4);
ushort yus = (ushort)ys;
short4 us4veccomp = xus4>=yus;
output_buff[0*1024*256 + y*256 + x] = s4veccomp;
output_buff[1*1024*256 + y*256 + x] = convert_short4(i4veccomp);
output_buff[2*1024*256 + y*256 + x] = scalecmp;
output_buff[3*1024*256 + y*256 + x] = us4veccomp;
}
}