i.MX8MP / GC7000UL OpenCL char,short vector comparisons give incorrect results

shusheer · ‎12-24-2022

OpenCL kernels containing char or short vector comparisons, to other vectors or constants, give incorrect results under certain circumstances.

IMX8MPLUS #GC7000UL #VIP8000Nano

This snippet should give comparison (>=0) of (short4)(True, True, False, False). However, it actually gives (short4)(True, True, True, True). Additional tests below of different vector lengths and types.

int x = get_global_id(0); // this is zero, global worksize=1, but if constant used the calculations optimize away
short ival = (short)x*4;
short4 input = (short4)(ival,ival+1,0-(ival+2),0-(ival+3));
short4 veccomp = input>=(short)0; // this should be (-1,-1,0,0) but is (-1,-1,-1,-1)

This is on i.MX8MP GPU (GC7000UL.6204.0000) and NPU (VIP8000Nano-S+I.8002.0000), on Linux kernel 5.10.72-lts-5.10.y. The following is abbreviated clinfo output using /opt/viv_samples/cl11/UnitTest/clinfo

CL_PLATFORM_NAME: Vivante OpenCL Platform
CL_PLATFORM_PROFILE: FULL_PROFILE
CL_PLATFORM_VERSION: OpenCL 3.0 V6.4.3.p2.336687
...
CL_DEVICE_NAME: Vivante OpenCL Device GC7000UL.6204.0000
CL_DEVICE_VENDOR: Vivante Corporation
CL_DEVICE_TYPE: GPU 
CL_DEVICE_OPENCL_C_VERSION: OpenCL C 1.2 
CL_DEVICE_VENDOR_ID: 0x00564956
CL_DEVICE_PLATFORM: 0xb49efc50
CL_DEVICE_VERSION: OpenCL 3.0 
CL_DEVICE_PROFILE: FULL_PROFILE
CL_DRIVER_VERSION: OpenCL 3.0 V6.4.3.p2.336687
...
CL_DEVICE_NAME: Vivante OpenCL Device VIP8000Nano-S+I.8002.0000
CL_DEVICE_VENDOR: Vivante Corporation
CL_DEVICE_TYPE: GPU 
CL_DEVICE_OPENCL_C_VERSION: OpenCL C 1.2 
CL_DEVICE_VENDOR_ID: 0x00564956
CL_DEVICE_PLATFORM: 0xb49efc50
CL_DEVICE_VERSION: OpenCL 3.0 
CL_DEVICE_PROFILE: FULL_PROFILE
CL_DRIVER_VERSION: OpenCL 3.0 V6.4.3.p2.336687

The following is a set of kernels making 6 test cases - only tests1-3 fail. This tells us that the problem is when the vector being compared is computed (as opposed to constants, which might optimize away, as shown in test5), that the problem is specific to char/short (as int4 works fine, in test4), and is common to vec2/vec4 lengths (and perhaps others).

However, explicitly casting a calculated char/short vector to the same length of int vector makes the comparison work correctly (test6) giving a workaround to the issue.

I am unable to determine if this is a driver or silicon issue. If a hardware issue, it might be possible to patch the driver to compile code to transparently perform explicit conversion of charn/shortn to intn, perform the comparison, and explicitly convert back to charn/shortn.

OpenCL kernels used for testing - in all cases, the output is 12 shorts, the first 4 being elements that are tested for >=0, the second 4 being the results of a vector comparison (which should be -1, -1, 0, 0 because vector comparisons result in all bits set for true), and the third 4 being the results of scalar comparisons (which should be 1, 1, 0, 0 because scalar comparisons result in 1 for true).

__kernel void test1(__global short *output_buff) {
int x = get_global_id(0);
if (x < 1) {
char ival = (char)x*4;
char2 input = (char2)(ival,0-(ival+2));
char2 veccomp = input>=(char)0;
output_buff[0] = input.s0;
output_buff[1] = input.s1;
output_buff[2] = input.s0;
output_buff[3] = input.s1;
output_buff[4] = veccomp.s0;
output_buff[5] = veccomp.s1;
output_buff[6] = veccomp.s0;
output_buff[7] = veccomp.s1;
output_buff[8] = input.s0>=0;
output_buff[9] = input.s1>=0;
output_buff[10] = input.s0>=0;
output_buff[11] = input.s1>=0;
}
}

__kernel void test2(__global short *output_buff) {
int x = get_global_id(0);
if (x < 1) {
char ival = (char)x*4;
char4 input = (char4)(ival,ival+1,0-(ival+2),0-(ival+3));
char4 veccomp = input>=(char)0;
output_buff[0] = input.s0;
output_buff[1] = input.s1;
output_buff[2] = input.s2;
output_buff[3] = input.s3;
output_buff[4] = veccomp.s0;
output_buff[5] = veccomp.s1;
output_buff[6] = veccomp.s2;
output_buff[7] = veccomp.s3;
output_buff[8] = input.s0>=0;
output_buff[9] = input.s1>=0;
output_buff[10] = input.s2>=0;
output_buff[11] = input.s3>=0;
}
}

__kernel void test3(__global short *output_buff) {
int x = get_global_id(0);
if (x < 1) {
short ival = (short)x*4;
short4 input = (short4)(ival,ival+1,0-(ival+2),0-(ival+3));
short4 veccomp = input>=(short)0;
output_buff[0] = input.s0;
output_buff[1] = input.s1;
output_buff[2] = input.s2;
output_buff[3] = input.s3;
output_buff[4] = veccomp.s0;
output_buff[5] = veccomp.s1;
output_buff[6] = veccomp.s2;
output_buff[7] = veccomp.s3;
output_buff[8] = input.s0>=0;
output_buff[9] = input.s1>=0;
output_buff[10] = input.s2>=0;
output_buff[11] = input.s3>=0;
}
}

__kernel void test4(__global short *output_buff) {
int x = get_global_id(0);
if (x < 1) {
int ival = (int)x*4;
int4 input = (int4)(ival,ival+1,0-(ival+2),0-(ival+3));
int4 veccomp = input>=0;
output_buff[0] = input.s0;
output_buff[1] = input.s1;
output_buff[2] = input.s2;
output_buff[3] = input.s3;
output_buff[4] = veccomp.s0;
output_buff[5] = veccomp.s1;
output_buff[6] = veccomp.s2;
output_buff[7] = veccomp.s3;
output_buff[8] = input.s0>=0;
output_buff[9] = input.s1>=0;
output_buff[10] = input.s2>=0;
output_buff[11] = input.s3>=0;
}
}

__kernel void test5(__global short *output_buff) {
int x = get_global_id(0);
if (x < 1) {
short4 input = (short4)(0,1,-2,-3);
short4 veccomp = input>=(short)0;
output_buff[0] = input.s0;
output_buff[1] = input.s1;
output_buff[2] = input.s2;
output_buff[3] = input.s3;
output_buff[4] = veccomp.s0;
output_buff[5] = veccomp.s1;
output_buff[6] = veccomp.s2;
output_buff[7] = veccomp.s3;
output_buff[8] = input.s0>=0;
output_buff[9] = input.s1>=0;
output_buff[10] = input.s2>=0;
output_buff[11] = input.s3>=0;
}
}

__kernel void test6(__global short *output_buff) {
int x = get_global_id(0);
if (x < 1) {
short ival = (short)x*4;
short4 input = (short4)(ival,ival+1,0-(ival+2),0-(ival+3));
int4 veccomp = convert_int4(input)>=0;
output_buff[0] = input.s0;
output_buff[1] = input.s1;
output_buff[2] = input.s2;
output_buff[3] = input.s3;
output_buff[4] = veccomp.s0;
output_buff[5] = veccomp.s1;
output_buff[6] = veccomp.s2;
output_buff[7] = veccomp.s3;
output_buff[8] = input.s0>=0;
output_buff[9] = input.s1>=0;
output_buff[10] = input.s2>=0;
output_buff[11] = input.s3>=0;
}
}

shusheer · ‎12-26-2022

Update: It appears very much as though comparisons on signed vectors of char or short are being performed as though they were unsigned vectors of the same type, containing the same bits (i.e. the sign bit is considered as part of the magnitude of the number).

The following kernel outputs a four planes of 1024*1024 short results. Each plane uses a different mechanism to test the same thing, which is (get_global_id(0)-512)>(get_global_id(1)-512). The correct result would be images that have the lower triangle set to zero, and the upper triangle set to -1 or +1 for vector or scalar comparisons respectively.

The result using short4 vector comparisons is an image of 4 quadrants (corresponding to the sign of each side of the comparison). When both sides share the same sign (top-left, bottom-right), the results are correct. When the signs differ, the negative value always "wins", which is exactly what you would get if you treated the signed values as unsigned in the comparison. For good measure, the fourth result plane is exactly this, and gives exactly the same result.

This narrows down the possible causes of the failure, and also allows for a more nuanced approach to mitigation : programmers can decide not to promote char/short to int if the values are guaranteed to be of the same sign.

__kernel void checkvec_image(__global short4 *output_buff) {
int x = get_global_id(0);
int y = get_global_id(1);
if ((x < 1024/4) && (y < 1024)) {
short xs = ((short)x*4)-512;
short4 xs4 = (short4)(xs,xs+1,xs+2,xs+3);
short ys = (short)(y-512);
short4 s4veccomp = xs4>=ys;
int4 i4veccomp = convert_int4(xs4)>=(int)ys;
short4 scalecmp;
scalecmp.s0 = xs4.s0>ys;
scalecmp.s1 = xs4.s1>ys;
scalecmp.s2 = xs4.s2>ys;
scalecmp.s3 = xs4.s3>ys;
ushort4 xus4 = convert_ushort4(xs4);
ushort yus = (ushort)ys;
short4 us4veccomp = xus4>=yus;
output_buff[0*1024*256 + y*256 + x] = s4veccomp;
output_buff[1*1024*256 + y*256 + x] = convert_short4(i4veccomp);
output_buff[2*1024*256 + y*256 + x] = scalecmp;
output_buff[3*1024*256 + y*256 + x] = us4veccomp;
}
}

Zhiming_Liu · ‎12-28-2022

Hi @shusheer

Can you reproduce this issue on L5.15.71?

shusheer · ‎12-29-2022

I don't have physical access to the device over the holiday period, so it will be at least a week or possibly two before I can update the OS. If you happen to have a device to hand, the C code below provides the signed-vs-unsigned tests as images, using the cf4ocl framework, which is portable and very lightweight https://github.com/nunofachada/cf4ocl

Just compile and run checkvec_image.c in the same folder as checkvec_image.cl - it will prompt for which device on the i.MX8MP you wish to use (GC7000UL or VIP8000Nano), and write out 4 .png images demonstrating the 4 test-cases in the .cl file. checkvec_image-0.png should contain only a single upper-diagonal triangle (like images 1 and 2), but instead looks like checkvec_image-3.png which is using the unsigned comparison.

checkvec_image.c:

#define KERNEL_FILENAME "./checkvec_image.cl"
#define KERNEL_NAME "checkvec_image"
#define IMAGE_FILENAME_BASE "./checkvec_image"

/**
* @file
* Testing possible i.MX8MPlus GPU bug
*
* @note
* Developed using cf4ocl (C Framework for OpenCL)
* Based on the cf4ocl image_filter example, this generates an output image 
* Requires OpenCL >= 1.1.
*
* @author Shamus Husheer
*/

/*
* Description
* -----------
*
* Calls the checkvec_image kernel, saves each cl_short as 0x00 if ==0 else 0xFF to one of 4 8-bit 1024x1024 PNG files
*
* */

#define _POSIX_C_SOURCE 200112L

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <cf4ocl2.h>

#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"

/* Error handling macros. */
#define ERROR_MSG_AND_EXIT(msg) \
do { fprintf(stderr, "\n%s\n", msg); exit(EXIT_FAILURE); } while(0)

#define HANDLE_ERROR(err) \
if (err != NULL) { ERROR_MSG_AND_EXIT(err->message); }

#define HANDLE_NONZERO_ERROR(err) \
if (err != 0) { fprintf(stderr, "\nError code: %d\n", err); exit(EXIT_FAILURE); }

#define HANDLE_NULL_ERROR(ptr) \
if (ptr == NULL) { fprintf(stderr, "\nNULL pointer returned\n"); exit(EXIT_FAILURE); }

/**
* Example main function.
* */
int main(int argc, char* argv[]) {

/* Wrappers for OpenCL objects. */
CCLContext* ctx;
CCLDevice* dev;
CCLQueue* queue;
CCLProgram* prg;
CCLKernel* krnl;

/* Device selected specified in the command line. */
int dev_idx = -1;

/* Error handling object (must be initialized to NULL). */
CCLErr* err = NULL;

/* Profiling object. */
CCLProf* prof;

/* Image data in host. */
cl_short * output_image = NULL;

/* Image data on device. */
CCLBuffer * output_image_device = NULL;

/* Image properties. */
int width=1024, height=1024, n_channels=4;

/* Real worksize. */
size_t real_ws[2];

/* Global and local worksizes. */
size_t gws[2];
size_t lws[2];

/* Check arguments. */
if (argc < 2) {
ERROR_MSG_AND_EXIT("Usage: checkvec_image [device_index]");
} else if (argc >= 3) {
/* Check if a device was specified in the command line. */
dev_idx = atoi(argv[2]);
}

/* Allocate space for output image on host, aligned to 64byte boundary to enable zero-copy as per I.MX_GRAPHICS_USERS_GUIDE section 5.5. */
HANDLE_NONZERO_ERROR(posix_memalign((void*)&output_image, 64, n_channels*height*width*2));

/* Real work size (kernel operates on short4 outputs). */
real_ws[0] = width/4; real_ws[1] = height;

/* Create context using device selected from menu. */
ctx = ccl_context_new_from_menu_full(&dev_idx, &err);
HANDLE_ERROR(err);

/* Get first device in context. */
dev = ccl_context_get_device(ctx, 0, &err);
HANDLE_ERROR(err);

/* Create a command queue. */
queue = ccl_queue_new(ctx, dev, CL_QUEUE_PROFILING_ENABLE, &err);
HANDLE_ERROR(err);

/* Instantiate and initialize device buffers. */
output_image_device = ccl_buffer_new(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, n_channels*height*width*2, output_image, &err);
HANDLE_ERROR(err);

/* Create program from kernel source file and compile it. */
prg = ccl_program_new_from_source_file(ctx, KERNEL_FILENAME, &err);
HANDLE_ERROR(err);

ccl_program_build(prg, NULL, &err);
if (err != NULL)
{
CCLErr* err2 = NULL;
fprintf(stderr, "\n%s\nBuild Error: Log Start\n", err->message);
char* log = ccl_program_get_build_info_array(prg, dev, CL_PROGRAM_BUILD_LOG, char*, &err2);
fprintf(stderr, "\n%s\nBuild Log End\n", log);
HANDLE_ERROR(err2);
}
HANDLE_ERROR(err);

/* Get kernel wrapper. */
krnl = ccl_program_get_kernel(prg, KERNEL_NAME, &err);
HANDLE_ERROR(err);

/* Determine nice local and global worksizes. */
ccl_kernel_suggest_worksizes(krnl, dev, 2, real_ws, gws, lws, &err);
HANDLE_ERROR(err);

/* Show information to user. */
printf("\n * Image size: %d x %d, %d channels\n",
width, height, n_channels);
printf(" * Global work-size: (%d, %d)\n", (int) gws[0], (int) gws[1]);
printf(" * Local work-size: (%d, %d)\n", (int) lws[0], (int) lws[1]);

/* Start profiling. */
prof = ccl_prof_new();
ccl_prof_start(prof);

/* Apply kernel. */
ccl_kernel_set_args_and_enqueue_ndrange(
krnl, queue, 2, NULL, gws, lws, NULL, &err,
output_image_device, NULL);
HANDLE_ERROR(err);

/* Prepare to read image data back to host. This is zero-copy on Unified Memory model such as I.MX8. */
ccl_buffer_enqueue_map(output_image_device, queue, CL_TRUE, CL_MAP_WRITE,
0, n_channels*height*width*2, NULL, NULL, &err);
HANDLE_ERROR(err);

/* Stop profiling timer and add queues for analysis. */
ccl_prof_stop(prof);
ccl_prof_add_queue(prof, "Queue1", queue);

/* Write 8-bit images to files. */
cl_uchar * file_image = NULL;
file_image = (cl_uchar *) malloc(height*width);
HANDLE_NULL_ERROR(file_image);

for(int channel=0; channel<n_channels; channel++)
{
for(int idx =0; idx<height*width; idx++) {
cl_short val = output_image[idx+channel*width*height];
// Create black-and-white PNG image, white if val!=0 (because scalar TRUE = 1, vector TRUE = -1)
file_image[idx]=(val==0?0:0xFF);
}

char filename[sizeof(IMAGE_FILENAME_BASE)+8];
sprintf(filename, "%s-%d.png", IMAGE_FILENAME_BASE, channel);
int file_write_status = 0;
file_write_status = stbi_write_png(filename, width, height, 1, file_image, width);
if (file_write_status) {
fprintf(stdout, "\nImage saved in file %s\n",filename);
} else {
ERROR_MSG_AND_EXIT("Unable to save image.");
}
}

free(file_image);

/* Process profiling info. */
ccl_prof_calc(prof, &err);
HANDLE_ERROR(err);

/* Print profiling info. */
ccl_prof_print_summary(prof);

/* Release host images. */
free(output_image);

/* Release wrappers. */
ccl_buffer_destroy(output_image_device);
ccl_program_destroy(prg);
ccl_queue_destroy(queue);
ccl_context_destroy(ctx);

/* Destroy profiler. */
ccl_prof_destroy(prof);

/* Check all wrappers have been destroyed. */
assert(ccl_wrapper_memcheck());

/* Terminate. */
return EXIT_SUCCESS;

}

checkvec_image.cl

__kernel void checkvec_image(__global short4 *output_buff) {
int x = get_global_id(0);
int y = get_global_id(1);
if ((x < 1024/4) && (y < 1024)) {
short xs = ((short)x*4)-512;
short4 xs4 = (short4)(xs,xs+1,xs+2,xs+3);
short ys = (short)(y-512);
short4 s4veccomp = xs4>=ys;
int4 i4veccomp = convert_int4(xs4)>=(int)ys;
short4 scalecmp;
scalecmp.s0 = xs4.s0>ys;
scalecmp.s1 = xs4.s1>ys;
scalecmp.s2 = xs4.s2>ys;
scalecmp.s3 = xs4.s3>ys;
ushort4 xus4 = convert_ushort4(xs4);
ushort yus = (ushort)ys;
short4 us4veccomp = xus4>=yus;
output_buff[0*1024*256 + y*256 + x] = s4veccomp;
output_buff[1*1024*256 + y*256 + x] = convert_short4(i4veccomp);
output_buff[2*1024*256 + y*256 + x] = scalecmp;
output_buff[3*1024*256 + y*256 + x] = us4veccomp;
}
}