I have just tracked down a bug in a opencl kernel i have written. The code had been working fine until one of the users got a graphics driver update (versione 20.19.15.4835).
The code had worked for about 1 year on a wide assortment of CPU's and integrated and dedicated GPU's, both when compiled with x64 and x86. The old code still works on the CPU when compiled with either x64 or x86, and on the integrated gpu when compiled with x86. But when run on integrated graphics cards, with the newest driver, in x64 mode, it failes.
i have been able to track it down to this line of code:
'global float* inputCPtr = inputC + turbines * windDirIndex;
out[gid] = inputCPtr[inputA[gid]];'
Seemingly randomly, this line would return 0 instead of the content in xCoords. Changing the code to the following fixes the bug.
global float* inputCPtr = inputC + turbines * windDirIndex;
out[gid] = inputC[turbines * windDirIndex + gid];'
The vector "test" should contain all 1's, but on the HD 5500 with the newest driver, compiled in visual studio on 64 bit on windows 8, it contains a mix of 1's and 0's, seemingly in blocks of a multiple of 8.
#define __CL_ENABLE_EXCEPTIONS
#include "cl/cl.hpp"
#include <string>
#include <fstream>
#include <streambuf>
#include <algorithm>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <vector>
#include <string>
typedef struct Device
{
cl::Device DevicePtr; // the device handle
cl::Program Program; // The compiled program for this device. Should only be compiled for Device
cl::Context Context; // A context for this device. Must be the same context used for Program
Device(cl::Device device, std::string source)
{
DevicePtr = device;
Context = cl::Context(device);
Program = cl::Program(Context, source);
Program.build();
}
} Device;
std::vector<cl::Device> getAllDevices()
{
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::vector<cl::Device> devices;
for (auto platform : platforms)
{
std::vector<cl::Device> platformDevices;
platform.getDevices(CL_DEVICE_TYPE_ALL, &platformDevices);
devices.insert(devices.end(), platformDevices.begin(), platformDevices.end());
}
return devices;
}
int main()
{
std::string kernelsString =
"kernel void example(global int* inputA, global ushort* inputB, global float* inputC, ushort turbines, uint threadCount, global float* out)\n"
"{\n"
"uint gid = get_global_id(0);\n"
"if (gid >= threadCount) return;\n"
"ushort windDirIndex = inputB[gid];\n"
"global float* inputCPtr = inputC + turbines * windDirIndex;\n"
"out[gid] = inputCPtr[inputA[gid]];\n"
"}";
for (auto ptr : getAllDevices()) {
auto dev = Device(ptr, kernelsString);
cl::Context context = dev.Context;
auto queue = cl::CommandQueue(context);
auto inputA = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(cl_int) * 100);
queue.enqueueFillBuffer(inputA, cl_int(0), 0, sizeof(cl_int) * 100);
cl::Buffer inputB = cl::Buffer(context, NULL, sizeof(cl_ushort) * 100);
queue.enqueueFillBuffer(inputB, cl_ushort(0), 0, sizeof(cl_ushort) * 100);
cl::Buffer inputC = cl::Buffer(context, NULL, sizeof(cl_float) * 100);
queue.enqueueFillBuffer(inputC, cl_float(1), 0, sizeof(cl_float) * 100);
auto outputA = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float)*100);
auto calculateFactorKernel = cl::Kernel(dev.Program, "example");
calculateFactorKernel.setArg(0, inputA);
calculateFactorKernel.setArg(1, inputB);
calculateFactorKernel.setArg(2, inputC);
calculateFactorKernel.setArg(3, cl_ushort(0));
calculateFactorKernel.setArg(4, 100);
calculateFactorKernel.setArg(5, outputA);
queue.enqueueNDRangeKernel(calculateFactorKernel, cl::NullRange, cl::NDRange(100));
float* testptr = static_cast<cl_float*>(queue.enqueueMapBuffer(outputA, true, CL_MAP_READ, 0, sizeof(cl_float) * 100));
std::vector<cl_float> test(testptr, testptr + 100);
}
}