Overlapping Data Transfers with Kernel Computation
Applications such as database analytics have much larger data sets than the available memory on the acceleration device. They require the complete data to be transferred to, and processed by, the device in blocks. Techniques that overlap the data transfers with the computation are critical to achieving high performance for these applications.
Below is the vector add kernel from the OpenCL Overlap Data Transfers with Kernel Computation example, found in the host category of the Xilinx On-boarding Examples GitHub repository.
// Vector addition over one chunk: c[i] = a[i] + b[i] for 'elements'
// entries beginning at index 'offset'. The reqd_work_group_size(1,1,1)
// attribute marks this as a single work-item kernel, so the labeled loop
// walks the whole chunk sequentially.
kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void vadd(global int* c,
global const int* a,
global const int* b,
const int offset,
const int elements)
{
// 0-based counter; the absolute index is rebuilt from 'offset' each pass.
vadd_loop: for (int i = 0; i < elements; ++i) {
const int idx = offset + i;
c[idx] = a[idx] + b[idx];
}
}
Figure: Event Triggering Set Up
The host code snippet below enqueues the four tasks in a loop. It also sets up event synchronization between the different tasks to ensure that the data dependencies of each task are met. The double buffering is set up by passing different memory objects to the clEnqueueMigrateMemObjects API on alternate iterations. The event synchronization is achieved by having each API call wait on the events of its prerequisite operations, as well as trigger its own event when the call completes.
// Double-buffered streaming loop: the parity bit "flag" selects one of two
// buffer/event sets, so the host<->device transfers for one chunk overlap the
// kernel computation on the other chunk.
for (size_t iteration_idx = 0; iteration_idx < num_iterations; iteration_idx++) {
    int flag = iteration_idx % 2;

    if (iteration_idx >= 2) {
        // The resources with this parity were enqueued two iterations ago.
        // Wait on the tail of that dependency chain (the map) before recycling.
        OCL_CHECK(clWaitForEvents(1, &map_events[flag]));
        OCL_CHECK(clReleaseMemObject(buffer_a[flag]));
        OCL_CHECK(clReleaseMemObject(buffer_b[flag]));
        OCL_CHECK(clReleaseMemObject(buffer_c[flag]));
        OCL_CHECK(clReleaseEvent(read_events[flag]));
        OCL_CHECK(clReleaseEvent(kernel_events[flag]));
        // Fix: map_events[flag] is overwritten by clEnqueueMapBuffer below;
        // without this release the event object leaks on every iteration.
        OCL_CHECK(clReleaseEvent(map_events[flag]));
    }

    // One chunk per buffer; CL_MEM_USE_HOST_PTR points the runtime at the
    // host-side slice for this iteration, so migration moves exactly this chunk.
    buffer_a[flag] = clCreateBuffer(world.context,
                                    CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                    bytes_per_iteration,
                                    &A[iteration_idx * elements_per_iteration], NULL);
    buffer_b[flag] = clCreateBuffer(world.context,
                                    CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                    bytes_per_iteration,
                                    &B[iteration_idx * elements_per_iteration], NULL);
    buffer_c[flag] = clCreateBuffer(world.context,
                                    CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                                    bytes_per_iteration,
                                    &device_result[iteration_idx * elements_per_iteration], NULL);

    array<cl_event, 2> write_events;
    printf("Enqueueing Migrate Mem Object (Host to Device) calls\n");
    // These migrations are asynchronous with respect to the main thread; each
    // records an event that the kernel launch below puts in its wait list.
    OCL_CHECK(clEnqueueMigrateMemObjects(
        world.command_queue, 1, &buffer_a[flag],
        0 /* flags, 0 means from host */,
        0, NULL,
        &write_events[0]));
    set_callback(write_events[0], "ooo_queue");
    OCL_CHECK(clEnqueueMigrateMemObjects(
        world.command_queue, 1, &buffer_b[flag],
        0 /* flags, 0 means from host */,
        0, NULL,
        &write_events[1]));
    set_callback(write_events[1], "ooo_queue");

    // The vadd kernel signature is (c, a, b, offset, elements). Each buffer
    // holds exactly one chunk, so the in-buffer offset is always 0.
    // Fix: the original set only four arguments, which passed
    // elements_per_iteration into the 'offset' slot and left 'elements' unset.
    const int zero_offset = 0;
    xcl_set_kernel_arg(kernel, 0, sizeof(cl_mem), &buffer_c[flag]);
    xcl_set_kernel_arg(kernel, 1, sizeof(cl_mem), &buffer_a[flag]);
    xcl_set_kernel_arg(kernel, 2, sizeof(cl_mem), &buffer_b[flag]);
    xcl_set_kernel_arg(kernel, 3, sizeof(int), &zero_offset);
    xcl_set_kernel_arg(kernel, 4, sizeof(int), &elements_per_iteration);

    printf("Enqueueing NDRange kernel.\n");
    // The kernel must not start until both input migrations complete, so both
    // write events go into its wait list; it triggers kernel_events[flag].
    OCL_CHECK(clEnqueueNDRangeKernel(world.command_queue, kernel, 1, nullptr,
                                     &global, &local, 2, write_events.data(),
                                     &kernel_events[flag]));
    set_callback(kernel_events[flag], "ooo_queue");

    printf("Enqueueing Migrate Mem Object (Device to Host) calls\n");
    // The readback depends only on this iteration's kernel, so it can overlap
    // the next iteration's transfers and kernel execution.
    OCL_CHECK(clEnqueueMigrateMemObjects(world.command_queue, 1, &buffer_c[flag],
                                         CL_MIGRATE_MEM_OBJECT_HOST, 1,
                                         &kernel_events[flag], &read_events[flag]));
    set_callback(read_events[flag], "ooo_queue");

    // Non-blocking map of the result buffer; map_events[flag] marks the point
    // at which the host may safely read this chunk (waited on two iterations
    // later, in the cleanup branch above).
    clEnqueueMapBuffer(world.command_queue, buffer_c[flag], CL_FALSE, CL_MAP_READ, 0,
                       bytes_per_iteration, 1, &read_events[flag], &map_events[flag], NULL);
    set_callback(map_events[flag], "ooo_queue");

    OCL_CHECK(clReleaseEvent(write_events[0]));
    OCL_CHECK(clReleaseEvent(write_events[1]));
}
Figure: Data Transfer Time Hidden in Application Timeline View