1538 lines
59 KiB
C++
1538 lines
59 KiB
C++
/*
|
|
Copyright (c) 2005-2017 Intel Corporation
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
|
|
|
|
|
|
|
|
*/
|
|
|
|
#ifndef __TBB_flow_graph_opencl_node_H
|
|
#define __TBB_flow_graph_opencl_node_H
|
|
|
|
#include "tbb/tbb_config.h"
|
|
#if __TBB_PREVIEW_OPENCL_NODE
|
|
|
|
#include "flow_graph.h"
|
|
|
|
#include <vector>
|
|
#include <string>
|
|
#include <algorithm>
|
|
#include <iostream>
|
|
#include <fstream>
|
|
#include <map>
|
|
#include <mutex>
|
|
|
|
#ifdef __APPLE__
|
|
#include <OpenCL/opencl.h>
|
|
#else
|
|
#include <CL/cl.h>
|
|
#endif
|
|
|
|
namespace tbb {
|
|
namespace flow {
|
|
|
|
namespace interface9 {
|
|
|
|
class opencl_foundation;
|
|
class opencl_device_list;
|
|
|
|
template <typename Factory>
|
|
class opencl_buffer_impl;
|
|
|
|
template <typename Factory>
|
|
class opencl_program;
|
|
|
|
class default_opencl_factory;
|
|
|
|
class opencl_graph : public graph {
|
|
public:
|
|
//! Constructs a graph with isolated task_group_context
|
|
opencl_graph() : my_opencl_foundation( NULL ) {}
|
|
//! Constructs a graph with an user context
|
|
explicit opencl_graph( task_group_context& context ) : graph( context ), my_opencl_foundation( NULL ) {}
|
|
//! Destroys a graph
|
|
~opencl_graph();
|
|
//! Available devices
|
|
const opencl_device_list& available_devices();
|
|
default_opencl_factory& opencl_factory();
|
|
protected:
|
|
opencl_foundation *my_opencl_foundation;
|
|
opencl_foundation &get_opencl_foundation();
|
|
|
|
template <typename T, typename Factory>
|
|
friend class opencl_buffer;
|
|
template <cl_channel_order channel_order, cl_channel_type channel_type, typename Factory>
|
|
friend class opencl_image2d;
|
|
template<typename... Args>
|
|
friend class opencl_node;
|
|
template <typename DeviceFilter>
|
|
friend class opencl_factory;
|
|
};
|
|
|
|
template <typename DeviceFilter>
|
|
class opencl_factory;
|
|
|
|
template <typename T, typename Factory>
|
|
class dependency_msg;
|
|
|
|
|
|
inline void enforce_cl_retcode( cl_int err, std::string msg ) {
|
|
if ( err != CL_SUCCESS ) {
|
|
std::cerr << msg << "; error code: " << err << std::endl;
|
|
throw msg;
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
T event_info( cl_event e, cl_event_info i ) {
|
|
T res;
|
|
enforce_cl_retcode( clGetEventInfo( e, i, sizeof( res ), &res, NULL ), "Failed to get OpenCL event information" );
|
|
return res;
|
|
}
|
|
|
|
template <typename T>
|
|
T device_info( cl_device_id d, cl_device_info i ) {
|
|
T res;
|
|
enforce_cl_retcode( clGetDeviceInfo( d, i, sizeof( res ), &res, NULL ), "Failed to get OpenCL device information" );
|
|
return res;
|
|
}
|
|
template <>
|
|
std::string device_info<std::string>( cl_device_id d, cl_device_info i ) {
|
|
size_t required;
|
|
enforce_cl_retcode( clGetDeviceInfo( d, i, 0, NULL, &required ), "Failed to get OpenCL device information" );
|
|
|
|
char *buff = (char*)alloca( required );
|
|
enforce_cl_retcode( clGetDeviceInfo( d, i, required, buff, NULL ), "Failed to get OpenCL device information" );
|
|
|
|
return buff;
|
|
}
|
|
template <typename T>
|
|
T platform_info( cl_platform_id p, cl_platform_info i ) {
|
|
T res;
|
|
enforce_cl_retcode( clGetPlatformInfo( p, i, sizeof( res ), &res, NULL ), "Failed to get OpenCL platform information" );
|
|
return res;
|
|
}
|
|
template <>
|
|
std::string platform_info<std::string>( cl_platform_id p, cl_platform_info i ) {
|
|
size_t required;
|
|
enforce_cl_retcode( clGetPlatformInfo( p, i, 0, NULL, &required ), "Failed to get OpenCL platform information" );
|
|
|
|
char *buff = (char*)alloca( required );
|
|
enforce_cl_retcode( clGetPlatformInfo( p, i, required, buff, NULL ), "Failed to get OpenCL platform information" );
|
|
|
|
return buff;
|
|
}
|
|
|
|
|
|
class opencl_device {
|
|
public:
|
|
typedef size_t device_id_type;
|
|
enum : device_id_type {
|
|
unknown = device_id_type( -2 ),
|
|
host = device_id_type( -1 )
|
|
};
|
|
|
|
opencl_device() : my_device_id( unknown ) {}
|
|
|
|
opencl_device( cl_device_id cl_d_id, device_id_type device_id ) : my_device_id( device_id ), my_cl_device_id( cl_d_id ) {}
|
|
|
|
std::string platform_profile() const {
|
|
return platform_info<std::string>( platform(), CL_PLATFORM_PROFILE );
|
|
}
|
|
std::string platform_version() const {
|
|
return platform_info<std::string>( platform(), CL_PLATFORM_VERSION );
|
|
}
|
|
std::string platform_name() const {
|
|
return platform_info<std::string>( platform(), CL_PLATFORM_NAME );
|
|
}
|
|
std::string platform_vendor() const {
|
|
return platform_info<std::string>( platform(), CL_PLATFORM_VENDOR );
|
|
}
|
|
std::string platform_extensions() const {
|
|
return platform_info<std::string>( platform(), CL_PLATFORM_EXTENSIONS );
|
|
}
|
|
|
|
template <typename T>
|
|
void info( cl_device_info i, T &t ) const {
|
|
t = device_info<T>( my_cl_device_id, i );
|
|
}
|
|
std::string version() const {
|
|
// The version string format: OpenCL<space><major_version.minor_version><space><vendor-specific information>
|
|
return device_info<std::string>( my_cl_device_id, CL_DEVICE_VERSION );
|
|
}
|
|
int major_version() const {
|
|
int major;
|
|
std::sscanf( version().c_str(), "OpenCL %d", &major );
|
|
return major;
|
|
}
|
|
int minor_version() const {
|
|
int major, minor;
|
|
std::sscanf( version().c_str(), "OpenCL %d.%d", &major, &minor );
|
|
return minor;
|
|
}
|
|
bool out_of_order_exec_mode_on_host_present() const {
|
|
#if CL_VERSION_2_0
|
|
if ( major_version() >= 2 )
|
|
return (device_info<cl_command_queue_properties>( my_cl_device_id, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES ) & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
|
|
else
|
|
#endif /* CL_VERSION_2_0 */
|
|
return (device_info<cl_command_queue_properties>( my_cl_device_id, CL_DEVICE_QUEUE_PROPERTIES ) & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
|
|
}
|
|
bool out_of_order_exec_mode_on_device_present() const {
|
|
#if CL_VERSION_2_0
|
|
if ( major_version() >= 2 )
|
|
return (device_info<cl_command_queue_properties>( my_cl_device_id, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES ) & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
|
|
else
|
|
#endif /* CL_VERSION_2_0 */
|
|
return false;
|
|
}
|
|
std::array<size_t, 3> max_work_item_sizes() const {
|
|
return device_info<std::array<size_t, 3>>( my_cl_device_id, CL_DEVICE_MAX_WORK_ITEM_SIZES );
|
|
}
|
|
size_t max_work_group_size() const {
|
|
return device_info<size_t>( my_cl_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE );
|
|
}
|
|
bool built_in_kernel_available( const std::string& k ) const {
|
|
const std::string semi = ";";
|
|
// Added semicolumns to force an exact match (to avoid a partial match, e.g. "add" is partly matched with "madd").
|
|
return (semi + built_in_kernels() + semi).find( semi + k + semi ) != std::string::npos;
|
|
}
|
|
std::string built_in_kernels() const {
|
|
return device_info<std::string>( my_cl_device_id, CL_DEVICE_BUILT_IN_KERNELS );
|
|
}
|
|
std::string name() const {
|
|
return device_info<std::string>( my_cl_device_id, CL_DEVICE_NAME );
|
|
}
|
|
cl_bool available() const {
|
|
return device_info<cl_bool>( my_cl_device_id, CL_DEVICE_AVAILABLE );
|
|
}
|
|
cl_bool compiler_available() const {
|
|
return device_info<cl_bool>( my_cl_device_id, CL_DEVICE_COMPILER_AVAILABLE );
|
|
}
|
|
cl_bool linker_available() const {
|
|
return device_info<cl_bool>( my_cl_device_id, CL_DEVICE_LINKER_AVAILABLE );
|
|
}
|
|
bool extension_available( const std::string &ext ) const {
|
|
const std::string space = " ";
|
|
// Added space to force an exact match (to avoid a partial match, e.g. "ext" is partly matched with "ext2").
|
|
return (space + extensions() + space).find( space + ext + space ) != std::string::npos;
|
|
}
|
|
std::string extensions() const {
|
|
return device_info<std::string>( my_cl_device_id, CL_DEVICE_EXTENSIONS );
|
|
}
|
|
|
|
cl_device_type type() const {
|
|
return device_info<cl_device_type>( my_cl_device_id, CL_DEVICE_TYPE );
|
|
}
|
|
|
|
std::string vendor() const {
|
|
return device_info<std::string>( my_cl_device_id, CL_DEVICE_VENDOR );
|
|
}
|
|
|
|
cl_uint address_bits() const {
|
|
return device_info<cl_uint>( my_cl_device_id, CL_DEVICE_ADDRESS_BITS );
|
|
}
|
|
|
|
cl_device_id device_id() const {
|
|
return my_cl_device_id;
|
|
}
|
|
|
|
cl_command_queue command_queue() const {
|
|
return my_cl_command_queue;
|
|
}
|
|
|
|
void set_command_queue( cl_command_queue cmd_queue ) {
|
|
my_cl_command_queue = cmd_queue;
|
|
}
|
|
|
|
private:
|
|
opencl_device( cl_device_id d_id ) : my_device_id( unknown ), my_cl_device_id( d_id ) {}
|
|
|
|
cl_platform_id platform() const {
|
|
return device_info<cl_platform_id>( my_cl_device_id, CL_DEVICE_PLATFORM );
|
|
}
|
|
|
|
device_id_type my_device_id;
|
|
cl_device_id my_cl_device_id;
|
|
cl_command_queue my_cl_command_queue;
|
|
|
|
friend bool operator==(opencl_device d1, opencl_device d2) { return d1.my_cl_device_id == d2.my_cl_device_id; }
|
|
|
|
template <typename DeviceFilter>
|
|
friend class opencl_factory;
|
|
template <typename Factory>
|
|
friend class opencl_memory;
|
|
template <typename Factory>
|
|
friend class opencl_program;
|
|
friend class opencl_foundation;
|
|
|
|
#if TBB_USE_ASSERT
|
|
template <typename T, typename Factory>
|
|
friend class opencl_buffer;
|
|
#endif
|
|
};
|
|
|
|
class opencl_device_list {
|
|
typedef std::vector<opencl_device> container_type;
|
|
public:
|
|
typedef container_type::iterator iterator;
|
|
typedef container_type::const_iterator const_iterator;
|
|
typedef container_type::size_type size_type;
|
|
|
|
opencl_device_list() {}
|
|
opencl_device_list( std::initializer_list<opencl_device> il ) : my_container( il ) {}
|
|
|
|
void add( opencl_device d ) { my_container.push_back( d ); }
|
|
size_type size() const { return my_container.size(); }
|
|
bool empty() const { return my_container.empty(); }
|
|
iterator begin() { return my_container.begin(); }
|
|
iterator end() { return my_container.end(); }
|
|
const_iterator begin() const { return my_container.begin(); }
|
|
const_iterator end() const { return my_container.end(); }
|
|
const_iterator cbegin() const { return my_container.cbegin(); }
|
|
const_iterator cend() const { return my_container.cend(); }
|
|
private:
|
|
container_type my_container;
|
|
};
|
|
|
|
class callback_base : tbb::internal::no_copy {
|
|
public:
|
|
virtual void call() = 0;
|
|
virtual ~callback_base() {}
|
|
};
|
|
|
|
template <typename Callback, typename T>
|
|
class callback : public callback_base {
|
|
graph &my_graph;
|
|
Callback my_callback;
|
|
T my_data;
|
|
public:
|
|
callback( graph &g, Callback c, const T& t ) : my_graph( g ), my_callback( c ), my_data( t ) {
|
|
// Extend the graph lifetime until the callback completion.
|
|
my_graph.increment_wait_count();
|
|
}
|
|
~callback() {
|
|
// Release the reference to the graph.
|
|
my_graph.decrement_wait_count();
|
|
}
|
|
void call() __TBB_override {
|
|
my_callback( my_data );
|
|
}
|
|
};
|
|
|
|
template <typename T, typename Factory = default_opencl_factory>
|
|
class dependency_msg : public async_msg<T> {
|
|
public:
|
|
typedef T value_type;
|
|
|
|
dependency_msg() : my_callback_flag_ptr( std::make_shared< tbb::atomic<bool>>() ) {
|
|
my_callback_flag_ptr->store<tbb::relaxed>(false);
|
|
}
|
|
|
|
explicit dependency_msg( const T& data ) : my_data(data), my_callback_flag_ptr( std::make_shared<tbb::atomic<bool>>() ) {
|
|
my_callback_flag_ptr->store<tbb::relaxed>(false);
|
|
}
|
|
|
|
dependency_msg( opencl_graph &g, const T& data ) : my_data(data), my_graph(&g), my_callback_flag_ptr( std::make_shared<tbb::atomic<bool>>() ) {
|
|
my_callback_flag_ptr->store<tbb::relaxed>(false);
|
|
}
|
|
|
|
dependency_msg( const T& data, cl_event event ) : my_data(data), my_event(event), my_is_event(true), my_callback_flag_ptr( std::make_shared<tbb::atomic<bool>>() ) {
|
|
my_callback_flag_ptr->store<tbb::relaxed>(false);
|
|
enforce_cl_retcode( clRetainEvent( my_event ), "Failed to retain an event" );
|
|
}
|
|
|
|
T& data( bool wait = true ) {
|
|
if ( my_is_event && wait ) {
|
|
enforce_cl_retcode( clWaitForEvents( 1, &my_event ), "Failed to wait for an event" );
|
|
enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
|
|
my_is_event = false;
|
|
}
|
|
return my_data;
|
|
}
|
|
|
|
const T& data( bool wait = true ) const {
|
|
if ( my_is_event && wait ) {
|
|
enforce_cl_retcode( clWaitForEvents( 1, &my_event ), "Failed to wait for an event" );
|
|
enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
|
|
my_is_event = false;
|
|
}
|
|
return my_data;
|
|
}
|
|
|
|
dependency_msg( const dependency_msg &dmsg ) : async_msg<T>(dmsg),
|
|
my_data(dmsg.my_data), my_event(dmsg.my_event), my_is_event( dmsg.my_is_event ), my_graph( dmsg.my_graph ),
|
|
my_callback_flag_ptr(dmsg.my_callback_flag_ptr)
|
|
{
|
|
if ( my_is_event )
|
|
enforce_cl_retcode( clRetainEvent( my_event ), "Failed to retain an event" );
|
|
}
|
|
|
|
dependency_msg( dependency_msg &&dmsg ) : async_msg<T>(std::move(dmsg)),
|
|
my_data(std::move(dmsg.my_data)), my_event(dmsg.my_event), my_is_event(dmsg.my_is_event), my_graph(dmsg.my_graph),
|
|
my_callback_flag_ptr( std::move(dmsg.my_callback_flag_ptr) )
|
|
{
|
|
dmsg.my_is_event = false;
|
|
}
|
|
|
|
dependency_msg& operator=(const dependency_msg &dmsg) {
|
|
async_msg<T>::operator =(dmsg);
|
|
|
|
// Release original event
|
|
if ( my_is_event )
|
|
enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to retain an event" );
|
|
|
|
my_data = dmsg.my_data;
|
|
my_event = dmsg.my_event;
|
|
my_is_event = dmsg.my_is_event;
|
|
my_graph = dmsg.my_graph;
|
|
|
|
// Retain copied event
|
|
if ( my_is_event )
|
|
enforce_cl_retcode( clRetainEvent( my_event ), "Failed to retain an event" );
|
|
|
|
my_callback_flag_ptr = dmsg.my_callback_flag_ptr;
|
|
return *this;
|
|
}
|
|
|
|
~dependency_msg() {
|
|
if ( my_is_event )
|
|
enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
|
|
}
|
|
|
|
cl_event const * get_event() const { return my_is_event ? &my_event : NULL; }
|
|
void set_event( cl_event e ) const {
|
|
if ( my_is_event ) {
|
|
cl_command_queue cq = event_info<cl_command_queue>( my_event, CL_EVENT_COMMAND_QUEUE );
|
|
if ( cq != event_info<cl_command_queue>( e, CL_EVENT_COMMAND_QUEUE ) )
|
|
enforce_cl_retcode( clFlush( cq ), "Failed to flush an OpenCL command queue" );
|
|
enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
|
|
}
|
|
my_is_event = true;
|
|
my_event = e;
|
|
clRetainEvent( my_event );
|
|
}
|
|
|
|
void set_graph( graph &g ) {
|
|
my_graph = &g;
|
|
}
|
|
|
|
void clear_event() const {
|
|
if ( my_is_event ) {
|
|
enforce_cl_retcode( clFlush( event_info<cl_command_queue>( my_event, CL_EVENT_COMMAND_QUEUE ) ), "Failed to flush an OpenCL command queue" );
|
|
enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
|
|
}
|
|
my_is_event = false;
|
|
}
|
|
|
|
template <typename Callback>
|
|
void register_callback( Callback c ) const {
|
|
__TBB_ASSERT( my_is_event, "The OpenCL event is not set" );
|
|
__TBB_ASSERT( my_graph, "The graph is not set" );
|
|
enforce_cl_retcode( clSetEventCallback( my_event, CL_COMPLETE, register_callback_func, new callback<Callback, T>( *my_graph, c, my_data ) ), "Failed to set an OpenCL callback" );
|
|
}
|
|
|
|
operator T&() { return data(); }
|
|
operator const T&() const { return data(); }
|
|
|
|
protected:
|
|
// Overridden in this derived class to inform that
|
|
// async calculation chain is over
|
|
void finalize() const __TBB_override {
|
|
receive_if_memory_object(*this);
|
|
if (! my_callback_flag_ptr->fetch_and_store(true)) {
|
|
dependency_msg a(*this);
|
|
if (my_is_event) {
|
|
register_callback([a](const T& t) mutable {
|
|
a.set(t);
|
|
});
|
|
}
|
|
else {
|
|
a.set(my_data);
|
|
}
|
|
}
|
|
clear_event();
|
|
}
|
|
|
|
private:
|
|
static void CL_CALLBACK register_callback_func( cl_event, cl_int event_command_exec_status, void *data ) {
|
|
tbb::internal::suppress_unused_warning( event_command_exec_status );
|
|
__TBB_ASSERT( event_command_exec_status == CL_COMPLETE, NULL );
|
|
__TBB_ASSERT( data, NULL );
|
|
callback_base *c = static_cast<callback_base*>(data);
|
|
c->call();
|
|
delete c;
|
|
}
|
|
|
|
T my_data;
|
|
mutable cl_event my_event;
|
|
mutable bool my_is_event = false;
|
|
graph *my_graph = NULL;
|
|
|
|
std::shared_ptr< tbb::atomic<bool> > my_callback_flag_ptr;
|
|
};
|
|
|
|
template <typename K, typename T, typename Factory>
|
|
K key_from_message( const dependency_msg<T, Factory> &dmsg ) {
|
|
using tbb::flow::key_from_message;
|
|
const T &t = dmsg.data( false );
|
|
__TBB_STATIC_ASSERT( true, "" );
|
|
return key_from_message<K, T>( t );
|
|
}
|
|
|
|
template <typename Factory>
|
|
class opencl_memory {
|
|
public:
|
|
opencl_memory() {}
|
|
opencl_memory( Factory &f ) : my_host_ptr( NULL ), my_factory( &f ), my_sending_event_present( false ) {
|
|
my_curr_device_id = my_factory->devices().begin()->my_device_id;
|
|
}
|
|
|
|
~opencl_memory() {
|
|
if ( my_sending_event_present ) enforce_cl_retcode( clReleaseEvent( my_sending_event ), "Failed to release an event for the OpenCL buffer" );
|
|
enforce_cl_retcode( clReleaseMemObject( my_cl_mem ), "Failed to release an memory object" );
|
|
}
|
|
|
|
cl_mem get_cl_mem() const {
|
|
return my_cl_mem;
|
|
}
|
|
|
|
void* get_host_ptr() {
|
|
if ( !my_host_ptr ) {
|
|
dependency_msg<void*, Factory> d = receive( NULL );
|
|
d.data();
|
|
__TBB_ASSERT( d.data() == my_host_ptr, NULL );
|
|
}
|
|
return my_host_ptr;
|
|
}
|
|
|
|
Factory *factory() const { return my_factory; }
|
|
|
|
dependency_msg<void*, Factory> send( opencl_device d, const cl_event *e );
|
|
dependency_msg<void*, Factory> receive( const cl_event *e );
|
|
virtual void map_memory( opencl_device, dependency_msg<void*, Factory> & ) = 0;
|
|
protected:
|
|
cl_mem my_cl_mem;
|
|
tbb::atomic<opencl_device::device_id_type> my_curr_device_id;
|
|
void* my_host_ptr;
|
|
Factory *my_factory;
|
|
|
|
tbb::spin_mutex my_sending_lock;
|
|
bool my_sending_event_present;
|
|
cl_event my_sending_event;
|
|
};
|
|
|
|
template <typename Factory>
|
|
class opencl_buffer_impl : public opencl_memory<Factory> {
|
|
size_t my_size;
|
|
public:
|
|
opencl_buffer_impl( size_t size, Factory& f ) : opencl_memory<Factory>( f ), my_size( size ) {
|
|
cl_int err;
|
|
this->my_cl_mem = clCreateBuffer( this->my_factory->context(), CL_MEM_ALLOC_HOST_PTR, size, NULL, &err );
|
|
enforce_cl_retcode( err, "Failed to create an OpenCL buffer" );
|
|
}
|
|
|
|
// The constructor for subbuffers.
|
|
opencl_buffer_impl( cl_mem m, size_t index, size_t size, Factory& f ) : opencl_memory<Factory>( f ), my_size( size ) {
|
|
cl_int err;
|
|
cl_buffer_region region = { index, size };
|
|
this->my_cl_mem = clCreateSubBuffer( m, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err );
|
|
enforce_cl_retcode( err, "Failed to create an OpenCL subbuffer" );
|
|
}
|
|
|
|
size_t size() const {
|
|
return my_size;
|
|
}
|
|
|
|
void map_memory( opencl_device device, dependency_msg<void*, Factory> &dmsg ) __TBB_override {
|
|
this->my_factory->enque_map_buffer( device, *this, dmsg );
|
|
}
|
|
|
|
#if TBB_USE_ASSERT
|
|
template <typename, typename>
|
|
friend class opencl_buffer;
|
|
#endif
|
|
};
|
|
|
|
enum access_type {
|
|
read_write,
|
|
write_only,
|
|
read_only
|
|
};
|
|
|
|
template <typename T, typename Factory = default_opencl_factory>
|
|
class opencl_subbuffer;
|
|
|
|
template <typename T, typename Factory = default_opencl_factory>
|
|
class opencl_buffer {
|
|
public:
|
|
typedef cl_mem native_object_type;
|
|
typedef opencl_buffer memory_object_type;
|
|
typedef Factory opencl_factory_type;
|
|
|
|
template<access_type a> using iterator = T*;
|
|
|
|
template <access_type a>
|
|
iterator<a> access() const {
|
|
T* ptr = (T*)my_impl->get_host_ptr();
|
|
__TBB_ASSERT( ptr, NULL );
|
|
return iterator<a>( ptr );
|
|
}
|
|
|
|
T* data() const { return &access<read_write>()[0]; }
|
|
|
|
template <access_type a = read_write>
|
|
iterator<a> begin() const { return access<a>(); }
|
|
|
|
template <access_type a = read_write>
|
|
iterator<a> end() const { return access<a>()+my_impl->size()/sizeof(T); }
|
|
|
|
size_t size() const { return my_impl->size()/sizeof(T); }
|
|
|
|
T& operator[] ( ptrdiff_t k ) { return begin()[k]; }
|
|
|
|
opencl_buffer() {}
|
|
opencl_buffer( opencl_graph &g, size_t size );
|
|
opencl_buffer( Factory &f, size_t size ) : my_impl( std::make_shared<impl_type>( size*sizeof(T), f ) ) {}
|
|
|
|
cl_mem native_object() const {
|
|
return my_impl->get_cl_mem();
|
|
}
|
|
|
|
const opencl_buffer& memory_object() const {
|
|
return *this;
|
|
}
|
|
|
|
void send( opencl_device device, dependency_msg<opencl_buffer, Factory> &dependency ) const {
|
|
__TBB_ASSERT( dependency.data( /*wait = */false ) == *this, NULL );
|
|
dependency_msg<void*, Factory> d = my_impl->send( device, dependency.get_event() );
|
|
const cl_event *e = d.get_event();
|
|
if ( e ) dependency.set_event( *e );
|
|
else dependency.clear_event();
|
|
}
|
|
void receive( const dependency_msg<opencl_buffer, Factory> &dependency ) const {
|
|
__TBB_ASSERT( dependency.data( /*wait = */false ) == *this, NULL );
|
|
dependency_msg<void*, Factory> d = my_impl->receive( dependency.get_event() );
|
|
const cl_event *e = d.get_event();
|
|
if ( e ) dependency.set_event( *e );
|
|
else dependency.clear_event();
|
|
}
|
|
|
|
opencl_subbuffer<T, Factory> subbuffer( size_t index, size_t size ) const;
|
|
private:
|
|
// The constructor for subbuffers.
|
|
opencl_buffer( Factory &f, cl_mem m, size_t index, size_t size ) : my_impl( std::make_shared<impl_type>( m, index*sizeof(T), size*sizeof(T), f ) ) {}
|
|
|
|
typedef opencl_buffer_impl<Factory> impl_type;
|
|
|
|
std::shared_ptr<impl_type> my_impl;
|
|
|
|
friend bool operator==(const opencl_buffer<T, Factory> &lhs, const opencl_buffer<T, Factory> &rhs) {
|
|
return lhs.my_impl == rhs.my_impl;
|
|
}
|
|
|
|
template <typename>
|
|
friend class opencl_factory;
|
|
template <typename, typename>
|
|
friend class opencl_subbuffer;
|
|
};
|
|
|
|
template <typename T, typename Factory>
|
|
class opencl_subbuffer : public opencl_buffer<T, Factory> {
|
|
opencl_buffer<T, Factory> my_owner;
|
|
public:
|
|
opencl_subbuffer() {}
|
|
opencl_subbuffer( const opencl_buffer<T, Factory> &owner, size_t index, size_t size ) :
|
|
opencl_buffer<T, Factory>( *owner.my_impl->factory(), owner.native_object(), index, size ), my_owner( owner ) {}
|
|
};
|
|
|
|
template <typename T, typename Factory>
|
|
opencl_subbuffer<T, Factory> opencl_buffer<T, Factory>::subbuffer( size_t index, size_t size ) const {
|
|
return opencl_subbuffer<T, Factory>( *this, index, size );
|
|
}
|
|
|
|
|
|
#define is_typedef(type) \
|
|
template <typename T> \
|
|
struct is_##type { \
|
|
template <typename C> \
|
|
static std::true_type check( typename C::type* ); \
|
|
template <typename C> \
|
|
static std::false_type check( ... ); \
|
|
\
|
|
static const bool value = decltype(check<T>(0))::value; \
|
|
}
|
|
|
|
is_typedef( native_object_type );
|
|
is_typedef( memory_object_type );
|
|
|
|
template <typename T>
|
|
typename std::enable_if<is_native_object_type<T>::value, typename T::native_object_type>::type get_native_object( const T &t ) {
|
|
return t.native_object();
|
|
}
|
|
|
|
template <typename T>
|
|
typename std::enable_if<!is_native_object_type<T>::value, T>::type get_native_object( T t ) {
|
|
return t;
|
|
}
|
|
|
|
// send_if_memory_object checks if the T type has memory_object_type and call the send method for the object.
|
|
template <typename T, typename Factory>
|
|
typename std::enable_if<is_memory_object_type<T>::value>::type send_if_memory_object( opencl_device device, dependency_msg<T, Factory> &dmsg ) {
|
|
const T &t = dmsg.data( false );
|
|
typedef typename T::memory_object_type mem_obj_t;
|
|
mem_obj_t mem_obj = t.memory_object();
|
|
dependency_msg<mem_obj_t, Factory> d( mem_obj );
|
|
if ( dmsg.get_event() ) d.set_event( *dmsg.get_event() );
|
|
mem_obj.send( device, d );
|
|
if ( d.get_event() ) dmsg.set_event( *d.get_event() );
|
|
}
|
|
|
|
template <typename T>
|
|
typename std::enable_if<is_memory_object_type<T>::value>::type send_if_memory_object( opencl_device device, T &t ) {
|
|
typedef typename T::memory_object_type mem_obj_t;
|
|
mem_obj_t mem_obj = t.memory_object();
|
|
dependency_msg<mem_obj_t, typename mem_obj_t::opencl_factory_type> dmsg( mem_obj );
|
|
mem_obj.send( device, dmsg );
|
|
}
|
|
|
|
template <typename T>
|
|
typename std::enable_if<!is_memory_object_type<T>::value>::type send_if_memory_object( opencl_device, T& ) {};
|
|
|
|
// receive_if_memory_object checks if the T type has memory_object_type and call the receive method for the object.
|
|
template <typename T, typename Factory>
|
|
typename std::enable_if<is_memory_object_type<T>::value>::type receive_if_memory_object( const dependency_msg<T, Factory> &dmsg ) {
|
|
const T &t = dmsg.data( false );
|
|
typedef typename T::memory_object_type mem_obj_t;
|
|
mem_obj_t mem_obj = t.memory_object();
|
|
dependency_msg<mem_obj_t, Factory> d( mem_obj );
|
|
if ( dmsg.get_event() ) d.set_event( *dmsg.get_event() );
|
|
mem_obj.receive( d );
|
|
if ( d.get_event() ) dmsg.set_event( *d.get_event() );
|
|
}
|
|
|
|
template <typename T>
|
|
typename std::enable_if<!is_memory_object_type<T>::value>::type receive_if_memory_object( const T& ) {}
|
|
|
|
class opencl_range {
|
|
public:
|
|
typedef size_t range_index_type;
|
|
typedef std::array<range_index_type, 3> nd_range_type;
|
|
|
|
template <typename G = std::initializer_list<int>, typename L = std::initializer_list<int>,
|
|
typename = typename std::enable_if<!std::is_same<typename std::decay<G>::type, opencl_range>::value>::type>
|
|
opencl_range(G&& global_work = std::initializer_list<int>({ 0 }), L&& local_work = std::initializer_list<int>({ 0, 0, 0 })) {
|
|
auto g_it = global_work.begin();
|
|
auto l_it = local_work.begin();
|
|
my_global_work_size = { size_t(-1), size_t(-1), size_t(-1) };
|
|
// my_local_work_size is still uninitialized
|
|
for (int s = 0; s < 3 && g_it != global_work.end(); ++g_it, ++l_it, ++s) {
|
|
__TBB_ASSERT(l_it != local_work.end(), "global_work & local_work must have same size");
|
|
my_global_work_size[s] = *g_it;
|
|
my_local_work_size[s] = *l_it;
|
|
}
|
|
}
|
|
|
|
const nd_range_type& global_range() const { return my_global_work_size; }
|
|
const nd_range_type& local_range() const { return my_local_work_size; }
|
|
|
|
private:
|
|
nd_range_type my_global_work_size;
|
|
nd_range_type my_local_work_size;
|
|
};
|
|
|
|
template <typename DeviceFilter>
|
|
class opencl_factory {
|
|
public:
|
|
template<typename T> using async_msg_type = dependency_msg<T, opencl_factory<DeviceFilter>>;
|
|
typedef opencl_device device_type;
|
|
|
|
class kernel : tbb::internal::no_assign {
|
|
public:
|
|
kernel( const kernel& k ) : my_factory( k.my_factory ) {
|
|
// Clone my_cl_kernel via opencl_program
|
|
size_t ret_size = 0;
|
|
|
|
std::vector<char> kernel_name;
|
|
for ( size_t curr_size = 32;; curr_size <<= 1 ) {
|
|
kernel_name.resize( curr_size <<= 1 );
|
|
enforce_cl_retcode( clGetKernelInfo( k.my_cl_kernel, CL_KERNEL_FUNCTION_NAME, curr_size, kernel_name.data(), &ret_size ), "Failed to get kernel info" );
|
|
if ( ret_size < curr_size ) break;
|
|
}
|
|
|
|
cl_program program;
|
|
enforce_cl_retcode( clGetKernelInfo( k.my_cl_kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, &ret_size ), "Failed to get kernel info" );
|
|
__TBB_ASSERT( ret_size == sizeof(program), NULL );
|
|
|
|
my_cl_kernel = opencl_program< factory_type >( my_factory, program ).get_cl_kernel( kernel_name.data() );
|
|
}
|
|
|
|
~kernel() {
|
|
enforce_cl_retcode( clReleaseKernel( my_cl_kernel ), "Failed to release a kernel" );
|
|
}
|
|
|
|
private:
|
|
typedef opencl_factory<DeviceFilter> factory_type;
|
|
|
|
kernel( const cl_kernel& k, factory_type& f ) : my_cl_kernel( k ), my_factory( f ) {}
|
|
|
|
// Data
|
|
cl_kernel my_cl_kernel;
|
|
factory_type& my_factory;
|
|
|
|
template <typename DeviceFilter_>
|
|
friend class opencl_factory;
|
|
|
|
template <typename Factory>
|
|
friend class opencl_program;
|
|
};
|
|
|
|
typedef kernel kernel_type;
|
|
|
|
// 'range_type' enables kernel_executor with range support
|
|
// it affects expectations for enqueue_kernel(.....) interface method
|
|
typedef opencl_range range_type;
|
|
|
|
opencl_factory( opencl_graph &g ) : my_graph( g ) {}
|
|
~opencl_factory() {
|
|
if ( my_devices.size() ) {
|
|
for ( auto d = my_devices.begin(); d != my_devices.end(); ++d ) {
|
|
enforce_cl_retcode( clReleaseCommandQueue( (*d).my_cl_command_queue ), "Failed to release a command queue" );
|
|
}
|
|
enforce_cl_retcode( clReleaseContext( my_cl_context ), "Failed to release a context" );
|
|
}
|
|
}
|
|
|
|
bool init( const opencl_device_list &device_list ) {
|
|
tbb::spin_mutex::scoped_lock lock( my_devices_mutex );
|
|
if ( !my_devices.size() ) {
|
|
my_devices = device_list;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
private:
|
|
template <typename Factory>
|
|
void enque_map_buffer( opencl_device device, opencl_buffer_impl<Factory> &buffer, dependency_msg<void*, Factory>& dmsg ) {
|
|
cl_event const* e1 = dmsg.get_event();
|
|
cl_event e2;
|
|
cl_int err;
|
|
void *ptr = clEnqueueMapBuffer( device.my_cl_command_queue, buffer.get_cl_mem(), false, CL_MAP_READ | CL_MAP_WRITE, 0, buffer.size(),
|
|
e1 == NULL ? 0 : 1, e1, &e2, &err );
|
|
enforce_cl_retcode( err, "Failed to map a buffer" );
|
|
dmsg.data( false ) = ptr;
|
|
dmsg.set_event( e2 );
|
|
enforce_cl_retcode( clReleaseEvent( e2 ), "Failed to release an event" );
|
|
}
|
|
|
|
|
|
template <typename Factory>
|
|
void enque_unmap_buffer( opencl_device device, opencl_memory<Factory> &memory, dependency_msg<void*, Factory>& dmsg ) {
|
|
cl_event const* e1 = dmsg.get_event();
|
|
cl_event e2;
|
|
enforce_cl_retcode(
|
|
clEnqueueUnmapMemObject( device.my_cl_command_queue, memory.get_cl_mem(), memory.get_host_ptr(), e1 == NULL ? 0 : 1, e1, &e2 ),
|
|
"Failed to unmap a buffer" );
|
|
dmsg.set_event( e2 );
|
|
enforce_cl_retcode( clReleaseEvent( e2 ), "Failed to release an event" );
|
|
}
|
|
|
|
// --------- Kernel argument & event list helpers --------- //
|
|
template <size_t NUM_ARGS, typename T>
|
|
void process_one_arg( const kernel_type& kernel, std::array<cl_event, NUM_ARGS>&, int&, int& place, const T& t ) {
|
|
auto p = get_native_object(t);
|
|
enforce_cl_retcode( clSetKernelArg(kernel.my_cl_kernel, place++, sizeof(p), &p), "Failed to set a kernel argument" );
|
|
}
|
|
|
|
template <size_t NUM_ARGS, typename T, typename F>
|
|
void process_one_arg( const kernel_type& kernel, std::array<cl_event, NUM_ARGS>& events, int& num_events, int& place, const dependency_msg<T, F>& msg ) {
|
|
__TBB_ASSERT((static_cast<typename std::array<cl_event, NUM_ARGS>::size_type>(num_events) < events.size()), NULL);
|
|
|
|
const cl_event * const e = msg.get_event();
|
|
if (e != NULL) {
|
|
events[num_events++] = *e;
|
|
}
|
|
|
|
process_one_arg( kernel, events, num_events, place, msg.data(false) );
|
|
}
|
|
|
|
template <size_t NUM_ARGS, typename T, typename ...Rest>
|
|
void process_arg_list( const kernel_type& kernel, std::array<cl_event, NUM_ARGS>& events, int& num_events, int& place, const T& t, const Rest&... args ) {
|
|
process_one_arg( kernel, events, num_events, place, t );
|
|
process_arg_list( kernel, events, num_events, place, args... );
|
|
}
|
|
|
|
template <size_t NUM_ARGS>
|
|
void process_arg_list( const kernel_type&, std::array<cl_event, NUM_ARGS>&, int&, int& ) {}
|
|
// ------------------------------------------- //
|
|
template <typename T>
|
|
void update_one_arg( cl_event, T& ) {}
|
|
|
|
template <typename T, typename F>
|
|
void update_one_arg( cl_event e, dependency_msg<T, F>& msg ) {
|
|
msg.set_event( e );
|
|
msg.set_graph( my_graph );
|
|
}
|
|
|
|
template <typename T, typename ...Rest>
|
|
void update_arg_list( cl_event e, T& t, Rest&... args ) {
|
|
update_one_arg( e, t );
|
|
update_arg_list( e, args... );
|
|
}
|
|
|
|
void update_arg_list( cl_event ) {}
|
|
// ------------------------------------------- //
|
|
public:
|
|
template <typename ...Args>
|
|
void send_kernel( opencl_device device, const kernel_type& kernel, const range_type& work_size, Args&... args ) {
|
|
std::array<cl_event, sizeof...(Args)> events;
|
|
int num_events = 0;
|
|
int place = 0;
|
|
process_arg_list( kernel, events, num_events, place, args... );
|
|
|
|
const cl_event e = send_kernel_impl( device, kernel.my_cl_kernel, work_size, num_events, events.data() );
|
|
|
|
update_arg_list(e, args...);
|
|
|
|
// Release our own reference to cl_event
|
|
enforce_cl_retcode( clReleaseEvent(e), "Failed to release an event" );
|
|
}
|
|
|
|
// ------------------------------------------- //
|
|
template <typename T, typename ...Rest>
|
|
void send_data(opencl_device device, T& t, Rest&... args) {
|
|
send_if_memory_object( device, t );
|
|
send_data( device, args... );
|
|
}
|
|
|
|
void send_data(opencl_device) {}
|
|
// ------------------------------------------- //
|
|
|
|
private:
|
|
cl_event send_kernel_impl( opencl_device device, const cl_kernel& kernel,
|
|
const range_type& work_size, cl_uint num_events, cl_event* event_list ) {
|
|
const typename range_type::nd_range_type g_offset = { { 0, 0, 0 } };
|
|
const typename range_type::nd_range_type& g_size = work_size.global_range();
|
|
const typename range_type::nd_range_type& l_size = work_size.local_range();
|
|
cl_uint s;
|
|
for ( s = 1; s < 3 && g_size[s] != size_t(-1); ++s) {}
|
|
cl_event event;
|
|
enforce_cl_retcode(
|
|
clEnqueueNDRangeKernel( device.my_cl_command_queue, kernel, s,
|
|
g_offset.data(), g_size.data(), l_size[0] ? l_size.data() : NULL, num_events, num_events ? event_list : NULL, &event ),
|
|
"Failed to enqueue a kernel" );
|
|
return event;
|
|
}
|
|
|
|
// ------------------------------------------- //
|
|
template <typename T>
|
|
bool get_event_from_one_arg( cl_event&, const T& ) {
|
|
return false;
|
|
}
|
|
|
|
template <typename T, typename F>
|
|
bool get_event_from_one_arg( cl_event& e, const dependency_msg<T, F>& msg) {
|
|
cl_event const *e_ptr = msg.get_event();
|
|
|
|
if ( e_ptr != NULL ) {
|
|
e = *e_ptr;
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
template <typename T, typename ...Rest>
|
|
bool get_event_from_args( cl_event& e, const T& t, const Rest&... args ) {
|
|
if ( get_event_from_one_arg( e, t ) ) {
|
|
return true;
|
|
}
|
|
|
|
return get_event_from_args( e, args... );
|
|
}
|
|
|
|
bool get_event_from_args( cl_event& ) {
|
|
return false;
|
|
}
|
|
// ------------------------------------------- //
|
|
|
|
struct finalize_fn : tbb::internal::no_assign {
|
|
virtual ~finalize_fn() {}
|
|
virtual void operator() () {}
|
|
};
|
|
|
|
template<typename Fn>
|
|
struct finalize_fn_leaf : public finalize_fn {
|
|
Fn my_fn;
|
|
finalize_fn_leaf(Fn fn) : my_fn(fn) {}
|
|
void operator() () __TBB_override { my_fn(); }
|
|
};
|
|
|
|
static void CL_CALLBACK finalize_callback(cl_event, cl_int event_command_exec_status, void *data) {
|
|
tbb::internal::suppress_unused_warning(event_command_exec_status);
|
|
__TBB_ASSERT(event_command_exec_status == CL_COMPLETE, NULL);
|
|
|
|
finalize_fn * const fn_ptr = static_cast<finalize_fn*>(data);
|
|
__TBB_ASSERT(fn_ptr != NULL, "Invalid finalize function pointer");
|
|
(*fn_ptr)();
|
|
|
|
// Function pointer was created by 'new' & this callback must be called once only
|
|
delete fn_ptr;
|
|
}
|
|
public:
|
|
template <typename FinalizeFn, typename ...Args>
|
|
void finalize( opencl_device device, FinalizeFn fn, Args&... args ) {
|
|
cl_event e;
|
|
|
|
if ( get_event_from_args( e, args... ) ) {
|
|
enforce_cl_retcode( clSetEventCallback( e, CL_COMPLETE, finalize_callback,
|
|
new finalize_fn_leaf<FinalizeFn>(fn) ), "Failed to set a callback" );
|
|
}
|
|
|
|
enforce_cl_retcode( clFlush( device.my_cl_command_queue ), "Failed to flush an OpenCL command queue" );
|
|
}
|
|
|
|
const opencl_device_list& devices() {
|
|
std::call_once( my_once_flag, &opencl_factory::init_once, this );
|
|
return my_devices;
|
|
}
|
|
|
|
private:
|
|
bool is_same_context( opencl_device::device_id_type d1, opencl_device::device_id_type d2 ) {
|
|
__TBB_ASSERT( d1 != opencl_device::unknown && d2 != opencl_device::unknown, NULL );
|
|
// Currently, factory supports only one context so if the both devices are not host it means the are in the same context.
|
|
if ( d1 != opencl_device::host && d2 != opencl_device::host )
|
|
return true;
|
|
return d1 == d2;
|
|
}
|
|
private:
|
|
opencl_factory( const opencl_factory& );
|
|
opencl_factory& operator=(const opencl_factory&);
|
|
|
|
cl_context context() {
|
|
std::call_once( my_once_flag, &opencl_factory::init_once, this );
|
|
return my_cl_context;
|
|
}
|
|
|
|
void init_once();
|
|
|
|
std::once_flag my_once_flag;
|
|
opencl_device_list my_devices;
|
|
cl_context my_cl_context;
|
|
opencl_graph &my_graph;
|
|
|
|
tbb::spin_mutex my_devices_mutex;
|
|
|
|
template <typename Factory>
|
|
friend class opencl_program;
|
|
template <typename Factory>
|
|
friend class opencl_buffer_impl;
|
|
template <typename Factory>
|
|
friend class opencl_memory;
|
|
};
|
|
|
|
template <typename Factory>
|
|
dependency_msg<void*, Factory> opencl_memory<Factory>::receive( const cl_event *e ) {
|
|
dependency_msg<void*, Factory> d = e ? dependency_msg<void*, Factory>( my_host_ptr, *e ) : dependency_msg<void*, Factory>( my_host_ptr );
|
|
// Concurrent receives are prohibited so we do not worry about synchronization.
|
|
if ( my_curr_device_id.load<tbb::relaxed>() != opencl_device::host ) {
|
|
map_memory( *my_factory->devices().begin(), d );
|
|
my_curr_device_id.store<tbb::relaxed>( opencl_device::host );
|
|
my_host_ptr = d.data( false );
|
|
}
|
|
// Release the sending event
|
|
if ( my_sending_event_present ) {
|
|
enforce_cl_retcode( clReleaseEvent( my_sending_event ), "Failed to release an event" );
|
|
my_sending_event_present = false;
|
|
}
|
|
return d;
|
|
}
|
|
|
|
template <typename Factory>
|
|
dependency_msg<void*, Factory> opencl_memory<Factory>::send( opencl_device device, const cl_event *e ) {
|
|
opencl_device::device_id_type device_id = device.my_device_id;
|
|
if ( !my_factory->is_same_context( my_curr_device_id.load<tbb::acquire>(), device_id ) ) {
|
|
{
|
|
tbb::spin_mutex::scoped_lock lock( my_sending_lock );
|
|
if ( !my_factory->is_same_context( my_curr_device_id.load<tbb::relaxed>(), device_id ) ) {
|
|
__TBB_ASSERT( my_host_ptr, "The buffer has not been mapped" );
|
|
dependency_msg<void*, Factory> d( my_host_ptr );
|
|
my_factory->enque_unmap_buffer( device, *this, d );
|
|
my_sending_event = *d.get_event();
|
|
my_sending_event_present = true;
|
|
enforce_cl_retcode( clRetainEvent( my_sending_event ), "Failed to retain an event" );
|
|
my_host_ptr = NULL;
|
|
my_curr_device_id.store<tbb::release>(device_id);
|
|
}
|
|
}
|
|
__TBB_ASSERT( my_sending_event_present, NULL );
|
|
}
|
|
|
|
// !e means that buffer has come from the host
|
|
if ( !e && my_sending_event_present ) e = &my_sending_event;
|
|
|
|
__TBB_ASSERT( !my_host_ptr, "The buffer has not been unmapped" );
|
|
return e ? dependency_msg<void*, Factory>( NULL, *e ) : dependency_msg<void*, Factory>( NULL );
|
|
}
|
|
|
|
struct default_opencl_factory_device_filter {
|
|
opencl_device_list operator()( const opencl_device_list &devices ) {
|
|
opencl_device_list dl;
|
|
dl.add( *devices.begin() );
|
|
return dl;
|
|
}
|
|
};
|
|
|
|
class default_opencl_factory : public opencl_factory < default_opencl_factory_device_filter > {
|
|
public:
|
|
template<typename T> using async_msg_type = dependency_msg<T, default_opencl_factory>;
|
|
|
|
default_opencl_factory( opencl_graph &g ) : opencl_factory( g ) {}
|
|
private:
|
|
default_opencl_factory( const default_opencl_factory& );
|
|
default_opencl_factory& operator=(const default_opencl_factory&);
|
|
};
|
|
|
|
class opencl_foundation : tbb::internal::no_assign {
|
|
struct default_device_selector_type {
|
|
opencl_device operator()( default_opencl_factory& f ) {
|
|
__TBB_ASSERT( ! f.devices().empty(), "No available devices" );
|
|
return *( f.devices().begin() );
|
|
}
|
|
};
|
|
public:
|
|
opencl_foundation(opencl_graph &g) : my_default_opencl_factory(g), my_default_device_selector() {
|
|
cl_uint num_platforms;
|
|
enforce_cl_retcode(clGetPlatformIDs(0, NULL, &num_platforms), "clGetPlatformIDs failed");
|
|
|
|
std::vector<cl_platform_id> platforms(num_platforms);
|
|
enforce_cl_retcode(clGetPlatformIDs(num_platforms, platforms.data(), NULL), "clGetPlatformIDs failed");
|
|
|
|
cl_uint num_devices;
|
|
std::vector<cl_platform_id>::iterator platforms_it = platforms.begin();
|
|
cl_uint num_all_devices = 0;
|
|
while (platforms_it != platforms.end()) {
|
|
cl_int err = clGetDeviceIDs(*platforms_it, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
|
|
if (err == CL_DEVICE_NOT_FOUND) {
|
|
platforms_it = platforms.erase(platforms_it);
|
|
} else {
|
|
enforce_cl_retcode(err, "clGetDeviceIDs failed");
|
|
num_all_devices += num_devices;
|
|
++platforms_it;
|
|
}
|
|
}
|
|
|
|
std::vector<cl_device_id> devices(num_all_devices);
|
|
std::vector<cl_device_id>::iterator devices_it = devices.begin();
|
|
for (auto p = platforms.begin(); p != platforms.end(); ++p) {
|
|
enforce_cl_retcode(clGetDeviceIDs((*p), CL_DEVICE_TYPE_ALL, (cl_uint)std::distance(devices_it, devices.end()), &*devices_it, &num_devices), "clGetDeviceIDs failed");
|
|
devices_it += num_devices;
|
|
}
|
|
|
|
for (auto d = devices.begin(); d != devices.end(); ++d) {
|
|
my_devices.add(opencl_device((*d)));
|
|
}
|
|
}
|
|
|
|
default_opencl_factory &get_default_opencl_factory() {
|
|
return my_default_opencl_factory;
|
|
}
|
|
|
|
const opencl_device_list &get_all_devices() {
|
|
return my_devices;
|
|
}
|
|
|
|
default_device_selector_type get_default_device_selector() { return my_default_device_selector; }
|
|
|
|
private:
|
|
default_opencl_factory my_default_opencl_factory;
|
|
opencl_device_list my_devices;
|
|
|
|
const default_device_selector_type my_default_device_selector;
|
|
};
|
|
|
|
opencl_foundation &opencl_graph::get_opencl_foundation() {
|
|
opencl_foundation* INITIALIZATION = (opencl_foundation*)1;
|
|
if ( my_opencl_foundation <= INITIALIZATION ) {
|
|
if ( tbb::internal::as_atomic( my_opencl_foundation ).compare_and_swap( INITIALIZATION, NULL ) == 0 ) {
|
|
my_opencl_foundation = new opencl_foundation( *this );
|
|
}
|
|
else {
|
|
tbb::internal::spin_wait_while_eq( my_opencl_foundation, INITIALIZATION );
|
|
}
|
|
}
|
|
|
|
__TBB_ASSERT( my_opencl_foundation > INITIALIZATION, "opencl_foundation is not initialized");
|
|
return *my_opencl_foundation;
|
|
}
|
|
|
|
opencl_graph::~opencl_graph() {
|
|
if ( my_opencl_foundation )
|
|
delete my_opencl_foundation;
|
|
}
|
|
|
|
template <typename DeviceFilter>
|
|
void opencl_factory<DeviceFilter>::init_once() {
|
|
{
|
|
tbb::spin_mutex::scoped_lock lock( my_devices_mutex );
|
|
if ( !my_devices.size() )
|
|
my_devices = DeviceFilter()(my_graph.get_opencl_foundation().get_all_devices());
|
|
}
|
|
|
|
enforce_cl_retcode( my_devices.size() ? CL_SUCCESS : CL_INVALID_DEVICE, "No devices in the device list" );
|
|
cl_platform_id platform_id = my_devices.begin()->platform();
|
|
for ( opencl_device_list::iterator it = ++my_devices.begin(); it != my_devices.end(); ++it )
|
|
enforce_cl_retcode( it->platform() == platform_id ? CL_SUCCESS : CL_INVALID_PLATFORM, "All devices should be in the same platform" );
|
|
|
|
std::vector<cl_device_id> cl_device_ids;
|
|
for (auto d = my_devices.begin(); d != my_devices.end(); ++d) {
|
|
cl_device_ids.push_back((*d).my_cl_device_id);
|
|
}
|
|
|
|
cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, (cl_context_properties)NULL };
|
|
cl_int err;
|
|
cl_context ctx = clCreateContext( context_properties,
|
|
(cl_uint)cl_device_ids.size(),
|
|
cl_device_ids.data(),
|
|
NULL, NULL, &err );
|
|
enforce_cl_retcode( err, "Failed to create context" );
|
|
my_cl_context = ctx;
|
|
|
|
size_t device_counter = 0;
|
|
for ( auto d = my_devices.begin(); d != my_devices.end(); d++ ) {
|
|
(*d).my_device_id = device_counter++;
|
|
cl_int err2;
|
|
cl_command_queue cq;
|
|
#if CL_VERSION_2_0
|
|
if ( (*d).major_version() >= 2 ) {
|
|
if ( (*d).out_of_order_exec_mode_on_host_present() ) {
|
|
cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0 };
|
|
cq = clCreateCommandQueueWithProperties( ctx, (*d).my_cl_device_id, props, &err2 );
|
|
} else {
|
|
cl_queue_properties props[] = { 0 };
|
|
cq = clCreateCommandQueueWithProperties( ctx, (*d).my_cl_device_id, props, &err2 );
|
|
}
|
|
} else
|
|
#endif
|
|
{
|
|
cl_command_queue_properties props = (*d).out_of_order_exec_mode_on_host_present() ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0;
|
|
// Suppress "declared deprecated" warning for the next line.
|
|
#if __TBB_GCC_WARNING_SUPPRESSION_PRESENT
|
|
#pragma GCC diagnostic push
|
|
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
|
#endif
|
|
#if _MSC_VER || __INTEL_COMPILER
|
|
#pragma warning( push )
|
|
#if __INTEL_COMPILER
|
|
#pragma warning (disable: 1478)
|
|
#else
|
|
#pragma warning (disable: 4996)
|
|
#endif
|
|
#endif
|
|
cq = clCreateCommandQueue( ctx, (*d).my_cl_device_id, props, &err2 );
|
|
#if _MSC_VER || __INTEL_COMPILER
|
|
#pragma warning( pop )
|
|
#endif
|
|
#if __TBB_GCC_WARNING_SUPPRESSION_PRESENT
|
|
#pragma GCC diagnostic pop
|
|
#endif
|
|
}
|
|
enforce_cl_retcode( err2, "Failed to create command queue" );
|
|
(*d).my_cl_command_queue = cq;
|
|
}
|
|
}
|
|
|
|
const opencl_device_list &opencl_graph::available_devices() {
|
|
return get_opencl_foundation().get_all_devices();
|
|
}
|
|
|
|
default_opencl_factory &opencl_graph::opencl_factory() {
|
|
return get_opencl_foundation().get_default_opencl_factory();
|
|
}
|
|
|
|
template <typename T, typename Factory>
|
|
opencl_buffer<T, Factory>::opencl_buffer( opencl_graph &g, size_t size ) : my_impl( std::make_shared<impl_type>( size*sizeof(T), g.get_opencl_foundation().get_default_opencl_factory() ) ) {}
|
|
|
|
|
|
enum class opencl_program_type {
|
|
SOURCE,
|
|
PRECOMPILED,
|
|
SPIR
|
|
};
|
|
|
|
template <typename Factory = default_opencl_factory>
|
|
class opencl_program : tbb::internal::no_assign {
|
|
public:
|
|
typedef typename Factory::kernel_type kernel_type;
|
|
|
|
opencl_program( Factory& factory, opencl_program_type type, const std::string& program_name ) : my_factory( factory ), my_type(type) , my_arg_str( program_name) {}
|
|
opencl_program( Factory& factory, const char* program_name ) : opencl_program( factory, std::string( program_name ) ) {}
|
|
opencl_program( Factory& factory, const std::string& program_name ) : opencl_program( factory, opencl_program_type::SOURCE, program_name ) {}
|
|
|
|
opencl_program( opencl_graph& graph, opencl_program_type type, const std::string& program_name ) : opencl_program( graph.opencl_factory(), type, program_name ) {}
|
|
opencl_program( opencl_graph& graph, const char* program_name ) : opencl_program( graph.opencl_factory(), program_name ) {}
|
|
opencl_program( opencl_graph& graph, const std::string& program_name ) : opencl_program( graph.opencl_factory(), program_name ) {}
|
|
opencl_program( opencl_graph& graph, opencl_program_type type ) : opencl_program( graph.opencl_factory(), type ) {}
|
|
|
|
opencl_program( const opencl_program &src ) : my_factory( src.my_factory ), my_type( src.type ), my_arg_str( src.my_arg_str ), my_cl_program( src.my_cl_program ) {
|
|
// Set my_do_once_flag to the called state.
|
|
std::call_once( my_do_once_flag, [](){} );
|
|
}
|
|
|
|
kernel_type get_kernel( const std::string& k ) const {
|
|
return kernel_type( get_cl_kernel(k), my_factory );
|
|
}
|
|
|
|
private:
|
|
opencl_program( Factory& factory, cl_program program ) : my_factory( factory ), my_cl_program( program ) {
|
|
// Set my_do_once_flag to the called state.
|
|
std::call_once( my_do_once_flag, [](){} );
|
|
}
|
|
|
|
cl_kernel get_cl_kernel( const std::string& k ) const {
|
|
std::call_once( my_do_once_flag, [this, &k](){ this->init( k ); } );
|
|
cl_int err;
|
|
cl_kernel kernel = clCreateKernel( my_cl_program, k.c_str(), &err );
|
|
enforce_cl_retcode( err, std::string( "Failed to create kernel: " ) + k );
|
|
return kernel;
|
|
}
|
|
|
|
class file_reader {
|
|
public:
|
|
file_reader( const std::string& filepath ) {
|
|
std::ifstream file_descriptor( filepath, std::ifstream::binary );
|
|
if ( !file_descriptor.is_open() ) {
|
|
std::string str = std::string( "Could not open file: " ) + filepath;
|
|
std::cerr << str << std::endl;
|
|
throw str;
|
|
}
|
|
file_descriptor.seekg( 0, file_descriptor.end );
|
|
size_t length = size_t( file_descriptor.tellg() );
|
|
file_descriptor.seekg( 0, file_descriptor.beg );
|
|
my_content.resize( length );
|
|
char* begin = &*my_content.begin();
|
|
file_descriptor.read( begin, length );
|
|
file_descriptor.close();
|
|
}
|
|
const char* content() { return &*my_content.cbegin(); }
|
|
size_t length() { return my_content.length(); }
|
|
private:
|
|
std::string my_content;
|
|
};
|
|
|
|
class opencl_program_builder {
|
|
public:
|
|
typedef void (CL_CALLBACK *cl_callback_type)(cl_program, void*);
|
|
opencl_program_builder( Factory& f, const std::string& name, cl_program program,
|
|
cl_uint num_devices, cl_device_id* device_list,
|
|
const char* options, cl_callback_type callback,
|
|
void* user_data ) {
|
|
cl_int err = clBuildProgram( program, num_devices, device_list, options,
|
|
callback, user_data );
|
|
if( err == CL_SUCCESS )
|
|
return;
|
|
std::string str = std::string( "Failed to build program: " ) + name;
|
|
if ( err == CL_BUILD_PROGRAM_FAILURE ) {
|
|
const opencl_device_list &devices = f.devices();
|
|
for ( auto d = devices.begin(); d != devices.end(); ++d ) {
|
|
std::cerr << "Build log for device: " << (*d).name() << std::endl;
|
|
size_t log_size;
|
|
cl_int query_err = clGetProgramBuildInfo(
|
|
program, (*d).my_cl_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL,
|
|
&log_size );
|
|
enforce_cl_retcode( query_err, "Failed to get build log size" );
|
|
if( log_size ) {
|
|
std::vector<char> output;
|
|
output.resize( log_size );
|
|
query_err = clGetProgramBuildInfo(
|
|
program, (*d).my_cl_device_id, CL_PROGRAM_BUILD_LOG,
|
|
output.size(), output.data(), NULL );
|
|
enforce_cl_retcode( query_err, "Failed to get build output" );
|
|
std::cerr << output.data() << std::endl;
|
|
} else {
|
|
std::cerr << "No build log available" << std::endl;
|
|
}
|
|
}
|
|
}
|
|
enforce_cl_retcode( err, str );
|
|
}
|
|
};
|
|
|
|
class opencl_device_filter {
|
|
public:
|
|
template<typename Filter>
|
|
opencl_device_filter( cl_uint& num_devices, cl_device_id* device_list,
|
|
Filter filter, const char* message ) {
|
|
for ( cl_uint i = 0; i < num_devices; ++i )
|
|
if ( filter(device_list[i]) ) {
|
|
device_list[i--] = device_list[--num_devices];
|
|
}
|
|
if ( !num_devices )
|
|
enforce_cl_retcode( CL_DEVICE_NOT_AVAILABLE, message );
|
|
}
|
|
};
|
|
|
|
void init( const std::string& ) const {
|
|
cl_uint num_devices;
|
|
enforce_cl_retcode( clGetContextInfo( my_factory.context(), CL_CONTEXT_NUM_DEVICES, sizeof( num_devices ), &num_devices, NULL ),
|
|
"Failed to get OpenCL context info" );
|
|
if ( !num_devices )
|
|
enforce_cl_retcode( CL_DEVICE_NOT_FOUND, "No supported devices found" );
|
|
cl_device_id *device_list = (cl_device_id *)alloca( num_devices*sizeof( cl_device_id ) );
|
|
enforce_cl_retcode( clGetContextInfo( my_factory.context(), CL_CONTEXT_DEVICES, num_devices*sizeof( cl_device_id ), device_list, NULL ),
|
|
"Failed to get OpenCL context info" );
|
|
const char *options = NULL;
|
|
switch ( my_type ) {
|
|
case opencl_program_type::SOURCE: {
|
|
file_reader fr( my_arg_str );
|
|
const char *s[] = { fr.content() };
|
|
const size_t l[] = { fr.length() };
|
|
cl_int err;
|
|
my_cl_program = clCreateProgramWithSource( my_factory.context(), 1, s, l, &err );
|
|
enforce_cl_retcode( err, std::string( "Failed to create program: " ) + my_arg_str );
|
|
opencl_device_filter(
|
|
num_devices, device_list,
|
|
[]( const opencl_device& d ) -> bool {
|
|
return !d.compiler_available() || !d.linker_available();
|
|
}, "No one device supports building program from sources" );
|
|
opencl_program_builder(
|
|
my_factory, my_arg_str, my_cl_program, num_devices, device_list,
|
|
options, /*callback*/ NULL, /*user data*/NULL );
|
|
break;
|
|
}
|
|
case opencl_program_type::SPIR:
|
|
options = "-x spir";
|
|
case opencl_program_type::PRECOMPILED: {
|
|
file_reader fr( my_arg_str );
|
|
std::vector<const unsigned char*> s(
|
|
num_devices, reinterpret_cast<const unsigned char*>(fr.content()) );
|
|
std::vector<size_t> l( num_devices, fr.length() );
|
|
std::vector<cl_int> bin_statuses( num_devices, -1 );
|
|
cl_int err;
|
|
my_cl_program = clCreateProgramWithBinary( my_factory.context(), num_devices,
|
|
device_list, l.data(), s.data(),
|
|
bin_statuses.data(), &err );
|
|
if( err != CL_SUCCESS ) {
|
|
std::string statuses_str;
|
|
for (auto st = bin_statuses.begin(); st != bin_statuses.end(); ++st) {
|
|
statuses_str += std::to_string((*st));
|
|
}
|
|
|
|
enforce_cl_retcode( err, std::string( "Failed to create program, error " + std::to_string( err ) + " : " ) + my_arg_str +
|
|
std::string( ", binary_statuses = " ) + statuses_str );
|
|
}
|
|
opencl_program_builder(
|
|
my_factory, my_arg_str, my_cl_program, num_devices, device_list,
|
|
options, /*callback*/ NULL, /*user data*/NULL );
|
|
break;
|
|
}
|
|
default:
|
|
__TBB_ASSERT( false, "Unsupported program type" );
|
|
}
|
|
}
|
|
|
|
Factory& my_factory;
|
|
opencl_program_type my_type;
|
|
std::string my_arg_str;
|
|
mutable cl_program my_cl_program;
|
|
mutable std::once_flag my_do_once_flag;
|
|
|
|
template <typename DeviceFilter>
|
|
friend class opencl_factory;
|
|
|
|
template <typename DeviceFilter>
|
|
friend class opencl_factory<DeviceFilter>::kernel;
|
|
};
|
|
|
|
template<typename... Args>
|
|
class opencl_node;
|
|
|
|
template<typename JP, typename Factory, typename... Ports>
|
|
class opencl_node< tuple<Ports...>, JP, Factory > : public streaming_node< tuple<Ports...>, JP, Factory > {
|
|
typedef streaming_node < tuple<Ports...>, JP, Factory > base_type;
|
|
public:
|
|
typedef typename base_type::kernel_type kernel_type;
|
|
|
|
opencl_node( opencl_graph &g, const kernel_type& kernel )
|
|
: base_type( g, kernel, g.get_opencl_foundation().get_default_device_selector(), g.get_opencl_foundation().get_default_opencl_factory() )
|
|
{}
|
|
|
|
opencl_node( opencl_graph &g, const kernel_type& kernel, Factory &f )
|
|
: base_type( g, kernel, g.get_opencl_foundation().get_default_device_selector(), f )
|
|
{}
|
|
|
|
template <typename DeviceSelector>
|
|
opencl_node( opencl_graph &g, const kernel_type& kernel, DeviceSelector d, Factory &f)
|
|
: base_type( g, kernel, d, f)
|
|
{}
|
|
};
|
|
|
|
template<typename JP, typename... Ports>
|
|
class opencl_node< tuple<Ports...>, JP > : public opencl_node < tuple<Ports...>, JP, default_opencl_factory > {
|
|
typedef opencl_node < tuple<Ports...>, JP, default_opencl_factory > base_type;
|
|
public:
|
|
typedef typename base_type::kernel_type kernel_type;
|
|
|
|
opencl_node( opencl_graph &g, const kernel_type& kernel )
|
|
: base_type( g, kernel, g.get_opencl_foundation().get_default_device_selector(), g.get_opencl_foundation().get_default_opencl_factory() )
|
|
{}
|
|
|
|
template <typename DeviceSelector>
|
|
opencl_node( opencl_graph &g, const kernel_type& kernel, DeviceSelector d )
|
|
: base_type( g, kernel, d, g.get_opencl_foundation().get_default_opencl_factory() )
|
|
{}
|
|
};
|
|
|
|
template<typename... Ports>
|
|
class opencl_node< tuple<Ports...> > : public opencl_node < tuple<Ports...>, queueing, default_opencl_factory > {
|
|
typedef opencl_node < tuple<Ports...>, queueing, default_opencl_factory > base_type;
|
|
public:
|
|
typedef typename base_type::kernel_type kernel_type;
|
|
|
|
opencl_node( opencl_graph &g, const kernel_type& kernel )
|
|
: base_type( g, kernel, g.get_opencl_foundation().get_default_device_selector(), g.get_opencl_foundation().get_default_opencl_factory() )
|
|
{}
|
|
|
|
template <typename DeviceSelector>
|
|
opencl_node( opencl_graph &g, const kernel_type& kernel, DeviceSelector d )
|
|
: base_type( g, kernel, d, g.get_opencl_foundation().get_default_opencl_factory() )
|
|
{}
|
|
};
|
|
|
|
} // namespace interface9
|
|
|
|
using interface9::opencl_graph;
|
|
using interface9::opencl_node;
|
|
using interface9::read_only;
|
|
using interface9::read_write;
|
|
using interface9::write_only;
|
|
using interface9::opencl_buffer;
|
|
using interface9::opencl_subbuffer;
|
|
using interface9::opencl_device;
|
|
using interface9::opencl_device_list;
|
|
using interface9::opencl_program;
|
|
using interface9::opencl_program_type;
|
|
using interface9::dependency_msg;
|
|
using interface9::opencl_factory;
|
|
using interface9::opencl_range;
|
|
|
|
} // namespace flow
|
|
} // namespace tbb
|
|
#endif /* __TBB_PREVIEW_OPENCL_NODE */
|
|
|
|
#endif // __TBB_flow_graph_opencl_node_H
|