#!/usr/bin/env python
import numpy as np
try:
import pyopencl as cl
import pyopencl.array as cla
import pyopencl.clmath as clm
import pyopencl.algorithm as algorithm
HAS_OPENCL = True
except:
HAS_OPENCL = False
import time
import cl_fft
import warnings
def ignore_compiler_warnings():
import warnings
warnings.filterwarnings('ignore', category=cl.CompilerWarning)
class SimpleCLWarning(Warning):
pass
#Helper class for automatically managing kernel calls.
class AutoKernel(object):
def __init__(self, kernel, parent):
self.kernel = kernel
self.parent = parent
self.work_group = self.parent.max_work_group_size
def __call__(self, *args, **kwargs):
if 'work_group' in kwargs:
work_group = kwargs.pop('work_group')
default_wg = False
else:
work_group = (self.work_group,)
default_wg = True
if type(work_group) is int: work_group = (work_group,)
work_items = kwargs.pop('work_items', None)
if work_items is None: work_items = tuple(map(lambda x: x*self.parent.max_compute_units, work_group))
args = map(lambda x: self.parent.arg_map_types.get(type(x), lambda y:y)(x), args)
try:
return self.kernel(self.parent.queue, work_items, work_group, *args, **kwargs)
except cl.RuntimeError, e:
if 'out of resources' in str(e).lower() and default_wg and self.work_group > 1:
name = self.kernel.get_info(cl.kernel_info.FUNCTION_NAME)
warnings.warn('kernel %s out of resources, trying a smaller work group size (%d => %d)' % (name, self.work_group, self.work_group//2), SimpleCLWarning, stacklevel=2)
self.work_group //= 2
return self(*args, **kwargs)
else:
raise
[docs]def acquire_opencl_devices(devname):
'''Find OpenCL devices with matching descriptions.
Parameters
----------
devname : string
A partial name for an opencl device, or "cpu" or "gpu".
Returns
-------
devices : list of OpenCL devices
All matching devices. May be empty
'''
n = devname.lower()
devs = []
if n in ('cpu', 'gpu'):
for platform in cl.get_platforms():
try: devs += platform.get_devices(cl.device_type.CPU if n=='cpu' else cl.device_type.GPU)
except: pass #Throws an error if no device found on this platform
else:
for platform in cl.get_platforms():
try: devs += filter(lambda dev: n in dev.name.lower(), platform.get_devices())
except: pass #Throws an error if no device found on this platform
return devs
[docs]class CLSession(object):
'''Create an OpenCL session on the specified device, while defining
float precision and some utility functions.
The actual creation of the OpenCL contexts are delayed until necessary.
This allows the context parameters to be modified before it is used --
this is useful for external modules where the user may which to change the
device used for calculations.
The following data types are defined as members of the class:
=============== ======================================= ============
Name OpenCL type Size (bytes)
=============== ======================================= ============
``real`` ``REAL`` (``float`` or ``double``) 4 or 8
``complex`` ``COMPLEX`` (``float2`` or ``double2``) 8 or 16
``char`` ``char`` 1
``uchar`` ``uchar`` 1
``short`` ``short`` 2
``ushort`` ``ushort`` 2
``int`` ``int`` 4
``uint`` ``uint`` 4
``long`` ``long`` 8
``ulong`` ``ulong`` 8
=============== ======================================= ============
The following functions and constants will also be made available to any
kernel compiled via the session:
=========== =================== =============================== ============================
Type Name Parameters Description
=========== =================== =============================== ============================
``COMPLEX`` ``c_mul`` ``COMPLEX x, COMPLEX y`` :math:`x * y`
``COMPLEX`` ``c_div`` ``COMPLEX x, COMPLEX y`` :math:`x / y`
``COMPLEX`` ``conj`` ``COMPLEX x`` :math:`x^*`
``COMPLEX`` ``c_exp`` ``COMPLEX x`` :math:`\exp(x)`
``REAL`` ``c_angle`` ``COMPLEX x`` :math:`\mathrm{arg}(x)`
``REAL`` ``c_abs`` ``COMPLEX x`` :math:`|x|`
``REAL`` ``c_abs_sq`` ``COMPLEX x`` :math:`|x|^2`
``COMPLEX`` ``c_exp_i`` ``REAL x`` :math:`\exp(i x)`
``COMPLEX`` ``native_exp_i`` ``REAL x`` (uses native precision math)
``COMPLEX`` ``c_exp_i_T`` ``REAL x`` :math:`\exp(-i x)`
``COMPLEX`` ``native_exp_i_T`` ``REAL x`` (uses native precision math)
``REAL`` ``PI`` *constant* :math:`\pi`
``REAL`` ``TWO_PI`` *constant* :math:`2 \pi`
``REAL`` ``EULER_CONSTANT`` *constant* :math:`\gamma = 0.57721...`
=========== =================== =============================== ============================
Parameters
----------
device : openCL device, description string or list of strings
The openCL device to use. If a string or list is passed,
:function: `acquire_opencl_devices` will be used to find devices.
As soon as it finds a matching device it stops; lists of devices
used for fallbacks.
use_doubles : bool (default: False)
Specify if REAL and COMPLEX types are double precision
group_size : int (default: max for device)
The default group_size in calls. If not specified, determined by
the max_workgroup_size for the device (recommended). If the device
type is CPU, this will be set to 1 (CPUs sometimes erroneously
report more than 1 for this value)
show_warnings : bool, optional (default: False)
If True, warnings displayed in build error messages.
'''
device_info = cl.device_info
def __init__(self, device=['cpu', 'gpu'], use_doubles=False, group_size=None, show_warnings=False):
self.device = device
self.use_doubles = use_doubles
self.group_size = group_size
self.delayed_code = []
self.char = np.int8
self.uchar = np.uint8
self.short = np.int16
self.ushort = np.uint16
self.int = np.int32
self.uint = np.uint32
self.long = np.int64
self.ulong = np.uint64
self.show_warnings = show_warnings
def __setattr__(self, k, v):
if k in ('device', 'use_doubles', 'group_size') and hasattr(self, 'context'):
raise RuntimeError('this context has already been used -- ')
else: object.__setattr__(self, k, v)
def _check(self):
if not hasattr(self, 'context'):
self.initialize()
def __getattr__(self, attr):
if attr in ('real', 'complex'):
self._check()
if attr == 'real': return np.float64 if self.use_doubles else np.float32
else: return np.complex128 if self.use_doubles else np.complex64
elif hasattr(self, 'kernels') and attr in self.kernels: return self.kernels[attr]
else: raise AttributeError("CLSession has no attribute '%s'" % attr)
[docs] def initialize(self, context=None):
'''Initialize the OpenCL context, if not already done.
Parameters
----------
context : None or pyopencl Context
The OpenCL context to used. If none, created using the preset
"device" attribute.
'''
if hasattr(self, 'context'): return
if not HAS_OPENCL:
raise RuntimeError('pyopencl is not installed on this computer\n(if OpenCL is installed on this computer, try running:\n sudo easy_install pyopencl)')
if context is None:
if type(self.device) is not cl.Device:
if type(self.device) == str: self.device = [self.device]
for d in self.device:
devs = acquire_opencl_devices(d)
if devs:
self.device = devs[0]
break
else:
raise RuntimeError('failed to find the specified openCL devices: %s\nIs opencl installed and configured properly?' % (self.device))
self.context = cl.Context([self.device])
else:
if isinstance(context, CLSession): self.context = context.context
else: self.context = context
self.queue = cl.CommandQueue(self.context)
if self.use_doubles:
self.prefix = '''
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define PYOPENCL_DEFINE_CDOUBLE
#include <pyopencl-complex.h>
#define REAL double
#define REAL2 double2
#define REAL4 double4
#define REAL8 double8
#define COMPLEX cdouble_t
#define c_exp cdouble_exp
#define c_mul cdouble_mul
#define c_div cdouble_div
'''
#COMPLEX native_exp_i(REAL x) {return (COMPLEX)((double)native_cos((float)x), (double)native_sin((float)x));}
#COMPLEX native_exp_i_T(REAL x) {return (COMPLEX)((double)native_cos((float)x), -(double)native_sin((float)x));}
self.real = np.float64
self.complex = np.complex128
else:
self.prefix = '''
#include <pyopencl-complex.h>
#define REAL float
#define REAL2 float2
#define REAL4 float4
#define REAL8 float8
#define COMPLEX cfloat_t
#define c_exp cfloat_exp
#define c_mul cfloat_mul
#define c_div cfloat_div
//NPM commented out//
//COMPLEX native_exp_i(REAL x) {return (COMPLEX)(native_cos(x), native_sin(x));}//
//COMPLEX native_exp_i_T(REAL x) {return (COMPLEX)(native_cos(x), -native_sin(x));}//
'''
self.real = np.float32
self.complex = np.complex64
self.prefix += '''
#define PI (REAL)3.141592653589793
#define TWO_PI (REAL)6.283185307179586
#define EULER_CONSTANT (REAL)0.57721566490153286
#define get_3D(X, index) ((REAL4)(X[3*(index)+0], X[3*(index)+1], X[3*(index)+2], 0.0))
void set_3D(__global REAL* X, int index, REAL4 Y) {
X[3*index + 0] = Y.x;
X[3*index + 1] = Y.y;
X[3*index + 2] = Y.z;
}
REAL c_angle(COMPLEX x) {return atan2(x.y, x.x);}
//NPM commented out//
//COMPLEX c_exp_i(REAL x) {return (COMPLEX)(cos(x), sin(x));}//
//COMPLEX c_exp_i_T(REAL x) {return (COMPLEX)(cos(x), -sin(x));}//
//COMPLEX conj(COMPLEX x) {return (COMPLEX)(x.x, -x.y);}//
//REAL c_abs(COMPLEX x) {return length(x);}//
//REAL c_abs_sq(COMPLEX x) {return dot(x, x);}//
'''
#Get some info.
if self.group_size is None:
if self.device.type == cl.device_type.CPU:
self.max_work_group_size = 1
else:
self.max_work_group_size = self.device.get_info(cl.device_info.MAX_WORK_GROUP_SIZE)
else: self.max_work_group_size = self.group_size
self.max_compute_units = self.device.get_info(cl.device_info.MAX_COMPUTE_UNITS)
self.default_work_items = (self.max_compute_units * self.max_work_group_size, )
self.default_work_group = (self.max_work_group_size,)
self.programs = []
self.kernels = {}
self.arg_map_types = {
cla.Array: lambda x: x.data,
int: lambda x: self.int(x),
float: lambda x: self.real(x),
np.ndarray: lambda x: self.to_device(x).data,
}
for code in self.delayed_code:
self.compile(code)
[docs] def to_device(self, X):
'''Make a copy of a local array on the OpenCL device.
Parameters
----------
X : numpy array
Returns
-------
X_cl : pyopencl array
A version of the array stored on the OpenCL device. If the data
type is float or double, it will be converted to the precision
of the device context.
'''
self._check()
if X.dtype in ('f', 'd'): X = X.astype(self.real)
elif X.dtype in ('complex128', 'complex64'): X = X.astype(self.complex)
return cla.to_device(self.queue, np.require(X, requirements='C'))
[docs] def compile_file(self, fn):
'''Compile OpenCL kernels from file. See :meth:`compile` for details.
'''
with open(fn, 'rt') as f:
code = f.read()
return self.compile(code)
[docs] def compile(self, code):
'''Compile OpenCL kernel code (as a string) on the present device.
Compliation is delayed if this session is not initialized.
If you require access to the Program object (which is usually not
important), call :meth:`initialize` first.
Returns
-------
program : pyopencl Program or None
The compiled program. May be used to make function calls, although
the contained kernels will automatically become functions of the
base class. If context is not initialized yet, returns None.
'''
if not hasattr(self, 'context'):
self.delayed_code.append(code)
return None
else:
try:
p = cl.Program(self.context, self.prefix + code).build()
except cl.RuntimeError, e:
if self.show_warnings: raise
else: #Filter the error messge to get rid of tons and tons of useless warnings.
#prefix_lines = len(self.prefix.splitlines())
prefix_lines = self.prefix.count('\n')
msg = str(e).splitlines()
new_msg = []
while msg:
line = msg.pop(0)
parts = line.split(':')
if len(parts) >= 4 and parts[3].strip().lower() in ('warning', 'note'):
msg = msg[2:]
if new_msg[-1].startswith('In file included') and new_msg[-2].startswith('In file included'):
new_msg.pop(-1)
else:
#Try to change the line number.
if len(parts) >= 4 and parts[3].strip().lower() == 'error':
try: parts[1] = str(int(parts[1]) - prefix_lines)
except: pass
new_msg.append(':'.join(parts))
raise cl.RuntimeError('\n'.join(new_msg))
for kernel in p.all_kernels():
name = kernel.get_info(cl.kernel_info.FUNCTION_NAME)
#print name
kernel = getattr(p, name)
#print name, type(kernel), dir(kernel)
if name in self.kernels: print "WARNING: %s already in kernels, overwriting." % name
self.kernels[name] = AutoKernel(kernel, self)
self.programs.append(p)
return p
[docs] def empty(self, shape, dtype=None):
'''Create an empty array on the OpenCL device.
Parameters
----------
shape : tuple
dtype : numpy data type
Returns
-------
arr : pyopencl array
An empty array of the specified type.
'''
self._check()
if dtype is None: dtype = self.real
return cla.empty(self.queue, shape, dtype=dtype)
[docs] def empty_like(self, arr):
'''Create an empty array with copied shape/dtype.'''
self._check()
return cla.empty_like(arr)
[docs] def zeros(self, shape, dtype=None):
'''Create a zeroed array on the OpenCL device.
Parameters
----------
shape : tuple
dtype : numpy data type
Returns
-------
arr : pyopencl array
A zeroed array of the specified type.
'''
self._check()
if dtype is None: dtype = self.real
return cla.zeros(self.queue, shape, dtype=dtype)
[docs] def ones_like(self, arr):
'''Create a one filled array with copied shape/dtype.'''
self._check()
return self.to_device(np.ones(arr.shape, dtype=arr.dtype))
[docs] def ones(self, shape, dtype=None):
'''Create a one filled array on the OpenCL device.
Parameters
----------
shape : tuple
dtype : numpy data type
Returns
-------
arr : pyopencl array
A zeroed array of the specified type.
'''
self._check()
if dtype is None: dtype = self.real
return self.to_device(np.ones(shape, dtype))
[docs] def zeros_like(self, arr):
'''Create a zeroed array with copied shape/dtype.'''
self._check()
return cla.zeros_like(arr)
[docs] def local_memory(self, size):
'''Create a local memory object on the OpenCL device.
Parameters
----------
size : int
The size (in bytes) of the local memory required.
Returns
-------
mem : cl.LocalMemory
A local memory object for an OpenCL kernel.
'''
self._check()
return cl.LocalMemory(size)
[docs] def enqueue_copy(self, dst, src):
'''Call ``pyopencl.enqueue_copy`` with the session queue.'''
return cl.enqueue_copy(self.queue, dst, src)
[docs] def fft(self, arr, sign=+1, inplace=True, swap=None, max_threads=None):
'''Perform a multidimensional FFT on a pyopencl array.
Note: array dimension sizes must be multiples of 2, 3, 5, 7.
Parameters
----------
arr : pyopencl array, or an object castable to one (e.g. numpy array)
sign : integer (+-1)
Forward or backward transform.
inplace : bool
In place transform? If not, a copy is made first.
Note: if a numpy array is passed it will never by in place!
swap : pyopencl array (default: None)
The swap buffer used in the calculation. If not specified, one
will be created. Should have the same dimensions/type as arr
max_threads : int (default self.max_work_group_size)
The max_thread argument used for the fft code generation. Note that
if a given size FFT has already been generated, this will be
ignored. (Generally, this should be left unset, unless there isn't
enough local memory to transform multiple blocks.)
Returns
-------
arr : opencl array
The output array; will match the input array if inplace.
'''
self._check()
if not hasattr(self, 'fft_kernels'): self.fft_kernels = {}
w_size = max(arr.shape)
data_size = 0
if max_threads is None: max_threads = self.max_work_group_size
for N in arr.shape:
if N not in self.fft_kernels:
func_name = name='fft_shift_%d' % N
blocks_per_group, threads, code = cl_fft.gen_fft_code(N, max_threads=max_threads, func_name=func_name)
#print '-'*40
#print code
#print '-'*40
self.compile(code)
self.fft_kernels[N] = (blocks_per_group, threads, self.kernels[func_name])
else:
blocks_per_group = self.fft_kernels[N][0]
ds = blocks_per_group * N
if ds > data_size: data_size = ds
print data_size, w_size
local_mem = [cl.LocalMemory(np.dtype(self.complex).itemsize * points) for points in (w_size, data_size, data_size)]
if type(arr) == cla.Array:
if inplace: a1 = arr
else: a1 = arr.copy()
else:
a1 = self.to_device(arr)
inplace = False
if swap is not None:
if type(swap) == type(a1) and a1.shape == swap.shape:
a2 = swap
else:
swap = None
if swap is None:
a2 = self.empty_like(a1)
Ntot = np.prod(arr.shape)
for N in arr.shape[::-1]:
#print N, Ntot//N
blocks_per_group, threads, func = self.fft_kernels[N]
job = func(a1, a2, self.int(sign), self.int(Ntot//N), *local_mem) #, work_group=(threads,))
#job.wait()
a1, a2 = a2, a1
if inplace:
if a1 is not arr: cl.enqueue_copy(self.queue, arr.data, a1.data)
return arr
else:
return a1
[docs] def get_device_info(self, field):
'''Get info about the opencl device for the session. See :func:`get_device_info`'''
return get_device_info(self.device, field)
[docs]def get_device_info(device, field):
'''Get information about an opencl device.
Parameters
----------
field : A valid field from ``pyopencl.device_info`` or str
The field to obtain. If specified as a string, name should match a
member of pyopencl.device_info, ignoring case. Examples include:
``"global_mem_size"`` and ``"max_compute_units"``.
Returns
-------
value : varies (usually int)
The value returned by calling device.get_info(...)
'''
if isinstance(field, basestring):
f0 = field
field = getattr(cl.device_info, field.upper(), None)
if field is None:
raise ValueError('"%s" is not a valid device info field.\n(should be a member of pyopencl.device_info)' )
return device.get_info(field)
[docs]def is_job_done(event):
'''Check if an OpenCL task is done.
Parameters
----------
event : pyopencl Event
Returns
-------
done : bool
'''
return event.command_execution_status == cl.command_execution_status.COMPLETE
#__global_cl_session = None
#__global_cl_params = {}
#__global_cl_kernels = []
#
#def get_cl(**kwargs):
# if kwargs and kwargs !=
#This was a dumb idea! Built this into the main CLSession class instead.
#
#class CLSessionOnDemand(object):
# '''An object which returns and OpenCL Session on demand.
#
# Used by other modules which wish to create OpenCL Sessions that allow
# users to optionally configure the calculations.
#
# Also delays compilation -- this is useful if modules have only some
# functions that depend on OpenCL, since it will allow users that don't have
# pyopencl installed to run the other functions.
#
# Parameters
# ----------
# device : list
# use_doubles : bool
# group_size : None or int
# source_file : string
# The filename of a code file to be compiled once the session is initialized.
#
# '''
# def __init__(self, device=['cpu', 'gpu'], use_doubles=False, group_size=None, source_file=None):
# self.device = device
# self.use_doubles = use_doubles
# self.group_size = group_size
# self.source_file = source_file
# self.session = None
#
#
# def _check(self):
# if self.session is not None:
# raise RuntimeError('CLSession parameters need to be set before OpenCL calls')
#
#
# def set_device(self, device):
# '''Set the CLSession device.
#
# Parameters
# ----------
# device : openCL device, description string or list of strings
# The openCL device to use. If a string or list is passed,
# :function: `acquire_opencl_devices` will be used to find devices.
# As soon as it finds a matching device it stops; lists of devices
# used for fallbacks.
# '''
# self._check()
# self.device = device
#
#
# def set_doubles(self, use_doubles):
# '''Set the REAL and COMPLEX precision of the session.
#
# Parameters
# ----------
# use_doubles : bool (default: False)
# Specify if REAL and COMPLEX types are double precision
# '''
# self._check()
# self.use_doubles = use_doubles
#
#
# def set_group_size(self, group_size):
# '''Set the group size of the session.
#
# Parameters
# ----------
# group_size : int (default: max for device)
# The default group_size in calls. If not specified, determined by
# the max_workgroup_size for the device (recommended). If the device
# type is CPU, this will be set to 1 (CPUs sometimes erroniously
# report more than 1 for this value)
# '''
# self._check()
# self.group_size = group_size
#
#
# def use_existing_session(self, cls):
# '''Share an existing CLSession.
#
# *Note: the parameters of the existing session will be used instead
# of those set by this class.*
#
# Parameters
# ----------
# cls : CLSesssion
# '''
# if self.session is not None:
# raise RuntimeError('this on demand CL session has already been created, cannot assign new session')
# self.session = cls
# self.session.complile_file(self.source_file)
#
#
# def get(self):
# '''Return a CLSession, creating it if needed.'''
# if self.session is None:
# self.session = CLSession(device=self.device, group_size=self.group_size)
#
# return self.session
#
if __name__ == '__main__':
NP = 10**6
REPEATS = 10
ctx = CLSession(device='cpu')
ctx.compile('''
__kernel void dot_product(__global REAL* a, __global REAL* b, int np, __global REAL* result)
{
for(int i = get_global_id(0); i < np; i += get_global_size(0)) {
result[i] = dot(get_3D(a, i), get_3D(b, i));
}
}''')
#a = ctx.to_device(np.random.rand(NP, 3))
a = ctx.to_device(np.random.rand(NP, 3))
b = ctx.to_device(np.random.rand(NP, 3))
result = ctx.empty(NP)
start = time.time()
# for n in range(REPEATS):
stat = {
cl.command_execution_status.COMPLETE: 'complete',
cl.command_execution_status.QUEUED: 'queued',
cl.command_execution_status.RUNNING: 'running',
cl.command_execution_status.SUBMITTED: 'submitted',
}
event = ctx.dot_product(a, b, NP, result)
event = ctx.dot_product(a, b, NP, result)
#for n in range(10):
# time.sleep(1E-3)
# print stat[event.command_execution_status]
result.get()
print time.time() - start
print np.abs((a.get()*b.get()).sum(-1) - result.get()).sum() / NP