Source code for ilpm.simple_cl

#!/usr/bin/env python
import numpy as np
try:
    import pyopencl as cl
    import pyopencl.array as cla
    import pyopencl.clmath as clm
    import pyopencl.algorithm as algorithm
    HAS_OPENCL = True
except:
    HAS_OPENCL = False
import time
import cl_fft
import warnings


def ignore_compiler_warnings():
    import warnings
    warnings.filterwarnings('ignore', category=cl.CompilerWarning)


class SimpleCLWarning(Warning):
    pass

#Helper class for automatically managing kernel calls.
class AutoKernel(object):
    def __init__(self, kernel, parent):
        self.kernel = kernel
        self.parent = parent
        self.work_group = self.parent.max_work_group_size
        
    def __call__(self, *args, **kwargs):
        if 'work_group' in kwargs:
            work_group = kwargs.pop('work_group')
            default_wg = False
        else:
            work_group = (self.work_group,)
            default_wg = True

        if type(work_group) is int: work_group = (work_group,)
            
        work_items = kwargs.pop('work_items', None)
        if work_items is None: work_items = tuple(map(lambda x: x*self.parent.max_compute_units, work_group))
        
        args = map(lambda x: self.parent.arg_map_types.get(type(x), lambda y:y)(x), args)
        
        try:
            return self.kernel(self.parent.queue, work_items, work_group, *args, **kwargs)
        except cl.RuntimeError, e:
            if 'out of resources' in str(e).lower() and default_wg and self.work_group > 1:
                name = self.kernel.get_info(cl.kernel_info.FUNCTION_NAME)
                warnings.warn('kernel %s out of resources, trying a smaller work group size (%d => %d)' % (name, self.work_group, self.work_group//2), SimpleCLWarning, stacklevel=2)
                self.work_group //= 2
                return self(*args, **kwargs)
            else:
                raise
         
[docs]def acquire_opencl_devices(devname): '''Find OpenCL devices with matching descriptions. Parameters ---------- devname : string A partial name for an opencl device, or "cpu" or "gpu". Returns ------- devices : list of OpenCL devices All matching devices. May be empty ''' n = devname.lower() devs = [] if n in ('cpu', 'gpu'): for platform in cl.get_platforms(): try: devs += platform.get_devices(cl.device_type.CPU if n=='cpu' else cl.device_type.GPU) except: pass #Throws an error if no device found on this platform else: for platform in cl.get_platforms(): try: devs += filter(lambda dev: n in dev.name.lower(), platform.get_devices()) except: pass #Throws an error if no device found on this platform return devs
[docs]class CLSession(object): '''Create an OpenCL session on the specified device, while defining float precision and some utility functions. The actual creation of the OpenCL contexts are delayed until necessary. This allows the context parameters to be modified before it is used -- this is useful for external modules where the user may which to change the device used for calculations. The following data types are defined as members of the class: =============== ======================================= ============ Name OpenCL type Size (bytes) =============== ======================================= ============ ``real`` ``REAL`` (``float`` or ``double``) 4 or 8 ``complex`` ``COMPLEX`` (``float2`` or ``double2``) 8 or 16 ``char`` ``char`` 1 ``uchar`` ``uchar`` 1 ``short`` ``short`` 2 ``ushort`` ``ushort`` 2 ``int`` ``int`` 4 ``uint`` ``uint`` 4 ``long`` ``long`` 8 ``ulong`` ``ulong`` 8 =============== ======================================= ============ The following functions and constants will also be made available to any kernel compiled via the session: =========== =================== =============================== ============================ Type Name Parameters Description =========== =================== =============================== ============================ ``COMPLEX`` ``c_mul`` ``COMPLEX x, COMPLEX y`` :math:`x * y` ``COMPLEX`` ``c_div`` ``COMPLEX x, COMPLEX y`` :math:`x / y` ``COMPLEX`` ``conj`` ``COMPLEX x`` :math:`x^*` ``COMPLEX`` ``c_exp`` ``COMPLEX x`` :math:`\exp(x)` ``REAL`` ``c_angle`` ``COMPLEX x`` :math:`\mathrm{arg}(x)` ``REAL`` ``c_abs`` ``COMPLEX x`` :math:`|x|` ``REAL`` ``c_abs_sq`` ``COMPLEX x`` :math:`|x|^2` ``COMPLEX`` ``c_exp_i`` ``REAL x`` :math:`\exp(i x)` ``COMPLEX`` ``native_exp_i`` ``REAL x`` (uses native precision math) ``COMPLEX`` ``c_exp_i_T`` ``REAL x`` :math:`\exp(-i x)` ``COMPLEX`` ``native_exp_i_T`` ``REAL x`` (uses native precision math) ``REAL`` ``PI`` *constant* :math:`\pi` ``REAL`` ``TWO_PI`` *constant* :math:`2 \pi` ``REAL`` ``EULER_CONSTANT`` *constant* :math:`\gamma = 0.57721...` =========== =================== =============================== ============================ Parameters ---------- device : openCL device, description string or list of strings The openCL device to use. If a string or list is passed, :function: `acquire_opencl_devices` will be used to find devices. As soon as it finds a matching device it stops; lists of devices used for fallbacks. use_doubles : bool (default: False) Specify if REAL and COMPLEX types are double precision group_size : int (default: max for device) The default group_size in calls. If not specified, determined by the max_workgroup_size for the device (recommended). If the device type is CPU, this will be set to 1 (CPUs sometimes erroneously report more than 1 for this value) show_warnings : bool, optional (default: False) If True, warnings displayed in build error messages. ''' device_info = cl.device_info def __init__(self, device=['cpu', 'gpu'], use_doubles=False, group_size=None, show_warnings=False): self.device = device self.use_doubles = use_doubles self.group_size = group_size self.delayed_code = [] self.char = np.int8 self.uchar = np.uint8 self.short = np.int16 self.ushort = np.uint16 self.int = np.int32 self.uint = np.uint32 self.long = np.int64 self.ulong = np.uint64 self.show_warnings = show_warnings def __setattr__(self, k, v): if k in ('device', 'use_doubles', 'group_size') and hasattr(self, 'context'): raise RuntimeError('this context has already been used -- ') else: object.__setattr__(self, k, v) def _check(self): if not hasattr(self, 'context'): self.initialize() def __getattr__(self, attr): if attr in ('real', 'complex'): self._check() if attr == 'real': return np.float64 if self.use_doubles else np.float32 else: return np.complex128 if self.use_doubles else np.complex64 elif hasattr(self, 'kernels') and attr in self.kernels: return self.kernels[attr] else: raise AttributeError("CLSession has no attribute '%s'" % attr)
[docs] def initialize(self, context=None): '''Initialize the OpenCL context, if not already done. Parameters ---------- context : None or pyopencl Context The OpenCL context to used. If none, created using the preset "device" attribute. ''' if hasattr(self, 'context'): return if not HAS_OPENCL: raise RuntimeError('pyopencl is not installed on this computer\n(if OpenCL is installed on this computer, try running:\n sudo easy_install pyopencl)') if context is None: if type(self.device) is not cl.Device: if type(self.device) == str: self.device = [self.device] for d in self.device: devs = acquire_opencl_devices(d) if devs: self.device = devs[0] break else: raise RuntimeError('failed to find the specified openCL devices: %s\nIs opencl installed and configured properly?' % (self.device)) self.context = cl.Context([self.device]) else: if isinstance(context, CLSession): self.context = context.context else: self.context = context self.queue = cl.CommandQueue(self.context) if self.use_doubles: self.prefix = ''' #pragma OPENCL EXTENSION cl_khr_fp64: enable #define PYOPENCL_DEFINE_CDOUBLE #include <pyopencl-complex.h> #define REAL double #define REAL2 double2 #define REAL4 double4 #define REAL8 double8 #define COMPLEX cdouble_t #define c_exp cdouble_exp #define c_mul cdouble_mul #define c_div cdouble_div ''' #COMPLEX native_exp_i(REAL x) {return (COMPLEX)((double)native_cos((float)x), (double)native_sin((float)x));} #COMPLEX native_exp_i_T(REAL x) {return (COMPLEX)((double)native_cos((float)x), -(double)native_sin((float)x));} self.real = np.float64 self.complex = np.complex128 else: self.prefix = ''' #include <pyopencl-complex.h> #define REAL float #define REAL2 float2 #define REAL4 float4 #define REAL8 float8 #define COMPLEX cfloat_t #define c_exp cfloat_exp #define c_mul cfloat_mul #define c_div cfloat_div //NPM commented out// //COMPLEX native_exp_i(REAL x) {return (COMPLEX)(native_cos(x), native_sin(x));}// //COMPLEX native_exp_i_T(REAL x) {return (COMPLEX)(native_cos(x), -native_sin(x));}// ''' self.real = np.float32 self.complex = np.complex64 self.prefix += ''' #define PI (REAL)3.141592653589793 #define TWO_PI (REAL)6.283185307179586 #define EULER_CONSTANT (REAL)0.57721566490153286 #define get_3D(X, index) ((REAL4)(X[3*(index)+0], X[3*(index)+1], X[3*(index)+2], 0.0)) void set_3D(__global REAL* X, int index, REAL4 Y) { X[3*index + 0] = Y.x; X[3*index + 1] = Y.y; X[3*index + 2] = Y.z; } REAL c_angle(COMPLEX x) {return atan2(x.y, x.x);} //NPM commented out// //COMPLEX c_exp_i(REAL x) {return (COMPLEX)(cos(x), sin(x));}// //COMPLEX c_exp_i_T(REAL x) {return (COMPLEX)(cos(x), -sin(x));}// //COMPLEX conj(COMPLEX x) {return (COMPLEX)(x.x, -x.y);}// //REAL c_abs(COMPLEX x) {return length(x);}// //REAL c_abs_sq(COMPLEX x) {return dot(x, x);}// ''' #Get some info. if self.group_size is None: if self.device.type == cl.device_type.CPU: self.max_work_group_size = 1 else: self.max_work_group_size = self.device.get_info(cl.device_info.MAX_WORK_GROUP_SIZE) else: self.max_work_group_size = self.group_size self.max_compute_units = self.device.get_info(cl.device_info.MAX_COMPUTE_UNITS) self.default_work_items = (self.max_compute_units * self.max_work_group_size, ) self.default_work_group = (self.max_work_group_size,) self.programs = [] self.kernels = {} self.arg_map_types = { cla.Array: lambda x: x.data, int: lambda x: self.int(x), float: lambda x: self.real(x), np.ndarray: lambda x: self.to_device(x).data, } for code in self.delayed_code: self.compile(code)
[docs] def to_device(self, X): '''Make a copy of a local array on the OpenCL device. Parameters ---------- X : numpy array Returns ------- X_cl : pyopencl array A version of the array stored on the OpenCL device. If the data type is float or double, it will be converted to the precision of the device context. ''' self._check() if X.dtype in ('f', 'd'): X = X.astype(self.real) elif X.dtype in ('complex128', 'complex64'): X = X.astype(self.complex) return cla.to_device(self.queue, np.require(X, requirements='C'))
[docs] def compile_file(self, fn): '''Compile OpenCL kernels from file. See :meth:`compile` for details. ''' with open(fn, 'rt') as f: code = f.read() return self.compile(code)
[docs] def compile(self, code): '''Compile OpenCL kernel code (as a string) on the present device. Compliation is delayed if this session is not initialized. If you require access to the Program object (which is usually not important), call :meth:`initialize` first. Returns ------- program : pyopencl Program or None The compiled program. May be used to make function calls, although the contained kernels will automatically become functions of the base class. If context is not initialized yet, returns None. ''' if not hasattr(self, 'context'): self.delayed_code.append(code) return None else: try: p = cl.Program(self.context, self.prefix + code).build() except cl.RuntimeError, e: if self.show_warnings: raise else: #Filter the error messge to get rid of tons and tons of useless warnings. #prefix_lines = len(self.prefix.splitlines()) prefix_lines = self.prefix.count('\n') msg = str(e).splitlines() new_msg = [] while msg: line = msg.pop(0) parts = line.split(':') if len(parts) >= 4 and parts[3].strip().lower() in ('warning', 'note'): msg = msg[2:] if new_msg[-1].startswith('In file included') and new_msg[-2].startswith('In file included'): new_msg.pop(-1) else: #Try to change the line number. if len(parts) >= 4 and parts[3].strip().lower() == 'error': try: parts[1] = str(int(parts[1]) - prefix_lines) except: pass new_msg.append(':'.join(parts)) raise cl.RuntimeError('\n'.join(new_msg)) for kernel in p.all_kernels(): name = kernel.get_info(cl.kernel_info.FUNCTION_NAME) #print name kernel = getattr(p, name) #print name, type(kernel), dir(kernel) if name in self.kernels: print "WARNING: %s already in kernels, overwriting." % name self.kernels[name] = AutoKernel(kernel, self) self.programs.append(p) return p
[docs] def empty(self, shape, dtype=None): '''Create an empty array on the OpenCL device. Parameters ---------- shape : tuple dtype : numpy data type Returns ------- arr : pyopencl array An empty array of the specified type. ''' self._check() if dtype is None: dtype = self.real return cla.empty(self.queue, shape, dtype=dtype)
[docs] def empty_like(self, arr): '''Create an empty array with copied shape/dtype.''' self._check() return cla.empty_like(arr)
[docs] def zeros(self, shape, dtype=None): '''Create a zeroed array on the OpenCL device. Parameters ---------- shape : tuple dtype : numpy data type Returns ------- arr : pyopencl array A zeroed array of the specified type. ''' self._check() if dtype is None: dtype = self.real return cla.zeros(self.queue, shape, dtype=dtype)
[docs] def ones_like(self, arr): '''Create a one filled array with copied shape/dtype.''' self._check() return self.to_device(np.ones(arr.shape, dtype=arr.dtype))
[docs] def ones(self, shape, dtype=None): '''Create a one filled array on the OpenCL device. Parameters ---------- shape : tuple dtype : numpy data type Returns ------- arr : pyopencl array A zeroed array of the specified type. ''' self._check() if dtype is None: dtype = self.real return self.to_device(np.ones(shape, dtype))
[docs] def zeros_like(self, arr): '''Create a zeroed array with copied shape/dtype.''' self._check() return cla.zeros_like(arr)
[docs] def local_memory(self, size): '''Create a local memory object on the OpenCL device. Parameters ---------- size : int The size (in bytes) of the local memory required. Returns ------- mem : cl.LocalMemory A local memory object for an OpenCL kernel. ''' self._check() return cl.LocalMemory(size)
[docs] def enqueue_copy(self, dst, src): '''Call ``pyopencl.enqueue_copy`` with the session queue.''' return cl.enqueue_copy(self.queue, dst, src)
[docs] def fft(self, arr, sign=+1, inplace=True, swap=None, max_threads=None): '''Perform a multidimensional FFT on a pyopencl array. Note: array dimension sizes must be multiples of 2, 3, 5, 7. Parameters ---------- arr : pyopencl array, or an object castable to one (e.g. numpy array) sign : integer (+-1) Forward or backward transform. inplace : bool In place transform? If not, a copy is made first. Note: if a numpy array is passed it will never by in place! swap : pyopencl array (default: None) The swap buffer used in the calculation. If not specified, one will be created. Should have the same dimensions/type as arr max_threads : int (default self.max_work_group_size) The max_thread argument used for the fft code generation. Note that if a given size FFT has already been generated, this will be ignored. (Generally, this should be left unset, unless there isn't enough local memory to transform multiple blocks.) Returns ------- arr : opencl array The output array; will match the input array if inplace. ''' self._check() if not hasattr(self, 'fft_kernels'): self.fft_kernels = {} w_size = max(arr.shape) data_size = 0 if max_threads is None: max_threads = self.max_work_group_size for N in arr.shape: if N not in self.fft_kernels: func_name = name='fft_shift_%d' % N blocks_per_group, threads, code = cl_fft.gen_fft_code(N, max_threads=max_threads, func_name=func_name) #print '-'*40 #print code #print '-'*40 self.compile(code) self.fft_kernels[N] = (blocks_per_group, threads, self.kernels[func_name]) else: blocks_per_group = self.fft_kernels[N][0] ds = blocks_per_group * N if ds > data_size: data_size = ds print data_size, w_size local_mem = [cl.LocalMemory(np.dtype(self.complex).itemsize * points) for points in (w_size, data_size, data_size)] if type(arr) == cla.Array: if inplace: a1 = arr else: a1 = arr.copy() else: a1 = self.to_device(arr) inplace = False if swap is not None: if type(swap) == type(a1) and a1.shape == swap.shape: a2 = swap else: swap = None if swap is None: a2 = self.empty_like(a1) Ntot = np.prod(arr.shape) for N in arr.shape[::-1]: #print N, Ntot//N blocks_per_group, threads, func = self.fft_kernels[N] job = func(a1, a2, self.int(sign), self.int(Ntot//N), *local_mem) #, work_group=(threads,)) #job.wait() a1, a2 = a2, a1 if inplace: if a1 is not arr: cl.enqueue_copy(self.queue, arr.data, a1.data) return arr else: return a1
[docs] def get_device_info(self, field): '''Get info about the opencl device for the session. See :func:`get_device_info`''' return get_device_info(self.device, field)
[docs]def get_device_info(device, field): '''Get information about an opencl device. Parameters ---------- field : A valid field from ``pyopencl.device_info`` or str The field to obtain. If specified as a string, name should match a member of pyopencl.device_info, ignoring case. Examples include: ``"global_mem_size"`` and ``"max_compute_units"``. Returns ------- value : varies (usually int) The value returned by calling device.get_info(...) ''' if isinstance(field, basestring): f0 = field field = getattr(cl.device_info, field.upper(), None) if field is None: raise ValueError('"%s" is not a valid device info field.\n(should be a member of pyopencl.device_info)' ) return device.get_info(field)
[docs]def is_job_done(event): '''Check if an OpenCL task is done. Parameters ---------- event : pyopencl Event Returns ------- done : bool ''' return event.command_execution_status == cl.command_execution_status.COMPLETE
#__global_cl_session = None #__global_cl_params = {} #__global_cl_kernels = [] # #def get_cl(**kwargs): # if kwargs and kwargs != #This was a dumb idea! Built this into the main CLSession class instead. # #class CLSessionOnDemand(object): # '''An object which returns and OpenCL Session on demand. # # Used by other modules which wish to create OpenCL Sessions that allow # users to optionally configure the calculations. # # Also delays compilation -- this is useful if modules have only some # functions that depend on OpenCL, since it will allow users that don't have # pyopencl installed to run the other functions. # # Parameters # ---------- # device : list # use_doubles : bool # group_size : None or int # source_file : string # The filename of a code file to be compiled once the session is initialized. # # ''' # def __init__(self, device=['cpu', 'gpu'], use_doubles=False, group_size=None, source_file=None): # self.device = device # self.use_doubles = use_doubles # self.group_size = group_size # self.source_file = source_file # self.session = None # # # def _check(self): # if self.session is not None: # raise RuntimeError('CLSession parameters need to be set before OpenCL calls') # # # def set_device(self, device): # '''Set the CLSession device. # # Parameters # ---------- # device : openCL device, description string or list of strings # The openCL device to use. If a string or list is passed, # :function: `acquire_opencl_devices` will be used to find devices. # As soon as it finds a matching device it stops; lists of devices # used for fallbacks. # ''' # self._check() # self.device = device # # # def set_doubles(self, use_doubles): # '''Set the REAL and COMPLEX precision of the session. # # Parameters # ---------- # use_doubles : bool (default: False) # Specify if REAL and COMPLEX types are double precision # ''' # self._check() # self.use_doubles = use_doubles # # # def set_group_size(self, group_size): # '''Set the group size of the session. # # Parameters # ---------- # group_size : int (default: max for device) # The default group_size in calls. If not specified, determined by # the max_workgroup_size for the device (recommended). If the device # type is CPU, this will be set to 1 (CPUs sometimes erroniously # report more than 1 for this value) # ''' # self._check() # self.group_size = group_size # # # def use_existing_session(self, cls): # '''Share an existing CLSession. # # *Note: the parameters of the existing session will be used instead # of those set by this class.* # # Parameters # ---------- # cls : CLSesssion # ''' # if self.session is not None: # raise RuntimeError('this on demand CL session has already been created, cannot assign new session') # self.session = cls # self.session.complile_file(self.source_file) # # # def get(self): # '''Return a CLSession, creating it if needed.''' # if self.session is None: # self.session = CLSession(device=self.device, group_size=self.group_size) # # return self.session # if __name__ == '__main__': NP = 10**6 REPEATS = 10 ctx = CLSession(device='cpu') ctx.compile(''' __kernel void dot_product(__global REAL* a, __global REAL* b, int np, __global REAL* result) { for(int i = get_global_id(0); i < np; i += get_global_size(0)) { result[i] = dot(get_3D(a, i), get_3D(b, i)); } }''') #a = ctx.to_device(np.random.rand(NP, 3)) a = ctx.to_device(np.random.rand(NP, 3)) b = ctx.to_device(np.random.rand(NP, 3)) result = ctx.empty(NP) start = time.time() # for n in range(REPEATS): stat = { cl.command_execution_status.COMPLETE: 'complete', cl.command_execution_status.QUEUED: 'queued', cl.command_execution_status.RUNNING: 'running', cl.command_execution_status.SUBMITTED: 'submitted', } event = ctx.dot_product(a, b, NP, result) event = ctx.dot_product(a, b, NP, result) #for n in range(10): # time.sleep(1E-3) # print stat[event.command_execution_status] result.get() print time.time() - start print np.abs((a.get()*b.get()).sum(-1) - result.get()).sum() / NP