Source code for ilpm.simple_cl

#!/usr/bin/env python
import numpy as np
try:
    import pyopencl as cl
    import pyopencl.array as cla
    import pyopencl.clmath as clm
    import pyopencl.algorithm as algorithm
    HAS_OPENCL = True
except:
    HAS_OPENCL = False
import time
import cl_fft
import warnings


def ignore_compiler_warnings():
    import warnings
    warnings.filterwarnings('ignore', category=cl.CompilerWarning)


class SimpleCLWarning(Warning):
    pass

#Helper class for automatically managing kernel calls.
class AutoKernel(object):
    def __init__(self, kernel, parent):
        self.kernel = kernel
        self.parent = parent
        self.work_group = self.parent.max_work_group_size
        
    def __call__(self, *args, **kwargs):
        if 'work_group' in kwargs:
            work_group = kwargs.pop('work_group')
            default_wg = False
        else:
            work_group = (self.work_group,)
            default_wg = True

        if type(work_group) is int: work_group = (work_group,)
            
        work_items = kwargs.pop('work_items', None)
        if work_items is None: work_items = tuple(map(lambda x: x*self.parent.max_compute_units, work_group))
        
        args = map(lambda x: self.parent.arg_map_types.get(type(x), lambda y:y)(x), args)
        
        try:
            return self.kernel(self.parent.queue, work_items, work_group, *args, **kwargs)
        except cl.RuntimeError, e:
            if 'out of resources' in str(e).lower() and default_wg and self.work_group > 1:
                name = self.kernel.get_info(cl.kernel_info.FUNCTION_NAME)
                warnings.warn('kernel %s out of resources, trying a smaller work group size (%d => %d)' % (name, self.work_group, self.work_group//2), SimpleCLWarning, stacklevel=2)
                self.work_group //= 2
                return self(*args, **kwargs)
            else:
                raise
         
[docs]def acquire_opencl_devices(devname):
    '''Find OpenCL devices with matching descriptions.
    
    Parameters
    ----------
    devname : string
        A partial name for an opencl device, or "cpu" or "gpu".
        
    Returns
    -------
    devices : list of OpenCL devices
        All matching devices.  May be empty
    '''
    n = devname.lower()

    devs = []

    if n in ('cpu', 'gpu'):
        for platform in cl.get_platforms():
            try: devs += platform.get_devices(cl.device_type.CPU if n=='cpu' else cl.device_type.GPU)
            except: pass #Throws an error if no device found on this platform
    else:
        for platform in cl.get_platforms():
            try: devs += filter(lambda dev: n in dev.name.lower(), platform.get_devices())
            except: pass #Throws an error if no device found on this platform
            
    return devs

        
[docs]class CLSession(object):
    '''Create an OpenCL session on the specified device, while defining
    float precision and some utility functions.
    
    The actual creation of the OpenCL contexts are delayed until necessary.
    This allows the context parameters to be modified before it is used --
    this is useful for external modules where the user may which to change the
    device used for calculations.
    
    The following data types are defined as members of the class: 
    
    =============== ======================================= ============
    Name            OpenCL type                             Size (bytes)
    =============== ======================================= ============
    ``real``        ``REAL`` (``float`` or ``double``)      4 or 8
    ``complex``     ``COMPLEX`` (``float2`` or ``double2``) 8 or 16
    ``char``        ``char``                                1
    ``uchar``       ``uchar``                               1
    ``short``       ``short``                               2
    ``ushort``      ``ushort``                              2
    ``int``         ``int``                                 4
    ``uint``        ``uint``                                4
    ``long``        ``long``                                8
    ``ulong``       ``ulong``                               8    
    =============== ======================================= ============
    
    The following functions and constants will also be made available to any
    kernel compiled via the session:
    
    =========== =================== =============================== ============================
    Type        Name                Parameters                      Description
    =========== =================== =============================== ============================
    ``COMPLEX`` ``c_mul``           ``COMPLEX x, COMPLEX y``        :math:`x * y`
    ``COMPLEX`` ``c_div``           ``COMPLEX x, COMPLEX y``        :math:`x / y`
    ``COMPLEX`` ``conj``            ``COMPLEX x``                   :math:`x^*`
    ``COMPLEX`` ``c_exp``           ``COMPLEX x``                   :math:`\exp(x)`
    ``REAL``    ``c_angle``         ``COMPLEX x``                   :math:`\mathrm{arg}(x)`
    ``REAL``    ``c_abs``           ``COMPLEX x``                   :math:`|x|`
    ``REAL``    ``c_abs_sq``        ``COMPLEX x``                   :math:`|x|^2`
    
    ``COMPLEX`` ``c_exp_i``         ``REAL x``                      :math:`\exp(i x)`
    ``COMPLEX`` ``native_exp_i``    ``REAL x``                      (uses native precision math)
    ``COMPLEX`` ``c_exp_i_T``       ``REAL x``                      :math:`\exp(-i x)`
    ``COMPLEX`` ``native_exp_i_T``  ``REAL x``                      (uses native precision math)

    ``REAL``    ``PI``              *constant*                      :math:`\pi`
    ``REAL``    ``TWO_PI``          *constant*                      :math:`2 \pi`
    ``REAL``    ``EULER_CONSTANT``  *constant*                      :math:`\gamma = 0.57721...`
    
    =========== =================== =============================== ============================
    
    Parameters
    ----------
    device : openCL device, description string or list of strings
        The openCL device to use.  If a string or list is passed,
        :function: `acquire_opencl_devices` will be used to find devices.
        As soon as it finds a matching device it stops; lists of devices
        used for fallbacks.
    use_doubles : bool (default: False)
        Specify if REAL and COMPLEX types are double precision
    group_size : int (default: max for device)
        The default group_size in calls.  If not specified, determined by
        the max_workgroup_size for the device (recommended).  If the device
        type is CPU, this will be set to 1 (CPUs sometimes erroneously
        report more than 1 for this value)
    show_warnings : bool, optional (default: False)
        If True, warnings displayed in build error messages.
    '''
    device_info = cl.device_info

    def __init__(self, device=['cpu', 'gpu'], use_doubles=False, group_size=None, show_warnings=False):
        self.device = device
        self.use_doubles = use_doubles
        self.group_size = group_size
        self.delayed_code = []    
    
        self.char = np.int8
        self.uchar = np.uint8
        self.short = np.int16
        self.ushort = np.uint16
        self.int = np.int32
        self.uint = np.uint32
        self.long = np.int64
        self.ulong = np.uint64
        self.show_warnings = show_warnings
    
    
    def __setattr__(self, k, v):
        if k in ('device', 'use_doubles', 'group_size') and hasattr(self, 'context'):
            raise RuntimeError('this context has already been used -- ')
        else: object.__setattr__(self, k, v)
    
    
    def _check(self):
        if not hasattr(self, 'context'):
            self.initialize()
        
    def __getattr__(self, attr):
        if attr in ('real', 'complex'):
            self._check()
            if attr == 'real': return np.float64 if self.use_doubles else np.float32
            else:              return np.complex128 if self.use_doubles else np.complex64                 
        elif hasattr(self, 'kernels') and attr in self.kernels: return self.kernels[attr]
        else: raise AttributeError("CLSession has no attribute '%s'" % attr)
            
    
[docs]    def initialize(self, context=None):
        '''Initialize the OpenCL context, if not already done.
        
        Parameters
        ----------
        context : None or pyopencl Context
            The OpenCL context to used.  If none, created using the preset
            "device" attribute.
        '''
        if hasattr(self, 'context'): return
        
        if not HAS_OPENCL:
            raise RuntimeError('pyopencl is not installed on this computer\n(if OpenCL is installed on this computer, try running:\n    sudo easy_install pyopencl)')
        
        if context is None:        
            if type(self.device) is not cl.Device:
                if type(self.device) == str: self.device = [self.device]
                
                for d in self.device:        
                    devs = acquire_opencl_devices(d)
                    if devs:
                        self.device = devs[0]
                        break
                else:
                    raise RuntimeError('failed to find the specified openCL devices: %s\nIs opencl installed and configured properly?' % (self.device))
                
            self.context = cl.Context([self.device])
        else:
            if isinstance(context, CLSession): self.context = context.context
            else: self.context = context
            
        self.queue = cl.CommandQueue(self.context)

        if self.use_doubles:
            self.prefix = '''
                    #pragma OPENCL EXTENSION cl_khr_fp64: enable
                    #define PYOPENCL_DEFINE_CDOUBLE
                    #include <pyopencl-complex.h>
                    #define REAL      double
                    #define REAL2     double2
                    #define REAL4     double4
                    #define REAL8     double8
                    #define COMPLEX   cdouble_t
                    #define c_exp     cdouble_exp       
                    #define c_mul     cdouble_mul
                    #define c_div     cdouble_div
                    '''
                    #COMPLEX native_exp_i(REAL x)   {return (COMPLEX)((double)native_cos((float)x), (double)native_sin((float)x));}
                    #COMPLEX native_exp_i_T(REAL x) {return (COMPLEX)((double)native_cos((float)x), -(double)native_sin((float)x));}
                    
            self.real = np.float64
            self.complex = np.complex128
            
        else:
            self.prefix = '''
                    #include <pyopencl-complex.h>
                    #define REAL      float
                    #define REAL2     float2
                    #define REAL4     float4
                    #define REAL8     float8
                    #define COMPLEX   cfloat_t            
                    #define c_exp     cfloat_exp       
                    #define c_mul     cfloat_mul
                    #define c_div     cfloat_div
                    //NPM commented out//
                    //COMPLEX native_exp_i(REAL x)   {return (COMPLEX)(native_cos(x), native_sin(x));}//
                    //COMPLEX native_exp_i_T(REAL x) {return (COMPLEX)(native_cos(x), -native_sin(x));}//
                    '''
            self.real = np.float32
            self.complex = np.complex64

        self.prefix += '''
                    #define PI             (REAL)3.141592653589793
                    #define TWO_PI         (REAL)6.283185307179586
                    #define EULER_CONSTANT (REAL)0.57721566490153286
                    
                    #define get_3D(X, index)   ((REAL4)(X[3*(index)+0], X[3*(index)+1], X[3*(index)+2], 0.0))
                    
                    void set_3D(__global REAL* X, int index, REAL4 Y) {
                        X[3*index + 0] = Y.x;
                        X[3*index + 1] = Y.y;
                        X[3*index + 2] = Y.z;
                    }

                    REAL    c_angle(COMPLEX x)     {return atan2(x.y, x.x);}
                    //NPM commented out//
                    //COMPLEX c_exp_i(REAL x)        {return (COMPLEX)(cos(x), sin(x));}//
                    //COMPLEX c_exp_i_T(REAL x)      {return (COMPLEX)(cos(x), -sin(x));}//
                    //COMPLEX conj(COMPLEX x)        {return (COMPLEX)(x.x, -x.y);}//
                    //REAL    c_abs(COMPLEX x)       {return length(x);}//
                    //REAL    c_abs_sq(COMPLEX x)    {return dot(x, x);}//
                    '''
        
        #Get some info.
        if self.group_size is None:
            if self.device.type == cl.device_type.CPU:
                self.max_work_group_size = 1
            else:    
                self.max_work_group_size = self.device.get_info(cl.device_info.MAX_WORK_GROUP_SIZE)
        else: self.max_work_group_size = self.group_size
        self.max_compute_units = self.device.get_info(cl.device_info.MAX_COMPUTE_UNITS)
        
        self.default_work_items = (self.max_compute_units * self.max_work_group_size, )
        self.default_work_group = (self.max_work_group_size,)
        self.programs = []
        self.kernels = {}
        self.arg_map_types = {
            cla.Array: lambda x: x.data,
            int: lambda x: self.int(x),
            float: lambda x: self.real(x),
            np.ndarray: lambda x: self.to_device(x).data,
        }
        
        for code in self.delayed_code:
            self.compile(code)
        

[docs]    def to_device(self, X):
        '''Make a copy of a local array on the OpenCL device.
        
        Parameters
        ----------
        X : numpy array
        
        Returns
        -------
        X_cl : pyopencl array
            A version of the array stored on the OpenCL device.  If the data
            type is float or double, it will be converted to the precision
            of the device context.
        '''
        self._check()
        
        if X.dtype in ('f', 'd'): X = X.astype(self.real)
        elif X.dtype in ('complex128', 'complex64'): X = X.astype(self.complex)
            
        return cla.to_device(self.queue, np.require(X, requirements='C'))
        
        
[docs]    def compile_file(self, fn):
        '''Compile OpenCL kernels from file.  See :meth:`compile` for details.
        '''
        with open(fn, 'rt') as f:
            code = f.read()
        
        return self.compile(code)
        
        
[docs]    def compile(self, code):
        '''Compile OpenCL kernel code (as a string) on the present device.

        Compliation is delayed if this session is not initialized.
        If you require access to the Program object (which is usually not
        important), call :meth:`initialize` first.
        
        Returns
        -------
        program : pyopencl Program or None
            The compiled program.  May be used to make function calls, although
            the contained kernels will automatically become functions of the
            base class.  If context is not initialized yet, returns None.
        '''
        if not hasattr(self, 'context'):
            self.delayed_code.append(code)
            return None
        
        else:         
            try:
                p = cl.Program(self.context, self.prefix + code).build()
            except cl.RuntimeError, e:
                if self.show_warnings: raise
                else: #Filter the error messge to get rid of tons and tons of useless warnings.
                    #prefix_lines = len(self.prefix.splitlines())
                    prefix_lines = self.prefix.count('\n')
                    msg = str(e).splitlines() 
                    new_msg = []
                    while msg:
                        line = msg.pop(0)
                        parts = line.split(':')
                        if len(parts) >= 4 and parts[3].strip().lower() in ('warning', 'note'):
                            msg = msg[2:]
                            if new_msg[-1].startswith('In file included') and new_msg[-2].startswith('In file included'):
                                new_msg.pop(-1)
                        else:
                            #Try to change the line number.
                            if len(parts) >= 4 and parts[3].strip().lower() == 'error':
                                try: parts[1] = str(int(parts[1]) - prefix_lines)
                                except: pass
                                    
                            new_msg.append(':'.join(parts))
                            
                    raise cl.RuntimeError('\n'.join(new_msg))

    
            for kernel in p.all_kernels():
                name = kernel.get_info(cl.kernel_info.FUNCTION_NAME)
                #print name
                
                kernel = getattr(p, name)
                
                #print name, type(kernel), dir(kernel)
                if name in self.kernels: print "WARNING: %s already in kernels, overwriting." % name
                
                self.kernels[name] = AutoKernel(kernel, self)
            
            self.programs.append(p)
            return p
    
    
        

[docs]    def empty(self, shape, dtype=None):
        '''Create an empty array on the OpenCL device.
        
        Parameters
        ----------
        shape : tuple
        dtype : numpy data type
            
        Returns
        -------
        arr : pyopencl array
            An empty array of the specified type.
        '''
        self._check()
        
        if dtype is None: dtype = self.real
        return cla.empty(self.queue, shape, dtype=dtype)


[docs]    def empty_like(self, arr):
        '''Create an empty array with copied shape/dtype.'''
        self._check()
        return cla.empty_like(arr)


[docs]    def zeros(self, shape, dtype=None):
        '''Create a zeroed array on the OpenCL device.
        
        Parameters
        ----------
        shape : tuple
        dtype : numpy data type
            
        Returns
        -------
        arr : pyopencl array
            A zeroed array of the specified type.
        '''
        self._check()
        
        if dtype is None: dtype = self.real
        return cla.zeros(self.queue, shape, dtype=dtype)


[docs]    def ones_like(self, arr):
        '''Create a one filled array with copied shape/dtype.'''
        self._check()
        
        return self.to_device(np.ones(arr.shape, dtype=arr.dtype))


[docs]    def ones(self, shape, dtype=None):
        '''Create a one filled array on the OpenCL device.
        
        Parameters
        ----------
        shape : tuple
        dtype : numpy data type
            
        Returns
        -------
        arr : pyopencl array
            A zeroed array of the specified type.
        '''
        self._check()
        
        if dtype is None: dtype = self.real
        return self.to_device(np.ones(shape, dtype))


[docs]    def zeros_like(self, arr):
        '''Create a zeroed array with copied shape/dtype.'''
        self._check()

        return cla.zeros_like(arr)    
    
[docs]    def local_memory(self, size):
        '''Create a local memory object on the OpenCL device.
        
        Parameters
        ----------
        size : int
            The size (in bytes) of the local memory required.
            
        Returns
        -------
        mem : cl.LocalMemory
            A local memory object for an OpenCL kernel.
        '''
        self._check()

        return cl.LocalMemory(size)

[docs]    def enqueue_copy(self, dst, src):
        '''Call ``pyopencl.enqueue_copy`` with the session queue.'''
        return cl.enqueue_copy(self.queue, dst, src)


[docs]    def fft(self, arr, sign=+1, inplace=True, swap=None, max_threads=None):
        '''Perform a multidimensional FFT on a pyopencl array.
        Note: array dimension sizes must be multiples of 2, 3, 5, 7.
        
        Parameters
        ----------
        arr : pyopencl array, or an object castable to one (e.g. numpy array)
        sign : integer (+-1)
            Forward or backward transform.
        inplace : bool
            In place transform?  If not, a copy is made first.
            Note: if a numpy array is passed it will never by in place!
        swap : pyopencl array (default: None)
            The swap buffer used in the calculation.  If not specified, one
            will be created.  Should have the same dimensions/type as arr
        max_threads : int (default self.max_work_group_size)
            The max_thread argument used for the fft code generation.  Note that
            if a given size FFT has already been generated, this will be
            ignored.  (Generally, this should be left unset, unless there isn't
            enough local memory to transform multiple blocks.)
        
        Returns
        -------
        arr : opencl array
            The output array; will match the input array if inplace.
        '''
        self._check()
        
        if not hasattr(self, 'fft_kernels'): self.fft_kernels = {}
        
        w_size = max(arr.shape)
        data_size = 0 
        
        if max_threads is None: max_threads = self.max_work_group_size
        
        for N in arr.shape:
            if N not in self.fft_kernels:
                func_name = name='fft_shift_%d' % N
                blocks_per_group, threads, code = cl_fft.gen_fft_code(N, max_threads=max_threads, func_name=func_name)
                #print '-'*40
                #print code
                #print '-'*40
                self.compile(code)
                self.fft_kernels[N] = (blocks_per_group, threads, self.kernels[func_name])
            else:
                blocks_per_group = self.fft_kernels[N][0]
            
            ds = blocks_per_group * N
            if ds > data_size: data_size = ds
        
        print data_size, w_size
            
        local_mem = [cl.LocalMemory(np.dtype(self.complex).itemsize * points) for points in (w_size, data_size, data_size)]
        
        if type(arr) == cla.Array:
            if inplace: a1 = arr
            else: a1 = arr.copy()
        else:
            a1 = self.to_device(arr)
            inplace = False
            
        if swap is not None:
            if type(swap) == type(a1) and a1.shape == swap.shape:
                a2 = swap
            else:
                swap = None
        
        if swap is None:
            a2 = self.empty_like(a1)
            
        Ntot = np.prod(arr.shape)
        for N in arr.shape[::-1]:
            #print N, Ntot//N
            blocks_per_group, threads, func = self.fft_kernels[N]
            job = func(a1, a2, self.int(sign), self.int(Ntot//N), *local_mem) #, work_group=(threads,))
            #job.wait()
            a1, a2 = a2, a1
            
        if inplace:
            if a1 is not arr: cl.enqueue_copy(self.queue, arr.data, a1.data)
            return arr
        else:
            return a1
        
[docs]    def get_device_info(self, field):
        '''Get info about the opencl device for the session.  See :func:`get_device_info`'''
        return get_device_info(self.device, field)
        
        

[docs]def get_device_info(device, field):
    '''Get information about an opencl device.
    
    Parameters
    ----------
    field : A valid field from ``pyopencl.device_info`` or str
        The field to obtain.  If specified as a string, name should match a
        member of pyopencl.device_info, ignoring case.  Examples include:
        ``"global_mem_size"`` and ``"max_compute_units"``.
        
    Returns
    -------
    value : varies (usually int)
        The value returned by calling device.get_info(...)
    '''
    if isinstance(field, basestring):
        f0 = field
        field = getattr(cl.device_info, field.upper(), None)
        if field is None:
            raise ValueError('"%s" is not a valid device info field.\n(should be a member of pyopencl.device_info)' )
    
    return device.get_info(field)
        

[docs]def is_job_done(event):
    '''Check if an OpenCL task is done.
    
    Parameters
    ----------
    event : pyopencl Event
    
    Returns
    -------
    done : bool
    '''
    return event.command_execution_status == cl.command_execution_status.COMPLETE

#__global_cl_session = None
#__global_cl_params = {}
#__global_cl_kernels = []
#
#def get_cl(**kwargs):
#    if kwargs and kwargs != 


#This was a dumb idea!  Built this into the main CLSession class instead.
#
#class CLSessionOnDemand(object):
#    '''An object which returns and OpenCL Session on demand.
#    
#    Used by other modules which wish to create OpenCL Sessions that allow
#    users to optionally configure the calculations.
#    
#    Also delays compilation -- this is useful if modules have only some
#    functions that depend on OpenCL, since it will allow users that don't have
#    pyopencl installed to run the other functions.
#    
#    Parameters
#    ----------
#    device : list
#    use_doubles : bool
#    group_size : None or int
#    source_file : string
#        The filename of a code file to be compiled once the session is initialized.
#
#    '''
#    def __init__(self, device=['cpu', 'gpu'], use_doubles=False, group_size=None, source_file=None):
#        self.device = device
#        self.use_doubles = use_doubles
#        self.group_size = group_size
#        self.source_file = source_file
#        self.session = None
#        
#    
#    def _check(self):
#        if self.session is not None:
#            raise RuntimeError('CLSession parameters need to be set before OpenCL calls')
#
#    
#    def set_device(self, device):
#        '''Set the CLSession device.
#        
#        Parameters
#        ----------
#        device : openCL device, description string or list of strings
#            The openCL device to use.  If a string or list is passed,
#            :function: `acquire_opencl_devices` will be used to find devices.
#            As soon as it finds a matching device it stops; lists of devices
#            used for fallbacks.
#        '''
#        self._check()
#        self.device = device
#
#    
#    def set_doubles(self, use_doubles):
#        '''Set the REAL and COMPLEX precision of the session.
#        
#        Parameters
#        ----------
#        use_doubles : bool (default: False)
#            Specify if REAL and COMPLEX types are double precision
#        '''
#        self._check()
#        self.use_doubles = use_doubles
#    
#        
#    def set_group_size(self, group_size):
#        '''Set the group size of the session.
#        
#        Parameters
#        ----------
#        group_size : int (default: max for device)
#            The default group_size in calls.  If not specified, determined by
#            the max_workgroup_size for the device (recommended).  If the device
#            type is CPU, this will be set to 1 (CPUs sometimes erroniously
#            report more than 1 for this value)
#        '''
#        self._check()
#        self.group_size = group_size
#        
#    
#    def use_existing_session(self, cls):
#        '''Share an existing CLSession.
#        
#        *Note: the parameters of the existing session will be used instead
#        of those set by this class.*
#        
#        Parameters
#        ----------
#        cls : CLSesssion
#        '''
#        if self.session is not None:
#            raise RuntimeError('this on demand CL session has already been created, cannot assign new session')
#        self.session = cls
#        self.session.complile_file(self.source_file)
#
#    
#    def get(self):
#        '''Return a CLSession, creating it if needed.'''
#        if self.session is None:
#            self.session = CLSession(device=self.device, group_size=self.group_size)
#        
#        return self.session
#            

if __name__ == '__main__':
    NP = 10**6
    REPEATS = 10
    
    
    ctx = CLSession(device='cpu')
    
    ctx.compile('''
        __kernel void dot_product(__global REAL* a, __global REAL* b, int np, __global REAL* result)
        {
            for(int i = get_global_id(0); i < np; i += get_global_size(0)) {
            result[i] = dot(get_3D(a, i), get_3D(b, i));
            }
        }''')
    
    #a = ctx.to_device(np.random.rand(NP, 3))
    
    a = ctx.to_device(np.random.rand(NP, 3))
    b = ctx.to_device(np.random.rand(NP, 3))
    
    result = ctx.empty(NP)
    
    start = time.time()
#    for n in range(REPEATS):

    stat = {
        cl.command_execution_status.COMPLETE: 'complete',
        cl.command_execution_status.QUEUED: 'queued',
        cl.command_execution_status.RUNNING: 'running',
        cl.command_execution_status.SUBMITTED: 'submitted',
    }

    event = ctx.dot_product(a, b, NP, result)
    event = ctx.dot_product(a, b, NP, result)
    #for n in range(10):
    #    time.sleep(1E-3)
    #    print stat[event.command_execution_status]
     
    
    result.get()
    print time.time() - start
    
    print np.abs((a.get()*b.get()).sum(-1) - result.get()).sum() / NP