#+ # ARES/HADES/BORG Package -- ./experiments/CIC/docic.py # Copyright (C) 2014-2020 Guilhem Lavaux # Copyright (C) 2009-2020 Jens Jasche # # Additional contributions from: # Guilhem Lavaux (2023) # #+ import cosmotool as ct import numpy as np import pyopencl as cl import pyopencl.array as cl_array CIC_PREKERNEL=''' #define NDIM {ndim} typedef {cicType} BASIC_TYPE; ''' CIC_KERNEL='''///CL/// #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable __kernel void init_pcell(__global int *p_cell, const int value) { int i = get_global_id(0); p_cell[i] = value; } __kernel void build_indices(__global const BASIC_TYPE *pos, __global int *part_mesh, __global int *part_list, const int N, const BASIC_TYPE delta) { int i_part = get_global_id(0); long shifter = 1; long idx = 0; int d; for (d = 0; d < NDIM; d++) { BASIC_TYPE x = pos[i_part*NDIM + d]; int m = (int)floor(x*delta) %% N; idx += shifter * m; shifter *= N; } // Head of the list int initial_elt = atom_xchg(&part_mesh[idx], i_part); if (initial_elt == -1) { return; } // Point the next pointer of old_end to i_part part_list[i_part] = initial_elt; } __kernel void reverse_list(__global int *part_mesh, __global int *part_list) { int mid = get_global_id(0); int current_part = part_mesh[mid]; if (current_part >= 0) { int next_part = part_list[current_part]; part_list[current_part] = -1; while (next_part != -1) { int p = part_list[next_part]; part_list[next_part] = current_part; current_part = next_part; next_part = p; } part_mesh[mid] = current_part; } } __kernel void dance(__global const BASIC_TYPE *pos, __global BASIC_TYPE *density, __global int *part_mesh, __global int *part_list, const int N, const BASIC_TYPE delta) { int m[NDIM]; int shifter = 1; int i; int first, i_part; int idx = 0; for (i = 0; i < NDIM; i++) { m[i] = get_global_id(i); idx += shifter * m[i]; shifter *= N; } first = 1; //BEGIN LOOPER %(looperFor)s //END LOOPER int idx_dance = 0; BASIC_TYPE w = 0; //LOOPER INDEX int r[NDIM] = { %(looperVariables)s }; //END LOOPER i_part = part_mesh[idx]; while (i_part != -1) { BASIC_TYPE w0 = 1; for (int d = 0; d < NDIM; d++) { BASIC_TYPE x = pos[i_part*NDIM + d]*delta; BASIC_TYPE q = floor(x); BASIC_TYPE dx = x - q; w0 *= (r[d] == 1) ? dx : ((BASIC_TYPE)1-dx); } i_part = part_list[i_part]; w += w0; } shifter = 1; for (i = 0; i < NDIM; i++) { idx_dance += shifter * ((m[i]+r[i])%%N); shifter *= N; } density[idx_dance] += w; // One dance done. Wait for everybody for the next iteration barrier(CLK_GLOBAL_MEM_FENCE); %(looperForEnd)s } ''' class CIC_CL(object): def __init__(self, context, ndim=2, ktype=np.float32): global CIC_PREKERNEL, CIC_KERNEL translator = {} if ktype == np.float32: translator['cicType'] = 'float' pragmas = '' elif ktype == np.float64: translator['cicType'] = 'double' pragmas = '#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n' else: raise ValueError("Invalid ktype") # 2 dimensions translator['ndim'] = ndim looperVariables = ','.join(['id%d' % d for d in xrange(ndim)]) looperFor = '\n'.join(['for (int id{dim}=0; id{dim} < 2; id{dim}++) {{'.format(dim=d) for d in xrange(ndim)]) looperForEnd = '}' * ndim kern = pragmas + CIC_PREKERNEL.format(**translator) + (CIC_KERNEL % {'looperVariables': looperVariables, 'looperFor': looperFor, 'looperForEnd':looperForEnd}) self.kern_code = kern self.ctx = context self.queue = cl.CommandQueue(context)#, properties=cl.OUT_OF_ORDER_EXEC_MODE_ENABLE) self.ktype = ktype self.ndim = ndim self.prog = cl.Program(self.ctx, kern).build() def run(self, particles, Ng, L): assert particles.strides[1] == self.ktype().itemsize # This is C-ordering assert particles.shape[1] == self.ndim print("Start again") ndim = self.ndim part_pos = cl_array.to_device(self.queue, particles) part_mesh = cl_array.empty(self.queue, (Ng,)*ndim, np.int32, order='C') density = cl_array.zeros(self.queue, (Ng,)*ndim, self.ktype, order='C') part_list = cl_array.empty(self.queue, (particles.shape[0],), np.int32, order='C') if True: delta = Ng/L with ct.time_block("Init pcell array"): e = self.prog.init_pcell(self.queue, (Ng**ndim,), None, part_mesh.data, np.int32(-1)) e.wait() with ct.time_block("Init idx array"): e=self.prog.init_pcell(self.queue, (particles.shape[0],), None, part_list.data, np.int32(-1)) e.wait() with ct.time_block("Build indices"): self.prog.build_indices(self.queue, (particles.shape[0],), None, part_pos.data, part_mesh.data, part_list.data, np.int32(Ng), self.ktype(delta)) if True: with ct.time_block("Reverse list"): lastevt = self.prog.reverse_list(self.queue, (Ng**ndim,), None, part_mesh.data, part_list.data) # We require pmax pass, particles are ordered according to part_idx with ct.time_block("dance"): self.prog.dance(self.queue, (Ng,)*ndim, None, part_pos.data, density.data, part_mesh.data, part_list.data, np.int32(Ng), self.ktype(delta)) print("Grab result") self.queue.finish() print("Del ppos") del part_pos print("Del pm") del part_mesh print("Del pl") del part_list print("Del density") with ct.time_block("download"): return density.get() def go_test(): ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) Nparts = 512**3 Ng = 512 cic = CIC_CL(ctx,ndim=3,ktype=np.float32) print ("Generate") parts = np.random.rand(Nparts, 3).astype(np.float32) # ix=(np.arange(512)*1.0/512).astype(np.float32) # parts = [ix[:,None,None].repeat(512,axis=1).repeat(512,axis=2), # ix[None,:,None].repeat(512,axis=0).repeat(512,axis=2), # ix[None,None,:].repeat(512,axis=0).repeat(512,axis=1)] # parts = np.array(parts).transpose((3,0,1,2)).reshape((Nparts,3),order='C') with ct.time_block("run cic"): cic.run(parts, Ng, 1.0) print("Done") with ct.time_block("run cic 2"): density1 = cic.run(parts, Ng, 1.0) with ct.time_block("classic cic"): density2 = ct.project_cic((parts-0.5).astype(ct.DTYPE), None, Ng, 1.0, True) return density1, density2 vv = go_test()