borg_public/experiments/CIC/docic.py
2023-05-29 10:41:03 +02:00

237 lines
7.3 KiB
Python

#+
# ARES/HADES/BORG Package -- ./experiments/CIC/docic.py
# Copyright (C) 2014-2020 Guilhem Lavaux <guilhem.lavaux@iap.fr>
# Copyright (C) 2009-2020 Jens Jasche <jens.jasche@fysik.su.se>
#
# Additional contributions from:
# Guilhem Lavaux <guilhem.lavaux@iap.fr> (2023)
#
#+
import cosmotool as ct
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array
CIC_PREKERNEL='''
#define NDIM {ndim}
typedef {cicType} BASIC_TYPE;
'''
CIC_KERNEL='''///CL///
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
__kernel void init_pcell(__global int *p_cell, const int value)
{
int i = get_global_id(0);
p_cell[i] = value;
}
__kernel void build_indices(__global const BASIC_TYPE *pos,
__global int *part_mesh, __global int *part_list, const int N, const BASIC_TYPE delta)
{
int i_part = get_global_id(0);
long shifter = 1;
long idx = 0;
int d;
for (d = 0; d < NDIM; d++) {
BASIC_TYPE x = pos[i_part*NDIM + d];
int m = (int)floor(x*delta) %% N;
idx += shifter * m;
shifter *= N;
}
// Head of the list
int initial_elt = atom_xchg(&part_mesh[idx], i_part);
if (initial_elt == -1) {
return;
}
// Point the next pointer of old_end to i_part
part_list[i_part] = initial_elt;
}
__kernel void reverse_list(__global int *part_mesh, __global int *part_list)
{
int mid = get_global_id(0);
int current_part = part_mesh[mid];
if (current_part >= 0) {
int next_part = part_list[current_part];
part_list[current_part] = -1;
while (next_part != -1) {
int p = part_list[next_part];
part_list[next_part] = current_part;
current_part = next_part;
next_part = p;
}
part_mesh[mid] = current_part;
}
}
__kernel void dance(__global const BASIC_TYPE *pos,
__global BASIC_TYPE *density,
__global int *part_mesh, __global int *part_list, const int N, const BASIC_TYPE delta)
{
int m[NDIM];
int shifter = 1;
int i;
int first, i_part;
int idx = 0;
for (i = 0; i < NDIM; i++) {
m[i] = get_global_id(i);
idx += shifter * m[i];
shifter *= N;
}
first = 1;
//BEGIN LOOPER
%(looperFor)s
//END LOOPER
int idx_dance = 0;
BASIC_TYPE w = 0;
//LOOPER INDEX
int r[NDIM] = { %(looperVariables)s };
//END LOOPER
i_part = part_mesh[idx];
while (i_part != -1) {
BASIC_TYPE w0 = 1;
for (int d = 0; d < NDIM; d++) {
BASIC_TYPE x = pos[i_part*NDIM + d]*delta;
BASIC_TYPE q = floor(x);
BASIC_TYPE dx = x - q;
w0 *= (r[d] == 1) ? dx : ((BASIC_TYPE)1-dx);
}
i_part = part_list[i_part];
w += w0;
}
shifter = 1;
for (i = 0; i < NDIM; i++) {
idx_dance += shifter * ((m[i]+r[i])%%N);
shifter *= N;
}
density[idx_dance] += w;
// One dance done. Wait for everybody for the next iteration
barrier(CLK_GLOBAL_MEM_FENCE);
%(looperForEnd)s
}
'''
class CIC_CL(object):
def __init__(self, context, ndim=2, ktype=np.float32):
global CIC_PREKERNEL, CIC_KERNEL
translator = {}
if ktype == np.float32:
translator['cicType'] = 'float'
pragmas = ''
elif ktype == np.float64:
translator['cicType'] = 'double'
pragmas = '#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n'
else:
raise ValueError("Invalid ktype")
# 2 dimensions
translator['ndim'] = ndim
looperVariables = ','.join(['id%d' % d for d in xrange(ndim)])
looperFor = '\n'.join(['for (int id{dim}=0; id{dim} < 2; id{dim}++) {{'.format(dim=d) for d in xrange(ndim)])
looperForEnd = '}' * ndim
kern = pragmas + CIC_PREKERNEL.format(**translator) + (CIC_KERNEL % {'looperVariables': looperVariables, 'looperFor': looperFor, 'looperForEnd':looperForEnd})
self.kern_code = kern
self.ctx = context
self.queue = cl.CommandQueue(context)#, properties=cl.OUT_OF_ORDER_EXEC_MODE_ENABLE)
self.ktype = ktype
self.ndim = ndim
self.prog = cl.Program(self.ctx, kern).build()
def run(self, particles, Ng, L):
assert particles.strides[1] == self.ktype().itemsize # This is C-ordering
assert particles.shape[1] == self.ndim
print("Start again")
ndim = self.ndim
part_pos = cl_array.to_device(self.queue, particles)
part_mesh = cl_array.empty(self.queue, (Ng,)*ndim, np.int32, order='C')
density = cl_array.zeros(self.queue, (Ng,)*ndim, self.ktype, order='C')
part_list = cl_array.empty(self.queue, (particles.shape[0],), np.int32, order='C')
if True:
delta = Ng/L
with ct.time_block("Init pcell array"):
e = self.prog.init_pcell(self.queue, (Ng**ndim,), None, part_mesh.data, np.int32(-1))
e.wait()
with ct.time_block("Init idx array"):
e=self.prog.init_pcell(self.queue, (particles.shape[0],), None, part_list.data, np.int32(-1))
e.wait()
with ct.time_block("Build indices"):
self.prog.build_indices(self.queue, (particles.shape[0],), None,
part_pos.data, part_mesh.data, part_list.data, np.int32(Ng), self.ktype(delta))
if True:
with ct.time_block("Reverse list"):
lastevt = self.prog.reverse_list(self.queue, (Ng**ndim,), None, part_mesh.data, part_list.data)
# We require pmax pass, particles are ordered according to part_idx
with ct.time_block("dance"):
self.prog.dance(self.queue, (Ng,)*ndim, None, part_pos.data, density.data, part_mesh.data, part_list.data, np.int32(Ng), self.ktype(delta))
print("Grab result")
self.queue.finish()
print("Del ppos")
del part_pos
print("Del pm")
del part_mesh
print("Del pl")
del part_list
print("Del density")
with ct.time_block("download"):
return density.get()
def go_test():
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
Nparts = 512**3
Ng = 512
cic = CIC_CL(ctx,ndim=3,ktype=np.float32)
print ("Generate")
parts = np.random.rand(Nparts, 3).astype(np.float32)
# ix=(np.arange(512)*1.0/512).astype(np.float32)
# parts = [ix[:,None,None].repeat(512,axis=1).repeat(512,axis=2),
# ix[None,:,None].repeat(512,axis=0).repeat(512,axis=2),
# ix[None,None,:].repeat(512,axis=0).repeat(512,axis=1)]
# parts = np.array(parts).transpose((3,0,1,2)).reshape((Nparts,3),order='C')
with ct.time_block("run cic"):
cic.run(parts, Ng, 1.0)
print("Done")
with ct.time_block("run cic 2"):
density1 = cic.run(parts, Ng, 1.0)
with ct.time_block("classic cic"):
density2 = ct.project_cic((parts-0.5).astype(ct.DTYPE), None, Ng, 1.0, True)
return density1, density2
vv = go_test()