From 2101b6dd4f3484b7616b4ff810d8a9cea67ae0ef Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 19 Feb 2025 20:42:21 -0800 Subject: [PATCH 01/31] WIP tweaking PTT params --- cuslines/ptt.cu | 2 +- cuslines/ptt.cuh | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index b36e747..7bb0763 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -213,7 +213,7 @@ __device__ int get_direction_ptt_d( REAL_T *__first_val_sh = first_val_sh + tidy; const REAL_T max_curvature = SIN(max_angle / 2) / step_size; // bigger numbers means wiggle more - const REAL_T probe_step_size = ((step_size / 2) / (PROBE_QUALITY - 1)); + const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); REAL_T __tmp; diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh index d8986b5..a8222fc 100644 --- a/cuslines/ptt.cuh +++ b/cuslines/ptt.cuh @@ -6,16 +6,16 @@ #define STEP_FRAC 20 // divides output step size (usually 0.5) into this many internal steps #define PROBE_FRAC 2 // divides output step size (usually 0.5) to find probe length -#define PROBE_QUALITY 4 +#define PROBE_QUALITY 4 // Number of probing steps #define SAMPLING_QUALITY 4 // can be 2-7 -#define PROBABILISTIC_BIAS 1 // 1 looks good. can be 0-log_2(N_WARPS) (typically 0-5). 0 is fully probabilistic, 4 is close to deterministic. -#define ALLOW_WEAK_LINK 1 +#define DETERMINISTIC_BIAS 0 // Should be 0, higher values bias more towards higher fODF values when tracking +#define ALLOW_WEAK_LINK 0 #define TRIES_PER_REJECTION_SAMPLING 1024 -#define DEFAULT_PTT_MINDATASUPPORT 0.05 +#define DEFAULT_PTT_MINDATASUPPORT 0.0 // 0.01 #define K_SMALL 0.0001 #define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY) -#define PROBABILISTIC_GROUP_SZ POW2(PROBABILISTIC_BIAS) +#define PROBABILISTIC_GROUP_SZ POW2(DETERMINISTIC_BIAS) #if SAMPLING_QUALITY == 2 #define DISC_VERT_CNT DISC_2_VERT_CNT From 61fb586902b1d76556a4ba3f958a83097a382ea0 Mon Sep 17 00:00:00 2001 From: 36000 Date: Thu, 10 Jul 2025 12:26:40 -0700 Subject: [PATCH 02/31] put this back --- cuslines/ptt.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh index a8222fc..e3317ff 100644 --- a/cuslines/ptt.cuh +++ b/cuslines/ptt.cuh @@ -11,7 +11,7 @@ #define DETERMINISTIC_BIAS 0 // Should be 0, higher values bias more towards higher fODF values when tracking #define ALLOW_WEAK_LINK 0 #define TRIES_PER_REJECTION_SAMPLING 1024 -#define DEFAULT_PTT_MINDATASUPPORT 0.0 // 0.01 +#define DEFAULT_PTT_MINDATASUPPORT 0.01 // 0.01 #define K_SMALL 0.0001 #define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY) From d257ad9cf699422eddd7f29409b1e86b2c9f51d0 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 10:52:39 -0700 Subject: [PATCH 03/31] update for CUDA 13 compatibility --- cuslines/cuslines.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp index 4e8bc30..45163ad 100644 --- a/cuslines/cuslines.cpp +++ b/cuslines/cuslines.cpp @@ -146,9 +146,12 @@ class GPUTracker { //#pragma omp parallel for for (int n = 0; n < ngpus_; ++n) { + cudaMemLocation location = {}; + location.type = cudaMemLocationTypeDevice; + location.id = n; CHECK_CUDA(cudaSetDevice(n)); CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size)); - CHECK_CUDA(cudaMemAdvise(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n)); + CHECK_CUDA(cudaMemAdvise(dataf_d[n], sizeof(*dataf_d[n]) * 
dataf_info.size, cudaMemAdviseSetPreferredLocation, location)); CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size)); CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size)); CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size)); From 173e48da04196cbb1d92e97d1e2e43fcc65c2266 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 10:54:17 -0700 Subject: [PATCH 04/31] formatting --- cuslines/cuslines.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp index 45163ad..ceb7002 100644 --- a/cuslines/cuslines.cpp +++ b/cuslines/cuslines.cpp @@ -146,7 +146,7 @@ class GPUTracker { //#pragma omp parallel for for (int n = 0; n < ngpus_; ++n) { - cudaMemLocation location = {}; + cudaMemLocation location = {}; location.type = cudaMemLocationTypeDevice; location.id = n; CHECK_CUDA(cudaSetDevice(n)); From bea03ceddfdbec14eaf7b1597e74091c7d791234 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 12:10:27 -0700 Subject: [PATCH 05/31] try this --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 06e9de9..15a0e1e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,6 +27,7 @@ ENV PATH /opt/anaconda/bin:${PATH} ENV LD_LIBRARY_PATH /opt/anaconda/lib:${LD_LIBRARY_PATH} # python prereqs +RUN conda tos accept --override-channels --channel conda-forge RUN conda install -c conda-forge git RUN pip install numpy>=2.0.0 RUN pip install scipy>=1.13.0 cython nibabel dipy tqdm From 00a22c8671551b60594f3b4570472e6ce21b51f3 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 12:38:51 -0700 Subject: [PATCH 06/31] accept lots of TOS --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 15a0e1e..3a2cbdc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,6 +28,8 @@ ENV LD_LIBRARY_PATH /opt/anaconda/lib:${LD_LIBRARY_PATH} # python prereqs RUN conda tos accept --override-channels --channel conda-forge +RUN conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main +RUN conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r RUN conda install -c conda-forge git RUN pip install numpy>=2.0.0 RUN pip install scipy>=1.13.0 cython nibabel dipy tqdm From 03397f602b0181b981a563b4830b04313a575737 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 12:55:52 -0700 Subject: [PATCH 07/31] handle both 12/13 --- cuslines/cudamacro.h | 14 ++++++++++++++ cuslines/cuslines.cpp | 5 +---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/cuslines/cudamacro.h b/cuslines/cudamacro.h index 49ac24c..45a8fc3 100644 --- a/cuslines/cudamacro.h +++ b/cuslines/cudamacro.h @@ -45,6 +45,20 @@ exit(EXIT_FAILURE); \ }} +#if CUDART_VERSION >= 13000 +#define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ + do { \ + cudaMemLocation loc; \ + loc.type = cudaMemLocationTypeDevice; \ + loc.id = (device); \ + CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), loc)); \ + } while (0) +#else +#define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ + CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), (device))) +#endif + + #ifdef USE_NVTX #include "nvToolsExt.h" diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp index ceb7002..a1ada94 100644 --- a/cuslines/cuslines.cpp +++ b/cuslines/cuslines.cpp @@ -146,12 +146,9 @@ class GPUTracker { //#pragma omp parallel for for (int n = 0; n < ngpus_; ++n) { - cudaMemLocation location = {}; - location.type = cudaMemLocationTypeDevice; - 
location.id = n; CHECK_CUDA(cudaSetDevice(n)); CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size)); - CHECK_CUDA(cudaMemAdvise(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, location)); + CHECK_CUDA(CUDA_MEM_ADVISE(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n)); CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size)); CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size)); CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size)); From 3057e5ba9207da13204ffcf8c45b745950739134 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 13:04:29 -0700 Subject: [PATCH 08/31] bf --- cuslines/cudamacro.h | 12 +++++------- cuslines/cuslines.cpp | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/cuslines/cudamacro.h b/cuslines/cudamacro.h index 45a8fc3..e9b2e1e 100644 --- a/cuslines/cudamacro.h +++ b/cuslines/cudamacro.h @@ -46,13 +46,11 @@ }} #if CUDART_VERSION >= 13000 -#define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ - do { \ - cudaMemLocation loc; \ - loc.type = cudaMemLocationTypeDevice; \ - loc.id = (device); \ - CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), loc)); \ - } while (0) +#define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ + cudaMemLocation loc; \ + loc.type = cudaMemLocationTypeDevice; \ + loc.id = (device); \ + CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), loc)); \ #else #define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), (device))) diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp index a1ada94..1363705 100644 --- a/cuslines/cuslines.cpp +++ b/cuslines/cuslines.cpp @@ -148,7 +148,7 @@ class GPUTracker { for (int n = 0; n < ngpus_; ++n) { CHECK_CUDA(cudaSetDevice(n)); CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size)); - CHECK_CUDA(CUDA_MEM_ADVISE(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n)); + CUDA_MEM_ADVISE(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n); CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size)); CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size)); CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size)); From 80c35be576c1224702c26513c00938dd8c9ef2fb Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 13:10:44 -0700 Subject: [PATCH 09/31] typo --- cuslines/cudamacro.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuslines/cudamacro.h b/cuslines/cudamacro.h index e9b2e1e..7f03c6c 100644 --- a/cuslines/cudamacro.h +++ b/cuslines/cudamacro.h @@ -50,7 +50,7 @@ cudaMemLocation loc; \ loc.type = cudaMemLocationTypeDevice; \ loc.id = (device); \ - CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), loc)); \ + CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), loc)); #else #define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), (device))) From 38250f0b34d61e5fa77e59703d3343b32397644c Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 13:17:06 -0700 Subject: [PATCH 10/31] update ENV in dockerfile --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3a2cbdc..889371d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,15 +16,15 @@ RUN wget 
https://github.com/Kitware/CMake/releases/download/v3.24.0/cmake-3.24.0 && mkdir /opt/cmake \ && /tmp/cmake-install.sh --skip-license --prefix=/opt/cmake \ && rm /tmp/cmake-install.sh -ENV PATH /opt/cmake/bin:${PATH} +ENV PATH=/opt/cmake/bin:${PATH} RUN curl -L "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" \ -o "/tmp/Miniconda3.sh" RUN bash /tmp/Miniconda3.sh -b -p /opt/anaconda RUN rm -rf /tmp/Miniconda3.sh RUN cd /opt && eval "$(/opt/anaconda/bin/conda shell.bash hook)" -ENV PATH /opt/anaconda/bin:${PATH} -ENV LD_LIBRARY_PATH /opt/anaconda/lib:${LD_LIBRARY_PATH} +ENV PATH=/opt/anaconda/bin:${PATH} +ENV LD_LIBRARY_PATH=/opt/anaconda/lib:${LD_LIBRARY_PATH} # python prereqs RUN conda tos accept --override-channels --channel conda-forge From 152722a0ceadaf92c566761d6a22ee6353ac5b65 Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 17 Dec 2025 21:18:29 -0800 Subject: [PATCH 11/31] PTT fixes and switch to FP32 --- cuslines/Makefile | 7 +- cuslines/cuslines.cpp | 29 ++-- cuslines/globals.h | 4 +- cuslines/ptt.cu | 347 +++++++++++++++++++++++++----------------- cuslines/ptt.cuh | 18 +-- cuslines/utils.cu | 18 +-- 6 files changed, 249 insertions(+), 174 deletions(-) diff --git a/cuslines/Makefile b/cuslines/Makefile index 1061a16..c8fe6c7 100644 --- a/cuslines/Makefile +++ b/cuslines/Makefile @@ -37,7 +37,12 @@ SMS ?= 70 CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM)) LASTSM := $(lastword $(sort $(SMS))) CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM) -CUDACFLAGS=-c -O3 -lineinfo -Xptxas=-v -std=c++11 -Xcompiler -fPIC -Xcompiler=-fopenmp $(CUDA_ARCH) + +COMMON_FLAGS = -c -std=c++11 -Xcompiler -fPIC --use_fast_math -Xcompiler=-fopenmp $(CUDA_ARCH) +RELEASE_FLAGS = -O3 -Xptxas=-O3 +DEBUG_FLAGS = -O0 -Xptxas=-v -g -G -lineinfo +CUDACFLAGS = $(COMMON_FLAGS) $(RELEASE_FLAGS) + LDFLAGS= -shared -fopenmp -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt all: cuslines diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp index 1363705..f0b8690 100644 --- a/cuslines/cuslines.cpp +++ b/cuslines/cuslines.cpp @@ -61,12 +61,12 @@ py::capsule cleanup(T* ptr) { class GPUTracker { public: GPUTracker(ModelType model_type, - double max_angle, - double min_signal, - double tc_threshold, - double step_size, - double relative_peak_thresh, - double min_separation_angle, + REAL max_angle, + REAL min_signal, + REAL tc_threshold, + REAL step_size, + REAL relative_peak_thresh, + REAL min_separation_angle, np_array_cast dataf, np_array_cast H, np_array_cast R, @@ -149,6 +149,7 @@ class GPUTracker { CHECK_CUDA(cudaSetDevice(n)); CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size)); CUDA_MEM_ADVISE(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n); + // CHECK_CUDA(cudaMemPrefetchAsync(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, n)); CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size)); CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size)); CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size)); @@ -294,12 +295,12 @@ class GPUTracker { int delta_nr_, samplm_nr_; ModelType model_type_; - double max_angle_; - double tc_threshold_; - double min_signal_; - double step_size_; - double relative_peak_thresh_; - double min_separation_angle_; + REAL max_angle_; + REAL tc_threshold_; + REAL min_signal_; + REAL step_size_; + REAL relative_peak_thresh_; + REAL min_separation_angle_; std::vector nSlines_old_; std::vector slines_; @@ -332,8 +333,8 @@ 
PYBIND11_MODULE(cuslines, m) { .value("PTT", PTT); py::class_(m, "GPUTracker") - .def(py::init -__device__ void norm3_d(REAL_T *num, int fail_ind) { +__device__ __forceinline__ void norm3_d(REAL_T *num, int fail_ind) { const REAL_T scale = SQRT(num[0] * num[0] + num[1] * num[1] + num[2] * num[2]); - if (scale != 0) { + if (scale > NORM_EPS) { num[0] /= scale; num[1] /= scale; num[2] /= scale; } else { + num[0] = num[1] = num[2] = 0; num[fail_ind] = 1.0; // this can happen randomly during propogation, though is exceedingly rare } } template -__device__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src2, int fail_ind) { +__device__ __forceinline__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src2, int fail_ind) { dest[0] = src1[1] * src2[2] - src1[2] * src2[1]; dest[1] = src1[2] * src2[0] - src1[0] * src2[2]; dest[2] = src1[0] * src2[1] - src1[1] * src2[0]; @@ -20,13 +21,20 @@ __device__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src norm3_d(dest, fail_ind); } -template -__device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T *__restrict__ pmf, +template +__device__ REAL_T interp4_d(const REAL3_T* pos, const REAL_T* frame, const REAL_T *__restrict__ pmf, const int dimx, const int dimy, const int dimz, const int dimt, const REAL3_T *__restrict__ odf_sphere_vertices) { + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + int closest_odf_idx = 0; - REAL_T __max_cos = 0; - for (int ii = 0; ii < dimt; ii++) { + REAL_T __max_cos = REAL_T(0); + + #pragma unroll + for (int ii = tidx; ii < dimt; ii+= BDIM_X) { REAL_T cos_sim = FABS( odf_sphere_vertices[ii].x * frame[0] \ + odf_sphere_vertices[ii].y * frame[1] \ @@ -36,15 +44,30 @@ __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T closest_odf_idx = ii; } } + __syncwarp(WMASK); - const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos); + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + const REAL_T __tmp = __shfl_xor_sync(WMASK, __max_cos, i, BDIM_X); + const int __tmp_idx = __shfl_xor_sync(WMASK, closest_odf_idx, i, BDIM_X); + if (__tmp > __max_cos || + (__tmp == __max_cos && __tmp_idx < closest_odf_idx)) { + __max_cos = __tmp; + closest_odf_idx = __tmp_idx; + } + } + __syncwarp(WMASK); #if 0 - printf("inerpolated %f at %f, %f, %f, %i\n", __max_cos, pos.x, pos.y, pos.z, closest_odf_idx); + if (closest_odf_idx >= dimt || closest_odf_idx < 0) { + printf("Error: closest_odf_idx out of bounds: %d (dimt: %d)\n", closest_odf_idx, dimt); + } #endif + const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, *pos, &__max_cos); + if (rv != 0) { - return -1; + return 0; // No support } else { return __max_cos; } @@ -87,24 +110,57 @@ __device__ void prepare_propagator_d(REAL_T k1, REAL_T k2, REAL_T arclength, } } +template +__device__ void random_normal(curandStatePhilox4_32_10_t *st, REAL_T* probing_frame) { + probing_frame[3] = curand_normal(st); + probing_frame[4] = curand_normal(st); + probing_frame[5] = curand_normal(st); + REAL_T dot = probing_frame[3]*probing_frame[0] + + probing_frame[4]*probing_frame[1] + + probing_frame[5]*probing_frame[2]; + + probing_frame[3] -= dot*probing_frame[0]; + probing_frame[4] -= dot*probing_frame[1]; + probing_frame[5] -= dot*probing_frame[2]; + REAL_T n2 = probing_frame[3]*probing_frame[3] + + 
probing_frame[4]*probing_frame[4] + + probing_frame[5]*probing_frame[5]; + + if (n2 < NORM_EPS) { + REAL_T abs_x = FABS(probing_frame[0]); + REAL_T abs_y = FABS(probing_frame[1]); + REAL_T abs_z = FABS(probing_frame[2]); + + if (abs_x <= abs_y && abs_x <= abs_z) { + probing_frame[3] = 0.0; + probing_frame[4] = probing_frame[2]; + probing_frame[5] = -probing_frame[1]; + } + else if (abs_y <= abs_z) { + probing_frame[3] = -probing_frame[2]; + probing_frame[4] = 0.0; + probing_frame[5] = probing_frame[0]; + } + else { + probing_frame[3] = probing_frame[1]; + probing_frame[4] = -probing_frame[0]; + probing_frame[5] = 0.0; + } + } +} + template __device__ void get_probing_frame_d(const REAL_T* frame, curandStatePhilox4_32_10_t *st, REAL_T* probing_frame) { if (IS_INIT) { for (int ii = 0; ii < 3; ii++) { // tangent probing_frame[ii] = frame[ii]; } - if ((probing_frame[0] != 0) && (probing_frame[1] != 0)) { // norm - probing_frame[3] = -probing_frame[1]; - probing_frame[4] = probing_frame[0]; - probing_frame[5] = 0; - } else { - probing_frame[3] = 0; - probing_frame[4] = -probing_frame[2]; - probing_frame[5] = 0; - } + norm3_d(probing_frame, 0); - norm3_d(probing_frame, 0); // tangent + random_normal(st, probing_frame); norm3_d(probing_frame + 3, 1); // norm + + // calculate binorm crossnorm3_d(probing_frame + 2*3, probing_frame, probing_frame + 3, 2); // binorm } else { for (int ii = 0; ii < 9; ii++) { @@ -123,49 +179,59 @@ __device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* dir frame[2*3 + ii] = propagator[6]*frame[ii] + propagator[7]*frame[3+ii] + propagator[8]*frame[6+ii]; } -#if 1 norm3_d(__tmp, 0); // normalize tangent crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calc normal crossnorm3_d(frame + 2*3, __tmp, frame + 3, 2); // calculate binorm from tangent, norm -#else - norm3_d(__tmp, 0); // normalize tangent - norm3_d(frame + 2*3, 2); // normalize binorm - crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calculate normal from binorm, tangent -#endif for (int ii = 0; ii < 3; ii++) { frame[ii] = __tmp[ii]; } } -template +template __device__ REAL_T calculate_data_support_d(REAL_T support, const REAL3_T pos, const REAL_T *__restrict__ pmf, const int dimx, const int dimy, const int dimz, const int dimt, const REAL_T probe_step_size, const REAL3_T *__restrict__ odf_sphere_vertices, - REAL_T k1, REAL_T k2, - REAL_T* probing_frame) { - REAL_T probing_prop[9]; - REAL_T direc[3]; - REAL3_T probing_pos; - REAL_T fod_amp; - - prepare_propagator_d(k1, k2, probe_step_size, probing_prop); - probing_pos.x = pos.x; - probing_pos.y = pos.y; - probing_pos.z = pos.z; + REAL_T* probing_prop_sh, + REAL_T* direc_sh, + REAL3_T* probing_pos_sh, + REAL_T* k1_sh, REAL_T* k2_sh, + REAL_T* probing_frame_sh) { + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + if (tidx == 0) { + prepare_propagator_d( + *k1_sh, *k2_sh, + probe_step_size, probing_prop_sh); + probing_pos_sh->x = pos.x; + probing_pos_sh->y = pos.y; + probing_pos_sh->z = pos.z; + } + __syncwarp(WMASK); for (int ii = 0; ii < PROBE_QUALITY; ii++) { // we spend about 2/3 of our time in this loop when doing PTT - propogate_frame_d(probing_prop, probing_frame, direc); + if (tidx == 0) { + propogate_frame_d( + probing_prop_sh, + probing_frame_sh, + direc_sh); + + probing_pos_sh->x += direc_sh[0]; + probing_pos_sh->y += direc_sh[1]; + probing_pos_sh->z += direc_sh[2]; + } + __syncwarp(WMASK); - probing_pos.x += 
direc[0]; - probing_pos.y += direc[1]; - probing_pos.z += direc[2]; + const REAL_T fod_amp = interp4_d( + probing_pos_sh, probing_frame_sh, pmf, + dimx, dimy, dimz, dimt, + odf_sphere_vertices); - fod_amp = interp4_d(probing_pos, probing_frame, pmf, - dimx, dimy, dimz, dimt, - odf_sphere_vertices); if (!ALLOW_WEAK_LINK && (fod_amp < PMF_THRESHOLD_P)) { return 0; } @@ -204,13 +270,27 @@ __device__ int get_direction_ptt_d( const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - REAL_T __shared__ face_cdf_sh[BDIM_Y*DISC_FACE_CNT]; - REAL_T __shared__ vert_pdf_sh[BDIM_Y*DISC_VERT_CNT]; - REAL_T __shared__ first_val_sh[BDIM_Y]; + __shared__ REAL_T face_cdf_sh[BDIM_Y*DISC_FACE_CNT]; + __shared__ REAL_T vert_pdf_sh[BDIM_Y*DISC_VERT_CNT]; + + __shared__ REAL_T probing_frame_sh[BDIM_Y*9]; + __shared__ REAL_T k1_probe_sh[BDIM_Y]; + __shared__ REAL_T k2_probe_sh[BDIM_Y]; + + __shared__ REAL_T probing_prop_sh[BDIM_Y*9]; + __shared__ REAL_T direc_sh[BDIM_Y*3]; + __shared__ REAL3_T probing_pos_sh[BDIM_Y]; REAL_T *__face_cdf_sh = face_cdf_sh + tidy*DISC_FACE_CNT; REAL_T *__vert_pdf_sh = vert_pdf_sh + tidy*DISC_VERT_CNT; - REAL_T *__first_val_sh = first_val_sh + tidy; + + REAL_T *__probing_frame_sh = probing_frame_sh + tidy*9; + REAL_T *__k1_probe_sh = k1_probe_sh + tidy; + REAL_T *__k2_probe_sh = k2_probe_sh + tidy; + + REAL_T *__probing_prop_sh = probing_prop_sh + tidy*9; + REAL_T *__direc_sh = direc_sh + tidy*3; + REAL3_T *__probing_pos_sh = probing_pos_sh + tidy; const REAL_T max_curvature = SIN(max_angle / 2) / step_size; // bigger numbers means wiggle more const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); @@ -225,30 +305,31 @@ __device__ int get_direction_ptt_d( __frame_sh[2] = dir.z; } } - if (tidx==0) { - *__first_val_sh = interp4_d(pos, __frame_sh, pmf, - dimx, dimy, dimz, dimt, - odf_sphere_vertices); - } + + const REAL_T first_val = interp4_d( + __probing_pos_sh, __frame_sh, pmf, + dimx, dimy, dimz, dimt, + odf_sphere_vertices); __syncwarp(WMASK); // Calculate __vert_pdf_sh - REAL_T probing_frame[9]; - REAL_T k1_probe, k2_probe; - bool support_found = 0; - for (int ii = tidx; ii < DISC_VERT_CNT; ii += BDIM_X) { - k1_probe = DISC_VERT[ii*2] * max_curvature; - k2_probe = DISC_VERT[ii*2+1] * max_curvature; - - get_probing_frame_d(__frame_sh, st, probing_frame); + bool support_found = false; + for (int ii = 0; ii < DISC_VERT_CNT; ii++) { + if (tidx == 0) { + *__k1_probe_sh = DISC_VERT[ii*2] * max_curvature; + *__k2_probe_sh = DISC_VERT[ii*2+1] * max_curvature; + get_probing_frame_d(__frame_sh, st, __probing_frame_sh); + } + __syncwarp(WMASK); - const REAL_T this_support = calculate_data_support_d( - *__first_val_sh, + const REAL_T this_support = calculate_data_support_d( + first_val, pos, pmf, dimx, dimy, dimz, dimt, probe_step_size, odf_sphere_vertices, - k1_probe, k2_probe, - probing_frame); + __probing_prop_sh, __direc_sh, __probing_pos_sh, + __k1_probe_sh, __k2_probe_sh, + __probing_frame_sh); #if 0 if (threadIdx.y == 1 && ii == 0) { @@ -257,14 +338,17 @@ __device__ int get_direction_ptt_d( #endif if (this_support < NORM_MIN_SUPPORT) { - __vert_pdf_sh[ii] = 0; + if (tidx == 0) { + __vert_pdf_sh[ii] = 0; + } } else { - __vert_pdf_sh[ii] = this_support; + if (tidx == 0) { + __vert_pdf_sh[ii] = this_support; + } support_found = 1; } } - const int __msk = __ballot_sync(WMASK, support_found); - if (__msk == 0) { + if (support_found == 0) { return 0; } @@ -323,82 +407,69 @@ __device__ int 
get_direction_ptt_d( #endif // Sample random valid faces randomly - REAL_T r1, r2; - for (int ii = 0; ii < TRIES_PER_REJECTION_SAMPLING / BDIM_X; ii++) { - r1 = curand_uniform(st); - r2 = curand_uniform(st); - if (r1 + r2 > 1) { - r1 = 1 - r1; - r2 = 1 - r2; - } - - __tmp = curand_uniform(st) * last_cdf; - int jj; - for (jj = 0; jj < DISC_FACE_CNT; jj++) { - if (__face_cdf_sh[jj] >= __tmp) - break; - } - - const REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2]; - const REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2]; - const REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2]; - - const REAL_T vy0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2 + 1]; - const REAL_T vy1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2 + 1]; - const REAL_T vy2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2 + 1]; - - k1_probe = vx0 + r1 * (vx1 - vx0) + r2 * (vx2 - vx0); - k2_probe = vy0 + r1 * (vy1 - vy0) + r2 * (vy2 - vy0); - - get_probing_frame_d(__frame_sh, st, probing_frame); - - const REAL_T this_support = calculate_data_support_d(*__first_val_sh, - pos, pmf, dimx, dimy, dimz, dimt, - probe_step_size, - odf_sphere_vertices, - k1_probe, k2_probe, - probing_frame); + for (int ii = 0; ii < TRIES_PER_REJECTION_SAMPLING; ii++) { + if (tidx == 0) { + REAL_T r1 = curand_uniform(st); + REAL_T r2 = curand_uniform(st); + if (r1 + r2 > 1) { + r1 = 1 - r1; + r2 = 1 - r2; + } + + __tmp = curand_uniform(st) * last_cdf; + int jj; + for (jj = 0; jj < DISC_FACE_CNT; jj++) { // TODO: parallelize this + if (__face_cdf_sh[jj] >= __tmp) + break; + } + + const REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2]; + const REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2]; + const REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2]; + const REAL_T vy0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2 + 1]; + const REAL_T vy1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2 + 1]; + const REAL_T vy2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2 + 1]; + *__k1_probe_sh = vx0 + r1 * (vx1 - vx0) + r2 * (vx2 - vx0); + *__k2_probe_sh = vy0 + r1 * (vy1 - vy0) + r2 * (vy2 - vy0); + get_probing_frame_d(__frame_sh, st, __probing_frame_sh); + } __syncwarp(WMASK); - int winning_lane = -1; // -1 indicates nobody won - int __msk = __ballot_sync(WMASK, this_support >= NORM_MIN_SUPPORT); - if (__msk != 0) { - REAL_T group_max_support = this_support; - #pragma unroll - for(int j = 1; j < PROBABILISTIC_GROUP_SZ; j *= 2) { - __tmp = __shfl_xor_sync(WMASK, group_max_support, j, BDIM_X); - group_max_support = MAX(group_max_support, __tmp); - } - __msk &= __ballot_sync(WMASK, this_support == group_max_support); - winning_lane = __ffs(__msk) - 1; + const REAL_T this_support = calculate_data_support_d( + first_val, + pos, pmf, dimx, dimy, dimz, dimt, + probe_step_size, + odf_sphere_vertices, + __probing_prop_sh, __direc_sh, __probing_pos_sh, + __k1_probe_sh, __k2_probe_sh, + __probing_frame_sh); + __syncwarp(WMASK); + + if (this_support < NORM_MIN_SUPPORT) { + continue; } - if (winning_lane != -1) { - if (tidx == winning_lane) { -#if 0 - if (threadIdx.y == 1) { - printf("winning k1 %f, k2 %f, cdf %f, cdf_idx %i", k1_probe, k2_probe, __tmp, jj); - } -#endif - if (IS_INIT) { - dirs[0] = dir; - } else { - REAL_T __prop[9]; - REAL_T __dir[3]; - prepare_propagator_d(k1_probe, k2_probe, step_size/STEP_FRAC, __prop); - propogate_frame_d(__prop, probing_frame, __dir); - norm3_d(__dir, 0); // this will be scaled by the generic stepping code - dirs[0] = (REAL3_T) {__dir[0], __dir[1], __dir[2]}; - } - for (int jj = 0; 
jj < 9; jj++) { - __frame_sh[jj] = probing_frame[jj]; - } + if (tidx == 0) { + if (IS_INIT) { + dirs[0] = dir; + } else { + // Propogate, but only 1/STEP_FRAC of a step + prepare_propagator_d( + *__k1_probe_sh, *__k2_probe_sh, + step_size/STEP_FRAC, __probing_prop_sh); + propogate_frame_d(__probing_prop_sh, __probing_frame_sh, __direc_sh); + norm3_d(__direc_sh, 0); // this will be scaled by the generic stepping code + dirs[0] = (REAL3_T) {__direc_sh[0], __direc_sh[1], __direc_sh[2]}; } - __syncwarp(WMASK); - return 1; } + + if (tidx < 9) { + __frame_sh[tidx] = __probing_frame_sh[tidx]; + } + __syncwarp(WMASK); + return 1; } return 0; } diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh index e3317ff..751c4bb 100644 --- a/cuslines/ptt.cuh +++ b/cuslines/ptt.cuh @@ -4,18 +4,16 @@ #include "disc.h" #include "globals.h" -#define STEP_FRAC 20 // divides output step size (usually 0.5) into this many internal steps -#define PROBE_FRAC 2 // divides output step size (usually 0.5) to find probe length -#define PROBE_QUALITY 4 // Number of probing steps -#define SAMPLING_QUALITY 4 // can be 2-7 -#define DETERMINISTIC_BIAS 0 // Should be 0, higher values bias more towards higher fODF values when tracking -#define ALLOW_WEAK_LINK 0 -#define TRIES_PER_REJECTION_SAMPLING 1024 -#define DEFAULT_PTT_MINDATASUPPORT 0.01 // 0.01 -#define K_SMALL 0.0001 +#define STEP_FRAC (20) // divides output step size (usually 0.5) into this many internal steps +#define PROBE_FRAC (2) // divides output step size (usually 0.5) to find probe length +#define PROBE_QUALITY (4) // Number of probing steps +#define SAMPLING_QUALITY (2) // can be 2-7 +#define ALLOW_WEAK_LINK (0) +#define TRIES_PER_REJECTION_SAMPLING (1024) +#define DEFAULT_PTT_MINDATASUPPORT ((REAL) 0.01) // 0.01 +#define K_SMALL ((REAL) 0.0001) #define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY) -#define PROBABILISTIC_GROUP_SZ POW2(DETERMINISTIC_BIAS) #if SAMPLING_QUALITY == 2 #define DISC_VERT_CNT DISC_2_VERT_CNT diff --git a/cuslines/utils.cu b/cuslines/utils.cu index c7fe47f..19282de 100644 --- a/cuslines/utils.cu +++ b/cuslines/utils.cu @@ -59,7 +59,7 @@ __device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) { } template -__device__ REAL_T interpolation_helper_d(const REAL_T* dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) { +__device__ REAL_T interpolation_helper_d(const REAL_T*__restrict__ dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) { REAL_T __tmp = 0; #pragma unroll for (int i = 0; i < 2; i++) { @@ -130,14 +130,12 @@ __device__ int trilinear_interp_d(const int dimx, *__vox_data = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, dimt_idx); } - /* - __syncwarp(WMASK); - if (tidx == 0 && threadIdx.y == 0) { - printf("point: %f, %f, %f\n", point.x, point.y, point.z); - for(int i = 0; i < dimt; i++) { - printf("__vox_data[%d]: %f\n", i, __vox_data[i]); - } - } - */ + // if (threadIdx.x == 0) { + // printf("point: %f, %f, %f\n", point.x, point.y, point.z); + // printf("dimt_idx: %d\n", dimt_idx); + // // for(int i = 0; i < dimt; i++) { + // // printf("__vox_data[%d]: %f\n", i, __vox_data[i]); + // // } + // } return 0; } From d93e5edc2559d3f6bd3235290b5dc730dd29319f Mon Sep 17 00:00:00 2001 From: 36000 Date: Thu, 18 Dec 2025 19:08:54 -0800 Subject: [PATCH 12/31] PTT looking even better --- cuslines/generate_streamlines_cuda.cu | 24 ------------------------ cuslines/globals.h | 2 +- cuslines/ptt.cu | 23 
++++++++++++++--------- cuslines/ptt.cuh | 3 --- cuslines/utils.cu | 23 +++++++++++++++++++++++ 5 files changed, 38 insertions(+), 37 deletions(-) diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu index 374c7c1..0efefdd 100644 --- a/cuslines/generate_streamlines_cuda.cu +++ b/cuslines/generate_streamlines_cuda.cu @@ -307,30 +307,6 @@ __device__ VAL_T max_mask_transl_d(const int n, return __m; } -template -__device__ VAL_T max_d(const int n, const VAL_T *__restrict__ src, const VAL_T minVal) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - VAL_T __m = minVal; - - for(int i = tidx; i < n; i += BDIM_X) { - __m = MAX(__m, src[i]); - } - - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); - __m = MAX(__m, __tmp); - } - - return __m; -} - template __device__ VAL_T min_d(const int n, const VAL_T *__restrict__ src, const VAL_T maxVal) { diff --git a/cuslines/globals.h b/cuslines/globals.h index 7b3f7b4..0d852e9 100644 --- a/cuslines/globals.h +++ b/cuslines/globals.h @@ -70,7 +70,7 @@ #endif #define MAX_SLINE_LEN (501) -#define PMF_THRESHOLD_P ((REAL)0.1) +#define PMF_THRESHOLD_P ((REAL)0.05) #define THR_X_BL (64) #define THR_X_SL (32) diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index 11a2f44..f6cbc24 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -22,7 +22,7 @@ __device__ __forceinline__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, c } template -__device__ REAL_T interp4_d(const REAL3_T* pos, const REAL_T* frame, const REAL_T *__restrict__ pmf, +__device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T *__restrict__ pmf, const int dimx, const int dimy, const int dimz, const int dimt, const REAL3_T *__restrict__ odf_sphere_vertices) { const int tidx = threadIdx.x; @@ -64,7 +64,7 @@ __device__ REAL_T interp4_d(const REAL3_T* pos, const REAL_T* frame, const REAL_ } #endif - const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, *pos, &__max_cos); + const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos); if (rv != 0) { return 0; // No support @@ -193,6 +193,7 @@ __device__ REAL_T calculate_data_support_d(REAL_T support, const REAL3_T pos, const REAL_T *__restrict__ pmf, const int dimx, const int dimy, const int dimz, const int dimt, const REAL_T probe_step_size, + const REAL_T absolpmf_thresh, const REAL3_T *__restrict__ odf_sphere_vertices, REAL_T* probing_prop_sh, REAL_T* direc_sh, @@ -227,12 +228,12 @@ __device__ REAL_T calculate_data_support_d(REAL_T support, } __syncwarp(WMASK); - const REAL_T fod_amp = interp4_d( - probing_pos_sh, probing_frame_sh, pmf, + const REAL_T fod_amp = interp4_d( // This is the most expensive call + *probing_pos_sh, probing_frame_sh, pmf, dimx, dimy, dimz, dimt, odf_sphere_vertices); - if (!ALLOW_WEAK_LINK && (fod_amp < PMF_THRESHOLD_P)) { + if (!ALLOW_WEAK_LINK && (fod_amp < absolpmf_thresh)) { return 0; } support += fod_amp; @@ -292,8 +293,9 @@ __device__ int get_direction_ptt_d( REAL_T *__direc_sh = direc_sh + tidy*3; REAL3_T *__probing_pos_sh = probing_pos_sh + tidy; - const REAL_T max_curvature = SIN(max_angle / 2) / step_size; // bigger numbers means wiggle more const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); + const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / step_size; + const REAL_T 
absolpmf_thresh = PMF_THRESHOLD_P * max_d(dimt, pmf, REAL_MIN); REAL_T __tmp; @@ -307,7 +309,7 @@ __device__ int get_direction_ptt_d( } const REAL_T first_val = interp4_d( - __probing_pos_sh, __frame_sh, pmf, + pos, __frame_sh, pmf, dimx, dimy, dimz, dimt, odf_sphere_vertices); __syncwarp(WMASK); @@ -326,6 +328,7 @@ __device__ int get_direction_ptt_d( first_val, pos, pmf, dimx, dimy, dimz, dimt, probe_step_size, + absolpmf_thresh, odf_sphere_vertices, __probing_prop_sh, __direc_sh, __probing_pos_sh, __k1_probe_sh, __k2_probe_sh, @@ -337,7 +340,7 @@ __device__ int get_direction_ptt_d( } #endif - if (this_support < NORM_MIN_SUPPORT) { + if (this_support < PROBE_QUALITY * absolpmf_thresh) { if (tidx == 0) { __vert_pdf_sh[ii] = 0; } @@ -441,13 +444,14 @@ __device__ int get_direction_ptt_d( first_val, pos, pmf, dimx, dimy, dimz, dimt, probe_step_size, + absolpmf_thresh, odf_sphere_vertices, __probing_prop_sh, __direc_sh, __probing_pos_sh, __k1_probe_sh, __k2_probe_sh, __probing_frame_sh); __syncwarp(WMASK); - if (this_support < NORM_MIN_SUPPORT) { + if (this_support < PROBE_QUALITY * absolpmf_thresh) { continue; } @@ -459,6 +463,7 @@ __device__ int get_direction_ptt_d( prepare_propagator_d( *__k1_probe_sh, *__k2_probe_sh, step_size/STEP_FRAC, __probing_prop_sh); + get_probing_frame_d<0>(__frame_sh, st, __probing_frame_sh); propogate_frame_d(__probing_prop_sh, __probing_frame_sh, __direc_sh); norm3_d(__direc_sh, 0); // this will be scaled by the generic stepping code dirs[0] = (REAL3_T) {__direc_sh[0], __direc_sh[1], __direc_sh[2]}; diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh index 751c4bb..9126250 100644 --- a/cuslines/ptt.cuh +++ b/cuslines/ptt.cuh @@ -10,11 +10,8 @@ #define SAMPLING_QUALITY (2) // can be 2-7 #define ALLOW_WEAK_LINK (0) #define TRIES_PER_REJECTION_SAMPLING (1024) -#define DEFAULT_PTT_MINDATASUPPORT ((REAL) 0.01) // 0.01 #define K_SMALL ((REAL) 0.0001) -#define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY) - #if SAMPLING_QUALITY == 2 #define DISC_VERT_CNT DISC_2_VERT_CNT #define DISC_FACE_CNT DISC_2_FACE_CNT diff --git a/cuslines/utils.cu b/cuslines/utils.cu index 19282de..93b1190 100644 --- a/cuslines/utils.cu +++ b/cuslines/utils.cu @@ -1,3 +1,26 @@ +template +__device__ VAL_T max_d(const int n, const VAL_T *__restrict__ src, const VAL_T minVal) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + VAL_T __m = minVal; + + for(int i = tidx; i < n; i += BDIM_X) { + __m = MAX(__m, src[i]); + } + + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); + __m = MAX(__m, __tmp); + } + + return __m; +} template __device__ void prefix_sum_sh_d(REAL_T *num_sh, int __len) { From a5743bb249faa35564b8a74003f67300c40dd891 Mon Sep 17 00:00:00 2001 From: 36000 Date: Fri, 19 Dec 2025 10:40:23 -0800 Subject: [PATCH 13/31] bf --- cuslines/ptt.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index f6cbc24..fa238c6 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -34,7 +34,7 @@ __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T REAL_T __max_cos = REAL_T(0); #pragma unroll - for (int ii = tidx; ii < dimt; ii+= BDIM_X) { + for (int ii = tidx; ii < dimt; ii+= BDIM_X) { // TODO: I need to think about better ways of parallelizing this REAL_T cos_sim = FABS( odf_sphere_vertices[ii].x * frame[0] \ + 
odf_sphere_vertices[ii].y * frame[1] \ @@ -64,6 +64,7 @@ __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T } #endif + // TODO: maybe this should be texture memory, I am not so sure const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos); if (rv != 0) { @@ -294,7 +295,7 @@ __device__ int get_direction_ptt_d( REAL3_T *__probing_pos_sh = probing_pos_sh + tidy; const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); - const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / step_size; + const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / (step_size / PROBE_FRAC); const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d(dimt, pmf, REAL_MIN); REAL_T __tmp; From ef4617c06805ca395842b48a5bf8a9fa1d35d30c Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 22 Dec 2025 11:14:12 -0800 Subject: [PATCH 14/31] finally put to bed the max curve stuff --- cuslines/ptt.cu | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index fa238c6..57a27ab 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -295,9 +295,15 @@ __device__ int get_direction_ptt_d( REAL3_T *__probing_pos_sh = probing_pos_sh + tidy; const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); - const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / (step_size / PROBE_FRAC); + const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / step_size; const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d(dimt, pmf, REAL_MIN); +#if 0 + printf("absolpmf_thresh: %f, max_curvature: %f, probe_step_size: %f\n", absolpmf_thresh, max_curvature, probe_step_size); + printf("max_angle: %f\n", max_angle); + printf("step_size: %f\n", step_size); +#endif + REAL_T __tmp; __syncwarp(WMASK); From 1df2ec09b1cf9ffb2d51534ae6669e31c6c164d3 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 22 Dec 2025 11:17:45 -0800 Subject: [PATCH 15/31] spelling error --- cuslines/ptt.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index 57a27ab..3cdd149 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -171,7 +171,7 @@ __device__ void get_probing_frame_d(const REAL_T* frame, curandStatePhilox4_32_1 } template -__device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* direc) { +__device__ void propagate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* direc) { REAL_T __tmp[3]; for (int ii = 0; ii < 3; ii++) { @@ -218,7 +218,7 @@ __device__ REAL_T calculate_data_support_d(REAL_T support, for (int ii = 0; ii < PROBE_QUALITY; ii++) { // we spend about 2/3 of our time in this loop when doing PTT if (tidx == 0) { - propogate_frame_d( + propagate_frame_d( probing_prop_sh, probing_frame_sh, direc_sh); @@ -466,12 +466,12 @@ __device__ int get_direction_ptt_d( if (IS_INIT) { dirs[0] = dir; } else { - // Propogate, but only 1/STEP_FRAC of a step + // propagate, but only 1/STEP_FRAC of a step prepare_propagator_d( *__k1_probe_sh, *__k2_probe_sh, step_size/STEP_FRAC, __probing_prop_sh); get_probing_frame_d<0>(__frame_sh, st, __probing_frame_sh); - propogate_frame_d(__probing_prop_sh, __probing_frame_sh, __direc_sh); + propagate_frame_d(__probing_prop_sh, __probing_frame_sh, __direc_sh); norm3_d(__direc_sh, 0); // this will be scaled by the generic stepping code dirs[0] = (REAL3_T) {__direc_sh[0], __direc_sh[1], __direc_sh[2]}; } From 09a83a8b2c1a1bfe2d5f4ba23473c0d76bf3290a Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 22 Dec 2025 
12:26:11 -0800 Subject: [PATCH 16/31] staring this up --- cuslines/__init__.py | 0 cuslines/cuslines.py | 295 +++++++++++++++++++++++++++++++++++++++++ cuslines/globals.py | 0 run_gpu_streamlines.py | 3 +- 4 files changed, 296 insertions(+), 2 deletions(-) create mode 100644 cuslines/__init__.py create mode 100644 cuslines/cuslines.py create mode 100644 cuslines/globals.py diff --git a/cuslines/__init__.py b/cuslines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cuslines/cuslines.py b/cuslines/cuslines.py new file mode 100644 index 0000000..bb83e5e --- /dev/null +++ b/cuslines/cuslines.py @@ -0,0 +1,295 @@ +from cuda.bindings import driver, nvrtc, runtime +# TODO: this would be better if only using CUDA core + +import numpy as np +import logging + +import re +import os + + +logger = logging.getLogger("GPUStreamlines") + + +# We extract REAL_DTYPE, MAX_SLINE_LEN from globals.h +# Maybe there is a more elegant way of doing this? +dir_path = os.path.dirname(os.path.abspath(__file__)) +globals_path = os.path.join(dir_path, "globals.h") +with open(globals_path, 'r') as f: + content = f.read() + +defines = dict(re.findall(r"#define\s+(\w+)\s+([^\s/]+)", content)) +REAL_SIZE = int(defines["REAL_SIZE"]) +if REAL_SIZE == 4: + REAL_DTYPE = np.float32 +elif REAL_SIZE == 8: + REAL_DTYPE = np.float64 +else: + raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") +MAX_SLINE_LEN = int(defines["MAX_SLINE_LEN"]) + + +def _cudaGetErrorEnum(error): + if isinstance(error, driver.CUresult): + err, name = driver.cuGetErrorName(error) + return name if err == driver.CUresult.CUDA_SUCCESS else "" + elif isinstance(error, nvrtc.nvrtcResult): + return nvrtc.nvrtcGetErrorString(error)[1] + else: + raise RuntimeError('Unknown error type: {}'.format(error)) + +def checkCudaErrors(result): + if result[0].value: + raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) + if len(result) == 1: + return None + elif len(result) == 2: + return result[1] + else: + return result[1:] + + +class GPUTracker: + def __init__( + self, + model_type: ModelType, + max_angle: float, + min_signal: float, + tc_threshold: float, + step_size: float, + relative_peak_thresh: float, + min_separation_angle: float, + dataf: np.ndarray, + H: np.ndarray, + R: np.ndarray, + delta_b: np.ndarray, + delta_q: np.ndarray, # TODO: some of these only needed for boot + b0s_mask: np.ndarray, + metric_map: np.ndarray, + sampling_matrix: np.ndarray, + sphere_vertices: np.ndarray, + sphere_edges: np.ndarray, + ngpus: int = 1, + rng_seed: int = 0, + rng_offset: int = 0, + ): + for name, arr, dt in [ + ("dataf", dataf, REAL_DTYPE), + ("H", H, REAL_DTYPE), + ("R", R, REAL_DTYPE), + ("delta_b", delta_b, REAL_DTYPE), + ("delta_q", delta_q, REAL_DTYPE), + ("b0s_mask", b0s_mask, np.int32), + ("metric_map", metric_map, REAL_DTYPE), + ("sampling_matrix", sampling_matrix, REAL_DTYPE), + ("sphere_vertices", sphere_vertices, REAL_DTYPE), + ("sphere_edges", sphere_edges, np.int32), + ]: + if arr.dtype != dt: + raise TypeError(f"{name} must have dtype {dt}, got {arr.dtype}") + if not arr.flags.c_contiguous: + raise ValueError(f"{name} must be C-contiguous") + + self.dataf = dataf + self.H = H + self.R = R + self.delta_b = delta_b + self.delta_q = delta_q + self.b0s_mask = b0s_mask + self.metric_map = metric_map + self.sampling_matrix = sampling_matrix + self.sphere_vertices = sphere_vertices + self.sphere_edges = sphere_edges + + self.dimx, self.dimy, self.dimz, self.dimt = dataf.shape + 
self.nedges = int(sphere_edges.shape[0]) + self.delta_nr = int(delta_b.shape[0]) + self.samplm_nr = int(sampling_matrix.shape[0]) + + self.model_type = int(model_type) + self.max_angle = REAL_DTYPE(max_angle) + self.min_signal = REAL_DTYPE(min_signal) + self.tc_threshold = REAL_DTYPE(tc_threshold) + self.step_size = REAL_DTYPE(step_size) + self.relative_peak_thresh = REAL_DTYPE(relative_peak_thresh) + self.min_separation_angle = REAL_DTYPE(min_separation_angle) + + self.ngpus = int(ngpus) + self.rng_seed = int(rng_seed) + self.rng_offset = int(rng_offset) + + self.nSlines_old = [] + self.slines = [] + self.sline_lens = [] + + checkCudaErrors(driver.cuInit(0)) + avail = checkCudaErrors(runtime.cudaGetDeviceCount()) + if self.ngpus > avail: + raise RuntimeError(f"Requested {self.ngpus} GPUs but only {avail} available") + + logger.info("Creating GPUTracker with %d GPUs...", self.ngpus) + + self.dataf_pts = [] + self.H_pts = [] + self.R_pts = [] + self.delta_b_pts = [] + self.delta_q_pts = [] + self.b0s_mask_pts = [] + self.metric_map_pts = [] + self.sampling_matrix_pts = [] + self.sphere_vertices_pts = [] + self.sphere_edges_pts = [] + + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(ii)) + self.dataf_pts.append( # TODO: put this in texture memory? + checkCudaErrors(runtime.cudaMallocManaged( + REAL_SIZE*self.dataf.size, + runtime.cudaMemAttachGlobal))) + checkCudaErrors(runtime.cudaMemAdvise( + self.dataf_pts[ii], + REAL_SIZE*self.dataf.size, + runtime.cudaMemAdviseSetPreferredLocation, + ii)) + self.H_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.H.size))) + self.R_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.R.size))) + self.delta_b_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.delta_b.size))) + self.delta_q_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.delta_q.size))) + self.b0s_mask_pts.append( + checkCudaErrors(runtime.cudaMalloc( + np.int32().nbytes*self.b0s_mask.size))) + self.metric_map_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.metric_map.size))) + self.sampling_matrix_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.sampling_matrix.size))) + self.sphere_vertices_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.sphere_vertices.size))) + self.sphere_edges_pts.append( + checkCudaErrors(runtime.cudaMalloc( + np.int32().nbytes*self.sphere_edges.size))) + + checkCudaErrors(runtime.cudaMemcpy( + self.dataf_pts[ii], + self.dataf.ctypes.data, + REAL_SIZE*self.dataf.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.H_pts[ii], + self.H.ctypes.data, + REAL_SIZE*self.H.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.R_pts[ii], + self.R.ctypes.data, + REAL_SIZE*self.R.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.delta_b_pts[ii], + self.delta_b.ctypes.data, + REAL_SIZE*self.delta_b.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.delta_q_pts[ii], + self.delta_q.ctypes.data, + REAL_SIZE*self.delta_q.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.b0s_mask_pts[ii], + self.b0s_mask.ctypes.data, + np.int32().nbytes*self.b0s_mask.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.metric_map_pts[ii], + self.metric_map.ctypes.data, + REAL_SIZE*self.metric_map.size, + runtime.cudaMemcpyHostToDevice)) + 
checkCudaErrors(runtime.cudaMemcpy( + self.sampling_matrix_pts[ii], + self.sampling_matrix.ctypes.data, + REAL_SIZE*self.sampling_matrix.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.sphere_vertices_pts[ii], + self.sphere_vertices.ctypes.data, + REAL_SIZE*self.sphere_vertices.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.sphere_edges_pts[ii], + self.sphere_edges.ctypes.data, + np.int32().nbytes*self.sphere_edges.size, + runtime.cudaMemcpyHostToDevice)) + + self.streams = [] + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(ii)) + self.streams.append( + checkCudaErrors(runtime.cudaStreamCreateWithFlags( + runtime.cudaStreamNonBlocking))) + + def generate_streamlines(self, seeds): # TODO: location this is going should be these arguments + nseeds = len(seeds) + nseeds_per_gpu = (nseeds + self.ngpus - 1) // self.ngpus + + seeds_ptrs = [] + + for ii in range(self.ngpus): + nseeds_gpu = min(nseeds_per_gpu, max(0, nseeds - ii * nseeds_per_gpu)) + checkCudaErrors(runtime.cudaSetDevice(ii)) + seeds_ptrs.append(checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*3*nseeds_gpu))) + checkCudaErrors(runtime.cudaMemcpy( + seeds_ptrs[ii], + seeds[ii*nseeds_per_gpu:(ii+1)*nseeds_per_gpu].ctypes.data, + REAL_SIZE*3*nseeds_gpu, + runtime.cudaMemcpyHostToDevice)) + + nSlines = [0] * self.ngpus # TODO: figure out what this is doing + # TODO: + # // Call GPU routine + # generate_streamlines_cuda_mgpu(model_type_, max_angle_, min_signal_, tc_threshold_, step_size_, + # relative_peak_thresh_, min_separation_angle_, + # nseeds, seeds_d, + # dimx_, dimy_, dimz_, dimt_, + # dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d, + # sphere_vertices_d, sphere_edges_d, nedges_, + # slines_, slinesLen_, nSlines, nSlines_old_, rng_seed_, rng_offset_, ngpus_, + # streams_); + + self.nSlines_old = nSlines.copy() # TODO: figure out what this is doing + self.rng_offset += nseeds + + nSlines_total = 0 + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaFree(seeds_ptrs[ii])) + nSlines_total += nSlines[ii] + + + # TODO + # std::vector> slines_list; + # slines_list.reserve(nSlines_total); + # for (int n = 0; n < ngpus_; ++n) { + # for (int i = 0; i < nSlines[n]; ++i) { + # REAL* sl = new REAL[slinesLen_[n][i]*3]; + # std::memcpy(sl, slines_[n] + i*3*2*MAX_SLINE_LEN, slinesLen_[n][i]*3*sizeof(*sl)); + # auto sl_arr = py::array_t({slinesLen_[n][i], 3}, // shape + # {3*sizeof(REAL), sizeof(REAL)}, // strides + # sl, + # cleanup(sl)); + # slines_list.push_back(sl_arr); + # } + # } + + # return slines_list; + + # } \ No newline at end of file diff --git a/cuslines/globals.py b/cuslines/globals.py new file mode 100644 index 0000000..e69de29 diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index e627978..d546d60 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -33,11 +33,10 @@ import zipfile import numpy as np -import numpy.linalg as npl import dipy.reconst.dti as dti from dipy.io import read_bvals_bvecs -from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram +from dipy.io.stateful_tractogram import Space, StatefulTractogram from dipy.io.streamline import save_tractogram from dipy.tracking import utils from dipy.core.gradients import gradient_table, unique_bvals_magnitude From 3828b73956c7da8e04106ee67a07fb423d90860b Mon Sep 17 00:00:00 2001 From: 36000 Date: Thu, 1 Jan 2026 15:13:12 -0800 Subject: [PATCH 17/31] preparing to 
implement compilation --- cuslines/cuslines.py | 295 ------------------------------------------- cuslines/globals.py | 0 2 files changed, 295 deletions(-) delete mode 100644 cuslines/cuslines.py delete mode 100644 cuslines/globals.py diff --git a/cuslines/cuslines.py b/cuslines/cuslines.py deleted file mode 100644 index bb83e5e..0000000 --- a/cuslines/cuslines.py +++ /dev/null @@ -1,295 +0,0 @@ -from cuda.bindings import driver, nvrtc, runtime -# TODO: this would be better if only using CUDA core - -import numpy as np -import logging - -import re -import os - - -logger = logging.getLogger("GPUStreamlines") - - -# We extract REAL_DTYPE, MAX_SLINE_LEN from globals.h -# Maybe there is a more elegant way of doing this? -dir_path = os.path.dirname(os.path.abspath(__file__)) -globals_path = os.path.join(dir_path, "globals.h") -with open(globals_path, 'r') as f: - content = f.read() - -defines = dict(re.findall(r"#define\s+(\w+)\s+([^\s/]+)", content)) -REAL_SIZE = int(defines["REAL_SIZE"]) -if REAL_SIZE == 4: - REAL_DTYPE = np.float32 -elif REAL_SIZE == 8: - REAL_DTYPE = np.float64 -else: - raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") -MAX_SLINE_LEN = int(defines["MAX_SLINE_LEN"]) - - -def _cudaGetErrorEnum(error): - if isinstance(error, driver.CUresult): - err, name = driver.cuGetErrorName(error) - return name if err == driver.CUresult.CUDA_SUCCESS else "" - elif isinstance(error, nvrtc.nvrtcResult): - return nvrtc.nvrtcGetErrorString(error)[1] - else: - raise RuntimeError('Unknown error type: {}'.format(error)) - -def checkCudaErrors(result): - if result[0].value: - raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) - if len(result) == 1: - return None - elif len(result) == 2: - return result[1] - else: - return result[1:] - - -class GPUTracker: - def __init__( - self, - model_type: ModelType, - max_angle: float, - min_signal: float, - tc_threshold: float, - step_size: float, - relative_peak_thresh: float, - min_separation_angle: float, - dataf: np.ndarray, - H: np.ndarray, - R: np.ndarray, - delta_b: np.ndarray, - delta_q: np.ndarray, # TODO: some of these only needed for boot - b0s_mask: np.ndarray, - metric_map: np.ndarray, - sampling_matrix: np.ndarray, - sphere_vertices: np.ndarray, - sphere_edges: np.ndarray, - ngpus: int = 1, - rng_seed: int = 0, - rng_offset: int = 0, - ): - for name, arr, dt in [ - ("dataf", dataf, REAL_DTYPE), - ("H", H, REAL_DTYPE), - ("R", R, REAL_DTYPE), - ("delta_b", delta_b, REAL_DTYPE), - ("delta_q", delta_q, REAL_DTYPE), - ("b0s_mask", b0s_mask, np.int32), - ("metric_map", metric_map, REAL_DTYPE), - ("sampling_matrix", sampling_matrix, REAL_DTYPE), - ("sphere_vertices", sphere_vertices, REAL_DTYPE), - ("sphere_edges", sphere_edges, np.int32), - ]: - if arr.dtype != dt: - raise TypeError(f"{name} must have dtype {dt}, got {arr.dtype}") - if not arr.flags.c_contiguous: - raise ValueError(f"{name} must be C-contiguous") - - self.dataf = dataf - self.H = H - self.R = R - self.delta_b = delta_b - self.delta_q = delta_q - self.b0s_mask = b0s_mask - self.metric_map = metric_map - self.sampling_matrix = sampling_matrix - self.sphere_vertices = sphere_vertices - self.sphere_edges = sphere_edges - - self.dimx, self.dimy, self.dimz, self.dimt = dataf.shape - self.nedges = int(sphere_edges.shape[0]) - self.delta_nr = int(delta_b.shape[0]) - self.samplm_nr = int(sampling_matrix.shape[0]) - - self.model_type = int(model_type) - self.max_angle = REAL_DTYPE(max_angle) - self.min_signal = 
REAL_DTYPE(min_signal) - self.tc_threshold = REAL_DTYPE(tc_threshold) - self.step_size = REAL_DTYPE(step_size) - self.relative_peak_thresh = REAL_DTYPE(relative_peak_thresh) - self.min_separation_angle = REAL_DTYPE(min_separation_angle) - - self.ngpus = int(ngpus) - self.rng_seed = int(rng_seed) - self.rng_offset = int(rng_offset) - - self.nSlines_old = [] - self.slines = [] - self.sline_lens = [] - - checkCudaErrors(driver.cuInit(0)) - avail = checkCudaErrors(runtime.cudaGetDeviceCount()) - if self.ngpus > avail: - raise RuntimeError(f"Requested {self.ngpus} GPUs but only {avail} available") - - logger.info("Creating GPUTracker with %d GPUs...", self.ngpus) - - self.dataf_pts = [] - self.H_pts = [] - self.R_pts = [] - self.delta_b_pts = [] - self.delta_q_pts = [] - self.b0s_mask_pts = [] - self.metric_map_pts = [] - self.sampling_matrix_pts = [] - self.sphere_vertices_pts = [] - self.sphere_edges_pts = [] - - for ii in range(self.ngpus): - checkCudaErrors(runtime.cudaSetDevice(ii)) - self.dataf_pts.append( # TODO: put this in texture memory? - checkCudaErrors(runtime.cudaMallocManaged( - REAL_SIZE*self.dataf.size, - runtime.cudaMemAttachGlobal))) - checkCudaErrors(runtime.cudaMemAdvise( - self.dataf_pts[ii], - REAL_SIZE*self.dataf.size, - runtime.cudaMemAdviseSetPreferredLocation, - ii)) - self.H_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.H.size))) - self.R_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.R.size))) - self.delta_b_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.delta_b.size))) - self.delta_q_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.delta_q.size))) - self.b0s_mask_pts.append( - checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes*self.b0s_mask.size))) - self.metric_map_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.metric_map.size))) - self.sampling_matrix_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.sampling_matrix.size))) - self.sphere_vertices_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.sphere_vertices.size))) - self.sphere_edges_pts.append( - checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes*self.sphere_edges.size))) - - checkCudaErrors(runtime.cudaMemcpy( - self.dataf_pts[ii], - self.dataf.ctypes.data, - REAL_SIZE*self.dataf.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.H_pts[ii], - self.H.ctypes.data, - REAL_SIZE*self.H.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.R_pts[ii], - self.R.ctypes.data, - REAL_SIZE*self.R.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.delta_b_pts[ii], - self.delta_b.ctypes.data, - REAL_SIZE*self.delta_b.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.delta_q_pts[ii], - self.delta_q.ctypes.data, - REAL_SIZE*self.delta_q.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.b0s_mask_pts[ii], - self.b0s_mask.ctypes.data, - np.int32().nbytes*self.b0s_mask.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.metric_map_pts[ii], - self.metric_map.ctypes.data, - REAL_SIZE*self.metric_map.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.sampling_matrix_pts[ii], - self.sampling_matrix.ctypes.data, - REAL_SIZE*self.sampling_matrix.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - 
self.sphere_vertices_pts[ii], - self.sphere_vertices.ctypes.data, - REAL_SIZE*self.sphere_vertices.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.sphere_edges_pts[ii], - self.sphere_edges.ctypes.data, - np.int32().nbytes*self.sphere_edges.size, - runtime.cudaMemcpyHostToDevice)) - - self.streams = [] - for ii in range(self.ngpus): - checkCudaErrors(runtime.cudaSetDevice(ii)) - self.streams.append( - checkCudaErrors(runtime.cudaStreamCreateWithFlags( - runtime.cudaStreamNonBlocking))) - - def generate_streamlines(self, seeds): # TODO: location this is going should be these arguments - nseeds = len(seeds) - nseeds_per_gpu = (nseeds + self.ngpus - 1) // self.ngpus - - seeds_ptrs = [] - - for ii in range(self.ngpus): - nseeds_gpu = min(nseeds_per_gpu, max(0, nseeds - ii * nseeds_per_gpu)) - checkCudaErrors(runtime.cudaSetDevice(ii)) - seeds_ptrs.append(checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*3*nseeds_gpu))) - checkCudaErrors(runtime.cudaMemcpy( - seeds_ptrs[ii], - seeds[ii*nseeds_per_gpu:(ii+1)*nseeds_per_gpu].ctypes.data, - REAL_SIZE*3*nseeds_gpu, - runtime.cudaMemcpyHostToDevice)) - - nSlines = [0] * self.ngpus # TODO: figure out what this is doing - # TODO: - # // Call GPU routine - # generate_streamlines_cuda_mgpu(model_type_, max_angle_, min_signal_, tc_threshold_, step_size_, - # relative_peak_thresh_, min_separation_angle_, - # nseeds, seeds_d, - # dimx_, dimy_, dimz_, dimt_, - # dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d, - # sphere_vertices_d, sphere_edges_d, nedges_, - # slines_, slinesLen_, nSlines, nSlines_old_, rng_seed_, rng_offset_, ngpus_, - # streams_); - - self.nSlines_old = nSlines.copy() # TODO: figure out what this is doing - self.rng_offset += nseeds - - nSlines_total = 0 - for ii in range(self.ngpus): - checkCudaErrors(runtime.cudaFree(seeds_ptrs[ii])) - nSlines_total += nSlines[ii] - - - # TODO - # std::vector> slines_list; - # slines_list.reserve(nSlines_total); - # for (int n = 0; n < ngpus_; ++n) { - # for (int i = 0; i < nSlines[n]; ++i) { - # REAL* sl = new REAL[slinesLen_[n][i]*3]; - # std::memcpy(sl, slines_[n] + i*3*2*MAX_SLINE_LEN, slinesLen_[n][i]*3*sizeof(*sl)); - # auto sl_arr = py::array_t({slinesLen_[n][i], 3}, // shape - # {3*sizeof(REAL), sizeof(REAL)}, // strides - # sl, - # cleanup(sl)); - # slines_list.push_back(sl_arr); - # } - # } - - # return slines_list; - - # } \ No newline at end of file diff --git a/cuslines/globals.py b/cuslines/globals.py deleted file mode 100644 index e69de29..0000000 From 7f77687dc9339f99a0232758949a37a2dece7427 Mon Sep 17 00:00:00 2001 From: 36000 Date: Thu, 1 Jan 2026 15:13:57 -0800 Subject: [PATCH 18/31] new folder --- cuslines/cu_direction_getters.py | 308 +++++++++++++++++++++++++++++++ cuslines/cu_propagate_seeds.py | 218 ++++++++++++++++++++++ cuslines/cu_tractography.py | 179 ++++++++++++++++++ cuslines/cutils.py | 65 +++++++ 4 files changed, 770 insertions(+) create mode 100644 cuslines/cu_direction_getters.py create mode 100644 cuslines/cu_propagate_seeds.py create mode 100644 cuslines/cu_tractography.py create mode 100644 cuslines/cutils.py diff --git a/cuslines/cu_direction_getters.py b/cuslines/cu_direction_getters.py new file mode 100644 index 0000000..cbf2959 --- /dev/null +++ b/cuslines/cu_direction_getters.py @@ -0,0 +1,308 @@ +import numpy as np +from abc import ABC, abstractmethod +import logging +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch + +from cuda.bindings 
import runtime +from cuda.core import Device + +from cutils import ( + REAL_SIZE, + REAL_DTYPE, + checkCudaErrors, +) + + +__all__ = [ + "ProbDirectionGetter", + "PTTDirectionGetter", + "BootDirectionGetter" +] + + +logger = logging.getLogger("GPUStreamlines") + + +_program = None + + +def _compile_program(debug=False): # TODO: compile kernels individually as needed + if _program is None: + logger.info("Compiling GPUStreamlines") + dev = Device() + dev.set_current() + + if debug: + comp_kwargs = { + "debug": True, + "lineinfo": True, + "device_code_optimize": True, + "ptxas_options": ["-v", "-O0"] + } + else: + comp_kwargs = {"ptxas_options": ["-O3"]} + program_options = ProgramOptions( # include_path maybe needed here? + name="GPUStreamlines", + arch=f"sm_{dev.arch}", + use_fast_math=True, + extra_device_vectorization=True, + std="c++11", + **comp_kwargs + ) + prog = Program(code, code_type="c++", options=program_options) + _program = prog.compile("cubin", name_expressions=("vector_add",)) + + +class _GPUDirectionGetter(ABC): + @abstractmethod + def get_direction(self): + pass + + @abstractmethod + def get_num_streamlines(self): + pass + + @abstractmethod + def allocate_on_gpu(self): + pass + + @abstractmethod + def deallocate_on_gpu(self): + pass + + +class BootDirectionGetter(_GPUDirectionGetter): + def __init__( # TODO: Maybe accept a dipy thing and extract arrays here? maybe as a from_ function? + self, + min_signal: float, + H: np.ndarray, + R: np.ndarray, + delta_b: np.ndarray, + delta_q: np.ndarray, + sampling_matrix: np.ndarray, + b0s_mask: np.ndarray): + for name, arr, dt in [ + ("H", H, REAL_DTYPE), + ("R", R, REAL_DTYPE), + ("delta_b", delta_b, REAL_DTYPE), + ("delta_q", delta_q, REAL_DTYPE), + ("b0s_mask", b0s_mask, np.int32), + ("sampling_matrix", sampling_matrix, REAL_DTYPE)]: + if arr.dtype != dt: + raise TypeError(f"{name} must have dtype {dt}, got {arr.dtype}") + if not arr.flags.c_contiguous: + raise ValueError(f"{name} must be C-contiguous") + + self.H = H + self.R = R + self.delta_b = delta_b + self.delta_q = delta_q + self.delta_nr = int(delta_b.shape[0]) + self.min_signal = REAL_DTYPE(min_signal) + self.sampling_matrix = sampling_matrix + + self.H_d = [] + self.R_d = [] + self.delta_b_d = [] + self.delta_q_d = [] + self.b0s_mask_d = [] + self.sampling_matrix_d = [] + + def allocate_on_gpu(self, n): + self.H_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.H.size))) + self.R_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.R.size))) + self.delta_b_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.delta_b.size))) + self.delta_q_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.delta_q.size))) + self.b0s_mask_d.append( + checkCudaErrors(runtime.cudaMalloc( + np.int32().nbytes*self.b0s_mask.size))) + self.sampling_matrix_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.sampling_matrix.size))) + + checkCudaErrors(runtime.cudaMemcpy( + self.H_d[n], + self.H.ctypes.data, + REAL_SIZE*self.H.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.R_d[n], + self.R.ctypes.data, + REAL_SIZE*self.R.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.delta_b_d[n], + self.delta_b.ctypes.data, + REAL_SIZE*self.delta_b.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.delta_q_d[n], + self.delta_q.ctypes.data, + REAL_SIZE*self.delta_q.size, + runtime.cudaMemcpyHostToDevice)) + 
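+        # (Editorial sketch, not part of the original patch.) Every array in
+        # this method is staged the same way: cudaMalloc, then a blocking
+        # cudaMemcpy from the NumPy buffer. A hypothetical helper like
+        # `_to_device` below would collapse the repetition; arr.nbytes equals
+        # REAL_SIZE*arr.size for REAL_DTYPE arrays, so the explicit size
+        # arithmetic used in this method is equivalent.
+        #
+        #     def _to_device(arr):
+        #         ptr = checkCudaErrors(runtime.cudaMalloc(arr.nbytes))
+        #         checkCudaErrors(runtime.cudaMemcpy(
+        #             ptr, arr.ctypes.data, arr.nbytes,
+        #             runtime.cudaMemcpyHostToDevice))
+        #         return ptr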
checkCudaErrors(runtime.cudaMemcpy( + self.b0s_mask_d[n], + self.b0s_mask.ctypes.data, + np.int32().nbytes*self.b0s_mask.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.sampling_matrix_d[n], + self.sampling_matrix.ctypes.data, + REAL_SIZE*self.sampling_matrix.size, + runtime.cudaMemcpyHostToDevice)) + + def deallocate_on_gpu(self, n): + if self.H_d[n]: + checkCudaErrors(runtime.cudaFree(self.H_d[n])) + if self.R_d[n]: + checkCudaErrors(runtime.cudaFree(self.R_d[n])) + if self.delta_b_d[n]: + checkCudaErrors(runtime.cudaFree(self.delta_b_d[n])) + if self.delta_q_d[n]: + checkCudaErrors(runtime.cudaFree(self.delta_q_d[n])) + if self.b0s_mask_d[n]: + checkCudaErrors(runtime.cudaFree(self.b0s_mask_d[n])) + if self.sampling_matrix_d[n]: + checkCudaErrors(runtime.cudaFree(self.sampling_matrix_d[n])) + + def getNumStreamlines(self): + pass + + def generateStreamlines(self): + pass + + + + +// Precompute number of streamlines before allocating memory +if (!((model_type == PTT) || (model_type == PROB))) { + shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d + sizeof(int)*samplm_nr; // for peak_directions_d + getNumStreamlinesBoot_k + <<>>( + model_type, + max_angle, + min_signal, + relative_peak_thresh, + min_separation_angle, + rng_seed, + nseeds_gpu, + reinterpret_cast(seeds_d[n]), + dimx, + dimy, + dimz, + dimt, + dataf_d[n], + H_d[n], + R_d[n], + delta_nr, + delta_b_d[n], + delta_q_d[n], + b0s_mask_d[n], + samplm_nr, + sampling_matrix_d[n], + reinterpret_cast(sphere_vertices_d[n]), + reinterpret_cast(sphere_edges_d[n]), + nedges, + shDirTemp0_d[n], + slinesOffs_d[n]); +} else { + shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt; + getNumStreamlinesProb_k + <<>>( + max_angle, + relative_peak_thresh, + min_separation_angle, + rng_seed, + nseeds_gpu, + reinterpret_cast(seeds_d[n]), + dimx, + dimy, + dimz, + dimt, + dataf_d[n], + reinterpret_cast(sphere_vertices_d[n]), + reinterpret_cast(sphere_edges_d[n]), + nedges, + shDirTemp0_d[n], + slinesOffs_d[n]); +} + + + //#pragma omp parallel for + for (int n = 0; n < ngpus; ++n) { + CHECK_CUDA(cudaSetDevice(n)); + int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); + if (nseeds_gpu == 0) continue; + dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); + dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); +#if 0 + std::cerr << "GPU " << n << ": "; + std::cerr << "Generating " << nSlines_h[n] << " streamlines (from " << nseeds_gpu << " seeds)" << std::endl; +#endif + + //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y); + switch(model_type) { + case OPDT: + genStreamlinesMerge_k <<>>( + max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), + dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], + b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], + reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), + nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + break; + + case CSA: + genStreamlinesMerge_k <<>>( + max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), + dimx, dimy, dimz, dimt, dataf_d[n], 
H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], + b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], + reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), + nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + break; + + case PROB: + // Shared memory requirements are smaller for probabilistic for main run + // than for preliminary run + shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt; + genStreamlinesMerge_k <<>>( + max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), + dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], + b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], + reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), + nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + break; + + case PTT: + shSizeGNS = 0; // PTT uses exclusively static shared memory + genStreamlinesMerge_k <<>>( + max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), + dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], + b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], + reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), + nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + break; + + default: + printf("FATAL: Invalid Model Type.\n"); + break; + } + + CHECK_ERROR("genStreamlinesMerge_k"); + } + + diff --git a/cuslines/cu_propagate_seeds.py b/cuslines/cu_propagate_seeds.py new file mode 100644 index 0000000..a334da6 --- /dev/null +++ b/cuslines/cu_propagate_seeds.py @@ -0,0 +1,218 @@ +import numpy as np +import ctypes +from cuda.bindings import runtime +from nibabel.streamlines.array_sequence import ArraySequence +import logging + +from cutils import ( + REAL_SIZE, + REAL_DTYPE, + REAL3_DTYPE, + MAX_SLINE_LEN, + EXCESS_ALLOC_FACT, + THR_X_SL, + THR_X_BL, + div_up, + checkCudaErrors, +) + + +logger = logging.getLogger("GPUStreamlines") + + +class SeedBatchPropagator: + def __init__( + self, + gpu_tracker): + self.gpu_tracker = gpu_tracker + + self.nSlines_old = np.zeros(self.ngpus, dtype=np.int32) + self.nSlines = np.zeros(self.ngpus, dtype=np.int32) + self.slines = np.zeros(self.ngpus, dtype=ctypes.c_void_p) + self.sline_lens = np.zeros(self.ngpus, dtype=ctypes.c_void_p) + + self.seeds_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + self.slineSeed_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + self.slinesOffs_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + self.shDirTemp0_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + self.slineLen_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + self.sline_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + + def _switch_device(self, n): + checkCudaErrors(runtime.cudaSetDevice(n)) + + nseeds_gpu = min( + self.nseeds_per_gpu, max(0, self.nseeds - n * self.nseeds_per_gpu)) + block = (THR_X_SL, THR_X_BL//THR_X_SL, 1) + grid = (div_up(nseeds_gpu, THR_X_BL//THR_X_SL), 1, 1) + + return nseeds_gpu, block, grid + + def _get_sl_buffer_size(self, n): + return REAL_SIZE*2*3*MAX_SLINE_LEN*self.nSlines[n] + + def _allocate_seed_memory(self): + # Move seeds to GPU + for ii in range(self.ngpus): + nseeds_gpu, _, _ = 
self._switch_device(ii) + self.seeds_d[ii] = checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*3*nseeds_gpu)) + checkCudaErrors(runtime.cudaMemcpy( + self.seeds_d[ii], + self.seeds[ii*self.nseeds_per_gpu:(ii+1)*self.nseeds_per_gpu].ctypes.data, + REAL_SIZE*3*nseeds_gpu, + runtime.cudaMemcpyHostToDevice)) + + for ii in range(self.ngpus): + nseeds_gpu, block, grid = self._switch_device(ii) + # Streamline offsets + self.slinesOffs_d[ii] = checkCudaErrors(runtime.cudaMalloc( + np.uint64().nbytes * (nseeds_gpu + 1))) + # Initial directions from each seed + self.shDirTemp0_d[ii] = checkCudaErrors(runtime.cudaMalloc( + REAL3_DTYPE.nbytes * self.samplm_nr * grid[0] * block[1])) + + def _cumsum_offsets(self): + for ii in range(self.ngpus): + nseeds_gpu, _, _ = self._switch_device(ii) + if (nseeds_gpu == 0): + self.nSlines[ii] = 0 + continue + + slinesOffs_h = np.empty(nseeds_gpu + 1, dtype=np.int32) + checkCudaErrors(runtime.cudaMemcpy( + slinesOffs_h.ctypes.data, + self.slinesOffs_d[ii], + slinesOffs_h.nbytes * (nseeds_gpu + 1), + runtime.cudaMemcpyDeviceToHost)) + + slinesOffs_h = np.concatenate(( + [0], np.cumsum(slinesOffs_h[:-1], dtype=slinesOffs_h.dtype))) + self.nSlines[ii] = int(slinesOffs_h[-1]) + + checkCudaErrors(runtime.cudaMemcpy( + self.slinesOffs_d[ii], + slinesOffs_h.ctypes.data, + self.slinesOffs_d.size * (nseeds_gpu + 1), + runtime.cudaMemcpyHostToDevice)) + + def _allocate_tracking_memory(self): + for ii in range(self.ngpus): + self._switch_device(ii) + + self.slineSeed_d[ii] = checkCudaErrors(runtime.cudaMalloc( + self.nSlines[ii] * np.int32().nbytes)) + checkCudaErrors(runtime.cudaMemset( + self.slineSeed_d[ii], + -1, + self.nSlines[ii] * np.int32().nbytes)) + + if self.nSlines[ii] > EXCESS_ALLOC_FACT*self.nSlines_old[ii]: + if self.slines[ii]: + checkCudaErrors(runtime.cudaFreeHost( + self.slines[ii])) + if self.sline_lens[ii]: + checkCudaErrors(runtime.cudaFreeHost( + self.sline_lens[ii])) + self.slines[ii] = 0 # Nullptr + self.sline_lens[ii] = 0 # Nullptr + + buffer_size = self._get_sl_buffer_size(ii) + logger.debug(f"Streamline buffer size: {buffer_size}") + + if not self.slines[ii]: + self.slines[ii] = checkCudaErrors(runtime.cudaMallocHost( + buffer_size)) + if not self.slines_lens[ii]: + self.slines_lens[ii] = checkCudaErrors(runtime.cudaMallocHost( + np.int32().nbytes*EXCESS_ALLOC_FACT*self.nSlines[ii])) + + for ii in range(self.ngpus): + self._switch_device(ii) + buffer_size = self._get_sl_buffer_size(ii) + + self.slineLen_d[ii] = checkCudaErrors(runtime.cudaMalloc( + np.int32().nbytes * self.nSlines[ii])) + self.sline_d[ii] = checkCudaErrors(runtime.cudaMalloc( + buffer_size)) + + def _cleanup(self): + for ii in range(self.ngpus): + self._switch_device(ii) + checkCudaErrors(runtime.cudaMemcpyAsync( + self.slines[ii], + self.sline_d[ii], + self._get_sl_buffer_size(ii), + runtime.cudaMemcpyDeviceToHost, + self.gpu_tracker.streams[ii])) + checkCudaErrors(runtime.cudaMemcpyAsync( + self.sline_lens[ii], + self.slineLen_d[ii], + np.int32().nbytes*self.nSlines[ii], + runtime.cudaMemcpyDeviceToHost, + self.gpu_tracker.streams[ii])) + + for ii in range(self.ngpus): + self._switch_device(ii) + checkCudaErrors(runtime.cudaStreamSynchronize( + self.gpu_tracker.streams[ii])) + checkCudaErrors(runtime.cudaFree(self.seeds_d[ii])) + checkCudaErrors(runtime.cudaFree(self.slineSeed_d[ii])) + checkCudaErrors(runtime.cudaFree(self.slinesOffs_d[ii])) + checkCudaErrors(runtime.cudaFree(self.shDirTemp0_d[ii])) + checkCudaErrors(runtime.cudaFree(self.slineLen_d[ii])) + 
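+            # (Editorial note.) Freeing these buffers here is safe: the
+            # cudaStreamSynchronize call above guarantees that the
+            # asynchronous device-to-host copies of sline_d and slineLen_d
+            # issued in the previous loop have completed before the device
+            # memory is released.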
checkCudaErrors(runtime.cudaFree(self.sline_d[ii])) + + self.nSlines_old = self.nSlines.copy() + self.rng_offset += self.nseeds + + def propagate(self, seeds): + self.seeds = seeds + self.nseeds = len(seeds) + self.nseeds_per_gpu = (self.nseeds + self.gpu_tracker.ngpus - 1) // self.gpu_tracker.ngpus + + self._seeds_to_gpu() + self._allocate_seed_memory() + + for ii in range(self.ngpus): + nseeds_gpu, block, grid = self._switch_device(ii) + if (nseeds_gpu == 0): + continue + + getNumStreamlines() # TODO: these will each be classes you can pass in + + self._cumsum_offsets() + self._allocate_tracking_memory() + + for ii in range(self.ngpus): + nseeds_gpu, block, grid = self._switch_device(ii) + if (nseeds_gpu == 0): + continue + + mergeStreamlines() # TODO + + self._cleanup() + + def as_array_sequence(self): # TODO: optimize memory usage here? also, direct to trx? + buffer_size = 0 + for ii in range(self.ngpus): + lens = self.sline_lens[ii] + for jj in range(self.nSlines[ii]): + buffer_size += lens[jj] * 3 * REAL_SIZE + + def _yield_slines(): + for ii in range(self.ngpus): + this_sls = self.slines[ii] + this_len = self.sline_lens[ii] + + for jj in range(self.nSlines[ii]): + npts = this_len[jj] + offset = jj * 3 * 2 * MAX_SLINE_LEN + + sl = np.asarray( + this_sls[offset : offset + npts * 3], + dtype=REAL_DTYPE) + sl = sl.reshape((npts, 3)) + yield sl + + return ArraySequence(_yield_slines, buffer_size) diff --git a/cuslines/cu_tractography.py b/cuslines/cu_tractography.py new file mode 100644 index 0000000..acfcc96 --- /dev/null +++ b/cuslines/cu_tractography.py @@ -0,0 +1,179 @@ +from cuda.bindings import driver, runtime +# TODO: consider cuda core over cuda bindings + +import numpy as np +import logging + +from cutils import ( + REAL_SIZE, + REAL_DTYPE, + checkCudaErrors, +) +from cu_direction_getters import ( + GPUDirectionGetter, + BootDirectionGetter +) +from cu_propagate_seeds import SeedBatchPropagator + + +logger = logging.getLogger("GPUStreamlines") + +# TODO: we need to organize this package into folders, then make it pip installable. 
+# but should merge in PTT FIRST +class GPUTracker: # TODO: bring in pyAFQ prep stuff + def __init__( + self, + dg: GPUDirectionGetter, + max_angle: float, + tc_threshold: float, + step_size: float, + relative_peak_thresh: float, + min_separation_angle: float, + dataf: np.ndarray, # TODO: reasonable defaults for floats, reorganize order, better names, documentation + metric_map: np.ndarray, + sphere_vertices: np.ndarray, + sphere_edges: np.ndarray, + ngpus: int = 1, + rng_seed: int = 0, + rng_offset: int = 0, + ): + for name, arr, dt in [ + ("dataf", dataf, REAL_DTYPE), + ("metric_map", metric_map, REAL_DTYPE), + ("sphere_vertices", sphere_vertices, REAL_DTYPE), + ("sphere_edges", sphere_edges, np.int32), + ]: + if arr.dtype != dt: + raise TypeError(f"{name} must have dtype {dt}, got {arr.dtype}") + if not arr.flags.c_contiguous: + raise ValueError(f"{name} must be C-contiguous") + + self.dataf = dataf + self.metric_map = metric_map + self.sphere_vertices = sphere_vertices + self.sphere_edges = sphere_edges + + self.dimx, self.dimy, self.dimz, self.dimt = dataf.shape + self.nedges = int(sphere_edges.shape[0]) + if isinstance(dg, BootDirectionGetter): + self.samplm_nr = int(dg.sampling_matrix.shape[0]) + else: + self.samplm_nr = self.dimt + + self.dg = dg + self.max_angle = REAL_DTYPE(max_angle) + self.tc_threshold = REAL_DTYPE(tc_threshold) + self.step_size = REAL_DTYPE(step_size) + self.relative_peak_thresh = REAL_DTYPE(relative_peak_thresh) + self.min_separation_angle = REAL_DTYPE(min_separation_angle) + + self.ngpus = int(ngpus) + self.rng_seed = int(rng_seed) + self.rng_offset = int(rng_offset) + + checkCudaErrors(driver.cuInit(0)) + avail = checkCudaErrors(runtime.cudaGetDeviceCount()) + if self.ngpus > avail: + raise RuntimeError(f"Requested {self.ngpus} GPUs but only {avail} available") + + logger.info("Creating GPUTracker with %d GPUs...", self.ngpus) + + self.dataf_d = [] + self.metric_map_d = [] + self.sphere_vertices_d = [] + self.sphere_edges_d = [] + + self.seed_propagator = SeedBatchPropagator( + gpu_tracker=self) + self._allocated = False + + def __enter__(self): + self._allocate() + return self + + def _allocate(self): + if self._allocated: + return + + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(ii)) + self.dataf_d.append( # TODO: put this in texture memory? 
+ checkCudaErrors(runtime.cudaMallocManaged( # TODO: look at cuda core managed memory + REAL_SIZE*self.dataf.size, + runtime.cudaMemAttachGlobal))) + checkCudaErrors(runtime.cudaMemAdvise( + self.dataf_d[ii], + REAL_SIZE*self.dataf.size, + runtime.cudaMemAdviseSetPreferredLocation, + ii)) + self.metric_map_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.metric_map.size))) + self.sphere_vertices_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.sphere_vertices.size))) + self.sphere_edges_d.append( + checkCudaErrors(runtime.cudaMalloc( + np.int32().nbytes*self.sphere_edges.size))) + + checkCudaErrors(runtime.cudaMemcpy( + self.dataf_d[ii], + self.dataf.ctypes.data, + REAL_SIZE*self.dataf.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.metric_map_d[ii], + self.metric_map.ctypes.data, + REAL_SIZE*self.metric_map.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.sphere_vertices_d[ii], + self.sphere_vertices.ctypes.data, + REAL_SIZE*self.sphere_vertices.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.sphere_edges_d[ii], + self.sphere_edges.ctypes.data, + np.int32().nbytes*self.sphere_edges.size, + runtime.cudaMemcpyHostToDevice)) + + self.dg.allocate_on_gpu(ii) + + self.streams = [] + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(ii)) + self.streams.append( + checkCudaErrors(runtime.cudaStreamCreateWithFlags( + runtime.cudaStreamNonBlocking))) + + self._allocated = True + + def __exit__(self, exc_type, exc, tb): + logger.info("Destroying GPUTracker and freeing GPU memory...") + + for n in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(n)) + if self.dataf_d[n]: + checkCudaErrors(runtime.cudaFree(self.dataf_d[n])) + if self.metric_map_d[n]: + checkCudaErrors(runtime.cudaFree(self.metric_map_d[n])) + if self.sphere_vertices_d[n]: + checkCudaErrors(runtime.cudaFree(self.sphere_vertices_d[n])) + if self.sphere_edges_d[n]: + checkCudaErrors(runtime.cudaFree(self.sphere_edges_d[n])) + + if self.seed_propagator.sline_lens[n]: + checkCudaErrors(runtime.cudaFreeHost( + self.seed_propagator.sline_lens[n])) + if self.seed_propagator.slines[n]: + checkCudaErrors(runtime.cudaFreeHost( + self.seed_propagator.slines[n])) + + self.dg.deallocate_on_gpu(n) + + checkCudaErrors(runtime.cudaStreamDestroy(self.streams[n])) + return False + + def generate_streamlines(self, seeds): + self.seed_propagator.propagate(seeds) + return self.seed_propagator.as_array_sequence() diff --git a/cuslines/cutils.py b/cuslines/cutils.py new file mode 100644 index 0000000..4d75847 --- /dev/null +++ b/cuslines/cutils.py @@ -0,0 +1,65 @@ +from cuda.bindings import driver, nvrtc + +import re +import os +import numpy as np + +from enum import IntEnum + + +class ModelType(IntEnum): + OPDT = 0 + CSA = 1 + PROB = 2 + PTT = 3 + + +# We extract REAL_DTYPE, MAX_SLINE_LEN from globals.h +# Maybe there is a more elegant way of doing this? 
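+# (Editorial example.) A globals.h line such as
+#     #define REAL_SIZE 4 // bytes per REAL
+# is captured by the regex below as ("REAL_SIZE", "4"): \w+ takes the macro
+# name, and [^\s/]+ takes the value but stops before whitespace or a
+# trailing // comment, so inline comments are not swallowed into the value.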
+dir_path = os.path.dirname(os.path.abspath(__file__)) +globals_path = os.path.join(dir_path, "globals.h") +with open(globals_path, 'r') as f: + content = f.read() + +defines = dict(re.findall(r"#define\s+(\w+)\s+([^\s/]+)", content)) +REAL_SIZE = int(defines["REAL_SIZE"]) +REAL3_SIZE = 3 * REAL_SIZE +if REAL_SIZE == 4: + REAL_DTYPE = np.float32 + REAL3_DTYPE = np.dtype([('x', np.float32), + ('y', np.float32), + ('z', np.float32)]) +elif REAL_SIZE == 8: + REAL_DTYPE = np.float64 + REAL3_DTYPE = np.dtype([('x', np.float64), + ('y', np.float64), + ('z', np.float64)]) +else: + raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") +MAX_SLINE_LEN = int(defines["MAX_SLINE_LEN"]) +THR_X_SL = int(defines["THR_X_SL"]) +THR_X_BL = int(defines["THR_X_BL"]) +EXCESS_ALLOC_FACT = int(defines["EXCESS_ALLOC_FACT"]) + + +def _cudaGetErrorEnum(error): + if isinstance(error, driver.CUresult): + err, name = driver.cuGetErrorName(error) + return name if err == driver.CUresult.CUDA_SUCCESS else "" + elif isinstance(error, nvrtc.nvrtcResult): + return nvrtc.nvrtcGetErrorString(error)[1] + else: + raise RuntimeError('Unknown error type: {}'.format(error)) + +def checkCudaErrors(result): + if result[0].value: + raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) + if len(result) == 1: + return None + elif len(result) == 2: + return result[1] + else: + return result[1:] + +def div_up(a, b): + return (a + b - 1) // b From 544445b99b6a3182ab176c29316b2d871b5b18b9 Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 09:54:16 -0800 Subject: [PATCH 19/31] first draft of cuda python GPUstreamlines; currently broken --- cuslines/Makefile | 6 +- cuslines/__init__.py | 13 + cuslines/cu_direction_getters.py | 308 -------------- cuslines/cuda_python/__init__.py | 13 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 385 bytes .../__pycache__/_globals.cpython-312.pyc | Bin 0 -> 389 bytes .../cu_direction_getters.cpython-312.pyc | Bin 0 -> 22688 bytes .../cu_propagate_seeds.cpython-312.pyc | Bin 0 -> 14574 bytes .../cu_tractography.cpython-312.pyc | Bin 0 -> 9441 bytes .../__pycache__/cutils.cpython-312.pyc | Bin 0 -> 2922 bytes cuslines/cuda_python/_globals.py | 10 + cuslines/cuda_python/cu_direction_getters.py | 381 ++++++++++++++++++ .../{ => cuda_python}/cu_propagate_seeds.py | 103 ++--- cuslines/{ => cuda_python}/cu_tractography.py | 97 ++--- cuslines/{ => cuda_python}/cutils.py | 28 +- cuslines/cuwsort.cuh | 19 +- cuslines/generate_streamlines_cuda.cu | 153 ++++--- cuslines/globals.h | 33 +- cuslines/ptt.cu | 2 +- pyproject.toml | 14 +- run_gpu_streamlines.py | 2 +- setup.py | 49 +++ 22 files changed, 704 insertions(+), 527 deletions(-) delete mode 100644 cuslines/cu_direction_getters.py create mode 100644 cuslines/cuda_python/__init__.py create mode 100644 cuslines/cuda_python/__pycache__/__init__.cpython-312.pyc create mode 100644 cuslines/cuda_python/__pycache__/_globals.cpython-312.pyc create mode 100644 cuslines/cuda_python/__pycache__/cu_direction_getters.cpython-312.pyc create mode 100644 cuslines/cuda_python/__pycache__/cu_propagate_seeds.cpython-312.pyc create mode 100644 cuslines/cuda_python/__pycache__/cu_tractography.cpython-312.pyc create mode 100644 cuslines/cuda_python/__pycache__/cutils.cpython-312.pyc create mode 100644 cuslines/cuda_python/_globals.py create mode 100644 cuslines/cuda_python/cu_direction_getters.py rename cuslines/{ => cuda_python}/cu_propagate_seeds.py (69%) rename cuslines/{ => cuda_python}/cu_tractography.py 
(69%) rename cuslines/{ => cuda_python}/cutils.py (70%) create mode 100644 setup.py diff --git a/cuslines/Makefile b/cuslines/Makefile index c8fe6c7..8fd8528 100644 --- a/cuslines/Makefile +++ b/cuslines/Makefile @@ -31,14 +31,14 @@ CUDACC=$(CUDA_HOME)/bin/nvcc # -G -g -dopt=on CXX=g++ LD=g++ -CXXFLAGS= -c -O3 -std=c++11 -fopenmp -fPIC `python3 -m pybind11 --includes` -I$(CUDA_HOME)/include +CXXFLAGS= -c -O3 -std=c++17 -fopenmp -fPIC `python3 -m pybind11 --includes` -I$(CUDA_HOME)/include -SMS ?= 70 +SMS ?= 75 80 CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM)) LASTSM := $(lastword $(sort $(SMS))) CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM) -COMMON_FLAGS = -c -std=c++11 -Xcompiler -fPIC --use_fast_math -Xcompiler=-fopenmp $(CUDA_ARCH) +COMMON_FLAGS = -c -std=c++17 -Xcompiler -fPIC --use_fast_math -Xcompiler=-fopenmp $(CUDA_ARCH) RELEASE_FLAGS = -O3 -Xptxas=-O3 DEBUG_FLAGS = -O0 -Xptxas=-v -g -G -lineinfo CUDACFLAGS = $(COMMON_FLAGS) $(RELEASE_FLAGS) diff --git a/cuslines/__init__.py b/cuslines/__init__.py index e69de29..b96cca1 100644 --- a/cuslines/__init__.py +++ b/cuslines/__init__.py @@ -0,0 +1,13 @@ +from .cuda_python import ( + GPUTracker, + ProbDirectionGetter, + PttDirectionGetter, + BootDirectionGetter +) + +__all__ = [ + "GPUTracker", + "ProbDirectionGetter", + "PttDirectionGetter", + "BootDirectionGetter" +] diff --git a/cuslines/cu_direction_getters.py b/cuslines/cu_direction_getters.py deleted file mode 100644 index cbf2959..0000000 --- a/cuslines/cu_direction_getters.py +++ /dev/null @@ -1,308 +0,0 @@ -import numpy as np -from abc import ABC, abstractmethod -import logging -from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch - -from cuda.bindings import runtime -from cuda.core import Device - -from cutils import ( - REAL_SIZE, - REAL_DTYPE, - checkCudaErrors, -) - - -__all__ = [ - "ProbDirectionGetter", - "PTTDirectionGetter", - "BootDirectionGetter" -] - - -logger = logging.getLogger("GPUStreamlines") - - -_program = None - - -def _compile_program(debug=False): # TODO: compile kernels individually as needed - if _program is None: - logger.info("Compiling GPUStreamlines") - dev = Device() - dev.set_current() - - if debug: - comp_kwargs = { - "debug": True, - "lineinfo": True, - "device_code_optimize": True, - "ptxas_options": ["-v", "-O0"] - } - else: - comp_kwargs = {"ptxas_options": ["-O3"]} - program_options = ProgramOptions( # include_path maybe needed here? - name="GPUStreamlines", - arch=f"sm_{dev.arch}", - use_fast_math=True, - extra_device_vectorization=True, - std="c++11", - **comp_kwargs - ) - prog = Program(code, code_type="c++", options=program_options) - _program = prog.compile("cubin", name_expressions=("vector_add",)) - - -class _GPUDirectionGetter(ABC): - @abstractmethod - def get_direction(self): - pass - - @abstractmethod - def get_num_streamlines(self): - pass - - @abstractmethod - def allocate_on_gpu(self): - pass - - @abstractmethod - def deallocate_on_gpu(self): - pass - - -class BootDirectionGetter(_GPUDirectionGetter): - def __init__( # TODO: Maybe accept a dipy thing and extract arrays here? maybe as a from_ function? 
- self, - min_signal: float, - H: np.ndarray, - R: np.ndarray, - delta_b: np.ndarray, - delta_q: np.ndarray, - sampling_matrix: np.ndarray, - b0s_mask: np.ndarray): - for name, arr, dt in [ - ("H", H, REAL_DTYPE), - ("R", R, REAL_DTYPE), - ("delta_b", delta_b, REAL_DTYPE), - ("delta_q", delta_q, REAL_DTYPE), - ("b0s_mask", b0s_mask, np.int32), - ("sampling_matrix", sampling_matrix, REAL_DTYPE)]: - if arr.dtype != dt: - raise TypeError(f"{name} must have dtype {dt}, got {arr.dtype}") - if not arr.flags.c_contiguous: - raise ValueError(f"{name} must be C-contiguous") - - self.H = H - self.R = R - self.delta_b = delta_b - self.delta_q = delta_q - self.delta_nr = int(delta_b.shape[0]) - self.min_signal = REAL_DTYPE(min_signal) - self.sampling_matrix = sampling_matrix - - self.H_d = [] - self.R_d = [] - self.delta_b_d = [] - self.delta_q_d = [] - self.b0s_mask_d = [] - self.sampling_matrix_d = [] - - def allocate_on_gpu(self, n): - self.H_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.H.size))) - self.R_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.R.size))) - self.delta_b_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.delta_b.size))) - self.delta_q_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.delta_q.size))) - self.b0s_mask_d.append( - checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes*self.b0s_mask.size))) - self.sampling_matrix_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.sampling_matrix.size))) - - checkCudaErrors(runtime.cudaMemcpy( - self.H_d[n], - self.H.ctypes.data, - REAL_SIZE*self.H.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.R_d[n], - self.R.ctypes.data, - REAL_SIZE*self.R.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.delta_b_d[n], - self.delta_b.ctypes.data, - REAL_SIZE*self.delta_b.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.delta_q_d[n], - self.delta_q.ctypes.data, - REAL_SIZE*self.delta_q.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.b0s_mask_d[n], - self.b0s_mask.ctypes.data, - np.int32().nbytes*self.b0s_mask.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.sampling_matrix_d[n], - self.sampling_matrix.ctypes.data, - REAL_SIZE*self.sampling_matrix.size, - runtime.cudaMemcpyHostToDevice)) - - def deallocate_on_gpu(self, n): - if self.H_d[n]: - checkCudaErrors(runtime.cudaFree(self.H_d[n])) - if self.R_d[n]: - checkCudaErrors(runtime.cudaFree(self.R_d[n])) - if self.delta_b_d[n]: - checkCudaErrors(runtime.cudaFree(self.delta_b_d[n])) - if self.delta_q_d[n]: - checkCudaErrors(runtime.cudaFree(self.delta_q_d[n])) - if self.b0s_mask_d[n]: - checkCudaErrors(runtime.cudaFree(self.b0s_mask_d[n])) - if self.sampling_matrix_d[n]: - checkCudaErrors(runtime.cudaFree(self.sampling_matrix_d[n])) - - def getNumStreamlines(self): - pass - - def generateStreamlines(self): - pass - - - - -// Precompute number of streamlines before allocating memory -if (!((model_type == PTT) || (model_type == PROB))) { - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d - sizeof(int)*samplm_nr; // for peak_directions_d - getNumStreamlinesBoot_k - <<>>( - model_type, - max_angle, - min_signal, - relative_peak_thresh, - min_separation_angle, - rng_seed, - nseeds_gpu, - reinterpret_cast(seeds_d[n]), - dimx, - dimy, - dimz, - dimt, - dataf_d[n], - H_d[n], - 
R_d[n], - delta_nr, - delta_b_d[n], - delta_q_d[n], - b0s_mask_d[n], - samplm_nr, - sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), - reinterpret_cast(sphere_edges_d[n]), - nedges, - shDirTemp0_d[n], - slinesOffs_d[n]); -} else { - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt; - getNumStreamlinesProb_k - <<>>( - max_angle, - relative_peak_thresh, - min_separation_angle, - rng_seed, - nseeds_gpu, - reinterpret_cast(seeds_d[n]), - dimx, - dimy, - dimz, - dimt, - dataf_d[n], - reinterpret_cast(sphere_vertices_d[n]), - reinterpret_cast(sphere_edges_d[n]), - nedges, - shDirTemp0_d[n], - slinesOffs_d[n]); -} - - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - if (nseeds_gpu == 0) continue; - dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); - dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); -#if 0 - std::cerr << "GPU " << n << ": "; - std::cerr << "Generating " << nSlines_h[n] << " streamlines (from " << nseeds_gpu << " seeds)" << std::endl; -#endif - - //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y); - switch(model_type) { - case OPDT: - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - case CSA: - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - case PROB: - // Shared memory requirements are smaller for probabilistic for main run - // than for preliminary run - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt; - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - case PTT: - shSizeGNS = 0; // PTT uses exclusively static shared memory - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, 
sampling_matrix_d[n],
-                            reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]),
-                            nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
-            break;
-
-        default:
-            printf("FATAL: Invalid Model Type.\n");
-            break;
-    }
-
-    CHECK_ERROR("genStreamlinesMerge_k");
-  }
-
-
diff --git a/cuslines/cuda_python/__init__.py b/cuslines/cuda_python/__init__.py
new file mode 100644
index 0000000..d0b42d4
--- /dev/null
+++ b/cuslines/cuda_python/__init__.py
@@ -0,0 +1,13 @@
+from .cu_tractography import GPUTracker
+from .cu_direction_getters import (
+    ProbDirectionGetter,
+    PttDirectionGetter,
+    BootDirectionGetter
+)
+
+__all__ = [
+    "GPUTracker",
+    "ProbDirectionGetter",
+    "PttDirectionGetter",
+    "BootDirectionGetter"
+]
[GIT binary patch literals omitted here: the diffstat above shows six compiled __pycache__/*.cpython-312.pyc files committed alongside the sources (__init__, 385 bytes; _globals, 389 bytes; cu_direction_getters, 22688 bytes; cu_propagate_seeds, 14574 bytes; cu_tractography, 9441 bytes; cutils, 2922 bytes); their base85 payloads are unreadable build artifacts and carry no source content]
literal 0 HcmV?d00001 diff --git a/cuslines/cuda_python/_globals.py b/cuslines/cuda_python/_globals.py new file mode 100644 index 0000000..c19368e --- /dev/null +++ b/cuslines/cuda_python/_globals.py @@ -0,0 +1,10 @@ +# AUTO-GENERATED FROM globals.h — DO NOT EDIT + +EXCESS_ALLOC_FACT = 2 +MAX_SLINES_PER_SEED = 10 +MAX_SLINE_LEN = 501 +NORM_EPS = 1e-08 +PMF_THRESHOLD_P = 0.05 +REAL_SIZE = 4 +THR_X_BL = 64 +THR_X_SL = 32 diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py new file mode 100644 index 0000000..2dc54cc --- /dev/null +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -0,0 +1,381 @@ +import numpy as np +from abc import ABC, abstractmethod +import logging +import ctypes +from importlib.resources import files +from time import time + +from cuda.core import Device, LaunchConfig, Program, launch, ProgramOptions +from cuda.pathfinder import find_nvidia_header_directory +from cuda.cccl import get_include_paths +from cuda.bindings import runtime +from cuda.bindings.runtime import cudaMemcpyKind + +from cuslines.cuda_python.cutils import ( + REAL_SIZE, + REAL_DTYPE, + REAL_DTYPE_AS_STR, + REAL3_DTYPE_AS_STR, + REAL_DTYPE_AS_CTYPE, + checkCudaErrors, + ModelType, + THR_X_SL, + BLOCK_Y, + DEV_PTR, +) + +logger = logging.getLogger("GPUStreamlines") + + +class GPUDirectionGetter(ABC): + @abstractmethod + def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): + pass + + @abstractmethod + def generateStreamlines(self): + pass + + def allocate_on_gpu(self, n): + pass + + def deallocate_on_gpu(self, n): + pass + + def compile_program(self, debug: bool = False): + start_time = time() + logger.info("Compiling GPUStreamlines") + + cuslines_cuda = files("cuslines") + + if debug: + program_opts = { + "ptxas_options": ["-O0", "-v"], + "device_code_optimize": True, + "debug": True, + "lineinfo": True, + } + else: + program_opts = { + "ptxas_options": ["-O3"] + } + + program_options = ProgramOptions( + name="cuslines", + use_fast_math=True, + std="c++17", + define_macro="__NVRTC__", + include_path=[ + str(cuslines_cuda), + find_nvidia_header_directory("cudart"), + find_nvidia_header_directory("curand"), + get_include_paths().libcudacxx], + **program_opts) + + # Here we assume all devices are the same, + # so we compile once for any current device. + # I think this is reasonable + dev = Device() + dev.set_current() + cuda_path = cuslines_cuda.joinpath("generate_streamlines_cuda.cu") + with open(cuda_path, "r") as f: + prog = Program(f.read(), code_type="c++", options=program_options) + self.module = prog.compile( + "cubin", + name_expressions=( + self.getnum_kernel_name, + self.genstreamlines_kernel_name, + )) + logger.info("GPUStreamlines compiled successfully in %.2f seconds", time() - start_time) + + +class _BootCtx(ctypes.Structure): + _fields_ = [ + ("min_signal", REAL_DTYPE_AS_CTYPE), + ("delta_nr", ctypes.c_int32), + ("H", ctypes.c_void_p), + ("R", ctypes.c_void_p), + ("delta_b", ctypes.c_void_p), + ("delta_q", ctypes.c_void_p), + ("sampling_matrix", ctypes.c_void_p), + ("b0s_mask", ctypes.c_void_p), + ] + + +class BootDirectionGetter(GPUDirectionGetter): + def __init__( # TODO: Maybe accept a dipy thing and extract arrays here? maybe as a from_ function? 
+ self, + model_type: str, + min_signal: float, + H: np.ndarray, + R: np.ndarray, + delta_b: np.ndarray, + delta_q: np.ndarray, + sampling_matrix: np.ndarray, + b0s_mask: np.ndarray): + if model_type.upper() == "OPDT": + self.model_type = int(ModelType.OPDT) + elif model_type.upper() == "CSA": + self.model_type = int(ModelType.CSA) + else: + raise ValueError(f"Invalid model_type {model_type}, must be one of 'OPDT', 'CSA'") + + self.H = np.ascontiguousarray(H, dtype=REAL_DTYPE) + self.R = np.ascontiguousarray(R, dtype=REAL_DTYPE) + self.delta_b = np.ascontiguousarray(delta_b, dtype=REAL_DTYPE) + self.delta_q = np.ascontiguousarray(delta_q, dtype=REAL_DTYPE) + self.delta_nr = int(delta_b.shape[0]) + self.min_signal = REAL_DTYPE(min_signal) + self.sampling_matrix = np.ascontiguousarray(sampling_matrix, dtype=REAL_DTYPE) + self.b0s_mask = np.ascontiguousarray(b0s_mask, dtype=np.int32) + self.ctx_h = [] + + self.H_d = [] + self.R_d = [] + self.delta_b_d = [] + self.delta_q_d = [] + self.b0s_mask_d = [] + self.sampling_matrix_d = [] + self.ctx_d = [] + + self.getnum_kernel_name = f"getNumStreamlinesBoot_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},{model_type.upper()},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.compile_program() + + def allocate_on_gpu(self, n): + self.H_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.H.size))) + self.R_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.R.size))) + self.delta_b_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.delta_b.size))) + self.delta_q_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.delta_q.size))) + self.b0s_mask_d.append( + checkCudaErrors(runtime.cudaMalloc( + np.int32().nbytes*self.b0s_mask.size))) + self.sampling_matrix_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.sampling_matrix.size))) + self.ctx_d.append( + checkCudaErrors(runtime.cudaMalloc( + ctypes.sizeof(_BootCtx)))) + self.ctx_h.append(_BootCtx( + min_signal=self.min_signal, + H=self.H_d[n], + R=self.R_d[n], + delta_b=self.delta_b_d[n], + delta_q=self.delta_q_d[n], + sampling_matrix=self.sampling_matrix_d[n], + b0s_mask=self.b0s_mask_d[n], + )) + + checkCudaErrors(runtime.cudaMemcpy( + self.H_d[n], + self.H.ctypes.data, + REAL_SIZE*self.H.size, + cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.R_d[n], + self.R.ctypes.data, + REAL_SIZE*self.R.size, + cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.delta_b_d[n], + self.delta_b.ctypes.data, + REAL_SIZE*self.delta_b.size, + cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.delta_q_d[n], + self.delta_q.ctypes.data, + REAL_SIZE*self.delta_q.size, + cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.b0s_mask_d[n], + self.b0s_mask.ctypes.data, + np.int32().nbytes*self.b0s_mask.size, + cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.sampling_matrix_d[n], + self.sampling_matrix.ctypes.data, + REAL_SIZE*self.sampling_matrix.size, + cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.ctx_d[n], + ctypes.byref(self.ctx_h[n]), + ctypes.sizeof(_BootCtx), + cudaMemcpyKind.cudaMemcpyHostToDevice + )) + + def deallocate_on_gpu(self, n): + if self.H_d[n]: + checkCudaErrors(runtime.cudaFree(self.H_d[n])) + if self.R_d[n]: + 
checkCudaErrors(runtime.cudaFree(self.R_d[n])) + if self.delta_b_d[n]: + checkCudaErrors(runtime.cudaFree(self.delta_b_d[n])) + if self.delta_q_d[n]: + checkCudaErrors(runtime.cudaFree(self.delta_q_d[n])) + if self.b0s_mask_d[n]: + checkCudaErrors(runtime.cudaFree(self.b0s_mask_d[n])) + if self.sampling_matrix_d[n]: + checkCudaErrors(runtime.cudaFree(self.sampling_matrix_d[n])) + if self.ctx_d[n]: + checkCudaErrors(runtime.cudaFree(self.ctx_d[n])) + + def _shared_mem_bytes(self, sp): + return REAL_SIZE*BLOCK_Y*2*( + sp.gpu_tracker.n32dimt + max(sp.gpu_tracker.n32dimt, sp.gpu_tracker.samplm_nr)) + \ + np.int32().nbytes*BLOCK_Y*sp.gpu_tracker.samplm_nr + + def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): + ker = self.module.get_kernel(self.getnum_kernel_name) + shared_memory = self._shared_mem_bytes(sp) + config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) + + launch( + sp.gpu_tracker.streams[n], config, ker, + self.model_type, + sp.gpu_tracker.max_angle, + sp.gpu_tracker.min_separation_angle, + sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.rng_seed, + nseeds_gpu, + sp.seeds_d[n], + sp.gpu_tracker.dimx, + sp.gpu_tracker.dimy, + sp.gpu_tracker.dimz, + sp.gpu_tracker.dimt, + sp.gpu_tracker.dataf_d[n].handle, + self.H_d[n], + self.R_d[n], + self.delta_nr, + self.delta_b_d[n], + self.delta_q_d[n], + self.b0s_mask_d[n], + sp.gpu_tracker.samplm_nr, + self.sampling_matrix_d[n], + sp.gpu_tracker.sphere_vertices_d[n], + sp.gpu_tracker.sphere_edges_d[n], + sp.gpu_tracker.nedges, + sp.shDirTemp0_d[n], + sp.slinesOffs_d[n]) + + def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): + ker = self.module.get_kernel(self.genstreamlines_kernel_name) + shared_memory = self._shared_mem_bytes(sp) + config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) + + launch( + sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.max_angle, + sp.gpu_tracker.tc_threshold, + sp.gpu_tracker.step_size, + sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.min_separation_angle, + sp.gpu_tracker.rng_seed, + sp.gpu_tracker.rng_offset + n*nseeds_gpu, + nseeds_gpu, + sp.seeds_d[n], + sp.gpu_tracker.dimx, + sp.gpu_tracker.dimy, + sp.gpu_tracker.dimz, + sp.gpu_tracker.dimt, + sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.metric_map_d[n], + self.ctx_d[n], + sp.gpu_tracker.samplm_nr, + sp.gpu_tracker.sphere_vertices_d[n], + sp.gpu_tracker.sphere_edges_d[n], + sp.gpu_tracker.nedges, + sp.slinesOffs_d[n], + sp.shDirTemp0_d[n], + sp.slineSeed_d[n], + sp.slineLen_d[n], + sp.sline_d[n] + ) + + +class ProbDirectionGetter(GPUDirectionGetter): + def __init__(self): + self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},PROB,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.compile_program() + + def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): + ker = self.module.get_kernel(self.getnum_kernel_name) + shared_memory = REAL_SIZE*BLOCK_Y*sp.gpu_tracker.n32dimt + \ + np.int32().nbytes*BLOCK_Y*sp.gpu_tracker.n32dimt + config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) + + launch( + sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.max_angle, + sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.min_separation_angle, + sp.gpu_tracker.rng_seed, + nseeds_gpu, + sp.seeds_d[n], + sp.gpu_tracker.dimx, + sp.gpu_tracker.dimy, + sp.gpu_tracker.dimz, + sp.gpu_tracker.dimt, + 
sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.sphere_vertices_d[n], + sp.gpu_tracker.sphere_edges_d[n], + sp.gpu_tracker.nedges, + sp.shDirTemp0_d[n], + sp.slinesOffs_d[n]) + + def _shared_mem_bytes(self, sp): + return REAL_SIZE * BLOCK_Y * sp.gpu_tracker.n32dimt + + def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): + ker = self.module.get_kernel(self.genstreamlines_kernel_name) + shared_memory = self._shared_mem_bytes(sp) + config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) + + launch( + sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.max_angle, + sp.gpu_tracker.tc_threshold, + sp.gpu_tracker.step_size, + sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.min_separation_angle, + sp.gpu_tracker.rng_seed, + sp.gpu_tracker.rng_offset + n*nseeds_gpu, + nseeds_gpu, + sp.seeds_d[n], + sp.gpu_tracker.dimx, + sp.gpu_tracker.dimy, + sp.gpu_tracker.dimz, + sp.gpu_tracker.dimt, + sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.metric_map_d[n], + int(0), + sp.gpu_tracker.samplm_nr, + sp.gpu_tracker.sphere_vertices_d[n], + sp.gpu_tracker.sphere_edges_d[n], + sp.gpu_tracker.nedges, + sp.slinesOffs_d[n], + sp.shDirTemp0_d[n], + sp.slineSeed_d[n], + sp.slineLen_d[n], + sp.sline_d[n] + ) + + + +class PttDirectionGetter(ProbDirectionGetter): + def __init__(self): + self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},PTT,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.compile_program() + + def _shared_mem_bytes(self, sp): + return 0 + diff --git a/cuslines/cu_propagate_seeds.py b/cuslines/cuda_python/cu_propagate_seeds.py similarity index 69% rename from cuslines/cu_propagate_seeds.py rename to cuslines/cuda_python/cu_propagate_seeds.py index a334da6..73a4a6c 100644 --- a/cuslines/cu_propagate_seeds.py +++ b/cuslines/cuda_python/cu_propagate_seeds.py @@ -1,10 +1,12 @@ import numpy as np -import ctypes +import gc from cuda.bindings import runtime +from cuda.bindings.runtime import cudaMemcpyKind + from nibabel.streamlines.array_sequence import ArraySequence import logging -from cutils import ( +from cuslines.cuda_python.cutils import ( REAL_SIZE, REAL_DTYPE, REAL3_DTYPE, @@ -12,6 +14,7 @@ EXCESS_ALLOC_FACT, THR_X_SL, THR_X_BL, + DEV_PTR, div_up, checkCudaErrors, ) @@ -25,18 +28,19 @@ def __init__( self, gpu_tracker): self.gpu_tracker = gpu_tracker + self.ngpus = gpu_tracker.ngpus self.nSlines_old = np.zeros(self.ngpus, dtype=np.int32) self.nSlines = np.zeros(self.ngpus, dtype=np.int32) - self.slines = np.zeros(self.ngpus, dtype=ctypes.c_void_p) - self.sline_lens = np.zeros(self.ngpus, dtype=ctypes.c_void_p) + self.slines = np.zeros(self.ngpus, dtype=np.ndarray) + self.sline_lens = np.zeros(self.ngpus, dtype=np.ndarray) - self.seeds_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) - self.slineSeed_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) - self.slinesOffs_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) - self.shDirTemp0_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) - self.slineLen_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) - self.sline_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + self.seeds_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.slineSeed_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.slinesOffs_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.shDirTemp0_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.slineLen_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.sline_d = np.empty(self.ngpus, 
dtype=DEV_PTR) def _switch_device(self, n): checkCudaErrors(runtime.cudaSetDevice(n)) @@ -51,28 +55,31 @@ def _switch_device(self, n): def _get_sl_buffer_size(self, n): return REAL_SIZE*2*3*MAX_SLINE_LEN*self.nSlines[n] - def _allocate_seed_memory(self): + def _allocate_seed_memory(self, seeds): # Move seeds to GPU for ii in range(self.ngpus): nseeds_gpu, _, _ = self._switch_device(ii) self.seeds_d[ii] = checkCudaErrors(runtime.cudaMalloc( REAL_SIZE*3*nseeds_gpu)) + seeds_host = np.ascontiguousarray(seeds[ + ii*self.nseeds_per_gpu:ii*self.nseeds_per_gpu+nseeds_gpu], + dtype=REAL_DTYPE) checkCudaErrors(runtime.cudaMemcpy( self.seeds_d[ii], - self.seeds[ii*self.nseeds_per_gpu:(ii+1)*self.nseeds_per_gpu].ctypes.data, + seeds_host.ctypes.data, REAL_SIZE*3*nseeds_gpu, - runtime.cudaMemcpyHostToDevice)) + cudaMemcpyKind.cudaMemcpyHostToDevice)) for ii in range(self.ngpus): nseeds_gpu, block, grid = self._switch_device(ii) # Streamline offsets self.slinesOffs_d[ii] = checkCudaErrors(runtime.cudaMalloc( - np.uint64().nbytes * (nseeds_gpu + 1))) + np.int32().nbytes * (nseeds_gpu + 1))) # Initial directions from each seed self.shDirTemp0_d[ii] = checkCudaErrors(runtime.cudaMalloc( - REAL3_DTYPE.nbytes * self.samplm_nr * grid[0] * block[1])) + REAL3_DTYPE.itemsize * self.gpu_tracker.samplm_nr * grid[0] * block[1])) - def _cumsum_offsets(self): + def _cumsum_offsets(self): # TODO: do this on device? for ii in range(self.ngpus): nseeds_gpu, _, _ = self._switch_device(ii) if (nseeds_gpu == 0): @@ -83,8 +90,8 @@ def _cumsum_offsets(self): checkCudaErrors(runtime.cudaMemcpy( slinesOffs_h.ctypes.data, self.slinesOffs_d[ii], - slinesOffs_h.nbytes * (nseeds_gpu + 1), - runtime.cudaMemcpyDeviceToHost)) + slinesOffs_h.nbytes, + cudaMemcpyKind.cudaMemcpyDeviceToHost)) slinesOffs_h = np.concatenate(( [0], np.cumsum(slinesOffs_h[:-1], dtype=slinesOffs_h.dtype))) @@ -94,7 +101,7 @@ def _cumsum_offsets(self): self.slinesOffs_d[ii], slinesOffs_h.ctypes.data, self.slinesOffs_d.size * (nseeds_gpu + 1), - runtime.cudaMemcpyHostToDevice)) + cudaMemcpyKind.cudaMemcpyHostToDevice)) def _allocate_tracking_memory(self): for ii in range(self.ngpus): @@ -108,24 +115,21 @@ def _allocate_tracking_memory(self): self.nSlines[ii] * np.int32().nbytes)) if self.nSlines[ii] > EXCESS_ALLOC_FACT*self.nSlines_old[ii]: - if self.slines[ii]: - checkCudaErrors(runtime.cudaFreeHost( - self.slines[ii])) - if self.sline_lens[ii]: - checkCudaErrors(runtime.cudaFreeHost( - self.sline_lens[ii])) - self.slines[ii] = 0 # Nullptr - self.sline_lens[ii] = 0 # Nullptr + self.slines[ii] = 0 + self.sline_lens[ii] = 0 + gc.collect() buffer_size = self._get_sl_buffer_size(ii) logger.debug(f"Streamline buffer size: {buffer_size}") if not self.slines[ii]: - self.slines[ii] = checkCudaErrors(runtime.cudaMallocHost( - buffer_size)) - if not self.slines_lens[ii]: - self.slines_lens[ii] = checkCudaErrors(runtime.cudaMallocHost( - np.int32().nbytes*EXCESS_ALLOC_FACT*self.nSlines[ii])) + self.slines[ii] = np.empty( + (EXCESS_ALLOC_FACT*self.nSlines[ii], MAX_SLINE_LEN*2, 3), + dtype=REAL_DTYPE) + if not self.sline_lens[ii]: + self.sline_lens[ii] = np.empty( + EXCESS_ALLOC_FACT*self.nSlines[ii], + dtype=np.int32) for ii in range(self.ngpus): self._switch_device(ii) @@ -143,13 +147,13 @@ def _cleanup(self): self.slines[ii], self.sline_d[ii], self._get_sl_buffer_size(ii), - runtime.cudaMemcpyDeviceToHost, + cudaMemcpyKind.cudaMemcpyDeviceToHost, self.gpu_tracker.streams[ii])) checkCudaErrors(runtime.cudaMemcpyAsync( self.sline_lens[ii], self.slineLen_d[ii], 
np.int32().nbytes*self.nSlines[ii], - runtime.cudaMemcpyDeviceToHost, + cudaMemcpyKind.cudaMemcpyDeviceToHost, self.gpu_tracker.streams[ii])) for ii in range(self.ngpus): @@ -164,22 +168,19 @@ def _cleanup(self): checkCudaErrors(runtime.cudaFree(self.sline_d[ii])) self.nSlines_old = self.nSlines.copy() - self.rng_offset += self.nseeds + self.gpu_tracker.rng_offset += self.nseeds def propagate(self, seeds): - self.seeds = seeds self.nseeds = len(seeds) self.nseeds_per_gpu = (self.nseeds + self.gpu_tracker.ngpus - 1) // self.gpu_tracker.ngpus - self._seeds_to_gpu() - self._allocate_seed_memory() + self._allocate_seed_memory(seeds) for ii in range(self.ngpus): nseeds_gpu, block, grid = self._switch_device(ii) if (nseeds_gpu == 0): continue - - getNumStreamlines() # TODO: these will each be classes you can pass in + self.gpu_tracker.dg.getNumStreamlines(ii, nseeds_gpu, block, grid, self) self._cumsum_offsets() self._allocate_tracking_memory() @@ -188,12 +189,11 @@ def propagate(self, seeds): nseeds_gpu, block, grid = self._switch_device(ii) if (nseeds_gpu == 0): continue - - mergeStreamlines() # TODO + self.gpu_tracker.dg.generateStreamlines(ii, nseeds_gpu, block, grid, self) self._cleanup() - def as_array_sequence(self): # TODO: optimize memory usage here? also, direct to trx? + def as_array_sequence(self): buffer_size = 0 for ii in range(self.ngpus): lens = self.sline_lens[ii] @@ -207,12 +207,15 @@ def _yield_slines(): for jj in range(self.nSlines[ii]): npts = this_len[jj] - offset = jj * 3 * 2 * MAX_SLINE_LEN - sl = np.asarray( - this_sls[offset : offset + npts * 3], - dtype=REAL_DTYPE) - sl = sl.reshape((npts, 3)) - yield sl + yield np.asarray( + this_sls[jj], + dtype=REAL_DTYPE)[:npts] + + return ArraySequence(_yield_slines(), buffer_size) - return ArraySequence(_yield_slines, buffer_size) + def to_trx(): + raise NotImplementedError("Export to TRX not yet implemented") + + def to_trk(): + raise NotImplementedError("Export to TRK not yet implemented") diff --git a/cuslines/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py similarity index 69% rename from cuslines/cu_tractography.py rename to cuslines/cuda_python/cu_tractography.py index acfcc96..1ff0944 100644 --- a/cuslines/cu_tractography.py +++ b/cuslines/cuda_python/cu_tractography.py @@ -1,25 +1,26 @@ from cuda.bindings import driver, runtime +from cuda.bindings.runtime import cudaMemcpyKind +import cuda.core as cc # TODO: consider cuda core over cuda bindings import numpy as np import logging -from cutils import ( +from cuslines.cuda_python.cutils import ( REAL_SIZE, REAL_DTYPE, checkCudaErrors, ) -from cu_direction_getters import ( +from cuslines.cuda_python.cu_direction_getters import ( GPUDirectionGetter, BootDirectionGetter ) -from cu_propagate_seeds import SeedBatchPropagator +from cuslines.cuda_python.cu_propagate_seeds import SeedBatchPropagator logger = logging.getLogger("GPUStreamlines") -# TODO: we need to organize this package into folders, then make it pip installable. 
-# but should merge in PTT FIRST + class GPUTracker: # TODO: bring in pyAFQ prep stuff def __init__( self, @@ -37,21 +38,10 @@ def __init__( rng_seed: int = 0, rng_offset: int = 0, ): - for name, arr, dt in [ - ("dataf", dataf, REAL_DTYPE), - ("metric_map", metric_map, REAL_DTYPE), - ("sphere_vertices", sphere_vertices, REAL_DTYPE), - ("sphere_edges", sphere_edges, np.int32), - ]: - if arr.dtype != dt: - raise TypeError(f"{name} must have dtype {dt}, got {arr.dtype}") - if not arr.flags.c_contiguous: - raise ValueError(f"{name} must be C-contiguous") - - self.dataf = dataf - self.metric_map = metric_map - self.sphere_vertices = sphere_vertices - self.sphere_edges = sphere_edges + self.dataf = np.ascontiguousarray(dataf, dtype=REAL_DTYPE) + self.metric_map = np.ascontiguousarray(metric_map, dtype=REAL_DTYPE) + self.sphere_vertices = np.ascontiguousarray(sphere_vertices, dtype=REAL_DTYPE) + self.sphere_edges = np.ascontiguousarray(sphere_edges, dtype=np.int32) self.dimx, self.dimy, self.dimz, self.dimt = dataf.shape self.nedges = int(sphere_edges.shape[0]) @@ -59,6 +49,7 @@ def __init__( self.samplm_nr = int(dg.sampling_matrix.shape[0]) else: self.samplm_nr = self.dimt + self.n32dimt = ((self.dimt + 31) // 32) * 32 self.dg = dg self.max_angle = REAL_DTYPE(max_angle) @@ -83,6 +74,9 @@ def __init__( self.sphere_vertices_d = [] self.sphere_edges_d = [] + self.streams = [] + self.managed_data = [] + self.seed_propagator = SeedBatchPropagator( gpu_tracker=self) self._allocated = False @@ -97,15 +91,22 @@ def _allocate(self): for ii in range(self.ngpus): checkCudaErrors(runtime.cudaSetDevice(ii)) - self.dataf_d.append( # TODO: put this in texture memory? - checkCudaErrors(runtime.cudaMallocManaged( # TODO: look at cuda core managed memory - REAL_SIZE*self.dataf.size, - runtime.cudaMemAttachGlobal))) - checkCudaErrors(runtime.cudaMemAdvise( - self.dataf_d[ii], - REAL_SIZE*self.dataf.size, - runtime.cudaMemAdviseSetPreferredLocation, - ii)) + self.streams.append( + checkCudaErrors(runtime.cudaStreamCreateWithFlags( + runtime.cudaStreamNonBlocking))) + + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(ii)) + + # TODO: put this in texture memory? 
+ self.managed_data.append( + cc.ManagedMemoryResource( + options=cc.ManagedMemoryResourceOptions(preferred_location=ii) + ) + ) + self.dataf_d.append( + self.managed_data[ii].allocate( + REAL_SIZE*self.dataf.size)) self.metric_map_d.append( checkCudaErrors(runtime.cudaMalloc( REAL_SIZE*self.metric_map.size))) @@ -115,37 +116,32 @@ def _allocate(self): self.sphere_edges_d.append( checkCudaErrors(runtime.cudaMalloc( np.int32().nbytes*self.sphere_edges.size))) - + + logger.info("here-1") checkCudaErrors(runtime.cudaMemcpy( - self.dataf_d[ii], + self.dataf_d[ii].handle, self.dataf.ctypes.data, REAL_SIZE*self.dataf.size, - runtime.cudaMemcpyHostToDevice)) + cudaMemcpyKind.cudaMemcpyHostToDevice)) + logger.info("here0") checkCudaErrors(runtime.cudaMemcpy( self.metric_map_d[ii], self.metric_map.ctypes.data, REAL_SIZE*self.metric_map.size, - runtime.cudaMemcpyHostToDevice)) + cudaMemcpyKind.cudaMemcpyHostToDevice)) checkCudaErrors(runtime.cudaMemcpy( self.sphere_vertices_d[ii], self.sphere_vertices.ctypes.data, REAL_SIZE*self.sphere_vertices.size, - runtime.cudaMemcpyHostToDevice)) + cudaMemcpyKind.cudaMemcpyHostToDevice)) checkCudaErrors(runtime.cudaMemcpy( self.sphere_edges_d[ii], self.sphere_edges.ctypes.data, np.int32().nbytes*self.sphere_edges.size, - runtime.cudaMemcpyHostToDevice)) - + cudaMemcpyKind.cudaMemcpyHostToDevice)) + logger.info("here0,5") self.dg.allocate_on_gpu(ii) - self.streams = [] - for ii in range(self.ngpus): - checkCudaErrors(runtime.cudaSetDevice(ii)) - self.streams.append( - checkCudaErrors(runtime.cudaStreamCreateWithFlags( - runtime.cudaStreamNonBlocking))) - self._allocated = True def __exit__(self, exc_type, exc, tb): @@ -153,22 +149,17 @@ def __exit__(self, exc_type, exc, tb): for n in range(self.ngpus): checkCudaErrors(runtime.cudaSetDevice(n)) - if self.dataf_d[n]: - checkCudaErrors(runtime.cudaFree(self.dataf_d[n])) + # if self.dataf_d[n]: # TODO: find how to do this + # self.managed_data[n].deallocate( + # self.dataf_d[n], + # REAL_SIZE*self.dataf.size) + # self.managed_data[n].close() if self.metric_map_d[n]: checkCudaErrors(runtime.cudaFree(self.metric_map_d[n])) if self.sphere_vertices_d[n]: checkCudaErrors(runtime.cudaFree(self.sphere_vertices_d[n])) if self.sphere_edges_d[n]: checkCudaErrors(runtime.cudaFree(self.sphere_edges_d[n])) - - if self.seed_propagator.sline_lens[n]: - checkCudaErrors(runtime.cudaFreeHost( - self.seed_propagator.sline_lens[n])) - if self.seed_propagator.slines[n]: - checkCudaErrors(runtime.cudaFreeHost( - self.seed_propagator.slines[n])) - self.dg.deallocate_on_gpu(n) checkCudaErrors(runtime.cudaStreamDestroy(self.streams[n])) diff --git a/cuslines/cutils.py b/cuslines/cuda_python/cutils.py similarity index 70% rename from cuslines/cutils.py rename to cuslines/cuda_python/cutils.py index 4d75847..9cf164e 100644 --- a/cuslines/cutils.py +++ b/cuslines/cuda_python/cutils.py @@ -1,11 +1,12 @@ from cuda.bindings import driver, nvrtc -import re -import os import numpy as np +import ctypes from enum import IntEnum +from cuslines.cuda_python._globals import * + class ModelType(IntEnum): OPDT = 0 @@ -13,34 +14,27 @@ class ModelType(IntEnum): PROB = 2 PTT = 3 - -# We extract REAL_DTYPE, MAX_SLINE_LEN from globals.h -# Maybe there is a more elegant way of doing this? 
-dir_path = os.path.dirname(os.path.abspath(__file__)) -globals_path = os.path.join(dir_path, "globals.h") -with open(globals_path, 'r') as f: - content = f.read() - -defines = dict(re.findall(r"#define\s+(\w+)\s+([^\s/]+)", content)) -REAL_SIZE = int(defines["REAL_SIZE"]) REAL3_SIZE = 3 * REAL_SIZE if REAL_SIZE == 4: REAL_DTYPE = np.float32 REAL3_DTYPE = np.dtype([('x', np.float32), ('y', np.float32), ('z', np.float32)]) + REAL_DTYPE_AS_STR = "float" + REAL3_DTYPE_AS_STR = "float3" + REAL_DTYPE_AS_CTYPE = ctypes.c_float elif REAL_SIZE == 8: REAL_DTYPE = np.float64 REAL3_DTYPE = np.dtype([('x', np.float64), ('y', np.float64), ('z', np.float64)]) + REAL_DTYPE_AS_STR = "double" + REAL3_DTYPE_AS_STR = "double3" + REAL_DTYPE_AS_CTYPE = ctypes.c_double else: raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") -MAX_SLINE_LEN = int(defines["MAX_SLINE_LEN"]) -THR_X_SL = int(defines["THR_X_SL"]) -THR_X_BL = int(defines["THR_X_BL"]) -EXCESS_ALLOC_FACT = int(defines["EXCESS_ALLOC_FACT"]) - +BLOCK_Y = THR_X_BL//THR_X_SL +DEV_PTR = object def _cudaGetErrorEnum(error): if isinstance(error, driver.CUresult): diff --git a/cuslines/cuwsort.cuh b/cuslines/cuwsort.cuh index 18858f0..aac70ac 100644 --- a/cuslines/cuwsort.cuh +++ b/cuslines/cuwsort.cuh @@ -79,12 +79,15 @@ int swap4[3][4] = {{ 2, 3, 0, 1}, __device__ __constant__ int swap2[1][2] = {{ 1, 0}}; -__device__ __constant__ const int *__swaps[] = {NULL, - reinterpret_cast(&swap2[0][0]), - reinterpret_cast(&swap4[0][0]), - reinterpret_cast(&swap8[0][0]), - reinterpret_cast(&swap16[0][0]), - reinterpret_cast(&swap32[0][0])}; +template +__device__ __forceinline__ const int* get_swap_ptr() { + if constexpr (GSIZE == 2) return (const int*)swap2; + else if constexpr (GSIZE == 4) return (const int*)swap4; + else if constexpr (GSIZE == 8) return (const int*)swap8; + else if constexpr (GSIZE == 16) return (const int*)swap16; + else if constexpr (GSIZE == 32) return (const int*)swap32; + else return nullptr; +} template struct STATIC_LOG2 { @@ -113,7 +116,7 @@ __device__ KEY_T warp_sort(KEY_T v) { const int gid = lid % GSIZE; - const int (*swap)[GSIZE] = reinterpret_cast(__swaps[LOG2_GSIZE]); + const int (*swap)[GSIZE] = reinterpret_cast(get_swap_ptr()); #pragma unroll for(int i = 0; i < NSWAP; i++) { @@ -140,7 +143,7 @@ __device__ void warp_sort(KEY_T *__restrict__ k, VAL_T *__restrict__ v) { const int gid = lid % GSIZE; - const int (*swap)[GSIZE] = reinterpret_cast(__swaps[LOG2_GSIZE]); + const int (*swap)[GSIZE] = reinterpret_cast(get_swap_ptr()); #pragma unroll for(int i = 0; i < NSWAP; i++) { diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu index 0efefdd..db3c0e2 100644 --- a/cuslines/generate_streamlines_cuda.cu +++ b/cuslines/generate_streamlines_cuda.cu @@ -26,23 +26,31 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +// TODO: its possible all the cpp should be refactored +// out into a separate file, but for now, they are just wrapped +// in these ifndefs +#ifndef __NVRTC__ #include #include #include #include #include +#endif + #include #include + +#ifndef __NVRTC__ #include #include #include -#include +#include // Might not be needed anymore? 
+#include +#endif #include "cudamacro.h" /* for time() */ #include "globals.h" -#include - #include "cuwsort.cuh" #include "ptt.cuh" @@ -1204,7 +1212,6 @@ template __device__ int tracker_d(curandStatePhilox4_32_10_t *st, const REAL_T max_angle, - const REAL_T min_signal, const REAL_T tc_threshold, const REAL_T step_size, const REAL_T relative_peak_thres, @@ -1218,22 +1225,9 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, const int dimz, const int dimt, const REAL_T *__restrict__ dataf, - const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask) - const REAL_T *__restrict__ H, - const REAL_T *__restrict__ R, - // model unused - // step_size from global defines - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - // tc_threshold from global defines - // pmf_threashold from global defines const REAL_T *__restrict__ metric_map, - const int delta_nr, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ delta_q, // fit_matrix - const int samplm_nr, - const REAL_T *__restrict__ sampling_matrix, + const typename ModelCtx::type* __restrict__ ctx, + const int samplm_nr, const REAL3_T *__restrict__ sphere_vertices, const int2 *__restrict__ sphere_edges, const int num_edges, @@ -1272,7 +1266,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, int i; for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) { int ndir; - if (MODEL_T == PROB) { + if constexpr (MODEL_T == PROB) { ndir = get_direction_prob_d( @@ -1288,7 +1282,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, sphere_edges, num_edges, __sh_new_dir + tidy); - } else if (MODEL_T == PTT) { + } else if constexpr (MODEL_T == PTT) { ndir = get_direction_ptt_d( @@ -1310,22 +1304,18 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, MODEL_T>( st, max_angle, - min_signal, + ctx->min_signal, relative_peak_thres, min_separation_angle, direction, dimx, dimy, dimz, dimt, dataf, - b0s_mask /* !dwi_mask */, + ctx->b0s_mask /* !dwi_mask */, point, - H, R, - // model unused - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - delta_nr, - delta_b, delta_q, // fit_matrix + ctx->H, ctx->R, + ctx->delta_nr, + ctx->delta_b, ctx->delta_q, // fit_matrix samplm_nr, - sampling_matrix, + ctx->sampling_matrix, sphere_vertices, sphere_edges, num_edges, @@ -1603,7 +1593,6 @@ template __global__ void genStreamlinesMerge_k( const REAL_T max_angle, - const REAL_T min_signal, const REAL_T tc_threshold, const REAL_T step_size, const REAL_T relative_peak_thres, @@ -1617,15 +1606,9 @@ __global__ void genStreamlinesMerge_k( const int dimz, const int dimt, const REAL_T *__restrict__ dataf, - const REAL_T *__restrict__ H, - const REAL_T *__restrict__ R, - const int delta_nr, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ delta_q, - const int *__restrict__ b0s_mask, // change to int const REAL_T *__restrict__ metric_map, - const int samplm_nr, - const REAL_T *__restrict__ sampling_matrix, + const typename ModelCtx::type* __restrict__ ctx, + const int samplm_nr, const REAL3_T *__restrict__ sphere_vertices, const int2 *__restrict__ sphere_edges, const int num_edges, @@ -1715,7 +1698,6 @@ __global__ void genStreamlinesMerge_k( BDIM_Y, MODEL_T>(&st, max_angle, - min_signal, tc_threshold, step_size, relative_peak_thres, @@ -1725,13 +1707,9 @@ __global__ void genStreamlinesMerge_k( __ptt_frame, MAKE_REAL3(1, 1, 1), dimx, dimy, dimz, dimt, dataf, - b0s_mask, - H, R, 
metric_map, - delta_nr, - delta_b, delta_q, //fit_matrix - samplm_nr, - sampling_matrix, + ctx, + samplm_nr, sphere_vertices, sphere_edges, num_edges, @@ -1755,7 +1733,6 @@ __global__ void genStreamlinesMerge_k( BDIM_Y, MODEL_T>(&st, max_angle, - min_signal, tc_threshold, step_size, relative_peak_thres, @@ -1765,13 +1742,9 @@ __global__ void genStreamlinesMerge_k( __ptt_frame + 9, MAKE_REAL3(1, 1, 1), dimx, dimy, dimz, dimt, dataf, - b0s_mask, - H, R, metric_map, - delta_nr, - delta_b, delta_q, //fit_matrix - samplm_nr, - sampling_matrix, + ctx, + samplm_nr, sphere_vertices, sphere_edges, num_edges, @@ -1802,15 +1775,16 @@ __global__ void genStreamlinesMerge_k( return; } +#ifndef __NVRTC__ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size, const REAL relative_peak_thresh, const REAL min_separation_angle, const int nseeds, const std::vector &seeds_d, const int dimx, const int dimy, const int dimz, const int dimt, const std::vector &dataf_d, const std::vector &H_d, const std::vector &R_d, - const int delta_nr, + const int delta_nr, const std::vector &delta_b_d, const std::vector &delta_q_d, const std::vector &b0s_mask_d, const std::vector &metric_map_d, - const int samplm_nr, + const int samplm_nr, const std::vector &sampling_matrix_d, const std::vector &sphere_vertices_d, const std::vector &sphere_edges_d, const int nedges, std::vector &slines_h, std::vector &slinesLen_h, std::vector &nSlines_h, @@ -1985,25 +1959,45 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a #endif //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y); - switch(model_type) { + switch(model_type) { // TODO: these may be better as separate functions, not as template specializations case OPDT: - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - case CSA: - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + BootCtx* d_ctx; + BootCtx h_ctx; + h_ctx.min_signal = min_signal; + h_ctx.delta_nr = delta_nr; + h_ctx.H = H_d[n]; + h_ctx.R = R_d[n]; + h_ctx.delta_b = delta_b_d[n]; + h_ctx.delta_q = delta_q_d[n]; + h_ctx.sampling_matrix = sampling_matrix_d[n]; + h_ctx.b0s_mask = b0s_mask_d[n]; + CHECK_CUDA(cudaMalloc(&d_ctx, sizeof(BootCtx))); + CHECK_CUDA(cudaMemcpyAsync( + d_ctx, &h_ctx, sizeof(BootCtx), + cudaMemcpyHostToDevice, streams[n])); + + if (model_type == OPDT) { + genStreamlinesMerge_k <<>>( + max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + 
rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), + dimx, dimy, dimz, dimt, dataf_d[n], + metric_map_d[n], d_ctx, samplm_nr, + reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), + nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + } else if (model_type == CSA) { + genStreamlinesMerge_k <<>>( + max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), + dimx, dimy, dimz, dimt, dataf_d[n], + metric_map_d[n], d_ctx, samplm_nr, + reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), + nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + } else { + // Should never reach here + } + + CHECK_CUDA(cudaFree(d_ctx)); break; case PROB: @@ -2011,10 +2005,10 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a // than for preliminary run shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt; genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], + dimx, dimy, dimz, dimt, dataf_d[n], + metric_map_d[n], nullptr, samplm_nr, reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); break; @@ -2022,10 +2016,10 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a case PTT: shSizeGNS = 0; // PTT uses exclusively static shared memory genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], + dimx, dimy, dimz, dimt, dataf_d[n], + metric_map_d[n], nullptr, samplm_nr, reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); break; @@ -2394,3 +2388,4 @@ void write_trk(const int num_threads, return; } #endif +#endif // __NVRTC__ diff --git a/cuslines/globals.h b/cuslines/globals.h index 0d852e9..e0bcac1 100644 --- a/cuslines/globals.h +++ b/cuslines/globals.h @@ -40,7 +40,7 @@ #define FLOOR floorf #define LOG __logf #define EXP __expf -#define REAL_MAX (FLT_MAX) +#define REAL_MAX __int_as_float(0x7f7fffffU) #define REAL_MIN (-REAL_MAX) #define COS __cosf #define SIN __sinf @@ -58,7 +58,7 @@ #define FLOOR floor #define LOG log #define EXP exp -#define REAL_MAX (DBL_MAX) +#define REAL_MAX __longlong_as_double(0x7fefffffffffffffLL) #define REAL_MIN (-REAL_MAX) #define COS cos #define SIN sin @@ -98,4 +98,33 @@ enum ModelType { PTT = 3, }; +struct NoCtx {}; + +template +struct BootCtx { + REAL_T min_signal; + int delta_nr; + const REAL_T* H; + const REAL_T* R; + const REAL_T* delta_b; + const REAL_T* delta_q; + const REAL_T* 
sampling_matrix; + const int* b0s_mask; +}; + +template +struct ModelCtx { + using type = NoCtx; +}; + +template +struct ModelCtx { + using type = BootCtx; +}; + +template +struct ModelCtx { + using type = BootCtx; +}; + #endif diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index 3cdd149..894d0bf 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -473,7 +473,7 @@ __device__ int get_direction_ptt_d( get_probing_frame_d<0>(__frame_sh, st, __probing_frame_sh); propagate_frame_d(__probing_prop_sh, __probing_frame_sh, __direc_sh); norm3_d(__direc_sh, 0); // this will be scaled by the generic stepping code - dirs[0] = (REAL3_T) {__direc_sh[0], __direc_sh[1], __direc_sh[2]}; + dirs[0] = MAKE_REAL3(__direc_sh[0], __direc_sh[1], __direc_sh[2]); } } diff --git a/pyproject.toml b/pyproject.toml index 7ad8645..a1247c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -requires = ["scikit-build-core", "pybind11"] -build-backend = "scikit_build_core.build" +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" [project] name = "cuslines" @@ -10,8 +10,12 @@ readme = "README.md" requires-python = ">=3.7" dependencies = [ "numpy", - "pybind11" + "nibabel", + "cuda-python", + "cuda-core", + "cuda-cccl" ] -[tool.scikit-build] -cmake.build-type = "Release" +[tool.setuptools.packages.find] +where = ["."] +include = ["cuslines*"] diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index d546d60..7585e37 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -27,7 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import argparse +import argparse # TODO: do this again, but for cuda python version import random import time import zipfile diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..cd53ade --- /dev/null +++ b/setup.py @@ -0,0 +1,49 @@ +from setuptools import setup +from setuptools.command.build_py import build_py +from pathlib import Path +import subprocess +import re + + +def defines_to_python(src, dst): + src = Path(src) + dst = Path(dst) + + defines = {} + + INT_DEFINE = re.compile( + r"#define\s+(\w+)\s+\(?\s*([0-9]+)\s*\)?" + ) + + REAL_CAST_DEFINE = re.compile( + r"#define\s+(\w+)\s+\(\(REAL\)\s*([0-9eE\.\+\-]+)\s*\)" + ) + + defines = {} + + for line in src.read_text().splitlines(): + if m := INT_DEFINE.match(line): + defines[m.group(1)] = int(m.group(2)) + elif m := REAL_CAST_DEFINE.match(line): + defines[m.group(1)] = float(m.group(2)) + + dst.parent.mkdir(parents=True, exist_ok=True) + + with dst.open("w") as f: + f.write("# AUTO-GENERATED FROM globals.h — DO NOT EDIT\n\n") + for k, v in sorted(defines.items()): + f.write(f"{k} = {v}\n") + +class build_py_with_cuda(build_py): + def run(self): + root = Path(__file__).parent + + globals_src = str(root / "cuslines" / "globals.h") + globals_dst = str(root / "cuslines" / "cuda_python" / "_globals.py") + defines_to_python(globals_src, globals_dst) + + super().run() + +setup( + cmdclass={"build_py": build_py_with_cuda}, +) From 039a95b86dcbce9b96a29fde8fb87ba46da793ad Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 14:13:23 -0800 Subject: [PATCH 20/31] working! 
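For the record, the key fixes in this patch: the offsets hunk converts per-seed streamline counts into exclusive offsets and copies them using the actual byte count (slinesOffs_h.nbytes) instead of a miscomputed element-count expression; the streamline buffer size is promoted to int64 so large seed batches cannot overflow 32-bit arithmetic; streams are now synchronized between the counting and generation passes; and dataf goes back to a plain per-device cudaMalloc. A minimal standalone sketch of the count-to-offset conversion (a hypothetical helper for illustration, not code from this patch; counts stands for the per-seed counts copied back from the device):

    import numpy as np

    def counts_to_offsets(counts):
        # counts[i] = number of streamlines produced by seed i.
        # offsets[i] = slot of the first streamline for seed i;
        # offsets[-1] = total number of streamlines in the batch.
        offsets = np.empty(counts.size + 1, dtype=np.int64)  # int64 to avoid overflow
        offsets[0] = 0
        offsets[1:] = np.cumsum(counts, dtype=np.int64)
        return offsets

    offsets = counts_to_offsets(np.array([2, 0, 3, 1], dtype=np.int32))
    assert offsets.tolist() == [0, 2, 2, 5, 6]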
--- cuslines/cuda_python/cu_direction_getters.py | 8 ++--- cuslines/cuda_python/cu_propagate_seeds.py | 33 +++++++++++++------- cuslines/cuda_python/cu_tractography.py | 24 ++++---------- cuslines/globals.h | 2 +- 4 files changed, 32 insertions(+), 35 deletions(-) diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index 2dc54cc..135cb47 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -250,7 +250,7 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.dimy, sp.gpu_tracker.dimz, sp.gpu_tracker.dimt, - sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.dataf_d[n], self.H_d[n], self.R_d[n], self.delta_nr, @@ -285,7 +285,7 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.dimy, sp.gpu_tracker.dimz, sp.gpu_tracker.dimt, - sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.dataf_d[n], sp.gpu_tracker.metric_map_d[n], self.ctx_d[n], sp.gpu_tracker.samplm_nr, @@ -324,7 +324,7 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.dimy, sp.gpu_tracker.dimz, sp.gpu_tracker.dimt, - sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.dataf_d[n], sp.gpu_tracker.sphere_vertices_d[n], sp.gpu_tracker.sphere_edges_d[n], sp.gpu_tracker.nedges, @@ -354,7 +354,7 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.dimy, sp.gpu_tracker.dimz, sp.gpu_tracker.dimt, - sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.dataf_d[n], sp.gpu_tracker.metric_map_d[n], int(0), sp.gpu_tracker.samplm_nr, diff --git a/cuslines/cuda_python/cu_propagate_seeds.py b/cuslines/cuda_python/cu_propagate_seeds.py index 73a4a6c..92efef3 100644 --- a/cuslines/cuda_python/cu_propagate_seeds.py +++ b/cuslines/cuda_python/cu_propagate_seeds.py @@ -3,7 +3,7 @@ from cuda.bindings import runtime from cuda.bindings.runtime import cudaMemcpyKind -from nibabel.streamlines.array_sequence import ArraySequence +from nibabel.streamlines.array_sequence import ArraySequence, MEGABYTE import logging from cuslines.cuda_python.cutils import ( @@ -16,8 +16,7 @@ THR_X_BL, DEV_PTR, div_up, - checkCudaErrors, -) + checkCudaErrors) logger = logging.getLogger("GPUStreamlines") @@ -53,7 +52,7 @@ def _switch_device(self, n): return nseeds_gpu, block, grid def _get_sl_buffer_size(self, n): - return REAL_SIZE*2*3*MAX_SLINE_LEN*self.nSlines[n] + return REAL_SIZE*2*3*MAX_SLINE_LEN*self.nSlines[n].astype(np.int64) def _allocate_seed_memory(self, seeds): # Move seeds to GPU @@ -79,7 +78,7 @@ def _allocate_seed_memory(self, seeds): self.shDirTemp0_d[ii] = checkCudaErrors(runtime.cudaMalloc( REAL3_DTYPE.itemsize * self.gpu_tracker.samplm_nr * grid[0] * block[1])) - def _cumsum_offsets(self): # TODO: do this on device? + def _cumsum_offsets(self): # TODO: do this on device? not crucial for performance now for ii in range(self.ngpus): nseeds_gpu, _, _ = self._switch_device(ii) if (nseeds_gpu == 0): @@ -93,14 +92,18 @@ def _cumsum_offsets(self): # TODO: do this on device? 
slinesOffs_h.nbytes, cudaMemcpyKind.cudaMemcpyDeviceToHost)) - slinesOffs_h = np.concatenate(( - [0], np.cumsum(slinesOffs_h[:-1], dtype=slinesOffs_h.dtype))) - self.nSlines[ii] = int(slinesOffs_h[-1]) + __pval = slinesOffs_h[0] + slinesOffs_h[0] = 0 + for jj in range(1, nseeds_gpu + 1): + __cval = slinesOffs_h[jj] + slinesOffs_h[jj] = slinesOffs_h[jj - 1] + __pval + __pval = __cval + self.nSlines[ii] = int(slinesOffs_h[nseeds_gpu]) checkCudaErrors(runtime.cudaMemcpy( self.slinesOffs_d[ii], slinesOffs_h.ctypes.data, - self.slinesOffs_d.size * (nseeds_gpu + 1), + slinesOffs_h.nbytes, cudaMemcpyKind.cudaMemcpyHostToDevice)) def _allocate_tracking_memory(self): @@ -167,10 +170,10 @@ def _cleanup(self): checkCudaErrors(runtime.cudaFree(self.slineLen_d[ii])) checkCudaErrors(runtime.cudaFree(self.sline_d[ii])) - self.nSlines_old = self.nSlines.copy() + self.nSlines_old = self.nSlines self.gpu_tracker.rng_offset += self.nseeds - def propagate(self, seeds): + def propagate(self, seeds): # TODO: better queuing/batching of seeds, if more performance needed self.nseeds = len(seeds) self.nseeds_per_gpu = (self.nseeds + self.gpu_tracker.ngpus - 1) // self.gpu_tracker.ngpus @@ -181,6 +184,9 @@ def propagate(self, seeds): if (nseeds_gpu == 0): continue self.gpu_tracker.dg.getNumStreamlines(ii, nseeds_gpu, block, grid, self) + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaStreamSynchronize( + self.gpu_tracker.streams[ii])) self._cumsum_offsets() self._allocate_tracking_memory() @@ -190,6 +196,9 @@ def propagate(self, seeds): if (nseeds_gpu == 0): continue self.gpu_tracker.dg.generateStreamlines(ii, nseeds_gpu, block, grid, self) + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaStreamSynchronize( + self.gpu_tracker.streams[ii])) self._cleanup() @@ -212,7 +221,7 @@ def _yield_slines(): this_sls[jj], dtype=REAL_DTYPE)[:npts] - return ArraySequence(_yield_slines(), buffer_size) + return ArraySequence(_yield_slines(), buffer_size // MEGABYTE) def to_trx(): raise NotImplementedError("Export to TRX not yet implemented") diff --git a/cuslines/cuda_python/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py index 1ff0944..eca62dd 100644 --- a/cuslines/cuda_python/cu_tractography.py +++ b/cuslines/cuda_python/cu_tractography.py @@ -98,15 +98,9 @@ def _allocate(self): for ii in range(self.ngpus): checkCudaErrors(runtime.cudaSetDevice(ii)) - # TODO: put this in texture memory? 
- self.managed_data.append( - cc.ManagedMemoryResource( - options=cc.ManagedMemoryResourceOptions(preferred_location=ii) - ) - ) - self.dataf_d.append( - self.managed_data[ii].allocate( - REAL_SIZE*self.dataf.size)) + self.dataf_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.dataf.size))) self.metric_map_d.append( checkCudaErrors(runtime.cudaMalloc( REAL_SIZE*self.metric_map.size))) @@ -117,13 +111,11 @@ def _allocate(self): checkCudaErrors(runtime.cudaMalloc( np.int32().nbytes*self.sphere_edges.size))) - logger.info("here-1") checkCudaErrors(runtime.cudaMemcpy( - self.dataf_d[ii].handle, + self.dataf_d[ii], self.dataf.ctypes.data, REAL_SIZE*self.dataf.size, cudaMemcpyKind.cudaMemcpyHostToDevice)) - logger.info("here0") checkCudaErrors(runtime.cudaMemcpy( self.metric_map_d[ii], self.metric_map.ctypes.data, @@ -139,7 +131,6 @@ def _allocate(self): self.sphere_edges.ctypes.data, np.int32().nbytes*self.sphere_edges.size, cudaMemcpyKind.cudaMemcpyHostToDevice)) - logger.info("here0,5") self.dg.allocate_on_gpu(ii) self._allocated = True @@ -149,11 +140,8 @@ def __exit__(self, exc_type, exc, tb): for n in range(self.ngpus): checkCudaErrors(runtime.cudaSetDevice(n)) - # if self.dataf_d[n]: # TODO: find how to do this - # self.managed_data[n].deallocate( - # self.dataf_d[n], - # REAL_SIZE*self.dataf.size) - # self.managed_data[n].close() + if self.dataf_d[n]: + checkCudaErrors(runtime.cudaFree(self.dataf_d[n])) if self.metric_map_d[n]: checkCudaErrors(runtime.cudaFree(self.metric_map_d[n])) if self.sphere_vertices_d[n]: diff --git a/cuslines/globals.h b/cuslines/globals.h index e0bcac1..b9f8211 100644 --- a/cuslines/globals.h +++ b/cuslines/globals.h @@ -68,7 +68,7 @@ #define ACOS acos #endif - +// TODO: half this in when WMGMI seeding #define MAX_SLINE_LEN (501) #define PMF_THRESHOLD_P ((REAL)0.05) From 55f69e1122182eb59575dd672d5d0b20687aaa5f Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 14:14:23 -0800 Subject: [PATCH 21/31] ignore pycs --- .gitignore | 6 ++++++ .../__pycache__/__init__.cpython-312.pyc | Bin 385 -> 0 bytes .../__pycache__/_globals.cpython-312.pyc | Bin 389 -> 0 bytes .../cu_direction_getters.cpython-312.pyc | Bin 22688 -> 0 bytes .../cu_propagate_seeds.cpython-312.pyc | Bin 14574 -> 0 bytes .../__pycache__/cu_tractography.cpython-312.pyc | Bin 9441 -> 0 bytes .../__pycache__/cutils.cpython-312.pyc | Bin 2922 -> 0 bytes 7 files changed, 6 insertions(+) create mode 100644 .gitignore delete mode 100644 cuslines/cuda_python/__pycache__/__init__.cpython-312.pyc delete mode 100644 cuslines/cuda_python/__pycache__/_globals.cpython-312.pyc delete mode 100644 cuslines/cuda_python/__pycache__/cu_direction_getters.cpython-312.pyc delete mode 100644 cuslines/cuda_python/__pycache__/cu_propagate_seeds.cpython-312.pyc delete mode 100644 cuslines/cuda_python/__pycache__/cu_tractography.cpython-312.pyc delete mode 100644 cuslines/cuda_python/__pycache__/cutils.cpython-312.pyc diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..78bb5e2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# Python bytecode +**/*.pyc +**/__pycache__/ +*.pyo +*.pyd + diff --git a/cuslines/cuda_python/__pycache__/__init__.cpython-312.pyc b/cuslines/cuda_python/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 6aedacf00242281ecdb484682fe1cfc13555eb98..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 385 zcmZ9IF-ycS7>2*J=~eD5g6HB75Gjb=MI1!9O)4r#mr$F~R-2}z>Cn+%;qK;dxqlFF 
za&l8|5S;vacuT+G;mw=meM6Q(Fa?~0tGm)C0Pn+MGtR$Q9ieyz4xCgVNkYh=oyw7J z;!2uO=_MY49Ju>Jl1WcP?*p?SBzC%pZ*O#(JqjJVb{=bWe^F|YnNrmYVT{mrH#TPM z&Cit@n<3RMhH5&%8!$m!hQ8g&I%crMs9dK_(XY?hJ#_h|i{DNY!R90mz+9>PVjo>M3dQ+%6Vyvu7!`N}t+pSZncvp!t9bnY( d0qs8G(d(i;^^SQVs#}3t?&?%N89vaf^T_2Ujvvz35B6s-Hn5b(SN`n zadBa6-l!8B32|}qH4)En?zuPjoSWPaMcDxY#`DeaUDU6r*+TjgnrXE?S!^=|HM z1?$m?KO4AXmG&L?w&!@q6Q9dxBbUt`yE-k1sOVANsi1XSlB9JC?3b1xAC?Z5pcv+g dff8nOfgBb}!Cttl1eq|ah&HP{$kUu~{Q}jiZTtWL diff --git a/cuslines/cuda_python/__pycache__/cu_direction_getters.cpython-312.pyc b/cuslines/cuda_python/__pycache__/cu_direction_getters.cpython-312.pyc deleted file mode 100644 index f59ad4d2e033c1468cb38fc97a82f693071d1aae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22688 zcmdr!TW}lKb-Mr-&jkU##5V~_qG^y6^_Epxwk1)b6j`F4RwUTg8pJM1KzPs#P!Upy?1}jU71jLdG$0R?eTK7vA!y!rK zj|2zep}vkpyeHhN+K$VKUO5<3tx@8w7G*q+^{2v#I1;Gs3CBetelaYDgF>Ga6eU>@ z!?F}gCFDT}FYc96LO33Z4v3P_A58UOBAYxACmFM9QD{I69+hID{=uh#lHaL1P9E5M zL^$=ezyZ}s(EX>+9Y3HJXK`WgDdE)VlWGZiZ52qk?pL}vS_et%3H3>#3muTt0a;GS zN!4*QAxhEHgZ+|fKYi$=a8@{VM78ZZa;)QN;hbvQf8dO895N_6c>I}DDOn1}qT#rd z4CQuNrf->f91vrSM8l0(+qYyrod=Pdxnm_cTO?V4cxR@IV}qCF}bSDo>sB#B9( zw|_w8x}%BE1=Z3kheg$t>`x*oA2FYIZtP3Mq>Yh8Uwq@@YHkb-BuRY%5rabiAPkx~ z;F*4ff(g*6P_kru{~+n$u7OxiGYzd@T8VZ@u{c%()&duYmzfXk{5&8`C^L%?CuUJi z<$CawG1&ww`@l+Kbf5ePK((}2ic4}ZCFPdm5o$S>#9IdhtxJi#cmIuu2~hUy;}T1zQLUYuq~pJ|L4JO72sA-Crjpjedh$Wwb{Nvswi zByU21?Vxg^)IHFC(RLs3Xd@HtgVN$%&<99k@eRx zu}Ck+Wm~FB_Z_y0p1YZr+|FFKr)rE4d!$asv$md#=pE+XDAswwWrv7-mC-4&%u9|4 zPM|y^mJBlyU!Es8!FTOa-yYK&r8>o7Mkvu^JfP>xmX4NT(-6oFIxQAfvvtg7nAYDg zvCKJ!g}?8eyJSAcd;>UI*HC3gBGw;~lIMosl#w67}kd%a;U@|4dfW4{aWJ*+dQ3Adwflo+IsC+K}8>-cp zDgl_X&RPT{-X0oIS$W6|Sp(~Kkdmg5N7?jBaG`iQZmGRxP9{aQ4bB z^oJVBGERLNXO!xUhP$z}p-Y!!oWiOr2`nBOMV8_z)gDQNJ~#gg6kD{7!AfC4J^hrl2zflp<8O-v6W( zRlZ-7TCkUsswc;V4W*i4AVA|1)`sO|dsGX?Nums^F-L?NU-4whEM%xq7xTP08-Px7 z8?22lGqdGY-#;_roV6ESb&a|v%i5>y8$YV7yAheH^pCjz-R1poZS$S7@w%Bu+h(fj zW@>#OJGk1S(IazSrr0;$G+r@hHq{o599gh4HU2rqSyMYxTr*Q$KjW>KsapS$xAfZ4 zDeu}@Z`+--Q{Ej}yK&09F70hsyzO%iOZBFYU669FjPcZbTn^ZXXQ7_)l)ut9VxF

(Lp$^h0l1+S{ynn`aQ&sCYNdapwA>IgY7VH_vbto)OFU zTnjwo@MukO_?`W~?3{c?P>u-i*G@}8$VCtK>OMDy8SKYpSN)Px7xb=Y$r&C zc7co&n1@b5(8dGcydbI70V-w4{=go|@Y!0zGD`0tkpI9M)fRC+hT#y|v0WlMKfCaY4tGQx0EJ z?cJM_fG00>`R($vP-Xc!1kWQV0|09;u7^ojN?=cs+PVdne&J#wEDHTFKzhPb6a?S= zE`9-1BS2co9spOE4;|%`6)ltY))`0XwKHSqub-c^ub*-7BRy9Wqlrm-{bIc2TIblY z>&Hl(T)g^5=j~%R!M{zTBU~3I?M-vV4zKl_hqQr!87~?Lx@@CmeQWr=tM!fjm2jpQ|CJ)N=rRL$ukwHP){UIY-eq_k3 z-(pjSG)j{~hs`NN-g<-6B`2tZk^2METMDGdwkGBQ>A3>wjV0;<>8%CQ8?t(~|Ax5; zuA+HHw4#ul)3Qc;Wo7bVV?O~cqt7?y0VT5)C|R*yfCr?vr;7BLk@Y&3wHNi4v3uwR z8C$@WjTquG_7KFiW>J{2e;{F%i^7b(0|~2M6lUxmL|ba79s(oJp4zp@wR@54$sCvC zM-C{dPxpZx*D`dLpG73!<#KXCPK|m_um?Gy<5^pu^|)v7ySh}%vE%zstLBbVdxuKD z7QYybhDBd?;qeXCxA|fN$&{~K@+IQ%-{Wh>6wPhEW=PWv0yQU+B?|XkLzPRE6}Ykq z7j~;`o60`<`5KT_bMy>HCAn9EQ2#oW8|a61PBjB_lCkUL7ZD^7$N*I5nP7B4B1$OL z6z^AygGrFXQ{mo$#6U7A%fUez#~W>tRJmkd5QukUI9W+?)l@mc2g%~8a-q~E0d^pV z1W`4g1mMW9+${PcpiZ630$!ZQg29VW5oH`}G7d2rR~#9Ktb7c?aRew!`5l@(POA6d zwLOVzyCGj|;x$Nh;dnSD2t&0L=FzT)Mj&PKEm-|uWg19K3z>=FvNiN^RRzR#9+;}$1h{9mp)uXCRcY9I@7O!LrW?+T zSf=f@vz689$~L7EB>J~c{=?ayo}I459NaVI6<|3(EG-AiK9jMkYQR-Ald-C6!R480 zT!)nE^%U!OzkT+t!}o*l*zONb9zQXCGBkNYoD`+$o=cO`;N;-Y2 zy{2c-dd74Ht=6xbkX$n;veNci#a{cKy`IQIbXC$GU>(5R8SLYH6ET>1HY8=Jm9h$? zh&>ywV=03r0&Rmm>7fi0F_!DB-w@E&SxdeR3LJ*UlE1Vv8jL+#o>i66U?(|rzN~gM z*ijDc)wH9*x^n1zS!Fa>Uk>fnl#wEQJXBvBinoX_TQVKf85&LD2IGz*P^@ZfJfi29 zwI8ZZxpNFP=y_&UWo*eK0?vmjBdTMFiL~oj)=q8G@%;Io?2d)oSodec1f zfI8Av0y zU2go*xWj5{N~81Zujz8LiN<4AQ&Sq9Uw=&t&VQK0G|scqn*N$N-d8xSy{BET! zvr4}C71or(b+r{NmF1s?LO@^R2)Cl}L@lMhlIBoY5eNRFwFDJ_smpI$fqK#JlV5_e ztJWZF)8nFyx)vGdo9e{#prhp2T*k$m?7-nt1)dSUjBXLZHxWn(zJ;I%!S5p&LU0*? zYK?afrlcg66_=;&`6~1O^Ur~5g}-)uW?F(|o(6B9=ov@S_(@@n$4&1zBJ#Y@GxYGuwh?0oaXDG4mZTxv751U{U!x|{H7I@ zHqEb9__ff(d=rpQ^BQTaYMO6T_%_l6=`}6NnwIpMjSBqujVmB~_3-Fn7)(cJ`4Vi@ z8)dhvZ&r`*y;(PfZCHZUxb^51zb=OqSLOVa;T*1|${i~#F1Vea-vY3Zaz|qla%)!O zpWRx}PHdLjQ0S~ix5;LWq7+(1o&~AsdFeLE(_rjS`;1vdo&~AsdFeLElh<>~+N;R3 zAQe3?-6nZbgw$2!S&)jJmu{0h3vRN$&=@L6MbAsONuHEVSEinAUtru8q@w4g+jyQa z$dH?kaN7lCl^hubt~rUpPFc!GBt#e?atIMYh^#@x3>30*ZzPgC5JOEgp^KSt>l%auKu zrYY_W6(wZ`Ew1ItOMtmz?x#?M{}mp-!J!h7zj61iy5Rb(fH&9SHkfLQZ`~q>V=2`f z3tmzk*Gsa(xBLnun#kPCZ2~-Z7O2Gn9H7aPD8!_gK!kMqoLCa=KatXl0N_5C zr}9>X;_;0aE1tD8z{){%&=~S-r|y)zSwB&q_CKZgpSoZ2&-MRUpWb^;*?VrPDWLFy zkIk(2Irby(B60IIxG&|uv-ZvQiS~5MJ_UZ>eIIisZwEWam>eD2U7|(zzi~@VIBO@2 zphBFLrji8iNf~qcx&yaYnMg*}2RRsKm@{J7a>k;0*fMMb4BSt3X&8)c>Ma%8 zc+%F;p_3*4+Am;jP3?0+z6``p{mBXMCP`xb_d`G7#k&H8QF^b#~8_`UEfWT>kg^| z7HJ_E?~O`o8MzcF^#|d720k9i_{vfg+!x_hkQBTir260{bDwIL;Wi*XU;%RE8H)(o zkZ|l0frA7N5txE&grFtqfeQ=PKDdq@l{9hbWh_rOf;{=?2KrqA0Cyhx`y^QsE=qC= z-aI9NL{Y{|5_^f;8Yh^_C;Q+++i5A*zX@XT+OwErJw1?SB6Itp5$WQD$~K{lyhk-7 z({g8d4!nng`-|jZ73C@fxwlGZNK0;`{11>%{*fl6fg0e#=!JCAdZlRn_~~@hZl!7W zRM8V~v$wJ~UD>Www%?JbDz}WdXN&8`58SidubVDDHe&tAUbKjPk^SWO#XCvdvcmS& z2*h2r;lRBl-LX#L*Uj*2Ffg-`G~(gz{EIq_Ev~kw#X{w1BXUsKmSSM;4n7x$nq)TO z8CI=T2e!>d4o1;%^L*z-ZVmkk?}y79e47ym9F6G6rK{tgZm)LuG|3S6zq&Cv_ATJQ zzcn12zM}mmb8&i&XndU>lg80mc;S+faEDP!pz(5jt)}e1BR+i)W$gkS?BO^%y+$++ zPmfuU>r;6?od!(;&=155%K>y+G}(Y2^UIJFmg_;948hS3wGByOIjMjq0qD6uv>YJc zf>D;?`0{PQwVQqvRxgiZ=&Jx!J{8Ka?nD&s8zxgyKam{dpJ0?9BX|wtIgzg?fN@GG z`A^aJ&k($h;0}Tb1b>d;E&!E>m&$TD1aATQ1yTMs`V;>B24a7K;4cx}L+~a5c!Pv* zxK4q{0DRg>97m)$;5LE-k-vgyG`@@DZm5~29MRut5QXwHEyr|xp0 zi)YqUjs>m<=G^Ei(p?^Od6|0O?ZC~zTrs*zn2M^g$o0s4DY!oAVmnyQ^~qsV5%2&=R4JitzDllz59*ThTGjTmO>sta1p?ewe9%HWBb}BY%91a zEw@67oagFffDP4B8q~P*ya`ttP3bhV+8eyjO+O!>d@eDW=wEWl{(Al^|FwX}yY8G!-SXY@2Q zVUr%ymo1f!6`)BMGifEpg(c4r5%`fjPjDKd4~-5@w4~Rfg+iGglO}4?W6}gqdQ6(g zNkp>E0*R-pDU0! 
zVfF zJA4c4Gf&O09emGTtLbG-&}X_qoG`V+fD}HlD{z)&FfN@O3CB#1PG25Da1Q6|sc39oXDl?DG-gllEgJi$$D}iUTJ9~B8qsOe z7{A7(>5QJ1+a^kGghpzs9wHh{uRw3n7(Knj%dvP`>1Zr|xt^mje~$J%!VuC{rDuY6 z*r&Hn);4JF&-&?@7M4Ay%0Bv_t{4{moSKLkT&|e0RmP=WlQp*a*X76>$H8CQ+~e*; z&XXqaP%B5I_#1ToErOpQ_%VXl5WI!p!8$GZrx@=Nf*T0(1de;?N2HFwL+tMnyp5oc z{DEV6sF^N*EcwLQDdZkMqe~zw*ITVtp!m(zVa@WMYwRnXbOqeQiYY75UoD{iYGLOR zNd~XG=Y1dLJ2+T~lIjPD;qb`URNY5E1RHTUAZxqAd}1<#4#`=?)qS#q<#x|o*K&14 zj35z15XUf(h?x0Gu;@yoK{I@KtK@hp|1%)Jb$TNCXV3=uDj}}z`03Ma@;_k^RpV2$ zv}Fv15AF{hR^jvP^mDPvXXBIc!~$x1I#9*aujzgEn~2`0wX=}I2N%g)Lu6wR8lk7* zi=w73vWjXWiY~{lr&TU{{4J0xS1(~LQ6&e@4m{qBW|pIlWipKM^i&&;C^^XQ=wJ}( z@Zidf;CUy@6>AYOj=xpq;0t5@gPLO39PAFsSRwL>DAk4^I)l#+X%8|=!?FH^oPw{K zwZj#J#DEM}0%Rkf-sL?-b2H%A#Zf(3(>mp7d)c~ReVh%lH%b>6v=@e1 zCu<$KnD#U%o`wYm91GPQX4W}b-L}9WvT(%O$+C6CjmW}jlZ`E&tZMs|0h_4P)qdZZ zJu|!O0ozS1^$5G7~~mM|O<17RMLdHutj{}dnY9~wCsh;Z;Fw@ncIBfb2kVLsB& zdu(#{na;LuU+=5lPT5A19er<|?v!&d-Q-hVa!<9L_4RhY+SBRl?(C5_be=uh+1u-D z>+XK-sP9%g|JNx180{`KIr5-x=A$1-|ba9+LC>2l@VsM`1{vf)EvA za^CUNZ}f@+?+=Fp5kA(hOh<{6^JP|h;Qa=}lNA8!v+pxrBWHlv#2Fzrb0&x_oEhRQ z&H}L&sROjPLTux*A+}94Dk&n&c+!Z$-pSi22;x>ze5Q+Xht;C!a)m7Y0o&LS6`$Xj|p0}vNUyvrOOTd zsI9-`e$=sl%YO3cEVO6ZeAM~A~HjnZ` zG$xw^5wWRJ&WZFAgz`m0oSc<>fduDAG?gtek}KQD(--0+P?HZ2i6a88q?{E4w2ApR zIfoQL?12}%QtPz~7ck2n8|(-OeNbyByeTbp^ASjADXnrIgKUcNp$kGWv>8JHa=U-m zv12eA=63|6gOMHSXu6|+Sm^)~oZmMzA`V6)koFBF5s&wgVKoem$X1^(5DADr->6Fm z7#nD=6FbfZkb94LXklEn?(Ypm-lzm>d zSabk#a+b?nH|r+rrkZcASZt%t+m?4;(es0wC)EMorjdJU0ZIR zm^?Ancgs8Nz5C9hi;Sprd5zC$o&Mfp*+J|uefUa;Kd`R?MVK_kfe=F&M+wmB1|w=L z0l$(ERDqs;L6$-}Jbki^m__pekW`4I-xx;Bp=l1-Q&0sW`W0K%uQ>_~WE3PI%2Mca z%phtop&dl^+bHZh&TzpvW)ur)FL8soMoHQ#XCQSjfX zAdUz~A2}B{YcDT$@ZS&g^Rh7J-*gF9b1biIna5|=%u$N2r@=pqe1Te3~7%t!ya$xb>5>lCys1(!68WxOv%8bZzvL);njVs@)5YmN|BA*L*(g z!msm7Cw5O&&*#_7TWf?@U;t@b4+%)N-U&#+)`M6H9E%Ii888-A$pznFd+2V)QmBae zWU0O(ONA|ai~@=T0&a3)Gk!1-dJ9dEPC}AG)k5C@FZyC3-}&JS7hv6Dfl+?6R+qF- z7u+!0iKb?Crx+{=RwZzNe|69>D*C5vLwN zn5#N>Xge9`aqD2t{VT(a8K4x*J2S)$wA|a!>X>oNG-lF{r-*%$5F}~^eN&#IdN7rB zh#Qfzv_1NSe$`{9)sGtvz>BEzuV8_eM^kXfanw3;3|Zmz@{FUe(!t z1%G6K7jSnAh(0MC$wqW&e+W2;0)7Zg0uD>w;E(l3LDwD_jt<9&C@3H$1SFfR)FZmL zZ0SdFGbWojzvvfm*UCktS(1Gs8Wa1XNsebi)m=D*MJ%D{01Qet1tJ%s0t#Qi_0Ero zd`zf@7Xd{A*%}b}Frq6F3+3#XKRgr)`yzs{6|z0q3O_Ux4k{Z00Rh)SW_=)H5}|Yu zIw#!;yRl7Nl^DVq;gA}a+~gw^@QaGn=nM1Vs4z0>(M5Ehs`2)v@~T96qg38F+qh8PBH3NzT@&oYuI2KEJMScRv`IVKmK~)N71whnx+F&p z@XVccs|@S*jCXv{eWiP%0BWq1IuoV!Qfd7RzvL*m)|@D;k>Jl!GbPMaC$=4w;IH=J zTu`b#@zBWb=werj+)vDgvdZzUCl)5x_3`n9t3h%#EIBsbuwS=NoRJ*vB}ZApQ6V`h z5)N!BbqvdnEeXeN$+3Gi+q8MluNYHq>%)Aeu=-aeOm_LtthwXZZHL6V1n@=3~o_f*aQB)*srGaZGt< zE2jTIax?=8cDDctw(c44NZ2<^_RSOLC3^+TWNWil*O_osOOEO(F;Tlms@;>QZIj^7 z(RQ!n{)WFlE;&x#=jZ$0_{TxX@g_ESmc9!i~>TzF_9_J0J9F_J*WGy}J$r3(k(&Lk&$|r*=AJp1=n){{D zz^#>|DUKarQ%WF=WrVniV>nY86E_7BU^0S6oeQco1X55fEe5qsIF}TJUUR*ko4QPJ zJ(}srJkL-0AndEmK=%o9q7>@u@moZ32z0hAbB(=viSQk>qTCQBLzvGu{{z8OBpiYA zDLe)RXu?Q@3QQruYkWsIimx3Qbz+1IB6MMd9jCGng>m#nk&VbXz>xX@r6ceUY-cze z8x9M&8D%4wGQfa1)Bvg#zw1j;Xl35 z8_;a5jR5~LMZm;5zz@JX;r}pysjB*#DFuy7sA-{VcEfbfPb&Ye?#FeD&ZFbq%XX)R z->!tCN^(>s9CebT4&Xcqot2Vf>yjgxnGWAi5Wb(-8L$N$nmn|$q2QYFqcb<&y8hNg zeCFuPnfdaabM{1Qm(<#|xalNtk5^71Qo4HB@$QxK%Bcgh2C1rPp}aYj9i@&)cL*8d zQ&`N*w%JR!8zjfx^a1`F2l%LJeI)(}_Unu2I)KEFo(G9jjSwl=fgB(s0H&*Yj+PZC z=%$_aplT}6%BYg74!W)(vFe8>))loG9tb-YP2&e8@N;9B@p1!L{q%%|p?S&4B`Z-#nU3R(^ zyONGE}7#xU+y$5b$IP za#nvd6yp0uVLyI8fuGGBe|~sC5kpYk100o|sWQ46Vj0V6%e#FI{AycGW4vqP=RRs`2igL{q2K)S0Te zebRWVb-MMh_9d#CCHNx>Q!^?|)kI-(fx-wHp548Y!KU4y^-UFk&ICH-`t4;IH@7@t 
zjJbOr+8D>i58_wi6YWxd`C?_$Y)7KGLu&50cSdSHAyuCEGXKO-kd#5K~aYvt74+WdFKsOQNh%DnlK(tY!Sds&TQfX1S*B*5LHu^2V)G+^x`bXtq(RX^}RzByU`xZY|1RWTXp@ zs)sv3{&>>D6qclF?rE#Blre{u%3XJB6OA2GV@IYY+i|OVx_h=js!py>6Ru7btj-6o zU3m>iX)j;VwErcC6P5nWP{h53N?BjpU27<(Ik0zdSFTztFDw#)qW5g&^t%y>06J8N zhtVd3!q6d4RXJ%6sE?|8EUJQ%p6Us;bp)QZ)u>N_8DgNI_CoVpQz(H^tjW|x#3K@d z?qnk6p~Ak77y_-L9+VL@G3aGe23YDwFfg;=0R$d9V+LX-HgLui+NnZH5N(VZfir0( z!8Dy-6I3K6z%18g)F8Z{{1!V^6?$lY#5D0@dnP$nZ8o}sw;sz~gJq%!MO&lKHFlzY ztEj?e1Z7{)*68zvT5ZW_5?;T9R$E_6t81P&I&=DbAquRCQnWStBnFTk6DkNgXEcby zMn<3~9UPieAr@-u?8M7#@K8o&c!jb@p8h(Ti`Sw)6=UEcfEr{DLTSY+a~M*diZ!Jy za8*+s=h|W;k$y#lLRmiYD2%4c zYNW4p8O`eJ5ApuU@X#8b9?9Gfp{3aNXG&m6+e?$=YC?&L3OQ+JbTELRF=nPh{o!kBY+fH_7n%F~9az=WO9bf!>IeQW#l_8CE{Y?K_0&+h2g z*wMocXtv+7uGwC_tx$|sX);f8{(B|YUfmfPS)37@yCW%UZkJr!XZk*S`}W&&l~Vn| z1=pcP`{86GN?xM=AY{y)m+IRVThtW;5xc!?|6aS&wRIib7!Rb zBMUARC(~-Cj_85?@WVnN%flkZ<-U1f@<775U2<+;a5gCJK303eS|V9X64t6OtyLg^ zzB~P{WNmz4t$O+{-U{1@2xE;9r`e}eVJ6jUQlIg`S3GVRLC>!eE&OoD4vNnU)PYS7 zV^DhlXBFbot1@?WqyU>L80raVPciqDnlPTij0wQ2CekGZiIf`8=qvS38}BzsfL=i8 z%?uQvLBl@=VuKM%A!ucUxGKbu+ZWTAdCUS9eFM3DVXtmkgQBYaMQbjIsFxAq;P?|a z$1QP&ikom(e}XUzM8=Fj83VBmsX(q3((=XsZ*y0zXxeDknLEOIM$qO4jLXuauqzY? z1>|0%$vf*7n%DzAj7D%cB-l@wk#fO+LXx1vZ-mz|;|NB0&n+9c0eJ&>;PecKHRD7~ z@L~Z!7APFxBfJ2&38~ou$nj(;MuAgUlrXg(Orac>a+RC~uV5g+pco0VRrkZu&3Z-w zHT!R&X6!TwnqUMdDxKK+!ytMD!fo-*cPHPS=~I9*QGZl|zw)E^{8IVx`yGkyx25j4 z6W!+}_}g+G5zF~Y?FRPgOW7ravr5TYIkkJy>Y+EaDdkl7S1zmdZ*A$Fg`Kdt2{syf6OL-uEyY!PU-W=O(N!$?Ad&Vls#t$y&2$t^2n9HA(iS z+4ecxz0Uj1^RK`0Q~R4*haco#$^Ak8(-*p^0TP&AfyAEyP;=}Ut9dS7d4@i6?g+q& z0ZOSL3e6-#+<<}*_zNaA28MxhIv}N#OMxmd&yShFwF!U&TSz*uAwwYnjeIESjj>mY zOw84G6En_SVBr@5R&ZR(p`}2_GSdSEV2j5H0ywpql2>$Sa_c7_lMxQB3I12%R~*cr zvnLrgjc;8sudkY&<6%e)l+q&wLrB3wt{wrDTtf<>%z93raZ)aYP@r()J$amX&A@%E zI(hy37^R%XT{K^I4(@%+n7?3euz=0mfv5v$!TcNJJFEbU!Fa4G;xdA=BJ2{u7AGcW z`I9~{c;-D8#ad_;2Lmzi1B}T^3ZP0h1cS0EG9<G9Q-pj-0Rayoe(#yb?cbx(HB6o3T#pr{^*=WtGlK#qZg^-$YoX1Sz3WK{IP57*a|J%?{>J44M zO)89G^a@5Eh-77~7uOl<#s9#DsWCE4t3&`DtC`Ka~{UJtwj8XdV1T;gEs0<7*E6h!gkFid3$>V*6P+%KtKF+RMFji)ct$Iq}w;01= z&~Xu2gmVxnp5YDW;b&}cuMktb#fkE)errhn@J;cMsEh>8`_J>C21Pd~VA7Bd8syB$ zS?Fd5ZOTsii-`tQzLUS0fV3D0#T3^+L^k4fr+7evqQRs%paVNnywHgkhnx*!RX6eA zQ`CJzu5=R0)&t549TKiWW#o}DvG|D37#IkWSZ%z>|%yx*8|Si^4^ n8a;F|Hs^S8BD+k=E}M70vXH&~J|=&J2uc4Jp3g?r diff --git a/cuslines/cuda_python/__pycache__/cu_tractography.cpython-312.pyc b/cuslines/cuda_python/__pycache__/cu_tractography.cpython-312.pyc deleted file mode 100644 index 67a83039c23c63c11bb41d1469710e789f7c805e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9441 zcmcgSS!^3emessTHN}=hT@)ozr%fN!WyQAQ9F}Fvb{tPe_Rhr2W(k_zqDb?a?pD^~ zYOuy4AM(Hh+sFczkSwf^`EZZ`>i`2x=4-vc02^TU!yOT6p+|!;FvbG;Sb!6G5MY4? 
z_Ej~jn-W?ZJF}RQSoP|7udY|`sOo?B`+W#P`~SR?S?NIN4>&`EqXumM0)TsnM?8~5 zc_z-}ZE+ie^R}El?}$4HY|lCKuDH7fd*WWGx95CuAEb_4L!OPZd4JrWZ;UsRHqKli z&&9cXQ@n}fUAg9bOT2|aHYE76`{^Lely`r^=(%{S4b39n^FHFepV(?;ZASpaOU!r zw@zz;)Pj(@b_$xG7R90{0lntgxBhA-BMK=wQ!LC1vJA9ari(>+S0VJ%a!C;Q>7<-m zcuOpnlIf&e6tzacza)!7GM~#71SzGDS&zGFneER3xQ7H3XLuC1c@h5OgFGYHd7I$i z?eKHJ&j~*l{M>>w%hnUpVcxSxxp$9p-yY=+f}3aI=NH`bA-?f_XWYXF-bb*p9K=4p z3E~F68De&+CEBX_V2+o?Wa=8Mjg3!h-hA?UB3Vf11dWwb33)*jq=jOR*SwM}loC>= zENDSd$R%aiszgahUaPgyg857#Aql0V2zA9mjfj&^%E@`nmltF)lS<^1B`qM876eg9 zz*ftdlptwrtw7+@kaHH&rA0~eh=nwe6L`&sq2m0!B*>}8x}|diY=_@Aa^fwJYkY=K zw#Hp`ZPBt@rY8e^%257WMhQ%Tgzv!j?167k{SJKZ9{4c!=jz`l(}u{3fp*LZuM7#x z?kq=PGyQ8AGtAseO8_SF1m^*6s4F*ERi+G&mVH(kU46Sny|b>~m{(o@uDW`>1tiq1X+vFo$fBODt2g$z4&Pr_ zA2#q`HlSB1lzo*#X386@L#Z%GHgQkVmf2|vndv7rkYfI{5%%qfSx=$m#%nVOiDCIP zE#aNkEk8uOYq=3(_dgk{`VWkJhT>((XKUP3j~aZ8R0!pR;P+!nrQ5tZlDlLLv!Vd=`^TMN2V}e)Z9ejC3uZBRUYJ; zb}2n5YOGg#katVO)&PB&PZpACflpvkA{NImUC@Myl4+ax63JYym`cK?Mgy8d5_0q3 zG2NXzLAGGm5@=4s3KCwn$jb-3ixRPq0N|5}(vrMTEC5V6 zL@uVqWNBe3R$9_Li9`mLHIXQ{nCum!g?Kll{{$U=13inNwhpy5sCQj|N3?%d=Xc4KAD{Q)jOZPqj<&v*L&y4KOSG@Kgv7^e%iU!*138}4INTKhaUV; z35{&DjY5-#t%lY+S1X~zm4?_>cVxX;?VC{gCe*&;O5gF%`;@*D8{Mz0%>4Y~lkUFz zSJtkq%-p)T+1Y)+bFFiIZms9Rw9*lK*sru70b1;n<{%{QtXyeUp91aHTSHC9?bmn z`H#;(IPviBA1yvge-VG&`}RijmB;Lrr=G^!4L2Je2akV#uC!yXI$Ik=Xu-zzSm~4>2LW`JlSbE+A zp_;9)g)$tdQedF|68f0ILeFa)g(xGQfJPEVjEGntL1jWHfZ-2Z7u>;hwbIMDJ(dnP z(58pI-FUOH;_287w5&9UuS4mM5n_XhjPM3PWQ0(#JBZ8@l5r~%U=~=4>k6Q&S0Ty^Le?%Fj`F99b+>L z7qk9($F^gPhIvy*=l^SrF_R{RzSkH>DYlvZS?f4K=~x-JRNiG@qOHsnc;c9uqtJ`b zNuDxr4P%~`fGcD;Fkre{$ter7-}*$<`22N~Hih=!ZVRU%J412IbO$HCz$vLS{Lh-y zDD+89Zkzo19qVK`axb(_7N=5QCs-4!Q}un;#EQ-ylH-Nf$W zjlkh`W81H}BntC_D1viDydk)Fgi?30gs&i_vI}1{9v;)2I2|p!NpfiNa&(`FZy_|N z2v?ng?gkP64zhs z!+5TNe<~iw=(bc!YkgC9>)<-OC@#$jQgKmC30l~)_y_t7#3LBnLw!Bb1zF-#gUbh3 zvEn4I_Ti4-ByDt88!HRqZ;MAU)Q6)1+|pIJx+Dvd?);78V$4%D0_2umb1ftbJly`e zQur!C0%sgMp35^5vrL6BEBv3(zqp;O-YL2%O%+x5|`8Xi-^V-Nc` z!jl#DrA?N*ec|Q>mF-p7-kPg(Qi+`03U;Z%F(o**ZD;x>wh_~GL~%62Pkx5^GG?BEvHtaAGmZa?uaUsl2|e<4=FbI|!jQmm!xnE)XRW_=y)a5gfbwsu2r~?1NqZs`-MxVmyFW1oV z0;=q=!VcG#QAcmPclPet56-K>AtgAp&a1;mmEoi6@CgO}gC{VlS23yAFsav>XCCBl zx&6k?Hrwy(BdF(n*><+#V*Wmo4EE~0Y#6mB0b@!j?5i==YH3I`o`t`6Qmuy)}7 z$l8d)!903YZc^bUx4n+;qtB4Tf9x5ovfhEa!bWy>P`BQgc;Ie8C#!M?74D!;90=>z z2zWcJ$%iqI)yAw7SQi!U0NKz%g&Wk#sobc-jS`|dK{Yg?z&|(fTx16nZeV>{Uk17a zn7dT&u)-Y%X1nLP>-nIzI_raK|Af*%q4pnB;Ga9TM;H3Scq?o_J{Ge#XH|AUVWFR| z*nQP^7~uTs-!Q0)c;LE!iv=DHSeIkZb^B*uU2*g@{|4q8&S@N!|Kds3l& z(UkOQ;R}C{KKGrAVXkc!FYT^i?hNd|acfbxn^vB* zrKhZ~A{_%wLGLrSIvi-p7PVh2kIcaRwOCx*bvF&FDl#t$LajKGC#n^0sn0|MqM_f! 
zb1?Q@N?yc{b)=XJG~J(Q{iUe zic1X)DS@H&z0%_Z?1p3#pj~12Kh<^iq!ONd$p0$$S?1nXlN{YA8sf?lC_Bw}+sY8SCpt9j{_yMSLf669e6UZvqV z1zsYkiQ~kV^HRN)qXpAKL4b#FA)#ODlMC{4Z`}=~Rf7l6rF0M?av7<5!ip!XdWIFx z@cLrKbC~SIfD#x`1CvT%awBl`vF9jRIruK(YkWgRUVcav_2c{YLNYHT5}GfO$QSv= z9ERCM;wOvAoL(d1gR42`b47SW!Db@7o$&hWO%#)R4M#ILI)x)1M;VAdMx2>^|;{JYjpPPDsJj2zFN881kU@aMg<;A1-Ua5Q};T zt9}eMVkm$i4sp%5%Qwr_CY)D-#uS#!MPv`h3|*fLe*BBYeVgw_p*1h)pnfg zKwL|;6GI`?(pK%lP#Cp#Y~%5K^B!Y!2EXwfclH9<;%wbMU~?YRCxzz|ZP&ek*j4!I z1f+{ZUWW7+A!6qn6|vgKD!Pg6#quGRDvI!~g=6s|5Q9$|7W1X0+5+R(BA}>s(RY&= z#;LuV04!&6l5Xc?eN3LEwCEmf6FmLAC6OlIw@741^(V4@dm$M-O}?*?^cN;C5V$}d zvUI^ocC~~@@H&pjWxD5t&PK0_e+Oll;iL-?RqYJJ{2oPrhem&g+O}PYY56T`{MO-M xY=1-$ee1I`wm*3gbC!Yq_jRqbtKN{}4L$BUxZyo?!}YCe&c+=32H}|W_P-No7&-6`on{a>*qrTB3eQ$)=5_5uHRbY{zJX#HmCPj$=g@M9OFujTaPmEs z+1;gE(h|a`1}dQ!<|G?+;q=r5Mgbq2Q+@A+w2X?F3TTU@2i@E#ryh#FndM5dprn^R zU}xUU`0}ci24MJ`s~qo;IAS^5Hi`Xc5BjaklORx{r_FV z+JH7#<3MLfi^?%Iau3PFYA=ioN8`(-qBdSAEgR!k<`&Ynsb!0L-q5V^T*Y!3HLH}X_HxO9O*^k!W96#p19LV6 zfU){vxQE(-V12qR1?owc`y^@hx{4;%+~r{)_!k6M39b?0mR`!*#LK1&yVNP9vs78s zH6l9mg7~UigtVMfki?*FcF?&-=e3-<8iZ(M$xV0Dr(mCMVr%n4fbD!&sv3{j&{WE#w!o| zADwz|>baQM>5XppCR)9T&tKW>YTGXVbum`$_-0HFmez;~3)iw96uxdMg$5&AxBTXdCOEw}J4 z$lUcBwrNz14q&2s7>IEd9V_LTQ~=)|2Y{4QQ52w<5N6rER_muQ=m04PSKd2!S4RO# z3}`0pgM>hYJ6XM=nLmV;No&T6M?AV_z#Z##@MY01@02^7Th?;7lTdPo6U#gYGWuCM z40Y5NL+ja3s=xYp?c<%pHM#Wk^p|h_>D@2ieLnM1^QPL=ZnbVMZOtri#qwKSg{H5t zBTDO+etEZ%eI^dH!;u|-aQ()wWB2hh|4=*J*T`;Eo4&!nQ?9YNUTgY>oZ9YRI&eDW zS#9atPHVe@Yrv_v20sft$RRu2tdjuRr0~e#EWwKwHFXi~Rr7b0O4+;!dlT?k6zA|~ z!B7Klq#BbOSMQ&01!7G;#zIJ(A|cuWV~cr1g+{VicRF1sKs%JHgfq&-o1?vK5niJ$ zLcv5v+^1lS+IOCUF{N|!vPXdU_wQvQQT776qlxHT?B|_LWjgcWT#5)PJ%R)rJ!h|* z*)v|5Oe^WkJn1*}3#`tgO)vr*`Us33vXz*+F+P7XF2OHwI2&NZ7)D>eCMM(gjld!mgqo1>qO-cNsa=J#iRceXkFMtdZFEdN5_hR6317dcnI_QH#X z;!VD&1AOCGfulx|Wv! 
From 1c6dd28077bb17efb25b88cb8bedbd03a366e4ff Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 16:22:23 -0800 Subject: [PATCH 22/31] focus in on only cuda python --- CMakeLists.txt | 70 -- README.md | 58 +- build_cuslines.sh | 19 - cuslines/Makefile | 60 -- cuslines/cuda_python/cu_direction_getters.py | 75 ++- cuslines/cuda_python/cu_propagate_seeds.py | 22 +- cuslines/cuda_python/cu_tractography.py | 166 ++++- cuslines/cuslines.cpp | 360 ----------- cuslines/generate_streamlines_cuda.cu | 634 ------------------- cuslines/generate_streamlines_cuda.h | 70 -- cuslines/ptt.cu | 2 +- merge_trk.sh | 99 --- pyproject.toml | 3 + run_gpu_streamlines.py | 258 +++----- setup.py | 1 - 15 files changed, 329 insertions(+), 1568 deletions(-) delete mode 100644 CMakeLists.txt delete mode 100755 build_cuslines.sh delete mode 100644 cuslines/Makefile delete mode 100644 cuslines/cuslines.cpp delete mode 100644 cuslines/generate_streamlines_cuda.h delete mode 100755 merge_trk.sh diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index f27d490..0000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,70 +0,0 @@ -cmake_minimum_required(VERSION 3.24) - -project(cuslines LANGUAGES CUDA CXX VERSION 1.0) - -# Build settings -set(CMAKE_CXX_STANDARD 11) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -Werror=reorder") -set(CMAKE_CXX_FLAGS_RELEASE "-O3") - -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release") -endif() - -if (CMAKE_BUILD_TYPE STREQUAL "Debug" ) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}") -else() - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}") -endif() - -# CUDA -find_package(CUDAToolkit REQUIRED) - -# Set default CUDA compute capabilities if unset -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - include(FindCUDA/select_compute_arch.cmake) - cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS Auto) - set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_FLAGS}) -endif() -message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") - -# OpenMP -find_package(OpenMP) -if(OPENMP_FOUND) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - - # Set OMP runtime based on compiler - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(OMP_RUNTIME "INTEL") - elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - set(OMP_RUNTIME "GNU") - elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - set(OMP_RUNTIME "INTEL") - endif() - message(STATUS "OpenMP runtime used: ${OMP_RUNTIME}") -endif() - -# Find pybind11 -execute_process(COMMAND python -c "import pybind11; print(pybind11.get_cmake_dir())" - OUTPUT_VARIABLE pybind11_DIR - OUTPUT_STRIP_TRAILING_WHITESPACE) -list(APPEND CMAKE_PREFIX_PATH ${pybind11_DIR}) -find_package(pybind11 REQUIRED) - -# Build library and pybind11 module -add_library(cuslines_kernels) -target_sources(cuslines_kernels - PRIVATE - ${CMAKE_SOURCE_DIR}/cuslines/generate_streamlines_cuda.cu) -set_target_properties(cuslines_kernels PROPERTIES OUTPUT_NAME cuslines_kernels - POSITION_INDEPENDENT_CODE TRUE) - -pybind11_add_module(cuslines ${CMAKE_SOURCE_DIR}/cuslines/cuslines.cpp) -target_include_directories(cuslines PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" "${CUDAToolkit_INCLUDE_DIRS}") -target_link_libraries(cuslines PRIVATE cuslines_kernels CUDA::cudart_static) - -# Install -install(TARGETS cuslines cuslines_kernels LIBRARY DESTINATION .) 
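With the CMake build deleted above (and the Makefile and `build_cuslines.sh` deleted below), the package no longer ships a compiled `cuslines` extension. A short sketch of what that means for imports; the new module paths are taken from the diffs in this patch, while the old `import cuslines` form is inferred from the deleted pybind11 bindings:

```python
# Old entry point: the compiled pybind11 module built by CMake/Make (removed by this patch).
# import cuslines

# New entry point: the pure cuda-python package that remains.
from cuslines.cuda_python.cu_tractography import GPUTracker
from cuslines.cuda_python.cu_direction_getters import BootDirectionGetter
```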
diff --git a/README.md b/README.md
index 5cda7f5..a1e98fa 100644
--- a/README.md
+++ b/README.md
@@ -48,63 +48,7 @@ Destroy GPUTracker...
 
 Note that if you experience memory errors, you can adjust the `--chunk-size` flag.
 
-To run on more seeds, we suggest enabling the `--use-fast-write` flag in the GPU script to not get bottlenecked by writing files. Here is a comparison running on 500K seeds on 1 GPU with and without this flag:
-
-Without `--use-fast-write`:
-```
-$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1
-parsing arguments
-Fitting Tensor
-Computing anisotropy measures (FA,MD,RGB)
-slowadcodf
-Bootstrap direction getter
-streamline gen
-Creating GPUTracker with 1 GPUs...
-Generated 143891 streamlines from 100000 seeds, time: 7.978902339935303 s
-Saved streamlines to small.1_5.trk, time 11.439777851104736 s
-Generated 151932 streamlines from 100000 seeds, time: 10.155118703842163 s
-Saved streamlines to small.2_5.trk, time 12.438884019851685 s
-Generated 146971 streamlines from 100000 seeds, time: 9.822870016098022 s
-Saved streamlines to small.3_5.trk, time 12.377111673355103 s
-Generated 153824 streamlines from 100000 seeds, time: 11.133368968963623 s
-Saved streamlines to small.4_5.trk, time 13.317519187927246 s
-Generated 162004 streamlines from 100000 seeds, time: 13.19784665107727 s
-Saved streamlines to small.5_5.trk, time 14.21276593208313 s
-Completed processing 500000 seeds.
-Initialization time: 14.789637088775635 sec
-Streamline generation total time: 116.0746865272522 sec
- Streamline processing: 52.28810667991638 sec
- File writing: 63.7860586643219 sec
-Destroy GPUTracker...
-```
-
-With `--use-fast-write`:
-```
-$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1 --use-fast-write
-parsing arguments
-Fitting Tensor
-Computing anisotropy measures (FA,MD,RGB)
-slowadcodf
-Bootstrap direction getter
-streamline gen
-Creating GPUTracker with 1 GPUs...
-Generated 143891 streamlines from 100000 seeds, time: 7.962322473526001 s
-Saved streamlines to small.1_5_*.trk, time 0.1053612232208252 s
-Generated 151932 streamlines from 100000 seeds, time: 10.148677825927734 s
-Saved streamlines to small.2_5_*.trk, time 0.1606450080871582 s
-Generated 146971 streamlines from 100000 seeds, time: 9.811130285263062 s
-Saved streamlines to small.3_5_*.trk, time 0.571892499923706 s
-Generated 153824 streamlines from 100000 seeds, time: 11.186563968658447 s
-Saved streamlines to small.4_5_*.trk, time 0.3091111183166504 s
-Generated 162004 streamlines from 100000 seeds, time: 13.282683610916138 s
-Saved streamlines to small.5_5_*.trk, time 0.7107999324798584 s
-Completed processing 500000 seeds.
-Initialization time: 14.705361366271973 sec
-Streamline generation total time: 54.24975609779358 sec
- Streamline processing: 52.39137816429138 sec
- File writing: 1.8578097820281982 sec
-Destroy GPUTracker...
-```
+To run on more seeds, we suggest enabling the `--trx` flag in the GPU script so that file writing does not become a bottleneck (see the usage sketch after this hunk).
 
 ## Running on AWS with Docker
 First, set up an AWS instance with GPU and ssh into it (we recommend a P3 instance with at least 1 V100 16 GB GPU and a Deep Learning AMI Ubuntu 18.04 v 33.0.).
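For the `--trx` suggestion in the hunk above, a minimal invocation sketch: the other flags mirror the benchmark commands this patch removes, and the bare `--trx` spelling is taken from the new README line (treat it as an assumption if the script expects an argument):

```
$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1 --trx
```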
Then do the following: diff --git a/build_cuslines.sh b/build_cuslines.sh deleted file mode 100755 index 8375223..0000000 --- a/build_cuslines.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -build_dir=$(pwd)/build -install_dir=$(pwd)/install - -# set up build dir -mkdir -p ${build_dir} -cd ${build_dir} - -# configure -cmake -DCMAKE_INSTALL_PREFIX=${install_dir} \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_C_COMPILER=gcc \ - -DCMAKE_CXX_COMPILER=g++ \ - -DPYTHON_EXECUTABLE=$(which python) \ - .. - -# compile -make && make install diff --git a/cuslines/Makefile b/cuslines/Makefile deleted file mode 100644 index 8fd8528..0000000 --- a/cuslines/Makefile +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - -CUDA_HOME=/usr/local/cuda -CUDACC=$(CUDA_HOME)/bin/nvcc # -G -g -dopt=on -CXX=g++ -LD=g++ - -CXXFLAGS= -c -O3 -std=c++17 -fopenmp -fPIC `python3 -m pybind11 --includes` -I$(CUDA_HOME)/include - -SMS ?= 75 80 -CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM)) -LASTSM := $(lastword $(sort $(SMS))) -CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM) - -COMMON_FLAGS = -c -std=c++17 -Xcompiler -fPIC --use_fast_math -Xcompiler=-fopenmp $(CUDA_ARCH) -RELEASE_FLAGS = -O3 -Xptxas=-O3 -DEBUG_FLAGS = -O0 -Xptxas=-v -g -G -lineinfo -CUDACFLAGS = $(COMMON_FLAGS) $(RELEASE_FLAGS) - -LDFLAGS= -shared -fopenmp -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt - -all: cuslines - -cuslines: generate_streamlines_cuda.o cuslines.o - $(LD) cuslines.o generate_streamlines_cuda.o -o cuslines`python3-config --extension-suffix` $(LDFLAGS) - -%.o : %.cu - $(CUDACC) $(CUDACFLAGS) $< -o $@ - -%.o: %.cpp - $(CXX) $(CXXFLAGS) $< -o $@ - -clean: - rm *.o cuslines`python3-config --extension-suffix` __pycache__/*.pyc diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index 135cb47..9901fc3 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -5,6 +5,8 @@ from importlib.resources import files from time import time +from dipy.reconst import shm + from cuda.core import Device, LaunchConfig, Program, launch, ProgramOptions from cuda.pathfinder import find_nvidia_header_directory from cuda.cccl import get_include_paths @@ -103,7 +105,7 @@ class _BootCtx(ctypes.Structure): class BootDirectionGetter(GPUDirectionGetter): - def __init__( # TODO: Maybe accept a dipy thing and extract arrays here? maybe as a from_ function? + def __init__( self, model_type: str, min_signal: float, @@ -142,6 +144,77 @@ def __init__( # TODO: Maybe accept a dipy thing and extract arrays here? 
maybe self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},{model_type.upper()},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" self.compile_program() + @classmethod + def from_dipy_opdt(cls, gtab, sphere, + sh_order_max=6, + full_basis=False, + sh_lambda=0.006, + min_signal=1): + sampling_matrix, _, _ = shm.real_sh_descoteaux( + sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False + ) + + model = shm.OpdtModel( + gtab, sh_order_max=sh_order_max, smooth=sh_lambda, min_signal=min_signal + ) + fit_matrix = model._fit_matrix + delta_b, delta_q = fit_matrix + + b0s_mask = gtab.b0s_mask + dwi_mask = ~b0s_mask + x, y, z = model.gtab.gradients[dwi_mask].T + _, theta, phi = shm.cart2sphere(x, y, z) + B, _, _ = shm.real_sym_sh_basis(sh_order_max, theta, phi) + H = shm.hat(B) + R = shm.lcr_matrix(H) + + return cls( + model_type="OPDT", + min_signal=min_signal, + H=H, + R=R, + delta_b=delta_b, + delta_q=delta_q, + sampling_matrix=sampling_matrix, + b0s_mask=gtab.b0s_mask + ) + + @classmethod + def from_dipy_csa(cls, gtab, sphere, + sh_order_max=6, + full_basis=False, + sh_lambda=0.006, + min_signal=1): + sampling_matrix, _, _ = shm.real_sh_descoteaux( + sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False + ) + + model = shm.CsaOdfModel( + gtab, sh_order_max=sh_order_max, smooth=sh_lambda, min_signal=min_signal + ) + fit_matrix = model._fit_matrix + delta_b = fit_matrix + delta_q = fit_matrix + + b0s_mask = gtab.b0s_mask + dwi_mask = ~b0s_mask + x, y, z = model.gtab.gradients[dwi_mask].T + _, theta, phi = shm.cart2sphere(x, y, z) + B, _, _ = shm.real_sym_sh_basis(sh_order_max, theta, phi) + H = shm.hat(B) + R = shm.lcr_matrix(H) + + return cls( + model_type="CSA", + min_signal=min_signal, + H=H, + R=R, + delta_b=delta_b, + delta_q=delta_q, + sampling_matrix=sampling_matrix, + b0s_mask=gtab.b0s_mask + ) + def allocate_on_gpu(self, n): self.H_d.append( checkCudaErrors(runtime.cudaMalloc( diff --git a/cuslines/cuda_python/cu_propagate_seeds.py b/cuslines/cuda_python/cu_propagate_seeds.py index 92efef3..72037c6 100644 --- a/cuslines/cuda_python/cu_propagate_seeds.py +++ b/cuslines/cuda_python/cu_propagate_seeds.py @@ -78,7 +78,7 @@ def _allocate_seed_memory(self, seeds): self.shDirTemp0_d[ii] = checkCudaErrors(runtime.cudaMalloc( REAL3_DTYPE.itemsize * self.gpu_tracker.samplm_nr * grid[0] * block[1])) - def _cumsum_offsets(self): # TODO: do this on device? not crucial for performance now + def _cumsum_offsets(self): # TODO: performance: do this on device? 
not crucial for now
         for ii in range(self.ngpus):
             nseeds_gpu, _, _ = self._switch_device(ii)
             if (nseeds_gpu == 0):
@@ -173,7 +173,11 @@ def _cleanup(self):
         self.nSlines_old = self.nSlines
         self.gpu_tracker.rng_offset += self.nseeds
 
-    def propagate(self, seeds):  # TODO: better queuing/batching of seeds, if more performance needed
+    # TODO: performance: better queuing/batching of seeds,
+    # if more performance is needed,
+    # given the exponential nature of streamlines.
+    # May be better to do in CUDA code directly.
+    def propagate(self, seeds):
         self.nseeds = len(seeds)
         self.nseeds_per_gpu = (self.nseeds + self.gpu_tracker.ngpus - 1) // self.gpu_tracker.ngpus
 
@@ -202,13 +206,15 @@ def propagate(self, seeds):  # TODO: better queuing/batching of seeds, if more pe
 
         self._cleanup()
 
-    def as_array_sequence(self):
+    def get_buffer_size(self):
         buffer_size = 0
         for ii in range(self.ngpus):
             lens = self.sline_lens[ii]
             for jj in range(self.nSlines[ii]):
                 buffer_size += lens[jj] * 3 * REAL_SIZE
+        return buffer_size
 
+    def as_generator(self):
         def _yield_slines():
             for ii in range(self.ngpus):
                 this_sls = self.slines[ii]
@@ -220,11 +226,7 @@ def _yield_slines():
                 yield np.asarray(
                     this_sls[jj], dtype=REAL_DTYPE)[:npts]
+        return _yield_slines()
 
-        return ArraySequence(_yield_slines(), buffer_size // MEGABYTE)
-
-    def to_trx():
-        raise NotImplementedError("Export to TRX not yet implemented")
-
-    def to_trk():
-        raise NotImplementedError("Export to TRK not yet implemented")
+    def as_array_sequence(self):
+        return ArraySequence(self.as_generator(), self.get_buffer_size() // MEGABYTE)
diff --git a/cuslines/cuda_python/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py
index eca62dd..1d34adc 100644
--- a/cuslines/cuda_python/cu_tractography.py
+++ b/cuslines/cuda_python/cu_tractography.py
@@ -1,10 +1,11 @@
 from cuda.bindings import driver, runtime
 from cuda.bindings.runtime import cudaMemcpyKind
-import cuda.core as cc  # TODO: consider cuda core over cuda bindings
 
 import numpy as np
+from tqdm import tqdm
 
 import logging
+from math import radians
 
 from cuslines.cuda_python.cutils import (
     REAL_SIZE,
@@ -17,29 +18,81 @@
 )
 from cuslines.cuda_python.cu_propagate_seeds import SeedBatchPropagator
 
+from trx.trx_file_memmap import TrxFile
+
+from nibabel.streamlines.tractogram import Tractogram
+from nibabel.streamlines.array_sequence import ArraySequence, MEGABYTE
+
+from dipy.io.stateful_tractogram import Space, StatefulTractogram
 
 logger = logging.getLogger("GPUStreamlines")
 
+# TODO: performance:
+# ACT
+# SCIL streamline reduction onboard GPU
+# Remove small/long streamlines on GPU
 
-class GPUTracker:  # TODO: bring in pyAFQ prep stuff
+class GPUTracker:
     def __init__(
         self,
         dg: GPUDirectionGetter,
-        max_angle: float,
-        tc_threshold: float,
-        step_size: float,
-        relative_peak_thresh: float,
-        min_separation_angle: float,
-        dataf: np.ndarray,  # TODO: reasonable defaults for floats, reorganize order, better names, documentation
-        metric_map: np.ndarray,
+        dataf: np.ndarray,
+        stop_map: np.ndarray,
+        stop_threshold: float,
         sphere_vertices: np.ndarray,
         sphere_edges: np.ndarray,
+        max_angle: float = radians(60),
+        step_size: float = 0.5,
+        relative_peak_thresh: float = 0.25,
+        min_separation_angle: float = radians(45),
         ngpus: int = 1,
         rng_seed: int = 0,
         rng_offset: int = 0,
+        chunk_size: int = 100000,
     ):
+        """
+        Initialize GPUTracker with necessary data.
+
+        Parameters
+        ----------
+        dg : GPUDirectionGetter
+            Direction getter to use for tracking, from
+            cuslines.cuda_python.cu_direction_getters
+        dataf : np.ndarray
+            4D numpy array with ODFs for prob/ptt, or diffusion data if
+            doing bootstrapping.
+        stop_map : np.ndarray
+            3D numpy array with stopping metric (e.g., GFA, FA)
+        stop_threshold : float
+            Threshold for stopping metric (e.g., 0.2)
+        sphere_vertices : np.ndarray
+            Vertices of the sphere used for direction sampling.
+        sphere_edges : np.ndarray
+            Edges of the sphere used for direction sampling.
+        max_angle : float, optional
+            Maximum angle (in radians) between steps
+            default: radians(60)
+        step_size : float, optional
+            Step size for tracking
+            default: 0.5
+        relative_peak_thresh : float, optional
+            Relative peak threshold for direction selection
+            default: 0.25
+        min_separation_angle : float, optional
+            Minimum separation angle (in radians) between peaks
+            default: radians(45)
+        ngpus : int, optional
+            Number of GPUs to use
+            default: 1
+        rng_seed : int, optional
+            Seed for random number generator
+            default: 0
+        rng_offset : int, optional
+            Offset for random number generator
+            default: 0
+        chunk_size : int, optional
+            Number of seeds processed per GPU in each batch
+            default: 100000
+        """
         self.dataf = np.ascontiguousarray(dataf, dtype=REAL_DTYPE)
-        self.metric_map = np.ascontiguousarray(metric_map, dtype=REAL_DTYPE)
+        self.metric_map = np.ascontiguousarray(stop_map, dtype=REAL_DTYPE)
         self.sphere_vertices = np.ascontiguousarray(sphere_vertices, dtype=REAL_DTYPE)
         self.sphere_edges = np.ascontiguousarray(sphere_edges, dtype=np.int32)
@@ -53,7 +106,7 @@ def __init__(
 
         self.dg = dg
         self.max_angle = REAL_DTYPE(max_angle)
-        self.tc_threshold = REAL_DTYPE(tc_threshold)
+        self.tc_threshold = REAL_DTYPE(stop_threshold)
         self.step_size = REAL_DTYPE(step_size)
         self.relative_peak_thresh = REAL_DTYPE(relative_peak_thresh)
         self.min_separation_angle = REAL_DTYPE(min_separation_angle)
@@ -61,6 +114,7 @@ def __init__(
         self.ngpus = int(ngpus)
         self.rng_seed = int(rng_seed)
         self.rng_offset = int(rng_offset)
+        self.chunk_size = int(chunk_size)
 
         checkCudaErrors(driver.cuInit(0))
         avail = checkCudaErrors(runtime.cudaGetDeviceCount())
@@ -98,6 +152,7 @@ def _allocate(self):
 
         for ii in range(self.ngpus):
             checkCudaErrors(runtime.cudaSetDevice(ii))
+            # TODO: performance: dataf could be managed or texture memory instead?
self.dataf_d.append( checkCudaErrors(runtime.cudaMalloc( REAL_SIZE*self.dataf.size))) @@ -153,6 +208,89 @@ def __exit__(self, exc_type, exc, tb): checkCudaErrors(runtime.cudaStreamDestroy(self.streams[n])) return False - def generate_streamlines(self, seeds): - self.seed_propagator.propagate(seeds) - return self.seed_propagator.as_array_sequence() + def _divide_chunks(self, seeds): + global_chunk_sz = self.chunk_size * self.ngpus + nchunks = (seeds.shape[0] + global_chunk_sz - 1) // global_chunk_sz + return global_chunk_sz, nchunks + + def generate_sft(self, seeds, ref_img): + global_chunk_sz, nchunks = self._divide_chunks(seeds) + buffer_size = 0 + generators = [] + + with tqdm(total=seeds.shape[0]) as pbar: + for idx in range(nchunks): + self.seed_propagator.propagate( + seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz] + ) + buffer_size += self.seed_propagator.get_buffer_size() + generators.append(self.seed_propagator.as_generator()) + pbar.update( + seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz].shape[0] + ) + array_sequence = ArraySequence( + (item for gen in generators for item in gen), + buffer_size // MEGABYTE + ) + return StatefulTractogram(array_sequence, ref_img, Space.VOX) + + # TODO: performance: consider a way to just output in VOX space directly + def generate_trx(self, seeds, ref_img): + global_chunk_sz, nchunks = self._divide_chunks(seeds) + + # Will resize by a factor of 2 if these are exceeded + sl_len_guess = 100 + sl_per_seed_guess = 3 + n_sls_guess = sl_per_seed_guess * seeds.shape[0] + + # trx files use memory mapping + trx_file = TrxFile( + reference=ref_img, + nb_streamlines=n_sls_guess, + nb_vertices=n_sls_guess * sl_len_guess, + ) + trx_file.streamlines._offsets = trx_file.streamlines._offsets.astype(np.uint64) + offsets_idx = 0 + sls_data_idx = 0 + + with tqdm(total=seeds.shape[0]) as pbar: + for idx in range(int(nchunks)): + self.seed_propagator.propagate( + seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz] + ) + tractogram = Tractogram( + self.seed_propagator.as_array_sequence(), + affine_to_rasmm=ref_img.affine) + tractogram.to_world() + sls = tractogram.streamlines + + new_offsets_idx = offsets_idx + len(sls._offsets) + new_sls_data_idx = sls_data_idx + len(sls._data) + + if ( + new_offsets_idx > trx_file.header["NB_STREAMLINES"] + or new_sls_data_idx > trx_file.header["NB_VERTICES"] + ): + print("TRX resizing...") + trx_file.resize( + nb_streamlines=new_offsets_idx * 2, + nb_vertices=new_sls_data_idx * 2, + ) + + # TRX uses memmaps here + trx_file.streamlines._data[sls_data_idx:new_sls_data_idx] = sls._data + trx_file.streamlines._offsets[offsets_idx:new_offsets_idx] = ( + sls_data_idx + sls._offsets + ) + trx_file.streamlines._lengths[offsets_idx:new_offsets_idx] = ( + sls._lengths + ) + + offsets_idx = new_offsets_idx + sls_data_idx = new_sls_data_idx + pbar.update( + seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz].shape[0] + ) + trx_file.resize() + + return trx_file diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp deleted file mode 100644 index f0b8690..0000000 --- a/cuslines/cuslines.cpp +++ /dev/null @@ -1,360 +0,0 @@ -/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include - -#include -#include -#include -namespace py = pybind11; - -#include - -// #define USE_NVTX - -#include "globals.h" -#include "cudamacro.h" -#include "generate_streamlines_cuda.h" - -using np_array = py::array_t; -using np_array_int = py::array_t; - -using np_array_cast = py::array_t; -using np_array_int_cast = py::array_t; - -// Handle to cleanup returned host allocations when associated Python object is destroyed -template -py::capsule cleanup(T* ptr) { - return py::capsule(ptr, [](void *f) { - T *g = reinterpret_cast(f); - delete [] g; - }); -} - -class GPUTracker { - public: - GPUTracker(ModelType model_type, - REAL max_angle, - REAL min_signal, - REAL tc_threshold, - REAL step_size, - REAL relative_peak_thresh, - REAL min_separation_angle, - np_array_cast dataf, - np_array_cast H, - np_array_cast R, - np_array_cast delta_b, - np_array_cast delta_q, - np_array_int_cast b0s_mask, - np_array_cast metric_map, - np_array_cast sampling_matrix, - np_array_cast sphere_vertices, - np_array_int_cast sphere_edges, - int ngpus = 1, - int rng_seed = 0, - int rng_offset = 0) { - - // Get info structs from numpy objects - auto dataf_info = dataf.request(); - auto H_info = H.request(); - auto R_info = R.request(); - auto delta_b_info = delta_b.request(); - auto delta_q_info = delta_q.request(); - auto b0s_mask_info = b0s_mask.request(); - auto metric_map_info = metric_map.request(); - auto sampling_matrix_info = sampling_matrix.request(); - auto sphere_vertices_info = sphere_vertices.request(); - auto sphere_edges_info = sphere_edges.request(); - - dimx_ = dataf_info.shape[0]; - dimy_ = dataf_info.shape[1]; - dimz_ = dataf_info.shape[2]; - dimt_ = dataf_info.shape[3]; - nedges_ = sphere_edges_info.shape[0]; - - delta_nr_ = delta_b_info.shape[0]; - samplm_nr_ = sampling_matrix_info.shape[0]; - -// No longer needed -#if 0 - // Error checking for template parameters. - // TODO: Need to make kernel more general. 
- if (delta_b_info.shape[0] != 28 || - sampling_matrix_info.shape[0] != 181 || - dataf_info.shape[3] > 160) { - std::cout << delta_b_info.shape[0] << " " << sampling_matrix_info.shape[0] << " " << dataf_info.shape[3] << std::endl; - throw std::invalid_argument("Input data dimensions not currently supported."); - } -#endif - - // Get number of GPUs - int ngpus_avail; - CHECK_CUDA(cudaGetDeviceCount(&ngpus_avail)); - if (ngpus > ngpus_avail) { - throw std::runtime_error("Requested to use more GPUs than available on system."); - } - - std::cerr << "Creating GPUTracker with " << ngpus << " GPUs..." << std::endl; - ngpus_ = ngpus; - - model_type_ = model_type; - max_angle_ = max_angle; - min_signal_ = min_signal; - tc_threshold_ = tc_threshold; - step_size_ = step_size; - relative_peak_thresh_ = relative_peak_thresh, - min_separation_angle_ = min_separation_angle, - - // Allocate/copy constant problem data on GPUs - dataf_d.resize(ngpus_, nullptr); - H_d.resize(ngpus_, nullptr); - R_d.resize(ngpus_, nullptr); - delta_b_d.resize(ngpus_, nullptr); - delta_q_d.resize(ngpus_, nullptr); - b0s_mask_d.resize(ngpus_, nullptr); - metric_map_d.resize(ngpus_, nullptr); - sampling_matrix_d.resize(ngpus_, nullptr); - sphere_vertices_d.resize(ngpus_, nullptr); - sphere_edges_d.resize(ngpus_, nullptr); - - //#pragma omp parallel for - for (int n = 0; n < ngpus_; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size)); - CUDA_MEM_ADVISE(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n); - // CHECK_CUDA(cudaMemPrefetchAsync(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, n)); - CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size)); - CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size)); - CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size)); - CHECK_CUDA(cudaMalloc(&delta_q_d[n], sizeof(*delta_q_d[n]) * delta_q_info.size)); - CHECK_CUDA(cudaMalloc(&b0s_mask_d[n], sizeof(*b0s_mask_d[n]) * b0s_mask_info.size)); - CHECK_CUDA(cudaMalloc(&metric_map_d[n], sizeof(*metric_map_d[n]) * metric_map_info.size)); - CHECK_CUDA(cudaMalloc(&sampling_matrix_d[n], sizeof(*sampling_matrix_d[n]) * sampling_matrix_info.size)); - CHECK_CUDA(cudaMalloc(&sphere_vertices_d[n], sizeof(*sphere_vertices_d[n]) * sphere_vertices_info.size)); - CHECK_CUDA(cudaMalloc(&sphere_edges_d[n], sizeof(*sphere_edges_d[n]) * sphere_edges_info.size)); - - CHECK_CUDA(cudaMemcpy(dataf_d[n], dataf_info.ptr, sizeof(*dataf_d[n]) * dataf_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(H_d[n], H_info.ptr, sizeof(*H_d[n]) * H_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(R_d[n], R_info.ptr, sizeof(*R_d[n]) * R_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(delta_b_d[n], delta_b_info.ptr, sizeof(*delta_b_d[n]) * delta_b_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(delta_q_d[n], delta_q_info.ptr, sizeof(*delta_q_d[n]) * delta_q_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(b0s_mask_d[n], b0s_mask_info.ptr, sizeof(*b0s_mask_d[n]) * b0s_mask_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(metric_map_d[n], metric_map_info.ptr, sizeof(*metric_map_d[n]) * metric_map_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(sampling_matrix_d[n], sampling_matrix_info.ptr, sizeof(*sampling_matrix_d[n]) * sampling_matrix_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(sphere_vertices_d[n], sphere_vertices_info.ptr, 
sizeof(*sphere_vertices_d[n]) * sphere_vertices_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(sphere_edges_d[n], sphere_edges_info.ptr, sizeof(*sphere_edges_d[n]) * sphere_edges_info.size, cudaMemcpyHostToDevice)); - } - - rng_seed_ = rng_seed; - rng_offset_ = rng_offset; - nSlines_old_.resize(ngpus_, 0); - slines_.resize(ngpus_, nullptr); - slinesLen_.resize(ngpus_, nullptr); - - streams_.resize(ngpus_); - for (int n = 0; n < ngpus_; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaStreamCreateWithFlags(&streams_[n], cudaStreamNonBlocking)); - } - - } - - ~GPUTracker() { - std::cerr << "Destroy GPUTracker..." << std::endl; - for (int n = 0; n < ngpus_; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - if (dataf_d[n]) CHECK_CUDA(cudaFree(dataf_d[n])); - if (H_d[n]) CHECK_CUDA(cudaFree(H_d[n])); - if (R_d[n]) CHECK_CUDA(cudaFree(R_d[n])); - if (delta_b_d[n]) CHECK_CUDA(cudaFree(delta_b_d[n])); - if (delta_q_d[n]) CHECK_CUDA(cudaFree(delta_q_d[n])); - if (b0s_mask_d[n]) CHECK_CUDA(cudaFree(b0s_mask_d[n])); - if (metric_map_d[n]) CHECK_CUDA(cudaFree(metric_map_d[n])); - if (sampling_matrix_d[n]) CHECK_CUDA(cudaFree(sampling_matrix_d[n])); - if (sphere_vertices_d[n]) CHECK_CUDA(cudaFree(sphere_vertices_d[n])); - if (sphere_edges_d[n]) CHECK_CUDA(cudaFree(sphere_edges_d[n])); - - if (slines_[n]) CHECK_CUDA(cudaFreeHost(slines_[n])); - if (slinesLen_[n]) CHECK_CUDA(cudaFreeHost(slinesLen_[n])); - - CHECK_CUDA(cudaStreamDestroy(streams_[n])); - } - } - - std::vector> generate_streamlines(np_array seeds) { - - auto seeds_info = seeds.request(); - int nseeds = seeds_info.shape[0]; - - std::vector seeds_d(ngpus_, nullptr); - int nseeds_per_gpu = (nseeds + ngpus_ - 1) / ngpus_; - - //#pragma omp parallel for - for (int n = 0; n < ngpus_; ++n) { - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaMalloc(&seeds_d[n], sizeof(*seeds_d[n]) * 3 * nseeds_gpu)); - CHECK_CUDA(cudaMemcpy(seeds_d[n], reinterpret_cast(seeds_info.ptr) + 3*n*nseeds_per_gpu, sizeof(*seeds_d[n]) * 3 * nseeds_gpu, cudaMemcpyHostToDevice)); - } - - std::vector nSlines(ngpus_); - - // Call GPU routine - generate_streamlines_cuda_mgpu(model_type_, max_angle_, min_signal_, tc_threshold_, step_size_, - relative_peak_thresh_, min_separation_angle_, - nseeds, seeds_d, - dimx_, dimy_, dimz_, dimt_, - dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d, - sphere_vertices_d, sphere_edges_d, nedges_, - slines_, slinesLen_, nSlines, nSlines_old_, rng_seed_, rng_offset_, ngpus_, - streams_); - - nSlines_old_ = nSlines; //store number of slines for next set of seeds - - // Update rng_offset for next set of seeds - rng_offset_ += nseeds; - - int nSlines_total = 0; - for (int n = 0; n < ngpus_; ++n) { - CHECK_CUDA(cudaFree(seeds_d[n])); - nSlines_total += nSlines[n]; - } - - std::vector> slines_list; - slines_list.reserve(nSlines_total); - for (int n = 0; n < ngpus_; ++n) { - for (int i = 0; i < nSlines[n]; ++i) { - REAL* sl = new REAL[slinesLen_[n][i]*3]; - std::memcpy(sl, slines_[n] + i*3*2*MAX_SLINE_LEN, slinesLen_[n][i]*3*sizeof(*sl)); - auto sl_arr = py::array_t({slinesLen_[n][i], 3}, // shape - {3*sizeof(REAL), sizeof(REAL)}, // strides - sl, - cleanup(sl)); - slines_list.push_back(sl_arr); - } - } - - return slines_list; - - } - - void dump_streamlines(std::string output_prefix, std::string voxel_order, - np_array_int roi_shape, np_array voxel_size, np_array vox_to_ras) { - - auto 
roi_shape_info = roi_shape.request(); - auto voxel_size_info = voxel_size.request(); - auto vox_to_ras_info = vox_to_ras.request(); - - START_RANGE("filewrite", 0); - - //#pragma omp parallel for - for (int n = 0; n < ngpus_; ++n) { - std::stringstream ss; - ss << output_prefix << "_" << std::to_string(n) << ".trk"; - write_trk(ss.str().c_str(), reinterpret_cast(roi_shape_info.ptr), reinterpret_cast(voxel_size_info.ptr), - voxel_order.c_str(), reinterpret_cast(vox_to_ras_info.ptr), nSlines_old_[n], slinesLen_[n], - reinterpret_cast(slines_[n])); - } - - END_RANGE; - } - - private: - int ngpus_; - int rng_seed_; - int rng_offset_; - int dimx_, dimy_, dimz_, dimt_; - int nedges_; - int delta_nr_, samplm_nr_; - - ModelType model_type_; - REAL max_angle_; - REAL tc_threshold_; - REAL min_signal_; - REAL step_size_; - REAL relative_peak_thresh_; - REAL min_separation_angle_; - - std::vector nSlines_old_; - std::vector slines_; - std::vector slinesLen_; - - std::vector dataf_d; - std::vector H_d; - std::vector R_d; - std::vector delta_b_d; - std::vector delta_q_d; - std::vector b0s_mask_d; - std::vector metric_map_d; - std::vector sampling_matrix_d; - std::vector sphere_vertices_d; - std::vector sphere_edges_d; - - std::vector streams_; - -}; - - -PYBIND11_MODULE(cuslines, m) { - m.attr("MAX_SLINE_LEN") = py::int_(MAX_SLINE_LEN); - m.attr("REAL_SIZE") = py::int_(REAL_SIZE); - - py::enum_(m, "ModelType") - .value("OPDT", OPDT) - .value("CSA", CSA) - .value("PROB", PROB) - .value("PTT", PTT); - - py::class_(m, "GPUTracker") - .def(py::init(), - py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg("ngpus") = 1, py::arg("rng_seed") = 0, - py::arg("rng_offset") = 0) - - .def("generate_streamlines", &GPUTracker::generate_streamlines, - "Generates streamline for dipy test case.") - - .def("dump_streamlines", &GPUTracker::dump_streamlines, - "Dump streamlines to file."); -} - diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu index db3c0e2..9b04e60 100644 --- a/cuslines/generate_streamlines_cuda.cu +++ b/cuslines/generate_streamlines_cuda.cu @@ -26,28 +26,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// TODO: its possible all the cpp should be refactored -// out into a separate file, but for now, they are just wrapped -// in these ifndefs -#ifndef __NVRTC__ -#include -#include -#include -#include -#include -#endif - #include #include -#ifndef __NVRTC__ -#include -#include -#include -#include // Might not be needed anymore? 
-#include -#endif - #include "cudamacro.h" /* for time() */ #include "globals.h" @@ -1774,618 +1755,3 @@ __global__ void genStreamlinesMerge_k( } return; } - -#ifndef __NVRTC__ -void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size, - const REAL relative_peak_thresh, const REAL min_separation_angle, - const int nseeds, const std::vector &seeds_d, - const int dimx, const int dimy, const int dimz, const int dimt, - const std::vector &dataf_d, const std::vector &H_d, const std::vector &R_d, - const int delta_nr, - const std::vector &delta_b_d, const std::vector &delta_q_d, - const std::vector &b0s_mask_d, const std::vector &metric_map_d, - const int samplm_nr, - const std::vector &sampling_matrix_d, - const std::vector &sphere_vertices_d, const std::vector &sphere_edges_d, const int nedges, - std::vector &slines_h, std::vector &slinesLen_h, std::vector &nSlines_h, - const std::vector nSlines_old_h, const int rng_seed, const int rng_offset, - const int ngpus, const std::vector &streams) { - - int nseeds_per_gpu = (nseeds + ngpus - 1) / ngpus; - - std::vector slinesOffs_d(ngpus, nullptr); - std::vector shDirTemp0_d(ngpus, nullptr); - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); - dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); - - CHECK_CUDA(cudaMalloc(&slinesOffs_d[n], sizeof(*slinesOffs_d[n])*(nseeds_gpu+1))); - CHECK_CUDA(cudaMalloc(&shDirTemp0_d[n], sizeof(*shDirTemp0_d[n])*samplm_nr*grid.x*block.y)); - } - - int n32dimt = ((dimt+31)/32)*32; - - size_t shSizeGNS; - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - if (nseeds_gpu == 0) continue; - dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); - dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); - - // Precompute number of streamlines before allocating memory - if (!((model_type == PTT) || (model_type == PROB))) { - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d - sizeof(int)*samplm_nr; // for peak_directions_d - getNumStreamlinesBoot_k - <<>>( - model_type, - max_angle, - min_signal, - relative_peak_thresh, - min_separation_angle, - rng_seed, - nseeds_gpu, - reinterpret_cast(seeds_d[n]), - dimx, - dimy, - dimz, - dimt, - dataf_d[n], - H_d[n], - R_d[n], - delta_nr, - delta_b_d[n], - delta_q_d[n], - b0s_mask_d[n], - samplm_nr, - sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), - reinterpret_cast(sphere_edges_d[n]), - nedges, - shDirTemp0_d[n], - slinesOffs_d[n]); - } else { - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt; - getNumStreamlinesProb_k - <<>>( - max_angle, - relative_peak_thresh, - min_separation_angle, - rng_seed, - nseeds_gpu, - reinterpret_cast(seeds_d[n]), - dimx, - dimy, - dimz, - dimt, - dataf_d[n], - reinterpret_cast(sphere_vertices_d[n]), - reinterpret_cast(sphere_edges_d[n]), - nedges, - shDirTemp0_d[n], - slinesOffs_d[n]); - } - } - - std::vector slinesOffs_h; - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - //std::vector slinesOffs_h; - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - if (nseeds_gpu == 0) { - nSlines_h[n] = 0; - 
continue; - } - slinesOffs_h.resize(nseeds_gpu+1); - CHECK_CUDA(cudaMemcpy(slinesOffs_h.data(), slinesOffs_d[n], sizeof(*slinesOffs_h.data())*(nseeds_gpu+1), cudaMemcpyDeviceToHost)); - - int __pval = slinesOffs_h[0]; - slinesOffs_h[0] = 0; - for(int i = 1; i < nseeds_gpu+1; i++) { - const int __cval = slinesOffs_h[i]; - slinesOffs_h[i] = slinesOffs_h[i-1] + __pval; - __pval = __cval; - } - nSlines_h[n] = slinesOffs_h[nseeds_gpu]; - CHECK_CUDA(cudaMemcpy(slinesOffs_d[n], slinesOffs_h.data(), sizeof(*slinesOffs_d[n])*(nseeds_gpu+1), cudaMemcpyHostToDevice)); - } - - std::vector slineSeed_d(ngpus, nullptr); - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - - CHECK_CUDA(cudaMalloc(&slineSeed_d[n], sizeof(*slineSeed_d[n])*nSlines_h[n])); - CHECK_CUDA(cudaMemset(slineSeed_d[n], -1, sizeof(*slineSeed_d[n])*nSlines_h[n])); - - // Allocate/reallocate output arrays if necessary - if (nSlines_h[n] > EXCESS_ALLOC_FACT*nSlines_old_h[n]) { - if(slines_h[n]) cudaFreeHost(slines_h[n]); - if(slinesLen_h[n]) cudaFreeHost(slinesLen_h[n]); - slines_h[n] = nullptr; - slinesLen_h[n] = nullptr; - } - -#ifdef DEBUG - printf("buffer size %zu\n", sizeof(*slines_h[n])*EXCESS_ALLOC_FACT*2*3*MAX_SLINE_LEN*nSlines_h[n]); -#endif - - if (!slines_h[n]) CHECK_CUDA(cudaMallocHost(&slines_h[n], sizeof(*slines_h[n])*EXCESS_ALLOC_FACT*2*3*MAX_SLINE_LEN*nSlines_h[n])); - if (!slinesLen_h[n]) CHECK_CUDA(cudaMallocHost(&slinesLen_h[n], sizeof(*slinesLen_h[n])*EXCESS_ALLOC_FACT*nSlines_h[n])); - } - - //if (nSlines_h) { - - std::vector slineLen_d(ngpus, nullptr); - std::vector sline_d(ngpus, nullptr); - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaMalloc(&slineLen_d[n], sizeof(*slineLen_d[n])*nSlines_h[n])); - - CHECK_CUDA(cudaMalloc(&sline_d[n], sizeof(*sline_d[n])*2*MAX_SLINE_LEN*nSlines_h[n])); - -#if 0 - size_t free_mem, total_mem; - CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem)); - std::cerr << "GPU " << n << ": "; - std::cerr << "GPU Memory Usage before genStreamlinesMerge_k: "; - std::cerr << (total_mem-free_mem)/(1024*1024) << " MiB used, "; - std::cerr << total_mem/(1024*1024) << " MiB total "; - std::cerr << std::endl; -#endif - } - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - if (nseeds_gpu == 0) continue; - dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); - dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); -#if 0 - std::cerr << "GPU " << n << ": "; - std::cerr << "Generating " << nSlines_h[n] << " streamlines (from " << nseeds_gpu << " seeds)" << std::endl; -#endif - - //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y); - switch(model_type) { // TODO: these may be better as separate functions, not as template specializations - case OPDT: - case CSA: - BootCtx* d_ctx; - BootCtx h_ctx; - h_ctx.min_signal = min_signal; - h_ctx.delta_nr = delta_nr; - h_ctx.H = H_d[n]; - h_ctx.R = R_d[n]; - h_ctx.delta_b = delta_b_d[n]; - h_ctx.delta_q = delta_q_d[n]; - h_ctx.sampling_matrix = sampling_matrix_d[n]; - h_ctx.b0s_mask = b0s_mask_d[n]; - CHECK_CUDA(cudaMalloc(&d_ctx, sizeof(BootCtx))); - CHECK_CUDA(cudaMemcpyAsync( - d_ctx, &h_ctx, sizeof(BootCtx), - cudaMemcpyHostToDevice, streams[n])); - - if (model_type == OPDT) { - genStreamlinesMerge_k <<>>( - 
max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], - metric_map_d[n], d_ctx, samplm_nr, - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - } else if (model_type == CSA) { - genStreamlinesMerge_k <<>>( - max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], - metric_map_d[n], d_ctx, samplm_nr, - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - } else { - // Should never reach here - } - - CHECK_CUDA(cudaFree(d_ctx)); - break; - - case PROB: - // Shared memory requirements are smaller for probabilistic for main run - // than for preliminary run - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt; - genStreamlinesMerge_k <<>>( - max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], - metric_map_d[n], nullptr, samplm_nr, - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - case PTT: - shSizeGNS = 0; // PTT uses exclusively static shared memory - genStreamlinesMerge_k <<>>( - max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], - metric_map_d[n], nullptr, samplm_nr, - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - default: - printf("FATAL: Invalid Model Type.\n"); - break; - } - - CHECK_ERROR("genStreamlinesMerge_k"); - } - - //CHECK_CUDA(cudaDeviceSynchronize()); - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaMemcpyAsync(slines_h[n], - reinterpret_cast(sline_d[n]), - sizeof(*slines_h[n])*2*MAX_SLINE_LEN*nSlines_h[n]*3, - cudaMemcpyDeviceToHost, streams[n])); - CHECK_CUDA(cudaMemcpyAsync(slinesLen_h[n], - slineLen_d[n], - sizeof(*slinesLen_h[n])*nSlines_h[n], - cudaMemcpyDeviceToHost, streams[n])); - - } - //}; - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaStreamSynchronize(streams[n])); - CHECK_CUDA(cudaFree(slineSeed_d[n])); - CHECK_CUDA(cudaFree(slinesOffs_d[n])); - CHECK_CUDA(cudaFree(shDirTemp0_d[n])); - CHECK_CUDA(cudaFree(slineLen_d[n])); - CHECK_CUDA(cudaFree(sline_d[n])); - } - -} - -#if 1 -void write_trk(const char *fname, - const /*short*/ int *dims, - const REAL *voxel_size, - const char *voxel_order, - const REAL *vox_to_ras, - const int nsline, - const int *slineLen, - const REAL3 *sline) { - - FILE *fp = fopen(fname, "w"); - if (!fp) { - fprintf(stderr, "Cannot open file %s for writing...\n", fname); - exit(EXIT_FAILURE); - } - - const char ID_STRING[6] = "TRACK"; - short DIM[3] = {1, 1, 1}; - float VOXEL_SIZE[3] = {1.0f, 1.0f, 1.0f}; - float VOX_TO_RAS[4][4] = 
{{1.0f, 0.0f, 0.0, 0.0f}, - {0.0f, 1.0f, 0.0, 0.0f}, - {0.0f, 0.0f, 1.0, 0.0f}, - {0.0f, 0.0f, 0.0, 1.0f}}; - //const char VOXEL_ORDER[2][4] = {"RAS", "LAS"}; - const float ORIGIN[3] = {0.0f, 0.0f, 0.0f}; - const float IMAGE_ORIENTATION_PATIENT[6] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - const int VERSION = 2; - const int HDR_SIZE = 1000; - - // write header - unsigned char header[1000]; - memset(&header[0], 0, sizeof(header)); - - long long int off = 0; - - memcpy(header, ID_STRING, sizeof(ID_STRING)); - off += sizeof(ID_STRING); - - if (dims) { - DIM[0] = dims[0]; - DIM[1] = dims[1]; - DIM[2] = dims[2]; - } - memcpy(header+off, DIM, sizeof(DIM)); - off += sizeof(DIM); - - if (voxel_size) { - VOXEL_SIZE[0] = (float)voxel_size[0]; - VOXEL_SIZE[1] = (float)voxel_size[1]; - VOXEL_SIZE[2] = (float)voxel_size[2]; - } - memcpy(header+off, VOXEL_SIZE, sizeof(VOXEL_SIZE)); - off += sizeof(VOXEL_SIZE); - - memcpy(header+off, ORIGIN, sizeof(ORIGIN)); - off += sizeof(ORIGIN); - - // skip n_scalaer(2b) + scalar_name(200b) + - // n_properties(2b) + property_name(200b) - off += 404; - - if (vox_to_ras) { - for(int i = 0; i < 4; i++) { - for(int j = 0; j < 4; j++) { - VOX_TO_RAS[i][j] = (float)vox_to_ras[i*4+j]; - } - } - } - memcpy(header+off, VOX_TO_RAS, sizeof(VOX_TO_RAS)); - off += sizeof(VOX_TO_RAS); - - // skip reserved(444b) - off += 444; - - if (voxel_order) { - memcpy(header+off, voxel_order, 4); - } else { - memcpy(header+off, "LAS", 4); - } - off += 4; //sizeof(VOXEL_ORDER[voxel_order]); - - // skip pad2(4b) - off += 4; - - memcpy(header+off, IMAGE_ORIENTATION_PATIENT, sizeof(IMAGE_ORIENTATION_PATIENT)); - off += sizeof(IMAGE_ORIENTATION_PATIENT); - - // skip pad1(2b) - off += 2; - - // skip invert_x(1b), invert_y(1b), invert_x(1b), swap_xy(1b), swap_yz(1b), swap_zx(1b) - off += 6; - - memcpy(header+off, &nsline, sizeof(int)); - off += sizeof(int); - - memcpy(header+off, &VERSION, sizeof(VERSION)); - off += sizeof(VERSION); - - memcpy(header+off, &HDR_SIZE, sizeof(HDR_SIZE)); - off += sizeof(HDR_SIZE); - - //assert(off == 1000); - if (off != 1000) { - fprintf(stderr, "%s:%s:%d: heder size = %lld, (!= 1000)!\n", __FILE__, __func__, __LINE__, off); - exit(EXIT_FAILURE); - } - - size_t nw = fwrite(header, sizeof(header), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } -#if 0 - // write body - long long maxSlineLen = slineLen[0]; - for(long long i = 1; i < nsline; i++) { - maxSlineLen = MAX(maxSlineLen, slineLen[i]); - } - - float *slineData = (float *)Malloc((1+3*maxSlineLen)*sizeof(*slineData)); -#else - float slineData[1 + 3*(2*MAX_SLINE_LEN)]; -#endif - for(int i = 0; i < nsline; i++) { - reinterpret_cast(slineData)[0] = slineLen[i]; - for(int j = 0; j < slineLen[i]; j++) { - slineData[1+3*j+0] = (float)((sline[i*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]); - slineData[1+3*j+1] = (float)((sline[i*2*MAX_SLINE_LEN + j].y+0.5)*VOXEL_SIZE[1]); - slineData[1+3*j+2] = (float)((sline[i*2*MAX_SLINE_LEN + j].z+0.5)*VOXEL_SIZE[2]); - } - nw = fwrite(slineData, (1+3*slineLen[i])*sizeof(*slineData), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } - } -#if 0 - free(slineData); -#endif - fclose(fp); - - return; -} -#else -void write_trk(const int num_threads, - const char *fname, - const /*short*/ int *dims, - const REAL *voxel_size, - const char *voxel_order, - const REAL *vox_to_ras, - const int nsline, - const int *slineLen, - const REAL3 *sline) { - - FILE *fp = fopen(fname, "w"); - if (!fp) 
{ - fprintf(stderr, "Cannot open file %s for writing...\n", fname); - exit(EXIT_FAILURE); - } - - const char ID_STRING[6] = "TRACK"; - short DIM[3] = {1, 1, 1}; - float VOXEL_SIZE[3] = {1.0f, 1.0f, 1.0f}; - float VOX_TO_RAS[4][4] = {{1.0f, 0.0f, 0.0, 0.0f}, - {0.0f, 1.0f, 0.0, 0.0f}, - {0.0f, 0.0f, 1.0, 0.0f}, - {0.0f, 0.0f, 0.0, 1.0f}}; - //const char VOXEL_ORDER[2][4] = {"RAS", "LAS"}; - const float ORIGIN[3] = {0.0f, 0.0f, 0.0f}; - const float IMAGE_ORIENTATION_PATIENT[6] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - const int VERSION = 2; - const int HDR_SIZE = 1000; - - // write header - unsigned char header[1000]; - memset(&header[0], 0, sizeof(header)); - - long long int off = 0; - - memcpy(header, ID_STRING, sizeof(ID_STRING)); - off += sizeof(ID_STRING); - - if (dims) { - DIM[0] = dims[0]; - DIM[1] = dims[1]; - DIM[2] = dims[2]; - } - memcpy(header+off, DIM, sizeof(DIM)); - off += sizeof(DIM); - - if (voxel_size) { - VOXEL_SIZE[0] = (float)voxel_size[0]; - VOXEL_SIZE[1] = (float)voxel_size[1]; - VOXEL_SIZE[2] = (float)voxel_size[2]; - } - memcpy(header+off, VOXEL_SIZE, sizeof(VOXEL_SIZE)); - off += sizeof(VOXEL_SIZE); - - memcpy(header+off, ORIGIN, sizeof(ORIGIN)); - off += sizeof(ORIGIN); - - // skip n_scalaer(2b) + scalar_name(200b) + - // n_properties(2b) + property_name(200b) - off += 404; - - if (vox_to_ras) { - for(int i = 0; i < 4; i++) { - for(int j = 0; j < 4; j++) { - VOX_TO_RAS[i][j] = (float)vox_to_ras[i*4+j]; - } - } - } - memcpy(header+off, VOX_TO_RAS, sizeof(VOX_TO_RAS)); - off += sizeof(VOX_TO_RAS); - - // skip reserved(444b) - off += 444; - - if (voxel_order) { - memcpy(header+off, voxel_order, 4); - } else { - memcpy(header+off, "LAS", 4); - } - off += 4; //sizeof(VOXEL_ORDER[voxel_order]); - - // skip pad2(4b) - off += 4; - - memcpy(header+off, IMAGE_ORIENTATION_PATIENT, sizeof(IMAGE_ORIENTATION_PATIENT)); - off += sizeof(IMAGE_ORIENTATION_PATIENT); - - // skip pad1(2b) - off += 2; - - // skip invert_x(1b), invert_y(1b), invert_x(1b), swap_xy(1b), swap_yz(1b), swap_zx(1b) - off += 6; - - memcpy(header+off, &nsline, sizeof(int)); - off += sizeof(int); - - memcpy(header+off, &VERSION, sizeof(VERSION)); - off += sizeof(VERSION); - - memcpy(header+off, &HDR_SIZE, sizeof(HDR_SIZE)); - off += sizeof(HDR_SIZE); - - //assert(off == 1000); - if (off != 1000) { - fprintf(stderr, "%s:%s:%d: heder size = %lld, (!= 1000)!\n", __FILE__, __func__, __LINE__, off); - exit(EXIT_FAILURE); - } - - size_t nw = fwrite(header, sizeof(header), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } - - // write body - long long maxSlineLen = slineLen[0]; - for(long long i = 1; i < nsline; i++) { - maxSlineLen = MAX(maxSlineLen, slineLen[i]); - } - - //omp_set_dynamic(0); - const int NTHREADS = num_threads > 0 ? 
num_threads : 1; - omp_set_num_threads(NTHREADS); - - const int NFLTS_PER_TH = 1 + 2*(3*MAX_SLINE_LEN); - float *slineData = (float *)Malloc(NFLTS_PER_TH*NTHREADS*sizeof(*slineData)); - - #pragma omp parallel - { - const int tid = omp_get_thread_num(); - float *__mySlineData = slineData+tid*NFLTS_PER_TH; -#if 1 - //#pragma omp for schedule(static) - for(int i = 0; i < nsline; i += NTHREADS) { - if (i+tid < nsline) { - reinterpret_cast(__mySlineData)[0] = slineLen[i+tid]; - for(int j = 0; j < slineLen[i+tid]; j++) { - __mySlineData[1+3*j+0] = (float)((sline[(i+tid)*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]); - __mySlineData[1+3*j+1] = (float)((sline[(i+tid)*2*MAX_SLINE_LEN + j].y+0.5)*VOXEL_SIZE[1]); - __mySlineData[1+3*j+2] = (float)((sline[(i+tid)*2*MAX_SLINE_LEN + j].z+0.5)*VOXEL_SIZE[2]); - } - } - #pragma omp barrier - if (tid == 0) { - for(int j = 0; j < NTHREADS; j++) { - if (i+j >= nsline) { - break; - } - nw = fwrite(slineData+j*NFLTS_PER_TH, (1+3*slineLen[i+j])*sizeof(*slineData), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } - } - } - #pragma omp barrier - } -#else - // streamlines are not required to be in any specific order inside the trk file... - #pragma omp for - for(int i = 0; i < nsline; i++) { - reinterpret_cast(__mySlineData)[0] = slineLen[i]; - for(int j = 0; j < slineLen[i]; j++) { - __mySlineData[1+3*j+0] = (float)((sline[i*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]); - __mySlineData[1+3*j+1] = (float)((sline[i*2*MAX_SLINE_LEN + j].y+0.5)*VOXEL_SIZE[1]); - __mySlineData[1+3*j+2] = (float)((sline[i*2*MAX_SLINE_LEN + j].z+0.5)*VOXEL_SIZE[2]); - } - nw = fwrite(__mySlineData, (1+3*slineLen[i])*sizeof(*__mySlineData), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } - } -#endif - } - free(slineData); - fclose(fp); - - return; -} -#endif -#endif // __NVRTC__ diff --git a/cuslines/generate_streamlines_cuda.h b/cuslines/generate_streamlines_cuda.h deleted file mode 100644 index 14105ce..0000000 --- a/cuslines/generate_streamlines_cuda.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __GENERATE_STREAMLINES_CUDA_H__ -#define __GENERATE_STREAMLINES_CUDA_H__ - -#include - -#include "globals.h" - -void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size, - const REAL relative_peak_thresh, const REAL min_separation_angle, - const int nseeds, const std::vector &seeds_d, - const int dimx, const int dimy, const int dimz, const int dimt, - const std::vector &dataf_d, const std::vector &H_d, const std::vector &R_d, - const int delta_nr, - const std::vector &delta_b_d, const std::vector &delta_q_d, - const std::vector &b0s_mask_d, const std::vector &metric_map_d, - const int samplm_nr, - const std::vector &sampling_matrix_d, - const std::vector &sphere_vertices_d, const std::vector &sphere_edges_d, const int nedges, - std::vector &slines_h, std::vector &slinesLen_h, std::vector &nSlines_h, - const std::vector nSlines_old_h, const int rng_seed, const int rng_offset, - const int ngpus, const std::vector &streams); -#if 1 -void write_trk(const char *fname, - const /*short*/ int *dims, - const REAL *voxel_size, - const char *voxel_order, - const REAL *vox_to_ras, - const int nsline, - const int *slineLen, - const REAL3 *sline); -#else -void write_trk(const int num_threads, - const char *fname, - const /*short*/ int *dims, - const REAL *voxel_size, - const char *voxel_order, - const REAL *vox_to_ras, - const int nsline, - const int *slineLen, - const REAL3 *sline); -#endif -#endif diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index 894d0bf..5684272 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -295,7 +295,7 @@ __device__ int get_direction_ptt_d( REAL3_T *__probing_pos_sh = probing_pos_sh + tidy; const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); - const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / step_size; + const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / (step_size / PROBE_FRAC); // This seems to work well const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d(dimt, pmf, REAL_MIN); #if 0 diff --git a/merge_trk.sh b/merge_trk.sh deleted file mode 100755 index c47412f..0000000 --- a/merge_trk.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. 
Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -usage() { - echo "$(basename $0) [ -v ] -o trk_outfile ..." -} - -if [ $# -eq 0 ]; then - usage - exit 1 -fi - -OUT_FILE="" -VERBOSE="" - -OPTS=$(getopt -o "vho:" -- "$@") -eval set -- "$OPTS" - -while true; do - case "$1" in - -o) - OUT_FILE=$2 - shift - shift - ;; - -v) - VERBOSE="1" - shift - ;; - -h) - usage - exit 1 - ;; - --) - shift - break - ;; - esac -done - -if [ -z $OUT_FILE ]; then - echo "Please provide an output file name with the -o option!" - exit 1 -fi - -# necessary when running via docker to expand again -# the parameter list turning spaces into separators -set -- $* - -TRK_FILES=("$@") -NTRKF=$(($#)) - -#echo $TRK_FILES -#echo $NTRKF - -if [ $VERBOSE ]; then - echo "Merging $NTRKF files into $OUT_FILE..." -fi - -head -c1000 ${TRK_FILES[0]} > $OUT_FILE - -NTRACK=0 -for((i=0; i<$NTRKF; i++)); do - if [ $VERBOSE ]; then - printf "%8d/%8d\r" $i $NTRKF - fi - NTRACK=$(($NTRACK + $(od -A none -t dI -j 988 -N4 ${TRK_FILES[$i]}))); - tail -c+1001 ${TRK_FILES[$i]} >> $OUT_FILE -done - -NTRACK=$(printf "%08X" $NTRACK) - -printf "\x${NTRACK:6:2}\x${NTRACK:4:2}\x${NTRACK:2:2}\x${NTRACK:0:2}" | dd of=$OUT_FILE bs=1 seek=988 count=4 conv=notrunc &> /dev/null diff --git a/pyproject.toml b/pyproject.toml index a1247c5..67877c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,9 @@ requires-python = ">=3.7" dependencies = [ "numpy", "nibabel", + "tqdm", + "dipy", + "trx-python", "cuda-python", "cuda-core", "cuda-cccl" diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index 7585e37..89bbd04 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -27,10 +27,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
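For reference, the deleted merge_trk.sh stitched per-chunk .trk files into a single tractogram: it copied the 1000-byte header of the first shard, appended the body of every shard, summed the little-endian int32 streamline count stored at byte offset 988 of each header, and patched the total back into the output. A rough Python equivalent of that logic, shown only for orientation (the merge_trk helper and its arguments are illustrative, not part of this patch set):

    import struct

    def merge_trk(out_path, in_paths):
        # TRK headers are 1000 bytes; n_count is a little-endian
        # int32 at byte offset 988 (see the write_trk layout above).
        total = 0
        with open(out_path, "wb") as out:
            with open(in_paths[0], "rb") as f:
                out.write(f.read(1000))          # reuse the first shard's header
            for p in in_paths:
                with open(p, "rb") as f:
                    hdr = f.read(1000)
                    total += struct.unpack_from("<i", hdr, 988)[0]
                    out.write(f.read())          # append the shard's body
            out.seek(988)
            out.write(struct.pack("<i", total))  # patch the merged streamline count

This is the same bookkeeping the od/dd pipeline in the shell script performed byte by byte; it became unnecessary once TRX output removed the per-chunk .trk shards.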
-import argparse # TODO: do this again, but for cuda python version +import argparse # TODO: get this running, commit it, then run clode cleaner import random import time -import zipfile import numpy as np @@ -41,22 +40,28 @@ from dipy.tracking import utils from dipy.core.gradients import gradient_table, unique_bvals_magnitude from dipy.data import default_sphere -from dipy.direction import (BootDirectionGetter, ProbabilisticDirectionGetter, PTTDirectionGetter) +from dipy.direction import ( + BootDirectionGetter as cpu_BootDirectionGetter, + ProbabilisticDirectionGetter as cpu_ProbDirectionGetter, + PTTDirectionGetter as cpu_PTTDirectionGetter) from dipy.reconst.shm import OpdtModel, CsaOdfModel from dipy.reconst.csdeconv import ConstrainedSphericalDeconvModel, auto_response_ssst from dipy.tracking.local_tracking import LocalTracking from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion -from dipy.reconst import shm from dipy.data import get_fnames from dipy.data import read_stanford_pve_maps import nibabel as nib from nibabel.orientations import aff2axcodes -from trx.trx_file_memmap import TrxFile, zip_from_folder +from trx.io import save as save_trx -# Import custom module -import cuslines +from cuslines import ( + BootDirectionGetter, + GPUTracker, + ProbDirectionGetter, + PttDirectionGetter, +) t0 = time.time() @@ -86,7 +91,7 @@ def get_img(ep2_seq): parser.add_argument("--chunk-size", type=int, default=100000, help="how many seeds to process per sweep, per GPU") parser.add_argument("--nseeds", type=int, default=100000, help="how many seeds to process in total") parser.add_argument("--ngpus", type=int, default=1, help="number of GPUs to use if using gpu") -parser.add_argument("--write-method", type=str, default="fast", help="Can be trx, fast, or standard") +parser.add_argument("--write-method", type=str, default="trk", help="Can be trx or trk") parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)") parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0") parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5") @@ -100,9 +105,9 @@ def get_img(ep2_seq): args = parser.parse_args() -if args.device == "cpu" and args.write_method != "standard": - print("WARNING: only standard write method is implemented for cpu testing.") - write_method = "standard" +if args.device == "cpu" and args.write_method != "trk": + print("WARNING: only trk write method is implemented for cpu testing.") + write_method = "trk" else: write_method = args.write_method @@ -137,15 +142,11 @@ def get_img(ep2_seq): tenfit = tenmodel.fit(data, mask) print('Computing anisotropy measures (FA,MD,RGB)') FA = tenfit.fa -FA[np.isnan(FA)] = 0 # Setup tissue_classifier args tissue_classifier = ThresholdStoppingCriterion(FA, args.fa_threshold) -metric_map = np.asarray(FA, 'float64') # Create seeds for ROI -# seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4)) -# seed_mask = seed_mask[0:args.nseeds] seed_mask = np.asarray(utils.random_seeds_from_mask( roi_data, seeds_count=args.nseeds, seed_count_per_voxel=False, @@ -154,20 +155,27 @@ def get_img(ep2_seq): # Setup model sphere = default_sphere if args.model == "opdt": - model_type = cuslines.ModelType.OPDT - print("Running OPDT model...") - model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal) - fit_matrix = model._fit_matrix - delta_b, delta_q = fit_matrix + if args.device == "cpu": + model = 
OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal) + dg = cpu_BootDirectionGetter + else: + dg = BootDirectionGetter.from_dipy_opdt( + gtab, + sphere, + sh_order_max=args.sh_order, + sh_lambda=args.sm_lambda, + min_signal=args.min_signal) elif args.model == "csa": - model_type = cuslines.ModelType.CSA - print("Running CSA model...") - model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal) - fit_matrix = model._fit_matrix - # Unlike OPDT, CSA has a single matrix used for fit_matrix. Populating delta_b and delta_q with necessary values for - # now. - delta_b = fit_matrix - delta_q = fit_matrix + if args.device == "cpu": + model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal) + dg = cpu_BootDirectionGetter + else: + dg = BootDirectionGetter.from_dipy_csa( + gtab, + sphere, + sh_order_max=args.sh_order, + sh_lambda=args.sm_lambda, + min_signal=args.min_signal) else: print("Running CSD model...") unique_bvals = unique_bvals_magnitude(gtab.bvals) @@ -185,158 +193,64 @@ def get_img(ep2_seq): roi_radii=10, fa_thr=0.7) model = ConstrainedSphericalDeconvModel(gtab, response, sh_order=args.sh_order) - # TODO: we shouldnt have to do this, also for CSA, but we populate delta_b, delta_q. - # we need to name change delta_b/delta_q and make it possible for them to be None, or something like this - delta_b = model._X - delta_q = model.B_reg - -if args.dg != "boot": - if args.dg == "prob": - model_type = cuslines.ModelType.PROB - dg = ProbabilisticDirectionGetter - else: - model_type = cuslines.ModelType.PTT - dg = PTTDirectionGetter - fit = model.fit(data, mask=(metric_map >= args.fa_threshold)) + fit = model.fit(data, mask=(FA >= args.fa_threshold)) data = fit.odf(sphere).clip(min=0) -else: - dg = BootDirectionGetter - -global_chunk_size = args.chunk_size + if args.model == "ptt": + if args.device == "cpu": + dg = cpu_PTTDirectionGetter() + else: + # Set FOD to 0 outside mask for probing + data[FA < args.fa_threshold, :] = 0 + dg = PttDirectionGetter() + elif args.model == "prob": + if args.device == "cpu": + dg = cpu_ProbDirectionGetter() + else: + dg = ProbDirectionGetter() + else: + raise ValueError("Unknown model type: {}".format(args.model)) # Setup direction getter args if args.device == "cpu": if args.dg != "boot": dg = dg.from_pmf(data, max_angle=args.max_angle, sphere=sphere, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle) else: - dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle) -else: - # Setup direction getter args - b0s_mask = gtab.b0s_mask - dwi_mask = ~b0s_mask - - # setup sampling matrix - theta = sphere.theta - phi = sphere.phi - sampling_matrix, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi) - - ## from BootPmfGen __init__ - # setup H and R matrices - # TODO: figure out how to get H, R matrices from direction getter object - x, y, z = model.gtab.gradients[dwi_mask].T - r, theta, phi = shm.cart2sphere(x, y, z) - B, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi) - H = shm.hat(B) - R = shm.lcr_matrix(H) - - # create floating point copy of data - dataf = np.asarray(data, dtype=np.float64) - - gpu_tracker = cuslines.GPUTracker(model_type, - args.max_angle * np.pi/180, - args.min_signal, - args.fa_threshold, - args.step_size, - 
args.relative_peak_threshold, - args.min_separation_angle * np.pi/180, - dataf.astype(np.float64), H.astype(np.float64), R.astype(np.float64), delta_b.astype(np.float64), delta_q.astype(np.float64), - b0s_mask.astype(np.int32), metric_map.astype(np.float64), sampling_matrix.astype(np.float64), - sphere.vertices.astype(np.float64), sphere.edges.astype(np.int32), - ngpus=args.ngpus, rng_seed=0) - -print('streamline gen') -nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size - -t1 = time.time() -streamline_time = 0 -io_time = 0 - -if args.output_prefix and write_method == "trx": - # Will resize by a factor of 2 if these are exceeded - sl_len_guess = 100 - sl_per_seed_guess = 3 - n_sls_guess = sl_per_seed_guess*len(seed_mask) - - # trx files use memory mapping - trx_file = TrxFile( - reference=hardi_nifti_fname, - nb_streamlines=n_sls_guess, - nb_vertices=n_sls_guess*sl_len_guess) - offsets_idx = 0 - sls_data_idx = 0 - -for idx in range(int(nchunks)): - # Main streamline computation - ts = time.time() - if args.device == "cpu": - streamline_generator = LocalTracking(dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=args.step_size) - streamlines = [s for s in streamline_generator] - else: - streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size]) - te = time.time() - streamline_time += (te-ts) - print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines), - seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0], - te-ts)) + dg = dg.from_data(data, model, max_angle=args.max_angle, sphere=sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle) - # Save tracklines file - if args.output_prefix: ts = time.time() - if write_method == "standard": - fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks) - sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX) - save_tractogram(sft, fname) - te = time.time() - print("Saved streamlines to {}, time {} s".format(fname, te-ts)) - elif write_method == "trx": - tractogram = nib.streamlines.Tractogram(streamlines, affine_to_rasmm=img.affine) - tractogram.to_world() - sls = tractogram.streamlines - - new_offsets_idx = offsets_idx + len(sls._offsets) - new_sls_data_idx = sls_data_idx + len(sls._data) - - if new_offsets_idx > trx_file.header["NB_STREAMLINES"]\ - or new_sls_data_idx > trx_file.header["NB_VERTICES"]: - print("TRX resizing...") - trx_file.resize(nb_streamlines=new_offsets_idx*2, nb_vertices=new_sls_data_idx*2) - - # TRX uses memmaps here - trx_file.streamlines._data[sls_data_idx:new_sls_data_idx] = sls._data - trx_file.streamlines._offsets[offsets_idx:new_offsets_idx] = offsets_idx + sls._offsets - trx_file.streamlines._lengths[offsets_idx:new_offsets_idx] = sls._lengths - - offsets_idx = new_offsets_idx - sls_data_idx = new_sls_data_idx - - te = time.time() - print("Streamlines to TRX format, time {} s".format(te-ts)) - else: - fname = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks) - gpu_tracker.dump_streamlines(fname, voxel_order, wm.shape, wm.header.get_zooms(), img.affine) - te = time.time() - print("Saved streamlines to {}, time {} s".format(fname, te-ts)) - - io_time += (te-ts) - -if args.output_prefix and write_method == "trx": - ts = time.time() - fname = "{}.trx".format(args.output_prefix) - trx_file.resize() - zip_from_folder( - 
trx_file._uncompressed_folder_handle.name, - fname, - zipfile.ZIP_STORED) - trx_file.close() - te = time.time() - print("Saved streamlines to {}, time {} s".format(fname, te-ts)) - io_time += (te-ts) - -t2 = time.time() + streamline_generator = LocalTracking(dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size) + sft = StatefulTractogram(streamline_generator, img, Space.VOX) + te = time.time() +else: + with GPUTracker( + dg, + data, + FA, + args.fa_threshold, + sphere.vertices, + sphere.edges, + max_angle=args.max_angle * np.pi/180, + step_size=args.step_size, + relative_peak_thresh=args.relative_peak_threshold, + min_separation_angle=args.min_separation_angle * np.pi/180, + ngpus=args.ngpus, + rng_seed=0, + chunk_size=args.chunk_size + ) as gpu_tracker: + ts = time.time() + if args.output_prefix and write_method == "trx": + trx_file = gpu_tracker.generate_trx(seed_mask, img) + else: + sft = gpu_tracker.generate_sft(seed_mask, img) + te = time.time() +print("Generated {} streamlines from {} seeds, time: {} s".format(len(sft.streamlines), + seed_mask.shape[0], + te-ts)) -print("Completed processing {} seeds.".format(seed_mask.shape[0])) -print("Initialization time: {} sec".format(t1-t0)) -print("Streamline generation total time: {} sec".format(t2-t1)) -print("\tStreamline processing: {} sec".format(streamline_time)) if args.output_prefix: - print("\tFile writing: {} sec".format(io_time)) + if write_method == "trx": + fname = "{}.trx".format(args.output_prefix) + save_trx(trx_file, fname) + else: + fname = "{}.trk".format(args.output_prefix) + save_tractogram(sft, fname) diff --git a/setup.py b/setup.py index cd53ade..a392cc6 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ from setuptools import setup from setuptools.command.build_py import build_py from pathlib import Path -import subprocess import re From 10f2fa3426a464c5be8e6d8598ee72f75f7598fa Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 16:54:47 -0800 Subject: [PATCH 23/31] remove todo comment --- run_gpu_streamlines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index 89bbd04..06c61c6 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -27,7 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
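Condensed, the refactored driver above reduces to the following flow (a sketch assuming the cuslines API introduced in this patch; the FA threshold, SH order, smoothing, and peak-separation values are illustrative, not prescribed):

    import numpy as np
    from dipy.io.streamline import save_tractogram
    from cuslines import BootDirectionGetter, GPUTracker

    # Build a GPU direction getter directly from the gradient table and sphere.
    dg = BootDirectionGetter.from_dipy_opdt(
        gtab, sphere, sh_order_max=8, sh_lambda=0.006, min_signal=1.0)

    # The tracker owns the device allocations; the context manager releases them.
    with GPUTracker(
            dg, data, FA, 0.1,                      # data, stopping map, FA threshold
            sphere.vertices, sphere.edges,
            max_angle=60 * np.pi / 180,
            step_size=0.5,
            relative_peak_thresh=0.25,
            min_separation_angle=45 * np.pi / 180,
            ngpus=1, rng_seed=0,
            chunk_size=100000) as tracker:
        sft = tracker.generate_sft(seed_mask, img)  # seeds are chunked internally
    save_tractogram(sft, "output.trk")

Compared with the old flow, seed chunking, per-chunk timing, and TRX memory mapping all move behind generate_sft/generate_trx, so the driver no longer loops over seed chunks or resizes TRX buffers itself.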
-import argparse # TODO: get this running, commit it, then run clode cleaner +import argparse import random import time From a751e8dfc563b3bba3291765cab8fa09a3f81356 Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 17:45:45 -0800 Subject: [PATCH 24/31] trying to fix boot --- cuslines/cuda_python/cu_direction_getters.py | 36 +++++++++----------- cuslines/cuda_python/cutils.py | 4 +-- cuslines/generate_streamlines_cuda.cu | 4 +-- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index 9901fc3..5e03600 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -1,7 +1,7 @@ import numpy as np from abc import ABC, abstractmethod -import logging import ctypes +import logging from importlib.resources import files from time import time @@ -18,12 +18,11 @@ REAL_DTYPE, REAL_DTYPE_AS_STR, REAL3_DTYPE_AS_STR, - REAL_DTYPE_AS_CTYPE, checkCudaErrors, ModelType, THR_X_SL, BLOCK_Y, - DEV_PTR, + REAL_DTYPE_AS_CTYPE, ) logger = logging.getLogger("GPUStreamlines") @@ -95,13 +94,12 @@ class _BootCtx(ctypes.Structure): _fields_ = [ ("min_signal", REAL_DTYPE_AS_CTYPE), ("delta_nr", ctypes.c_int32), - ("H", ctypes.c_void_p), - ("R", ctypes.c_void_p), - ("delta_b", ctypes.c_void_p), - ("delta_q", ctypes.c_void_p), - ("sampling_matrix", ctypes.c_void_p), - ("b0s_mask", ctypes.c_void_p), - ] + ("H", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + ("R", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + ("delta_b", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + ("delta_q", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + ("sampling_matrix", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + ("b0s_mask", ctypes.POINTER(ctypes.c_int32))] class BootDirectionGetter(GPUDirectionGetter): @@ -239,12 +237,13 @@ def allocate_on_gpu(self, n): ctypes.sizeof(_BootCtx)))) self.ctx_h.append(_BootCtx( min_signal=self.min_signal, - H=self.H_d[n], - R=self.R_d[n], - delta_b=self.delta_b_d[n], - delta_q=self.delta_q_d[n], - sampling_matrix=self.sampling_matrix_d[n], - b0s_mask=self.b0s_mask_d[n], + delta_nr=self.delta_nr, + H=ctypes.cast(self.H_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + R=ctypes.cast(self.R_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + delta_b=ctypes.cast(self.delta_b_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + delta_q=ctypes.cast(self.delta_q_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + sampling_matrix=ctypes.cast(self.sampling_matrix_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + b0s_mask=ctypes.cast(self.b0s_mask_d[n], ctypes.POINTER(ctypes.c_int32)) )) checkCudaErrors(runtime.cudaMemcpy( @@ -279,10 +278,9 @@ def allocate_on_gpu(self, n): cudaMemcpyKind.cudaMemcpyHostToDevice)) checkCudaErrors(runtime.cudaMemcpy( self.ctx_d[n], - ctypes.byref(self.ctx_h[n]), + ctypes.addressof(self.ctx_h[n]), ctypes.sizeof(_BootCtx), - cudaMemcpyKind.cudaMemcpyHostToDevice - )) + cudaMemcpyKind.cudaMemcpyHostToDevice)) def deallocate_on_gpu(self, n): if self.H_d[n]: diff --git a/cuslines/cuda_python/cutils.py b/cuslines/cuda_python/cutils.py index 9cf164e..2fd688e 100644 --- a/cuslines/cuda_python/cutils.py +++ b/cuslines/cuda_python/cutils.py @@ -19,7 +19,7 @@ class ModelType(IntEnum): REAL_DTYPE = np.float32 REAL3_DTYPE = np.dtype([('x', np.float32), ('y', np.float32), - ('z', np.float32)]) + ('z', np.float32)], align=True) REAL_DTYPE_AS_STR = "float" REAL3_DTYPE_AS_STR = "float3" REAL_DTYPE_AS_CTYPE = ctypes.c_float @@ -27,7 +27,7 @@ class ModelType(IntEnum): REAL_DTYPE = np.float64 REAL3_DTYPE = 
np.dtype([('x', np.float64), ('y', np.float64), - ('z', np.float64)]) + ('z', np.float64)], align=True) REAL_DTYPE_AS_STR = "double" REAL3_DTYPE_AS_STR = "double3" REAL_DTYPE_AS_CTYPE = ctypes.c_double diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu index 9b04e60..b9a84c2 100644 --- a/cuslines/generate_streamlines_cuda.cu +++ b/cuslines/generate_streamlines_cuda.cu @@ -1208,7 +1208,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, const REAL_T *__restrict__ dataf, const REAL_T *__restrict__ metric_map, const typename ModelCtx::type* __restrict__ ctx, - const int samplm_nr, + const int samplm_nr, const REAL3_T *__restrict__ sphere_vertices, const int2 *__restrict__ sphere_edges, const int num_edges, @@ -1589,7 +1589,7 @@ __global__ void genStreamlinesMerge_k( const REAL_T *__restrict__ dataf, const REAL_T *__restrict__ metric_map, const typename ModelCtx::type* __restrict__ ctx, - const int samplm_nr, + const int samplm_nr, const REAL3_T *__restrict__ sphere_vertices, const int2 *__restrict__ sphere_edges, const int num_edges, From 785e4219d534bc4036452230a958267514ad9d85 Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 17:46:45 -0800 Subject: [PATCH 25/31] move the cuda c stuff into their own folder --- cuslines/{ => cuda_c}/cudamacro.h | 0 cuslines/{ => cuda_c}/cuwsort.cuh | 0 cuslines/{ => cuda_c}/disc.h | 0 cuslines/{ => cuda_c}/generate_streamlines_cuda.cu | 0 cuslines/{ => cuda_c}/globals.h | 0 cuslines/{ => cuda_c}/ptt.cu | 0 cuslines/{ => cuda_c}/ptt.cuh | 0 cuslines/{ => cuda_c}/utils.cu | 0 cuslines/cuda_python/cu_direction_getters.py | 2 +- 9 files changed, 1 insertion(+), 1 deletion(-) rename cuslines/{ => cuda_c}/cudamacro.h (100%) rename cuslines/{ => cuda_c}/cuwsort.cuh (100%) rename cuslines/{ => cuda_c}/disc.h (100%) rename cuslines/{ => cuda_c}/generate_streamlines_cuda.cu (100%) rename cuslines/{ => cuda_c}/globals.h (100%) rename cuslines/{ => cuda_c}/ptt.cu (100%) rename cuslines/{ => cuda_c}/ptt.cuh (100%) rename cuslines/{ => cuda_c}/utils.cu (100%) diff --git a/cuslines/cudamacro.h b/cuslines/cuda_c/cudamacro.h similarity index 100% rename from cuslines/cudamacro.h rename to cuslines/cuda_c/cudamacro.h diff --git a/cuslines/cuwsort.cuh b/cuslines/cuda_c/cuwsort.cuh similarity index 100% rename from cuslines/cuwsort.cuh rename to cuslines/cuda_c/cuwsort.cuh diff --git a/cuslines/disc.h b/cuslines/cuda_c/disc.h similarity index 100% rename from cuslines/disc.h rename to cuslines/cuda_c/disc.h diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/cuda_c/generate_streamlines_cuda.cu similarity index 100% rename from cuslines/generate_streamlines_cuda.cu rename to cuslines/cuda_c/generate_streamlines_cuda.cu diff --git a/cuslines/globals.h b/cuslines/cuda_c/globals.h similarity index 100% rename from cuslines/globals.h rename to cuslines/cuda_c/globals.h diff --git a/cuslines/ptt.cu b/cuslines/cuda_c/ptt.cu similarity index 100% rename from cuslines/ptt.cu rename to cuslines/cuda_c/ptt.cu diff --git a/cuslines/ptt.cuh b/cuslines/cuda_c/ptt.cuh similarity index 100% rename from cuslines/ptt.cuh rename to cuslines/cuda_c/ptt.cuh diff --git a/cuslines/utils.cu b/cuslines/cuda_c/utils.cu similarity index 100% rename from cuslines/utils.cu rename to cuslines/cuda_c/utils.cu diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index 5e03600..d1b9e28 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ 
b/cuslines/cuda_python/cu_direction_getters.py @@ -78,7 +78,7 @@ def compile_program(self, debug: bool = False): # I think this is reasonable dev = Device() dev.set_current() - cuda_path = cuslines_cuda.joinpath("generate_streamlines_cuda.cu") + cuda_path = cuslines_cuda.joinpath("cuda_c/generate_streamlines_cuda.cu") with open(cuda_path, "r") as f: prog = Program(f.read(), code_type="c++", options=program_options) self.module = prog.compile( From 136e9aaa0c5a918d7aa30ec7eec89526e2bfb575 Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 7 Jan 2026 12:53:56 -0800 Subject: [PATCH 26/31] Fix boot, refactor boot, other bfs --- README.md | 2 +- cuslines/cuda_c/boot.cu | 1066 +++++++++++++++++ cuslines/cuda_c/generate_streamlines_cuda.cu | 1074 +----------------- cuslines/cuda_c/globals.h | 29 +- cuslines/cuda_c/tracking_helpers.cu | 290 +++++ cuslines/cuda_c/utils.cu | 138 +-- cuslines/cuda_python/cu_direction_getters.py | 66 +- cuslines/cuda_python/cu_tractography.py | 3 +- cuslines/cuda_python/cutils.py | 3 - run_gpu_streamlines.py | 12 +- 10 files changed, 1449 insertions(+), 1234 deletions(-) create mode 100644 cuslines/cuda_c/boot.cu create mode 100644 cuslines/cuda_c/tracking_helpers.cu diff --git a/README.md b/README.md index a1e98fa..9e0275d 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ Destroy GPUTracker... Note that if you experience memory errors, you can adjust the `--chunk-size` flag. -To run on more seeds, we suggest enabling the `--trx` flag in the GPU script to not get bottlenecked by writing files. +To run on more seeds, we suggest setting the `--write-method trx` flag in the GPU script to not get bottlenecked by writing files. ## Running on AWS with Docker First, set up an AWS instance with GPU and ssh into it (we recommend a P3 instance with at least 1 V100 16 GB GPU and a Deep Learning AMI Ubuntu 18.04 v 33.0.). 
Then do the following: diff --git a/cuslines/cuda_c/boot.cu b/cuslines/cuda_c/boot.cu new file mode 100644 index 0000000..133c43d --- /dev/null +++ b/cuslines/cuda_c/boot.cu @@ -0,0 +1,1066 @@ +//#define USE_FIXED_PERMUTATION +#ifdef USE_FIXED_PERMUTATION +//__device__ const int fixedPerm[] = {44, 47, 53, 0, 3, 3, 39, 9, 19, 21, 50, 36, 23, +// 6, 24, 24, 12, 1, 38, 39, 23, 46, 24, 17, 37, 25, +// 13, 8, 9, 20, 51, 16, 51, 5, 15, 47, 0, 18, 35, +// 24, 49, 51, 29, 19, 19, 14, 39, 32, 1, 9, 32, 31, +// 10, 52, 23}; +__device__ const int fixedPerm[] = { + 47, 117, 67, 103, 9, 21, 36, 87, 70, 88, 140, 58, 39, 87, 88, 81, 25, 77, + 72, 9, 148, 115, 79, 82, 99, 29, 147, 147, 142, 32, 9, 127, 32, 31, 114, 28, + 34, 128, 128, 53, 133, 38, 17, 79, 132, 105, 42, 31, 120, 1, 65, 57, 35, 102, + 119, 11, 82, 91, 128, 142, 99, 53, 140, 121, 84, 68, 6, 47, 127, 131, 100, 78, + 143, 148, 23, 141, 117, 85, 48, 49, 69, 95, 94, 0, 113, 36, 48, 93, 131, 98, + 42, 112, 149, 127, 0, 138, 114, 43, 127, 23, 130, 121, 98, 62, 123, 82, 148, 50, + 14, 41, 58, 36, 10, 86, 43, 104, 11, 2, 51, 80, 32, 128, 38, 19, 42, 115, + 77, 30, 24, 125, 2, 3, 94, 107, 13, 112, 40, 72, 19, 95, 72, 67, 61, 14, + 96, 4, 139, 86, 121, 109}; +#endif + +template +__device__ VAL_T avgMask(const int mskLen, + const int *__restrict__ mask, + const VAL_T *__restrict__ data) { + + const int tidx = threadIdx.x; + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + int __myCnt = 0; + VAL_T __mySum = 0; + + for(int i = tidx; i < mskLen; i += BDIM_X) { + if(mask[i]) { + __myCnt++; + __mySum += data[i]; + } + } + + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + __mySum += __shfl_xor_sync(WMASK, __mySum, i, BDIM_X); + __myCnt += __shfl_xor_sync(WMASK, __myCnt, i, BDIM_X); + } + + return __mySum/__myCnt; + +} + +template< + int BDIM_X, + typename LEN_T, + typename MSK_T, + typename VAL_T> +__device__ LEN_T maskGet(const LEN_T n, + const MSK_T *__restrict__ mask, + const VAL_T *__restrict__ plain, + VAL_T *__restrict__ masked) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const int __laneMask = (1 << tidx)-1; + + int woff = 0; + for(int j = 0; j < n; j += BDIM_X) { + + const int __act = (j+tidx < n) ? !mask[j+tidx] : 0; + const int __msk = __ballot_sync(WMASK, __act); + + const int toff = __popc(__msk & __laneMask); + if (__act) { + masked[woff+toff] = plain[j+tidx]; + } + woff += __popc(__msk); + } + return woff; +} + +template< + int BDIM_X, + typename LEN_T, + typename MSK_T, + typename VAL_T> +__device__ void maskPut(const LEN_T n, + const MSK_T *__restrict__ mask, + const VAL_T *__restrict__ masked, + VAL_T *__restrict__ plain) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const int __laneMask = (1 << tidx)-1; + + int woff = 0; + for(int j = 0; j < n; j += BDIM_X) { + + const int __act = (j+tidx < n) ? 
!mask[j+tidx] : 0; + const int __msk = __ballot_sync(WMASK, __act); + + const int toff = __popc(__msk & __laneMask); + if (__act) { + plain[j+tidx] = masked[woff+toff]; + } + woff += __popc(__msk); + } + return; +} + +template +__device__ int closest_peak_d(const REAL_T max_angle, + const REAL3_T direction, //dir + const int npeaks, + const REAL3_T *__restrict__ peaks, + REAL3_T *__restrict__ peak) {// dirs, + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + //const REAL_T cos_similarity = COS(MAX_ANGLE_P); + const REAL_T cos_similarity = COS(max_angle); +#if 0 + if (!threadIdx.y && !tidx) { + printf("direction: (%f, %f, %f)\n", + direction.x, direction.y, direction.z); + } + __syncwarp(WMASK); +#endif + REAL_T cpeak_dot = 0; + int cpeak_idx = -1; + for(int j = 0; j < npeaks; j += BDIM_X) { + if (j+tidx < npeaks) { +#if 0 + if (!threadIdx.y && !tidx) { + printf("j+tidx: %d, peaks[j+tidx]: (%f, %f, %f)\n", + j+tidx, peaks[j+tidx].x, peaks[j+tidx].y, peaks[j+tidx].z); + } +#endif + const REAL_T dot = direction.x*peaks[j+tidx].x+ + direction.y*peaks[j+tidx].y+ + direction.z*peaks[j+tidx].z; + + if (FABS(dot) > FABS(cpeak_dot)) { + cpeak_dot = dot; + cpeak_idx = j+tidx; + } + } + } +#if 0 + if (!threadIdx.y && !tidx) { + printf("cpeak_idx: %d, cpeak_dot: %f\n", cpeak_idx, cpeak_dot); + } + __syncwarp(WMASK); +#endif + + #pragma unroll + for(int j = BDIM_X/2; j; j /= 2) { + + const REAL_T dot = __shfl_xor_sync(WMASK, cpeak_dot, j, BDIM_X); + const int idx = __shfl_xor_sync(WMASK, cpeak_idx, j, BDIM_X); + if (FABS(dot) > FABS(cpeak_dot)) { + cpeak_dot = dot; + cpeak_idx = idx; + } + } +#if 0 + if (!threadIdx.y && !tidx) { + printf("cpeak_idx: %d, cpeak_dot: %f, cos_similarity: %f\n", cpeak_idx, cpeak_dot, cos_similarity); + } + __syncwarp(WMASK); +#endif + if (cpeak_idx >= 0) { + if (cpeak_dot >= cos_similarity) { + peak[0] = peaks[cpeak_idx]; + return 1; + } + if (cpeak_dot <= -cos_similarity) { + peak[0] = MAKE_REAL3(-peaks[cpeak_idx].x, + -peaks[cpeak_idx].y, + -peaks[cpeak_idx].z); + return 1; + } + } + return 0; +} + +template +__device__ void ndotp_d(const int N, + const int M, + const VAL_T *__restrict__ srcV, + const VAL_T *__restrict__ srcM, + VAL_T *__restrict__ dstV) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + //#pragma unroll + for(int i = 0; i < N; i++) { + + VAL_T __tmp = 0; + + //#pragma unroll + for(int j = 0; j < M; j += BDIM_X) { + if (j+tidx < M) { + __tmp += srcV[j+tidx]*srcM[i*M + j+tidx]; + } + } + #pragma unroll + for(int j = BDIM_X/2; j; j /= 2) { +#if 0 + __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); +#else + __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); +#endif + } + // values could be held by BDIM_X threads and written + // together every BDIM_X iterations... 
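+        // At this point the shuffle reduction above has folded the
+        // BDIM_X partial sums, so lane 0 of each sub-warp holds the
+        // full dot product of srcV with row i of srcM; only that
+        // lane writes the result.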
+ + if (tidx == 0) { + dstV[i] = __tmp; + } + } + return; +} + + +template +__device__ void ndotp_log_opdt_d(const int N, + const int M, + const VAL_T *__restrict__ srcV, + const VAL_T *__restrict__ srcM, + VAL_T *__restrict__ dstV) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const VAL_T ONEP5 = static_cast(1.5); + + //#pragma unroll + for(int i = 0; i < N; i++) { + + VAL_T __tmp = 0; + + //#pragma unroll + for(int j = 0; j < M; j += BDIM_X) { + if (j+tidx < M) { + const VAL_T v = srcV[j+tidx]; + __tmp += -LOG(v)*(ONEP5+LOG(v))*v * srcM[i*M + j+tidx]; + } + } + #pragma unroll + for(int j = BDIM_X/2; j; j /= 2) { +#if 0 + __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); +#else + __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); +#endif + } + // values could be held by BDIM_X threads and written + // together every BDIM_X iterations... + + if (tidx == 0) { + dstV[i] = __tmp; + } + } + return; +} + +template +__device__ void ndotp_log_csa_d(const int N, + const int M, + const VAL_T *__restrict__ srcV, + const VAL_T *__restrict__ srcM, + VAL_T *__restrict__ dstV) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + // Clamp values + constexpr VAL_T min = .001; + constexpr VAL_T max = .999; + + //#pragma unroll + for(int i = 0; i < N; i++) { + + VAL_T __tmp = 0; + + //#pragma unroll + for(int j = 0; j < M; j += BDIM_X) { + if (j+tidx < M) { + const VAL_T v = MIN(MAX(srcV[j+tidx], min), max); + __tmp += LOG(-LOG(v)) * srcM[i*M + j+tidx]; + } + } + #pragma unroll + for(int j = BDIM_X/2; j; j /= 2) { +#if 0 + __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); +#else + __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); +#endif + } + // values could be held by BDIM_X threads and written + // together every BDIM_X iterations... 
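+        // Same reduction pattern as ndotp_d, except each sample v
+        // was first mapped to -log(v)*(1.5+log(v))*v, the OPDT
+        // transform applied to the normalized signal before it is
+        // projected onto the rows of the fit matrix.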
+ + if (tidx == 0) { + dstV[i] = __tmp; + } + } + return; +} + + +template +__device__ void fit_opdt(const int delta_nr, + const int hr_side, + const REAL_T *__restrict__ delta_q, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ __msk_data_sh, + REAL_T *__restrict__ __h_sh, + REAL_T *__restrict__ __r_sh) { + const int tidx = threadIdx.x; + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + ndotp_log_opdt_d(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh); + ndotp_d (delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh); + __syncwarp(WMASK); + #pragma unroll + for(int j = tidx; j < delta_nr; j += BDIM_X) { + __r_sh[j] -= __h_sh[j]; + } + __syncwarp(WMASK); +} + +template +__device__ void fit_csa(const int delta_nr, + const int hr_side, + const REAL_T *__restrict__ fit_matrix, + const REAL_T *__restrict__ __msk_data_sh, + REAL_T *__restrict__ __r_sh) { + const int tidx = threadIdx.x; + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + constexpr REAL _n0_const = 0.28209479177387814; // .5 / sqrt(pi) + ndotp_log_csa_d(delta_nr, hr_side, __msk_data_sh, fit_matrix, __r_sh); + __syncwarp(WMASK); + if (tidx == 0) { + __r_sh[0] = _n0_const; + } + __syncwarp(WMASK); +} + +template +__device__ void fit_model_coef(const int delta_nr, // delta_nr is number of ODF directions + const int hr_side, // hr_side is number of data directions + const REAL_T *__restrict__ delta_q, + const REAL_T *__restrict__ delta_b, // these are fit matrices the model can use, different for each model + const REAL_T *__restrict__ __msk_data_sh, // __msk_data_sh is the part of the data currently being operated on by this block + REAL_T *__restrict__ __h_sh, // these last two are modifications to the coefficients that will be returned + REAL_T *__restrict__ __r_sh) { + switch(MODEL_T) { + case OPDT: + fit_opdt(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh); + break; + case CSA: + fit_csa(delta_nr, hr_side, delta_q, __msk_data_sh, __r_sh); + break; + default: + printf("FATAL: Invalid Model Type.\n"); + break; + } +} + +template +__device__ int get_direction_boot_d( + curandStatePhilox4_32_10_t *st, + const REAL_T max_angle, + const REAL_T min_signal, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + REAL3_T dir, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask) + // but not clear if it will never be needed so + // we'll keep it here for now... 
+ const REAL3_T point, + const REAL_T *__restrict__ H, + const REAL_T *__restrict__ R, + // model unused + // max_angle, pmf_threshold from global defines + // b0s_mask already passed + // min_signal from global defines + const int delta_nr, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ delta_q, // fit_matrix + const int samplm_nr, + const REAL_T *__restrict__ sampling_matrix, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + REAL3_T *__restrict__ dirs) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const int n32dimt = ((dimt+31)/32)*32; + + extern REAL_T __shared__ __sh[]; + + REAL_T *__vox_data_sh = reinterpret_cast(__sh); + REAL_T *__msk_data_sh = __vox_data_sh + BDIM_Y*n32dimt; + + REAL_T *__r_sh = __msk_data_sh + BDIM_Y*n32dimt; + REAL_T *__h_sh = __r_sh + BDIM_Y*MAX(n32dimt, samplm_nr); + + __vox_data_sh += tidy*n32dimt; + __msk_data_sh += tidy*n32dimt; + + __r_sh += tidy*MAX(n32dimt, samplm_nr); + __h_sh += tidy*MAX(n32dimt, samplm_nr); + + // compute hr_side (may be passed from python) + int hr_side = 0; + for(int j = tidx; j < dimt; j += BDIM_X) { + hr_side += !b0s_mask[j] ? 1 : 0; + } + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + hr_side += __shfl_xor_sync(WMASK, hr_side, i, BDIM_X); + } + + #pragma unroll + for(int i = 0; i < NATTEMPTS; i++) { + + const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, -1, dataf, point, __vox_data_sh); + + const int nmsk = maskGet(dimt, b0s_mask, __vox_data_sh, __msk_data_sh); + + //if (!tidx && !threadIdx.y && !blockIdx.x) { + // + // printf("interp of %f, %f, %f\n", point.x, point.y, point.z); + // printf("hr_side: %d\n", hr_side); + // printArray("vox_data", 6, dimt, __vox_data_sh[tidy]); + // printArray("msk_data", 6, nmsk, __msk_data_sh[tidy]); + //} + //break; + + __syncwarp(WMASK); + + if (rv == 0) { + + ndotp_d(hr_side, hr_side, __msk_data_sh, R, __r_sh); + //__syncwarp(); + //printArray("__r", 5, hr_side*hr_side, R); + //printArray("__r_sh", 6, hr_side, __r_sh[tidy]); + + ndotp_d(hr_side, hr_side, __msk_data_sh, H, __h_sh); + //__syncwarp(); + //printArray("__h_sh", 6, hr_side, __h_sh[tidy]); + + __syncwarp(WMASK); + + for(int j = 0; j < hr_side; j += BDIM_X) { + if (j+tidx < hr_side) { +#ifdef USE_FIXED_PERMUTATION + const int srcPermInd = fixedPerm[j+tidx]; +#else + const int srcPermInd = curand(st) % hr_side; +// if (srcPermInd < 0 || srcPermInd >= hr_side) { +// printf("srcPermInd: %d\n", srcPermInd); +// } +#endif + __h_sh[j+tidx] += __r_sh[srcPermInd]; + //__h_sh[j+tidx] += __r_sh[j+tidx]; + } + } + __syncwarp(WMASK); + + //printArray("h+perm(r):", 6, hr_side, __h_sh[tidy]); + //__syncwarp(); + + // vox_data[dwi_mask] = masked_data + maskPut(dimt, b0s_mask, __h_sh, __vox_data_sh); + __syncwarp(WMASK); + + //printArray("vox_data[dwi_mask]:", 6, dimt, __vox_data_sh[tidy]); + //__syncwarp(); + + for(int j = tidx; j < dimt; j += BDIM_X) { + //__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]); + __vox_data_sh[j] = MAX(min_signal, __vox_data_sh[j]); + } + __syncwarp(WMASK); + + const REAL_T denom = avgMask(dimt, b0s_mask, __vox_data_sh); + + for(int j = tidx; j < dimt; j += BDIM_X) { + __vox_data_sh[j] /= denom; + } + __syncwarp(); + + //if (!tidx && !threadIdx.y && !blockIdx.x) { + // printf("denom: %f\n", denom); + //} + ////break; + //if (!tidx && !threadIdx.y && !blockIdx.x) 
{ + // + // printf("__vox_data_sh:\n"); + // printArray("vox_data", 6, dimt, __vox_data_sh[tidy]); + //} + //break; + + maskGet(dimt, b0s_mask, __vox_data_sh, __msk_data_sh); + __syncwarp(WMASK); + + fit_model_coef(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh); + + // __r_sh[tidy] <- python 'coef' + + ndotp_d(samplm_nr, delta_nr, __r_sh, sampling_matrix, __h_sh); + + // __h_sh[tidy] <- python 'pmf' + } else { + #pragma unroll + for(int j = tidx; j < samplm_nr; j += BDIM_X) { + __h_sh[j] = 0; + } + // __h_sh[tidy] <- python 'pmf' + } + __syncwarp(WMASK); +#if 0 + if (!threadIdx.y && threadIdx.x == 0) { + for(int j = 0; j < samplm_nr; j++) { + printf("pmf[%d]: %f\n", j, __h_sh[tidy][j]); + } + } + //return; +#endif + const REAL_T abs_pmf_thr = PMF_THRESHOLD_P*max_d(samplm_nr, __h_sh, REAL_MIN); + __syncwarp(WMASK); + + #pragma unroll + for(int j = tidx; j < samplm_nr; j += BDIM_X) { + const REAL_T __v = __h_sh[j]; + if (__v < abs_pmf_thr) { + __h_sh[j] = 0; + } + } + __syncwarp(WMASK); +#if 0 + if (!threadIdx.y && threadIdx.x == 0) { + printf("abs_pmf_thr: %f\n", abs_pmf_thr); + for(int j = 0; j < samplm_nr; j++) { + printf("pmfNORM[%d]: %f\n", j, __h_sh[tidy][j]); + } + } + //return; +#endif +#if 0 + if init: + directions = peak_directions(pmf, sphere)[0] + return directions + else: + peaks = peak_directions(pmf, sphere)[0] + if (len(peaks) > 0): + return closest_peak(directions, peaks, cos_similarity) +#endif + const int ndir = peak_directions_d(__h_sh, dirs, + sphere_vertices, + sphere_edges, + num_edges, + samplm_nr, + reinterpret_cast(__r_sh), // reuse __r_sh as shInd in func which is large enough + relative_peak_thres, + min_separation_angle); + if (NATTEMPTS == 1) { // init=True... + return ndir; // and dirs; + } else { // init=False... 
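+            // Tracking mode: of the ndir candidate peaks, keep the
+            // one closest to the incoming direction, accepted only
+            // when |dot| >= cos(max_angle) (see closest_peak_d
+            // above); if no peak qualifies, fall through and draw a
+            // fresh bootstrap sample, up to NATTEMPTS times in total.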
+ if (ndir > 0) { + /* + if (!threadIdx.y && threadIdx.x == 0 && ndir > 1) { + printf("NATTEMPTS=5 and ndir: %d!!!\n", ndir); + } + */ + REAL3_T peak; + const int foundPeak = closest_peak_d(max_angle, dir, ndir, dirs, &peak); + __syncwarp(WMASK); + if (foundPeak) { + if (tidx == 0) { + dirs[0] = peak; + } + return 1; + } + } + } + } + return 0; +} + +template +__global__ void getNumStreamlinesBoot_k( + const ModelType model_type, + const REAL_T max_angle, + const REAL_T min_signal, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + const long long rndSeed, + const int nseed, + const REAL3_T *__restrict__ seeds, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const REAL_T *__restrict__ H, + const REAL_T *__restrict__ R, + const int delta_nr, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ delta_q, + const int *__restrict__ b0s_mask, // change to int + const int samplm_nr, + const REAL_T *__restrict__ sampling_matrix, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + REAL3_T *__restrict__ shDir0, + int *slineOutOff) { + + const int tidx = threadIdx.x; + const int slid = blockIdx.x*blockDim.y + threadIdx.y; + const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; + + if (slid >= nseed) { + return; + } + + REAL3_T seed = seeds[slid]; + // seed = lin_mat*seed + offset + + REAL3_T *__restrict__ __shDir = shDir0+slid*samplm_nr; + + // const int hr_side = dimt-1; + + curandStatePhilox4_32_10_t st; + //curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X) + curand_init(rndSeed, gid, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X) + // elements of the same sequence + // python: + //directions = get_direction(None, dataf, dwi_mask, sphere, s, H, R, model, max_angle, + // pmf_threshold, b0s_mask, min_signal, fit_matrix, + // sampling_matrix, init=True) + + //if (!tidx && !threadIdx.y && !blockIdx.x) { + // printf("seed: %f, %f, %f\n", seed.x, seed.y, seed.z); + //} + + int ndir; + switch(model_type) { + case OPDT: + ndir = get_direction_boot_d( + &st, + max_angle, + min_signal, + relative_peak_thres, + min_separation_angle, + MAKE_REAL3(0,0,0), + dimx, dimy, dimz, dimt, dataf, + b0s_mask /* !dwi_mask */, + seed, + H, R, + // model unused + // max_angle, pmf_threshold from global defines + // b0s_mask already passed + // min_signal from global defines + delta_nr, + delta_b, delta_q, // fit_matrix + samplm_nr, + sampling_matrix, + sphere_vertices, + sphere_edges, + num_edges, + __shDir); + break; + case CSA: + ndir = get_direction_boot_d( + &st, + max_angle, + min_signal, + relative_peak_thres, + min_separation_angle, + MAKE_REAL3(0,0,0), + dimx, dimy, dimz, dimt, dataf, + b0s_mask /* !dwi_mask */, + seed, + H, R, + // model unused + // max_angle, pmf_threshold from global defines + // b0s_mask already passed + // min_signal from global defines + delta_nr, + delta_b, delta_q, // fit_matrix + samplm_nr, + sampling_matrix, + sphere_vertices, + sphere_edges, + num_edges, + __shDir); + break; + default: + printf("FATAL: Invalid Model Type.\n"); + break; + } + + if (tidx == 0) { + slineOutOff[slid] = ndir; + } + + return; +} + +template +__device__ int tracker_boot_d( + curandStatePhilox4_32_10_t *st, + const REAL_T max_angle, + const REAL_T tc_threshold, + const REAL_T step_size, + const REAL_T relative_peak_thres, + const REAL_T 
min_separation_angle, + REAL3_T seed, + REAL3_T first_step, + REAL3_T voxel_size, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const REAL_T *__restrict__ metric_map, + const int samplm_nr, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + /*BOOT specific params*/ + const REAL_T min_signal, + const int delta_nr, + const REAL_T *__restrict__ H, + const REAL_T *__restrict__ R, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ delta_q, + const REAL_T *__restrict__ sampling_matrix, + const int *__restrict__ b0s_mask, + /*BOOT specific params*/ + int *__restrict__ nsteps, + REAL3_T *__restrict__ streamline) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + int tissue_class = TRACKPOINT; + + REAL3_T point = seed; + REAL3_T direction = first_step; + __shared__ REAL3_T __sh_new_dir[BDIM_Y]; + + if (tidx == 0) { + streamline[0] = point; +#if 0 + if (threadIdx.y == 1) { + printf("streamline[0]: %f, %f, %f\n", point.x, point.y, point.z); + } +#endif + } + __syncwarp(WMASK); + + int step_frac = 1; + + int i; + for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) { + int ndir = get_direction_boot_d( + st, + max_angle, + min_signal, + relative_peak_thres, + min_separation_angle, + direction, + dimx, dimy, dimz, dimt, dataf, + b0s_mask /* !dwi_mask */, + point, + H, R, + delta_nr, + delta_b, delta_q, // fit_matrix + samplm_nr, + sampling_matrix, + sphere_vertices, + sphere_edges, + num_edges, + __sh_new_dir + tidy); + __syncwarp(WMASK); + direction = __sh_new_dir[tidy]; + __syncwarp(WMASK); + + if (ndir == 0) { + break; + } + + point.x += (direction.x / voxel_size.x) * (step_size / step_frac); + point.y += (direction.y / voxel_size.y) * (step_size / step_frac); + point.z += (direction.z / voxel_size.z) * (step_size / step_frac); + + if ((tidx == 0) && ((i % step_frac) == 0)){ + streamline[i/step_frac] = point; + } + __syncwarp(WMASK); + + tissue_class = check_point_d(tc_threshold, point, dimx, dimy, dimz, metric_map); + + if (tissue_class == ENDPOINT || + tissue_class == INVALIDPOINT || + tissue_class == OUTSIDEIMAGE) { + break; + } + } + nsteps[0] = i/step_frac; + if (((i % step_frac) != 0) && i < step_frac*(MAX_SLINE_LEN - 1)){ + nsteps[0]++; + if (tidx == 0) { + streamline[nsteps[0]] = point; + } + } + + return tissue_class; +} + +template +__global__ void genStreamlinesMergeBoot_k( + const REAL_T max_angle, + const REAL_T tc_threshold, + const REAL_T step_size, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + const long long rndSeed, + const int rndOffset, + const int nseed, + const REAL3_T *__restrict__ seeds, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const REAL_T *__restrict__ metric_map, + const int samplm_nr, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + /*BOOT specific params*/ + const REAL_T min_signal, + const int delta_nr, + const REAL_T *__restrict__ H, + const REAL_T *__restrict__ R, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ delta_q, + const REAL_T *__restrict__ sampling_matrix, + const int *__restrict__ b0s_mask, + /*BOOT specific params*/ + const int *__restrict__ slineOutOff, + REAL3_T *__restrict__ shDir0, + int 
*__restrict__ slineSeed, + int *__restrict__ slineLen, + REAL3_T *__restrict__ sline) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int slid = blockIdx.x*blockDim.y + threadIdx.y; + + const int lid = (tidy*BDIM_X + tidx) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + curandStatePhilox4_32_10_t st; + // const int gbid = blockIdx.y*gridDim.x + blockIdx.x; + const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; + //curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X) + curand_init(rndSeed, gid+1, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X) + // elements of the same sequence + if (slid >= nseed) { + return; + } + + REAL3_T seed = seeds[slid]; + + int ndir = slineOutOff[slid+1]-slineOutOff[slid]; +#if 0 + if (threadIdx.y == 0 && threadIdx.x == 0) { + printf("%s: ndir: %d\n", __func__, ndir); + for(int i = 0; i < ndir; i++) { + printf("__shDir[%d][%d]: (%f, %f, %f)\n", + tidy, i, __shDir[tidy][i].x, __shDir[tidy][i].y, __shDir[tidy][i].z); + } + } +#endif + __syncwarp(WMASK); + + int slineOff = slineOutOff[slid]; + + for(int i = 0; i < ndir; i++) { + REAL3_T first_step = shDir0[slid*samplm_nr + i]; + + REAL3_T *__restrict__ currSline = sline + slineOff*MAX_SLINE_LEN*2; + + if (tidx == 0) { + slineSeed[slineOff] = slid; + } +#if 0 + if (threadIdx.y == 0 && threadIdx.x == 0) { + printf("calling trackerF from: (%f, %f, %f)\n", first_step.x, first_step.y, first_step.z); + } +#endif + + int stepsB; + const int tissue_classB = tracker_boot_d( + &st, + max_angle, + tc_threshold, + step_size, + relative_peak_thres, + min_separation_angle, + seed, + MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z), + MAKE_REAL3(1, 1, 1), + dimx, dimy, dimz, dimt, dataf, + metric_map, + samplm_nr, + sphere_vertices, + sphere_edges, + num_edges, + min_signal, + delta_nr, + H, + R, + delta_b, + delta_q, + sampling_matrix, + b0s_mask, + &stepsB, + currSline); + + // reverse backward sline + for(int j = 0; j < stepsB/2; j += BDIM_X) { + if (j+tidx < stepsB/2) { + const REAL3_T __p = currSline[j+tidx]; + currSline[j+tidx] = currSline[stepsB-1 - (j+tidx)]; + currSline[stepsB-1 - (j+tidx)] = __p; + } + } + + int stepsF; + const int tissue_classF = tracker_boot_d( + &st, + max_angle, + tc_threshold, + step_size, + relative_peak_thres, + min_separation_angle, + seed, + first_step, + MAKE_REAL3(1, 1, 1), + dimx, dimy, dimz, dimt, dataf, + metric_map, + samplm_nr, + sphere_vertices, + sphere_edges, + num_edges, + min_signal, + delta_nr, + H, + R, + delta_b, + delta_q, + sampling_matrix, + b0s_mask, + &stepsF, + currSline + stepsB-1); + if (tidx == 0) { + slineLen[slineOff] = stepsB-1+stepsF; + } + + slineOff += 1; +#if 0 + if (threadIdx.y == 0 && threadIdx.x == 0) { + printf("%s: stepsF: %d, tissue_classF: %d\n", __func__, stepsF, tissue_classF); + } + __syncwarp(WMASK); +#endif + //if (/* !return_all || */0 && + // tissue_classF != ENDPOINT && + // tissue_classF != OUTSIDEIMAGE) { + // continue; + //} + //if (/* !return_all || */ 0 && + // tissue_classB != ENDPOINT && + // tissue_classB != OUTSIDEIMAGE) { + // continue; + //} + } + return; +} diff --git a/cuslines/cuda_c/generate_streamlines_cuda.cu b/cuslines/cuda_c/generate_streamlines_cuda.cu index b9a84c2..f5629e0 100644 --- a/cuslines/cuda_c/generate_streamlines_cuda.cu +++ b/cuslines/cuda_c/generate_streamlines_cuda.cu @@ -29,13 +29,13 @@ #include #include -#include "cudamacro.h" /* for 
time() */ #include "globals.h" - #include "cuwsort.cuh" #include "ptt.cuh" #include "utils.cu" +#include "tracking_helpers.cu" +#include "boot.cu" #include "ptt.cu" #define MAX_NUM_DIR (128) @@ -45,630 +45,6 @@ #define MAX_DIMS (8) #define MAX_STR_LEN (256) -using namespace cuwsort; - -//#define USE_FIXED_PERMUTATION -#ifdef USE_FIXED_PERMUTATION -//__device__ const int fixedPerm[] = {44, 47, 53, 0, 3, 3, 39, 9, 19, 21, 50, 36, 23, -// 6, 24, 24, 12, 1, 38, 39, 23, 46, 24, 17, 37, 25, -// 13, 8, 9, 20, 51, 16, 51, 5, 15, 47, 0, 18, 35, -// 24, 49, 51, 29, 19, 19, 14, 39, 32, 1, 9, 32, 31, -// 10, 52, 23}; -__device__ const int fixedPerm[] = { - 47, 117, 67, 103, 9, 21, 36, 87, 70, 88, 140, 58, 39, 87, 88, 81, 25, 77, - 72, 9, 148, 115, 79, 82, 99, 29, 147, 147, 142, 32, 9, 127, 32, 31, 114, 28, - 34, 128, 128, 53, 133, 38, 17, 79, 132, 105, 42, 31, 120, 1, 65, 57, 35, 102, - 119, 11, 82, 91, 128, 142, 99, 53, 140, 121, 84, 68, 6, 47, 127, 131, 100, 78, - 143, 148, 23, 141, 117, 85, 48, 49, 69, 95, 94, 0, 113, 36, 48, 93, 131, 98, - 42, 112, 149, 127, 0, 138, 114, 43, 127, 23, 130, 121, 98, 62, 123, 82, 148, 50, - 14, 41, 58, 36, 10, 86, 43, 104, 11, 2, 51, 80, 32, 128, 38, 19, 42, 115, - 77, 30, 24, 125, 2, 3, 94, 107, 13, 112, 40, 72, 19, 95, 72, 67, 61, 14, - 96, 4, 139, 86, 121, 109}; -#endif - -template -__device__ void ndotp_d(const int N, - const int M, - const VAL_T *__restrict__ srcV, - const VAL_T *__restrict__ srcM, - VAL_T *__restrict__ dstV) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - //#pragma unroll - for(int i = 0; i < N; i++) { - - VAL_T __tmp = 0; - - //#pragma unroll - for(int j = 0; j < M; j += BDIM_X) { - if (j+tidx < M) { - __tmp += srcV[j+tidx]*srcM[i*M + j+tidx]; - } - } - #pragma unroll - for(int j = BDIM_X/2; j; j /= 2) { -#if 0 - __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); -#else - __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); -#endif - } - // values could be held by BDIM_X threads and written - // together every BDIM_X iterations... - - if (tidx == 0) { - dstV[i] = __tmp; - } - } - return; -} - - -template -__device__ void ndotp_log_opdt_d(const int N, - const int M, - const VAL_T *__restrict__ srcV, - const VAL_T *__restrict__ srcM, - VAL_T *__restrict__ dstV) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const VAL_T ONEP5 = static_cast(1.5); - - //#pragma unroll - for(int i = 0; i < N; i++) { - - VAL_T __tmp = 0; - - //#pragma unroll - for(int j = 0; j < M; j += BDIM_X) { - if (j+tidx < M) { - const VAL_T v = srcV[j+tidx]; - __tmp += -LOG(v)*(ONEP5+LOG(v))*v * srcM[i*M + j+tidx]; - } - } - #pragma unroll - for(int j = BDIM_X/2; j; j /= 2) { -#if 0 - __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); -#else - __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); -#endif - } - // values could be held by BDIM_X threads and written - // together every BDIM_X iterations... 
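// [editor's note] The shuffle loop above is the standard warp-level tree
// reduction used throughout these helpers: the stride halves every pass, so
// after log2(BDIM_X) steps the group's partial sums have collapsed onto its
// first lane, which is why only tidx == 0 stores dstV[i] afterwards. A
// minimal self-contained sketch of the idiom, assuming BDIM_X is a power of
// two no larger than the 32-lane warp (warp_group_sum is an illustrative
// name, not part of this codebase):

template <int BDIM_X>
__device__ float warp_group_sum(float v, const unsigned mask) {
    #pragma unroll
    for (int j = BDIM_X / 2; j; j /= 2) {
        v += __shfl_down_sync(mask, v, j, BDIM_X); // lane i adds lane i+j
    }
    return v; // the full sum is valid on the first lane of each group
}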
- - if (tidx == 0) { - dstV[i] = __tmp; - } - } - return; -} - -template -__device__ void ndotp_log_csa_d(const int N, - const int M, - const VAL_T *__restrict__ srcV, - const VAL_T *__restrict__ srcM, - VAL_T *__restrict__ dstV) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - // Clamp values - constexpr VAL_T min = .001; - constexpr VAL_T max = .999; - - //#pragma unroll - for(int i = 0; i < N; i++) { - - VAL_T __tmp = 0; - - //#pragma unroll - for(int j = 0; j < M; j += BDIM_X) { - if (j+tidx < M) { - const VAL_T v = MIN(MAX(srcV[j+tidx], min), max); - __tmp += LOG(-LOG(v)) * srcM[i*M + j+tidx]; - } - } - #pragma unroll - for(int j = BDIM_X/2; j; j /= 2) { -#if 0 - __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); -#else - __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); -#endif - } - // values could be held by BDIM_X threads and written - // together every BDIM_X iterations... - - if (tidx == 0) { - dstV[i] = __tmp; - } - } - return; -} - - -template -__device__ void fit_opdt(const int delta_nr, - const int hr_side, - const REAL_T *__restrict__ delta_q, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ __msk_data_sh, - REAL_T *__restrict__ __h_sh, - REAL_T *__restrict__ __r_sh) { - const int tidx = threadIdx.x; - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - ndotp_log_opdt_d(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh); - ndotp_d (delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh); - __syncwarp(WMASK); - #pragma unroll - for(int j = tidx; j < delta_nr; j += BDIM_X) { - __r_sh[j] -= __h_sh[j]; - } - __syncwarp(WMASK); -} - -template -__device__ void fit_csa(const int delta_nr, - const int hr_side, - const REAL_T *__restrict__ fit_matrix, - const REAL_T *__restrict__ __msk_data_sh, - REAL_T *__restrict__ __r_sh) { - const int tidx = threadIdx.x; - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - constexpr REAL _n0_const = 0.28209479177387814; // .5 / sqrt(pi) - ndotp_log_csa_d(delta_nr, hr_side, __msk_data_sh, fit_matrix, __r_sh); - __syncwarp(WMASK); - if (tidx == 0) { - __r_sh[0] = _n0_const; - } - __syncwarp(WMASK); -} - -template -__device__ void fit_model_coef(const int delta_nr, // delta_nr is number of ODF directions - const int hr_side, // hr_side is number of data directions - const REAL_T *__restrict__ delta_q, - const REAL_T *__restrict__ delta_b, // these are fit matrices the model can use, different for each model - const REAL_T *__restrict__ __msk_data_sh, // __msk_data_sh is the part of the data currently being operated on by this block - REAL_T *__restrict__ __h_sh, // these last two are modifications to the coefficients that will be returned - REAL_T *__restrict__ __r_sh) { - switch(MODEL_T) { - case OPDT: - fit_opdt(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh); - break; - case CSA: - fit_csa(delta_nr, hr_side, delta_q, __msk_data_sh, __r_sh); - break; - default: - printf("FATAL: Invalid Model Type.\n"); - break; - } -} - -template -__device__ VAL_T max_mask_transl_d(const int n, - const LEN_T *__restrict__ srcMsk, - const VAL_T *__restrict__ srcVal, - const VAL_T offset, - const VAL_T minVal) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = 
((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - VAL_T __m = minVal; - - for(int i = tidx; i < n; i += BDIM_X) { - const LEN_T sel = srcMsk[i]; - if (sel > 0) { - __m = MAX(__m, srcVal[i]+offset); - } - } - - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); - __m = MAX(__m, __tmp); - } - - return __m; -} - -template -__device__ VAL_T min_d(const int n, const VAL_T *__restrict__ src, const VAL_T maxVal) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - VAL_T __m = maxVal; - - for(int i = tidx; i < n; i += BDIM_X) { - __m = MIN(__m, src[i]); - } - - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); - __m = MIN(__m, __tmp); - } - - return __m; -} - -template -__device__ VAL_T avgMask(const int mskLen, - const int *__restrict__ mask, - const VAL_T *__restrict__ data) { - - const int tidx = threadIdx.x; - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - int __myCnt = 0; - VAL_T __mySum = 0; - - for(int i = tidx; i < mskLen; i += BDIM_X) { - if(mask[i]) { - __myCnt++; - __mySum += data[i]; - } - } - - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - __mySum += __shfl_xor_sync(WMASK, __mySum, i, BDIM_X); - __myCnt += __shfl_xor_sync(WMASK, __myCnt, i, BDIM_X); - } - - return __mySum/__myCnt; - -} - -template -__device__ int peak_directions_d(const REAL_T *__restrict__ odf, - REAL3_T *__restrict__ dirs, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - int samplm_nr, - int *__restrict__ __shInd, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const unsigned int lmask = (1 << lid)-1; - -// __shared__ int __shInd[BDIM_Y][SAMPLM_NR]; - - #pragma unroll - for(int j = tidx; j < samplm_nr; j += BDIM_X) { - __shInd[j] = 0; - } - - REAL_T odf_min = min_d(samplm_nr, odf, REAL_MAX); - odf_min = MAX(0, odf_min); - - __syncwarp(WMASK); - - // local_maxima() + _compare_neighbors() - // selecting only the indices corrisponding to maxima Ms - // such that M-odf_min >= relative_peak_thres - //#pragma unroll - for(int j = 0; j < num_edges; j += BDIM_X) { - if (j+tidx < num_edges) { - const int u_ind = sphere_edges[j+tidx].x; - const int v_ind = sphere_edges[j+tidx].y; - - //if (u_ind >= NUM_EDGES || v_ind >= NUM_EDGES) { ERROR; } - - const REAL_T u_val = odf[u_ind]; - const REAL_T v_val = odf[v_ind]; - - //if (u_val != u_val || v_val != v_val) { ERROR_NANs; } - - // only check that they are not equal - //if (u_val != v_val) { - // __shInd[tidy][u_val < v_val ? u_ind : v_ind] = -1; // benign race conditions... 
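// [editor's note] A hedged scalar sketch of what the atomic edge sweep below
// computes (ind[] standing in for __shInd): for every sphere edge (u, v) the
// endpoint with the smaller ODF value can never be a local maximum and is
// forced to -1, while the larger endpoint is flagged as having beaten a
// neighbour. Only vertices left strictly positive survive as candidate
// peaks. The races in the concurrent version are benign because the two
// updates commute on the values involved (-1 | 1 == -1).
//
//   for (int e = 0; e < num_edges; e++) {   // serial equivalent
//       const int u = sphere_edges[e].x, v = sphere_edges[e].y;
//       if      (odf[u] < odf[v]) { ind[u] = -1; ind[v] |= 1; }
//       else if (odf[v] < odf[u]) { ind[v] = -1; ind[u] |= 1; }
//   }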
- //} - if (u_val < v_val) { - atomicExch(__shInd+u_ind, -1); - atomicOr( __shInd+v_ind, 1); - } else if (v_val < u_val) { - atomicExch(__shInd+v_ind, -1); - atomicOr( __shInd+u_ind, 1); - } - } - } - __syncwarp(WMASK); - - const REAL_T compThres = relative_peak_thres*max_mask_transl_d(samplm_nr, __shInd, odf, -odf_min, REAL_MIN); -#if 1 -/* - if (!tidy && !tidx) { - for(int j = 0; j < SAMPLM_NR; j++) { - printf("local_max[%d]: %d (%f)\n", j, __shInd[tidy][j], odf[j]); - } - printf("maxMax with offset %f: %f\n", -odf_min, compThres); - } - __syncwarp(WMASK); -*/ - // compact indices of positive values to the right - int n = 0; - - for(int j = 0; j < samplm_nr; j += BDIM_X) { - - const int __v = (j+tidx < samplm_nr) ? __shInd[j+tidx] : -1; - const int __keep = (__v > 0) && ((odf[j+tidx]-odf_min) >= compThres); - const int __msk = __ballot_sync(WMASK, __keep); - -//__syncwarp(WMASK); // unnecessary - if (__keep) { - const int myoff = __popc(__msk & lmask); - __shInd[n + myoff] = j+tidx; - } - n += __popc(__msk); -//__syncwarp(WMASK); // should be unnecessary - } - __syncwarp(WMASK); -/* - if (!tidy && !tidx) { - for(int j = 0; j < n; j++) { - printf("local_max_compact[%d]: %d\n", j, __shInd[tidy][j]); - } - } - __syncwarp(WMASK); -*/ - - // sort local maxima indices - if (n < BDIM_X) { - REAL_T k = REAL_MIN; - int v = 0; - if (tidx < n) { - v = __shInd[tidx]; - k = odf[v]; - } - warp_sort<32, BDIM_X, WSORT_DIR_DEC>(&k, &v); - __syncwarp(WMASK); - - if (tidx < n) { - __shInd[tidx] = v; - } - } else { - // ERROR !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - } - __syncwarp(WMASK); - - // __shInd[tidy][] contains the indices in odf correspoding to - // normalized maxima NOT sorted! - if (n != 0) { - // remove_similar_vertices() - // PRELIMINARY INEFFICIENT, SINGLE TH, IMPLEMENTATION - if (tidx == 0) { - const REAL_T cos_similarity = COS(min_separation_angle); - - dirs[0] = sphere_vertices[__shInd[0]]; - - int k = 1; - for(int i = 1; i < n; i++) { - - const REAL3_T abc = sphere_vertices[__shInd[i]]; - - int j = 0; - for(; j < k; j++) { - const REAL_T cos = FABS(abc.x*dirs[j].x+ - abc.y*dirs[j].y+ - abc.z*dirs[j].z); - if (cos > cos_similarity) { - break; - } - } - if (j == k) { - dirs[k++] = abc; - } - } - n = k; - } - n = __shfl_sync(WMASK, n, 0, BDIM_X); - __syncwarp(WMASK); - - } -/* - if (!tidy && !tidx) { - for(int j = 0; j < n; j++) { - printf("local_max_compact_uniq[%d]: %d\n", j, __shInd[tidy][j]); - } - } - __syncwarp(WMASK); -*/ -#else - const int indMax = max_d(__shInd[tidy], -1); - if (indMax != -1) { - __ret = MAKE_REAL3(sphere_vertices[indMax][0], - sphere_vertices[indMax][1], - sphere_vertices[indMax][2]); - } -#endif - return n; -} - -template -__device__ int closest_peak_d(const REAL_T max_angle, - const REAL3_T direction, //dir - const int npeaks, - const REAL3_T *__restrict__ peaks, - REAL3_T *__restrict__ peak) {// dirs, - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - //const REAL_T cos_similarity = COS(MAX_ANGLE_P); - const REAL_T cos_similarity = COS(max_angle); -#if 0 - if (!threadIdx.y && !tidx) { - printf("direction: (%f, %f, %f)\n", - direction.x, direction.y, direction.z); - } - __syncwarp(WMASK); -#endif - REAL_T cpeak_dot = 0; - int cpeak_idx = -1; - for(int j = 0; j < npeaks; j += BDIM_X) { - if (j+tidx < npeaks) { -#if 0 - if (!threadIdx.y && !tidx) { - printf("j+tidx: %d, peaks[j+tidx]: (%f, %f, %f)\n", - j+tidx, peaks[j+tidx].x, peaks[j+tidx].y, 
peaks[j+tidx].z); - } -#endif - const REAL_T dot = direction.x*peaks[j+tidx].x+ - direction.y*peaks[j+tidx].y+ - direction.z*peaks[j+tidx].z; - - if (FABS(dot) > FABS(cpeak_dot)) { - cpeak_dot = dot; - cpeak_idx = j+tidx; - } - } - } -#if 0 - if (!threadIdx.y && !tidx) { - printf("cpeak_idx: %d, cpeak_dot: %f\n", cpeak_idx, cpeak_dot); - } - __syncwarp(WMASK); -#endif - - #pragma unroll - for(int j = BDIM_X/2; j; j /= 2) { - - const REAL_T dot = __shfl_xor_sync(WMASK, cpeak_dot, j, BDIM_X); - const int idx = __shfl_xor_sync(WMASK, cpeak_idx, j, BDIM_X); - if (FABS(dot) > FABS(cpeak_dot)) { - cpeak_dot = dot; - cpeak_idx = idx; - } - } -#if 0 - if (!threadIdx.y && !tidx) { - printf("cpeak_idx: %d, cpeak_dot: %f, cos_similarity: %f\n", cpeak_idx, cpeak_dot, cos_similarity); - } - __syncwarp(WMASK); -#endif - if (cpeak_idx >= 0) { - if (cpeak_dot >= cos_similarity) { - peak[0] = peaks[cpeak_idx]; - return 1; - } - if (cpeak_dot <= -cos_similarity) { - peak[0] = MAKE_REAL3(-peaks[cpeak_idx].x, - -peaks[cpeak_idx].y, - -peaks[cpeak_idx].z); - return 1; - } - } - return 0; -} - -template -__device__ LEN_T maskGet(const LEN_T n, - const MSK_T *__restrict__ mask, - const VAL_T *__restrict__ plain, - VAL_T *__restrict__ masked) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const int __laneMask = (1 << tidx)-1; - - int woff = 0; - for(int j = 0; j < n; j += BDIM_X) { - - const int __act = (j+tidx < n) ? !mask[j+tidx] : 0; - const int __msk = __ballot_sync(WMASK, __act); - - const int toff = __popc(__msk & __laneMask); - if (__act) { - masked[woff+toff] = plain[j+tidx]; - } - woff += __popc(__msk); - } - return woff; -} - -template -__device__ void maskPut(const LEN_T n, - const MSK_T *__restrict__ mask, - const VAL_T *__restrict__ masked, - VAL_T *__restrict__ plain) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const int __laneMask = (1 << tidx)-1; - - int woff = 0; - for(int j = 0; j < n; j += BDIM_X) { - - const int __act = (j+tidx < n) ? !mask[j+tidx] : 0; - const int __msk = __ballot_sync(WMASK, __act); - - const int toff = __popc(__msk & __laneMask); - if (__act) { - plain[j+tidx] = masked[woff+toff]; - } - woff += __popc(__msk); - } - return; -} - template -__device__ int get_direction_boot_d( - curandStatePhilox4_32_10_t *st, - const REAL_T max_angle, - const REAL_T min_signal, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle, - REAL3_T dir, - const int dimx, - const int dimy, - const int dimz, - const int dimt, - const REAL_T *__restrict__ dataf, - const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask) - // but not clear if it will never be needed so - // we'll keep it here for now... 
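// [editor's note] H and R realize a residual bootstrap: H*data reconstructs
// the model-fitted signal and R*data the per-direction residuals, so each
// of the NATTEMPTS iterations below resamples a plausible signal as fit
// plus randomly permuted residuals (the __h_sh[j] += __r_sh[srcPermInd]
// step), refits the model, and searches the resulting pmf for peaks.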
- const REAL3_T point, - const REAL_T *__restrict__ H, - const REAL_T *__restrict__ R, - // model unused - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - const int delta_nr, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ delta_q, // fit_matrix - const int samplm_nr, - const REAL_T *__restrict__ sampling_matrix, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - REAL3_T *__restrict__ dirs) { - - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const int n32dimt = ((dimt+31)/32)*32; - - extern REAL_T __shared__ __sh[]; - - REAL_T *__vox_data_sh = reinterpret_cast(__sh); - REAL_T *__msk_data_sh = __vox_data_sh + BDIM_Y*n32dimt; - - REAL_T *__r_sh = __msk_data_sh + BDIM_Y*n32dimt; - REAL_T *__h_sh = __r_sh + BDIM_Y*MAX(n32dimt, samplm_nr); - - __vox_data_sh += tidy*n32dimt; - __msk_data_sh += tidy*n32dimt; - - __r_sh += tidy*MAX(n32dimt, samplm_nr); - __h_sh += tidy*MAX(n32dimt, samplm_nr); - - // compute hr_side (may be passed from python) - int hr_side = 0; - for(int j = tidx; j < dimt; j += BDIM_X) { - hr_side += !b0s_mask[j] ? 1 : 0; - } - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - hr_side += __shfl_xor_sync(WMASK, hr_side, i, BDIM_X); - } - - #pragma unroll - for(int i = 0; i < NATTEMPTS; i++) { - - const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, -1, dataf, point, __vox_data_sh); - - const int nmsk = maskGet(dimt, b0s_mask, __vox_data_sh, __msk_data_sh); - - //if (!tidx && !threadIdx.y && !blockIdx.x) { - // - // printf("interp of %f, %f, %f\n", point.x, point.y, point.z); - // printf("hr_side: %d\n", hr_side); - // printArray("vox_data", 6, dimt, __vox_data_sh[tidy]); - // printArray("msk_data", 6, nmsk, __msk_data_sh[tidy]); - //} - //break; - - __syncwarp(WMASK); - - if (rv == 0) { - - ndotp_d(hr_side, hr_side, __msk_data_sh, R, __r_sh); - //__syncwarp(); - //printArray("__r", 5, hr_side*hr_side, R); - //printArray("__r_sh", 6, hr_side, __r_sh[tidy]); - - ndotp_d(hr_side, hr_side, __msk_data_sh, H, __h_sh); - //__syncwarp(); - //printArray("__h_sh", 6, hr_side, __h_sh[tidy]); - - __syncwarp(WMASK); - - for(int j = 0; j < hr_side; j += BDIM_X) { - if (j+tidx < hr_side) { -#ifdef USE_FIXED_PERMUTATION - const int srcPermInd = fixedPerm[j+tidx]; -#else - const int srcPermInd = curand(st) % hr_side; -// if (srcPermInd < 0 || srcPermInd >= hr_side) { -// printf("srcPermInd: %d\n", srcPermInd); -// } -#endif - __h_sh[j+tidx] += __r_sh[srcPermInd]; - //__h_sh[j+tidx] += __r_sh[j+tidx]; - } - } - __syncwarp(WMASK); - - //printArray("h+perm(r):", 6, hr_side, __h_sh[tidy]); - //__syncwarp(); - - // vox_data[dwi_mask] = masked_data - maskPut(dimt, b0s_mask, __h_sh, __vox_data_sh); - __syncwarp(WMASK); - - //printArray("vox_data[dwi_mask]:", 6, dimt, __vox_data_sh[tidy]); - //__syncwarp(); - - for(int j = tidx; j < dimt; j += BDIM_X) { - //__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]); - __vox_data_sh[j] = MAX(min_signal, __vox_data_sh[j]); - } - __syncwarp(WMASK); - - const REAL_T denom = avgMask(dimt, b0s_mask, __vox_data_sh); - - for(int j = tidx; j < dimt; j += BDIM_X) { - __vox_data_sh[j] /= denom; - } - __syncwarp(); - - //if (!tidx && !threadIdx.y && !blockIdx.x) { - // printf("denom: %f\n", denom); - //} - ////break; - //if (!tidx && !threadIdx.y && !blockIdx.x) 
{ - // - // printf("__vox_data_sh:\n"); - // printArray("vox_data", 6, dimt, __vox_data_sh[tidy]); - //} - //break; - - maskGet(dimt, b0s_mask, __vox_data_sh, __msk_data_sh); - __syncwarp(WMASK); - - fit_model_coef(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh); - - // __r_sh[tidy] <- python 'coef' - - ndotp_d(samplm_nr, delta_nr, __r_sh, sampling_matrix, __h_sh); - - // __h_sh[tidy] <- python 'pmf' - } else { - #pragma unroll - for(int j = tidx; j < samplm_nr; j += BDIM_X) { - __h_sh[j] = 0; - } - // __h_sh[tidy] <- python 'pmf' - } - __syncwarp(WMASK); -#if 0 - if (!threadIdx.y && threadIdx.x == 0) { - for(int j = 0; j < samplm_nr; j++) { - printf("pmf[%d]: %f\n", j, __h_sh[tidy][j]); - } - } - //return; -#endif - const REAL_T abs_pmf_thr = PMF_THRESHOLD_P*max_d(samplm_nr, __h_sh, REAL_MIN); - __syncwarp(WMASK); - - #pragma unroll - for(int j = tidx; j < samplm_nr; j += BDIM_X) { - const REAL_T __v = __h_sh[j]; - if (__v < abs_pmf_thr) { - __h_sh[j] = 0; - } - } - __syncwarp(WMASK); -#if 0 - if (!threadIdx.y && threadIdx.x == 0) { - printf("abs_pmf_thr: %f\n", abs_pmf_thr); - for(int j = 0; j < samplm_nr; j++) { - printf("pmfNORM[%d]: %f\n", j, __h_sh[tidy][j]); - } - } - //return; -#endif -#if 0 - if init: - directions = peak_directions(pmf, sphere)[0] - return directions - else: - peaks = peak_directions(pmf, sphere)[0] - if (len(peaks) > 0): - return closest_peak(directions, peaks, cos_similarity) -#endif - const int ndir = peak_directions_d(__h_sh, dirs, - sphere_vertices, - sphere_edges, - num_edges, - samplm_nr, - reinterpret_cast(__r_sh), // reuse __r_sh as shInd in func which is large enough - relative_peak_thres, - min_separation_angle); - if (NATTEMPTS == 1) { // init=True... - return ndir; // and dirs; - } else { // init=False... - if (ndir > 0) { - /* - if (!threadIdx.y && threadIdx.x == 0 && ndir > 1) { - printf("NATTEMPTS=5 and ndir: %d!!!\n", ndir); - } - */ - REAL3_T peak; - const int foundPeak = closest_peak_d(max_angle, dir, ndir, dirs, &peak); - __syncwarp(WMASK); - if (foundPeak) { - if (tidx == 0) { - dirs[0] = peak; - } - return 1; - } - } - } - } - return 0; -} - -enum {OUTSIDEIMAGE, INVALIDPOINT, TRACKPOINT, ENDPOINT}; - -template -__device__ int check_point_d(const REAL_T tc_threshold, - const REAL3_T point, - const int dimx, - const int dimy, - const int dimz, - const REAL_T *__restrict__ metric_map) { - - const int tidy = threadIdx.y; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - __shared__ REAL_T __shInterpOut[BDIM_Y]; - - const int rv = trilinear_interp_d(dimx, dimy, dimz, 1, 0, metric_map, point, __shInterpOut+tidy); - __syncwarp(WMASK); -#if 0 - if (threadIdx.y == 1 && threadIdx.x == 0) { - printf("__shInterpOut[tidy]: %f, TC_THRESHOLD_P: %f\n", __shInterpOut[tidy], TC_THRESHOLD_P); - } -#endif - if (rv != 0) { - return OUTSIDEIMAGE; - } - //return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT; - return (__shInterpOut[tidy] > tc_threshold) ? 
TRACKPOINT : ENDPOINT; -} - template::type* __restrict__ ctx, const int samplm_nr, const REAL3_T *__restrict__ sphere_vertices, const int2 *__restrict__ sphere_edges, @@ -1229,11 +331,6 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, if (tidx == 0) { streamline[0] = point; -#if 0 - if (threadIdx.y == 1) { - printf("streamline[0]: %f, %f, %f\n", point.x, point.y, point.z); - } -#endif } __syncwarp(WMASK); @@ -1277,30 +374,6 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, point, sphere_vertices, __sh_new_dir + tidy); - } else { - // call get_direction_boot_d() with NATTEMPTS=5 - ndir = get_direction_boot_d( - st, - max_angle, - ctx->min_signal, - relative_peak_thres, - min_separation_angle, - direction, - dimx, dimy, dimz, dimt, dataf, - ctx->b0s_mask /* !dwi_mask */, - point, - ctx->H, ctx->R, - ctx->delta_nr, - ctx->delta_b, ctx->delta_q, // fit_matrix - samplm_nr, - ctx->sampling_matrix, - sphere_vertices, - sphere_edges, - num_edges, - __sh_new_dir + tidy); } __syncwarp(WMASK); direction = __sh_new_dir[tidy]; @@ -1315,9 +388,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, } //return; #endif - //point.x += (direction.x / voxel_size.x) * STEP_SIZE_P; - //point.y += (direction.y / voxel_size.y) * STEP_SIZE_P; - //point.z += (direction.z / voxel_size.z) * STEP_SIZE_P; + point.x += (direction.x / voxel_size.x) * (step_size / step_frac); point.y += (direction.y / voxel_size.y) * (step_size / step_frac); point.z += (direction.z / voxel_size.z) * (step_size / step_frac); @@ -1382,136 +453,6 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, return tissue_class; } -template -__global__ void getNumStreamlinesBoot_k( - const ModelType model_type, - const REAL_T max_angle, - const REAL_T min_signal, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle, - const long long rndSeed, - const int nseed, - const REAL3_T *__restrict__ seeds, - const int dimx, - const int dimy, - const int dimz, - const int dimt, - const REAL_T *__restrict__ dataf, - const REAL_T *__restrict__ H, - const REAL_T *__restrict__ R, - const int delta_nr, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ delta_q, - const int *__restrict__ b0s_mask, // change to int - const int samplm_nr, - const REAL_T *__restrict__ sampling_matrix, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - REAL3_T *__restrict__ shDir0, - int *slineOutOff) { - - const int tidx = threadIdx.x; - const int slid = blockIdx.x*blockDim.y + threadIdx.y; - const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; - - if (slid >= nseed) { - return; - } - - REAL3_T seed = seeds[slid]; - // seed = lin_mat*seed + offset - - REAL3_T *__restrict__ __shDir = shDir0+slid*samplm_nr; - - // const int hr_side = dimt-1; - - curandStatePhilox4_32_10_t st; - //curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X) - curand_init(rndSeed, gid, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X) - // elements of the same sequence - // python: - //directions = get_direction(None, dataf, dwi_mask, sphere, s, H, R, model, max_angle, - // pmf_threshold, b0s_mask, min_signal, fit_matrix, - // sampling_matrix, init=True) - - //if (!tidx && !threadIdx.y && !blockIdx.x) { - // printf("seed: %f, %f, %f\n", seed.x, seed.y, seed.z); - //} - - int ndir; - switch(model_type) { - case OPDT: - ndir = get_direction_boot_d( - &st, - max_angle, - 
min_signal, - relative_peak_thres, - min_separation_angle, - MAKE_REAL3(0,0,0), - dimx, dimy, dimz, dimt, dataf, - b0s_mask /* !dwi_mask */, - seed, - H, R, - // model unused - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - delta_nr, - delta_b, delta_q, // fit_matrix - samplm_nr, - sampling_matrix, - sphere_vertices, - sphere_edges, - num_edges, - __shDir); - break; - case CSA: - ndir = get_direction_boot_d( - &st, - max_angle, - min_signal, - relative_peak_thres, - min_separation_angle, - MAKE_REAL3(0,0,0), - dimx, dimy, dimz, dimt, dataf, - b0s_mask /* !dwi_mask */, - seed, - H, R, - // model unused - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - delta_nr, - delta_b, delta_q, // fit_matrix - samplm_nr, - sampling_matrix, - sphere_vertices, - sphere_edges, - num_edges, - __shDir); - break; - default: - printf("FATAL: Invalid Model Type.\n"); - break; - } - - if (tidx == 0) { - slineOutOff[slid] = ndir; - } - - return; -} - template -__global__ void genStreamlinesMerge_k( +__global__ void genStreamlinesMergeProb_k( const REAL_T max_angle, const REAL_T tc_threshold, const REAL_T step_size, @@ -1588,7 +529,6 @@ __global__ void genStreamlinesMerge_k( const int dimt, const REAL_T *__restrict__ dataf, const REAL_T *__restrict__ metric_map, - const typename ModelCtx::type* __restrict__ ctx, const int samplm_nr, const REAL3_T *__restrict__ sphere_vertices, const int2 *__restrict__ sphere_edges, @@ -1689,8 +629,7 @@ __global__ void genStreamlinesMerge_k( MAKE_REAL3(1, 1, 1), dimx, dimy, dimz, dimt, dataf, metric_map, - ctx, - samplm_nr, + samplm_nr, sphere_vertices, sphere_edges, num_edges, @@ -1724,8 +663,7 @@ __global__ void genStreamlinesMerge_k( MAKE_REAL3(1, 1, 1), dimx, dimy, dimz, dimt, dataf, metric_map, - ctx, - samplm_nr, + samplm_nr, sphere_vertices, sphere_edges, num_edges, diff --git a/cuslines/cuda_c/globals.h b/cuslines/cuda_c/globals.h index b9f8211..71bcd73 100644 --- a/cuslines/cuda_c/globals.h +++ b/cuslines/cuda_c/globals.h @@ -98,33 +98,6 @@ enum ModelType { PTT = 3, }; -struct NoCtx {}; - -template -struct BootCtx { - REAL_T min_signal; - int delta_nr; - const REAL_T* H; - const REAL_T* R; - const REAL_T* delta_b; - const REAL_T* delta_q; - const REAL_T* sampling_matrix; - const int* b0s_mask; -}; - -template -struct ModelCtx { - using type = NoCtx; -}; - -template -struct ModelCtx { - using type = BootCtx; -}; - -template -struct ModelCtx { - using type = BootCtx; -}; +enum {OUTSIDEIMAGE, INVALIDPOINT, TRACKPOINT, ENDPOINT}; #endif diff --git a/cuslines/cuda_c/tracking_helpers.cu b/cuslines/cuda_c/tracking_helpers.cu new file mode 100644 index 0000000..21d5f67 --- /dev/null +++ b/cuslines/cuda_c/tracking_helpers.cu @@ -0,0 +1,290 @@ + +using namespace cuwsort; + +template +__device__ REAL_T interpolation_helper_d(const REAL_T*__restrict__ dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) { + REAL_T __tmp = 0; + #pragma unroll + for (int i = 0; i < 2; i++) { + #pragma unroll + for (int j = 0; j < 2; j++) { + #pragma unroll + for (int k = 0; k < 2; k++) { + __tmp += wgh[0][i] * wgh[1][j] * wgh[2][k] * + dataf[coo[0][i] * dimy * dimz * dimt + + coo[1][j] * dimz * dimt + + coo[2][k] * dimt + + t]; + } + } + } + return __tmp; +} + +template +__device__ int trilinear_interp_d(const int dimx, + const int dimy, + const int dimz, + const int dimt, + int dimt_idx, // If -1, get all + const REAL_T 
*__restrict__ dataf, + const REAL3_T point, + REAL_T *__restrict__ __vox_data) { + const REAL_T HALF = static_cast(0.5); + + // all thr compute the same here + if (point.x < -HALF || point.x+HALF >= dimx || + point.y < -HALF || point.y+HALF >= dimy || + point.z < -HALF || point.z+HALF >= dimz) { + return -1; + } + + long long coo[3][2]; + REAL_T wgh[3][2]; // could use just one... + + const REAL_T ONE = static_cast(1.0); + + const REAL3_T fl = MAKE_REAL3(FLOOR(point.x), + FLOOR(point.y), + FLOOR(point.z)); + + wgh[0][1] = point.x - fl.x; + wgh[0][0] = ONE-wgh[0][1]; + coo[0][0] = MAX(0, fl.x); + coo[0][1] = MIN(dimx-1, coo[0][0]+1); + + wgh[1][1] = point.y - fl.y; + wgh[1][0] = ONE-wgh[1][1]; + coo[1][0] = MAX(0, fl.y); + coo[1][1] = MIN(dimy-1, coo[1][0]+1); + + wgh[2][1] = point.z - fl.z; + wgh[2][0] = ONE-wgh[2][1]; + coo[2][0] = MAX(0, fl.z); + coo[2][1] = MIN(dimz-1, coo[2][0]+1); + + if (dimt_idx == -1) { + for (int t = threadIdx.x; t < dimt; t += BDIM_X) { + __vox_data[t] = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, t); + } + } else { + *__vox_data = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, dimt_idx); + } + + // if (threadIdx.x == 0) { + // printf("point: %f, %f, %f\n", point.x, point.y, point.z); + // printf("dimt_idx: %d\n", dimt_idx); + // // for(int i = 0; i < dimt; i++) { + // // printf("__vox_data[%d]: %f\n", i, __vox_data[i]); + // // } + // } + return 0; +} + +template +__device__ int check_point_d(const REAL_T tc_threshold, + const REAL3_T point, + const int dimx, + const int dimy, + const int dimz, + const REAL_T *__restrict__ metric_map) { + + const int tidy = threadIdx.y; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + __shared__ REAL_T __shInterpOut[BDIM_Y]; + + const int rv = trilinear_interp_d(dimx, dimy, dimz, 1, 0, metric_map, point, __shInterpOut+tidy); + __syncwarp(WMASK); +#if 0 + if (threadIdx.y == 1 && threadIdx.x == 0) { + printf("__shInterpOut[tidy]: %f, TC_THRESHOLD_P: %f\n", __shInterpOut[tidy], TC_THRESHOLD_P); + } +#endif + if (rv != 0) { + return OUTSIDEIMAGE; + } + //return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT; + return (__shInterpOut[tidy] > tc_threshold) ? 
TRACKPOINT : ENDPOINT; +} + +template +__device__ int peak_directions_d(const REAL_T *__restrict__ odf, + REAL3_T *__restrict__ dirs, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + int samplm_nr, + int *__restrict__ __shInd, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const unsigned int lmask = (1 << lid)-1; + +// __shared__ int __shInd[BDIM_Y][SAMPLM_NR]; + + #pragma unroll + for(int j = tidx; j < samplm_nr; j += BDIM_X) { + __shInd[j] = 0; + } + + REAL_T odf_min = min_d(samplm_nr, odf, REAL_MAX); + odf_min = MAX(0, odf_min); + + __syncwarp(WMASK); + + // local_maxima() + _compare_neighbors() + // selecting only the indices corrisponding to maxima Ms + // such that M-odf_min >= relative_peak_thres + //#pragma unroll + for(int j = 0; j < num_edges; j += BDIM_X) { + if (j+tidx < num_edges) { + const int u_ind = sphere_edges[j+tidx].x; + const int v_ind = sphere_edges[j+tidx].y; + + //if (u_ind >= NUM_EDGES || v_ind >= NUM_EDGES) { ERROR; } + + const REAL_T u_val = odf[u_ind]; + const REAL_T v_val = odf[v_ind]; + + //if (u_val != u_val || v_val != v_val) { ERROR_NANs; } + + // only check that they are not equal + //if (u_val != v_val) { + // __shInd[tidy][u_val < v_val ? u_ind : v_ind] = -1; // benign race conditions... + //} + if (u_val < v_val) { + atomicExch(__shInd+u_ind, -1); + atomicOr( __shInd+v_ind, 1); + } else if (v_val < u_val) { + atomicExch(__shInd+v_ind, -1); + atomicOr( __shInd+u_ind, 1); + } + } + } + __syncwarp(WMASK); + + const REAL_T compThres = relative_peak_thres*max_mask_transl_d(samplm_nr, __shInd, odf, -odf_min, REAL_MIN); +#if 1 +/* + if (!tidy && !tidx) { + for(int j = 0; j < SAMPLM_NR; j++) { + printf("local_max[%d]: %d (%f)\n", j, __shInd[tidy][j], odf[j]); + } + printf("maxMax with offset %f: %f\n", -odf_min, compThres); + } + __syncwarp(WMASK); +*/ + // compact indices of positive values to the right + int n = 0; + + for(int j = 0; j < samplm_nr; j += BDIM_X) { + + const int __v = (j+tidx < samplm_nr) ? __shInd[j+tidx] : -1; + const int __keep = (__v > 0) && ((odf[j+tidx]-odf_min) >= compThres); + const int __msk = __ballot_sync(WMASK, __keep); + +//__syncwarp(WMASK); // unnecessary + if (__keep) { + const int myoff = __popc(__msk & lmask); + __shInd[n + myoff] = j+tidx; + } + n += __popc(__msk); +//__syncwarp(WMASK); // should be unnecessary + } + __syncwarp(WMASK); +/* + if (!tidy && !tidx) { + for(int j = 0; j < n; j++) { + printf("local_max_compact[%d]: %d\n", j, __shInd[tidy][j]); + } + } + __syncwarp(WMASK); +*/ + + // sort local maxima indices + if (n < BDIM_X) { + REAL_T k = REAL_MIN; + int v = 0; + if (tidx < n) { + v = __shInd[tidx]; + k = odf[v]; + } + warp_sort<32, BDIM_X, WSORT_DIR_DEC>(&k, &v); + __syncwarp(WMASK); + + if (tidx < n) { + __shInd[tidx] = v; + } + } else { + // ERROR !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + } + __syncwarp(WMASK); + + // __shInd[tidy][] contains the indices in odf correspoding to + // normalized maxima NOT sorted! 
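// [editor's note] The single-thread loop below is a greedy angular
// de-duplication (the CUDA counterpart of remove_similar_vertices()): the
// strongest peak is always kept, and each further candidate, visited in
// decreasing ODF order, survives only if its absolute dot product with
// every direction accepted so far is <= cos(min_separation_angle). The
// FABS() folds antipodal sphere vertices onto the same peak. Roughly:
//
//   keep dirs[0];
//   for each remaining candidate c:
//       if max_j |dot(c, dirs[j])| <= cos(min_separation_angle):
//           append c to dirs;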
+ if (n != 0) { + // remove_similar_vertices() + // PRELIMINARY INEFFICIENT, SINGLE TH, IMPLEMENTATION + if (tidx == 0) { + const REAL_T cos_similarity = COS(min_separation_angle); + + dirs[0] = sphere_vertices[__shInd[0]]; + + int k = 1; + for(int i = 1; i < n; i++) { + + const REAL3_T abc = sphere_vertices[__shInd[i]]; + + int j = 0; + for(; j < k; j++) { + const REAL_T cos = FABS(abc.x*dirs[j].x+ + abc.y*dirs[j].y+ + abc.z*dirs[j].z); + if (cos > cos_similarity) { + break; + } + } + if (j == k) { + dirs[k++] = abc; + } + } + n = k; + } + n = __shfl_sync(WMASK, n, 0, BDIM_X); + __syncwarp(WMASK); + + } +/* + if (!tidy && !tidx) { + for(int j = 0; j < n; j++) { + printf("local_max_compact_uniq[%d]: %d\n", j, __shInd[tidy][j]); + } + } + __syncwarp(WMASK); +*/ +#else + const int indMax = max_d(__shInd[tidy], -1); + if (indMax != -1) { + __ret = MAKE_REAL3(sphere_vertices[indMax][0], + sphere_vertices[indMax][1], + sphere_vertices[indMax][2]); + } +#endif + return n; +} diff --git a/cuslines/cuda_c/utils.cu b/cuslines/cuda_c/utils.cu index 93b1190..8c5afe1 100644 --- a/cuslines/cuda_c/utils.cu +++ b/cuslines/cuda_c/utils.cu @@ -22,6 +22,62 @@ __device__ VAL_T max_d(const int n, const VAL_T *__restrict__ src, const VAL_T m return __m; } +template +__device__ VAL_T max_mask_transl_d(const int n, + const LEN_T *__restrict__ srcMsk, + const VAL_T *__restrict__ srcVal, + const VAL_T offset, + const VAL_T minVal) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + VAL_T __m = minVal; + + for(int i = tidx; i < n; i += BDIM_X) { + const LEN_T sel = srcMsk[i]; + if (sel > 0) { + __m = MAX(__m, srcVal[i]+offset); + } + } + + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); + __m = MAX(__m, __tmp); + } + + return __m; +} + +template +__device__ VAL_T min_d(const int n, const VAL_T *__restrict__ src, const VAL_T maxVal) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + VAL_T __m = maxVal; + + for(int i = tidx; i < n; i += BDIM_X) { + __m = MIN(__m, src[i]); + } + + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); + __m = MIN(__m, __tmp); + } + + return __m; +} + template __device__ void prefix_sum_sh_d(REAL_T *num_sh, int __len) { const int tidx = threadIdx.x; @@ -80,85 +136,3 @@ __device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) { printArrayAlways(name, ncol, n, arr); } } - -template -__device__ REAL_T interpolation_helper_d(const REAL_T*__restrict__ dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) { - REAL_T __tmp = 0; - #pragma unroll - for (int i = 0; i < 2; i++) { - #pragma unroll - for (int j = 0; j < 2; j++) { - #pragma unroll - for (int k = 0; k < 2; k++) { - __tmp += wgh[0][i] * wgh[1][j] * wgh[2][k] * - dataf[coo[0][i] * dimy * dimz * dimt + - coo[1][j] * dimz * dimt + - coo[2][k] * dimt + - t]; - } - } - } - return __tmp; -} - -template -__device__ int trilinear_interp_d(const int dimx, - const int dimy, - const int dimz, - const int dimt, - int dimt_idx, // If -1, get all - const REAL_T *__restrict__ dataf, - const REAL3_T point, - REAL_T *__restrict__ __vox_data) { - const REAL_T HALF = static_cast(0.5); - - // all thr compute the same here - if 
(point.x < -HALF || point.x+HALF >= dimx || - point.y < -HALF || point.y+HALF >= dimy || - point.z < -HALF || point.z+HALF >= dimz) { - return -1; - } - - long long coo[3][2]; - REAL_T wgh[3][2]; // could use just one... - - const REAL_T ONE = static_cast(1.0); - - const REAL3_T fl = MAKE_REAL3(FLOOR(point.x), - FLOOR(point.y), - FLOOR(point.z)); - - wgh[0][1] = point.x - fl.x; - wgh[0][0] = ONE-wgh[0][1]; - coo[0][0] = MAX(0, fl.x); - coo[0][1] = MIN(dimx-1, coo[0][0]+1); - - wgh[1][1] = point.y - fl.y; - wgh[1][0] = ONE-wgh[1][1]; - coo[1][0] = MAX(0, fl.y); - coo[1][1] = MIN(dimy-1, coo[1][0]+1); - - wgh[2][1] = point.z - fl.z; - wgh[2][0] = ONE-wgh[2][1]; - coo[2][0] = MAX(0, fl.z); - coo[2][1] = MIN(dimz-1, coo[2][0]+1); - - if (dimt_idx == -1) { - for (int t = threadIdx.x; t < dimt; t += BDIM_X) { - __vox_data[t] = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, t); - } - } else { - *__vox_data = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, dimt_idx); - } - - // if (threadIdx.x == 0) { - // printf("point: %f, %f, %f\n", point.x, point.y, point.z); - // printf("dimt_idx: %d\n", dimt_idx); - // // for(int i = 0; i < dimt; i++) { - // // printf("__vox_data[%d]: %f\n", i, __vox_data[i]); - // // } - // } - return 0; -} diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index d1b9e28..3383d1a 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -1,6 +1,5 @@ import numpy as np from abc import ABC, abstractmethod -import ctypes import logging from importlib.resources import files from time import time @@ -10,7 +9,7 @@ from cuda.core import Device, LaunchConfig, Program, launch, ProgramOptions from cuda.pathfinder import find_nvidia_header_directory from cuda.cccl import get_include_paths -from cuda.bindings import runtime +from cuda.bindings import runtime, driver from cuda.bindings.runtime import cudaMemcpyKind from cuslines.cuda_python.cutils import ( @@ -22,7 +21,6 @@ ModelType, THR_X_SL, BLOCK_Y, - REAL_DTYPE_AS_CTYPE, ) logger = logging.getLogger("GPUStreamlines") @@ -47,7 +45,7 @@ def compile_program(self, debug: bool = False): start_time = time() logger.info("Compiling GPUStreamlines") - cuslines_cuda = files("cuslines") + cuslines_cuda = files("cuslines").joinpath("cuda_c") if debug: program_opts = { @@ -78,7 +76,7 @@ def compile_program(self, debug: bool = False): # I think this is reasonable dev = Device() dev.set_current() - cuda_path = cuslines_cuda.joinpath("cuda_c/generate_streamlines_cuda.cu") + cuda_path = cuslines_cuda.joinpath("generate_streamlines_cuda.cu") with open(cuda_path, "r") as f: prog = Program(f.read(), code_type="c++", options=program_options) self.module = prog.compile( @@ -90,18 +88,6 @@ def compile_program(self, debug: bool = False): logger.info("GPUStreamlines compiled successfully in %.2f seconds", time() - start_time) -class _BootCtx(ctypes.Structure): - _fields_ = [ - ("min_signal", REAL_DTYPE_AS_CTYPE), - ("delta_nr", ctypes.c_int32), - ("H", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - ("R", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - ("delta_b", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - ("delta_q", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - ("sampling_matrix", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - ("b0s_mask", ctypes.POINTER(ctypes.c_int32))] - - class BootDirectionGetter(GPUDirectionGetter): def __init__( self, @@ -120,6 +106,8 @@ def __init__( else: raise ValueError(f"Invalid model_type {model_type}, must be one of 'OPDT', 
'CSA'") + checkCudaErrors(driver.cuInit(0)) + self.H = np.ascontiguousarray(H, dtype=REAL_DTYPE) self.R = np.ascontiguousarray(R, dtype=REAL_DTYPE) self.delta_b = np.ascontiguousarray(delta_b, dtype=REAL_DTYPE) @@ -128,7 +116,6 @@ def __init__( self.min_signal = REAL_DTYPE(min_signal) self.sampling_matrix = np.ascontiguousarray(sampling_matrix, dtype=REAL_DTYPE) self.b0s_mask = np.ascontiguousarray(b0s_mask, dtype=np.int32) - self.ctx_h = [] self.H_d = [] self.R_d = [] @@ -136,10 +123,9 @@ def __init__( self.delta_q_d = [] self.b0s_mask_d = [] self.sampling_matrix_d = [] - self.ctx_d = [] self.getnum_kernel_name = f"getNumStreamlinesBoot_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" - self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},{model_type.upper()},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMergeBoot_k<{THR_X_SL},{BLOCK_Y},{model_type.upper()},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" self.compile_program() @classmethod @@ -232,19 +218,6 @@ def allocate_on_gpu(self, n): self.sampling_matrix_d.append( checkCudaErrors(runtime.cudaMalloc( REAL_SIZE*self.sampling_matrix.size))) - self.ctx_d.append( - checkCudaErrors(runtime.cudaMalloc( - ctypes.sizeof(_BootCtx)))) - self.ctx_h.append(_BootCtx( - min_signal=self.min_signal, - delta_nr=self.delta_nr, - H=ctypes.cast(self.H_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - R=ctypes.cast(self.R_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - delta_b=ctypes.cast(self.delta_b_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - delta_q=ctypes.cast(self.delta_q_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - sampling_matrix=ctypes.cast(self.sampling_matrix_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - b0s_mask=ctypes.cast(self.b0s_mask_d[n], ctypes.POINTER(ctypes.c_int32)) - )) checkCudaErrors(runtime.cudaMemcpy( self.H_d[n], @@ -276,11 +249,6 @@ def allocate_on_gpu(self, n): self.sampling_matrix.ctypes.data, REAL_SIZE*self.sampling_matrix.size, cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.ctx_d[n], - ctypes.addressof(self.ctx_h[n]), - ctypes.sizeof(_BootCtx), - cudaMemcpyKind.cudaMemcpyHostToDevice)) def deallocate_on_gpu(self, n): if self.H_d[n]: @@ -295,8 +263,6 @@ def deallocate_on_gpu(self, n): checkCudaErrors(runtime.cudaFree(self.b0s_mask_d[n])) if self.sampling_matrix_d[n]: checkCudaErrors(runtime.cudaFree(self.sampling_matrix_d[n])) - if self.ctx_d[n]: - checkCudaErrors(runtime.cudaFree(self.ctx_d[n])) def _shared_mem_bytes(self, sp): return REAL_SIZE*BLOCK_Y*2*( @@ -312,8 +278,9 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.streams[n], config, ker, self.model_type, sp.gpu_tracker.max_angle, - sp.gpu_tracker.min_separation_angle, + self.min_signal, sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.min_separation_angle, sp.gpu_tracker.rng_seed, nseeds_gpu, sp.seeds_d[n], @@ -358,11 +325,18 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.dimt, sp.gpu_tracker.dataf_d[n], sp.gpu_tracker.metric_map_d[n], - self.ctx_d[n], sp.gpu_tracker.samplm_nr, sp.gpu_tracker.sphere_vertices_d[n], sp.gpu_tracker.sphere_edges_d[n], sp.gpu_tracker.nedges, + self.min_signal, + self.delta_nr, + self.H_d[n], + self.R_d[n], + self.delta_b_d[n], + self.delta_q_d[n], + self.sampling_matrix_d[n], + self.b0s_mask_d[n], sp.slinesOffs_d[n], sp.shDirTemp0_d[n], sp.slineSeed_d[n], @@ -373,8 +347,9 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): class 
ProbDirectionGetter(GPUDirectionGetter): def __init__(self): + checkCudaErrors(driver.cuInit(0)) self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" - self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},PROB,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PROB,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" self.compile_program() def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): @@ -427,7 +402,6 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.dimt, sp.gpu_tracker.dataf_d[n], sp.gpu_tracker.metric_map_d[n], - int(0), sp.gpu_tracker.samplm_nr, sp.gpu_tracker.sphere_vertices_d[n], sp.gpu_tracker.sphere_edges_d[n], @@ -440,11 +414,11 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): ) - class PttDirectionGetter(ProbDirectionGetter): def __init__(self): + checkCudaErrors(driver.cuInit(0)) self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" - self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},PTT,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PTT,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" self.compile_program() def _shared_mem_bytes(self, sp): diff --git a/cuslines/cuda_python/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py index 1d34adc..d9f94f9 100644 --- a/cuslines/cuda_python/cu_tractography.py +++ b/cuslines/cuda_python/cu_tractography.py @@ -1,4 +1,4 @@ -from cuda.bindings import driver, runtime +from cuda.bindings import runtime from cuda.bindings.runtime import cudaMemcpyKind # TODO: consider cuda core over cuda bindings @@ -116,7 +116,6 @@ def __init__( self.rng_offset = int(rng_offset) self.chunk_size = int(chunk_size) - checkCudaErrors(driver.cuInit(0)) avail = checkCudaErrors(runtime.cudaGetDeviceCount()) if self.ngpus > avail: raise RuntimeError(f"Requested {self.ngpus} GPUs but only {avail} available") diff --git a/cuslines/cuda_python/cutils.py b/cuslines/cuda_python/cutils.py index 2fd688e..4d0e313 100644 --- a/cuslines/cuda_python/cutils.py +++ b/cuslines/cuda_python/cutils.py @@ -1,7 +1,6 @@ from cuda.bindings import driver, nvrtc import numpy as np -import ctypes from enum import IntEnum @@ -22,7 +21,6 @@ class ModelType(IntEnum): ('z', np.float32)], align=True) REAL_DTYPE_AS_STR = "float" REAL3_DTYPE_AS_STR = "float3" - REAL_DTYPE_AS_CTYPE = ctypes.c_float elif REAL_SIZE == 8: REAL_DTYPE = np.float64 REAL3_DTYPE = np.dtype([('x', np.float64), @@ -30,7 +28,6 @@ class ModelType(IntEnum): ('z', np.float64)], align=True) REAL_DTYPE_AS_STR = "double" REAL3_DTYPE_AS_STR = "double3" - REAL_DTYPE_AS_CTYPE = ctypes.c_double else: raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") BLOCK_Y = THR_X_BL//THR_X_SL diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index 06c61c6..57053fe 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -72,7 +72,7 @@ #Get Gradient values def get_gtab(fbval, fbvec): bvals, bvecs = read_bvals_bvecs(fbval, fbvec) - gtab = gradient_table(bvals, bvecs) + gtab = gradient_table(bvals=bvals, bvecs=bvecs) return gtab def get_img(ep2_seq): @@ -115,7 +115,8 @@ def get_img(ep2_seq): if not all(arg == 'hardi' for arg in [args.nifti_file, args.bvals, args.bvecs, args.mask_nifti, args.roi_nifti]): raise 
ValueError("If any of the arguments is 'hardi', all must be 'hardi'") # Get Stanford HARDI data - hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi') + hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames( + name='stanford_hardi') csf, gm, wm = read_stanford_pve_maps() wm_data = wm.get_fdata() @@ -139,7 +140,7 @@ def get_img(ep2_seq): tenmodel = dti.TensorModel(gtab, fit_method='WLS') print('Fitting Tensor') -tenfit = tenmodel.fit(data, mask) +tenfit = tenmodel.fit(data, mask=mask) print('Computing anisotropy measures (FA,MD,RGB)') FA = tenfit.fa @@ -220,6 +221,7 @@ def get_img(ep2_seq): ts = time.time() streamline_generator = LocalTracking(dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size) sft = StatefulTractogram(streamline_generator, img, Space.VOX) + n_sls = len(sft.streamlines) te = time.time() else: with GPUTracker( @@ -240,10 +242,12 @@ def get_img(ep2_seq): ts = time.time() if args.output_prefix and write_method == "trx": trx_file = gpu_tracker.generate_trx(seed_mask, img) + n_sls = len(trx_file.streamlines) else: sft = gpu_tracker.generate_sft(seed_mask, img) + n_sls = len(sft.streamlines) te = time.time() -print("Generated {} streamlines from {} seeds, time: {} s".format(len(sft.streamlines), +print("Generated {} streamlines from {} seeds, time: {} s".format(n_sls, seed_mask.shape[0], te-ts)) From d008d039e061a8c67b24750c301fcd9d6cfcc068 Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 7 Jan 2026 12:54:34 -0800 Subject: [PATCH 27/31] ruff --- cuslines/cuda_python/__init__.py | 4 +- cuslines/cuda_python/cu_direction_getters.py | 224 +++++++++++-------- cuslines/cuda_python/cu_propagate_seeds.py | 177 ++++++++------- cuslines/cuda_python/cu_tractography.py | 98 ++++---- cuslines/cuda_python/cutils.py | 26 ++- 5 files changed, 315 insertions(+), 214 deletions(-) diff --git a/cuslines/cuda_python/__init__.py b/cuslines/cuda_python/__init__.py index d0b42d4..fd05c1e 100644 --- a/cuslines/cuda_python/__init__.py +++ b/cuslines/cuda_python/__init__.py @@ -2,12 +2,12 @@ from .cu_direction_getters import ( ProbDirectionGetter, PttDirectionGetter, - BootDirectionGetter + BootDirectionGetter, ) __all__ = [ "GPUTracker", "ProbDirectionGetter", "PttDirectionGetter", - "BootDirectionGetter" + "BootDirectionGetter", ] diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index 3383d1a..b659445 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -55,9 +55,7 @@ def compile_program(self, debug: bool = False): "lineinfo": True, } else: - program_opts = { - "ptxas_options": ["-O3"] - } + program_opts = {"ptxas_options": ["-O3"]} program_options = ProgramOptions( name="cuslines", @@ -68,8 +66,10 @@ def compile_program(self, debug: bool = False): str(cuslines_cuda), find_nvidia_header_directory("cudart"), find_nvidia_header_directory("curand"), - get_include_paths().libcudacxx], - **program_opts) + get_include_paths().libcudacxx, + ], + **program_opts, + ) # Here we assume all devices are the same, # so we compile once for any current device. 
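# [editor's note] The single compilation assumes a homogeneous multi-GPU
# node: the module is built once for the architecture of whichever device is
# current and then launched on every GPU. A machine mixing compute
# capabilities would need one compile per distinct architecture.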
@@ -84,27 +84,33 @@ def compile_program(self, debug: bool = False): name_expressions=( self.getnum_kernel_name, self.genstreamlines_kernel_name, - )) - logger.info("GPUStreamlines compiled successfully in %.2f seconds", time() - start_time) + ), + ) + logger.info( + "GPUStreamlines compiled successfully in %.2f seconds", time() - start_time + ) class BootDirectionGetter(GPUDirectionGetter): def __init__( - self, - model_type: str, - min_signal: float, - H: np.ndarray, - R: np.ndarray, - delta_b: np.ndarray, - delta_q: np.ndarray, - sampling_matrix: np.ndarray, - b0s_mask: np.ndarray): + self, + model_type: str, + min_signal: float, + H: np.ndarray, + R: np.ndarray, + delta_b: np.ndarray, + delta_q: np.ndarray, + sampling_matrix: np.ndarray, + b0s_mask: np.ndarray, + ): if model_type.upper() == "OPDT": self.model_type = int(ModelType.OPDT) elif model_type.upper() == "CSA": self.model_type = int(ModelType.CSA) else: - raise ValueError(f"Invalid model_type {model_type}, must be one of 'OPDT', 'CSA'") + raise ValueError( + f"Invalid model_type {model_type}, must be one of 'OPDT', 'CSA'" + ) checkCudaErrors(driver.cuInit(0)) @@ -129,11 +135,15 @@ def __init__( self.compile_program() @classmethod - def from_dipy_opdt(cls, gtab, sphere, - sh_order_max=6, - full_basis=False, - sh_lambda=0.006, - min_signal=1): + def from_dipy_opdt( + cls, + gtab, + sphere, + sh_order_max=6, + full_basis=False, + sh_lambda=0.006, + min_signal=1, + ): sampling_matrix, _, _ = shm.real_sh_descoteaux( sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False ) @@ -160,15 +170,19 @@ def from_dipy_opdt(cls, gtab, sphere, delta_b=delta_b, delta_q=delta_q, sampling_matrix=sampling_matrix, - b0s_mask=gtab.b0s_mask + b0s_mask=gtab.b0s_mask, ) @classmethod - def from_dipy_csa(cls, gtab, sphere, - sh_order_max=6, - full_basis=False, - sh_lambda=0.006, - min_signal=1): + def from_dipy_csa( + cls, + gtab, + sphere, + sh_order_max=6, + full_basis=False, + sh_lambda=0.006, + min_signal=1, + ): sampling_matrix, _, _ = shm.real_sh_descoteaux( sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False ) @@ -196,59 +210,73 @@ def from_dipy_csa(cls, gtab, sphere, delta_b=delta_b, delta_q=delta_q, sampling_matrix=sampling_matrix, - b0s_mask=gtab.b0s_mask + b0s_mask=gtab.b0s_mask, ) def allocate_on_gpu(self, n): - self.H_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.H.size))) - self.R_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.R.size))) + self.H_d.append(checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.H.size))) + self.R_d.append(checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.R.size))) self.delta_b_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.delta_b.size))) + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.delta_b.size)) + ) self.delta_q_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.delta_q.size))) + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.delta_q.size)) + ) self.b0s_mask_d.append( - checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes*self.b0s_mask.size))) + checkCudaErrors(runtime.cudaMalloc(np.int32().nbytes * self.b0s_mask.size)) + ) self.sampling_matrix_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.sampling_matrix.size))) + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.sampling_matrix.size)) + ) - checkCudaErrors(runtime.cudaMemcpy( - self.H_d[n], - self.H.ctypes.data, - REAL_SIZE*self.H.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - 
checkCudaErrors(runtime.cudaMemcpy( - self.R_d[n], - self.R.ctypes.data, - REAL_SIZE*self.R.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.delta_b_d[n], - self.delta_b.ctypes.data, - REAL_SIZE*self.delta_b.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.delta_q_d[n], - self.delta_q.ctypes.data, - REAL_SIZE*self.delta_q.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.b0s_mask_d[n], - self.b0s_mask.ctypes.data, - np.int32().nbytes*self.b0s_mask.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.sampling_matrix_d[n], - self.sampling_matrix.ctypes.data, - REAL_SIZE*self.sampling_matrix.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors( + runtime.cudaMemcpy( + self.H_d[n], + self.H.ctypes.data, + REAL_SIZE * self.H.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.R_d[n], + self.R.ctypes.data, + REAL_SIZE * self.R.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.delta_b_d[n], + self.delta_b.ctypes.data, + REAL_SIZE * self.delta_b.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.delta_q_d[n], + self.delta_q.ctypes.data, + REAL_SIZE * self.delta_q.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.b0s_mask_d[n], + self.b0s_mask.ctypes.data, + np.int32().nbytes * self.b0s_mask.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.sampling_matrix_d[n], + self.sampling_matrix.ctypes.data, + REAL_SIZE * self.sampling_matrix.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) def deallocate_on_gpu(self, n): if self.H_d[n]: @@ -265,9 +293,16 @@ def deallocate_on_gpu(self, n): checkCudaErrors(runtime.cudaFree(self.sampling_matrix_d[n])) def _shared_mem_bytes(self, sp): - return REAL_SIZE*BLOCK_Y*2*( - sp.gpu_tracker.n32dimt + max(sp.gpu_tracker.n32dimt, sp.gpu_tracker.samplm_nr)) + \ - np.int32().nbytes*BLOCK_Y*sp.gpu_tracker.samplm_nr + return ( + REAL_SIZE + * BLOCK_Y + * 2 + * ( + sp.gpu_tracker.n32dimt + + max(sp.gpu_tracker.n32dimt, sp.gpu_tracker.samplm_nr) + ) + + np.int32().nbytes * BLOCK_Y * sp.gpu_tracker.samplm_nr + ) def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): ker = self.module.get_kernel(self.getnum_kernel_name) @@ -275,7 +310,9 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) launch( - sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.streams[n], + config, + ker, self.model_type, sp.gpu_tracker.max_angle, self.min_signal, @@ -301,7 +338,8 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.sphere_edges_d[n], sp.gpu_tracker.nedges, sp.shDirTemp0_d[n], - sp.slinesOffs_d[n]) + sp.slinesOffs_d[n], + ) def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): ker = self.module.get_kernel(self.genstreamlines_kernel_name) @@ -309,14 +347,16 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) launch( - sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.streams[n], + config, + ker, sp.gpu_tracker.max_angle, sp.gpu_tracker.tc_threshold, sp.gpu_tracker.step_size, sp.gpu_tracker.relative_peak_thresh, 
sp.gpu_tracker.min_separation_angle, sp.gpu_tracker.rng_seed, - sp.gpu_tracker.rng_offset + n*nseeds_gpu, + sp.gpu_tracker.rng_offset + n * nseeds_gpu, nseeds_gpu, sp.seeds_d[n], sp.gpu_tracker.dimx, @@ -341,7 +381,7 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.shDirTemp0_d[n], sp.slineSeed_d[n], sp.slineLen_d[n], - sp.sline_d[n] + sp.sline_d[n], ) @@ -354,12 +394,16 @@ def __init__(self): def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): ker = self.module.get_kernel(self.getnum_kernel_name) - shared_memory = REAL_SIZE*BLOCK_Y*sp.gpu_tracker.n32dimt + \ - np.int32().nbytes*BLOCK_Y*sp.gpu_tracker.n32dimt + shared_memory = ( + REAL_SIZE * BLOCK_Y * sp.gpu_tracker.n32dimt + + np.int32().nbytes * BLOCK_Y * sp.gpu_tracker.n32dimt + ) config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) launch( - sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.streams[n], + config, + ker, sp.gpu_tracker.max_angle, sp.gpu_tracker.relative_peak_thresh, sp.gpu_tracker.min_separation_angle, @@ -375,7 +419,8 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.sphere_edges_d[n], sp.gpu_tracker.nedges, sp.shDirTemp0_d[n], - sp.slinesOffs_d[n]) + sp.slinesOffs_d[n], + ) def _shared_mem_bytes(self, sp): return REAL_SIZE * BLOCK_Y * sp.gpu_tracker.n32dimt @@ -386,14 +431,16 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) launch( - sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.streams[n], + config, + ker, sp.gpu_tracker.max_angle, sp.gpu_tracker.tc_threshold, sp.gpu_tracker.step_size, sp.gpu_tracker.relative_peak_thresh, sp.gpu_tracker.min_separation_angle, sp.gpu_tracker.rng_seed, - sp.gpu_tracker.rng_offset + n*nseeds_gpu, + sp.gpu_tracker.rng_offset + n * nseeds_gpu, nseeds_gpu, sp.seeds_d[n], sp.gpu_tracker.dimx, @@ -410,7 +457,7 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.shDirTemp0_d[n], sp.slineSeed_d[n], sp.slineLen_d[n], - sp.sline_d[n] + sp.sline_d[n], ) @@ -423,4 +470,3 @@ def __init__(self): def _shared_mem_bytes(self, sp): return 0 - diff --git a/cuslines/cuda_python/cu_propagate_seeds.py b/cuslines/cuda_python/cu_propagate_seeds.py index 72037c6..b8991e5 100644 --- a/cuslines/cuda_python/cu_propagate_seeds.py +++ b/cuslines/cuda_python/cu_propagate_seeds.py @@ -16,16 +16,15 @@ THR_X_BL, DEV_PTR, div_up, - checkCudaErrors) + checkCudaErrors, +) logger = logging.getLogger("GPUStreamlines") class SeedBatchPropagator: - def __init__( - self, - gpu_tracker): + def __init__(self, gpu_tracker): self.gpu_tracker = gpu_tracker self.ngpus = gpu_tracker.ngpus @@ -44,53 +43,71 @@ def __init__( def _switch_device(self, n): checkCudaErrors(runtime.cudaSetDevice(n)) - nseeds_gpu = min( - self.nseeds_per_gpu, max(0, self.nseeds - n * self.nseeds_per_gpu)) - block = (THR_X_SL, THR_X_BL//THR_X_SL, 1) - grid = (div_up(nseeds_gpu, THR_X_BL//THR_X_SL), 1, 1) + nseeds_gpu = min( + self.nseeds_per_gpu, max(0, self.nseeds - n * self.nseeds_per_gpu) + ) + block = (THR_X_SL, THR_X_BL // THR_X_SL, 1) + grid = (div_up(nseeds_gpu, THR_X_BL // THR_X_SL), 1, 1) return nseeds_gpu, block, grid def _get_sl_buffer_size(self, n): - return REAL_SIZE*2*3*MAX_SLINE_LEN*self.nSlines[n].astype(np.int64) + return REAL_SIZE * 2 * 3 * MAX_SLINE_LEN * self.nSlines[n].astype(np.int64) def _allocate_seed_memory(self, seeds): # Move seeds to GPU for ii in range(self.ngpus): nseeds_gpu, _, _ = self._switch_device(ii) - self.seeds_d[ii] = 
checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*3*nseeds_gpu)) - seeds_host = np.ascontiguousarray(seeds[ - ii*self.nseeds_per_gpu:ii*self.nseeds_per_gpu+nseeds_gpu], - dtype=REAL_DTYPE) - checkCudaErrors(runtime.cudaMemcpy( - self.seeds_d[ii], - seeds_host.ctypes.data, - REAL_SIZE*3*nseeds_gpu, - cudaMemcpyKind.cudaMemcpyHostToDevice)) + self.seeds_d[ii] = checkCudaErrors( + runtime.cudaMalloc(REAL_SIZE * 3 * nseeds_gpu) + ) + seeds_host = np.ascontiguousarray( + seeds[ii * self.nseeds_per_gpu : ii * self.nseeds_per_gpu + nseeds_gpu], + dtype=REAL_DTYPE, + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.seeds_d[ii], + seeds_host.ctypes.data, + REAL_SIZE * 3 * nseeds_gpu, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) for ii in range(self.ngpus): nseeds_gpu, block, grid = self._switch_device(ii) # Streamline offsets - self.slinesOffs_d[ii] = checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes * (nseeds_gpu + 1))) + self.slinesOffs_d[ii] = checkCudaErrors( + runtime.cudaMalloc(np.int32().nbytes * (nseeds_gpu + 1)) + ) # Initial directions from each seed - self.shDirTemp0_d[ii] = checkCudaErrors(runtime.cudaMalloc( - REAL3_DTYPE.itemsize * self.gpu_tracker.samplm_nr * grid[0] * block[1])) - - def _cumsum_offsets(self): # TODO: performance: do this on device? not crucial for performance now + self.shDirTemp0_d[ii] = checkCudaErrors( + runtime.cudaMalloc( + REAL3_DTYPE.itemsize + * self.gpu_tracker.samplm_nr + * grid[0] + * block[1] + ) + ) + + def _cumsum_offsets( + self, + ): # TODO: performance: do this on device? not crucial for performance now for ii in range(self.ngpus): nseeds_gpu, _, _ = self._switch_device(ii) - if (nseeds_gpu == 0): + if nseeds_gpu == 0: self.nSlines[ii] = 0 continue slinesOffs_h = np.empty(nseeds_gpu + 1, dtype=np.int32) - checkCudaErrors(runtime.cudaMemcpy( - slinesOffs_h.ctypes.data, - self.slinesOffs_d[ii], - slinesOffs_h.nbytes, - cudaMemcpyKind.cudaMemcpyDeviceToHost)) + checkCudaErrors( + runtime.cudaMemcpy( + slinesOffs_h.ctypes.data, + self.slinesOffs_d[ii], + slinesOffs_h.nbytes, + cudaMemcpyKind.cudaMemcpyDeviceToHost, + ) + ) __pval = slinesOffs_h[0] slinesOffs_h[0] = 0 @@ -100,24 +117,29 @@ def _cumsum_offsets(self): # TODO: performance: do this on device? 
not crucial __pval = __cval self.nSlines[ii] = int(slinesOffs_h[nseeds_gpu]) - checkCudaErrors(runtime.cudaMemcpy( - self.slinesOffs_d[ii], - slinesOffs_h.ctypes.data, - slinesOffs_h.nbytes, - cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors( + runtime.cudaMemcpy( + self.slinesOffs_d[ii], + slinesOffs_h.ctypes.data, + slinesOffs_h.nbytes, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) def _allocate_tracking_memory(self): for ii in range(self.ngpus): self._switch_device(ii) - self.slineSeed_d[ii] = checkCudaErrors(runtime.cudaMalloc( - self.nSlines[ii] * np.int32().nbytes)) - checkCudaErrors(runtime.cudaMemset( - self.slineSeed_d[ii], - -1, - self.nSlines[ii] * np.int32().nbytes)) + self.slineSeed_d[ii] = checkCudaErrors( + runtime.cudaMalloc(self.nSlines[ii] * np.int32().nbytes) + ) + checkCudaErrors( + runtime.cudaMemset( + self.slineSeed_d[ii], -1, self.nSlines[ii] * np.int32().nbytes + ) + ) - if self.nSlines[ii] > EXCESS_ALLOC_FACT*self.nSlines_old[ii]: + if self.nSlines[ii] > EXCESS_ALLOC_FACT * self.nSlines_old[ii]: self.slines[ii] = 0 self.sline_lens[ii] = 0 gc.collect() @@ -127,42 +149,48 @@ def _allocate_tracking_memory(self): if not self.slines[ii]: self.slines[ii] = np.empty( - (EXCESS_ALLOC_FACT*self.nSlines[ii], MAX_SLINE_LEN*2, 3), - dtype=REAL_DTYPE) + (EXCESS_ALLOC_FACT * self.nSlines[ii], MAX_SLINE_LEN * 2, 3), + dtype=REAL_DTYPE, + ) if not self.sline_lens[ii]: self.sline_lens[ii] = np.empty( - EXCESS_ALLOC_FACT*self.nSlines[ii], - dtype=np.int32) + EXCESS_ALLOC_FACT * self.nSlines[ii], dtype=np.int32 + ) for ii in range(self.ngpus): self._switch_device(ii) buffer_size = self._get_sl_buffer_size(ii) - self.slineLen_d[ii] = checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes * self.nSlines[ii])) - self.sline_d[ii] = checkCudaErrors(runtime.cudaMalloc( - buffer_size)) + self.slineLen_d[ii] = checkCudaErrors( + runtime.cudaMalloc(np.int32().nbytes * self.nSlines[ii]) + ) + self.sline_d[ii] = checkCudaErrors(runtime.cudaMalloc(buffer_size)) def _cleanup(self): for ii in range(self.ngpus): self._switch_device(ii) - checkCudaErrors(runtime.cudaMemcpyAsync( - self.slines[ii], - self.sline_d[ii], - self._get_sl_buffer_size(ii), - cudaMemcpyKind.cudaMemcpyDeviceToHost, - self.gpu_tracker.streams[ii])) - checkCudaErrors(runtime.cudaMemcpyAsync( - self.sline_lens[ii], - self.slineLen_d[ii], - np.int32().nbytes*self.nSlines[ii], - cudaMemcpyKind.cudaMemcpyDeviceToHost, - self.gpu_tracker.streams[ii])) + checkCudaErrors( + runtime.cudaMemcpyAsync( + self.slines[ii], + self.sline_d[ii], + self._get_sl_buffer_size(ii), + cudaMemcpyKind.cudaMemcpyDeviceToHost, + self.gpu_tracker.streams[ii], + ) + ) + checkCudaErrors( + runtime.cudaMemcpyAsync( + self.sline_lens[ii], + self.slineLen_d[ii], + np.int32().nbytes * self.nSlines[ii], + cudaMemcpyKind.cudaMemcpyDeviceToHost, + self.gpu_tracker.streams[ii], + ) + ) for ii in range(self.ngpus): self._switch_device(ii) - checkCudaErrors(runtime.cudaStreamSynchronize( - self.gpu_tracker.streams[ii])) + checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii])) checkCudaErrors(runtime.cudaFree(self.seeds_d[ii])) checkCudaErrors(runtime.cudaFree(self.slineSeed_d[ii])) checkCudaErrors(runtime.cudaFree(self.slinesOffs_d[ii])) @@ -179,30 +207,30 @@ def _cleanup(self): # May be better to do in cuda code directly def propagate(self, seeds): self.nseeds = len(seeds) - self.nseeds_per_gpu = (self.nseeds + self.gpu_tracker.ngpus - 1) // self.gpu_tracker.ngpus + self.nseeds_per_gpu = ( + self.nseeds + self.gpu_tracker.ngpus - 
1 + ) // self.gpu_tracker.ngpus self._allocate_seed_memory(seeds) for ii in range(self.ngpus): nseeds_gpu, block, grid = self._switch_device(ii) - if (nseeds_gpu == 0): + if nseeds_gpu == 0: continue self.gpu_tracker.dg.getNumStreamlines(ii, nseeds_gpu, block, grid, self) for ii in range(self.ngpus): - checkCudaErrors(runtime.cudaStreamSynchronize( - self.gpu_tracker.streams[ii])) + checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii])) self._cumsum_offsets() self._allocate_tracking_memory() for ii in range(self.ngpus): nseeds_gpu, block, grid = self._switch_device(ii) - if (nseeds_gpu == 0): + if nseeds_gpu == 0: continue self.gpu_tracker.dg.generateStreamlines(ii, nseeds_gpu, block, grid, self) for ii in range(self.ngpus): - checkCudaErrors(runtime.cudaStreamSynchronize( - self.gpu_tracker.streams[ii])) + checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii])) self._cleanup() @@ -223,9 +251,8 @@ def _yield_slines(): for jj in range(self.nSlines[ii]): npts = this_len[jj] - yield np.asarray( - this_sls[jj], - dtype=REAL_DTYPE)[:npts] + yield np.asarray(this_sls[jj], dtype=REAL_DTYPE)[:npts] + return _yield_slines() def as_array_sequence(self): diff --git a/cuslines/cuda_python/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py index d9f94f9..92f34c0 100644 --- a/cuslines/cuda_python/cu_tractography.py +++ b/cuslines/cuda_python/cu_tractography.py @@ -14,7 +14,7 @@ ) from cuslines.cuda_python.cu_direction_getters import ( GPUDirectionGetter, - BootDirectionGetter + BootDirectionGetter, ) from cuslines.cuda_python.cu_propagate_seeds import SeedBatchPropagator @@ -32,6 +32,7 @@ # SCIL streamline reduction onboard GPU # Remove small/long streamlines on gpu + class GPUTracker: def __init__( self, @@ -118,7 +119,9 @@ def __init__( avail = checkCudaErrors(runtime.cudaGetDeviceCount()) if self.ngpus > avail: - raise RuntimeError(f"Requested {self.ngpus} GPUs but only {avail} available") + raise RuntimeError( + f"Requested {self.ngpus} GPUs but only {avail} available" + ) logger.info("Creating GPUTracker with %d GPUs...", self.ngpus) @@ -130,8 +133,7 @@ def __init__( self.streams = [] self.managed_data = [] - self.seed_propagator = SeedBatchPropagator( - gpu_tracker=self) + self.seed_propagator = SeedBatchPropagator(gpu_tracker=self) self._allocated = False def __enter__(self): @@ -145,46 +147,64 @@ def _allocate(self): for ii in range(self.ngpus): checkCudaErrors(runtime.cudaSetDevice(ii)) self.streams.append( - checkCudaErrors(runtime.cudaStreamCreateWithFlags( - runtime.cudaStreamNonBlocking))) + checkCudaErrors( + runtime.cudaStreamCreateWithFlags(runtime.cudaStreamNonBlocking) + ) + ) for ii in range(self.ngpus): checkCudaErrors(runtime.cudaSetDevice(ii)) # TODO: performance: dataf could be managed or texture memory instead? 
self.dataf_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.dataf.size))) + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.dataf.size)) + ) self.metric_map_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.metric_map.size))) + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.metric_map.size)) + ) self.sphere_vertices_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.sphere_vertices.size))) + checkCudaErrors( + runtime.cudaMalloc(REAL_SIZE * self.sphere_vertices.size) + ) + ) self.sphere_edges_d.append( - checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes*self.sphere_edges.size))) - - checkCudaErrors(runtime.cudaMemcpy( - self.dataf_d[ii], - self.dataf.ctypes.data, - REAL_SIZE*self.dataf.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.metric_map_d[ii], - self.metric_map.ctypes.data, - REAL_SIZE*self.metric_map.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.sphere_vertices_d[ii], - self.sphere_vertices.ctypes.data, - REAL_SIZE*self.sphere_vertices.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.sphere_edges_d[ii], - self.sphere_edges.ctypes.data, - np.int32().nbytes*self.sphere_edges.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors( + runtime.cudaMalloc(np.int32().nbytes * self.sphere_edges.size) + ) + ) + + checkCudaErrors( + runtime.cudaMemcpy( + self.dataf_d[ii], + self.dataf.ctypes.data, + REAL_SIZE * self.dataf.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.metric_map_d[ii], + self.metric_map.ctypes.data, + REAL_SIZE * self.metric_map.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.sphere_vertices_d[ii], + self.sphere_vertices.ctypes.data, + REAL_SIZE * self.sphere_vertices.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.sphere_edges_d[ii], + self.sphere_edges.ctypes.data, + np.int32().nbytes * self.sphere_edges.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) self.dg.allocate_on_gpu(ii) self._allocated = True @@ -211,7 +231,7 @@ def _divide_chunks(self, seeds): global_chunk_sz = self.chunk_size * self.ngpus nchunks = (seeds.shape[0] + global_chunk_sz - 1) // global_chunk_sz return global_chunk_sz, nchunks - + def generate_sft(self, seeds, ref_img): global_chunk_sz, nchunks = self._divide_chunks(seeds) buffer_size = 0 @@ -228,8 +248,7 @@ def generate_sft(self, seeds, ref_img): seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz].shape[0] ) array_sequence = ArraySequence( - (item for gen in generators for item in gen), - buffer_size // MEGABYTE + (item for gen in generators for item in gen), buffer_size // MEGABYTE ) return StatefulTractogram(array_sequence, ref_img, Space.VOX) @@ -259,7 +278,8 @@ def generate_trx(self, seeds, ref_img): ) tractogram = Tractogram( self.seed_propagator.as_array_sequence(), - affine_to_rasmm=ref_img.affine) + affine_to_rasmm=ref_img.affine, + ) tractogram.to_world() sls = tractogram.streamlines diff --git a/cuslines/cuda_python/cutils.py b/cuslines/cuda_python/cutils.py index 4d0e313..db4115a 100644 --- a/cuslines/cuda_python/cutils.py +++ b/cuslines/cuda_python/cutils.py @@ -13,26 +13,28 @@ class ModelType(IntEnum): PROB = 2 PTT = 3 + REAL3_SIZE = 3 * REAL_SIZE if REAL_SIZE == 4: REAL_DTYPE = np.float32 - REAL3_DTYPE = np.dtype([('x', np.float32), - ('y', np.float32), - 
('z', np.float32)], align=True) + REAL3_DTYPE = np.dtype( + [("x", np.float32), ("y", np.float32), ("z", np.float32)], align=True + ) REAL_DTYPE_AS_STR = "float" REAL3_DTYPE_AS_STR = "float3" elif REAL_SIZE == 8: REAL_DTYPE = np.float64 - REAL3_DTYPE = np.dtype([('x', np.float64), - ('y', np.float64), - ('z', np.float64)], align=True) + REAL3_DTYPE = np.dtype( + [("x", np.float64), ("y", np.float64), ("z", np.float64)], align=True + ) REAL_DTYPE_AS_STR = "double" REAL3_DTYPE_AS_STR = "double3" else: raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") -BLOCK_Y = THR_X_BL//THR_X_SL +BLOCK_Y = THR_X_BL // THR_X_SL DEV_PTR = object + def _cudaGetErrorEnum(error): if isinstance(error, driver.CUresult): err, name = driver.cuGetErrorName(error) @@ -40,11 +42,16 @@ def _cudaGetErrorEnum(error): elif isinstance(error, nvrtc.nvrtcResult): return nvrtc.nvrtcGetErrorString(error)[1] else: - raise RuntimeError('Unknown error type: {}'.format(error)) + raise RuntimeError("Unknown error type: {}".format(error)) + def checkCudaErrors(result): if result[0].value: - raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) + raise RuntimeError( + "CUDA error code={}({})".format( + result[0].value, _cudaGetErrorEnum(result[0]) + ) + ) if len(result) == 1: return None elif len(result) == 2: @@ -52,5 +59,6 @@ def checkCudaErrors(result): else: return result[1:] + def div_up(a, b): return (a + b - 1) // b From aa8e45fb2b9b883d053f9b46e6de008fbc660b0b Mon Sep 17 00:00:00 2001 From: John Kruper <36000@users.noreply.github.com> Date: Wed, 7 Jan 2026 13:04:30 -0800 Subject: [PATCH 28/31] Fix run_gpu_streamlines.py from copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- run_gpu_streamlines.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index 57053fe..0d6c447 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -196,20 +196,20 @@ def get_img(ep2_seq): model = ConstrainedSphericalDeconvModel(gtab, response, sh_order=args.sh_order) fit = model.fit(data, mask=(FA >= args.fa_threshold)) data = fit.odf(sphere).clip(min=0) - if args.model == "ptt": + if args.dg == "ptt": if args.device == "cpu": dg = cpu_PTTDirectionGetter() else: # Set FOD to 0 outside mask for probing data[FA < args.fa_threshold, :] = 0 dg = PttDirectionGetter() - elif args.model == "prob": + elif args.dg == "prob": if args.device == "cpu": dg = cpu_ProbDirectionGetter() else: dg = ProbDirectionGetter() else: - raise ValueError("Unknown model type: {}".format(args.model)) + raise ValueError("Unknown direction getter type: {}".format(args.dg)) # Setup direction getter args if args.device == "cpu": From cddceb00ba417f622efd33f87e9a26f22141a057 Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 7 Jan 2026 13:05:14 -0800 Subject: [PATCH 29/31] cleanup --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index a392cc6..ae15d6a 100644 --- a/setup.py +++ b/setup.py @@ -8,8 +8,6 @@ def defines_to_python(src, dst): src = Path(src) dst = Path(dst) - defines = {} - INT_DEFINE = re.compile( r"#define\s+(\w+)\s+\(?\s*([0-9]+)\s*\)?" 
) From cfaa93206205ba86817918621545a4f82fa09593 Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 7 Jan 2026 14:23:51 -0800 Subject: [PATCH 30/31] use logging here --- cuslines/cuda_python/cu_tractography.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuslines/cuda_python/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py index 92f34c0..9c24cd7 100644 --- a/cuslines/cuda_python/cu_tractography.py +++ b/cuslines/cuda_python/cu_tractography.py @@ -290,7 +290,7 @@ def generate_trx(self, seeds, ref_img): new_offsets_idx > trx_file.header["NB_STREAMLINES"] or new_sls_data_idx > trx_file.header["NB_VERTICES"] ): - print("TRX resizing...") + logger.info("TRX resizing...") trx_file.resize( nb_streamlines=new_offsets_idx * 2, nb_vertices=new_sls_data_idx * 2, From e6f01baebf4095212c016adf57d057dcebdd269c Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 7 Jan 2026 14:52:09 -0800 Subject: [PATCH 31/31] abstract class correction --- cuslines/cuda_python/cu_direction_getters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index b659445..617f893 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -32,7 +32,7 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): pass @abstractmethod - def generateStreamlines(self): + def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): pass def allocate_on_gpu(self, n):
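
(The one-line signature fix in the final patch restores the contract between the base class and its overrides: @abstractmethod only forces subclasses to define the name, so the abstract signature itself is the documentation of record and should match the concrete methods. A minimal sketch of the corrected shape, assuming only what cu_direction_getters.py already shows, with bodies elided:)

    from abc import ABC, abstractmethod

    class GPUDirectionGetter(ABC):
        @abstractmethod
        def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
            """Launch the streamline-generation kernel on GPU n."""

    class BootDirectionGetter(GPUDirectionGetter):
        def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
            # Concrete override now matches the abstract signature exactly.
            ...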