From 2101b6dd4f3484b7616b4ff810d8a9cea67ae0ef Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 19 Feb 2025 20:42:21 -0800 Subject: [PATCH 01/31] WIP tweaking PTT params --- cuslines/ptt.cu | 2 +- cuslines/ptt.cuh | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index b36e747..7bb0763 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -213,7 +213,7 @@ __device__ int get_direction_ptt_d( REAL_T *__first_val_sh = first_val_sh + tidy; const REAL_T max_curvature = SIN(max_angle / 2) / step_size; // bigger numbers means wiggle more - const REAL_T probe_step_size = ((step_size / 2) / (PROBE_QUALITY - 1)); + const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); REAL_T __tmp; diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh index d8986b5..a8222fc 100644 --- a/cuslines/ptt.cuh +++ b/cuslines/ptt.cuh @@ -6,16 +6,16 @@ #define STEP_FRAC 20 // divides output step size (usually 0.5) into this many internal steps #define PROBE_FRAC 2 // divides output step size (usually 0.5) to find probe length -#define PROBE_QUALITY 4 +#define PROBE_QUALITY 4 // Number of probing steps #define SAMPLING_QUALITY 4 // can be 2-7 -#define PROBABILISTIC_BIAS 1 // 1 looks good. can be 0-log_2(N_WARPS) (typically 0-5). 0 is fully probabilistic, 4 is close to deterministic. -#define ALLOW_WEAK_LINK 1 +#define DETERMINISTIC_BIAS 0 // Should be 0, higher values bias more towards higher fODF values when tracking +#define ALLOW_WEAK_LINK 0 #define TRIES_PER_REJECTION_SAMPLING 1024 -#define DEFAULT_PTT_MINDATASUPPORT 0.05 +#define DEFAULT_PTT_MINDATASUPPORT 0.0 // 0.01 #define K_SMALL 0.0001 #define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY) -#define PROBABILISTIC_GROUP_SZ POW2(PROBABILISTIC_BIAS) +#define PROBABILISTIC_GROUP_SZ POW2(DETERMINISTIC_BIAS) #if SAMPLING_QUALITY == 2 #define DISC_VERT_CNT DISC_2_VERT_CNT From 61fb586902b1d76556a4ba3f958a83097a382ea0 Mon Sep 17 00:00:00 2001 From: 36000 Date: Thu, 10 Jul 2025 12:26:40 -0700 Subject: [PATCH 02/31] put this back --- cuslines/ptt.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh index a8222fc..e3317ff 100644 --- a/cuslines/ptt.cuh +++ b/cuslines/ptt.cuh @@ -11,7 +11,7 @@ #define DETERMINISTIC_BIAS 0 // Should be 0, higher values bias more towards higher fODF values when tracking #define ALLOW_WEAK_LINK 0 #define TRIES_PER_REJECTION_SAMPLING 1024 -#define DEFAULT_PTT_MINDATASUPPORT 0.0 // 0.01 +#define DEFAULT_PTT_MINDATASUPPORT 0.01 // 0.01 #define K_SMALL 0.0001 #define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY) From d257ad9cf699422eddd7f29409b1e86b2c9f51d0 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 10:52:39 -0700 Subject: [PATCH 03/31] update for CUDA 13 compatibility --- cuslines/cuslines.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp index 4e8bc30..45163ad 100644 --- a/cuslines/cuslines.cpp +++ b/cuslines/cuslines.cpp @@ -146,9 +146,12 @@ class GPUTracker { //#pragma omp parallel for for (int n = 0; n < ngpus_; ++n) { + cudaMemLocation location = {}; + location.type = cudaMemLocationTypeDevice; + location.id = n; CHECK_CUDA(cudaSetDevice(n)); CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size)); - CHECK_CUDA(cudaMemAdvise(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n)); + CHECK_CUDA(cudaMemAdvise(dataf_d[n], sizeof(*dataf_d[n]) * 
dataf_info.size, cudaMemAdviseSetPreferredLocation, location)); CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size)); CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size)); CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size)); From 173e48da04196cbb1d92e97d1e2e43fcc65c2266 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 10:54:17 -0700 Subject: [PATCH 04/31] formatting --- cuslines/cuslines.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp index 45163ad..ceb7002 100644 --- a/cuslines/cuslines.cpp +++ b/cuslines/cuslines.cpp @@ -146,7 +146,7 @@ class GPUTracker { //#pragma omp parallel for for (int n = 0; n < ngpus_; ++n) { - cudaMemLocation location = {}; + cudaMemLocation location = {}; location.type = cudaMemLocationTypeDevice; location.id = n; CHECK_CUDA(cudaSetDevice(n)); From bea03ceddfdbec14eaf7b1597e74091c7d791234 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 12:10:27 -0700 Subject: [PATCH 05/31] try this --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 06e9de9..15a0e1e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,6 +27,7 @@ ENV PATH /opt/anaconda/bin:${PATH} ENV LD_LIBRARY_PATH /opt/anaconda/lib:${LD_LIBRARY_PATH} # python prereqs +RUN conda tos accept --override-channels --channel conda-forge RUN conda install -c conda-forge git RUN pip install numpy>=2.0.0 RUN pip install scipy>=1.13.0 cython nibabel dipy tqdm From 00a22c8671551b60594f3b4570472e6ce21b51f3 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 12:38:51 -0700 Subject: [PATCH 06/31] accept lots of TOS --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 15a0e1e..3a2cbdc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,6 +28,8 @@ ENV LD_LIBRARY_PATH /opt/anaconda/lib:${LD_LIBRARY_PATH} # python prereqs RUN conda tos accept --override-channels --channel conda-forge +RUN conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main +RUN conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r RUN conda install -c conda-forge git RUN pip install numpy>=2.0.0 RUN pip install scipy>=1.13.0 cython nibabel dipy tqdm From 03397f602b0181b981a563b4830b04313a575737 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 12:55:52 -0700 Subject: [PATCH 07/31] handle both 12/13 --- cuslines/cudamacro.h | 14 ++++++++++++++ cuslines/cuslines.cpp | 5 +---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/cuslines/cudamacro.h b/cuslines/cudamacro.h index 49ac24c..45a8fc3 100644 --- a/cuslines/cudamacro.h +++ b/cuslines/cudamacro.h @@ -45,6 +45,20 @@ exit(EXIT_FAILURE); \ }} +#if CUDART_VERSION >= 13000 +#define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ + do { \ + cudaMemLocation loc; \ + loc.type = cudaMemLocationTypeDevice; \ + loc.id = (device); \ + CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), loc)); \ + } while (0) +#else +#define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ + CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), (device))) +#endif + + #ifdef USE_NVTX #include "nvToolsExt.h" diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp index ceb7002..a1ada94 100644 --- a/cuslines/cuslines.cpp +++ b/cuslines/cuslines.cpp @@ -146,12 +146,9 @@ class GPUTracker { //#pragma omp parallel for for (int n = 0; n < ngpus_; ++n) { - cudaMemLocation location = {}; - location.type = cudaMemLocationTypeDevice; - 
location.id = n; CHECK_CUDA(cudaSetDevice(n)); CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size)); - CHECK_CUDA(cudaMemAdvise(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, location)); + CHECK_CUDA(CUDA_MEM_ADVISE(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n)); CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size)); CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size)); CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size)); From 3057e5ba9207da13204ffcf8c45b745950739134 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 13:04:29 -0700 Subject: [PATCH 08/31] bf --- cuslines/cudamacro.h | 12 +++++------- cuslines/cuslines.cpp | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/cuslines/cudamacro.h b/cuslines/cudamacro.h index 45a8fc3..e9b2e1e 100644 --- a/cuslines/cudamacro.h +++ b/cuslines/cudamacro.h @@ -46,13 +46,11 @@ }} #if CUDART_VERSION >= 13000 -#define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ - do { \ - cudaMemLocation loc; \ - loc.type = cudaMemLocationTypeDevice; \ - loc.id = (device); \ - CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), loc)); \ - } while (0) +#define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ + cudaMemLocation loc; \ + loc.type = cudaMemLocationTypeDevice; \ + loc.id = (device); \ + CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), loc)); \ #else #define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), (device))) diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp index a1ada94..1363705 100644 --- a/cuslines/cuslines.cpp +++ b/cuslines/cuslines.cpp @@ -148,7 +148,7 @@ class GPUTracker { for (int n = 0; n < ngpus_; ++n) { CHECK_CUDA(cudaSetDevice(n)); CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size)); - CHECK_CUDA(CUDA_MEM_ADVISE(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n)); + CUDA_MEM_ADVISE(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n); CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size)); CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size)); CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size)); From 80c35be576c1224702c26513c00938dd8c9ef2fb Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 13:10:44 -0700 Subject: [PATCH 09/31] typo --- cuslines/cudamacro.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuslines/cudamacro.h b/cuslines/cudamacro.h index e9b2e1e..7f03c6c 100644 --- a/cuslines/cudamacro.h +++ b/cuslines/cudamacro.h @@ -50,7 +50,7 @@ cudaMemLocation loc; \ loc.type = cudaMemLocationTypeDevice; \ loc.id = (device); \ - CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), loc)); \ + CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), loc)); #else #define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), (device))) From 38250f0b34d61e5fa77e59703d3343b32397644c Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 25 Aug 2025 13:17:06 -0700 Subject: [PATCH 10/31] update ENV in dockerfile --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3a2cbdc..889371d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,15 +16,15 @@ RUN wget 
https://github.com/Kitware/CMake/releases/download/v3.24.0/cmake-3.24.0 && mkdir /opt/cmake \ && /tmp/cmake-install.sh --skip-license --prefix=/opt/cmake \ && rm /tmp/cmake-install.sh -ENV PATH /opt/cmake/bin:${PATH} +ENV PATH=/opt/cmake/bin:${PATH} RUN curl -L "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" \ -o "/tmp/Miniconda3.sh" RUN bash /tmp/Miniconda3.sh -b -p /opt/anaconda RUN rm -rf /tmp/Miniconda3.sh RUN cd /opt && eval "$(/opt/anaconda/bin/conda shell.bash hook)" -ENV PATH /opt/anaconda/bin:${PATH} -ENV LD_LIBRARY_PATH /opt/anaconda/lib:${LD_LIBRARY_PATH} +ENV PATH=/opt/anaconda/bin:${PATH} +ENV LD_LIBRARY_PATH=/opt/anaconda/lib:${LD_LIBRARY_PATH} # python prereqs RUN conda tos accept --override-channels --channel conda-forge From 152722a0ceadaf92c566761d6a22ee6353ac5b65 Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 17 Dec 2025 21:18:29 -0800 Subject: [PATCH 11/31] PTT fixes and switch to FP32 --- cuslines/Makefile | 7 +- cuslines/cuslines.cpp | 29 ++-- cuslines/globals.h | 4 +- cuslines/ptt.cu | 347 +++++++++++++++++++++++++----------------- cuslines/ptt.cuh | 18 +-- cuslines/utils.cu | 18 +-- 6 files changed, 249 insertions(+), 174 deletions(-) diff --git a/cuslines/Makefile b/cuslines/Makefile index 1061a16..c8fe6c7 100644 --- a/cuslines/Makefile +++ b/cuslines/Makefile @@ -37,7 +37,12 @@ SMS ?= 70 CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM)) LASTSM := $(lastword $(sort $(SMS))) CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM) -CUDACFLAGS=-c -O3 -lineinfo -Xptxas=-v -std=c++11 -Xcompiler -fPIC -Xcompiler=-fopenmp $(CUDA_ARCH) + +COMMON_FLAGS = -c -std=c++11 -Xcompiler -fPIC --use_fast_math -Xcompiler=-fopenmp $(CUDA_ARCH) +RELEASE_FLAGS = -O3 -Xptxas=-O3 +DEBUG_FLAGS = -O0 -Xptxas=-v -g -G -lineinfo +CUDACFLAGS = $(COMMON_FLAGS) $(RELEASE_FLAGS) + LDFLAGS= -shared -fopenmp -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt all: cuslines diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp index 1363705..f0b8690 100644 --- a/cuslines/cuslines.cpp +++ b/cuslines/cuslines.cpp @@ -61,12 +61,12 @@ py::capsule cleanup(T* ptr) { class GPUTracker { public: GPUTracker(ModelType model_type, - double max_angle, - double min_signal, - double tc_threshold, - double step_size, - double relative_peak_thresh, - double min_separation_angle, + REAL max_angle, + REAL min_signal, + REAL tc_threshold, + REAL step_size, + REAL relative_peak_thresh, + REAL min_separation_angle, np_array_cast dataf, np_array_cast H, np_array_cast R, @@ -149,6 +149,7 @@ class GPUTracker { CHECK_CUDA(cudaSetDevice(n)); CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size)); CUDA_MEM_ADVISE(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n); + // CHECK_CUDA(cudaMemPrefetchAsync(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, n)); CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size)); CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size)); CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size)); @@ -294,12 +295,12 @@ class GPUTracker { int delta_nr_, samplm_nr_; ModelType model_type_; - double max_angle_; - double tc_threshold_; - double min_signal_; - double step_size_; - double relative_peak_thresh_; - double min_separation_angle_; + REAL max_angle_; + REAL tc_threshold_; + REAL min_signal_; + REAL step_size_; + REAL relative_peak_thresh_; + REAL min_separation_angle_; std::vector nSlines_old_; std::vector slines_; @@ -332,8 +333,8 @@ 
PYBIND11_MODULE(cuslines, m) { .value("PTT", PTT); py::class_(m, "GPUTracker") - .def(py::init -__device__ void norm3_d(REAL_T *num, int fail_ind) { +__device__ __forceinline__ void norm3_d(REAL_T *num, int fail_ind) { const REAL_T scale = SQRT(num[0] * num[0] + num[1] * num[1] + num[2] * num[2]); - if (scale != 0) { + if (scale > NORM_EPS) { num[0] /= scale; num[1] /= scale; num[2] /= scale; } else { + num[0] = num[1] = num[2] = 0; num[fail_ind] = 1.0; // this can happen randomly during propogation, though is exceedingly rare } } template -__device__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src2, int fail_ind) { +__device__ __forceinline__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src2, int fail_ind) { dest[0] = src1[1] * src2[2] - src1[2] * src2[1]; dest[1] = src1[2] * src2[0] - src1[0] * src2[2]; dest[2] = src1[0] * src2[1] - src1[1] * src2[0]; @@ -20,13 +21,20 @@ __device__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src norm3_d(dest, fail_ind); } -template -__device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T *__restrict__ pmf, +template +__device__ REAL_T interp4_d(const REAL3_T* pos, const REAL_T* frame, const REAL_T *__restrict__ pmf, const int dimx, const int dimy, const int dimz, const int dimt, const REAL3_T *__restrict__ odf_sphere_vertices) { + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + int closest_odf_idx = 0; - REAL_T __max_cos = 0; - for (int ii = 0; ii < dimt; ii++) { + REAL_T __max_cos = REAL_T(0); + + #pragma unroll + for (int ii = tidx; ii < dimt; ii+= BDIM_X) { REAL_T cos_sim = FABS( odf_sphere_vertices[ii].x * frame[0] \ + odf_sphere_vertices[ii].y * frame[1] \ @@ -36,15 +44,30 @@ __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T closest_odf_idx = ii; } } + __syncwarp(WMASK); - const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos); + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + const REAL_T __tmp = __shfl_xor_sync(WMASK, __max_cos, i, BDIM_X); + const int __tmp_idx = __shfl_xor_sync(WMASK, closest_odf_idx, i, BDIM_X); + if (__tmp > __max_cos || + (__tmp == __max_cos && __tmp_idx < closest_odf_idx)) { + __max_cos = __tmp; + closest_odf_idx = __tmp_idx; + } + } + __syncwarp(WMASK); #if 0 - printf("inerpolated %f at %f, %f, %f, %i\n", __max_cos, pos.x, pos.y, pos.z, closest_odf_idx); + if (closest_odf_idx >= dimt || closest_odf_idx < 0) { + printf("Error: closest_odf_idx out of bounds: %d (dimt: %d)\n", closest_odf_idx, dimt); + } #endif + const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, *pos, &__max_cos); + if (rv != 0) { - return -1; + return 0; // No support } else { return __max_cos; } @@ -87,24 +110,57 @@ __device__ void prepare_propagator_d(REAL_T k1, REAL_T k2, REAL_T arclength, } } +template +__device__ void random_normal(curandStatePhilox4_32_10_t *st, REAL_T* probing_frame) { + probing_frame[3] = curand_normal(st); + probing_frame[4] = curand_normal(st); + probing_frame[5] = curand_normal(st); + REAL_T dot = probing_frame[3]*probing_frame[0] + + probing_frame[4]*probing_frame[1] + + probing_frame[5]*probing_frame[2]; + + probing_frame[3] -= dot*probing_frame[0]; + probing_frame[4] -= dot*probing_frame[1]; + probing_frame[5] -= dot*probing_frame[2]; + REAL_T n2 = probing_frame[3]*probing_frame[3] + + 
probing_frame[4]*probing_frame[4] + + probing_frame[5]*probing_frame[5]; + + if (n2 < NORM_EPS) { + REAL_T abs_x = FABS(probing_frame[0]); + REAL_T abs_y = FABS(probing_frame[1]); + REAL_T abs_z = FABS(probing_frame[2]); + + if (abs_x <= abs_y && abs_x <= abs_z) { + probing_frame[3] = 0.0; + probing_frame[4] = probing_frame[2]; + probing_frame[5] = -probing_frame[1]; + } + else if (abs_y <= abs_z) { + probing_frame[3] = -probing_frame[2]; + probing_frame[4] = 0.0; + probing_frame[5] = probing_frame[0]; + } + else { + probing_frame[3] = probing_frame[1]; + probing_frame[4] = -probing_frame[0]; + probing_frame[5] = 0.0; + } + } +} + template __device__ void get_probing_frame_d(const REAL_T* frame, curandStatePhilox4_32_10_t *st, REAL_T* probing_frame) { if (IS_INIT) { for (int ii = 0; ii < 3; ii++) { // tangent probing_frame[ii] = frame[ii]; } - if ((probing_frame[0] != 0) && (probing_frame[1] != 0)) { // norm - probing_frame[3] = -probing_frame[1]; - probing_frame[4] = probing_frame[0]; - probing_frame[5] = 0; - } else { - probing_frame[3] = 0; - probing_frame[4] = -probing_frame[2]; - probing_frame[5] = 0; - } + norm3_d(probing_frame, 0); - norm3_d(probing_frame, 0); // tangent + random_normal(st, probing_frame); norm3_d(probing_frame + 3, 1); // norm + + // calculate binorm crossnorm3_d(probing_frame + 2*3, probing_frame, probing_frame + 3, 2); // binorm } else { for (int ii = 0; ii < 9; ii++) { @@ -123,49 +179,59 @@ __device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* dir frame[2*3 + ii] = propagator[6]*frame[ii] + propagator[7]*frame[3+ii] + propagator[8]*frame[6+ii]; } -#if 1 norm3_d(__tmp, 0); // normalize tangent crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calc normal crossnorm3_d(frame + 2*3, __tmp, frame + 3, 2); // calculate binorm from tangent, norm -#else - norm3_d(__tmp, 0); // normalize tangent - norm3_d(frame + 2*3, 2); // normalize binorm - crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calculate normal from binorm, tangent -#endif for (int ii = 0; ii < 3; ii++) { frame[ii] = __tmp[ii]; } } -template +template __device__ REAL_T calculate_data_support_d(REAL_T support, const REAL3_T pos, const REAL_T *__restrict__ pmf, const int dimx, const int dimy, const int dimz, const int dimt, const REAL_T probe_step_size, const REAL3_T *__restrict__ odf_sphere_vertices, - REAL_T k1, REAL_T k2, - REAL_T* probing_frame) { - REAL_T probing_prop[9]; - REAL_T direc[3]; - REAL3_T probing_pos; - REAL_T fod_amp; - - prepare_propagator_d(k1, k2, probe_step_size, probing_prop); - probing_pos.x = pos.x; - probing_pos.y = pos.y; - probing_pos.z = pos.z; + REAL_T* probing_prop_sh, + REAL_T* direc_sh, + REAL3_T* probing_pos_sh, + REAL_T* k1_sh, REAL_T* k2_sh, + REAL_T* probing_frame_sh) { + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + if (tidx == 0) { + prepare_propagator_d( + *k1_sh, *k2_sh, + probe_step_size, probing_prop_sh); + probing_pos_sh->x = pos.x; + probing_pos_sh->y = pos.y; + probing_pos_sh->z = pos.z; + } + __syncwarp(WMASK); for (int ii = 0; ii < PROBE_QUALITY; ii++) { // we spend about 2/3 of our time in this loop when doing PTT - propogate_frame_d(probing_prop, probing_frame, direc); + if (tidx == 0) { + propogate_frame_d( + probing_prop_sh, + probing_frame_sh, + direc_sh); + + probing_pos_sh->x += direc_sh[0]; + probing_pos_sh->y += direc_sh[1]; + probing_pos_sh->z += direc_sh[2]; + } + __syncwarp(WMASK); - probing_pos.x += 
direc[0]; - probing_pos.y += direc[1]; - probing_pos.z += direc[2]; + const REAL_T fod_amp = interp4_d( + probing_pos_sh, probing_frame_sh, pmf, + dimx, dimy, dimz, dimt, + odf_sphere_vertices); - fod_amp = interp4_d(probing_pos, probing_frame, pmf, - dimx, dimy, dimz, dimt, - odf_sphere_vertices); if (!ALLOW_WEAK_LINK && (fod_amp < PMF_THRESHOLD_P)) { return 0; } @@ -204,13 +270,27 @@ __device__ int get_direction_ptt_d( const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - REAL_T __shared__ face_cdf_sh[BDIM_Y*DISC_FACE_CNT]; - REAL_T __shared__ vert_pdf_sh[BDIM_Y*DISC_VERT_CNT]; - REAL_T __shared__ first_val_sh[BDIM_Y]; + __shared__ REAL_T face_cdf_sh[BDIM_Y*DISC_FACE_CNT]; + __shared__ REAL_T vert_pdf_sh[BDIM_Y*DISC_VERT_CNT]; + + __shared__ REAL_T probing_frame_sh[BDIM_Y*9]; + __shared__ REAL_T k1_probe_sh[BDIM_Y]; + __shared__ REAL_T k2_probe_sh[BDIM_Y]; + + __shared__ REAL_T probing_prop_sh[BDIM_Y*9]; + __shared__ REAL_T direc_sh[BDIM_Y*3]; + __shared__ REAL3_T probing_pos_sh[BDIM_Y]; REAL_T *__face_cdf_sh = face_cdf_sh + tidy*DISC_FACE_CNT; REAL_T *__vert_pdf_sh = vert_pdf_sh + tidy*DISC_VERT_CNT; - REAL_T *__first_val_sh = first_val_sh + tidy; + + REAL_T *__probing_frame_sh = probing_frame_sh + tidy*9; + REAL_T *__k1_probe_sh = k1_probe_sh + tidy; + REAL_T *__k2_probe_sh = k2_probe_sh + tidy; + + REAL_T *__probing_prop_sh = probing_prop_sh + tidy*9; + REAL_T *__direc_sh = direc_sh + tidy*3; + REAL3_T *__probing_pos_sh = probing_pos_sh + tidy; const REAL_T max_curvature = SIN(max_angle / 2) / step_size; // bigger numbers means wiggle more const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); @@ -225,30 +305,31 @@ __device__ int get_direction_ptt_d( __frame_sh[2] = dir.z; } } - if (tidx==0) { - *__first_val_sh = interp4_d(pos, __frame_sh, pmf, - dimx, dimy, dimz, dimt, - odf_sphere_vertices); - } + + const REAL_T first_val = interp4_d( + __probing_pos_sh, __frame_sh, pmf, + dimx, dimy, dimz, dimt, + odf_sphere_vertices); __syncwarp(WMASK); // Calculate __vert_pdf_sh - REAL_T probing_frame[9]; - REAL_T k1_probe, k2_probe; - bool support_found = 0; - for (int ii = tidx; ii < DISC_VERT_CNT; ii += BDIM_X) { - k1_probe = DISC_VERT[ii*2] * max_curvature; - k2_probe = DISC_VERT[ii*2+1] * max_curvature; - - get_probing_frame_d(__frame_sh, st, probing_frame); + bool support_found = false; + for (int ii = 0; ii < DISC_VERT_CNT; ii++) { + if (tidx == 0) { + *__k1_probe_sh = DISC_VERT[ii*2] * max_curvature; + *__k2_probe_sh = DISC_VERT[ii*2+1] * max_curvature; + get_probing_frame_d(__frame_sh, st, __probing_frame_sh); + } + __syncwarp(WMASK); - const REAL_T this_support = calculate_data_support_d( - *__first_val_sh, + const REAL_T this_support = calculate_data_support_d( + first_val, pos, pmf, dimx, dimy, dimz, dimt, probe_step_size, odf_sphere_vertices, - k1_probe, k2_probe, - probing_frame); + __probing_prop_sh, __direc_sh, __probing_pos_sh, + __k1_probe_sh, __k2_probe_sh, + __probing_frame_sh); #if 0 if (threadIdx.y == 1 && ii == 0) { @@ -257,14 +338,17 @@ __device__ int get_direction_ptt_d( #endif if (this_support < NORM_MIN_SUPPORT) { - __vert_pdf_sh[ii] = 0; + if (tidx == 0) { + __vert_pdf_sh[ii] = 0; + } } else { - __vert_pdf_sh[ii] = this_support; + if (tidx == 0) { + __vert_pdf_sh[ii] = this_support; + } support_found = 1; } } - const int __msk = __ballot_sync(WMASK, support_found); - if (__msk == 0) { + if (support_found == 0) { return 0; } @@ -323,82 +407,69 @@ __device__ int 
get_direction_ptt_d( #endif // Sample random valid faces randomly - REAL_T r1, r2; - for (int ii = 0; ii < TRIES_PER_REJECTION_SAMPLING / BDIM_X; ii++) { - r1 = curand_uniform(st); - r2 = curand_uniform(st); - if (r1 + r2 > 1) { - r1 = 1 - r1; - r2 = 1 - r2; - } - - __tmp = curand_uniform(st) * last_cdf; - int jj; - for (jj = 0; jj < DISC_FACE_CNT; jj++) { - if (__face_cdf_sh[jj] >= __tmp) - break; - } - - const REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2]; - const REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2]; - const REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2]; - - const REAL_T vy0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2 + 1]; - const REAL_T vy1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2 + 1]; - const REAL_T vy2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2 + 1]; - - k1_probe = vx0 + r1 * (vx1 - vx0) + r2 * (vx2 - vx0); - k2_probe = vy0 + r1 * (vy1 - vy0) + r2 * (vy2 - vy0); - - get_probing_frame_d(__frame_sh, st, probing_frame); - - const REAL_T this_support = calculate_data_support_d(*__first_val_sh, - pos, pmf, dimx, dimy, dimz, dimt, - probe_step_size, - odf_sphere_vertices, - k1_probe, k2_probe, - probing_frame); + for (int ii = 0; ii < TRIES_PER_REJECTION_SAMPLING; ii++) { + if (tidx == 0) { + REAL_T r1 = curand_uniform(st); + REAL_T r2 = curand_uniform(st); + if (r1 + r2 > 1) { + r1 = 1 - r1; + r2 = 1 - r2; + } + + __tmp = curand_uniform(st) * last_cdf; + int jj; + for (jj = 0; jj < DISC_FACE_CNT; jj++) { // TODO: parallelize this + if (__face_cdf_sh[jj] >= __tmp) + break; + } + + const REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2]; + const REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2]; + const REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2]; + const REAL_T vy0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2 + 1]; + const REAL_T vy1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2 + 1]; + const REAL_T vy2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2 + 1]; + *__k1_probe_sh = vx0 + r1 * (vx1 - vx0) + r2 * (vx2 - vx0); + *__k2_probe_sh = vy0 + r1 * (vy1 - vy0) + r2 * (vy2 - vy0); + get_probing_frame_d(__frame_sh, st, __probing_frame_sh); + } __syncwarp(WMASK); - int winning_lane = -1; // -1 indicates nobody won - int __msk = __ballot_sync(WMASK, this_support >= NORM_MIN_SUPPORT); - if (__msk != 0) { - REAL_T group_max_support = this_support; - #pragma unroll - for(int j = 1; j < PROBABILISTIC_GROUP_SZ; j *= 2) { - __tmp = __shfl_xor_sync(WMASK, group_max_support, j, BDIM_X); - group_max_support = MAX(group_max_support, __tmp); - } - __msk &= __ballot_sync(WMASK, this_support == group_max_support); - winning_lane = __ffs(__msk) - 1; + const REAL_T this_support = calculate_data_support_d( + first_val, + pos, pmf, dimx, dimy, dimz, dimt, + probe_step_size, + odf_sphere_vertices, + __probing_prop_sh, __direc_sh, __probing_pos_sh, + __k1_probe_sh, __k2_probe_sh, + __probing_frame_sh); + __syncwarp(WMASK); + + if (this_support < NORM_MIN_SUPPORT) { + continue; } - if (winning_lane != -1) { - if (tidx == winning_lane) { -#if 0 - if (threadIdx.y == 1) { - printf("winning k1 %f, k2 %f, cdf %f, cdf_idx %i", k1_probe, k2_probe, __tmp, jj); - } -#endif - if (IS_INIT) { - dirs[0] = dir; - } else { - REAL_T __prop[9]; - REAL_T __dir[3]; - prepare_propagator_d(k1_probe, k2_probe, step_size/STEP_FRAC, __prop); - propogate_frame_d(__prop, probing_frame, __dir); - norm3_d(__dir, 0); // this will be scaled by the generic stepping code - dirs[0] = (REAL3_T) {__dir[0], __dir[1], __dir[2]}; - } - for (int jj = 0; 
jj < 9; jj++) { - __frame_sh[jj] = probing_frame[jj]; - } + if (tidx == 0) { + if (IS_INIT) { + dirs[0] = dir; + } else { + // Propogate, but only 1/STEP_FRAC of a step + prepare_propagator_d( + *__k1_probe_sh, *__k2_probe_sh, + step_size/STEP_FRAC, __probing_prop_sh); + propogate_frame_d(__probing_prop_sh, __probing_frame_sh, __direc_sh); + norm3_d(__direc_sh, 0); // this will be scaled by the generic stepping code + dirs[0] = (REAL3_T) {__direc_sh[0], __direc_sh[1], __direc_sh[2]}; } - __syncwarp(WMASK); - return 1; } + + if (tidx < 9) { + __frame_sh[tidx] = __probing_frame_sh[tidx]; + } + __syncwarp(WMASK); + return 1; } return 0; } diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh index e3317ff..751c4bb 100644 --- a/cuslines/ptt.cuh +++ b/cuslines/ptt.cuh @@ -4,18 +4,16 @@ #include "disc.h" #include "globals.h" -#define STEP_FRAC 20 // divides output step size (usually 0.5) into this many internal steps -#define PROBE_FRAC 2 // divides output step size (usually 0.5) to find probe length -#define PROBE_QUALITY 4 // Number of probing steps -#define SAMPLING_QUALITY 4 // can be 2-7 -#define DETERMINISTIC_BIAS 0 // Should be 0, higher values bias more towards higher fODF values when tracking -#define ALLOW_WEAK_LINK 0 -#define TRIES_PER_REJECTION_SAMPLING 1024 -#define DEFAULT_PTT_MINDATASUPPORT 0.01 // 0.01 -#define K_SMALL 0.0001 +#define STEP_FRAC (20) // divides output step size (usually 0.5) into this many internal steps +#define PROBE_FRAC (2) // divides output step size (usually 0.5) to find probe length +#define PROBE_QUALITY (4) // Number of probing steps +#define SAMPLING_QUALITY (2) // can be 2-7 +#define ALLOW_WEAK_LINK (0) +#define TRIES_PER_REJECTION_SAMPLING (1024) +#define DEFAULT_PTT_MINDATASUPPORT ((REAL) 0.01) // 0.01 +#define K_SMALL ((REAL) 0.0001) #define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY) -#define PROBABILISTIC_GROUP_SZ POW2(DETERMINISTIC_BIAS) #if SAMPLING_QUALITY == 2 #define DISC_VERT_CNT DISC_2_VERT_CNT diff --git a/cuslines/utils.cu b/cuslines/utils.cu index c7fe47f..19282de 100644 --- a/cuslines/utils.cu +++ b/cuslines/utils.cu @@ -59,7 +59,7 @@ __device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) { } template -__device__ REAL_T interpolation_helper_d(const REAL_T* dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) { +__device__ REAL_T interpolation_helper_d(const REAL_T*__restrict__ dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) { REAL_T __tmp = 0; #pragma unroll for (int i = 0; i < 2; i++) { @@ -130,14 +130,12 @@ __device__ int trilinear_interp_d(const int dimx, *__vox_data = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, dimt_idx); } - /* - __syncwarp(WMASK); - if (tidx == 0 && threadIdx.y == 0) { - printf("point: %f, %f, %f\n", point.x, point.y, point.z); - for(int i = 0; i < dimt; i++) { - printf("__vox_data[%d]: %f\n", i, __vox_data[i]); - } - } - */ + // if (threadIdx.x == 0) { + // printf("point: %f, %f, %f\n", point.x, point.y, point.z); + // printf("dimt_idx: %d\n", dimt_idx); + // // for(int i = 0; i < dimt; i++) { + // // printf("__vox_data[%d]: %f\n", i, __vox_data[i]); + // // } + // } return 0; } From d93e5edc2559d3f6bd3235290b5dc730dd29319f Mon Sep 17 00:00:00 2001 From: 36000 Date: Thu, 18 Dec 2025 19:08:54 -0800 Subject: [PATCH 12/31] PTT looking even better --- cuslines/generate_streamlines_cuda.cu | 24 ------------------------ cuslines/globals.h | 2 +- cuslines/ptt.cu | 23 
++++++++++++++--------- cuslines/ptt.cuh | 3 --- cuslines/utils.cu | 23 +++++++++++++++++++++++ 5 files changed, 38 insertions(+), 37 deletions(-) diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu index 374c7c1..0efefdd 100644 --- a/cuslines/generate_streamlines_cuda.cu +++ b/cuslines/generate_streamlines_cuda.cu @@ -307,30 +307,6 @@ __device__ VAL_T max_mask_transl_d(const int n, return __m; } -template -__device__ VAL_T max_d(const int n, const VAL_T *__restrict__ src, const VAL_T minVal) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - VAL_T __m = minVal; - - for(int i = tidx; i < n; i += BDIM_X) { - __m = MAX(__m, src[i]); - } - - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); - __m = MAX(__m, __tmp); - } - - return __m; -} - template __device__ VAL_T min_d(const int n, const VAL_T *__restrict__ src, const VAL_T maxVal) { diff --git a/cuslines/globals.h b/cuslines/globals.h index 7b3f7b4..0d852e9 100644 --- a/cuslines/globals.h +++ b/cuslines/globals.h @@ -70,7 +70,7 @@ #endif #define MAX_SLINE_LEN (501) -#define PMF_THRESHOLD_P ((REAL)0.1) +#define PMF_THRESHOLD_P ((REAL)0.05) #define THR_X_BL (64) #define THR_X_SL (32) diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index 11a2f44..f6cbc24 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -22,7 +22,7 @@ __device__ __forceinline__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, c } template -__device__ REAL_T interp4_d(const REAL3_T* pos, const REAL_T* frame, const REAL_T *__restrict__ pmf, +__device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T *__restrict__ pmf, const int dimx, const int dimy, const int dimz, const int dimt, const REAL3_T *__restrict__ odf_sphere_vertices) { const int tidx = threadIdx.x; @@ -64,7 +64,7 @@ __device__ REAL_T interp4_d(const REAL3_T* pos, const REAL_T* frame, const REAL_ } #endif - const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, *pos, &__max_cos); + const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos); if (rv != 0) { return 0; // No support @@ -193,6 +193,7 @@ __device__ REAL_T calculate_data_support_d(REAL_T support, const REAL3_T pos, const REAL_T *__restrict__ pmf, const int dimx, const int dimy, const int dimz, const int dimt, const REAL_T probe_step_size, + const REAL_T absolpmf_thresh, const REAL3_T *__restrict__ odf_sphere_vertices, REAL_T* probing_prop_sh, REAL_T* direc_sh, @@ -227,12 +228,12 @@ __device__ REAL_T calculate_data_support_d(REAL_T support, } __syncwarp(WMASK); - const REAL_T fod_amp = interp4_d( - probing_pos_sh, probing_frame_sh, pmf, + const REAL_T fod_amp = interp4_d( // This is the most expensive call + *probing_pos_sh, probing_frame_sh, pmf, dimx, dimy, dimz, dimt, odf_sphere_vertices); - if (!ALLOW_WEAK_LINK && (fod_amp < PMF_THRESHOLD_P)) { + if (!ALLOW_WEAK_LINK && (fod_amp < absolpmf_thresh)) { return 0; } support += fod_amp; @@ -292,8 +293,9 @@ __device__ int get_direction_ptt_d( REAL_T *__direc_sh = direc_sh + tidy*3; REAL3_T *__probing_pos_sh = probing_pos_sh + tidy; - const REAL_T max_curvature = SIN(max_angle / 2) / step_size; // bigger numbers means wiggle more const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); + const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / step_size; + const REAL_T 
absolpmf_thresh = PMF_THRESHOLD_P * max_d(dimt, pmf, REAL_MIN); REAL_T __tmp; @@ -307,7 +309,7 @@ __device__ int get_direction_ptt_d( } const REAL_T first_val = interp4_d( - __probing_pos_sh, __frame_sh, pmf, + pos, __frame_sh, pmf, dimx, dimy, dimz, dimt, odf_sphere_vertices); __syncwarp(WMASK); @@ -326,6 +328,7 @@ __device__ int get_direction_ptt_d( first_val, pos, pmf, dimx, dimy, dimz, dimt, probe_step_size, + absolpmf_thresh, odf_sphere_vertices, __probing_prop_sh, __direc_sh, __probing_pos_sh, __k1_probe_sh, __k2_probe_sh, @@ -337,7 +340,7 @@ __device__ int get_direction_ptt_d( } #endif - if (this_support < NORM_MIN_SUPPORT) { + if (this_support < PROBE_QUALITY * absolpmf_thresh) { if (tidx == 0) { __vert_pdf_sh[ii] = 0; } @@ -441,13 +444,14 @@ __device__ int get_direction_ptt_d( first_val, pos, pmf, dimx, dimy, dimz, dimt, probe_step_size, + absolpmf_thresh, odf_sphere_vertices, __probing_prop_sh, __direc_sh, __probing_pos_sh, __k1_probe_sh, __k2_probe_sh, __probing_frame_sh); __syncwarp(WMASK); - if (this_support < NORM_MIN_SUPPORT) { + if (this_support < PROBE_QUALITY * absolpmf_thresh) { continue; } @@ -459,6 +463,7 @@ __device__ int get_direction_ptt_d( prepare_propagator_d( *__k1_probe_sh, *__k2_probe_sh, step_size/STEP_FRAC, __probing_prop_sh); + get_probing_frame_d<0>(__frame_sh, st, __probing_frame_sh); propogate_frame_d(__probing_prop_sh, __probing_frame_sh, __direc_sh); norm3_d(__direc_sh, 0); // this will be scaled by the generic stepping code dirs[0] = (REAL3_T) {__direc_sh[0], __direc_sh[1], __direc_sh[2]}; diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh index 751c4bb..9126250 100644 --- a/cuslines/ptt.cuh +++ b/cuslines/ptt.cuh @@ -10,11 +10,8 @@ #define SAMPLING_QUALITY (2) // can be 2-7 #define ALLOW_WEAK_LINK (0) #define TRIES_PER_REJECTION_SAMPLING (1024) -#define DEFAULT_PTT_MINDATASUPPORT ((REAL) 0.01) // 0.01 #define K_SMALL ((REAL) 0.0001) -#define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY) - #if SAMPLING_QUALITY == 2 #define DISC_VERT_CNT DISC_2_VERT_CNT #define DISC_FACE_CNT DISC_2_FACE_CNT diff --git a/cuslines/utils.cu b/cuslines/utils.cu index 19282de..93b1190 100644 --- a/cuslines/utils.cu +++ b/cuslines/utils.cu @@ -1,3 +1,26 @@ +template +__device__ VAL_T max_d(const int n, const VAL_T *__restrict__ src, const VAL_T minVal) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + VAL_T __m = minVal; + + for(int i = tidx; i < n; i += BDIM_X) { + __m = MAX(__m, src[i]); + } + + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); + __m = MAX(__m, __tmp); + } + + return __m; +} template __device__ void prefix_sum_sh_d(REAL_T *num_sh, int __len) { From a5743bb249faa35564b8a74003f67300c40dd891 Mon Sep 17 00:00:00 2001 From: 36000 Date: Fri, 19 Dec 2025 10:40:23 -0800 Subject: [PATCH 13/31] bf --- cuslines/ptt.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index f6cbc24..fa238c6 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -34,7 +34,7 @@ __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T REAL_T __max_cos = REAL_T(0); #pragma unroll - for (int ii = tidx; ii < dimt; ii+= BDIM_X) { + for (int ii = tidx; ii < dimt; ii+= BDIM_X) { // TODO: I need to think about better ways of parallelizing this REAL_T cos_sim = FABS( odf_sphere_vertices[ii].x * frame[0] \ + 
odf_sphere_vertices[ii].y * frame[1] \ @@ -64,6 +64,7 @@ __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T } #endif + // TODO: maybe this should be texture memory, I am not so sure const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos); if (rv != 0) { @@ -294,7 +295,7 @@ __device__ int get_direction_ptt_d( REAL3_T *__probing_pos_sh = probing_pos_sh + tidy; const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); - const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / step_size; + const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / (step_size / PROBE_FRAC); const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d(dimt, pmf, REAL_MIN); REAL_T __tmp; From ef4617c06805ca395842b48a5bf8a9fa1d35d30c Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 22 Dec 2025 11:14:12 -0800 Subject: [PATCH 14/31] finally put to bed the max curve stuff --- cuslines/ptt.cu | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index fa238c6..57a27ab 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -295,9 +295,15 @@ __device__ int get_direction_ptt_d( REAL3_T *__probing_pos_sh = probing_pos_sh + tidy; const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); - const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / (step_size / PROBE_FRAC); + const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / step_size; const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d(dimt, pmf, REAL_MIN); +#if 0 + printf("absolpmf_thresh: %f, max_curvature: %f, probe_step_size: %f\n", absolpmf_thresh, max_curvature, probe_step_size); + printf("max_angle: %f\n", max_angle); + printf("step_size: %f\n", step_size); +#endif + REAL_T __tmp; __syncwarp(WMASK); From 1df2ec09b1cf9ffb2d51534ae6669e31c6c164d3 Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 22 Dec 2025 11:17:45 -0800 Subject: [PATCH 15/31] spelling error --- cuslines/ptt.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index 57a27ab..3cdd149 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -171,7 +171,7 @@ __device__ void get_probing_frame_d(const REAL_T* frame, curandStatePhilox4_32_1 } template -__device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* direc) { +__device__ void propagate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* direc) { REAL_T __tmp[3]; for (int ii = 0; ii < 3; ii++) { @@ -218,7 +218,7 @@ __device__ REAL_T calculate_data_support_d(REAL_T support, for (int ii = 0; ii < PROBE_QUALITY; ii++) { // we spend about 2/3 of our time in this loop when doing PTT if (tidx == 0) { - propogate_frame_d( + propagate_frame_d( probing_prop_sh, probing_frame_sh, direc_sh); @@ -466,12 +466,12 @@ __device__ int get_direction_ptt_d( if (IS_INIT) { dirs[0] = dir; } else { - // Propogate, but only 1/STEP_FRAC of a step + // propagate, but only 1/STEP_FRAC of a step prepare_propagator_d( *__k1_probe_sh, *__k2_probe_sh, step_size/STEP_FRAC, __probing_prop_sh); get_probing_frame_d<0>(__frame_sh, st, __probing_frame_sh); - propogate_frame_d(__probing_prop_sh, __probing_frame_sh, __direc_sh); + propagate_frame_d(__probing_prop_sh, __probing_frame_sh, __direc_sh); norm3_d(__direc_sh, 0); // this will be scaled by the generic stepping code dirs[0] = (REAL3_T) {__direc_sh[0], __direc_sh[1], __direc_sh[2]}; } From 09a83a8b2c1a1bfe2d5f4ba23473c0d76bf3290a Mon Sep 17 00:00:00 2001 From: 36000 Date: Mon, 22 Dec 2025 
12:26:11 -0800 Subject: [PATCH 16/31] staring this up --- cuslines/__init__.py | 0 cuslines/cuslines.py | 295 +++++++++++++++++++++++++++++++++++++++++ cuslines/globals.py | 0 run_gpu_streamlines.py | 3 +- 4 files changed, 296 insertions(+), 2 deletions(-) create mode 100644 cuslines/__init__.py create mode 100644 cuslines/cuslines.py create mode 100644 cuslines/globals.py diff --git a/cuslines/__init__.py b/cuslines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cuslines/cuslines.py b/cuslines/cuslines.py new file mode 100644 index 0000000..bb83e5e --- /dev/null +++ b/cuslines/cuslines.py @@ -0,0 +1,295 @@ +from cuda.bindings import driver, nvrtc, runtime +# TODO: this would be better if only using CUDA core + +import numpy as np +import logging + +import re +import os + + +logger = logging.getLogger("GPUStreamlines") + + +# We extract REAL_DTYPE, MAX_SLINE_LEN from globals.h +# Maybe there is a more elegant way of doing this? +dir_path = os.path.dirname(os.path.abspath(__file__)) +globals_path = os.path.join(dir_path, "globals.h") +with open(globals_path, 'r') as f: + content = f.read() + +defines = dict(re.findall(r"#define\s+(\w+)\s+([^\s/]+)", content)) +REAL_SIZE = int(defines["REAL_SIZE"]) +if REAL_SIZE == 4: + REAL_DTYPE = np.float32 +elif REAL_SIZE == 8: + REAL_DTYPE = np.float64 +else: + raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") +MAX_SLINE_LEN = int(defines["MAX_SLINE_LEN"]) + + +def _cudaGetErrorEnum(error): + if isinstance(error, driver.CUresult): + err, name = driver.cuGetErrorName(error) + return name if err == driver.CUresult.CUDA_SUCCESS else "" + elif isinstance(error, nvrtc.nvrtcResult): + return nvrtc.nvrtcGetErrorString(error)[1] + else: + raise RuntimeError('Unknown error type: {}'.format(error)) + +def checkCudaErrors(result): + if result[0].value: + raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) + if len(result) == 1: + return None + elif len(result) == 2: + return result[1] + else: + return result[1:] + + +class GPUTracker: + def __init__( + self, + model_type: ModelType, + max_angle: float, + min_signal: float, + tc_threshold: float, + step_size: float, + relative_peak_thresh: float, + min_separation_angle: float, + dataf: np.ndarray, + H: np.ndarray, + R: np.ndarray, + delta_b: np.ndarray, + delta_q: np.ndarray, # TODO: some of these only needed for boot + b0s_mask: np.ndarray, + metric_map: np.ndarray, + sampling_matrix: np.ndarray, + sphere_vertices: np.ndarray, + sphere_edges: np.ndarray, + ngpus: int = 1, + rng_seed: int = 0, + rng_offset: int = 0, + ): + for name, arr, dt in [ + ("dataf", dataf, REAL_DTYPE), + ("H", H, REAL_DTYPE), + ("R", R, REAL_DTYPE), + ("delta_b", delta_b, REAL_DTYPE), + ("delta_q", delta_q, REAL_DTYPE), + ("b0s_mask", b0s_mask, np.int32), + ("metric_map", metric_map, REAL_DTYPE), + ("sampling_matrix", sampling_matrix, REAL_DTYPE), + ("sphere_vertices", sphere_vertices, REAL_DTYPE), + ("sphere_edges", sphere_edges, np.int32), + ]: + if arr.dtype != dt: + raise TypeError(f"{name} must have dtype {dt}, got {arr.dtype}") + if not arr.flags.c_contiguous: + raise ValueError(f"{name} must be C-contiguous") + + self.dataf = dataf + self.H = H + self.R = R + self.delta_b = delta_b + self.delta_q = delta_q + self.b0s_mask = b0s_mask + self.metric_map = metric_map + self.sampling_matrix = sampling_matrix + self.sphere_vertices = sphere_vertices + self.sphere_edges = sphere_edges + + self.dimx, self.dimy, self.dimz, self.dimt = dataf.shape + 
self.nedges = int(sphere_edges.shape[0]) + self.delta_nr = int(delta_b.shape[0]) + self.samplm_nr = int(sampling_matrix.shape[0]) + + self.model_type = int(model_type) + self.max_angle = REAL_DTYPE(max_angle) + self.min_signal = REAL_DTYPE(min_signal) + self.tc_threshold = REAL_DTYPE(tc_threshold) + self.step_size = REAL_DTYPE(step_size) + self.relative_peak_thresh = REAL_DTYPE(relative_peak_thresh) + self.min_separation_angle = REAL_DTYPE(min_separation_angle) + + self.ngpus = int(ngpus) + self.rng_seed = int(rng_seed) + self.rng_offset = int(rng_offset) + + self.nSlines_old = [] + self.slines = [] + self.sline_lens = [] + + checkCudaErrors(driver.cuInit(0)) + avail = checkCudaErrors(runtime.cudaGetDeviceCount()) + if self.ngpus > avail: + raise RuntimeError(f"Requested {self.ngpus} GPUs but only {avail} available") + + logger.info("Creating GPUTracker with %d GPUs...", self.ngpus) + + self.dataf_pts = [] + self.H_pts = [] + self.R_pts = [] + self.delta_b_pts = [] + self.delta_q_pts = [] + self.b0s_mask_pts = [] + self.metric_map_pts = [] + self.sampling_matrix_pts = [] + self.sphere_vertices_pts = [] + self.sphere_edges_pts = [] + + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(ii)) + self.dataf_pts.append( # TODO: put this in texture memory? + checkCudaErrors(runtime.cudaMallocManaged( + REAL_SIZE*self.dataf.size, + runtime.cudaMemAttachGlobal))) + checkCudaErrors(runtime.cudaMemAdvise( + self.dataf_pts[ii], + REAL_SIZE*self.dataf.size, + runtime.cudaMemAdviseSetPreferredLocation, + ii)) + self.H_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.H.size))) + self.R_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.R.size))) + self.delta_b_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.delta_b.size))) + self.delta_q_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.delta_q.size))) + self.b0s_mask_pts.append( + checkCudaErrors(runtime.cudaMalloc( + np.int32().nbytes*self.b0s_mask.size))) + self.metric_map_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.metric_map.size))) + self.sampling_matrix_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.sampling_matrix.size))) + self.sphere_vertices_pts.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.sphere_vertices.size))) + self.sphere_edges_pts.append( + checkCudaErrors(runtime.cudaMalloc( + np.int32().nbytes*self.sphere_edges.size))) + + checkCudaErrors(runtime.cudaMemcpy( + self.dataf_pts[ii], + self.dataf.ctypes.data, + REAL_SIZE*self.dataf.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.H_pts[ii], + self.H.ctypes.data, + REAL_SIZE*self.H.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.R_pts[ii], + self.R.ctypes.data, + REAL_SIZE*self.R.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.delta_b_pts[ii], + self.delta_b.ctypes.data, + REAL_SIZE*self.delta_b.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.delta_q_pts[ii], + self.delta_q.ctypes.data, + REAL_SIZE*self.delta_q.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.b0s_mask_pts[ii], + self.b0s_mask.ctypes.data, + np.int32().nbytes*self.b0s_mask.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.metric_map_pts[ii], + self.metric_map.ctypes.data, + REAL_SIZE*self.metric_map.size, + runtime.cudaMemcpyHostToDevice)) + 
checkCudaErrors(runtime.cudaMemcpy( + self.sampling_matrix_pts[ii], + self.sampling_matrix.ctypes.data, + REAL_SIZE*self.sampling_matrix.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.sphere_vertices_pts[ii], + self.sphere_vertices.ctypes.data, + REAL_SIZE*self.sphere_vertices.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.sphere_edges_pts[ii], + self.sphere_edges.ctypes.data, + np.int32().nbytes*self.sphere_edges.size, + runtime.cudaMemcpyHostToDevice)) + + self.streams = [] + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(ii)) + self.streams.append( + checkCudaErrors(runtime.cudaStreamCreateWithFlags( + runtime.cudaStreamNonBlocking))) + + def generate_streamlines(self, seeds): # TODO: location this is going should be these arguments + nseeds = len(seeds) + nseeds_per_gpu = (nseeds + self.ngpus - 1) // self.ngpus + + seeds_ptrs = [] + + for ii in range(self.ngpus): + nseeds_gpu = min(nseeds_per_gpu, max(0, nseeds - ii * nseeds_per_gpu)) + checkCudaErrors(runtime.cudaSetDevice(ii)) + seeds_ptrs.append(checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*3*nseeds_gpu))) + checkCudaErrors(runtime.cudaMemcpy( + seeds_ptrs[ii], + seeds[ii*nseeds_per_gpu:(ii+1)*nseeds_per_gpu].ctypes.data, + REAL_SIZE*3*nseeds_gpu, + runtime.cudaMemcpyHostToDevice)) + + nSlines = [0] * self.ngpus # TODO: figure out what this is doing + # TODO: + # // Call GPU routine + # generate_streamlines_cuda_mgpu(model_type_, max_angle_, min_signal_, tc_threshold_, step_size_, + # relative_peak_thresh_, min_separation_angle_, + # nseeds, seeds_d, + # dimx_, dimy_, dimz_, dimt_, + # dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d, + # sphere_vertices_d, sphere_edges_d, nedges_, + # slines_, slinesLen_, nSlines, nSlines_old_, rng_seed_, rng_offset_, ngpus_, + # streams_); + + self.nSlines_old = nSlines.copy() # TODO: figure out what this is doing + self.rng_offset += nseeds + + nSlines_total = 0 + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaFree(seeds_ptrs[ii])) + nSlines_total += nSlines[ii] + + + # TODO + # std::vector> slines_list; + # slines_list.reserve(nSlines_total); + # for (int n = 0; n < ngpus_; ++n) { + # for (int i = 0; i < nSlines[n]; ++i) { + # REAL* sl = new REAL[slinesLen_[n][i]*3]; + # std::memcpy(sl, slines_[n] + i*3*2*MAX_SLINE_LEN, slinesLen_[n][i]*3*sizeof(*sl)); + # auto sl_arr = py::array_t({slinesLen_[n][i], 3}, // shape + # {3*sizeof(REAL), sizeof(REAL)}, // strides + # sl, + # cleanup(sl)); + # slines_list.push_back(sl_arr); + # } + # } + + # return slines_list; + + # } \ No newline at end of file diff --git a/cuslines/globals.py b/cuslines/globals.py new file mode 100644 index 0000000..e69de29 diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index e627978..d546d60 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -33,11 +33,10 @@ import zipfile import numpy as np -import numpy.linalg as npl import dipy.reconst.dti as dti from dipy.io import read_bvals_bvecs -from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram +from dipy.io.stateful_tractogram import Space, StatefulTractogram from dipy.io.streamline import save_tractogram from dipy.tracking import utils from dipy.core.gradients import gradient_table, unique_bvals_magnitude From 3828b73956c7da8e04106ee67a07fb423d90860b Mon Sep 17 00:00:00 2001 From: 36000 Date: Thu, 1 Jan 2026 15:13:12 -0800 Subject: [PATCH 17/31] preparing to 
implement compilation --- cuslines/cuslines.py | 295 ------------------------------------------- cuslines/globals.py | 0 2 files changed, 295 deletions(-) delete mode 100644 cuslines/cuslines.py delete mode 100644 cuslines/globals.py diff --git a/cuslines/cuslines.py b/cuslines/cuslines.py deleted file mode 100644 index bb83e5e..0000000 --- a/cuslines/cuslines.py +++ /dev/null @@ -1,295 +0,0 @@ -from cuda.bindings import driver, nvrtc, runtime -# TODO: this would be better if only using CUDA core - -import numpy as np -import logging - -import re -import os - - -logger = logging.getLogger("GPUStreamlines") - - -# We extract REAL_DTYPE, MAX_SLINE_LEN from globals.h -# Maybe there is a more elegant way of doing this? -dir_path = os.path.dirname(os.path.abspath(__file__)) -globals_path = os.path.join(dir_path, "globals.h") -with open(globals_path, 'r') as f: - content = f.read() - -defines = dict(re.findall(r"#define\s+(\w+)\s+([^\s/]+)", content)) -REAL_SIZE = int(defines["REAL_SIZE"]) -if REAL_SIZE == 4: - REAL_DTYPE = np.float32 -elif REAL_SIZE == 8: - REAL_DTYPE = np.float64 -else: - raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") -MAX_SLINE_LEN = int(defines["MAX_SLINE_LEN"]) - - -def _cudaGetErrorEnum(error): - if isinstance(error, driver.CUresult): - err, name = driver.cuGetErrorName(error) - return name if err == driver.CUresult.CUDA_SUCCESS else "" - elif isinstance(error, nvrtc.nvrtcResult): - return nvrtc.nvrtcGetErrorString(error)[1] - else: - raise RuntimeError('Unknown error type: {}'.format(error)) - -def checkCudaErrors(result): - if result[0].value: - raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) - if len(result) == 1: - return None - elif len(result) == 2: - return result[1] - else: - return result[1:] - - -class GPUTracker: - def __init__( - self, - model_type: ModelType, - max_angle: float, - min_signal: float, - tc_threshold: float, - step_size: float, - relative_peak_thresh: float, - min_separation_angle: float, - dataf: np.ndarray, - H: np.ndarray, - R: np.ndarray, - delta_b: np.ndarray, - delta_q: np.ndarray, # TODO: some of these only needed for boot - b0s_mask: np.ndarray, - metric_map: np.ndarray, - sampling_matrix: np.ndarray, - sphere_vertices: np.ndarray, - sphere_edges: np.ndarray, - ngpus: int = 1, - rng_seed: int = 0, - rng_offset: int = 0, - ): - for name, arr, dt in [ - ("dataf", dataf, REAL_DTYPE), - ("H", H, REAL_DTYPE), - ("R", R, REAL_DTYPE), - ("delta_b", delta_b, REAL_DTYPE), - ("delta_q", delta_q, REAL_DTYPE), - ("b0s_mask", b0s_mask, np.int32), - ("metric_map", metric_map, REAL_DTYPE), - ("sampling_matrix", sampling_matrix, REAL_DTYPE), - ("sphere_vertices", sphere_vertices, REAL_DTYPE), - ("sphere_edges", sphere_edges, np.int32), - ]: - if arr.dtype != dt: - raise TypeError(f"{name} must have dtype {dt}, got {arr.dtype}") - if not arr.flags.c_contiguous: - raise ValueError(f"{name} must be C-contiguous") - - self.dataf = dataf - self.H = H - self.R = R - self.delta_b = delta_b - self.delta_q = delta_q - self.b0s_mask = b0s_mask - self.metric_map = metric_map - self.sampling_matrix = sampling_matrix - self.sphere_vertices = sphere_vertices - self.sphere_edges = sphere_edges - - self.dimx, self.dimy, self.dimz, self.dimt = dataf.shape - self.nedges = int(sphere_edges.shape[0]) - self.delta_nr = int(delta_b.shape[0]) - self.samplm_nr = int(sampling_matrix.shape[0]) - - self.model_type = int(model_type) - self.max_angle = REAL_DTYPE(max_angle) - self.min_signal = 
REAL_DTYPE(min_signal) - self.tc_threshold = REAL_DTYPE(tc_threshold) - self.step_size = REAL_DTYPE(step_size) - self.relative_peak_thresh = REAL_DTYPE(relative_peak_thresh) - self.min_separation_angle = REAL_DTYPE(min_separation_angle) - - self.ngpus = int(ngpus) - self.rng_seed = int(rng_seed) - self.rng_offset = int(rng_offset) - - self.nSlines_old = [] - self.slines = [] - self.sline_lens = [] - - checkCudaErrors(driver.cuInit(0)) - avail = checkCudaErrors(runtime.cudaGetDeviceCount()) - if self.ngpus > avail: - raise RuntimeError(f"Requested {self.ngpus} GPUs but only {avail} available") - - logger.info("Creating GPUTracker with %d GPUs...", self.ngpus) - - self.dataf_pts = [] - self.H_pts = [] - self.R_pts = [] - self.delta_b_pts = [] - self.delta_q_pts = [] - self.b0s_mask_pts = [] - self.metric_map_pts = [] - self.sampling_matrix_pts = [] - self.sphere_vertices_pts = [] - self.sphere_edges_pts = [] - - for ii in range(self.ngpus): - checkCudaErrors(runtime.cudaSetDevice(ii)) - self.dataf_pts.append( # TODO: put this in texture memory? - checkCudaErrors(runtime.cudaMallocManaged( - REAL_SIZE*self.dataf.size, - runtime.cudaMemAttachGlobal))) - checkCudaErrors(runtime.cudaMemAdvise( - self.dataf_pts[ii], - REAL_SIZE*self.dataf.size, - runtime.cudaMemAdviseSetPreferredLocation, - ii)) - self.H_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.H.size))) - self.R_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.R.size))) - self.delta_b_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.delta_b.size))) - self.delta_q_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.delta_q.size))) - self.b0s_mask_pts.append( - checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes*self.b0s_mask.size))) - self.metric_map_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.metric_map.size))) - self.sampling_matrix_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.sampling_matrix.size))) - self.sphere_vertices_pts.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.sphere_vertices.size))) - self.sphere_edges_pts.append( - checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes*self.sphere_edges.size))) - - checkCudaErrors(runtime.cudaMemcpy( - self.dataf_pts[ii], - self.dataf.ctypes.data, - REAL_SIZE*self.dataf.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.H_pts[ii], - self.H.ctypes.data, - REAL_SIZE*self.H.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.R_pts[ii], - self.R.ctypes.data, - REAL_SIZE*self.R.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.delta_b_pts[ii], - self.delta_b.ctypes.data, - REAL_SIZE*self.delta_b.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.delta_q_pts[ii], - self.delta_q.ctypes.data, - REAL_SIZE*self.delta_q.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.b0s_mask_pts[ii], - self.b0s_mask.ctypes.data, - np.int32().nbytes*self.b0s_mask.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.metric_map_pts[ii], - self.metric_map.ctypes.data, - REAL_SIZE*self.metric_map.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.sampling_matrix_pts[ii], - self.sampling_matrix.ctypes.data, - REAL_SIZE*self.sampling_matrix.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - 
self.sphere_vertices_pts[ii], - self.sphere_vertices.ctypes.data, - REAL_SIZE*self.sphere_vertices.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.sphere_edges_pts[ii], - self.sphere_edges.ctypes.data, - np.int32().nbytes*self.sphere_edges.size, - runtime.cudaMemcpyHostToDevice)) - - self.streams = [] - for ii in range(self.ngpus): - checkCudaErrors(runtime.cudaSetDevice(ii)) - self.streams.append( - checkCudaErrors(runtime.cudaStreamCreateWithFlags( - runtime.cudaStreamNonBlocking))) - - def generate_streamlines(self, seeds): # TODO: location this is going should be these arguments - nseeds = len(seeds) - nseeds_per_gpu = (nseeds + self.ngpus - 1) // self.ngpus - - seeds_ptrs = [] - - for ii in range(self.ngpus): - nseeds_gpu = min(nseeds_per_gpu, max(0, nseeds - ii * nseeds_per_gpu)) - checkCudaErrors(runtime.cudaSetDevice(ii)) - seeds_ptrs.append(checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*3*nseeds_gpu))) - checkCudaErrors(runtime.cudaMemcpy( - seeds_ptrs[ii], - seeds[ii*nseeds_per_gpu:(ii+1)*nseeds_per_gpu].ctypes.data, - REAL_SIZE*3*nseeds_gpu, - runtime.cudaMemcpyHostToDevice)) - - nSlines = [0] * self.ngpus # TODO: figure out what this is doing - # TODO: - # // Call GPU routine - # generate_streamlines_cuda_mgpu(model_type_, max_angle_, min_signal_, tc_threshold_, step_size_, - # relative_peak_thresh_, min_separation_angle_, - # nseeds, seeds_d, - # dimx_, dimy_, dimz_, dimt_, - # dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d, - # sphere_vertices_d, sphere_edges_d, nedges_, - # slines_, slinesLen_, nSlines, nSlines_old_, rng_seed_, rng_offset_, ngpus_, - # streams_); - - self.nSlines_old = nSlines.copy() # TODO: figure out what this is doing - self.rng_offset += nseeds - - nSlines_total = 0 - for ii in range(self.ngpus): - checkCudaErrors(runtime.cudaFree(seeds_ptrs[ii])) - nSlines_total += nSlines[ii] - - - # TODO - # std::vector> slines_list; - # slines_list.reserve(nSlines_total); - # for (int n = 0; n < ngpus_; ++n) { - # for (int i = 0; i < nSlines[n]; ++i) { - # REAL* sl = new REAL[slinesLen_[n][i]*3]; - # std::memcpy(sl, slines_[n] + i*3*2*MAX_SLINE_LEN, slinesLen_[n][i]*3*sizeof(*sl)); - # auto sl_arr = py::array_t({slinesLen_[n][i], 3}, // shape - # {3*sizeof(REAL), sizeof(REAL)}, // strides - # sl, - # cleanup(sl)); - # slines_list.push_back(sl_arr); - # } - # } - - # return slines_list; - - # } \ No newline at end of file diff --git a/cuslines/globals.py b/cuslines/globals.py deleted file mode 100644 index e69de29..0000000 From 7f77687dc9339f99a0232758949a37a2dece7427 Mon Sep 17 00:00:00 2001 From: 36000 Date: Thu, 1 Jan 2026 15:13:57 -0800 Subject: [PATCH 18/31] new folder --- cuslines/cu_direction_getters.py | 308 +++++++++++++++++++++++++++++++ cuslines/cu_propagate_seeds.py | 218 ++++++++++++++++++++++ cuslines/cu_tractography.py | 179 ++++++++++++++++++ cuslines/cutils.py | 65 +++++++ 4 files changed, 770 insertions(+) create mode 100644 cuslines/cu_direction_getters.py create mode 100644 cuslines/cu_propagate_seeds.py create mode 100644 cuslines/cu_tractography.py create mode 100644 cuslines/cutils.py diff --git a/cuslines/cu_direction_getters.py b/cuslines/cu_direction_getters.py new file mode 100644 index 0000000..cbf2959 --- /dev/null +++ b/cuslines/cu_direction_getters.py @@ -0,0 +1,308 @@ +import numpy as np +from abc import ABC, abstractmethod +import logging +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch + +from cuda.bindings 
import runtime +from cuda.core import Device + +from cutils import ( + REAL_SIZE, + REAL_DTYPE, + checkCudaErrors, +) + + +__all__ = [ + "ProbDirectionGetter", + "PTTDirectionGetter", + "BootDirectionGetter" +] + + +logger = logging.getLogger("GPUStreamlines") + + +_program = None + + +def _compile_program(debug=False): # TODO: compile kernels individually as needed + if _program is None: + logger.info("Compiling GPUStreamlines") + dev = Device() + dev.set_current() + + if debug: + comp_kwargs = { + "debug": True, + "lineinfo": True, + "device_code_optimize": True, + "ptxas_options": ["-v", "-O0"] + } + else: + comp_kwargs = {"ptxas_options": ["-O3"]} + program_options = ProgramOptions( # include_path maybe needed here? + name="GPUStreamlines", + arch=f"sm_{dev.arch}", + use_fast_math=True, + extra_device_vectorization=True, + std="c++11", + **comp_kwargs + ) + prog = Program(code, code_type="c++", options=program_options) + _program = prog.compile("cubin", name_expressions=("vector_add",)) + + +class _GPUDirectionGetter(ABC): + @abstractmethod + def get_direction(self): + pass + + @abstractmethod + def get_num_streamlines(self): + pass + + @abstractmethod + def allocate_on_gpu(self): + pass + + @abstractmethod + def deallocate_on_gpu(self): + pass + + +class BootDirectionGetter(_GPUDirectionGetter): + def __init__( # TODO: Maybe accept a dipy thing and extract arrays here? maybe as a from_ function? + self, + min_signal: float, + H: np.ndarray, + R: np.ndarray, + delta_b: np.ndarray, + delta_q: np.ndarray, + sampling_matrix: np.ndarray, + b0s_mask: np.ndarray): + for name, arr, dt in [ + ("H", H, REAL_DTYPE), + ("R", R, REAL_DTYPE), + ("delta_b", delta_b, REAL_DTYPE), + ("delta_q", delta_q, REAL_DTYPE), + ("b0s_mask", b0s_mask, np.int32), + ("sampling_matrix", sampling_matrix, REAL_DTYPE)]: + if arr.dtype != dt: + raise TypeError(f"{name} must have dtype {dt}, got {arr.dtype}") + if not arr.flags.c_contiguous: + raise ValueError(f"{name} must be C-contiguous") + + self.H = H + self.R = R + self.delta_b = delta_b + self.delta_q = delta_q + self.delta_nr = int(delta_b.shape[0]) + self.min_signal = REAL_DTYPE(min_signal) + self.sampling_matrix = sampling_matrix + + self.H_d = [] + self.R_d = [] + self.delta_b_d = [] + self.delta_q_d = [] + self.b0s_mask_d = [] + self.sampling_matrix_d = [] + + def allocate_on_gpu(self, n): + self.H_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.H.size))) + self.R_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.R.size))) + self.delta_b_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.delta_b.size))) + self.delta_q_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.delta_q.size))) + self.b0s_mask_d.append( + checkCudaErrors(runtime.cudaMalloc( + np.int32().nbytes*self.b0s_mask.size))) + self.sampling_matrix_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.sampling_matrix.size))) + + checkCudaErrors(runtime.cudaMemcpy( + self.H_d[n], + self.H.ctypes.data, + REAL_SIZE*self.H.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.R_d[n], + self.R.ctypes.data, + REAL_SIZE*self.R.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.delta_b_d[n], + self.delta_b.ctypes.data, + REAL_SIZE*self.delta_b.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.delta_q_d[n], + self.delta_q.ctypes.data, + REAL_SIZE*self.delta_q.size, + runtime.cudaMemcpyHostToDevice)) + 
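+        # (Editorial sketch, not part of the original patch.) Every array in
+        # this method is staged the same way: cudaMalloc, then a blocking
+        # cudaMemcpy from the NumPy buffer. A hypothetical helper like
+        # `_to_device` below would collapse the repetition; arr.nbytes equals
+        # REAL_SIZE*arr.size for REAL_DTYPE arrays, so the explicit size
+        # arithmetic used in this method is equivalent.
+        #
+        #     def _to_device(arr):
+        #         ptr = checkCudaErrors(runtime.cudaMalloc(arr.nbytes))
+        #         checkCudaErrors(runtime.cudaMemcpy(
+        #             ptr, arr.ctypes.data, arr.nbytes,
+        #             runtime.cudaMemcpyHostToDevice))
+        #         return ptr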
checkCudaErrors(runtime.cudaMemcpy( + self.b0s_mask_d[n], + self.b0s_mask.ctypes.data, + np.int32().nbytes*self.b0s_mask.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.sampling_matrix_d[n], + self.sampling_matrix.ctypes.data, + REAL_SIZE*self.sampling_matrix.size, + runtime.cudaMemcpyHostToDevice)) + + def deallocate_on_gpu(self, n): + if self.H_d[n]: + checkCudaErrors(runtime.cudaFree(self.H_d[n])) + if self.R_d[n]: + checkCudaErrors(runtime.cudaFree(self.R_d[n])) + if self.delta_b_d[n]: + checkCudaErrors(runtime.cudaFree(self.delta_b_d[n])) + if self.delta_q_d[n]: + checkCudaErrors(runtime.cudaFree(self.delta_q_d[n])) + if self.b0s_mask_d[n]: + checkCudaErrors(runtime.cudaFree(self.b0s_mask_d[n])) + if self.sampling_matrix_d[n]: + checkCudaErrors(runtime.cudaFree(self.sampling_matrix_d[n])) + + def getNumStreamlines(self): + pass + + def generateStreamlines(self): + pass + + + + +// Precompute number of streamlines before allocating memory +if (!((model_type == PTT) || (model_type == PROB))) { + shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d + sizeof(int)*samplm_nr; // for peak_directions_d + getNumStreamlinesBoot_k + <<>>( + model_type, + max_angle, + min_signal, + relative_peak_thresh, + min_separation_angle, + rng_seed, + nseeds_gpu, + reinterpret_cast(seeds_d[n]), + dimx, + dimy, + dimz, + dimt, + dataf_d[n], + H_d[n], + R_d[n], + delta_nr, + delta_b_d[n], + delta_q_d[n], + b0s_mask_d[n], + samplm_nr, + sampling_matrix_d[n], + reinterpret_cast(sphere_vertices_d[n]), + reinterpret_cast(sphere_edges_d[n]), + nedges, + shDirTemp0_d[n], + slinesOffs_d[n]); +} else { + shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt; + getNumStreamlinesProb_k + <<>>( + max_angle, + relative_peak_thresh, + min_separation_angle, + rng_seed, + nseeds_gpu, + reinterpret_cast(seeds_d[n]), + dimx, + dimy, + dimz, + dimt, + dataf_d[n], + reinterpret_cast(sphere_vertices_d[n]), + reinterpret_cast(sphere_edges_d[n]), + nedges, + shDirTemp0_d[n], + slinesOffs_d[n]); +} + + + //#pragma omp parallel for + for (int n = 0; n < ngpus; ++n) { + CHECK_CUDA(cudaSetDevice(n)); + int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); + if (nseeds_gpu == 0) continue; + dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); + dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); +#if 0 + std::cerr << "GPU " << n << ": "; + std::cerr << "Generating " << nSlines_h[n] << " streamlines (from " << nseeds_gpu << " seeds)" << std::endl; +#endif + + //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y); + switch(model_type) { + case OPDT: + genStreamlinesMerge_k <<>>( + max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), + dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], + b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], + reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), + nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + break; + + case CSA: + genStreamlinesMerge_k <<>>( + max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), + dimx, dimy, dimz, dimt, dataf_d[n], 
H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], + b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], + reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), + nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + break; + + case PROB: + // Shared memory requirements are smaller for probabilistic for main run + // than for preliminary run + shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt; + genStreamlinesMerge_k <<>>( + max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), + dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], + b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], + reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), + nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + break; + + case PTT: + shSizeGNS = 0; // PTT uses exclusively static shared memory + genStreamlinesMerge_k <<>>( + max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), + dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], + b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], + reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), + nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + break; + + default: + printf("FATAL: Invalid Model Type.\n"); + break; + } + + CHECK_ERROR("genStreamlinesMerge_k"); + } + + diff --git a/cuslines/cu_propagate_seeds.py b/cuslines/cu_propagate_seeds.py new file mode 100644 index 0000000..a334da6 --- /dev/null +++ b/cuslines/cu_propagate_seeds.py @@ -0,0 +1,218 @@ +import numpy as np +import ctypes +from cuda.bindings import runtime +from nibabel.streamlines.array_sequence import ArraySequence +import logging + +from cutils import ( + REAL_SIZE, + REAL_DTYPE, + REAL3_DTYPE, + MAX_SLINE_LEN, + EXCESS_ALLOC_FACT, + THR_X_SL, + THR_X_BL, + div_up, + checkCudaErrors, +) + + +logger = logging.getLogger("GPUStreamlines") + + +class SeedBatchPropagator: + def __init__( + self, + gpu_tracker): + self.gpu_tracker = gpu_tracker + + self.nSlines_old = np.zeros(self.ngpus, dtype=np.int32) + self.nSlines = np.zeros(self.ngpus, dtype=np.int32) + self.slines = np.zeros(self.ngpus, dtype=ctypes.c_void_p) + self.sline_lens = np.zeros(self.ngpus, dtype=ctypes.c_void_p) + + self.seeds_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + self.slineSeed_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + self.slinesOffs_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + self.shDirTemp0_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + self.slineLen_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + self.sline_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + + def _switch_device(self, n): + checkCudaErrors(runtime.cudaSetDevice(n)) + + nseeds_gpu = min( + self.nseeds_per_gpu, max(0, self.nseeds - n * self.nseeds_per_gpu)) + block = (THR_X_SL, THR_X_BL//THR_X_SL, 1) + grid = (div_up(nseeds_gpu, THR_X_BL//THR_X_SL), 1, 1) + + return nseeds_gpu, block, grid + + def _get_sl_buffer_size(self, n): + return REAL_SIZE*2*3*MAX_SLINE_LEN*self.nSlines[n] + + def _allocate_seed_memory(self): + # Move seeds to GPU + for ii in range(self.ngpus): + nseeds_gpu, _, _ = 
self._switch_device(ii) + self.seeds_d[ii] = checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*3*nseeds_gpu)) + checkCudaErrors(runtime.cudaMemcpy( + self.seeds_d[ii], + self.seeds[ii*self.nseeds_per_gpu:(ii+1)*self.nseeds_per_gpu].ctypes.data, + REAL_SIZE*3*nseeds_gpu, + runtime.cudaMemcpyHostToDevice)) + + for ii in range(self.ngpus): + nseeds_gpu, block, grid = self._switch_device(ii) + # Streamline offsets + self.slinesOffs_d[ii] = checkCudaErrors(runtime.cudaMalloc( + np.uint64().nbytes * (nseeds_gpu + 1))) + # Initial directions from each seed + self.shDirTemp0_d[ii] = checkCudaErrors(runtime.cudaMalloc( + REAL3_DTYPE.nbytes * self.samplm_nr * grid[0] * block[1])) + + def _cumsum_offsets(self): + for ii in range(self.ngpus): + nseeds_gpu, _, _ = self._switch_device(ii) + if (nseeds_gpu == 0): + self.nSlines[ii] = 0 + continue + + slinesOffs_h = np.empty(nseeds_gpu + 1, dtype=np.int32) + checkCudaErrors(runtime.cudaMemcpy( + slinesOffs_h.ctypes.data, + self.slinesOffs_d[ii], + slinesOffs_h.nbytes * (nseeds_gpu + 1), + runtime.cudaMemcpyDeviceToHost)) + + slinesOffs_h = np.concatenate(( + [0], np.cumsum(slinesOffs_h[:-1], dtype=slinesOffs_h.dtype))) + self.nSlines[ii] = int(slinesOffs_h[-1]) + + checkCudaErrors(runtime.cudaMemcpy( + self.slinesOffs_d[ii], + slinesOffs_h.ctypes.data, + self.slinesOffs_d.size * (nseeds_gpu + 1), + runtime.cudaMemcpyHostToDevice)) + + def _allocate_tracking_memory(self): + for ii in range(self.ngpus): + self._switch_device(ii) + + self.slineSeed_d[ii] = checkCudaErrors(runtime.cudaMalloc( + self.nSlines[ii] * np.int32().nbytes)) + checkCudaErrors(runtime.cudaMemset( + self.slineSeed_d[ii], + -1, + self.nSlines[ii] * np.int32().nbytes)) + + if self.nSlines[ii] > EXCESS_ALLOC_FACT*self.nSlines_old[ii]: + if self.slines[ii]: + checkCudaErrors(runtime.cudaFreeHost( + self.slines[ii])) + if self.sline_lens[ii]: + checkCudaErrors(runtime.cudaFreeHost( + self.sline_lens[ii])) + self.slines[ii] = 0 # Nullptr + self.sline_lens[ii] = 0 # Nullptr + + buffer_size = self._get_sl_buffer_size(ii) + logger.debug(f"Streamline buffer size: {buffer_size}") + + if not self.slines[ii]: + self.slines[ii] = checkCudaErrors(runtime.cudaMallocHost( + buffer_size)) + if not self.slines_lens[ii]: + self.slines_lens[ii] = checkCudaErrors(runtime.cudaMallocHost( + np.int32().nbytes*EXCESS_ALLOC_FACT*self.nSlines[ii])) + + for ii in range(self.ngpus): + self._switch_device(ii) + buffer_size = self._get_sl_buffer_size(ii) + + self.slineLen_d[ii] = checkCudaErrors(runtime.cudaMalloc( + np.int32().nbytes * self.nSlines[ii])) + self.sline_d[ii] = checkCudaErrors(runtime.cudaMalloc( + buffer_size)) + + def _cleanup(self): + for ii in range(self.ngpus): + self._switch_device(ii) + checkCudaErrors(runtime.cudaMemcpyAsync( + self.slines[ii], + self.sline_d[ii], + self._get_sl_buffer_size(ii), + runtime.cudaMemcpyDeviceToHost, + self.gpu_tracker.streams[ii])) + checkCudaErrors(runtime.cudaMemcpyAsync( + self.sline_lens[ii], + self.slineLen_d[ii], + np.int32().nbytes*self.nSlines[ii], + runtime.cudaMemcpyDeviceToHost, + self.gpu_tracker.streams[ii])) + + for ii in range(self.ngpus): + self._switch_device(ii) + checkCudaErrors(runtime.cudaStreamSynchronize( + self.gpu_tracker.streams[ii])) + checkCudaErrors(runtime.cudaFree(self.seeds_d[ii])) + checkCudaErrors(runtime.cudaFree(self.slineSeed_d[ii])) + checkCudaErrors(runtime.cudaFree(self.slinesOffs_d[ii])) + checkCudaErrors(runtime.cudaFree(self.shDirTemp0_d[ii])) + checkCudaErrors(runtime.cudaFree(self.slineLen_d[ii])) + 
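+            # (Editorial note.) Freeing these buffers here is safe: the
+            # cudaStreamSynchronize call above guarantees that the
+            # asynchronous device-to-host copies of sline_d and slineLen_d
+            # issued in the previous loop have completed before the device
+            # memory is released.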
checkCudaErrors(runtime.cudaFree(self.sline_d[ii])) + + self.nSlines_old = self.nSlines.copy() + self.rng_offset += self.nseeds + + def propagate(self, seeds): + self.seeds = seeds + self.nseeds = len(seeds) + self.nseeds_per_gpu = (self.nseeds + self.gpu_tracker.ngpus - 1) // self.gpu_tracker.ngpus + + self._seeds_to_gpu() + self._allocate_seed_memory() + + for ii in range(self.ngpus): + nseeds_gpu, block, grid = self._switch_device(ii) + if (nseeds_gpu == 0): + continue + + getNumStreamlines() # TODO: these will each be classes you can pass in + + self._cumsum_offsets() + self._allocate_tracking_memory() + + for ii in range(self.ngpus): + nseeds_gpu, block, grid = self._switch_device(ii) + if (nseeds_gpu == 0): + continue + + mergeStreamlines() # TODO + + self._cleanup() + + def as_array_sequence(self): # TODO: optimize memory usage here? also, direct to trx? + buffer_size = 0 + for ii in range(self.ngpus): + lens = self.sline_lens[ii] + for jj in range(self.nSlines[ii]): + buffer_size += lens[jj] * 3 * REAL_SIZE + + def _yield_slines(): + for ii in range(self.ngpus): + this_sls = self.slines[ii] + this_len = self.sline_lens[ii] + + for jj in range(self.nSlines[ii]): + npts = this_len[jj] + offset = jj * 3 * 2 * MAX_SLINE_LEN + + sl = np.asarray( + this_sls[offset : offset + npts * 3], + dtype=REAL_DTYPE) + sl = sl.reshape((npts, 3)) + yield sl + + return ArraySequence(_yield_slines, buffer_size) diff --git a/cuslines/cu_tractography.py b/cuslines/cu_tractography.py new file mode 100644 index 0000000..acfcc96 --- /dev/null +++ b/cuslines/cu_tractography.py @@ -0,0 +1,179 @@ +from cuda.bindings import driver, runtime +# TODO: consider cuda core over cuda bindings + +import numpy as np +import logging + +from cutils import ( + REAL_SIZE, + REAL_DTYPE, + checkCudaErrors, +) +from cu_direction_getters import ( + GPUDirectionGetter, + BootDirectionGetter +) +from cu_propagate_seeds import SeedBatchPropagator + + +logger = logging.getLogger("GPUStreamlines") + +# TODO: we need to organize this package into folders, then make it pip installable. 
+# but should merge in PTT FIRST +class GPUTracker: # TODO: bring in pyAFQ prep stuff + def __init__( + self, + dg: GPUDirectionGetter, + max_angle: float, + tc_threshold: float, + step_size: float, + relative_peak_thresh: float, + min_separation_angle: float, + dataf: np.ndarray, # TODO: reasonable defaults for floats, reorganize order, better names, documentation + metric_map: np.ndarray, + sphere_vertices: np.ndarray, + sphere_edges: np.ndarray, + ngpus: int = 1, + rng_seed: int = 0, + rng_offset: int = 0, + ): + for name, arr, dt in [ + ("dataf", dataf, REAL_DTYPE), + ("metric_map", metric_map, REAL_DTYPE), + ("sphere_vertices", sphere_vertices, REAL_DTYPE), + ("sphere_edges", sphere_edges, np.int32), + ]: + if arr.dtype != dt: + raise TypeError(f"{name} must have dtype {dt}, got {arr.dtype}") + if not arr.flags.c_contiguous: + raise ValueError(f"{name} must be C-contiguous") + + self.dataf = dataf + self.metric_map = metric_map + self.sphere_vertices = sphere_vertices + self.sphere_edges = sphere_edges + + self.dimx, self.dimy, self.dimz, self.dimt = dataf.shape + self.nedges = int(sphere_edges.shape[0]) + if isinstance(dg, BootDirectionGetter): + self.samplm_nr = int(dg.sampling_matrix.shape[0]) + else: + self.samplm_nr = self.dimt + + self.dg = dg + self.max_angle = REAL_DTYPE(max_angle) + self.tc_threshold = REAL_DTYPE(tc_threshold) + self.step_size = REAL_DTYPE(step_size) + self.relative_peak_thresh = REAL_DTYPE(relative_peak_thresh) + self.min_separation_angle = REAL_DTYPE(min_separation_angle) + + self.ngpus = int(ngpus) + self.rng_seed = int(rng_seed) + self.rng_offset = int(rng_offset) + + checkCudaErrors(driver.cuInit(0)) + avail = checkCudaErrors(runtime.cudaGetDeviceCount()) + if self.ngpus > avail: + raise RuntimeError(f"Requested {self.ngpus} GPUs but only {avail} available") + + logger.info("Creating GPUTracker with %d GPUs...", self.ngpus) + + self.dataf_d = [] + self.metric_map_d = [] + self.sphere_vertices_d = [] + self.sphere_edges_d = [] + + self.seed_propagator = SeedBatchPropagator( + gpu_tracker=self) + self._allocated = False + + def __enter__(self): + self._allocate() + return self + + def _allocate(self): + if self._allocated: + return + + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(ii)) + self.dataf_d.append( # TODO: put this in texture memory? 
+ checkCudaErrors(runtime.cudaMallocManaged( # TODO: look at cuda core managed memory + REAL_SIZE*self.dataf.size, + runtime.cudaMemAttachGlobal))) + checkCudaErrors(runtime.cudaMemAdvise( + self.dataf_d[ii], + REAL_SIZE*self.dataf.size, + runtime.cudaMemAdviseSetPreferredLocation, + ii)) + self.metric_map_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.metric_map.size))) + self.sphere_vertices_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.sphere_vertices.size))) + self.sphere_edges_d.append( + checkCudaErrors(runtime.cudaMalloc( + np.int32().nbytes*self.sphere_edges.size))) + + checkCudaErrors(runtime.cudaMemcpy( + self.dataf_d[ii], + self.dataf.ctypes.data, + REAL_SIZE*self.dataf.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.metric_map_d[ii], + self.metric_map.ctypes.data, + REAL_SIZE*self.metric_map.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.sphere_vertices_d[ii], + self.sphere_vertices.ctypes.data, + REAL_SIZE*self.sphere_vertices.size, + runtime.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.sphere_edges_d[ii], + self.sphere_edges.ctypes.data, + np.int32().nbytes*self.sphere_edges.size, + runtime.cudaMemcpyHostToDevice)) + + self.dg.allocate_on_gpu(ii) + + self.streams = [] + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(ii)) + self.streams.append( + checkCudaErrors(runtime.cudaStreamCreateWithFlags( + runtime.cudaStreamNonBlocking))) + + self._allocated = True + + def __exit__(self, exc_type, exc, tb): + logger.info("Destroying GPUTracker and freeing GPU memory...") + + for n in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(n)) + if self.dataf_d[n]: + checkCudaErrors(runtime.cudaFree(self.dataf_d[n])) + if self.metric_map_d[n]: + checkCudaErrors(runtime.cudaFree(self.metric_map_d[n])) + if self.sphere_vertices_d[n]: + checkCudaErrors(runtime.cudaFree(self.sphere_vertices_d[n])) + if self.sphere_edges_d[n]: + checkCudaErrors(runtime.cudaFree(self.sphere_edges_d[n])) + + if self.seed_propagator.sline_lens[n]: + checkCudaErrors(runtime.cudaFreeHost( + self.seed_propagator.sline_lens[n])) + if self.seed_propagator.slines[n]: + checkCudaErrors(runtime.cudaFreeHost( + self.seed_propagator.slines[n])) + + self.dg.deallocate_on_gpu(n) + + checkCudaErrors(runtime.cudaStreamDestroy(self.streams[n])) + return False + + def generate_streamlines(self, seeds): + self.seed_propagator.propagate(seeds) + return self.seed_propagator.as_array_sequence() diff --git a/cuslines/cutils.py b/cuslines/cutils.py new file mode 100644 index 0000000..4d75847 --- /dev/null +++ b/cuslines/cutils.py @@ -0,0 +1,65 @@ +from cuda.bindings import driver, nvrtc + +import re +import os +import numpy as np + +from enum import IntEnum + + +class ModelType(IntEnum): + OPDT = 0 + CSA = 1 + PROB = 2 + PTT = 3 + + +# We extract REAL_DTYPE, MAX_SLINE_LEN from globals.h +# Maybe there is a more elegant way of doing this? 
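+# (Editorial example.) A globals.h line such as
+#     #define REAL_SIZE 4 // bytes per REAL
+# is captured by the regex below as ("REAL_SIZE", "4"): \w+ takes the macro
+# name, and [^\s/]+ takes the value but stops before whitespace or a
+# trailing // comment, so inline comments are not swallowed into the value.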
+dir_path = os.path.dirname(os.path.abspath(__file__)) +globals_path = os.path.join(dir_path, "globals.h") +with open(globals_path, 'r') as f: + content = f.read() + +defines = dict(re.findall(r"#define\s+(\w+)\s+([^\s/]+)", content)) +REAL_SIZE = int(defines["REAL_SIZE"]) +REAL3_SIZE = 3 * REAL_SIZE +if REAL_SIZE == 4: + REAL_DTYPE = np.float32 + REAL3_DTYPE = np.dtype([('x', np.float32), + ('y', np.float32), + ('z', np.float32)]) +elif REAL_SIZE == 8: + REAL_DTYPE = np.float64 + REAL3_DTYPE = np.dtype([('x', np.float64), + ('y', np.float64), + ('z', np.float64)]) +else: + raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") +MAX_SLINE_LEN = int(defines["MAX_SLINE_LEN"]) +THR_X_SL = int(defines["THR_X_SL"]) +THR_X_BL = int(defines["THR_X_BL"]) +EXCESS_ALLOC_FACT = int(defines["EXCESS_ALLOC_FACT"]) + + +def _cudaGetErrorEnum(error): + if isinstance(error, driver.CUresult): + err, name = driver.cuGetErrorName(error) + return name if err == driver.CUresult.CUDA_SUCCESS else "" + elif isinstance(error, nvrtc.nvrtcResult): + return nvrtc.nvrtcGetErrorString(error)[1] + else: + raise RuntimeError('Unknown error type: {}'.format(error)) + +def checkCudaErrors(result): + if result[0].value: + raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) + if len(result) == 1: + return None + elif len(result) == 2: + return result[1] + else: + return result[1:] + +def div_up(a, b): + return (a + b - 1) // b From 544445b99b6a3182ab176c29316b2d871b5b18b9 Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 09:54:16 -0800 Subject: [PATCH 19/31] first draft of cuda python GPUstreamlines; currently broken --- cuslines/Makefile | 6 +- cuslines/__init__.py | 13 + cuslines/cu_direction_getters.py | 308 -------------- cuslines/cuda_python/__init__.py | 13 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 385 bytes .../__pycache__/_globals.cpython-312.pyc | Bin 0 -> 389 bytes .../cu_direction_getters.cpython-312.pyc | Bin 0 -> 22688 bytes .../cu_propagate_seeds.cpython-312.pyc | Bin 0 -> 14574 bytes .../cu_tractography.cpython-312.pyc | Bin 0 -> 9441 bytes .../__pycache__/cutils.cpython-312.pyc | Bin 0 -> 2922 bytes cuslines/cuda_python/_globals.py | 10 + cuslines/cuda_python/cu_direction_getters.py | 381 ++++++++++++++++++ .../{ => cuda_python}/cu_propagate_seeds.py | 103 ++--- cuslines/{ => cuda_python}/cu_tractography.py | 97 ++--- cuslines/{ => cuda_python}/cutils.py | 28 +- cuslines/cuwsort.cuh | 19 +- cuslines/generate_streamlines_cuda.cu | 153 ++++--- cuslines/globals.h | 33 +- cuslines/ptt.cu | 2 +- pyproject.toml | 14 +- run_gpu_streamlines.py | 2 +- setup.py | 49 +++ 22 files changed, 704 insertions(+), 527 deletions(-) delete mode 100644 cuslines/cu_direction_getters.py create mode 100644 cuslines/cuda_python/__init__.py create mode 100644 cuslines/cuda_python/__pycache__/__init__.cpython-312.pyc create mode 100644 cuslines/cuda_python/__pycache__/_globals.cpython-312.pyc create mode 100644 cuslines/cuda_python/__pycache__/cu_direction_getters.cpython-312.pyc create mode 100644 cuslines/cuda_python/__pycache__/cu_propagate_seeds.cpython-312.pyc create mode 100644 cuslines/cuda_python/__pycache__/cu_tractography.cpython-312.pyc create mode 100644 cuslines/cuda_python/__pycache__/cutils.cpython-312.pyc create mode 100644 cuslines/cuda_python/_globals.py create mode 100644 cuslines/cuda_python/cu_direction_getters.py rename cuslines/{ => cuda_python}/cu_propagate_seeds.py (69%) rename cuslines/{ => cuda_python}/cu_tractography.py 
(69%) rename cuslines/{ => cuda_python}/cutils.py (70%) create mode 100644 setup.py diff --git a/cuslines/Makefile b/cuslines/Makefile index c8fe6c7..8fd8528 100644 --- a/cuslines/Makefile +++ b/cuslines/Makefile @@ -31,14 +31,14 @@ CUDACC=$(CUDA_HOME)/bin/nvcc # -G -g -dopt=on CXX=g++ LD=g++ -CXXFLAGS= -c -O3 -std=c++11 -fopenmp -fPIC `python3 -m pybind11 --includes` -I$(CUDA_HOME)/include +CXXFLAGS= -c -O3 -std=c++17 -fopenmp -fPIC `python3 -m pybind11 --includes` -I$(CUDA_HOME)/include -SMS ?= 70 +SMS ?= 75 80 CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM)) LASTSM := $(lastword $(sort $(SMS))) CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM) -COMMON_FLAGS = -c -std=c++11 -Xcompiler -fPIC --use_fast_math -Xcompiler=-fopenmp $(CUDA_ARCH) +COMMON_FLAGS = -c -std=c++17 -Xcompiler -fPIC --use_fast_math -Xcompiler=-fopenmp $(CUDA_ARCH) RELEASE_FLAGS = -O3 -Xptxas=-O3 DEBUG_FLAGS = -O0 -Xptxas=-v -g -G -lineinfo CUDACFLAGS = $(COMMON_FLAGS) $(RELEASE_FLAGS) diff --git a/cuslines/__init__.py b/cuslines/__init__.py index e69de29..b96cca1 100644 --- a/cuslines/__init__.py +++ b/cuslines/__init__.py @@ -0,0 +1,13 @@ +from .cuda_python import ( + GPUTracker, + ProbDirectionGetter, + PttDirectionGetter, + BootDirectionGetter +) + +__all__ = [ + "GPUTracker", + "ProbDirectionGetter", + "PttDirectionGetter", + "BootDirectionGetter" +] diff --git a/cuslines/cu_direction_getters.py b/cuslines/cu_direction_getters.py deleted file mode 100644 index cbf2959..0000000 --- a/cuslines/cu_direction_getters.py +++ /dev/null @@ -1,308 +0,0 @@ -import numpy as np -from abc import ABC, abstractmethod -import logging -from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch - -from cuda.bindings import runtime -from cuda.core import Device - -from cutils import ( - REAL_SIZE, - REAL_DTYPE, - checkCudaErrors, -) - - -__all__ = [ - "ProbDirectionGetter", - "PTTDirectionGetter", - "BootDirectionGetter" -] - - -logger = logging.getLogger("GPUStreamlines") - - -_program = None - - -def _compile_program(debug=False): # TODO: compile kernels individually as needed - if _program is None: - logger.info("Compiling GPUStreamlines") - dev = Device() - dev.set_current() - - if debug: - comp_kwargs = { - "debug": True, - "lineinfo": True, - "device_code_optimize": True, - "ptxas_options": ["-v", "-O0"] - } - else: - comp_kwargs = {"ptxas_options": ["-O3"]} - program_options = ProgramOptions( # include_path maybe needed here? - name="GPUStreamlines", - arch=f"sm_{dev.arch}", - use_fast_math=True, - extra_device_vectorization=True, - std="c++11", - **comp_kwargs - ) - prog = Program(code, code_type="c++", options=program_options) - _program = prog.compile("cubin", name_expressions=("vector_add",)) - - -class _GPUDirectionGetter(ABC): - @abstractmethod - def get_direction(self): - pass - - @abstractmethod - def get_num_streamlines(self): - pass - - @abstractmethod - def allocate_on_gpu(self): - pass - - @abstractmethod - def deallocate_on_gpu(self): - pass - - -class BootDirectionGetter(_GPUDirectionGetter): - def __init__( # TODO: Maybe accept a dipy thing and extract arrays here? maybe as a from_ function? 
- self, - min_signal: float, - H: np.ndarray, - R: np.ndarray, - delta_b: np.ndarray, - delta_q: np.ndarray, - sampling_matrix: np.ndarray, - b0s_mask: np.ndarray): - for name, arr, dt in [ - ("H", H, REAL_DTYPE), - ("R", R, REAL_DTYPE), - ("delta_b", delta_b, REAL_DTYPE), - ("delta_q", delta_q, REAL_DTYPE), - ("b0s_mask", b0s_mask, np.int32), - ("sampling_matrix", sampling_matrix, REAL_DTYPE)]: - if arr.dtype != dt: - raise TypeError(f"{name} must have dtype {dt}, got {arr.dtype}") - if not arr.flags.c_contiguous: - raise ValueError(f"{name} must be C-contiguous") - - self.H = H - self.R = R - self.delta_b = delta_b - self.delta_q = delta_q - self.delta_nr = int(delta_b.shape[0]) - self.min_signal = REAL_DTYPE(min_signal) - self.sampling_matrix = sampling_matrix - - self.H_d = [] - self.R_d = [] - self.delta_b_d = [] - self.delta_q_d = [] - self.b0s_mask_d = [] - self.sampling_matrix_d = [] - - def allocate_on_gpu(self, n): - self.H_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.H.size))) - self.R_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.R.size))) - self.delta_b_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.delta_b.size))) - self.delta_q_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.delta_q.size))) - self.b0s_mask_d.append( - checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes*self.b0s_mask.size))) - self.sampling_matrix_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.sampling_matrix.size))) - - checkCudaErrors(runtime.cudaMemcpy( - self.H_d[n], - self.H.ctypes.data, - REAL_SIZE*self.H.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.R_d[n], - self.R.ctypes.data, - REAL_SIZE*self.R.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.delta_b_d[n], - self.delta_b.ctypes.data, - REAL_SIZE*self.delta_b.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.delta_q_d[n], - self.delta_q.ctypes.data, - REAL_SIZE*self.delta_q.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.b0s_mask_d[n], - self.b0s_mask.ctypes.data, - np.int32().nbytes*self.b0s_mask.size, - runtime.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.sampling_matrix_d[n], - self.sampling_matrix.ctypes.data, - REAL_SIZE*self.sampling_matrix.size, - runtime.cudaMemcpyHostToDevice)) - - def deallocate_on_gpu(self, n): - if self.H_d[n]: - checkCudaErrors(runtime.cudaFree(self.H_d[n])) - if self.R_d[n]: - checkCudaErrors(runtime.cudaFree(self.R_d[n])) - if self.delta_b_d[n]: - checkCudaErrors(runtime.cudaFree(self.delta_b_d[n])) - if self.delta_q_d[n]: - checkCudaErrors(runtime.cudaFree(self.delta_q_d[n])) - if self.b0s_mask_d[n]: - checkCudaErrors(runtime.cudaFree(self.b0s_mask_d[n])) - if self.sampling_matrix_d[n]: - checkCudaErrors(runtime.cudaFree(self.sampling_matrix_d[n])) - - def getNumStreamlines(self): - pass - - def generateStreamlines(self): - pass - - - - -// Precompute number of streamlines before allocating memory -if (!((model_type == PTT) || (model_type == PROB))) { - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d - sizeof(int)*samplm_nr; // for peak_directions_d - getNumStreamlinesBoot_k - <<>>( - model_type, - max_angle, - min_signal, - relative_peak_thresh, - min_separation_angle, - rng_seed, - nseeds_gpu, - reinterpret_cast(seeds_d[n]), - dimx, - dimy, - dimz, - dimt, - dataf_d[n], - H_d[n], - 
R_d[n], - delta_nr, - delta_b_d[n], - delta_q_d[n], - b0s_mask_d[n], - samplm_nr, - sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), - reinterpret_cast(sphere_edges_d[n]), - nedges, - shDirTemp0_d[n], - slinesOffs_d[n]); -} else { - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt; - getNumStreamlinesProb_k - <<>>( - max_angle, - relative_peak_thresh, - min_separation_angle, - rng_seed, - nseeds_gpu, - reinterpret_cast(seeds_d[n]), - dimx, - dimy, - dimz, - dimt, - dataf_d[n], - reinterpret_cast(sphere_vertices_d[n]), - reinterpret_cast(sphere_edges_d[n]), - nedges, - shDirTemp0_d[n], - slinesOffs_d[n]); -} - - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - if (nseeds_gpu == 0) continue; - dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); - dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); -#if 0 - std::cerr << "GPU " << n << ": "; - std::cerr << "Generating " << nSlines_h[n] << " streamlines (from " << nseeds_gpu << " seeds)" << std::endl; -#endif - - //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y); - switch(model_type) { - case OPDT: - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - case CSA: - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - case PROB: - // Shared memory requirements are smaller for probabilistic for main run - // than for preliminary run - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt; - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - case PTT: - shSizeGNS = 0; // PTT uses exclusively static shared memory - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, 
sampling_matrix_d[n],
-                            reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]),
-                            nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
-            break;
-
-        default:
-            printf("FATAL: Invalid Model Type.\n");
-            break;
-    }
-
-    CHECK_ERROR("genStreamlinesMerge_k");
-  }
-
-
diff --git a/cuslines/cuda_python/__init__.py b/cuslines/cuda_python/__init__.py
new file mode 100644
index 0000000..d0b42d4
--- /dev/null
+++ b/cuslines/cuda_python/__init__.py
@@ -0,0 +1,13 @@
+from .cu_tractography import GPUTracker
+from .cu_direction_getters import (
+    ProbDirectionGetter,
+    PttDirectionGetter,
+    BootDirectionGetter
+)
+
+__all__ = [
+    "GPUTracker",
+    "ProbDirectionGetter",
+    "PttDirectionGetter",
+    "BootDirectionGetter"
+]
[GIT binary patch literals omitted here: the diffstat above shows six compiled __pycache__/*.cpython-312.pyc files committed alongside the sources (__init__, 385 bytes; _globals, 389 bytes; cu_direction_getters, 22688 bytes; cu_propagate_seeds, 14574 bytes; cu_tractography, 9441 bytes; cutils, 2922 bytes); their base85 payloads are unreadable build artifacts and carry no source content]
literal 0 HcmV?d00001 diff --git a/cuslines/cuda_python/_globals.py b/cuslines/cuda_python/_globals.py new file mode 100644 index 0000000..c19368e --- /dev/null +++ b/cuslines/cuda_python/_globals.py @@ -0,0 +1,10 @@ +# AUTO-GENERATED FROM globals.h — DO NOT EDIT + +EXCESS_ALLOC_FACT = 2 +MAX_SLINES_PER_SEED = 10 +MAX_SLINE_LEN = 501 +NORM_EPS = 1e-08 +PMF_THRESHOLD_P = 0.05 +REAL_SIZE = 4 +THR_X_BL = 64 +THR_X_SL = 32 diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py new file mode 100644 index 0000000..2dc54cc --- /dev/null +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -0,0 +1,381 @@ +import numpy as np +from abc import ABC, abstractmethod +import logging +import ctypes +from importlib.resources import files +from time import time + +from cuda.core import Device, LaunchConfig, Program, launch, ProgramOptions +from cuda.pathfinder import find_nvidia_header_directory +from cuda.cccl import get_include_paths +from cuda.bindings import runtime +from cuda.bindings.runtime import cudaMemcpyKind + +from cuslines.cuda_python.cutils import ( + REAL_SIZE, + REAL_DTYPE, + REAL_DTYPE_AS_STR, + REAL3_DTYPE_AS_STR, + REAL_DTYPE_AS_CTYPE, + checkCudaErrors, + ModelType, + THR_X_SL, + BLOCK_Y, + DEV_PTR, +) + +logger = logging.getLogger("GPUStreamlines") + + +class GPUDirectionGetter(ABC): + @abstractmethod + def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): + pass + + @abstractmethod + def generateStreamlines(self): + pass + + def allocate_on_gpu(self, n): + pass + + def deallocate_on_gpu(self, n): + pass + + def compile_program(self, debug: bool = False): + start_time = time() + logger.info("Compiling GPUStreamlines") + + cuslines_cuda = files("cuslines") + + if debug: + program_opts = { + "ptxas_options": ["-O0", "-v"], + "device_code_optimize": True, + "debug": True, + "lineinfo": True, + } + else: + program_opts = { + "ptxas_options": ["-O3"] + } + + program_options = ProgramOptions( + name="cuslines", + use_fast_math=True, + std="c++17", + define_macro="__NVRTC__", + include_path=[ + str(cuslines_cuda), + find_nvidia_header_directory("cudart"), + find_nvidia_header_directory("curand"), + get_include_paths().libcudacxx], + **program_opts) + + # Here we assume all devices are the same, + # so we compile once for any current device. + # I think this is reasonable + dev = Device() + dev.set_current() + cuda_path = cuslines_cuda.joinpath("generate_streamlines_cuda.cu") + with open(cuda_path, "r") as f: + prog = Program(f.read(), code_type="c++", options=program_options) + self.module = prog.compile( + "cubin", + name_expressions=( + self.getnum_kernel_name, + self.genstreamlines_kernel_name, + )) + logger.info("GPUStreamlines compiled successfully in %.2f seconds", time() - start_time) + + +class _BootCtx(ctypes.Structure): + _fields_ = [ + ("min_signal", REAL_DTYPE_AS_CTYPE), + ("delta_nr", ctypes.c_int32), + ("H", ctypes.c_void_p), + ("R", ctypes.c_void_p), + ("delta_b", ctypes.c_void_p), + ("delta_q", ctypes.c_void_p), + ("sampling_matrix", ctypes.c_void_p), + ("b0s_mask", ctypes.c_void_p), + ] + + +class BootDirectionGetter(GPUDirectionGetter): + def __init__( # TODO: Maybe accept a dipy thing and extract arrays here? maybe as a from_ function? 
+ self, + model_type: str, + min_signal: float, + H: np.ndarray, + R: np.ndarray, + delta_b: np.ndarray, + delta_q: np.ndarray, + sampling_matrix: np.ndarray, + b0s_mask: np.ndarray): + if model_type.upper() == "OPDT": + self.model_type = int(ModelType.OPDT) + elif model_type.upper() == "CSA": + self.model_type = int(ModelType.CSA) + else: + raise ValueError(f"Invalid model_type {model_type}, must be one of 'OPDT', 'CSA'") + + self.H = np.ascontiguousarray(H, dtype=REAL_DTYPE) + self.R = np.ascontiguousarray(R, dtype=REAL_DTYPE) + self.delta_b = np.ascontiguousarray(delta_b, dtype=REAL_DTYPE) + self.delta_q = np.ascontiguousarray(delta_q, dtype=REAL_DTYPE) + self.delta_nr = int(delta_b.shape[0]) + self.min_signal = REAL_DTYPE(min_signal) + self.sampling_matrix = np.ascontiguousarray(sampling_matrix, dtype=REAL_DTYPE) + self.b0s_mask = np.ascontiguousarray(b0s_mask, dtype=np.int32) + self.ctx_h = [] + + self.H_d = [] + self.R_d = [] + self.delta_b_d = [] + self.delta_q_d = [] + self.b0s_mask_d = [] + self.sampling_matrix_d = [] + self.ctx_d = [] + + self.getnum_kernel_name = f"getNumStreamlinesBoot_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},{model_type.upper()},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.compile_program() + + def allocate_on_gpu(self, n): + self.H_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.H.size))) + self.R_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.R.size))) + self.delta_b_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.delta_b.size))) + self.delta_q_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.delta_q.size))) + self.b0s_mask_d.append( + checkCudaErrors(runtime.cudaMalloc( + np.int32().nbytes*self.b0s_mask.size))) + self.sampling_matrix_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.sampling_matrix.size))) + self.ctx_d.append( + checkCudaErrors(runtime.cudaMalloc( + ctypes.sizeof(_BootCtx)))) + self.ctx_h.append(_BootCtx( + min_signal=self.min_signal, + H=self.H_d[n], + R=self.R_d[n], + delta_b=self.delta_b_d[n], + delta_q=self.delta_q_d[n], + sampling_matrix=self.sampling_matrix_d[n], + b0s_mask=self.b0s_mask_d[n], + )) + + checkCudaErrors(runtime.cudaMemcpy( + self.H_d[n], + self.H.ctypes.data, + REAL_SIZE*self.H.size, + cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.R_d[n], + self.R.ctypes.data, + REAL_SIZE*self.R.size, + cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.delta_b_d[n], + self.delta_b.ctypes.data, + REAL_SIZE*self.delta_b.size, + cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.delta_q_d[n], + self.delta_q.ctypes.data, + REAL_SIZE*self.delta_q.size, + cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.b0s_mask_d[n], + self.b0s_mask.ctypes.data, + np.int32().nbytes*self.b0s_mask.size, + cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.sampling_matrix_d[n], + self.sampling_matrix.ctypes.data, + REAL_SIZE*self.sampling_matrix.size, + cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors(runtime.cudaMemcpy( + self.ctx_d[n], + ctypes.byref(self.ctx_h[n]), + ctypes.sizeof(_BootCtx), + cudaMemcpyKind.cudaMemcpyHostToDevice + )) + + def deallocate_on_gpu(self, n): + if self.H_d[n]: + checkCudaErrors(runtime.cudaFree(self.H_d[n])) + if self.R_d[n]: + 
checkCudaErrors(runtime.cudaFree(self.R_d[n])) + if self.delta_b_d[n]: + checkCudaErrors(runtime.cudaFree(self.delta_b_d[n])) + if self.delta_q_d[n]: + checkCudaErrors(runtime.cudaFree(self.delta_q_d[n])) + if self.b0s_mask_d[n]: + checkCudaErrors(runtime.cudaFree(self.b0s_mask_d[n])) + if self.sampling_matrix_d[n]: + checkCudaErrors(runtime.cudaFree(self.sampling_matrix_d[n])) + if self.ctx_d[n]: + checkCudaErrors(runtime.cudaFree(self.ctx_d[n])) + + def _shared_mem_bytes(self, sp): + return REAL_SIZE*BLOCK_Y*2*( + sp.gpu_tracker.n32dimt + max(sp.gpu_tracker.n32dimt, sp.gpu_tracker.samplm_nr)) + \ + np.int32().nbytes*BLOCK_Y*sp.gpu_tracker.samplm_nr + + def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): + ker = self.module.get_kernel(self.getnum_kernel_name) + shared_memory = self._shared_mem_bytes(sp) + config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) + + launch( + sp.gpu_tracker.streams[n], config, ker, + self.model_type, + sp.gpu_tracker.max_angle, + sp.gpu_tracker.min_separation_angle, + sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.rng_seed, + nseeds_gpu, + sp.seeds_d[n], + sp.gpu_tracker.dimx, + sp.gpu_tracker.dimy, + sp.gpu_tracker.dimz, + sp.gpu_tracker.dimt, + sp.gpu_tracker.dataf_d[n].handle, + self.H_d[n], + self.R_d[n], + self.delta_nr, + self.delta_b_d[n], + self.delta_q_d[n], + self.b0s_mask_d[n], + sp.gpu_tracker.samplm_nr, + self.sampling_matrix_d[n], + sp.gpu_tracker.sphere_vertices_d[n], + sp.gpu_tracker.sphere_edges_d[n], + sp.gpu_tracker.nedges, + sp.shDirTemp0_d[n], + sp.slinesOffs_d[n]) + + def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): + ker = self.module.get_kernel(self.genstreamlines_kernel_name) + shared_memory = self._shared_mem_bytes(sp) + config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) + + launch( + sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.max_angle, + sp.gpu_tracker.tc_threshold, + sp.gpu_tracker.step_size, + sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.min_separation_angle, + sp.gpu_tracker.rng_seed, + sp.gpu_tracker.rng_offset + n*nseeds_gpu, + nseeds_gpu, + sp.seeds_d[n], + sp.gpu_tracker.dimx, + sp.gpu_tracker.dimy, + sp.gpu_tracker.dimz, + sp.gpu_tracker.dimt, + sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.metric_map_d[n], + self.ctx_d[n], + sp.gpu_tracker.samplm_nr, + sp.gpu_tracker.sphere_vertices_d[n], + sp.gpu_tracker.sphere_edges_d[n], + sp.gpu_tracker.nedges, + sp.slinesOffs_d[n], + sp.shDirTemp0_d[n], + sp.slineSeed_d[n], + sp.slineLen_d[n], + sp.sline_d[n] + ) + + +class ProbDirectionGetter(GPUDirectionGetter): + def __init__(self): + self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},PROB,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.compile_program() + + def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): + ker = self.module.get_kernel(self.getnum_kernel_name) + shared_memory = REAL_SIZE*BLOCK_Y*sp.gpu_tracker.n32dimt + \ + np.int32().nbytes*BLOCK_Y*sp.gpu_tracker.n32dimt + config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) + + launch( + sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.max_angle, + sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.min_separation_angle, + sp.gpu_tracker.rng_seed, + nseeds_gpu, + sp.seeds_d[n], + sp.gpu_tracker.dimx, + sp.gpu_tracker.dimy, + sp.gpu_tracker.dimz, + sp.gpu_tracker.dimt, + 
sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.sphere_vertices_d[n], + sp.gpu_tracker.sphere_edges_d[n], + sp.gpu_tracker.nedges, + sp.shDirTemp0_d[n], + sp.slinesOffs_d[n]) + + def _shared_mem_bytes(self, sp): + return REAL_SIZE * BLOCK_Y * sp.gpu_tracker.n32dimt + + def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): + ker = self.module.get_kernel(self.genstreamlines_kernel_name) + shared_memory = self._shared_mem_bytes(sp) + config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) + + launch( + sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.max_angle, + sp.gpu_tracker.tc_threshold, + sp.gpu_tracker.step_size, + sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.min_separation_angle, + sp.gpu_tracker.rng_seed, + sp.gpu_tracker.rng_offset + n*nseeds_gpu, + nseeds_gpu, + sp.seeds_d[n], + sp.gpu_tracker.dimx, + sp.gpu_tracker.dimy, + sp.gpu_tracker.dimz, + sp.gpu_tracker.dimt, + sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.metric_map_d[n], + int(0), + sp.gpu_tracker.samplm_nr, + sp.gpu_tracker.sphere_vertices_d[n], + sp.gpu_tracker.sphere_edges_d[n], + sp.gpu_tracker.nedges, + sp.slinesOffs_d[n], + sp.shDirTemp0_d[n], + sp.slineSeed_d[n], + sp.slineLen_d[n], + sp.sline_d[n] + ) + + + +class PttDirectionGetter(ProbDirectionGetter): + def __init__(self): + self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},PTT,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.compile_program() + + def _shared_mem_bytes(self, sp): + return 0 + diff --git a/cuslines/cu_propagate_seeds.py b/cuslines/cuda_python/cu_propagate_seeds.py similarity index 69% rename from cuslines/cu_propagate_seeds.py rename to cuslines/cuda_python/cu_propagate_seeds.py index a334da6..73a4a6c 100644 --- a/cuslines/cu_propagate_seeds.py +++ b/cuslines/cuda_python/cu_propagate_seeds.py @@ -1,10 +1,12 @@ import numpy as np -import ctypes +import gc from cuda.bindings import runtime +from cuda.bindings.runtime import cudaMemcpyKind + from nibabel.streamlines.array_sequence import ArraySequence import logging -from cutils import ( +from cuslines.cuda_python.cutils import ( REAL_SIZE, REAL_DTYPE, REAL3_DTYPE, @@ -12,6 +14,7 @@ EXCESS_ALLOC_FACT, THR_X_SL, THR_X_BL, + DEV_PTR, div_up, checkCudaErrors, ) @@ -25,18 +28,19 @@ def __init__( self, gpu_tracker): self.gpu_tracker = gpu_tracker + self.ngpus = gpu_tracker.ngpus self.nSlines_old = np.zeros(self.ngpus, dtype=np.int32) self.nSlines = np.zeros(self.ngpus, dtype=np.int32) - self.slines = np.zeros(self.ngpus, dtype=ctypes.c_void_p) - self.sline_lens = np.zeros(self.ngpus, dtype=ctypes.c_void_p) + self.slines = np.zeros(self.ngpus, dtype=np.ndarray) + self.sline_lens = np.zeros(self.ngpus, dtype=np.ndarray) - self.seeds_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) - self.slineSeed_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) - self.slinesOffs_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) - self.shDirTemp0_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) - self.slineLen_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) - self.sline_d = np.empty(self.ngpus, dtype=ctypes.c_void_p) + self.seeds_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.slineSeed_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.slinesOffs_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.shDirTemp0_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.slineLen_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.sline_d = np.empty(self.ngpus, 
dtype=DEV_PTR) def _switch_device(self, n): checkCudaErrors(runtime.cudaSetDevice(n)) @@ -51,28 +55,31 @@ def _switch_device(self, n): def _get_sl_buffer_size(self, n): return REAL_SIZE*2*3*MAX_SLINE_LEN*self.nSlines[n] - def _allocate_seed_memory(self): + def _allocate_seed_memory(self, seeds): # Move seeds to GPU for ii in range(self.ngpus): nseeds_gpu, _, _ = self._switch_device(ii) self.seeds_d[ii] = checkCudaErrors(runtime.cudaMalloc( REAL_SIZE*3*nseeds_gpu)) + seeds_host = np.ascontiguousarray(seeds[ + ii*self.nseeds_per_gpu:ii*self.nseeds_per_gpu+nseeds_gpu], + dtype=REAL_DTYPE) checkCudaErrors(runtime.cudaMemcpy( self.seeds_d[ii], - self.seeds[ii*self.nseeds_per_gpu:(ii+1)*self.nseeds_per_gpu].ctypes.data, + seeds_host.ctypes.data, REAL_SIZE*3*nseeds_gpu, - runtime.cudaMemcpyHostToDevice)) + cudaMemcpyKind.cudaMemcpyHostToDevice)) for ii in range(self.ngpus): nseeds_gpu, block, grid = self._switch_device(ii) # Streamline offsets self.slinesOffs_d[ii] = checkCudaErrors(runtime.cudaMalloc( - np.uint64().nbytes * (nseeds_gpu + 1))) + np.int32().nbytes * (nseeds_gpu + 1))) # Initial directions from each seed self.shDirTemp0_d[ii] = checkCudaErrors(runtime.cudaMalloc( - REAL3_DTYPE.nbytes * self.samplm_nr * grid[0] * block[1])) + REAL3_DTYPE.itemsize * self.gpu_tracker.samplm_nr * grid[0] * block[1])) - def _cumsum_offsets(self): + def _cumsum_offsets(self): # TODO: do this on device? for ii in range(self.ngpus): nseeds_gpu, _, _ = self._switch_device(ii) if (nseeds_gpu == 0): @@ -83,8 +90,8 @@ def _cumsum_offsets(self): checkCudaErrors(runtime.cudaMemcpy( slinesOffs_h.ctypes.data, self.slinesOffs_d[ii], - slinesOffs_h.nbytes * (nseeds_gpu + 1), - runtime.cudaMemcpyDeviceToHost)) + slinesOffs_h.nbytes, + cudaMemcpyKind.cudaMemcpyDeviceToHost)) slinesOffs_h = np.concatenate(( [0], np.cumsum(slinesOffs_h[:-1], dtype=slinesOffs_h.dtype))) @@ -94,7 +101,7 @@ def _cumsum_offsets(self): self.slinesOffs_d[ii], slinesOffs_h.ctypes.data, self.slinesOffs_d.size * (nseeds_gpu + 1), - runtime.cudaMemcpyHostToDevice)) + cudaMemcpyKind.cudaMemcpyHostToDevice)) def _allocate_tracking_memory(self): for ii in range(self.ngpus): @@ -108,24 +115,21 @@ def _allocate_tracking_memory(self): self.nSlines[ii] * np.int32().nbytes)) if self.nSlines[ii] > EXCESS_ALLOC_FACT*self.nSlines_old[ii]: - if self.slines[ii]: - checkCudaErrors(runtime.cudaFreeHost( - self.slines[ii])) - if self.sline_lens[ii]: - checkCudaErrors(runtime.cudaFreeHost( - self.sline_lens[ii])) - self.slines[ii] = 0 # Nullptr - self.sline_lens[ii] = 0 # Nullptr + self.slines[ii] = 0 + self.sline_lens[ii] = 0 + gc.collect() buffer_size = self._get_sl_buffer_size(ii) logger.debug(f"Streamline buffer size: {buffer_size}") if not self.slines[ii]: - self.slines[ii] = checkCudaErrors(runtime.cudaMallocHost( - buffer_size)) - if not self.slines_lens[ii]: - self.slines_lens[ii] = checkCudaErrors(runtime.cudaMallocHost( - np.int32().nbytes*EXCESS_ALLOC_FACT*self.nSlines[ii])) + self.slines[ii] = np.empty( + (EXCESS_ALLOC_FACT*self.nSlines[ii], MAX_SLINE_LEN*2, 3), + dtype=REAL_DTYPE) + if not self.sline_lens[ii]: + self.sline_lens[ii] = np.empty( + EXCESS_ALLOC_FACT*self.nSlines[ii], + dtype=np.int32) for ii in range(self.ngpus): self._switch_device(ii) @@ -143,13 +147,13 @@ def _cleanup(self): self.slines[ii], self.sline_d[ii], self._get_sl_buffer_size(ii), - runtime.cudaMemcpyDeviceToHost, + cudaMemcpyKind.cudaMemcpyDeviceToHost, self.gpu_tracker.streams[ii])) checkCudaErrors(runtime.cudaMemcpyAsync( self.sline_lens[ii], self.slineLen_d[ii], 
np.int32().nbytes*self.nSlines[ii], - runtime.cudaMemcpyDeviceToHost, + cudaMemcpyKind.cudaMemcpyDeviceToHost, self.gpu_tracker.streams[ii])) for ii in range(self.ngpus): @@ -164,22 +168,19 @@ def _cleanup(self): checkCudaErrors(runtime.cudaFree(self.sline_d[ii])) self.nSlines_old = self.nSlines.copy() - self.rng_offset += self.nseeds + self.gpu_tracker.rng_offset += self.nseeds def propagate(self, seeds): - self.seeds = seeds self.nseeds = len(seeds) self.nseeds_per_gpu = (self.nseeds + self.gpu_tracker.ngpus - 1) // self.gpu_tracker.ngpus - self._seeds_to_gpu() - self._allocate_seed_memory() + self._allocate_seed_memory(seeds) for ii in range(self.ngpus): nseeds_gpu, block, grid = self._switch_device(ii) if (nseeds_gpu == 0): continue - - getNumStreamlines() # TODO: these will each be classes you can pass in + self.gpu_tracker.dg.getNumStreamlines(ii, nseeds_gpu, block, grid, self) self._cumsum_offsets() self._allocate_tracking_memory() @@ -188,12 +189,11 @@ def propagate(self, seeds): nseeds_gpu, block, grid = self._switch_device(ii) if (nseeds_gpu == 0): continue - - mergeStreamlines() # TODO + self.gpu_tracker.dg.generateStreamlines(ii, nseeds_gpu, block, grid, self) self._cleanup() - def as_array_sequence(self): # TODO: optimize memory usage here? also, direct to trx? + def as_array_sequence(self): buffer_size = 0 for ii in range(self.ngpus): lens = self.sline_lens[ii] @@ -207,12 +207,15 @@ def _yield_slines(): for jj in range(self.nSlines[ii]): npts = this_len[jj] - offset = jj * 3 * 2 * MAX_SLINE_LEN - sl = np.asarray( - this_sls[offset : offset + npts * 3], - dtype=REAL_DTYPE) - sl = sl.reshape((npts, 3)) - yield sl + yield np.asarray( + this_sls[jj], + dtype=REAL_DTYPE)[:npts] + + return ArraySequence(_yield_slines(), buffer_size) - return ArraySequence(_yield_slines, buffer_size) + def to_trx(): + raise NotImplementedError("Export to TRX not yet implemented") + + def to_trk(): + raise NotImplementedError("Export to TRK not yet implemented") diff --git a/cuslines/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py similarity index 69% rename from cuslines/cu_tractography.py rename to cuslines/cuda_python/cu_tractography.py index acfcc96..1ff0944 100644 --- a/cuslines/cu_tractography.py +++ b/cuslines/cuda_python/cu_tractography.py @@ -1,25 +1,26 @@ from cuda.bindings import driver, runtime +from cuda.bindings.runtime import cudaMemcpyKind +import cuda.core as cc # TODO: consider cuda core over cuda bindings import numpy as np import logging -from cutils import ( +from cuslines.cuda_python.cutils import ( REAL_SIZE, REAL_DTYPE, checkCudaErrors, ) -from cu_direction_getters import ( +from cuslines.cuda_python.cu_direction_getters import ( GPUDirectionGetter, BootDirectionGetter ) -from cu_propagate_seeds import SeedBatchPropagator +from cuslines.cuda_python.cu_propagate_seeds import SeedBatchPropagator logger = logging.getLogger("GPUStreamlines") -# TODO: we need to organize this package into folders, then make it pip installable. 
-# but should merge in PTT FIRST + class GPUTracker: # TODO: bring in pyAFQ prep stuff def __init__( self, @@ -37,21 +38,10 @@ def __init__( rng_seed: int = 0, rng_offset: int = 0, ): - for name, arr, dt in [ - ("dataf", dataf, REAL_DTYPE), - ("metric_map", metric_map, REAL_DTYPE), - ("sphere_vertices", sphere_vertices, REAL_DTYPE), - ("sphere_edges", sphere_edges, np.int32), - ]: - if arr.dtype != dt: - raise TypeError(f"{name} must have dtype {dt}, got {arr.dtype}") - if not arr.flags.c_contiguous: - raise ValueError(f"{name} must be C-contiguous") - - self.dataf = dataf - self.metric_map = metric_map - self.sphere_vertices = sphere_vertices - self.sphere_edges = sphere_edges + self.dataf = np.ascontiguousarray(dataf, dtype=REAL_DTYPE) + self.metric_map = np.ascontiguousarray(metric_map, dtype=REAL_DTYPE) + self.sphere_vertices = np.ascontiguousarray(sphere_vertices, dtype=REAL_DTYPE) + self.sphere_edges = np.ascontiguousarray(sphere_edges, dtype=np.int32) self.dimx, self.dimy, self.dimz, self.dimt = dataf.shape self.nedges = int(sphere_edges.shape[0]) @@ -59,6 +49,7 @@ def __init__( self.samplm_nr = int(dg.sampling_matrix.shape[0]) else: self.samplm_nr = self.dimt + self.n32dimt = ((self.dimt + 31) // 32) * 32 self.dg = dg self.max_angle = REAL_DTYPE(max_angle) @@ -83,6 +74,9 @@ def __init__( self.sphere_vertices_d = [] self.sphere_edges_d = [] + self.streams = [] + self.managed_data = [] + self.seed_propagator = SeedBatchPropagator( gpu_tracker=self) self._allocated = False @@ -97,15 +91,22 @@ def _allocate(self): for ii in range(self.ngpus): checkCudaErrors(runtime.cudaSetDevice(ii)) - self.dataf_d.append( # TODO: put this in texture memory? - checkCudaErrors(runtime.cudaMallocManaged( # TODO: look at cuda core managed memory - REAL_SIZE*self.dataf.size, - runtime.cudaMemAttachGlobal))) - checkCudaErrors(runtime.cudaMemAdvise( - self.dataf_d[ii], - REAL_SIZE*self.dataf.size, - runtime.cudaMemAdviseSetPreferredLocation, - ii)) + self.streams.append( + checkCudaErrors(runtime.cudaStreamCreateWithFlags( + runtime.cudaStreamNonBlocking))) + + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(ii)) + + # TODO: put this in texture memory? 
+ self.managed_data.append( + cc.ManagedMemoryResource( + options=cc.ManagedMemoryResourceOptions(preferred_location=ii) + ) + ) + self.dataf_d.append( + self.managed_data[ii].allocate( + REAL_SIZE*self.dataf.size)) self.metric_map_d.append( checkCudaErrors(runtime.cudaMalloc( REAL_SIZE*self.metric_map.size))) @@ -115,37 +116,32 @@ def _allocate(self): self.sphere_edges_d.append( checkCudaErrors(runtime.cudaMalloc( np.int32().nbytes*self.sphere_edges.size))) - + + logger.info("here-1") checkCudaErrors(runtime.cudaMemcpy( - self.dataf_d[ii], + self.dataf_d[ii].handle, self.dataf.ctypes.data, REAL_SIZE*self.dataf.size, - runtime.cudaMemcpyHostToDevice)) + cudaMemcpyKind.cudaMemcpyHostToDevice)) + logger.info("here0") checkCudaErrors(runtime.cudaMemcpy( self.metric_map_d[ii], self.metric_map.ctypes.data, REAL_SIZE*self.metric_map.size, - runtime.cudaMemcpyHostToDevice)) + cudaMemcpyKind.cudaMemcpyHostToDevice)) checkCudaErrors(runtime.cudaMemcpy( self.sphere_vertices_d[ii], self.sphere_vertices.ctypes.data, REAL_SIZE*self.sphere_vertices.size, - runtime.cudaMemcpyHostToDevice)) + cudaMemcpyKind.cudaMemcpyHostToDevice)) checkCudaErrors(runtime.cudaMemcpy( self.sphere_edges_d[ii], self.sphere_edges.ctypes.data, np.int32().nbytes*self.sphere_edges.size, - runtime.cudaMemcpyHostToDevice)) - + cudaMemcpyKind.cudaMemcpyHostToDevice)) + logger.info("here0,5") self.dg.allocate_on_gpu(ii) - self.streams = [] - for ii in range(self.ngpus): - checkCudaErrors(runtime.cudaSetDevice(ii)) - self.streams.append( - checkCudaErrors(runtime.cudaStreamCreateWithFlags( - runtime.cudaStreamNonBlocking))) - self._allocated = True def __exit__(self, exc_type, exc, tb): @@ -153,22 +149,17 @@ def __exit__(self, exc_type, exc, tb): for n in range(self.ngpus): checkCudaErrors(runtime.cudaSetDevice(n)) - if self.dataf_d[n]: - checkCudaErrors(runtime.cudaFree(self.dataf_d[n])) + # if self.dataf_d[n]: # TODO: find how to do this + # self.managed_data[n].deallocate( + # self.dataf_d[n], + # REAL_SIZE*self.dataf.size) + # self.managed_data[n].close() if self.metric_map_d[n]: checkCudaErrors(runtime.cudaFree(self.metric_map_d[n])) if self.sphere_vertices_d[n]: checkCudaErrors(runtime.cudaFree(self.sphere_vertices_d[n])) if self.sphere_edges_d[n]: checkCudaErrors(runtime.cudaFree(self.sphere_edges_d[n])) - - if self.seed_propagator.sline_lens[n]: - checkCudaErrors(runtime.cudaFreeHost( - self.seed_propagator.sline_lens[n])) - if self.seed_propagator.slines[n]: - checkCudaErrors(runtime.cudaFreeHost( - self.seed_propagator.slines[n])) - self.dg.deallocate_on_gpu(n) checkCudaErrors(runtime.cudaStreamDestroy(self.streams[n])) diff --git a/cuslines/cutils.py b/cuslines/cuda_python/cutils.py similarity index 70% rename from cuslines/cutils.py rename to cuslines/cuda_python/cutils.py index 4d75847..9cf164e 100644 --- a/cuslines/cutils.py +++ b/cuslines/cuda_python/cutils.py @@ -1,11 +1,12 @@ from cuda.bindings import driver, nvrtc -import re -import os import numpy as np +import ctypes from enum import IntEnum +from cuslines.cuda_python._globals import * + class ModelType(IntEnum): OPDT = 0 @@ -13,34 +14,27 @@ class ModelType(IntEnum): PROB = 2 PTT = 3 - -# We extract REAL_DTYPE, MAX_SLINE_LEN from globals.h -# Maybe there is a more elegant way of doing this? 
-dir_path = os.path.dirname(os.path.abspath(__file__)) -globals_path = os.path.join(dir_path, "globals.h") -with open(globals_path, 'r') as f: - content = f.read() - -defines = dict(re.findall(r"#define\s+(\w+)\s+([^\s/]+)", content)) -REAL_SIZE = int(defines["REAL_SIZE"]) REAL3_SIZE = 3 * REAL_SIZE if REAL_SIZE == 4: REAL_DTYPE = np.float32 REAL3_DTYPE = np.dtype([('x', np.float32), ('y', np.float32), ('z', np.float32)]) + REAL_DTYPE_AS_STR = "float" + REAL3_DTYPE_AS_STR = "float3" + REAL_DTYPE_AS_CTYPE = ctypes.c_float elif REAL_SIZE == 8: REAL_DTYPE = np.float64 REAL3_DTYPE = np.dtype([('x', np.float64), ('y', np.float64), ('z', np.float64)]) + REAL_DTYPE_AS_STR = "double" + REAL3_DTYPE_AS_STR = "double3" + REAL_DTYPE_AS_CTYPE = ctypes.c_double else: raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") -MAX_SLINE_LEN = int(defines["MAX_SLINE_LEN"]) -THR_X_SL = int(defines["THR_X_SL"]) -THR_X_BL = int(defines["THR_X_BL"]) -EXCESS_ALLOC_FACT = int(defines["EXCESS_ALLOC_FACT"]) - +BLOCK_Y = THR_X_BL//THR_X_SL +DEV_PTR = object def _cudaGetErrorEnum(error): if isinstance(error, driver.CUresult): diff --git a/cuslines/cuwsort.cuh b/cuslines/cuwsort.cuh index 18858f0..aac70ac 100644 --- a/cuslines/cuwsort.cuh +++ b/cuslines/cuwsort.cuh @@ -79,12 +79,15 @@ int swap4[3][4] = {{ 2, 3, 0, 1}, __device__ __constant__ int swap2[1][2] = {{ 1, 0}}; -__device__ __constant__ const int *__swaps[] = {NULL, - reinterpret_cast(&swap2[0][0]), - reinterpret_cast(&swap4[0][0]), - reinterpret_cast(&swap8[0][0]), - reinterpret_cast(&swap16[0][0]), - reinterpret_cast(&swap32[0][0])}; +template +__device__ __forceinline__ const int* get_swap_ptr() { + if constexpr (GSIZE == 2) return (const int*)swap2; + else if constexpr (GSIZE == 4) return (const int*)swap4; + else if constexpr (GSIZE == 8) return (const int*)swap8; + else if constexpr (GSIZE == 16) return (const int*)swap16; + else if constexpr (GSIZE == 32) return (const int*)swap32; + else return nullptr; +} template struct STATIC_LOG2 { @@ -113,7 +116,7 @@ __device__ KEY_T warp_sort(KEY_T v) { const int gid = lid % GSIZE; - const int (*swap)[GSIZE] = reinterpret_cast(__swaps[LOG2_GSIZE]); + const int (*swap)[GSIZE] = reinterpret_cast(get_swap_ptr()); #pragma unroll for(int i = 0; i < NSWAP; i++) { @@ -140,7 +143,7 @@ __device__ void warp_sort(KEY_T *__restrict__ k, VAL_T *__restrict__ v) { const int gid = lid % GSIZE; - const int (*swap)[GSIZE] = reinterpret_cast(__swaps[LOG2_GSIZE]); + const int (*swap)[GSIZE] = reinterpret_cast(get_swap_ptr()); #pragma unroll for(int i = 0; i < NSWAP; i++) { diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu index 0efefdd..db3c0e2 100644 --- a/cuslines/generate_streamlines_cuda.cu +++ b/cuslines/generate_streamlines_cuda.cu @@ -26,23 +26,31 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +// TODO: its possible all the cpp should be refactored +// out into a separate file, but for now, they are just wrapped +// in these ifndefs +#ifndef __NVRTC__ #include #include #include #include #include +#endif + #include #include + +#ifndef __NVRTC__ #include #include #include -#include +#include // Might not be needed anymore? 
+#include +#endif #include "cudamacro.h" /* for time() */ #include "globals.h" -#include - #include "cuwsort.cuh" #include "ptt.cuh" @@ -1204,7 +1212,6 @@ template __device__ int tracker_d(curandStatePhilox4_32_10_t *st, const REAL_T max_angle, - const REAL_T min_signal, const REAL_T tc_threshold, const REAL_T step_size, const REAL_T relative_peak_thres, @@ -1218,22 +1225,9 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, const int dimz, const int dimt, const REAL_T *__restrict__ dataf, - const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask) - const REAL_T *__restrict__ H, - const REAL_T *__restrict__ R, - // model unused - // step_size from global defines - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - // tc_threshold from global defines - // pmf_threashold from global defines const REAL_T *__restrict__ metric_map, - const int delta_nr, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ delta_q, // fit_matrix - const int samplm_nr, - const REAL_T *__restrict__ sampling_matrix, + const typename ModelCtx::type* __restrict__ ctx, + const int samplm_nr, const REAL3_T *__restrict__ sphere_vertices, const int2 *__restrict__ sphere_edges, const int num_edges, @@ -1272,7 +1266,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, int i; for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) { int ndir; - if (MODEL_T == PROB) { + if constexpr (MODEL_T == PROB) { ndir = get_direction_prob_d( @@ -1288,7 +1282,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, sphere_edges, num_edges, __sh_new_dir + tidy); - } else if (MODEL_T == PTT) { + } else if constexpr (MODEL_T == PTT) { ndir = get_direction_ptt_d( @@ -1310,22 +1304,18 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, MODEL_T>( st, max_angle, - min_signal, + ctx->min_signal, relative_peak_thres, min_separation_angle, direction, dimx, dimy, dimz, dimt, dataf, - b0s_mask /* !dwi_mask */, + ctx->b0s_mask /* !dwi_mask */, point, - H, R, - // model unused - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - delta_nr, - delta_b, delta_q, // fit_matrix + ctx->H, ctx->R, + ctx->delta_nr, + ctx->delta_b, ctx->delta_q, // fit_matrix samplm_nr, - sampling_matrix, + ctx->sampling_matrix, sphere_vertices, sphere_edges, num_edges, @@ -1603,7 +1593,6 @@ template __global__ void genStreamlinesMerge_k( const REAL_T max_angle, - const REAL_T min_signal, const REAL_T tc_threshold, const REAL_T step_size, const REAL_T relative_peak_thres, @@ -1617,15 +1606,9 @@ __global__ void genStreamlinesMerge_k( const int dimz, const int dimt, const REAL_T *__restrict__ dataf, - const REAL_T *__restrict__ H, - const REAL_T *__restrict__ R, - const int delta_nr, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ delta_q, - const int *__restrict__ b0s_mask, // change to int const REAL_T *__restrict__ metric_map, - const int samplm_nr, - const REAL_T *__restrict__ sampling_matrix, + const typename ModelCtx::type* __restrict__ ctx, + const int samplm_nr, const REAL3_T *__restrict__ sphere_vertices, const int2 *__restrict__ sphere_edges, const int num_edges, @@ -1715,7 +1698,6 @@ __global__ void genStreamlinesMerge_k( BDIM_Y, MODEL_T>(&st, max_angle, - min_signal, tc_threshold, step_size, relative_peak_thres, @@ -1725,13 +1707,9 @@ __global__ void genStreamlinesMerge_k( __ptt_frame, MAKE_REAL3(1, 1, 1), dimx, dimy, dimz, dimt, dataf, - b0s_mask, - H, R, 
metric_map, - delta_nr, - delta_b, delta_q, //fit_matrix - samplm_nr, - sampling_matrix, + ctx, + samplm_nr, sphere_vertices, sphere_edges, num_edges, @@ -1755,7 +1733,6 @@ __global__ void genStreamlinesMerge_k( BDIM_Y, MODEL_T>(&st, max_angle, - min_signal, tc_threshold, step_size, relative_peak_thres, @@ -1765,13 +1742,9 @@ __global__ void genStreamlinesMerge_k( __ptt_frame + 9, MAKE_REAL3(1, 1, 1), dimx, dimy, dimz, dimt, dataf, - b0s_mask, - H, R, metric_map, - delta_nr, - delta_b, delta_q, //fit_matrix - samplm_nr, - sampling_matrix, + ctx, + samplm_nr, sphere_vertices, sphere_edges, num_edges, @@ -1802,15 +1775,16 @@ __global__ void genStreamlinesMerge_k( return; } +#ifndef __NVRTC__ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size, const REAL relative_peak_thresh, const REAL min_separation_angle, const int nseeds, const std::vector &seeds_d, const int dimx, const int dimy, const int dimz, const int dimt, const std::vector &dataf_d, const std::vector &H_d, const std::vector &R_d, - const int delta_nr, + const int delta_nr, const std::vector &delta_b_d, const std::vector &delta_q_d, const std::vector &b0s_mask_d, const std::vector &metric_map_d, - const int samplm_nr, + const int samplm_nr, const std::vector &sampling_matrix_d, const std::vector &sphere_vertices_d, const std::vector &sphere_edges_d, const int nedges, std::vector &slines_h, std::vector &slinesLen_h, std::vector &nSlines_h, @@ -1985,25 +1959,45 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a #endif //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y); - switch(model_type) { + switch(model_type) { // TODO: these may be better as separate functions, not as template specializations case OPDT: - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - case CSA: - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + BootCtx* d_ctx; + BootCtx h_ctx; + h_ctx.min_signal = min_signal; + h_ctx.delta_nr = delta_nr; + h_ctx.H = H_d[n]; + h_ctx.R = R_d[n]; + h_ctx.delta_b = delta_b_d[n]; + h_ctx.delta_q = delta_q_d[n]; + h_ctx.sampling_matrix = sampling_matrix_d[n]; + h_ctx.b0s_mask = b0s_mask_d[n]; + CHECK_CUDA(cudaMalloc(&d_ctx, sizeof(BootCtx))); + CHECK_CUDA(cudaMemcpyAsync( + d_ctx, &h_ctx, sizeof(BootCtx), + cudaMemcpyHostToDevice, streams[n])); + + if (model_type == OPDT) { + genStreamlinesMerge_k <<>>( + max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + 
rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), + dimx, dimy, dimz, dimt, dataf_d[n], + metric_map_d[n], d_ctx, samplm_nr, + reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), + nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + } else if (model_type == CSA) { + genStreamlinesMerge_k <<>>( + max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), + dimx, dimy, dimz, dimt, dataf_d[n], + metric_map_d[n], d_ctx, samplm_nr, + reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), + nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); + } else { + // Should never reach here + } + + CHECK_CUDA(cudaFree(d_ctx)); break; case PROB: @@ -2011,10 +2005,10 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a // than for preliminary run shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt; genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], + dimx, dimy, dimz, dimt, dataf_d[n], + metric_map_d[n], nullptr, samplm_nr, reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); break; @@ -2022,10 +2016,10 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a case PTT: shSizeGNS = 0; // PTT uses exclusively static shared memory genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, + max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], + dimx, dimy, dimz, dimt, dataf_d[n], + metric_map_d[n], nullptr, samplm_nr, reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); break; @@ -2394,3 +2388,4 @@ void write_trk(const int num_threads, return; } #endif +#endif // __NVRTC__ diff --git a/cuslines/globals.h b/cuslines/globals.h index 0d852e9..e0bcac1 100644 --- a/cuslines/globals.h +++ b/cuslines/globals.h @@ -40,7 +40,7 @@ #define FLOOR floorf #define LOG __logf #define EXP __expf -#define REAL_MAX (FLT_MAX) +#define REAL_MAX __int_as_float(0x7f7fffffU) #define REAL_MIN (-REAL_MAX) #define COS __cosf #define SIN __sinf @@ -58,7 +58,7 @@ #define FLOOR floor #define LOG log #define EXP exp -#define REAL_MAX (DBL_MAX) +#define REAL_MAX __longlong_as_double(0x7fefffffffffffffLL) #define REAL_MIN (-REAL_MAX) #define COS cos #define SIN sin @@ -98,4 +98,33 @@ enum ModelType { PTT = 3, }; +struct NoCtx {}; + +template +struct BootCtx { + REAL_T min_signal; + int delta_nr; + const REAL_T* H; + const REAL_T* R; + const REAL_T* delta_b; + const REAL_T* delta_q; + const REAL_T* 
sampling_matrix; + const int* b0s_mask; +}; + +template +struct ModelCtx { + using type = NoCtx; +}; + +template +struct ModelCtx { + using type = BootCtx; +}; + +template +struct ModelCtx { + using type = BootCtx; +}; + #endif diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index 3cdd149..894d0bf 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -473,7 +473,7 @@ __device__ int get_direction_ptt_d( get_probing_frame_d<0>(__frame_sh, st, __probing_frame_sh); propagate_frame_d(__probing_prop_sh, __probing_frame_sh, __direc_sh); norm3_d(__direc_sh, 0); // this will be scaled by the generic stepping code - dirs[0] = (REAL3_T) {__direc_sh[0], __direc_sh[1], __direc_sh[2]}; + dirs[0] = MAKE_REAL3(__direc_sh[0], __direc_sh[1], __direc_sh[2]); } } diff --git a/pyproject.toml b/pyproject.toml index 7ad8645..a1247c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -requires = ["scikit-build-core", "pybind11"] -build-backend = "scikit_build_core.build" +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" [project] name = "cuslines" @@ -10,8 +10,12 @@ readme = "README.md" requires-python = ">=3.7" dependencies = [ "numpy", - "pybind11" + "nibabel", + "cuda-python", + "cuda-core", + "cuda-cccl" ] -[tool.scikit-build] -cmake.build-type = "Release" +[tool.setuptools.packages.find] +where = ["."] +include = ["cuslines*"] diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index d546d60..7585e37 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -27,7 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import argparse +import argparse # TODO: do this again, but for cuda python version import random import time import zipfile diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..cd53ade --- /dev/null +++ b/setup.py @@ -0,0 +1,49 @@ +from setuptools import setup +from setuptools.command.build_py import build_py +from pathlib import Path +import subprocess +import re + + +def defines_to_python(src, dst): + src = Path(src) + dst = Path(dst) + + defines = {} + + INT_DEFINE = re.compile( + r"#define\s+(\w+)\s+\(?\s*([0-9]+)\s*\)?" + ) + + REAL_CAST_DEFINE = re.compile( + r"#define\s+(\w+)\s+\(\(REAL\)\s*([0-9eE\.\+\-]+)\s*\)" + ) + + defines = {} + + for line in src.read_text().splitlines(): + if m := INT_DEFINE.match(line): + defines[m.group(1)] = int(m.group(2)) + elif m := REAL_CAST_DEFINE.match(line): + defines[m.group(1)] = float(m.group(2)) + + dst.parent.mkdir(parents=True, exist_ok=True) + + with dst.open("w") as f: + f.write("# AUTO-GENERATED FROM globals.h — DO NOT EDIT\n\n") + for k, v in sorted(defines.items()): + f.write(f"{k} = {v}\n") + +class build_py_with_cuda(build_py): + def run(self): + root = Path(__file__).parent + + globals_src = str(root / "cuslines" / "globals.h") + globals_dst = str(root / "cuslines" / "cuda_python" / "_globals.py") + defines_to_python(globals_src, globals_dst) + + super().run() + +setup( + cmdclass={"build_py": build_py_with_cuda}, +) From 039a95b86dcbce9b96a29fde8fb87ba46da793ad Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 14:13:23 -0800 Subject: [PATCH 20/31] working! 
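For the record, the key fixes in this patch: the offsets hunk converts per-seed streamline counts into exclusive offsets and copies them using the actual byte count (slinesOffs_h.nbytes) instead of a miscomputed element-count expression; the streamline buffer size is promoted to int64 so large seed batches cannot overflow 32-bit arithmetic; streams are now synchronized between the counting and generation passes; and dataf goes back to a plain per-device cudaMalloc. A minimal standalone sketch of the count-to-offset conversion (a hypothetical helper for illustration, not code from this patch; counts stands for the per-seed counts copied back from the device):

    import numpy as np

    def counts_to_offsets(counts):
        # counts[i] = number of streamlines produced by seed i.
        # offsets[i] = slot of the first streamline for seed i;
        # offsets[-1] = total number of streamlines in the batch.
        offsets = np.empty(counts.size + 1, dtype=np.int64)  # int64 to avoid overflow
        offsets[0] = 0
        offsets[1:] = np.cumsum(counts, dtype=np.int64)
        return offsets

    offsets = counts_to_offsets(np.array([2, 0, 3, 1], dtype=np.int32))
    assert offsets.tolist() == [0, 2, 2, 5, 6]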
--- cuslines/cuda_python/cu_direction_getters.py | 8 ++--- cuslines/cuda_python/cu_propagate_seeds.py | 33 +++++++++++++------- cuslines/cuda_python/cu_tractography.py | 24 ++++---------- cuslines/globals.h | 2 +- 4 files changed, 32 insertions(+), 35 deletions(-) diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index 2dc54cc..135cb47 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -250,7 +250,7 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.dimy, sp.gpu_tracker.dimz, sp.gpu_tracker.dimt, - sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.dataf_d[n], self.H_d[n], self.R_d[n], self.delta_nr, @@ -285,7 +285,7 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.dimy, sp.gpu_tracker.dimz, sp.gpu_tracker.dimt, - sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.dataf_d[n], sp.gpu_tracker.metric_map_d[n], self.ctx_d[n], sp.gpu_tracker.samplm_nr, @@ -324,7 +324,7 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.dimy, sp.gpu_tracker.dimz, sp.gpu_tracker.dimt, - sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.dataf_d[n], sp.gpu_tracker.sphere_vertices_d[n], sp.gpu_tracker.sphere_edges_d[n], sp.gpu_tracker.nedges, @@ -354,7 +354,7 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.dimy, sp.gpu_tracker.dimz, sp.gpu_tracker.dimt, - sp.gpu_tracker.dataf_d[n].handle, + sp.gpu_tracker.dataf_d[n], sp.gpu_tracker.metric_map_d[n], int(0), sp.gpu_tracker.samplm_nr, diff --git a/cuslines/cuda_python/cu_propagate_seeds.py b/cuslines/cuda_python/cu_propagate_seeds.py index 73a4a6c..92efef3 100644 --- a/cuslines/cuda_python/cu_propagate_seeds.py +++ b/cuslines/cuda_python/cu_propagate_seeds.py @@ -3,7 +3,7 @@ from cuda.bindings import runtime from cuda.bindings.runtime import cudaMemcpyKind -from nibabel.streamlines.array_sequence import ArraySequence +from nibabel.streamlines.array_sequence import ArraySequence, MEGABYTE import logging from cuslines.cuda_python.cutils import ( @@ -16,8 +16,7 @@ THR_X_BL, DEV_PTR, div_up, - checkCudaErrors, -) + checkCudaErrors) logger = logging.getLogger("GPUStreamlines") @@ -53,7 +52,7 @@ def _switch_device(self, n): return nseeds_gpu, block, grid def _get_sl_buffer_size(self, n): - return REAL_SIZE*2*3*MAX_SLINE_LEN*self.nSlines[n] + return REAL_SIZE*2*3*MAX_SLINE_LEN*self.nSlines[n].astype(np.int64) def _allocate_seed_memory(self, seeds): # Move seeds to GPU @@ -79,7 +78,7 @@ def _allocate_seed_memory(self, seeds): self.shDirTemp0_d[ii] = checkCudaErrors(runtime.cudaMalloc( REAL3_DTYPE.itemsize * self.gpu_tracker.samplm_nr * grid[0] * block[1])) - def _cumsum_offsets(self): # TODO: do this on device? + def _cumsum_offsets(self): # TODO: do this on device? not crucial for performance now for ii in range(self.ngpus): nseeds_gpu, _, _ = self._switch_device(ii) if (nseeds_gpu == 0): @@ -93,14 +92,18 @@ def _cumsum_offsets(self): # TODO: do this on device? 
slinesOffs_h.nbytes, cudaMemcpyKind.cudaMemcpyDeviceToHost)) - slinesOffs_h = np.concatenate(( - [0], np.cumsum(slinesOffs_h[:-1], dtype=slinesOffs_h.dtype))) - self.nSlines[ii] = int(slinesOffs_h[-1]) + __pval = slinesOffs_h[0] + slinesOffs_h[0] = 0 + for jj in range(1, nseeds_gpu + 1): + __cval = slinesOffs_h[jj] + slinesOffs_h[jj] = slinesOffs_h[jj - 1] + __pval + __pval = __cval + self.nSlines[ii] = int(slinesOffs_h[nseeds_gpu]) checkCudaErrors(runtime.cudaMemcpy( self.slinesOffs_d[ii], slinesOffs_h.ctypes.data, - self.slinesOffs_d.size * (nseeds_gpu + 1), + slinesOffs_h.nbytes, cudaMemcpyKind.cudaMemcpyHostToDevice)) def _allocate_tracking_memory(self): @@ -167,10 +170,10 @@ def _cleanup(self): checkCudaErrors(runtime.cudaFree(self.slineLen_d[ii])) checkCudaErrors(runtime.cudaFree(self.sline_d[ii])) - self.nSlines_old = self.nSlines.copy() + self.nSlines_old = self.nSlines self.gpu_tracker.rng_offset += self.nseeds - def propagate(self, seeds): + def propagate(self, seeds): # TODO: better queuing/batching of seeds, if more performance needed self.nseeds = len(seeds) self.nseeds_per_gpu = (self.nseeds + self.gpu_tracker.ngpus - 1) // self.gpu_tracker.ngpus @@ -181,6 +184,9 @@ def propagate(self, seeds): if (nseeds_gpu == 0): continue self.gpu_tracker.dg.getNumStreamlines(ii, nseeds_gpu, block, grid, self) + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaStreamSynchronize( + self.gpu_tracker.streams[ii])) self._cumsum_offsets() self._allocate_tracking_memory() @@ -190,6 +196,9 @@ def propagate(self, seeds): if (nseeds_gpu == 0): continue self.gpu_tracker.dg.generateStreamlines(ii, nseeds_gpu, block, grid, self) + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaStreamSynchronize( + self.gpu_tracker.streams[ii])) self._cleanup() @@ -212,7 +221,7 @@ def _yield_slines(): this_sls[jj], dtype=REAL_DTYPE)[:npts] - return ArraySequence(_yield_slines(), buffer_size) + return ArraySequence(_yield_slines(), buffer_size // MEGABYTE) def to_trx(): raise NotImplementedError("Export to TRX not yet implemented") diff --git a/cuslines/cuda_python/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py index 1ff0944..eca62dd 100644 --- a/cuslines/cuda_python/cu_tractography.py +++ b/cuslines/cuda_python/cu_tractography.py @@ -98,15 +98,9 @@ def _allocate(self): for ii in range(self.ngpus): checkCudaErrors(runtime.cudaSetDevice(ii)) - # TODO: put this in texture memory? 
- self.managed_data.append( - cc.ManagedMemoryResource( - options=cc.ManagedMemoryResourceOptions(preferred_location=ii) - ) - ) - self.dataf_d.append( - self.managed_data[ii].allocate( - REAL_SIZE*self.dataf.size)) + self.dataf_d.append( + checkCudaErrors(runtime.cudaMalloc( + REAL_SIZE*self.dataf.size))) self.metric_map_d.append( checkCudaErrors(runtime.cudaMalloc( REAL_SIZE*self.metric_map.size))) @@ -117,13 +111,11 @@ def _allocate(self): checkCudaErrors(runtime.cudaMalloc( np.int32().nbytes*self.sphere_edges.size))) - logger.info("here-1") checkCudaErrors(runtime.cudaMemcpy( - self.dataf_d[ii].handle, + self.dataf_d[ii], self.dataf.ctypes.data, REAL_SIZE*self.dataf.size, cudaMemcpyKind.cudaMemcpyHostToDevice)) - logger.info("here0") checkCudaErrors(runtime.cudaMemcpy( self.metric_map_d[ii], self.metric_map.ctypes.data, @@ -139,7 +131,6 @@ def _allocate(self): self.sphere_edges.ctypes.data, np.int32().nbytes*self.sphere_edges.size, cudaMemcpyKind.cudaMemcpyHostToDevice)) - logger.info("here0,5") self.dg.allocate_on_gpu(ii) self._allocated = True @@ -149,11 +140,8 @@ def __exit__(self, exc_type, exc, tb): for n in range(self.ngpus): checkCudaErrors(runtime.cudaSetDevice(n)) - # if self.dataf_d[n]: # TODO: find how to do this - # self.managed_data[n].deallocate( - # self.dataf_d[n], - # REAL_SIZE*self.dataf.size) - # self.managed_data[n].close() + if self.dataf_d[n]: + checkCudaErrors(runtime.cudaFree(self.dataf_d[n])) if self.metric_map_d[n]: checkCudaErrors(runtime.cudaFree(self.metric_map_d[n])) if self.sphere_vertices_d[n]: diff --git a/cuslines/globals.h b/cuslines/globals.h index e0bcac1..b9f8211 100644 --- a/cuslines/globals.h +++ b/cuslines/globals.h @@ -68,7 +68,7 @@ #define ACOS acos #endif - +// TODO: half this in when WMGMI seeding #define MAX_SLINE_LEN (501) #define PMF_THRESHOLD_P ((REAL)0.05) From 55f69e1122182eb59575dd672d5d0b20687aaa5f Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 14:14:23 -0800 Subject: [PATCH 21/31] ignore pycs --- .gitignore | 6 ++++++ .../__pycache__/__init__.cpython-312.pyc | Bin 385 -> 0 bytes .../__pycache__/_globals.cpython-312.pyc | Bin 389 -> 0 bytes .../cu_direction_getters.cpython-312.pyc | Bin 22688 -> 0 bytes .../cu_propagate_seeds.cpython-312.pyc | Bin 14574 -> 0 bytes .../__pycache__/cu_tractography.cpython-312.pyc | Bin 9441 -> 0 bytes .../__pycache__/cutils.cpython-312.pyc | Bin 2922 -> 0 bytes 7 files changed, 6 insertions(+) create mode 100644 .gitignore delete mode 100644 cuslines/cuda_python/__pycache__/__init__.cpython-312.pyc delete mode 100644 cuslines/cuda_python/__pycache__/_globals.cpython-312.pyc delete mode 100644 cuslines/cuda_python/__pycache__/cu_direction_getters.cpython-312.pyc delete mode 100644 cuslines/cuda_python/__pycache__/cu_propagate_seeds.cpython-312.pyc delete mode 100644 cuslines/cuda_python/__pycache__/cu_tractography.cpython-312.pyc delete mode 100644 cuslines/cuda_python/__pycache__/cutils.cpython-312.pyc diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..78bb5e2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# Python bytecode +**/*.pyc +**/__pycache__/ +*.pyo +*.pyd + diff --git a/cuslines/cuda_python/__pycache__/__init__.cpython-312.pyc b/cuslines/cuda_python/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 6aedacf00242281ecdb484682fe1cfc13555eb98..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 385 zcmZ9IF-ycS7>2*J=~eD5g6HB75Gjb=MI1!9O)4r#mr$F~R-2}z>Cn+%;qK;dxqlFF 
za&l8|5S;vacuT+G;mw=meM6Q(Fa?~0tGm)C0Pn+MGtR$Q9ieyz4xCgVNkYh=oyw7J z;!2uO=_MY49Ju>Jl1WcP?*p?SBzC%pZ*O#(JqjJVb{=bWe^F|YnNrmYVT{mrH#TPM z&Cit@n<3RMhH5&%8!$m!hQ8g&I%crMs9dK_(XY?hJ#_h|i{DNY!R90mz+9>PVjo>M3dQ+%6Vyvu7!`N}t+pSZncvp!t9bnY( d0qs8G(d(i;^^SQVs#}3t?&?%N89vaf^T_2Ujvvz35B6s-Hn5b(SN`n zadBa6-l!8B32|}qH4)En?zuPjoSWPaMcDxY#`DeaUDU6r*+TjgnrXE?S!^=|HM z1?$m?KO4AXmG&L?w&!@q6Q9dxBbUt`yE-k1sOVANsi1XSlB9JC?3b1xAC?Z5pcv+g dff8nOfgBb}!Cttl1eq|ah&HP{$kUu~{Q}jiZTtWL diff --git a/cuslines/cuda_python/__pycache__/cu_direction_getters.cpython-312.pyc b/cuslines/cuda_python/__pycache__/cu_direction_getters.cpython-312.pyc deleted file mode 100644 index f59ad4d2e033c1468cb38fc97a82f693071d1aae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22688 zcmdr!TW}lKb-Mr-&jkU##5V~_qG^y6^_Epxwk1)b6j`F4RwUTg8pJM1KzPs#P!Upy?1}jU71jLdG$0R?eTK7vA!y!rK zj|2zep}vkpyeHhN+K$VKUO5<3tx@8w7G*q+^{2v#I1;Gs3CBetelaYDgF>Ga6eU>@ z!?F}gCFDT}FYc96LO33Z4v3P_A58UOBAYxACmFM9QD{I69+hID{=uh#lHaL1P9E5M zL^$=ezyZ}s(EX>+9Y3HJXK`WgDdE)VlWGZiZ52qk?pL}vS_et%3H3>#3muTt0a;GS zN!4*QAxhEHgZ+|fKYi$=a8@{VM78ZZa;)QN;hbvQf8dO895N_6c>I}DDOn1}qT#rd z4CQuNrf->f91vrSM8l0(+qYyrod=Pdxnm_cTO?V4cxR@IV}qCF}bSDo>sB#B9( zw|_w8x}%BE1=Z3kheg$t>`x*oA2FYIZtP3Mq>Yh8Uwq@@YHkb-BuRY%5rabiAPkx~ z;F*4ff(g*6P_kru{~+n$u7OxiGYzd@T8VZ@u{c%()&duYmzfXk{5&8`C^L%?CuUJi z<$CawG1&ww`@l+Kbf5ePK((}2ic4}ZCFPdm5o$S>#9IdhtxJi#cmIuu2~hUy;}T1zQLUYuq~pJ|L4JO72sA-Crjpjedh$Wwb{Nvswi zByU21?Vxg^)IHFC(RLs3Xd@HtgVN$%&<99k@eRx zu}Ck+Wm~FB_Z_y0p1YZr+|FFKr)rE4d!$asv$md#=pE+XDAswwWrv7-mC-4&%u9|4 zPM|y^mJBlyU!Es8!FTOa-yYK&r8>o7Mkvu^JfP>xmX4NT(-6oFIxQAfvvtg7nAYDg zvCKJ!g}?8eyJSAcd;>UI*HC3gBGw;~lIMosl#w67}kd%a;U@|4dfW4{aWJ*+dQ3Adwflo+IsC+K}8>-cp zDgl_X&RPT{-X0oIS$W6|Sp(~Kkdmg5N7?jBaG`iQZmGRxP9{aQ4bB z^oJVBGERLNXO!xUhP$z}p-Y!!oWiOr2`nBOMV8_z)gDQNJ~#gg6kD{7!AfC4J^hrl2zflp<8O-v6W( zRlZ-7TCkUsswc;V4W*i4AVA|1)`sO|dsGX?Nums^F-L?NU-4whEM%xq7xTP08-Px7 z8?22lGqdGY-#;_roV6ESb&a|v%i5>y8$YV7yAheH^pCjz-R1poZS$S7@w%Bu+h(fj zW@>#OJGk1S(IazSrr0;$G+r@hHq{o599gh4HU2rqSyMYxTr*Q$KjW>KsapS$xAfZ4 zDeu}@Z`+--Q{Ej}yK&09F70hsyzO%iOZBFYU669FjPcZbTn^ZXXQ7_)l)ut9VxF

(Lp$^h0l1+S{ynn`aQ&sCYNdapwA>IgY7VH_vbto)OFU zTnjwo@MukO_?`W~?3{c?P>u-i*G@}8$VCtK>OMDy8SKYpSN)Px7xb=Y$r&C zc7co&n1@b5(8dGcydbI70V-w4{=go|@Y!0zGD`0tkpI9M)fRC+hT#y|v0WlMKfCaY4tGQx0EJ z?cJM_fG00>`R($vP-Xc!1kWQV0|09;u7^ojN?=cs+PVdne&J#wEDHTFKzhPb6a?S= zE`9-1BS2co9spOE4;|%`6)ltY))`0XwKHSqub-c^ub*-7BRy9Wqlrm-{bIc2TIblY z>&Hl(T)g^5=j~%R!M{zTBU~3I?M-vV4zKl_hqQr!87~?Lx@@CmeQWr=tM!fjm2jpQ|CJ)N=rRL$ukwHP){UIY-eq_k3 z-(pjSG)j{~hs`NN-g<-6B`2tZk^2METMDGdwkGBQ>A3>wjV0;<>8%CQ8?t(~|Ax5; zuA+HHw4#ul)3Qc;Wo7bVV?O~cqt7?y0VT5)C|R*yfCr?vr;7BLk@Y&3wHNi4v3uwR z8C$@WjTquG_7KFiW>J{2e;{F%i^7b(0|~2M6lUxmL|ba79s(oJp4zp@wR@54$sCvC zM-C{dPxpZx*D`dLpG73!<#KXCPK|m_um?Gy<5^pu^|)v7ySh}%vE%zstLBbVdxuKD z7QYybhDBd?;qeXCxA|fN$&{~K@+IQ%-{Wh>6wPhEW=PWv0yQU+B?|XkLzPRE6}Ykq z7j~;`o60`<`5KT_bMy>HCAn9EQ2#oW8|a61PBjB_lCkUL7ZD^7$N*I5nP7B4B1$OL z6z^AygGrFXQ{mo$#6U7A%fUez#~W>tRJmkd5QukUI9W+?)l@mc2g%~8a-q~E0d^pV z1W`4g1mMW9+${PcpiZ630$!ZQg29VW5oH`}G7d2rR~#9Ktb7c?aRew!`5l@(POA6d zwLOVzyCGj|;x$Nh;dnSD2t&0L=FzT)Mj&PKEm-|uWg19K3z>=FvNiN^RRzR#9+;}$1h{9mp)uXCRcY9I@7O!LrW?+T zSf=f@vz689$~L7EB>J~c{=?ayo}I459NaVI6<|3(EG-AiK9jMkYQR-Ald-C6!R480 zT!)nE^%U!OzkT+t!}o*l*zONb9zQXCGBkNYoD`+$o=cO`;N;-Y2 zy{2c-dd74Ht=6xbkX$n;veNci#a{cKy`IQIbXC$GU>(5R8SLYH6ET>1HY8=Jm9h$? zh&>ywV=03r0&Rmm>7fi0F_!DB-w@E&SxdeR3LJ*UlE1Vv8jL+#o>i66U?(|rzN~gM z*ijDc)wH9*x^n1zS!Fa>Uk>fnl#wEQJXBvBinoX_TQVKf85&LD2IGz*P^@ZfJfi29 zwI8ZZxpNFP=y_&UWo*eK0?vmjBdTMFiL~oj)=q8G@%;Io?2d)oSodec1f zfI8Av0y zU2go*xWj5{N~81Zujz8LiN<4AQ&Sq9Uw=&t&VQK0G|scqn*N$N-d8xSy{BET! zvr4}C71or(b+r{NmF1s?LO@^R2)Cl}L@lMhlIBoY5eNRFwFDJ_smpI$fqK#JlV5_e ztJWZF)8nFyx)vGdo9e{#prhp2T*k$m?7-nt1)dSUjBXLZHxWn(zJ;I%!S5p&LU0*? zYK?afrlcg66_=;&`6~1O^Ur~5g}-)uW?F(|o(6B9=ov@S_(@@n$4&1zBJ#Y@GxYGuwh?0oaXDG4mZTxv751U{U!x|{H7I@ zHqEb9__ff(d=rpQ^BQTaYMO6T_%_l6=`}6NnwIpMjSBqujVmB~_3-Fn7)(cJ`4Vi@ z8)dhvZ&r`*y;(PfZCHZUxb^51zb=OqSLOVa;T*1|${i~#F1Vea-vY3Zaz|qla%)!O zpWRx}PHdLjQ0S~ix5;LWq7+(1o&~AsdFeLE(_rjS`;1vdo&~AsdFeLElh<>~+N;R3 zAQe3?-6nZbgw$2!S&)jJmu{0h3vRN$&=@L6MbAsONuHEVSEinAUtru8q@w4g+jyQa z$dH?kaN7lCl^hubt~rUpPFc!GBt#e?atIMYh^#@x3>30*ZzPgC5JOEgp^KSt>l%auKu zrYY_W6(wZ`Ew1ItOMtmz?x#?M{}mp-!J!h7zj61iy5Rb(fH&9SHkfLQZ`~q>V=2`f z3tmzk*Gsa(xBLnun#kPCZ2~-Z7O2Gn9H7aPD8!_gK!kMqoLCa=KatXl0N_5C zr}9>X;_;0aE1tD8z{){%&=~S-r|y)zSwB&q_CKZgpSoZ2&-MRUpWb^;*?VrPDWLFy zkIk(2Irby(B60IIxG&|uv-ZvQiS~5MJ_UZ>eIIisZwEWam>eD2U7|(zzi~@VIBO@2 zphBFLrji8iNf~qcx&yaYnMg*}2RRsKm@{J7a>k;0*fMMb4BSt3X&8)c>Ma%8 zc+%F;p_3*4+Am;jP3?0+z6``p{mBXMCP`xb_d`G7#k&H8QF^b#~8_`UEfWT>kg^| z7HJ_E?~O`o8MzcF^#|d720k9i_{vfg+!x_hkQBTir260{bDwIL;Wi*XU;%RE8H)(o zkZ|l0frA7N5txE&grFtqfeQ=PKDdq@l{9hbWh_rOf;{=?2KrqA0Cyhx`y^QsE=qC= z-aI9NL{Y{|5_^f;8Yh^_C;Q+++i5A*zX@XT+OwErJw1?SB6Itp5$WQD$~K{lyhk-7 z({g8d4!nng`-|jZ73C@fxwlGZNK0;`{11>%{*fl6fg0e#=!JCAdZlRn_~~@hZl!7W zRM8V~v$wJ~UD>Www%?JbDz}WdXN&8`58SidubVDDHe&tAUbKjPk^SWO#XCvdvcmS& z2*h2r;lRBl-LX#L*Uj*2Ffg-`G~(gz{EIq_Ev~kw#X{w1BXUsKmSSM;4n7x$nq)TO z8CI=T2e!>d4o1;%^L*z-ZVmkk?}y79e47ym9F6G6rK{tgZm)LuG|3S6zq&Cv_ATJQ zzcn12zM}mmb8&i&XndU>lg80mc;S+faEDP!pz(5jt)}e1BR+i)W$gkS?BO^%y+$++ zPmfuU>r;6?od!(;&=155%K>y+G}(Y2^UIJFmg_;948hS3wGByOIjMjq0qD6uv>YJc zf>D;?`0{PQwVQqvRxgiZ=&Jx!J{8Ka?nD&s8zxgyKam{dpJ0?9BX|wtIgzg?fN@GG z`A^aJ&k($h;0}Tb1b>d;E&!E>m&$TD1aATQ1yTMs`V;>B24a7K;4cx}L+~a5c!Pv* zxK4q{0DRg>97m)$;5LE-k-vgyG`@@DZm5~29MRut5QXwHEyr|xp0 zi)YqUjs>m<=G^Ei(p?^Od6|0O?ZC~zTrs*zn2M^g$o0s4DY!oAVmnyQ^~qsV5%2&=R4JitzDllz59*ThTGjTmO>sta1p?ewe9%HWBb}BY%91a zEw@67oagFffDP4B8q~P*ya`ttP3bhV+8eyjO+O!>d@eDW=wEWl{(Al^|FwX}yY8G!-SXY@2Q zVUr%ymo1f!6`)BMGifEpg(c4r5%`fjPjDKd4~-5@w4~Rfg+iGglO}4?W6}gqdQ6(g zNkp>E0*R-pDU0! 
zVfF zJA4c4Gf&O09emGTtLbG-&}X_qoG`V+fD}HlD{z)&FfN@O3CB#1PG25Da1Q6|sc39oXDl?DG-gllEgJi$$D}iUTJ9~B8qsOe z7{A7(>5QJ1+a^kGghpzs9wHh{uRw3n7(Knj%dvP`>1Zr|xt^mje~$J%!VuC{rDuY6 z*r&Hn);4JF&-&?@7M4Ay%0Bv_t{4{moSKLkT&|e0RmP=WlQp*a*X76>$H8CQ+~e*; z&XXqaP%B5I_#1ToErOpQ_%VXl5WI!p!8$GZrx@=Nf*T0(1de;?N2HFwL+tMnyp5oc z{DEV6sF^N*EcwLQDdZkMqe~zw*ITVtp!m(zVa@WMYwRnXbOqeQiYY75UoD{iYGLOR zNd~XG=Y1dLJ2+T~lIjPD;qb`URNY5E1RHTUAZxqAd}1<#4#`=?)qS#q<#x|o*K&14 zj35z15XUf(h?x0Gu;@yoK{I@KtK@hp|1%)Jb$TNCXV3=uDj}}z`03Ma@;_k^RpV2$ zv}Fv15AF{hR^jvP^mDPvXXBIc!~$x1I#9*aujzgEn~2`0wX=}I2N%g)Lu6wR8lk7* zi=w73vWjXWiY~{lr&TU{{4J0xS1(~LQ6&e@4m{qBW|pIlWipKM^i&&;C^^XQ=wJ}( z@Zidf;CUy@6>AYOj=xpq;0t5@gPLO39PAFsSRwL>DAk4^I)l#+X%8|=!?FH^oPw{K zwZj#J#DEM}0%Rkf-sL?-b2H%A#Zf(3(>mp7d)c~ReVh%lH%b>6v=@e1 zCu<$KnD#U%o`wYm91GPQX4W}b-L}9WvT(%O$+C6CjmW}jlZ`E&tZMs|0h_4P)qdZZ zJu|!O0ozS1^$5G7~~mM|O<17RMLdHutj{}dnY9~wCsh;Z;Fw@ncIBfb2kVLsB& zdu(#{na;LuU+=5lPT5A19er<|?v!&d-Q-hVa!<9L_4RhY+SBRl?(C5_be=uh+1u-D z>+XK-sP9%g|JNx180{`KIr5-x=A$1-|ba9+LC>2l@VsM`1{vf)EvA za^CUNZ}f@+?+=Fp5kA(hOh<{6^JP|h;Qa=}lNA8!v+pxrBWHlv#2Fzrb0&x_oEhRQ z&H}L&sROjPLTux*A+}94Dk&n&c+!Z$-pSi22;x>ze5Q+Xht;C!a)m7Y0o&LS6`$Xj|p0}vNUyvrOOTd zsI9-`e$=sl%YO3cEVO6ZeAM~A~HjnZ` zG$xw^5wWRJ&WZFAgz`m0oSc<>fduDAG?gtek}KQD(--0+P?HZ2i6a88q?{E4w2ApR zIfoQL?12}%QtPz~7ck2n8|(-OeNbyByeTbp^ASjADXnrIgKUcNp$kGWv>8JHa=U-m zv12eA=63|6gOMHSXu6|+Sm^)~oZmMzA`V6)koFBF5s&wgVKoem$X1^(5DADr->6Fm z7#nD=6FbfZkb94LXklEn?(Ypm-lzm>d zSabk#a+b?nH|r+rrkZcASZt%t+m?4;(es0wC)EMorjdJU0ZIR zm^?Ancgs8Nz5C9hi;Sprd5zC$o&Mfp*+J|uefUa;Kd`R?MVK_kfe=F&M+wmB1|w=L z0l$(ERDqs;L6$-}Jbki^m__pekW`4I-xx;Bp=l1-Q&0sW`W0K%uQ>_~WE3PI%2Mca z%phtop&dl^+bHZh&TzpvW)ur)FL8soMoHQ#XCQSjfX zAdUz~A2}B{YcDT$@ZS&g^Rh7J-*gF9b1biIna5|=%u$N2r@=pqe1Te3~7%t!ya$xb>5>lCys1(!68WxOv%8bZzvL);njVs@)5YmN|BA*L*(g z!msm7Cw5O&&*#_7TWf?@U;t@b4+%)N-U&#+)`M6H9E%Ii888-A$pznFd+2V)QmBae zWU0O(ONA|ai~@=T0&a3)Gk!1-dJ9dEPC}AG)k5C@FZyC3-}&JS7hv6Dfl+?6R+qF- z7u+!0iKb?Crx+{=RwZzNe|69>D*C5vLwN zn5#N>Xge9`aqD2t{VT(a8K4x*J2S)$wA|a!>X>oNG-lF{r-*%$5F}~^eN&#IdN7rB zh#Qfzv_1NSe$`{9)sGtvz>BEzuV8_eM^kXfanw3;3|Zmz@{FUe(!t z1%G6K7jSnAh(0MC$wqW&e+W2;0)7Zg0uD>w;E(l3LDwD_jt<9&C@3H$1SFfR)FZmL zZ0SdFGbWojzvvfm*UCktS(1Gs8Wa1XNsebi)m=D*MJ%D{01Qet1tJ%s0t#Qi_0Ero zd`zf@7Xd{A*%}b}Frq6F3+3#XKRgr)`yzs{6|z0q3O_Ux4k{Z00Rh)SW_=)H5}|Yu zIw#!;yRl7Nl^DVq;gA}a+~gw^@QaGn=nM1Vs4z0>(M5Ehs`2)v@~T96qg38F+qh8PBH3NzT@&oYuI2KEJMScRv`IVKmK~)N71whnx+F&p z@XVccs|@S*jCXv{eWiP%0BWq1IuoV!Qfd7RzvL*m)|@D;k>Jl!GbPMaC$=4w;IH=J zTu`b#@zBWb=werj+)vDgvdZzUCl)5x_3`n9t3h%#EIBsbuwS=NoRJ*vB}ZApQ6V`h z5)N!BbqvdnEeXeN$+3Gi+q8MluNYHq>%)Aeu=-aeOm_LtthwXZZHL6V1n@=3~o_f*aQB)*srGaZGt< zE2jTIax?=8cDDctw(c44NZ2<^_RSOLC3^+TWNWil*O_osOOEO(F;Tlms@;>QZIj^7 z(RQ!n{)WFlE;&x#=jZ$0_{TxX@g_ESmc9!i~>TzF_9_J0J9F_J*WGy}J$r3(k(&Lk&$|r*=AJp1=n){{D zz^#>|DUKarQ%WF=WrVniV>nY86E_7BU^0S6oeQco1X55fEe5qsIF}TJUUR*ko4QPJ zJ(}srJkL-0AndEmK=%o9q7>@u@moZ32z0hAbB(=viSQk>qTCQBLzvGu{{z8OBpiYA zDLe)RXu?Q@3QQruYkWsIimx3Qbz+1IB6MMd9jCGng>m#nk&VbXz>xX@r6ceUY-cze z8x9M&8D%4wGQfa1)Bvg#zw1j;Xl35 z8_;a5jR5~LMZm;5zz@JX;r}pysjB*#DFuy7sA-{VcEfbfPb&Ye?#FeD&ZFbq%XX)R z->!tCN^(>s9CebT4&Xcqot2Vf>yjgxnGWAi5Wb(-8L$N$nmn|$q2QYFqcb<&y8hNg zeCFuPnfdaabM{1Qm(<#|xalNtk5^71Qo4HB@$QxK%Bcgh2C1rPp}aYj9i@&)cL*8d zQ&`N*w%JR!8zjfx^a1`F2l%LJeI)(}_Unu2I)KEFo(G9jjSwl=fgB(s0H&*Yj+PZC z=%$_aplT}6%BYg74!W)(vFe8>))loG9tb-YP2&e8@N;9B@p1!L{q%%|p?S&4B`Z-#nU3R(^ zyONGE}7#xU+y$5b$IP za#nvd6yp0uVLyI8fuGGBe|~sC5kpYk100o|sWQ46Vj0V6%e#FI{AycGW4vqP=RRs`2igL{q2K)S0Te zebRWVb-MMh_9d#CCHNx>Q!^?|)kI-(fx-wHp548Y!KU4y^-UFk&ICH-`t4;IH@7@t 
zjJbOr+8D>i58_wi6YWxd`C?_$Y)7KGLu&50cSdSHAyuCEGXKO-kd#5K~aYvt74+WdFKsOQNh%DnlK(tY!Sds&TQfX1S*B*5LHu^2V)G+^x`bXtq(RX^}RzByU`xZY|1RWTXp@ zs)sv3{&>>D6qclF?rE#Blre{u%3XJB6OA2GV@IYY+i|OVx_h=js!py>6Ru7btj-6o zU3m>iX)j;VwErcC6P5nWP{h53N?BjpU27<(Ik0zdSFTztFDw#)qW5g&^t%y>06J8N zhtVd3!q6d4RXJ%6sE?|8EUJQ%p6Us;bp)QZ)u>N_8DgNI_CoVpQz(H^tjW|x#3K@d z?qnk6p~Ak77y_-L9+VL@G3aGe23YDwFfg;=0R$d9V+LX-HgLui+NnZH5N(VZfir0( z!8Dy-6I3K6z%18g)F8Z{{1!V^6?$lY#5D0@dnP$nZ8o}sw;sz~gJq%!MO&lKHFlzY ztEj?e1Z7{)*68zvT5ZW_5?;T9R$E_6t81P&I&=DbAquRCQnWStBnFTk6DkNgXEcby zMn<3~9UPieAr@-u?8M7#@K8o&c!jb@p8h(Ti`Sw)6=UEcfEr{DLTSY+a~M*diZ!Jy za8*+s=h|W;k$y#lLRmiYD2%4c zYNW4p8O`eJ5ApuU@X#8b9?9Gfp{3aNXG&m6+e?$=YC?&L3OQ+JbTELRF=nPh{o!kBY+fH_7n%F~9az=WO9bf!>IeQW#l_8CE{Y?K_0&+h2g z*wMocXtv+7uGwC_tx$|sX);f8{(B|YUfmfPS)37@yCW%UZkJr!XZk*S`}W&&l~Vn| z1=pcP`{86GN?xM=AY{y)m+IRVThtW;5xc!?|6aS&wRIib7!Rb zBMUARC(~-Cj_85?@WVnN%flkZ<-U1f@<775U2<+;a5gCJK303eS|V9X64t6OtyLg^ zzB~P{WNmz4t$O+{-U{1@2xE;9r`e}eVJ6jUQlIg`S3GVRLC>!eE&OoD4vNnU)PYS7 zV^DhlXBFbot1@?WqyU>L80raVPciqDnlPTij0wQ2CekGZiIf`8=qvS38}BzsfL=i8 z%?uQvLBl@=VuKM%A!ucUxGKbu+ZWTAdCUS9eFM3DVXtmkgQBYaMQbjIsFxAq;P?|a z$1QP&ikom(e}XUzM8=Fj83VBmsX(q3((=XsZ*y0zXxeDknLEOIM$qO4jLXuauqzY? z1>|0%$vf*7n%DzAj7D%cB-l@wk#fO+LXx1vZ-mz|;|NB0&n+9c0eJ&>;PecKHRD7~ z@L~Z!7APFxBfJ2&38~ou$nj(;MuAgUlrXg(Orac>a+RC~uV5g+pco0VRrkZu&3Z-w zHT!R&X6!TwnqUMdDxKK+!ytMD!fo-*cPHPS=~I9*QGZl|zw)E^{8IVx`yGkyx25j4 z6W!+}_}g+G5zF~Y?FRPgOW7ravr5TYIkkJy>Y+EaDdkl7S1zmdZ*A$Fg`Kdt2{syf6OL-uEyY!PU-W=O(N!$?Ad&Vls#t$y&2$t^2n9HA(iS z+4ecxz0Uj1^RK`0Q~R4*haco#$^Ak8(-*p^0TP&AfyAEyP;=}Ut9dS7d4@i6?g+q& z0ZOSL3e6-#+<<}*_zNaA28MxhIv}N#OMxmd&yShFwF!U&TSz*uAwwYnjeIESjj>mY zOw84G6En_SVBr@5R&ZR(p`}2_GSdSEV2j5H0ywpql2>$Sa_c7_lMxQB3I12%R~*cr zvnLrgjc;8sudkY&<6%e)l+q&wLrB3wt{wrDTtf<>%z93raZ)aYP@r()J$amX&A@%E zI(hy37^R%XT{K^I4(@%+n7?3euz=0mfv5v$!TcNJJFEbU!Fa4G;xdA=BJ2{u7AGcW z`I9~{c;-D8#ad_;2Lmzi1B}T^3ZP0h1cS0EG9<G9Q-pj-0Rayoe(#yb?cbx(HB6o3T#pr{^*=WtGlK#qZg^-$YoX1Sz3WK{IP57*a|J%?{>J44M zO)89G^a@5Eh-77~7uOl<#s9#DsWCE4t3&`DtC`Ka~{UJtwj8XdV1T;gEs0<7*E6h!gkFid3$>V*6P+%KtKF+RMFji)ct$Iq}w;01= z&~Xu2gmVxnp5YDW;b&}cuMktb#fkE)errhn@J;cMsEh>8`_J>C21Pd~VA7Bd8syB$ zS?Fd5ZOTsii-`tQzLUS0fV3D0#T3^+L^k4fr+7evqQRs%paVNnywHgkhnx*!RX6eA zQ`CJzu5=R0)&t549TKiWW#o}DvG|D37#IkWSZ%z>|%yx*8|Si^4^ n8a;F|Hs^S8BD+k=E}M70vXH&~J|=&J2uc4Jp3g?r diff --git a/cuslines/cuda_python/__pycache__/cu_tractography.cpython-312.pyc b/cuslines/cuda_python/__pycache__/cu_tractography.cpython-312.pyc deleted file mode 100644 index 67a83039c23c63c11bb41d1469710e789f7c805e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9441 zcmcgSS!^3emessTHN}=hT@)ozr%fN!WyQAQ9F}Fvb{tPe_Rhr2W(k_zqDb?a?pD^~ zYOuy4AM(Hh+sFczkSwf^`EZZ`>i`2x=4-vc02^TU!yOT6p+|!;FvbG;Sb!6G5MY4? 
z_Ej~jn-W?ZJF}RQSoP|7udY|`sOo?B`+W#P`~SR?S?NIN4>&`EqXumM0)TsnM?8~5 zc_z-}ZE+ie^R}El?}$4HY|lCKuDH7fd*WWGx95CuAEb_4L!OPZd4JrWZ;UsRHqKli z&&9cXQ@n}fUAg9bOT2|aHYE76`{^Lely`r^=(%{S4b39n^FHFepV(?;ZASpaOU!r zw@zz;)Pj(@b_$xG7R90{0lntgxBhA-BMK=wQ!LC1vJA9ari(>+S0VJ%a!C;Q>7<-m zcuOpnlIf&e6tzacza)!7GM~#71SzGDS&zGFneER3xQ7H3XLuC1c@h5OgFGYHd7I$i z?eKHJ&j~*l{M>>w%hnUpVcxSxxp$9p-yY=+f}3aI=NH`bA-?f_XWYXF-bb*p9K=4p z3E~F68De&+CEBX_V2+o?Wa=8Mjg3!h-hA?UB3Vf11dWwb33)*jq=jOR*SwM}loC>= zENDSd$R%aiszgahUaPgyg857#Aql0V2zA9mjfj&^%E@`nmltF)lS<^1B`qM876eg9 zz*ftdlptwrtw7+@kaHH&rA0~eh=nwe6L`&sq2m0!B*>}8x}|diY=_@Aa^fwJYkY=K zw#Hp`ZPBt@rY8e^%257WMhQ%Tgzv!j?167k{SJKZ9{4c!=jz`l(}u{3fp*LZuM7#x z?kq=PGyQ8AGtAseO8_SF1m^*6s4F*ERi+G&mVH(kU46Sny|b>~m{(o@uDW`>1tiq1X+vFo$fBODt2g$z4&Pr_ zA2#q`HlSB1lzo*#X386@L#Z%GHgQkVmf2|vndv7rkYfI{5%%qfSx=$m#%nVOiDCIP zE#aNkEk8uOYq=3(_dgk{`VWkJhT>((XKUP3j~aZ8R0!pR;P+!nrQ5tZlDlLLv!Vd=`^TMN2V}e)Z9ejC3uZBRUYJ; zb}2n5YOGg#katVO)&PB&PZpACflpvkA{NImUC@Myl4+ax63JYym`cK?Mgy8d5_0q3 zG2NXzLAGGm5@=4s3KCwn$jb-3ixRPq0N|5}(vrMTEC5V6 zL@uVqWNBe3R$9_Li9`mLHIXQ{nCum!g?Kll{{$U=13inNwhpy5sCQj|N3?%d=Xc4KAD{Q)jOZPqj<&v*L&y4KOSG@Kgv7^e%iU!*138}4INTKhaUV; z35{&DjY5-#t%lY+S1X~zm4?_>cVxX;?VC{gCe*&;O5gF%`;@*D8{Mz0%>4Y~lkUFz zSJtkq%-p)T+1Y)+bFFiIZms9Rw9*lK*sru70b1;n<{%{QtXyeUp91aHTSHC9?bmn z`H#;(IPviBA1yvge-VG&`}RijmB;Lrr=G^!4L2Je2akV#uC!yXI$Ik=Xu-zzSm~4>2LW`JlSbE+A zp_;9)g)$tdQedF|68f0ILeFa)g(xGQfJPEVjEGntL1jWHfZ-2Z7u>;hwbIMDJ(dnP z(58pI-FUOH;_287w5&9UuS4mM5n_XhjPM3PWQ0(#JBZ8@l5r~%U=~=4>k6Q&S0Ty^Le?%Fj`F99b+>L z7qk9($F^gPhIvy*=l^SrF_R{RzSkH>DYlvZS?f4K=~x-JRNiG@qOHsnc;c9uqtJ`b zNuDxr4P%~`fGcD;Fkre{$ter7-}*$<`22N~Hih=!ZVRU%J412IbO$HCz$vLS{Lh-y zDD+89Zkzo19qVK`axb(_7N=5QCs-4!Q}un;#EQ-ylH-Nf$W zjlkh`W81H}BntC_D1viDydk)Fgi?30gs&i_vI}1{9v;)2I2|p!NpfiNa&(`FZy_|N z2v?ng?gkP64zhs z!+5TNe<~iw=(bc!YkgC9>)<-OC@#$jQgKmC30l~)_y_t7#3LBnLw!Bb1zF-#gUbh3 zvEn4I_Ti4-ByDt88!HRqZ;MAU)Q6)1+|pIJx+Dvd?);78V$4%D0_2umb1ftbJly`e zQur!C0%sgMp35^5vrL6BEBv3(zqp;O-YL2%O%+x5|`8Xi-^V-Nc` z!jl#DrA?N*ec|Q>mF-p7-kPg(Qi+`03U;Z%F(o**ZD;x>wh_~GL~%62Pkx5^GG?BEvHtaAGmZa?uaUsl2|e<4=FbI|!jQmm!xnE)XRW_=y)a5gfbwsu2r~?1NqZs`-MxVmyFW1oV z0;=q=!VcG#QAcmPclPet56-K>AtgAp&a1;mmEoi6@CgO}gC{VlS23yAFsav>XCCBl zx&6k?Hrwy(BdF(n*><+#V*Wmo4EE~0Y#6mB0b@!j?5i==YH3I`o`t`6Qmuy)}7 z$l8d)!903YZc^bUx4n+;qtB4Tf9x5ovfhEa!bWy>P`BQgc;Ie8C#!M?74D!;90=>z z2zWcJ$%iqI)yAw7SQi!U0NKz%g&Wk#sobc-jS`|dK{Yg?z&|(fTx16nZeV>{Uk17a zn7dT&u)-Y%X1nLP>-nIzI_raK|Af*%q4pnB;Ga9TM;H3Scq?o_J{Ge#XH|AUVWFR| z*nQP^7~uTs-!Q0)c;LE!iv=DHSeIkZb^B*uU2*g@{|4q8&S@N!|Kds3l& z(UkOQ;R}C{KKGrAVXkc!FYT^i?hNd|acfbxn^vB* zrKhZ~A{_%wLGLrSIvi-p7PVh2kIcaRwOCx*bvF&FDl#t$LajKGC#n^0sn0|MqM_f! 
zb1?Q@N?yc{b)=XJG~J(Q{iUe zic1X)DS@H&z0%_Z?1p3#pj~12Kh<^iq!ONd$p0$$S?1nXlN{YA8sf?lC_Bw}+sY8SCpt9j{_yMSLf669e6UZvqV z1zsYkiQ~kV^HRN)qXpAKL4b#FA)#ODlMC{4Z`}=~Rf7l6rF0M?av7<5!ip!XdWIFx z@cLrKbC~SIfD#x`1CvT%awBl`vF9jRIruK(YkWgRUVcav_2c{YLNYHT5}GfO$QSv= z9ERCM;wOvAoL(d1gR42`b47SW!Db@7o$&hWO%#)R4M#ILI)x)1M;VAdMx2>^|;{JYjpPPDsJj2zFN881kU@aMg<;A1-Ua5Q};T zt9}eMVkm$i4sp%5%Qwr_CY)D-#uS#!MPv`h3|*fLe*BBYeVgw_p*1h)pnfg zKwL|;6GI`?(pK%lP#Cp#Y~%5K^B!Y!2EXwfclH9<;%wbMU~?YRCxzz|ZP&ek*j4!I z1f+{ZUWW7+A!6qn6|vgKD!Pg6#quGRDvI!~g=6s|5Q9$|7W1X0+5+R(BA}>s(RY&= z#;LuV04!&6l5Xc?eN3LEwCEmf6FmLAC6OlIw@741^(V4@dm$M-O}?*?^cN;C5V$}d zvUI^ocC~~@@H&pjWxD5t&PK0_e+Oll;iL-?RqYJJ{2oPrhem&g+O}PYY56T`{MO-M xY=1-$ee1I`wm*3gbC!Yq_jRqbtKN{}4L$BUxZyo?!}YCe&c+=32H}|W_P-No7&-6`on{a>*qrTB3eQ$)=5_5uHRbY{zJX#HmCPj$=g@M9OFujTaPmEs z+1;gE(h|a`1}dQ!<|G?+;q=r5Mgbq2Q+@A+w2X?F3TTU@2i@E#ryh#FndM5dprn^R zU}xUU`0}ci24MJ`s~qo;IAS^5Hi`Xc5BjaklORx{r_FV z+JH7#<3MLfi^?%Iau3PFYA=ioN8`(-qBdSAEgR!k<`&Ynsb!0L-q5V^T*Y!3HLH}X_HxO9O*^k!W96#p19LV6 zfU){vxQE(-V12qR1?owc`y^@hx{4;%+~r{)_!k6M39b?0mR`!*#LK1&yVNP9vs78s zH6l9mg7~UigtVMfki?*FcF?&-=e3-<8iZ(M$xV0Dr(mCMVr%n4fbD!&sv3{j&{WE#w!o| zADwz|>baQM>5XppCR)9T&tKW>YTGXVbum`$_-0HFmez;~3)iw96uxdMg$5&AxBTXdCOEw}J4 z$lUcBwrNz14q&2s7>IEd9V_LTQ~=)|2Y{4QQ52w<5N6rER_muQ=m04PSKd2!S4RO# z3}`0pgM>hYJ6XM=nLmV;No&T6M?AV_z#Z##@MY01@02^7Th?;7lTdPo6U#gYGWuCM z40Y5NL+ja3s=xYp?c<%pHM#Wk^p|h_>D@2ieLnM1^QPL=ZnbVMZOtri#qwKSg{H5t zBTDO+etEZ%eI^dH!;u|-aQ()wWB2hh|4=*J*T`;Eo4&!nQ?9YNUTgY>oZ9YRI&eDW zS#9atPHVe@Yrv_v20sft$RRu2tdjuRr0~e#EWwKwHFXi~Rr7b0O4+;!dlT?k6zA|~ z!B7Klq#BbOSMQ&01!7G;#zIJ(A|cuWV~cr1g+{VicRF1sKs%JHgfq&-o1?vK5niJ$ zLcv5v+^1lS+IOCUF{N|!vPXdU_wQvQQT776qlxHT?B|_LWjgcWT#5)PJ%R)rJ!h|* z*)v|5Oe^WkJn1*}3#`tgO)vr*`Us33vXz*+F+P7XF2OHwI2&NZ7)D>eCMM(gjld!mgqo1>qO-cNsa=J#iRceXkFMtdZFEdN5_hR6317dcnI_QH#X z;!VD&1AOCGfulx|Wv! 
From 1c6dd28077bb17efb25b88cb8bedbd03a366e4ff Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 16:22:23 -0800 Subject: [PATCH 22/31] focus in on only cuda python --- CMakeLists.txt | 70 -- README.md | 58 +- build_cuslines.sh | 19 - cuslines/Makefile | 60 -- cuslines/cuda_python/cu_direction_getters.py | 75 ++- cuslines/cuda_python/cu_propagate_seeds.py | 22 +- cuslines/cuda_python/cu_tractography.py | 166 ++++- cuslines/cuslines.cpp | 360 ----------- cuslines/generate_streamlines_cuda.cu | 634 ------------------- cuslines/generate_streamlines_cuda.h | 70 -- cuslines/ptt.cu | 2 +- merge_trk.sh | 99 --- pyproject.toml | 3 + run_gpu_streamlines.py | 258 +++----- setup.py | 1 - 15 files changed, 329 insertions(+), 1568 deletions(-) delete mode 100644 CMakeLists.txt delete mode 100755 build_cuslines.sh delete mode 100644 cuslines/Makefile delete mode 100644 cuslines/cuslines.cpp delete mode 100644 cuslines/generate_streamlines_cuda.h delete mode 100755 merge_trk.sh diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index f27d490..0000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,70 +0,0 @@ -cmake_minimum_required(VERSION 3.24) - -project(cuslines LANGUAGES CUDA CXX VERSION 1.0) - -# Build settings -set(CMAKE_CXX_STANDARD 11) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -Werror=reorder") -set(CMAKE_CXX_FLAGS_RELEASE "-O3") - -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release") -endif() - -if (CMAKE_BUILD_TYPE STREQUAL "Debug" ) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}") -else() - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}") -endif() - -# CUDA -find_package(CUDAToolkit REQUIRED) - -# Set default CUDA compute capabilities if unset -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - include(FindCUDA/select_compute_arch.cmake) - cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS Auto) - set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_FLAGS}) -endif() -message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") - -# OpenMP -find_package(OpenMP) -if(OPENMP_FOUND) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - - # Set OMP runtime based on compiler - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(OMP_RUNTIME "INTEL") - elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - set(OMP_RUNTIME "GNU") - elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - set(OMP_RUNTIME "INTEL") - endif() - message(STATUS "OpenMP runtime used: ${OMP_RUNTIME}") -endif() - -# Find pybind11 -execute_process(COMMAND python -c "import pybind11; print(pybind11.get_cmake_dir())" - OUTPUT_VARIABLE pybind11_DIR - OUTPUT_STRIP_TRAILING_WHITESPACE) -list(APPEND CMAKE_PREFIX_PATH ${pybind11_DIR}) -find_package(pybind11 REQUIRED) - -# Build library and pybind11 module -add_library(cuslines_kernels) -target_sources(cuslines_kernels - PRIVATE - ${CMAKE_SOURCE_DIR}/cuslines/generate_streamlines_cuda.cu) -set_target_properties(cuslines_kernels PROPERTIES OUTPUT_NAME cuslines_kernels - POSITION_INDEPENDENT_CODE TRUE) - -pybind11_add_module(cuslines ${CMAKE_SOURCE_DIR}/cuslines/cuslines.cpp) -target_include_directories(cuslines PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" "${CUDAToolkit_INCLUDE_DIRS}") -target_link_libraries(cuslines PRIVATE cuslines_kernels CUDA::cudart_static) - -# Install -install(TARGETS cuslines cuslines_kernels LIBRARY DESTINATION .) 
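With the CMake build deleted above (and the Makefile and `build_cuslines.sh` deleted below), the package no longer ships a compiled `cuslines` extension. A short sketch of what that means for imports; the new module paths are taken from the diffs in this patch, while the old `import cuslines` form is inferred from the deleted pybind11 bindings:

```python
# Old entry point: the compiled pybind11 module built by CMake/Make (removed by this patch).
# import cuslines

# New entry point: the pure cuda-python package that remains.
from cuslines.cuda_python.cu_tractography import GPUTracker
from cuslines.cuda_python.cu_direction_getters import BootDirectionGetter
```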
diff --git a/README.md b/README.md
index 5cda7f5..a1e98fa 100644
--- a/README.md
+++ b/README.md
@@ -48,63 +48,7 @@ Destroy GPUTracker...
 
 Note that if you experience memory errors, you can adjust the `--chunk-size` flag.
 
-To run on more seeds, we suggest enabling the `--use-fast-write` flag in the GPU script to not get bottlenecked by writing files. Here is a comparison running on 500K seeds on 1 GPU with and without this flag:
-
-Without `--use-fast-write`:
-```
-$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1
-parsing arguments
-Fitting Tensor
-Computing anisotropy measures (FA,MD,RGB)
-slowadcodf
-Bootstrap direction getter
-streamline gen
-Creating GPUTracker with 1 GPUs...
-Generated 143891 streamlines from 100000 seeds, time: 7.978902339935303 s
-Saved streamlines to small.1_5.trk, time 11.439777851104736 s
-Generated 151932 streamlines from 100000 seeds, time: 10.155118703842163 s
-Saved streamlines to small.2_5.trk, time 12.438884019851685 s
-Generated 146971 streamlines from 100000 seeds, time: 9.822870016098022 s
-Saved streamlines to small.3_5.trk, time 12.377111673355103 s
-Generated 153824 streamlines from 100000 seeds, time: 11.133368968963623 s
-Saved streamlines to small.4_5.trk, time 13.317519187927246 s
-Generated 162004 streamlines from 100000 seeds, time: 13.19784665107727 s
-Saved streamlines to small.5_5.trk, time 14.21276593208313 s
-Completed processing 500000 seeds.
-Initialization time: 14.789637088775635 sec
-Streamline generation total time: 116.0746865272522 sec
- Streamline processing: 52.28810667991638 sec
- File writing: 63.7860586643219 sec
-Destroy GPUTracker...
-```
-
-With `--use-fast-write`:
-```
-$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1 --use-fast-write
-parsing arguments
-Fitting Tensor
-Computing anisotropy measures (FA,MD,RGB)
-slowadcodf
-Bootstrap direction getter
-streamline gen
-Creating GPUTracker with 1 GPUs...
-Generated 143891 streamlines from 100000 seeds, time: 7.962322473526001 s
-Saved streamlines to small.1_5_*.trk, time 0.1053612232208252 s
-Generated 151932 streamlines from 100000 seeds, time: 10.148677825927734 s
-Saved streamlines to small.2_5_*.trk, time 0.1606450080871582 s
-Generated 146971 streamlines from 100000 seeds, time: 9.811130285263062 s
-Saved streamlines to small.3_5_*.trk, time 0.571892499923706 s
-Generated 153824 streamlines from 100000 seeds, time: 11.186563968658447 s
-Saved streamlines to small.4_5_*.trk, time 0.3091111183166504 s
-Generated 162004 streamlines from 100000 seeds, time: 13.282683610916138 s
-Saved streamlines to small.5_5_*.trk, time 0.7107999324798584 s
-Completed processing 500000 seeds.
-Initialization time: 14.705361366271973 sec
-Streamline generation total time: 54.24975609779358 sec
- Streamline processing: 52.39137816429138 sec
- File writing: 1.8578097820281982 sec
-Destroy GPUTracker...
-```
+To run on more seeds, we suggest enabling the `--trx` flag in the GPU script so that file writing does not become a bottleneck (see the usage sketch after this hunk).
 
 ## Running on AWS with Docker
 First, set up an AWS instance with GPU and ssh into it (we recommend a P3 instance with at least 1 V100 16 GB GPU and a Deep Learning AMI Ubuntu 18.04 v 33.0.).
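For the `--trx` suggestion in the hunk above, a minimal invocation sketch: the other flags mirror the benchmark commands this patch removes, and the bare `--trx` spelling is taken from the new README line (treat it as an assumption if the script expects an argument):

```
$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1 --trx
```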
Then do the following: diff --git a/build_cuslines.sh b/build_cuslines.sh deleted file mode 100755 index 8375223..0000000 --- a/build_cuslines.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -build_dir=$(pwd)/build -install_dir=$(pwd)/install - -# set up build dir -mkdir -p ${build_dir} -cd ${build_dir} - -# configure -cmake -DCMAKE_INSTALL_PREFIX=${install_dir} \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_C_COMPILER=gcc \ - -DCMAKE_CXX_COMPILER=g++ \ - -DPYTHON_EXECUTABLE=$(which python) \ - .. - -# compile -make && make install diff --git a/cuslines/Makefile b/cuslines/Makefile deleted file mode 100644 index 8fd8528..0000000 --- a/cuslines/Makefile +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - -CUDA_HOME=/usr/local/cuda -CUDACC=$(CUDA_HOME)/bin/nvcc # -G -g -dopt=on -CXX=g++ -LD=g++ - -CXXFLAGS= -c -O3 -std=c++17 -fopenmp -fPIC `python3 -m pybind11 --includes` -I$(CUDA_HOME)/include - -SMS ?= 75 80 -CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM)) -LASTSM := $(lastword $(sort $(SMS))) -CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM) - -COMMON_FLAGS = -c -std=c++17 -Xcompiler -fPIC --use_fast_math -Xcompiler=-fopenmp $(CUDA_ARCH) -RELEASE_FLAGS = -O3 -Xptxas=-O3 -DEBUG_FLAGS = -O0 -Xptxas=-v -g -G -lineinfo -CUDACFLAGS = $(COMMON_FLAGS) $(RELEASE_FLAGS) - -LDFLAGS= -shared -fopenmp -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt - -all: cuslines - -cuslines: generate_streamlines_cuda.o cuslines.o - $(LD) cuslines.o generate_streamlines_cuda.o -o cuslines`python3-config --extension-suffix` $(LDFLAGS) - -%.o : %.cu - $(CUDACC) $(CUDACFLAGS) $< -o $@ - -%.o: %.cpp - $(CXX) $(CXXFLAGS) $< -o $@ - -clean: - rm *.o cuslines`python3-config --extension-suffix` __pycache__/*.pyc diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index 135cb47..9901fc3 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -5,6 +5,8 @@ from importlib.resources import files from time import time +from dipy.reconst import shm + from cuda.core import Device, LaunchConfig, Program, launch, ProgramOptions from cuda.pathfinder import find_nvidia_header_directory from cuda.cccl import get_include_paths @@ -103,7 +105,7 @@ class _BootCtx(ctypes.Structure): class BootDirectionGetter(GPUDirectionGetter): - def __init__( # TODO: Maybe accept a dipy thing and extract arrays here? maybe as a from_ function? + def __init__( self, model_type: str, min_signal: float, @@ -142,6 +144,77 @@ def __init__( # TODO: Maybe accept a dipy thing and extract arrays here? 
maybe self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},{model_type.upper()},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" self.compile_program() + @classmethod + def from_dipy_opdt(cls, gtab, sphere, + sh_order_max=6, + full_basis=False, + sh_lambda=0.006, + min_signal=1): + sampling_matrix, _, _ = shm.real_sh_descoteaux( + sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False + ) + + model = shm.OpdtModel( + gtab, sh_order_max=sh_order_max, smooth=sh_lambda, min_signal=min_signal + ) + fit_matrix = model._fit_matrix + delta_b, delta_q = fit_matrix + + b0s_mask = gtab.b0s_mask + dwi_mask = ~b0s_mask + x, y, z = model.gtab.gradients[dwi_mask].T + _, theta, phi = shm.cart2sphere(x, y, z) + B, _, _ = shm.real_sym_sh_basis(sh_order_max, theta, phi) + H = shm.hat(B) + R = shm.lcr_matrix(H) + + return cls( + model_type="OPDT", + min_signal=min_signal, + H=H, + R=R, + delta_b=delta_b, + delta_q=delta_q, + sampling_matrix=sampling_matrix, + b0s_mask=gtab.b0s_mask + ) + + @classmethod + def from_dipy_csa(cls, gtab, sphere, + sh_order_max=6, + full_basis=False, + sh_lambda=0.006, + min_signal=1): + sampling_matrix, _, _ = shm.real_sh_descoteaux( + sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False + ) + + model = shm.CsaOdfModel( + gtab, sh_order_max=sh_order_max, smooth=sh_lambda, min_signal=min_signal + ) + fit_matrix = model._fit_matrix + delta_b = fit_matrix + delta_q = fit_matrix + + b0s_mask = gtab.b0s_mask + dwi_mask = ~b0s_mask + x, y, z = model.gtab.gradients[dwi_mask].T + _, theta, phi = shm.cart2sphere(x, y, z) + B, _, _ = shm.real_sym_sh_basis(sh_order_max, theta, phi) + H = shm.hat(B) + R = shm.lcr_matrix(H) + + return cls( + model_type="CSA", + min_signal=min_signal, + H=H, + R=R, + delta_b=delta_b, + delta_q=delta_q, + sampling_matrix=sampling_matrix, + b0s_mask=gtab.b0s_mask + ) + def allocate_on_gpu(self, n): self.H_d.append( checkCudaErrors(runtime.cudaMalloc( diff --git a/cuslines/cuda_python/cu_propagate_seeds.py b/cuslines/cuda_python/cu_propagate_seeds.py index 92efef3..72037c6 100644 --- a/cuslines/cuda_python/cu_propagate_seeds.py +++ b/cuslines/cuda_python/cu_propagate_seeds.py @@ -78,7 +78,7 @@ def _allocate_seed_memory(self, seeds): self.shDirTemp0_d[ii] = checkCudaErrors(runtime.cudaMalloc( REAL3_DTYPE.itemsize * self.gpu_tracker.samplm_nr * grid[0] * block[1])) - def _cumsum_offsets(self): # TODO: do this on device? not crucial for performance now + def _cumsum_offsets(self): # TODO: performance: do this on device? 
not crucial for now
         for ii in range(self.ngpus):
             nseeds_gpu, _, _ = self._switch_device(ii)
             if (nseeds_gpu == 0):
@@ -173,7 +173,11 @@ def _cleanup(self):
         self.nSlines_old = self.nSlines
         self.gpu_tracker.rng_offset += self.nseeds
 
-    def propagate(self, seeds):  # TODO: better queuing/batching of seeds, if more performance needed
+    # TODO: performance: better queuing/batching of seeds,
+    # if more performance is needed,
+    # given the exponential nature of streamlines.
+    # May be better to do in CUDA code directly.
+    def propagate(self, seeds):
         self.nseeds = len(seeds)
         self.nseeds_per_gpu = (self.nseeds + self.gpu_tracker.ngpus - 1) // self.gpu_tracker.ngpus
 
@@ -202,13 +206,15 @@ def propagate(self, seeds):  # TODO: better queuing/batching of seeds, if more pe
 
         self._cleanup()
 
-    def as_array_sequence(self):
+    def get_buffer_size(self):
         buffer_size = 0
         for ii in range(self.ngpus):
             lens = self.sline_lens[ii]
             for jj in range(self.nSlines[ii]):
                 buffer_size += lens[jj] * 3 * REAL_SIZE
+        return buffer_size
 
+    def as_generator(self):
         def _yield_slines():
             for ii in range(self.ngpus):
                 this_sls = self.slines[ii]
@@ -220,11 +226,7 @@ def _yield_slines():
                 yield np.asarray(
                     this_sls[jj], dtype=REAL_DTYPE)[:npts]
+        return _yield_slines()
 
-        return ArraySequence(_yield_slines(), buffer_size // MEGABYTE)
-
-    def to_trx():
-        raise NotImplementedError("Export to TRX not yet implemented")
-
-    def to_trk():
-        raise NotImplementedError("Export to TRK not yet implemented")
+    def as_array_sequence(self):
+        return ArraySequence(self.as_generator(), self.get_buffer_size() // MEGABYTE)
diff --git a/cuslines/cuda_python/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py
index eca62dd..1d34adc 100644
--- a/cuslines/cuda_python/cu_tractography.py
+++ b/cuslines/cuda_python/cu_tractography.py
@@ -1,10 +1,11 @@
 from cuda.bindings import driver, runtime
 from cuda.bindings.runtime import cudaMemcpyKind
-import cuda.core as cc  # TODO: consider cuda core over cuda bindings
 
 import numpy as np
+from tqdm import tqdm
 
 import logging
+from math import radians
 
 from cuslines.cuda_python.cutils import (
     REAL_SIZE,
@@ -17,29 +18,81 @@
 )
 from cuslines.cuda_python.cu_propagate_seeds import SeedBatchPropagator
 
+from trx.trx_file_memmap import TrxFile
+
+from nibabel.streamlines.tractogram import Tractogram
+from nibabel.streamlines.array_sequence import ArraySequence, MEGABYTE
+
+from dipy.io.stateful_tractogram import Space, StatefulTractogram
 
 logger = logging.getLogger("GPUStreamlines")
 
+# TODO: performance:
+# ACT
+# SCIL streamline reduction onboard GPU
+# Remove small/long streamlines on GPU
 
-class GPUTracker:  # TODO: bring in pyAFQ prep stuff
+class GPUTracker:
     def __init__(
         self,
         dg: GPUDirectionGetter,
-        max_angle: float,
-        tc_threshold: float,
-        step_size: float,
-        relative_peak_thresh: float,
-        min_separation_angle: float,
-        dataf: np.ndarray,  # TODO: reasonable defaults for floats, reorganize order, better names, documentation
-        metric_map: np.ndarray,
+        dataf: np.ndarray,
+        stop_map: np.ndarray,
+        stop_threshold: float,
         sphere_vertices: np.ndarray,
         sphere_edges: np.ndarray,
+        max_angle: float = radians(60),
+        step_size: float = 0.5,
+        relative_peak_thresh: float = 0.25,
+        min_separation_angle: float = radians(45),
         ngpus: int = 1,
         rng_seed: int = 0,
         rng_offset: int = 0,
+        chunk_size: int = 100000,
     ):
+        """
+        Initialize GPUTracker with necessary data.
+
+        Parameters
+        ----------
+        dg : GPUDirectionGetter
+            Direction getter to use for tracking, from
+            cuslines.cuda_python.cu_direction_getters
+        dataf : np.ndarray
+            4D numpy array with ODFs for prob/ptt, or diffusion data if
+            doing bootstrapping.
+        stop_map : np.ndarray
+            3D numpy array with stopping metric (e.g., GFA, FA)
+        stop_threshold : float
+            Threshold for stopping metric (e.g., 0.2)
+        sphere_vertices : np.ndarray
+            Vertices of the sphere used for direction sampling.
+        sphere_edges : np.ndarray
+            Edges of the sphere used for direction sampling.
+        max_angle : float, optional
+            Maximum angle (in radians) between steps
+            default: radians(60)
+        step_size : float, optional
+            Step size for tracking
+            default: 0.5
+        relative_peak_thresh : float, optional
+            Relative peak threshold for direction selection
+            default: 0.25
+        min_separation_angle : float, optional
+            Minimum separation angle (in radians) between peaks
+            default: radians(45)
+        ngpus : int, optional
+            Number of GPUs to use
+            default: 1
+        rng_seed : int, optional
+            Seed for random number generator
+            default: 0
+        rng_offset : int, optional
+            Offset for random number generator
+            default: 0
+        chunk_size : int, optional
+            Number of seeds processed per GPU in each batch
+            default: 100000
+        """
         self.dataf = np.ascontiguousarray(dataf, dtype=REAL_DTYPE)
-        self.metric_map = np.ascontiguousarray(metric_map, dtype=REAL_DTYPE)
+        self.metric_map = np.ascontiguousarray(stop_map, dtype=REAL_DTYPE)
         self.sphere_vertices = np.ascontiguousarray(sphere_vertices, dtype=REAL_DTYPE)
         self.sphere_edges = np.ascontiguousarray(sphere_edges, dtype=np.int32)
@@ -53,7 +106,7 @@ def __init__(
 
         self.dg = dg
         self.max_angle = REAL_DTYPE(max_angle)
-        self.tc_threshold = REAL_DTYPE(tc_threshold)
+        self.tc_threshold = REAL_DTYPE(stop_threshold)
         self.step_size = REAL_DTYPE(step_size)
         self.relative_peak_thresh = REAL_DTYPE(relative_peak_thresh)
         self.min_separation_angle = REAL_DTYPE(min_separation_angle)
@@ -61,6 +114,7 @@ def __init__(
         self.ngpus = int(ngpus)
         self.rng_seed = int(rng_seed)
         self.rng_offset = int(rng_offset)
+        self.chunk_size = int(chunk_size)
 
         checkCudaErrors(driver.cuInit(0))
         avail = checkCudaErrors(runtime.cudaGetDeviceCount())
@@ -98,6 +152,7 @@ def _allocate(self):
 
         for ii in range(self.ngpus):
             checkCudaErrors(runtime.cudaSetDevice(ii))
+            # TODO: performance: dataf could be managed or texture memory instead?
self.dataf_d.append( checkCudaErrors(runtime.cudaMalloc( REAL_SIZE*self.dataf.size))) @@ -153,6 +208,89 @@ def __exit__(self, exc_type, exc, tb): checkCudaErrors(runtime.cudaStreamDestroy(self.streams[n])) return False - def generate_streamlines(self, seeds): - self.seed_propagator.propagate(seeds) - return self.seed_propagator.as_array_sequence() + def _divide_chunks(self, seeds): + global_chunk_sz = self.chunk_size * self.ngpus + nchunks = (seeds.shape[0] + global_chunk_sz - 1) // global_chunk_sz + return global_chunk_sz, nchunks + + def generate_sft(self, seeds, ref_img): + global_chunk_sz, nchunks = self._divide_chunks(seeds) + buffer_size = 0 + generators = [] + + with tqdm(total=seeds.shape[0]) as pbar: + for idx in range(nchunks): + self.seed_propagator.propagate( + seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz] + ) + buffer_size += self.seed_propagator.get_buffer_size() + generators.append(self.seed_propagator.as_generator()) + pbar.update( + seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz].shape[0] + ) + array_sequence = ArraySequence( + (item for gen in generators for item in gen), + buffer_size // MEGABYTE + ) + return StatefulTractogram(array_sequence, ref_img, Space.VOX) + + # TODO: performance: consider a way to just output in VOX space directly + def generate_trx(self, seeds, ref_img): + global_chunk_sz, nchunks = self._divide_chunks(seeds) + + # Will resize by a factor of 2 if these are exceeded + sl_len_guess = 100 + sl_per_seed_guess = 3 + n_sls_guess = sl_per_seed_guess * seeds.shape[0] + + # trx files use memory mapping + trx_file = TrxFile( + reference=ref_img, + nb_streamlines=n_sls_guess, + nb_vertices=n_sls_guess * sl_len_guess, + ) + trx_file.streamlines._offsets = trx_file.streamlines._offsets.astype(np.uint64) + offsets_idx = 0 + sls_data_idx = 0 + + with tqdm(total=seeds.shape[0]) as pbar: + for idx in range(int(nchunks)): + self.seed_propagator.propagate( + seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz] + ) + tractogram = Tractogram( + self.seed_propagator.as_array_sequence(), + affine_to_rasmm=ref_img.affine) + tractogram.to_world() + sls = tractogram.streamlines + + new_offsets_idx = offsets_idx + len(sls._offsets) + new_sls_data_idx = sls_data_idx + len(sls._data) + + if ( + new_offsets_idx > trx_file.header["NB_STREAMLINES"] + or new_sls_data_idx > trx_file.header["NB_VERTICES"] + ): + print("TRX resizing...") + trx_file.resize( + nb_streamlines=new_offsets_idx * 2, + nb_vertices=new_sls_data_idx * 2, + ) + + # TRX uses memmaps here + trx_file.streamlines._data[sls_data_idx:new_sls_data_idx] = sls._data + trx_file.streamlines._offsets[offsets_idx:new_offsets_idx] = ( + sls_data_idx + sls._offsets + ) + trx_file.streamlines._lengths[offsets_idx:new_offsets_idx] = ( + sls._lengths + ) + + offsets_idx = new_offsets_idx + sls_data_idx = new_sls_data_idx + pbar.update( + seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz].shape[0] + ) + trx_file.resize() + + return trx_file diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp deleted file mode 100644 index f0b8690..0000000 --- a/cuslines/cuslines.cpp +++ /dev/null @@ -1,360 +0,0 @@ -/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include - -#include -#include -#include -namespace py = pybind11; - -#include - -// #define USE_NVTX - -#include "globals.h" -#include "cudamacro.h" -#include "generate_streamlines_cuda.h" - -using np_array = py::array_t; -using np_array_int = py::array_t; - -using np_array_cast = py::array_t; -using np_array_int_cast = py::array_t; - -// Handle to cleanup returned host allocations when associated Python object is destroyed -template -py::capsule cleanup(T* ptr) { - return py::capsule(ptr, [](void *f) { - T *g = reinterpret_cast(f); - delete [] g; - }); -} - -class GPUTracker { - public: - GPUTracker(ModelType model_type, - REAL max_angle, - REAL min_signal, - REAL tc_threshold, - REAL step_size, - REAL relative_peak_thresh, - REAL min_separation_angle, - np_array_cast dataf, - np_array_cast H, - np_array_cast R, - np_array_cast delta_b, - np_array_cast delta_q, - np_array_int_cast b0s_mask, - np_array_cast metric_map, - np_array_cast sampling_matrix, - np_array_cast sphere_vertices, - np_array_int_cast sphere_edges, - int ngpus = 1, - int rng_seed = 0, - int rng_offset = 0) { - - // Get info structs from numpy objects - auto dataf_info = dataf.request(); - auto H_info = H.request(); - auto R_info = R.request(); - auto delta_b_info = delta_b.request(); - auto delta_q_info = delta_q.request(); - auto b0s_mask_info = b0s_mask.request(); - auto metric_map_info = metric_map.request(); - auto sampling_matrix_info = sampling_matrix.request(); - auto sphere_vertices_info = sphere_vertices.request(); - auto sphere_edges_info = sphere_edges.request(); - - dimx_ = dataf_info.shape[0]; - dimy_ = dataf_info.shape[1]; - dimz_ = dataf_info.shape[2]; - dimt_ = dataf_info.shape[3]; - nedges_ = sphere_edges_info.shape[0]; - - delta_nr_ = delta_b_info.shape[0]; - samplm_nr_ = sampling_matrix_info.shape[0]; - -// No longer needed -#if 0 - // Error checking for template parameters. - // TODO: Need to make kernel more general. 
- if (delta_b_info.shape[0] != 28 || - sampling_matrix_info.shape[0] != 181 || - dataf_info.shape[3] > 160) { - std::cout << delta_b_info.shape[0] << " " << sampling_matrix_info.shape[0] << " " << dataf_info.shape[3] << std::endl; - throw std::invalid_argument("Input data dimensions not currently supported."); - } -#endif - - // Get number of GPUs - int ngpus_avail; - CHECK_CUDA(cudaGetDeviceCount(&ngpus_avail)); - if (ngpus > ngpus_avail) { - throw std::runtime_error("Requested to use more GPUs than available on system."); - } - - std::cerr << "Creating GPUTracker with " << ngpus << " GPUs..." << std::endl; - ngpus_ = ngpus; - - model_type_ = model_type; - max_angle_ = max_angle; - min_signal_ = min_signal; - tc_threshold_ = tc_threshold; - step_size_ = step_size; - relative_peak_thresh_ = relative_peak_thresh, - min_separation_angle_ = min_separation_angle, - - // Allocate/copy constant problem data on GPUs - dataf_d.resize(ngpus_, nullptr); - H_d.resize(ngpus_, nullptr); - R_d.resize(ngpus_, nullptr); - delta_b_d.resize(ngpus_, nullptr); - delta_q_d.resize(ngpus_, nullptr); - b0s_mask_d.resize(ngpus_, nullptr); - metric_map_d.resize(ngpus_, nullptr); - sampling_matrix_d.resize(ngpus_, nullptr); - sphere_vertices_d.resize(ngpus_, nullptr); - sphere_edges_d.resize(ngpus_, nullptr); - - //#pragma omp parallel for - for (int n = 0; n < ngpus_; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size)); - CUDA_MEM_ADVISE(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n); - // CHECK_CUDA(cudaMemPrefetchAsync(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, n)); - CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size)); - CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size)); - CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size)); - CHECK_CUDA(cudaMalloc(&delta_q_d[n], sizeof(*delta_q_d[n]) * delta_q_info.size)); - CHECK_CUDA(cudaMalloc(&b0s_mask_d[n], sizeof(*b0s_mask_d[n]) * b0s_mask_info.size)); - CHECK_CUDA(cudaMalloc(&metric_map_d[n], sizeof(*metric_map_d[n]) * metric_map_info.size)); - CHECK_CUDA(cudaMalloc(&sampling_matrix_d[n], sizeof(*sampling_matrix_d[n]) * sampling_matrix_info.size)); - CHECK_CUDA(cudaMalloc(&sphere_vertices_d[n], sizeof(*sphere_vertices_d[n]) * sphere_vertices_info.size)); - CHECK_CUDA(cudaMalloc(&sphere_edges_d[n], sizeof(*sphere_edges_d[n]) * sphere_edges_info.size)); - - CHECK_CUDA(cudaMemcpy(dataf_d[n], dataf_info.ptr, sizeof(*dataf_d[n]) * dataf_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(H_d[n], H_info.ptr, sizeof(*H_d[n]) * H_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(R_d[n], R_info.ptr, sizeof(*R_d[n]) * R_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(delta_b_d[n], delta_b_info.ptr, sizeof(*delta_b_d[n]) * delta_b_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(delta_q_d[n], delta_q_info.ptr, sizeof(*delta_q_d[n]) * delta_q_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(b0s_mask_d[n], b0s_mask_info.ptr, sizeof(*b0s_mask_d[n]) * b0s_mask_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(metric_map_d[n], metric_map_info.ptr, sizeof(*metric_map_d[n]) * metric_map_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(sampling_matrix_d[n], sampling_matrix_info.ptr, sizeof(*sampling_matrix_d[n]) * sampling_matrix_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(sphere_vertices_d[n], sphere_vertices_info.ptr, 
sizeof(*sphere_vertices_d[n]) * sphere_vertices_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(sphere_edges_d[n], sphere_edges_info.ptr, sizeof(*sphere_edges_d[n]) * sphere_edges_info.size, cudaMemcpyHostToDevice)); - } - - rng_seed_ = rng_seed; - rng_offset_ = rng_offset; - nSlines_old_.resize(ngpus_, 0); - slines_.resize(ngpus_, nullptr); - slinesLen_.resize(ngpus_, nullptr); - - streams_.resize(ngpus_); - for (int n = 0; n < ngpus_; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaStreamCreateWithFlags(&streams_[n], cudaStreamNonBlocking)); - } - - } - - ~GPUTracker() { - std::cerr << "Destroy GPUTracker..." << std::endl; - for (int n = 0; n < ngpus_; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - if (dataf_d[n]) CHECK_CUDA(cudaFree(dataf_d[n])); - if (H_d[n]) CHECK_CUDA(cudaFree(H_d[n])); - if (R_d[n]) CHECK_CUDA(cudaFree(R_d[n])); - if (delta_b_d[n]) CHECK_CUDA(cudaFree(delta_b_d[n])); - if (delta_q_d[n]) CHECK_CUDA(cudaFree(delta_q_d[n])); - if (b0s_mask_d[n]) CHECK_CUDA(cudaFree(b0s_mask_d[n])); - if (metric_map_d[n]) CHECK_CUDA(cudaFree(metric_map_d[n])); - if (sampling_matrix_d[n]) CHECK_CUDA(cudaFree(sampling_matrix_d[n])); - if (sphere_vertices_d[n]) CHECK_CUDA(cudaFree(sphere_vertices_d[n])); - if (sphere_edges_d[n]) CHECK_CUDA(cudaFree(sphere_edges_d[n])); - - if (slines_[n]) CHECK_CUDA(cudaFreeHost(slines_[n])); - if (slinesLen_[n]) CHECK_CUDA(cudaFreeHost(slinesLen_[n])); - - CHECK_CUDA(cudaStreamDestroy(streams_[n])); - } - } - - std::vector> generate_streamlines(np_array seeds) { - - auto seeds_info = seeds.request(); - int nseeds = seeds_info.shape[0]; - - std::vector seeds_d(ngpus_, nullptr); - int nseeds_per_gpu = (nseeds + ngpus_ - 1) / ngpus_; - - //#pragma omp parallel for - for (int n = 0; n < ngpus_; ++n) { - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaMalloc(&seeds_d[n], sizeof(*seeds_d[n]) * 3 * nseeds_gpu)); - CHECK_CUDA(cudaMemcpy(seeds_d[n], reinterpret_cast(seeds_info.ptr) + 3*n*nseeds_per_gpu, sizeof(*seeds_d[n]) * 3 * nseeds_gpu, cudaMemcpyHostToDevice)); - } - - std::vector nSlines(ngpus_); - - // Call GPU routine - generate_streamlines_cuda_mgpu(model_type_, max_angle_, min_signal_, tc_threshold_, step_size_, - relative_peak_thresh_, min_separation_angle_, - nseeds, seeds_d, - dimx_, dimy_, dimz_, dimt_, - dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d, - sphere_vertices_d, sphere_edges_d, nedges_, - slines_, slinesLen_, nSlines, nSlines_old_, rng_seed_, rng_offset_, ngpus_, - streams_); - - nSlines_old_ = nSlines; //store number of slines for next set of seeds - - // Update rng_offset for next set of seeds - rng_offset_ += nseeds; - - int nSlines_total = 0; - for (int n = 0; n < ngpus_; ++n) { - CHECK_CUDA(cudaFree(seeds_d[n])); - nSlines_total += nSlines[n]; - } - - std::vector> slines_list; - slines_list.reserve(nSlines_total); - for (int n = 0; n < ngpus_; ++n) { - for (int i = 0; i < nSlines[n]; ++i) { - REAL* sl = new REAL[slinesLen_[n][i]*3]; - std::memcpy(sl, slines_[n] + i*3*2*MAX_SLINE_LEN, slinesLen_[n][i]*3*sizeof(*sl)); - auto sl_arr = py::array_t({slinesLen_[n][i], 3}, // shape - {3*sizeof(REAL), sizeof(REAL)}, // strides - sl, - cleanup(sl)); - slines_list.push_back(sl_arr); - } - } - - return slines_list; - - } - - void dump_streamlines(std::string output_prefix, std::string voxel_order, - np_array_int roi_shape, np_array voxel_size, np_array vox_to_ras) { - - auto 
roi_shape_info = roi_shape.request(); - auto voxel_size_info = voxel_size.request(); - auto vox_to_ras_info = vox_to_ras.request(); - - START_RANGE("filewrite", 0); - - //#pragma omp parallel for - for (int n = 0; n < ngpus_; ++n) { - std::stringstream ss; - ss << output_prefix << "_" << std::to_string(n) << ".trk"; - write_trk(ss.str().c_str(), reinterpret_cast(roi_shape_info.ptr), reinterpret_cast(voxel_size_info.ptr), - voxel_order.c_str(), reinterpret_cast(vox_to_ras_info.ptr), nSlines_old_[n], slinesLen_[n], - reinterpret_cast(slines_[n])); - } - - END_RANGE; - } - - private: - int ngpus_; - int rng_seed_; - int rng_offset_; - int dimx_, dimy_, dimz_, dimt_; - int nedges_; - int delta_nr_, samplm_nr_; - - ModelType model_type_; - REAL max_angle_; - REAL tc_threshold_; - REAL min_signal_; - REAL step_size_; - REAL relative_peak_thresh_; - REAL min_separation_angle_; - - std::vector nSlines_old_; - std::vector slines_; - std::vector slinesLen_; - - std::vector dataf_d; - std::vector H_d; - std::vector R_d; - std::vector delta_b_d; - std::vector delta_q_d; - std::vector b0s_mask_d; - std::vector metric_map_d; - std::vector sampling_matrix_d; - std::vector sphere_vertices_d; - std::vector sphere_edges_d; - - std::vector streams_; - -}; - - -PYBIND11_MODULE(cuslines, m) { - m.attr("MAX_SLINE_LEN") = py::int_(MAX_SLINE_LEN); - m.attr("REAL_SIZE") = py::int_(REAL_SIZE); - - py::enum_(m, "ModelType") - .value("OPDT", OPDT) - .value("CSA", CSA) - .value("PROB", PROB) - .value("PTT", PTT); - - py::class_(m, "GPUTracker") - .def(py::init(), - py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg("ngpus") = 1, py::arg("rng_seed") = 0, - py::arg("rng_offset") = 0) - - .def("generate_streamlines", &GPUTracker::generate_streamlines, - "Generates streamline for dipy test case.") - - .def("dump_streamlines", &GPUTracker::dump_streamlines, - "Dump streamlines to file."); -} - diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu index db3c0e2..9b04e60 100644 --- a/cuslines/generate_streamlines_cuda.cu +++ b/cuslines/generate_streamlines_cuda.cu @@ -26,28 +26,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// TODO: its possible all the cpp should be refactored -// out into a separate file, but for now, they are just wrapped -// in these ifndefs -#ifndef __NVRTC__ -#include -#include -#include -#include -#include -#endif - #include #include -#ifndef __NVRTC__ -#include -#include -#include -#include // Might not be needed anymore? 
-#include -#endif - #include "cudamacro.h" /* for time() */ #include "globals.h" @@ -1774,618 +1755,3 @@ __global__ void genStreamlinesMerge_k( } return; } - -#ifndef __NVRTC__ -void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size, - const REAL relative_peak_thresh, const REAL min_separation_angle, - const int nseeds, const std::vector &seeds_d, - const int dimx, const int dimy, const int dimz, const int dimt, - const std::vector &dataf_d, const std::vector &H_d, const std::vector &R_d, - const int delta_nr, - const std::vector &delta_b_d, const std::vector &delta_q_d, - const std::vector &b0s_mask_d, const std::vector &metric_map_d, - const int samplm_nr, - const std::vector &sampling_matrix_d, - const std::vector &sphere_vertices_d, const std::vector &sphere_edges_d, const int nedges, - std::vector &slines_h, std::vector &slinesLen_h, std::vector &nSlines_h, - const std::vector nSlines_old_h, const int rng_seed, const int rng_offset, - const int ngpus, const std::vector &streams) { - - int nseeds_per_gpu = (nseeds + ngpus - 1) / ngpus; - - std::vector slinesOffs_d(ngpus, nullptr); - std::vector shDirTemp0_d(ngpus, nullptr); - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); - dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); - - CHECK_CUDA(cudaMalloc(&slinesOffs_d[n], sizeof(*slinesOffs_d[n])*(nseeds_gpu+1))); - CHECK_CUDA(cudaMalloc(&shDirTemp0_d[n], sizeof(*shDirTemp0_d[n])*samplm_nr*grid.x*block.y)); - } - - int n32dimt = ((dimt+31)/32)*32; - - size_t shSizeGNS; - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - if (nseeds_gpu == 0) continue; - dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); - dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); - - // Precompute number of streamlines before allocating memory - if (!((model_type == PTT) || (model_type == PROB))) { - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d - sizeof(int)*samplm_nr; // for peak_directions_d - getNumStreamlinesBoot_k - <<>>( - model_type, - max_angle, - min_signal, - relative_peak_thresh, - min_separation_angle, - rng_seed, - nseeds_gpu, - reinterpret_cast(seeds_d[n]), - dimx, - dimy, - dimz, - dimt, - dataf_d[n], - H_d[n], - R_d[n], - delta_nr, - delta_b_d[n], - delta_q_d[n], - b0s_mask_d[n], - samplm_nr, - sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), - reinterpret_cast(sphere_edges_d[n]), - nedges, - shDirTemp0_d[n], - slinesOffs_d[n]); - } else { - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt; - getNumStreamlinesProb_k - <<>>( - max_angle, - relative_peak_thresh, - min_separation_angle, - rng_seed, - nseeds_gpu, - reinterpret_cast(seeds_d[n]), - dimx, - dimy, - dimz, - dimt, - dataf_d[n], - reinterpret_cast(sphere_vertices_d[n]), - reinterpret_cast(sphere_edges_d[n]), - nedges, - shDirTemp0_d[n], - slinesOffs_d[n]); - } - } - - std::vector slinesOffs_h; - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - //std::vector slinesOffs_h; - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - if (nseeds_gpu == 0) { - nSlines_h[n] = 0; - 
continue; - } - slinesOffs_h.resize(nseeds_gpu+1); - CHECK_CUDA(cudaMemcpy(slinesOffs_h.data(), slinesOffs_d[n], sizeof(*slinesOffs_h.data())*(nseeds_gpu+1), cudaMemcpyDeviceToHost)); - - int __pval = slinesOffs_h[0]; - slinesOffs_h[0] = 0; - for(int i = 1; i < nseeds_gpu+1; i++) { - const int __cval = slinesOffs_h[i]; - slinesOffs_h[i] = slinesOffs_h[i-1] + __pval; - __pval = __cval; - } - nSlines_h[n] = slinesOffs_h[nseeds_gpu]; - CHECK_CUDA(cudaMemcpy(slinesOffs_d[n], slinesOffs_h.data(), sizeof(*slinesOffs_d[n])*(nseeds_gpu+1), cudaMemcpyHostToDevice)); - } - - std::vector slineSeed_d(ngpus, nullptr); - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - - CHECK_CUDA(cudaMalloc(&slineSeed_d[n], sizeof(*slineSeed_d[n])*nSlines_h[n])); - CHECK_CUDA(cudaMemset(slineSeed_d[n], -1, sizeof(*slineSeed_d[n])*nSlines_h[n])); - - // Allocate/reallocate output arrays if necessary - if (nSlines_h[n] > EXCESS_ALLOC_FACT*nSlines_old_h[n]) { - if(slines_h[n]) cudaFreeHost(slines_h[n]); - if(slinesLen_h[n]) cudaFreeHost(slinesLen_h[n]); - slines_h[n] = nullptr; - slinesLen_h[n] = nullptr; - } - -#ifdef DEBUG - printf("buffer size %zu\n", sizeof(*slines_h[n])*EXCESS_ALLOC_FACT*2*3*MAX_SLINE_LEN*nSlines_h[n]); -#endif - - if (!slines_h[n]) CHECK_CUDA(cudaMallocHost(&slines_h[n], sizeof(*slines_h[n])*EXCESS_ALLOC_FACT*2*3*MAX_SLINE_LEN*nSlines_h[n])); - if (!slinesLen_h[n]) CHECK_CUDA(cudaMallocHost(&slinesLen_h[n], sizeof(*slinesLen_h[n])*EXCESS_ALLOC_FACT*nSlines_h[n])); - } - - //if (nSlines_h) { - - std::vector slineLen_d(ngpus, nullptr); - std::vector sline_d(ngpus, nullptr); - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaMalloc(&slineLen_d[n], sizeof(*slineLen_d[n])*nSlines_h[n])); - - CHECK_CUDA(cudaMalloc(&sline_d[n], sizeof(*sline_d[n])*2*MAX_SLINE_LEN*nSlines_h[n])); - -#if 0 - size_t free_mem, total_mem; - CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem)); - std::cerr << "GPU " << n << ": "; - std::cerr << "GPU Memory Usage before genStreamlinesMerge_k: "; - std::cerr << (total_mem-free_mem)/(1024*1024) << " MiB used, "; - std::cerr << total_mem/(1024*1024) << " MiB total "; - std::cerr << std::endl; -#endif - } - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - if (nseeds_gpu == 0) continue; - dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); - dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); -#if 0 - std::cerr << "GPU " << n << ": "; - std::cerr << "Generating " << nSlines_h[n] << " streamlines (from " << nseeds_gpu << " seeds)" << std::endl; -#endif - - //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y); - switch(model_type) { // TODO: these may be better as separate functions, not as template specializations - case OPDT: - case CSA: - BootCtx* d_ctx; - BootCtx h_ctx; - h_ctx.min_signal = min_signal; - h_ctx.delta_nr = delta_nr; - h_ctx.H = H_d[n]; - h_ctx.R = R_d[n]; - h_ctx.delta_b = delta_b_d[n]; - h_ctx.delta_q = delta_q_d[n]; - h_ctx.sampling_matrix = sampling_matrix_d[n]; - h_ctx.b0s_mask = b0s_mask_d[n]; - CHECK_CUDA(cudaMalloc(&d_ctx, sizeof(BootCtx))); - CHECK_CUDA(cudaMemcpyAsync( - d_ctx, &h_ctx, sizeof(BootCtx), - cudaMemcpyHostToDevice, streams[n])); - - if (model_type == OPDT) { - genStreamlinesMerge_k <<>>( - 
max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], - metric_map_d[n], d_ctx, samplm_nr, - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - } else if (model_type == CSA) { - genStreamlinesMerge_k <<>>( - max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], - metric_map_d[n], d_ctx, samplm_nr, - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - } else { - // Should never reach here - } - - CHECK_CUDA(cudaFree(d_ctx)); - break; - - case PROB: - // Shared memory requirements are smaller for probabilistic for main run - // than for preliminary run - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt; - genStreamlinesMerge_k <<>>( - max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], - metric_map_d[n], nullptr, samplm_nr, - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - case PTT: - shSizeGNS = 0; // PTT uses exclusively static shared memory - genStreamlinesMerge_k <<>>( - max_angle, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], - metric_map_d[n], nullptr, samplm_nr, - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - default: - printf("FATAL: Invalid Model Type.\n"); - break; - } - - CHECK_ERROR("genStreamlinesMerge_k"); - } - - //CHECK_CUDA(cudaDeviceSynchronize()); - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaMemcpyAsync(slines_h[n], - reinterpret_cast(sline_d[n]), - sizeof(*slines_h[n])*2*MAX_SLINE_LEN*nSlines_h[n]*3, - cudaMemcpyDeviceToHost, streams[n])); - CHECK_CUDA(cudaMemcpyAsync(slinesLen_h[n], - slineLen_d[n], - sizeof(*slinesLen_h[n])*nSlines_h[n], - cudaMemcpyDeviceToHost, streams[n])); - - } - //}; - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaStreamSynchronize(streams[n])); - CHECK_CUDA(cudaFree(slineSeed_d[n])); - CHECK_CUDA(cudaFree(slinesOffs_d[n])); - CHECK_CUDA(cudaFree(shDirTemp0_d[n])); - CHECK_CUDA(cudaFree(slineLen_d[n])); - CHECK_CUDA(cudaFree(sline_d[n])); - } - -} - -#if 1 -void write_trk(const char *fname, - const /*short*/ int *dims, - const REAL *voxel_size, - const char *voxel_order, - const REAL *vox_to_ras, - const int nsline, - const int *slineLen, - const REAL3 *sline) { - - FILE *fp = fopen(fname, "w"); - if (!fp) { - fprintf(stderr, "Cannot open file %s for writing...\n", fname); - exit(EXIT_FAILURE); - } - - const char ID_STRING[6] = "TRACK"; - short DIM[3] = {1, 1, 1}; - float VOXEL_SIZE[3] = {1.0f, 1.0f, 1.0f}; - float VOX_TO_RAS[4][4] = 
{{1.0f, 0.0f, 0.0, 0.0f}, - {0.0f, 1.0f, 0.0, 0.0f}, - {0.0f, 0.0f, 1.0, 0.0f}, - {0.0f, 0.0f, 0.0, 1.0f}}; - //const char VOXEL_ORDER[2][4] = {"RAS", "LAS"}; - const float ORIGIN[3] = {0.0f, 0.0f, 0.0f}; - const float IMAGE_ORIENTATION_PATIENT[6] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - const int VERSION = 2; - const int HDR_SIZE = 1000; - - // write header - unsigned char header[1000]; - memset(&header[0], 0, sizeof(header)); - - long long int off = 0; - - memcpy(header, ID_STRING, sizeof(ID_STRING)); - off += sizeof(ID_STRING); - - if (dims) { - DIM[0] = dims[0]; - DIM[1] = dims[1]; - DIM[2] = dims[2]; - } - memcpy(header+off, DIM, sizeof(DIM)); - off += sizeof(DIM); - - if (voxel_size) { - VOXEL_SIZE[0] = (float)voxel_size[0]; - VOXEL_SIZE[1] = (float)voxel_size[1]; - VOXEL_SIZE[2] = (float)voxel_size[2]; - } - memcpy(header+off, VOXEL_SIZE, sizeof(VOXEL_SIZE)); - off += sizeof(VOXEL_SIZE); - - memcpy(header+off, ORIGIN, sizeof(ORIGIN)); - off += sizeof(ORIGIN); - - // skip n_scalaer(2b) + scalar_name(200b) + - // n_properties(2b) + property_name(200b) - off += 404; - - if (vox_to_ras) { - for(int i = 0; i < 4; i++) { - for(int j = 0; j < 4; j++) { - VOX_TO_RAS[i][j] = (float)vox_to_ras[i*4+j]; - } - } - } - memcpy(header+off, VOX_TO_RAS, sizeof(VOX_TO_RAS)); - off += sizeof(VOX_TO_RAS); - - // skip reserved(444b) - off += 444; - - if (voxel_order) { - memcpy(header+off, voxel_order, 4); - } else { - memcpy(header+off, "LAS", 4); - } - off += 4; //sizeof(VOXEL_ORDER[voxel_order]); - - // skip pad2(4b) - off += 4; - - memcpy(header+off, IMAGE_ORIENTATION_PATIENT, sizeof(IMAGE_ORIENTATION_PATIENT)); - off += sizeof(IMAGE_ORIENTATION_PATIENT); - - // skip pad1(2b) - off += 2; - - // skip invert_x(1b), invert_y(1b), invert_x(1b), swap_xy(1b), swap_yz(1b), swap_zx(1b) - off += 6; - - memcpy(header+off, &nsline, sizeof(int)); - off += sizeof(int); - - memcpy(header+off, &VERSION, sizeof(VERSION)); - off += sizeof(VERSION); - - memcpy(header+off, &HDR_SIZE, sizeof(HDR_SIZE)); - off += sizeof(HDR_SIZE); - - //assert(off == 1000); - if (off != 1000) { - fprintf(stderr, "%s:%s:%d: heder size = %lld, (!= 1000)!\n", __FILE__, __func__, __LINE__, off); - exit(EXIT_FAILURE); - } - - size_t nw = fwrite(header, sizeof(header), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } -#if 0 - // write body - long long maxSlineLen = slineLen[0]; - for(long long i = 1; i < nsline; i++) { - maxSlineLen = MAX(maxSlineLen, slineLen[i]); - } - - float *slineData = (float *)Malloc((1+3*maxSlineLen)*sizeof(*slineData)); -#else - float slineData[1 + 3*(2*MAX_SLINE_LEN)]; -#endif - for(int i = 0; i < nsline; i++) { - reinterpret_cast(slineData)[0] = slineLen[i]; - for(int j = 0; j < slineLen[i]; j++) { - slineData[1+3*j+0] = (float)((sline[i*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]); - slineData[1+3*j+1] = (float)((sline[i*2*MAX_SLINE_LEN + j].y+0.5)*VOXEL_SIZE[1]); - slineData[1+3*j+2] = (float)((sline[i*2*MAX_SLINE_LEN + j].z+0.5)*VOXEL_SIZE[2]); - } - nw = fwrite(slineData, (1+3*slineLen[i])*sizeof(*slineData), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } - } -#if 0 - free(slineData); -#endif - fclose(fp); - - return; -} -#else -void write_trk(const int num_threads, - const char *fname, - const /*short*/ int *dims, - const REAL *voxel_size, - const char *voxel_order, - const REAL *vox_to_ras, - const int nsline, - const int *slineLen, - const REAL3 *sline) { - - FILE *fp = fopen(fname, "w"); - if (!fp) 
{ - fprintf(stderr, "Cannot open file %s for writing...\n", fname); - exit(EXIT_FAILURE); - } - - const char ID_STRING[6] = "TRACK"; - short DIM[3] = {1, 1, 1}; - float VOXEL_SIZE[3] = {1.0f, 1.0f, 1.0f}; - float VOX_TO_RAS[4][4] = {{1.0f, 0.0f, 0.0, 0.0f}, - {0.0f, 1.0f, 0.0, 0.0f}, - {0.0f, 0.0f, 1.0, 0.0f}, - {0.0f, 0.0f, 0.0, 1.0f}}; - //const char VOXEL_ORDER[2][4] = {"RAS", "LAS"}; - const float ORIGIN[3] = {0.0f, 0.0f, 0.0f}; - const float IMAGE_ORIENTATION_PATIENT[6] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - const int VERSION = 2; - const int HDR_SIZE = 1000; - - // write header - unsigned char header[1000]; - memset(&header[0], 0, sizeof(header)); - - long long int off = 0; - - memcpy(header, ID_STRING, sizeof(ID_STRING)); - off += sizeof(ID_STRING); - - if (dims) { - DIM[0] = dims[0]; - DIM[1] = dims[1]; - DIM[2] = dims[2]; - } - memcpy(header+off, DIM, sizeof(DIM)); - off += sizeof(DIM); - - if (voxel_size) { - VOXEL_SIZE[0] = (float)voxel_size[0]; - VOXEL_SIZE[1] = (float)voxel_size[1]; - VOXEL_SIZE[2] = (float)voxel_size[2]; - } - memcpy(header+off, VOXEL_SIZE, sizeof(VOXEL_SIZE)); - off += sizeof(VOXEL_SIZE); - - memcpy(header+off, ORIGIN, sizeof(ORIGIN)); - off += sizeof(ORIGIN); - - // skip n_scalaer(2b) + scalar_name(200b) + - // n_properties(2b) + property_name(200b) - off += 404; - - if (vox_to_ras) { - for(int i = 0; i < 4; i++) { - for(int j = 0; j < 4; j++) { - VOX_TO_RAS[i][j] = (float)vox_to_ras[i*4+j]; - } - } - } - memcpy(header+off, VOX_TO_RAS, sizeof(VOX_TO_RAS)); - off += sizeof(VOX_TO_RAS); - - // skip reserved(444b) - off += 444; - - if (voxel_order) { - memcpy(header+off, voxel_order, 4); - } else { - memcpy(header+off, "LAS", 4); - } - off += 4; //sizeof(VOXEL_ORDER[voxel_order]); - - // skip pad2(4b) - off += 4; - - memcpy(header+off, IMAGE_ORIENTATION_PATIENT, sizeof(IMAGE_ORIENTATION_PATIENT)); - off += sizeof(IMAGE_ORIENTATION_PATIENT); - - // skip pad1(2b) - off += 2; - - // skip invert_x(1b), invert_y(1b), invert_x(1b), swap_xy(1b), swap_yz(1b), swap_zx(1b) - off += 6; - - memcpy(header+off, &nsline, sizeof(int)); - off += sizeof(int); - - memcpy(header+off, &VERSION, sizeof(VERSION)); - off += sizeof(VERSION); - - memcpy(header+off, &HDR_SIZE, sizeof(HDR_SIZE)); - off += sizeof(HDR_SIZE); - - //assert(off == 1000); - if (off != 1000) { - fprintf(stderr, "%s:%s:%d: heder size = %lld, (!= 1000)!\n", __FILE__, __func__, __LINE__, off); - exit(EXIT_FAILURE); - } - - size_t nw = fwrite(header, sizeof(header), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } - - // write body - long long maxSlineLen = slineLen[0]; - for(long long i = 1; i < nsline; i++) { - maxSlineLen = MAX(maxSlineLen, slineLen[i]); - } - - //omp_set_dynamic(0); - const int NTHREADS = num_threads > 0 ? 
num_threads : 1; - omp_set_num_threads(NTHREADS); - - const int NFLTS_PER_TH = 1 + 2*(3*MAX_SLINE_LEN); - float *slineData = (float *)Malloc(NFLTS_PER_TH*NTHREADS*sizeof(*slineData)); - - #pragma omp parallel - { - const int tid = omp_get_thread_num(); - float *__mySlineData = slineData+tid*NFLTS_PER_TH; -#if 1 - //#pragma omp for schedule(static) - for(int i = 0; i < nsline; i += NTHREADS) { - if (i+tid < nsline) { - reinterpret_cast(__mySlineData)[0] = slineLen[i+tid]; - for(int j = 0; j < slineLen[i+tid]; j++) { - __mySlineData[1+3*j+0] = (float)((sline[(i+tid)*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]); - __mySlineData[1+3*j+1] = (float)((sline[(i+tid)*2*MAX_SLINE_LEN + j].y+0.5)*VOXEL_SIZE[1]); - __mySlineData[1+3*j+2] = (float)((sline[(i+tid)*2*MAX_SLINE_LEN + j].z+0.5)*VOXEL_SIZE[2]); - } - } - #pragma omp barrier - if (tid == 0) { - for(int j = 0; j < NTHREADS; j++) { - if (i+j >= nsline) { - break; - } - nw = fwrite(slineData+j*NFLTS_PER_TH, (1+3*slineLen[i+j])*sizeof(*slineData), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } - } - } - #pragma omp barrier - } -#else - // streamlines are not required to be in any specific order inside the trk file... - #pragma omp for - for(int i = 0; i < nsline; i++) { - reinterpret_cast(__mySlineData)[0] = slineLen[i]; - for(int j = 0; j < slineLen[i]; j++) { - __mySlineData[1+3*j+0] = (float)((sline[i*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]); - __mySlineData[1+3*j+1] = (float)((sline[i*2*MAX_SLINE_LEN + j].y+0.5)*VOXEL_SIZE[1]); - __mySlineData[1+3*j+2] = (float)((sline[i*2*MAX_SLINE_LEN + j].z+0.5)*VOXEL_SIZE[2]); - } - nw = fwrite(__mySlineData, (1+3*slineLen[i])*sizeof(*__mySlineData), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } - } -#endif - } - free(slineData); - fclose(fp); - - return; -} -#endif -#endif // __NVRTC__ diff --git a/cuslines/generate_streamlines_cuda.h b/cuslines/generate_streamlines_cuda.h deleted file mode 100644 index 14105ce..0000000 --- a/cuslines/generate_streamlines_cuda.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __GENERATE_STREAMLINES_CUDA_H__ -#define __GENERATE_STREAMLINES_CUDA_H__ - -#include - -#include "globals.h" - -void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size, - const REAL relative_peak_thresh, const REAL min_separation_angle, - const int nseeds, const std::vector &seeds_d, - const int dimx, const int dimy, const int dimz, const int dimt, - const std::vector &dataf_d, const std::vector &H_d, const std::vector &R_d, - const int delta_nr, - const std::vector &delta_b_d, const std::vector &delta_q_d, - const std::vector &b0s_mask_d, const std::vector &metric_map_d, - const int samplm_nr, - const std::vector &sampling_matrix_d, - const std::vector &sphere_vertices_d, const std::vector &sphere_edges_d, const int nedges, - std::vector &slines_h, std::vector &slinesLen_h, std::vector &nSlines_h, - const std::vector nSlines_old_h, const int rng_seed, const int rng_offset, - const int ngpus, const std::vector &streams); -#if 1 -void write_trk(const char *fname, - const /*short*/ int *dims, - const REAL *voxel_size, - const char *voxel_order, - const REAL *vox_to_ras, - const int nsline, - const int *slineLen, - const REAL3 *sline); -#else -void write_trk(const int num_threads, - const char *fname, - const /*short*/ int *dims, - const REAL *voxel_size, - const char *voxel_order, - const REAL *vox_to_ras, - const int nsline, - const int *slineLen, - const REAL3 *sline); -#endif -#endif diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu index 894d0bf..5684272 100644 --- a/cuslines/ptt.cu +++ b/cuslines/ptt.cu @@ -295,7 +295,7 @@ __device__ int get_direction_ptt_d( REAL3_T *__probing_pos_sh = probing_pos_sh + tidy; const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); - const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / step_size; + const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / (step_size / PROBE_FRAC); // This seems to work well const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d(dimt, pmf, REAL_MIN); #if 0 diff --git a/merge_trk.sh b/merge_trk.sh deleted file mode 100755 index c47412f..0000000 --- a/merge_trk.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. 
Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -usage() { - echo "$(basename $0) [ -v ] -o trk_outfile ..." -} - -if [ $# -eq 0 ]; then - usage - exit 1 -fi - -OUT_FILE="" -VERBOSE="" - -OPTS=$(getopt -o "vho:" -- "$@") -eval set -- "$OPTS" - -while true; do - case "$1" in - -o) - OUT_FILE=$2 - shift - shift - ;; - -v) - VERBOSE="1" - shift - ;; - -h) - usage - exit 1 - ;; - --) - shift - break - ;; - esac -done - -if [ -z $OUT_FILE ]; then - echo "Please provide an output file name with the -o option!" - exit 1 -fi - -# necessary when running via docker to expand again -# the parameter list turning spaces into separators -set -- $* - -TRK_FILES=("$@") -NTRKF=$(($#)) - -#echo $TRK_FILES -#echo $NTRKF - -if [ $VERBOSE ]; then - echo "Merging $NTRKF files into $OUT_FILE..." -fi - -head -c1000 ${TRK_FILES[0]} > $OUT_FILE - -NTRACK=0 -for((i=0; i<$NTRKF; i++)); do - if [ $VERBOSE ]; then - printf "%8d/%8d\r" $i $NTRKF - fi - NTRACK=$(($NTRACK + $(od -A none -t dI -j 988 -N4 ${TRK_FILES[$i]}))); - tail -c+1001 ${TRK_FILES[$i]} >> $OUT_FILE -done - -NTRACK=$(printf "%08X" $NTRACK) - -printf "\x${NTRACK:6:2}\x${NTRACK:4:2}\x${NTRACK:2:2}\x${NTRACK:0:2}" | dd of=$OUT_FILE bs=1 seek=988 count=4 conv=notrunc &> /dev/null diff --git a/pyproject.toml b/pyproject.toml index a1247c5..67877c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,9 @@ requires-python = ">=3.7" dependencies = [ "numpy", "nibabel", + "tqdm", + "dipy", + "trx-python", "cuda-python", "cuda-core", "cuda-cccl" diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index 7585e37..89bbd04 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -27,10 +27,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
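For reference, the deleted merge_trk.sh stitched per-chunk .trk files into a single tractogram: it copied the 1000-byte header of the first shard, appended the body of every shard, summed the little-endian int32 streamline count stored at byte offset 988 of each header, and patched the total back into the output. A rough Python equivalent of that logic, shown only for orientation (the merge_trk helper and its arguments are illustrative, not part of this patch set):

    import struct

    def merge_trk(out_path, in_paths):
        # TRK headers are 1000 bytes; n_count is a little-endian
        # int32 at byte offset 988 (see the write_trk layout above).
        total = 0
        with open(out_path, "wb") as out:
            with open(in_paths[0], "rb") as f:
                out.write(f.read(1000))          # reuse the first shard's header
            for p in in_paths:
                with open(p, "rb") as f:
                    hdr = f.read(1000)
                    total += struct.unpack_from("<i", hdr, 988)[0]
                    out.write(f.read())          # append the shard's body
            out.seek(988)
            out.write(struct.pack("<i", total))  # patch the merged streamline count

This is the same bookkeeping the od/dd pipeline in the shell script performed byte by byte; it became unnecessary once TRX output removed the per-chunk .trk shards.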
-import argparse # TODO: do this again, but for cuda python version +import argparse # TODO: get this running, commit it, then run clode cleaner import random import time -import zipfile import numpy as np @@ -41,22 +40,28 @@ from dipy.tracking import utils from dipy.core.gradients import gradient_table, unique_bvals_magnitude from dipy.data import default_sphere -from dipy.direction import (BootDirectionGetter, ProbabilisticDirectionGetter, PTTDirectionGetter) +from dipy.direction import ( + BootDirectionGetter as cpu_BootDirectionGetter, + ProbabilisticDirectionGetter as cpu_ProbDirectionGetter, + PTTDirectionGetter as cpu_PTTDirectionGetter) from dipy.reconst.shm import OpdtModel, CsaOdfModel from dipy.reconst.csdeconv import ConstrainedSphericalDeconvModel, auto_response_ssst from dipy.tracking.local_tracking import LocalTracking from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion -from dipy.reconst import shm from dipy.data import get_fnames from dipy.data import read_stanford_pve_maps import nibabel as nib from nibabel.orientations import aff2axcodes -from trx.trx_file_memmap import TrxFile, zip_from_folder +from trx.io import save as save_trx -# Import custom module -import cuslines +from cuslines import ( + BootDirectionGetter, + GPUTracker, + ProbDirectionGetter, + PttDirectionGetter, +) t0 = time.time() @@ -86,7 +91,7 @@ def get_img(ep2_seq): parser.add_argument("--chunk-size", type=int, default=100000, help="how many seeds to process per sweep, per GPU") parser.add_argument("--nseeds", type=int, default=100000, help="how many seeds to process in total") parser.add_argument("--ngpus", type=int, default=1, help="number of GPUs to use if using gpu") -parser.add_argument("--write-method", type=str, default="fast", help="Can be trx, fast, or standard") +parser.add_argument("--write-method", type=str, default="trk", help="Can be trx or trk") parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)") parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0") parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5") @@ -100,9 +105,9 @@ def get_img(ep2_seq): args = parser.parse_args() -if args.device == "cpu" and args.write_method != "standard": - print("WARNING: only standard write method is implemented for cpu testing.") - write_method = "standard" +if args.device == "cpu" and args.write_method != "trk": + print("WARNING: only trk write method is implemented for cpu testing.") + write_method = "trk" else: write_method = args.write_method @@ -137,15 +142,11 @@ def get_img(ep2_seq): tenfit = tenmodel.fit(data, mask) print('Computing anisotropy measures (FA,MD,RGB)') FA = tenfit.fa -FA[np.isnan(FA)] = 0 # Setup tissue_classifier args tissue_classifier = ThresholdStoppingCriterion(FA, args.fa_threshold) -metric_map = np.asarray(FA, 'float64') # Create seeds for ROI -# seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4)) -# seed_mask = seed_mask[0:args.nseeds] seed_mask = np.asarray(utils.random_seeds_from_mask( roi_data, seeds_count=args.nseeds, seed_count_per_voxel=False, @@ -154,20 +155,27 @@ def get_img(ep2_seq): # Setup model sphere = default_sphere if args.model == "opdt": - model_type = cuslines.ModelType.OPDT - print("Running OPDT model...") - model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal) - fit_matrix = model._fit_matrix - delta_b, delta_q = fit_matrix + if args.device == "cpu": + model = 
OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal) + dg = cpu_BootDirectionGetter + else: + dg = BootDirectionGetter.from_dipy_opdt( + gtab, + sphere, + sh_order_max=args.sh_order, + sh_lambda=args.sm_lambda, + min_signal=args.min_signal) elif args.model == "csa": - model_type = cuslines.ModelType.CSA - print("Running CSA model...") - model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal) - fit_matrix = model._fit_matrix - # Unlike OPDT, CSA has a single matrix used for fit_matrix. Populating delta_b and delta_q with necessary values for - # now. - delta_b = fit_matrix - delta_q = fit_matrix + if args.device == "cpu": + model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal) + dg = cpu_BootDirectionGetter + else: + dg = BootDirectionGetter.from_dipy_csa( + gtab, + sphere, + sh_order_max=args.sh_order, + sh_lambda=args.sm_lambda, + min_signal=args.min_signal) else: print("Running CSD model...") unique_bvals = unique_bvals_magnitude(gtab.bvals) @@ -185,158 +193,64 @@ def get_img(ep2_seq): roi_radii=10, fa_thr=0.7) model = ConstrainedSphericalDeconvModel(gtab, response, sh_order=args.sh_order) - # TODO: we shouldnt have to do this, also for CSA, but we populate delta_b, delta_q. - # we need to name change delta_b/delta_q and make it possible for them to be None, or something like this - delta_b = model._X - delta_q = model.B_reg - -if args.dg != "boot": - if args.dg == "prob": - model_type = cuslines.ModelType.PROB - dg = ProbabilisticDirectionGetter - else: - model_type = cuslines.ModelType.PTT - dg = PTTDirectionGetter - fit = model.fit(data, mask=(metric_map >= args.fa_threshold)) + fit = model.fit(data, mask=(FA >= args.fa_threshold)) data = fit.odf(sphere).clip(min=0) -else: - dg = BootDirectionGetter - -global_chunk_size = args.chunk_size + if args.model == "ptt": + if args.device == "cpu": + dg = cpu_PTTDirectionGetter() + else: + # Set FOD to 0 outside mask for probing + data[FA < args.fa_threshold, :] = 0 + dg = PttDirectionGetter() + elif args.model == "prob": + if args.device == "cpu": + dg = cpu_ProbDirectionGetter() + else: + dg = ProbDirectionGetter() + else: + raise ValueError("Unknown model type: {}".format(args.model)) # Setup direction getter args if args.device == "cpu": if args.dg != "boot": dg = dg.from_pmf(data, max_angle=args.max_angle, sphere=sphere, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle) else: - dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle) -else: - # Setup direction getter args - b0s_mask = gtab.b0s_mask - dwi_mask = ~b0s_mask - - # setup sampling matrix - theta = sphere.theta - phi = sphere.phi - sampling_matrix, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi) - - ## from BootPmfGen __init__ - # setup H and R matrices - # TODO: figure out how to get H, R matrices from direction getter object - x, y, z = model.gtab.gradients[dwi_mask].T - r, theta, phi = shm.cart2sphere(x, y, z) - B, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi) - H = shm.hat(B) - R = shm.lcr_matrix(H) - - # create floating point copy of data - dataf = np.asarray(data, dtype=np.float64) - - gpu_tracker = cuslines.GPUTracker(model_type, - args.max_angle * np.pi/180, - args.min_signal, - args.fa_threshold, - args.step_size, - 
args.relative_peak_threshold, - args.min_separation_angle * np.pi/180, - dataf.astype(np.float64), H.astype(np.float64), R.astype(np.float64), delta_b.astype(np.float64), delta_q.astype(np.float64), - b0s_mask.astype(np.int32), metric_map.astype(np.float64), sampling_matrix.astype(np.float64), - sphere.vertices.astype(np.float64), sphere.edges.astype(np.int32), - ngpus=args.ngpus, rng_seed=0) - -print('streamline gen') -nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size - -t1 = time.time() -streamline_time = 0 -io_time = 0 - -if args.output_prefix and write_method == "trx": - # Will resize by a factor of 2 if these are exceeded - sl_len_guess = 100 - sl_per_seed_guess = 3 - n_sls_guess = sl_per_seed_guess*len(seed_mask) - - # trx files use memory mapping - trx_file = TrxFile( - reference=hardi_nifti_fname, - nb_streamlines=n_sls_guess, - nb_vertices=n_sls_guess*sl_len_guess) - offsets_idx = 0 - sls_data_idx = 0 - -for idx in range(int(nchunks)): - # Main streamline computation - ts = time.time() - if args.device == "cpu": - streamline_generator = LocalTracking(dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=args.step_size) - streamlines = [s for s in streamline_generator] - else: - streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size]) - te = time.time() - streamline_time += (te-ts) - print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines), - seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0], - te-ts)) + dg = dg.from_data(data, model, max_angle=args.max_angle, sphere=sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle) - # Save tracklines file - if args.output_prefix: ts = time.time() - if write_method == "standard": - fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks) - sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX) - save_tractogram(sft, fname) - te = time.time() - print("Saved streamlines to {}, time {} s".format(fname, te-ts)) - elif write_method == "trx": - tractogram = nib.streamlines.Tractogram(streamlines, affine_to_rasmm=img.affine) - tractogram.to_world() - sls = tractogram.streamlines - - new_offsets_idx = offsets_idx + len(sls._offsets) - new_sls_data_idx = sls_data_idx + len(sls._data) - - if new_offsets_idx > trx_file.header["NB_STREAMLINES"]\ - or new_sls_data_idx > trx_file.header["NB_VERTICES"]: - print("TRX resizing...") - trx_file.resize(nb_streamlines=new_offsets_idx*2, nb_vertices=new_sls_data_idx*2) - - # TRX uses memmaps here - trx_file.streamlines._data[sls_data_idx:new_sls_data_idx] = sls._data - trx_file.streamlines._offsets[offsets_idx:new_offsets_idx] = offsets_idx + sls._offsets - trx_file.streamlines._lengths[offsets_idx:new_offsets_idx] = sls._lengths - - offsets_idx = new_offsets_idx - sls_data_idx = new_sls_data_idx - - te = time.time() - print("Streamlines to TRX format, time {} s".format(te-ts)) - else: - fname = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks) - gpu_tracker.dump_streamlines(fname, voxel_order, wm.shape, wm.header.get_zooms(), img.affine) - te = time.time() - print("Saved streamlines to {}, time {} s".format(fname, te-ts)) - - io_time += (te-ts) - -if args.output_prefix and write_method == "trx": - ts = time.time() - fname = "{}.trx".format(args.output_prefix) - trx_file.resize() - zip_from_folder( - 
trx_file._uncompressed_folder_handle.name, - fname, - zipfile.ZIP_STORED) - trx_file.close() - te = time.time() - print("Saved streamlines to {}, time {} s".format(fname, te-ts)) - io_time += (te-ts) - -t2 = time.time() + streamline_generator = LocalTracking(dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size) + sft = StatefulTractogram(streamline_generator, img, Space.VOX) + te = time.time() +else: + with GPUTracker( + dg, + data, + FA, + args.fa_threshold, + sphere.vertices, + sphere.edges, + max_angle=args.max_angle * np.pi/180, + step_size=args.step_size, + relative_peak_thresh=args.relative_peak_threshold, + min_separation_angle=args.min_separation_angle * np.pi/180, + ngpus=args.ngpus, + rng_seed=0, + chunk_size=args.chunk_size + ) as gpu_tracker: + ts = time.time() + if args.output_prefix and write_method == "trx": + trx_file = gpu_tracker.generate_trx(seed_mask, img) + else: + sft = gpu_tracker.generate_sft(seed_mask, img) + te = time.time() +print("Generated {} streamlines from {} seeds, time: {} s".format(len(sft.streamlines), + seed_mask.shape[0], + te-ts)) -print("Completed processing {} seeds.".format(seed_mask.shape[0])) -print("Initialization time: {} sec".format(t1-t0)) -print("Streamline generation total time: {} sec".format(t2-t1)) -print("\tStreamline processing: {} sec".format(streamline_time)) if args.output_prefix: - print("\tFile writing: {} sec".format(io_time)) + if write_method == "trx": + fname = "{}.trx".format(args.output_prefix) + save_trx(trx_file, fname) + else: + fname = "{}.trk".format(args.output_prefix) + save_tractogram(sft, fname) diff --git a/setup.py b/setup.py index cd53ade..a392cc6 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ from setuptools import setup from setuptools.command.build_py import build_py from pathlib import Path -import subprocess import re From 10f2fa3426a464c5be8e6d8598ee72f75f7598fa Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 16:54:47 -0800 Subject: [PATCH 23/31] remove todo comment --- run_gpu_streamlines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index 89bbd04..06c61c6 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -27,7 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
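Condensed, the refactored driver above reduces to the following flow (a sketch assuming the cuslines API introduced in this patch; the FA threshold, SH order, smoothing, and peak-separation values are illustrative, not prescribed):

    import numpy as np
    from dipy.io.streamline import save_tractogram
    from cuslines import BootDirectionGetter, GPUTracker

    # Build a GPU direction getter directly from the gradient table and sphere.
    dg = BootDirectionGetter.from_dipy_opdt(
        gtab, sphere, sh_order_max=8, sh_lambda=0.006, min_signal=1.0)

    # The tracker owns the device allocations; the context manager releases them.
    with GPUTracker(
            dg, data, FA, 0.1,                      # data, stopping map, FA threshold
            sphere.vertices, sphere.edges,
            max_angle=60 * np.pi / 180,
            step_size=0.5,
            relative_peak_thresh=0.25,
            min_separation_angle=45 * np.pi / 180,
            ngpus=1, rng_seed=0,
            chunk_size=100000) as tracker:
        sft = tracker.generate_sft(seed_mask, img)  # seeds are chunked internally
    save_tractogram(sft, "output.trk")

Compared with the old flow, seed chunking, per-chunk timing, and TRX memory mapping all move behind generate_sft/generate_trx, so the driver no longer loops over seed chunks or resizes TRX buffers itself.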
-import argparse # TODO: get this running, commit it, then run clode cleaner +import argparse import random import time From a751e8dfc563b3bba3291765cab8fa09a3f81356 Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 17:45:45 -0800 Subject: [PATCH 24/31] trying to fix boot --- cuslines/cuda_python/cu_direction_getters.py | 36 +++++++++----------- cuslines/cuda_python/cutils.py | 4 +-- cuslines/generate_streamlines_cuda.cu | 4 +-- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index 9901fc3..5e03600 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -1,7 +1,7 @@ import numpy as np from abc import ABC, abstractmethod -import logging import ctypes +import logging from importlib.resources import files from time import time @@ -18,12 +18,11 @@ REAL_DTYPE, REAL_DTYPE_AS_STR, REAL3_DTYPE_AS_STR, - REAL_DTYPE_AS_CTYPE, checkCudaErrors, ModelType, THR_X_SL, BLOCK_Y, - DEV_PTR, + REAL_DTYPE_AS_CTYPE, ) logger = logging.getLogger("GPUStreamlines") @@ -95,13 +94,12 @@ class _BootCtx(ctypes.Structure): _fields_ = [ ("min_signal", REAL_DTYPE_AS_CTYPE), ("delta_nr", ctypes.c_int32), - ("H", ctypes.c_void_p), - ("R", ctypes.c_void_p), - ("delta_b", ctypes.c_void_p), - ("delta_q", ctypes.c_void_p), - ("sampling_matrix", ctypes.c_void_p), - ("b0s_mask", ctypes.c_void_p), - ] + ("H", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + ("R", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + ("delta_b", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + ("delta_q", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + ("sampling_matrix", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + ("b0s_mask", ctypes.POINTER(ctypes.c_int32))] class BootDirectionGetter(GPUDirectionGetter): @@ -239,12 +237,13 @@ def allocate_on_gpu(self, n): ctypes.sizeof(_BootCtx)))) self.ctx_h.append(_BootCtx( min_signal=self.min_signal, - H=self.H_d[n], - R=self.R_d[n], - delta_b=self.delta_b_d[n], - delta_q=self.delta_q_d[n], - sampling_matrix=self.sampling_matrix_d[n], - b0s_mask=self.b0s_mask_d[n], + delta_nr=self.delta_nr, + H=ctypes.cast(self.H_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + R=ctypes.cast(self.R_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + delta_b=ctypes.cast(self.delta_b_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + delta_q=ctypes.cast(self.delta_q_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + sampling_matrix=ctypes.cast(self.sampling_matrix_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), + b0s_mask=ctypes.cast(self.b0s_mask_d[n], ctypes.POINTER(ctypes.c_int32)) )) checkCudaErrors(runtime.cudaMemcpy( @@ -279,10 +278,9 @@ def allocate_on_gpu(self, n): cudaMemcpyKind.cudaMemcpyHostToDevice)) checkCudaErrors(runtime.cudaMemcpy( self.ctx_d[n], - ctypes.byref(self.ctx_h[n]), + ctypes.addressof(self.ctx_h[n]), ctypes.sizeof(_BootCtx), - cudaMemcpyKind.cudaMemcpyHostToDevice - )) + cudaMemcpyKind.cudaMemcpyHostToDevice)) def deallocate_on_gpu(self, n): if self.H_d[n]: diff --git a/cuslines/cuda_python/cutils.py b/cuslines/cuda_python/cutils.py index 9cf164e..2fd688e 100644 --- a/cuslines/cuda_python/cutils.py +++ b/cuslines/cuda_python/cutils.py @@ -19,7 +19,7 @@ class ModelType(IntEnum): REAL_DTYPE = np.float32 REAL3_DTYPE = np.dtype([('x', np.float32), ('y', np.float32), - ('z', np.float32)]) + ('z', np.float32)], align=True) REAL_DTYPE_AS_STR = "float" REAL3_DTYPE_AS_STR = "float3" REAL_DTYPE_AS_CTYPE = ctypes.c_float @@ -27,7 +27,7 @@ class ModelType(IntEnum): REAL_DTYPE = np.float64 REAL3_DTYPE = 
np.dtype([('x', np.float64), ('y', np.float64), - ('z', np.float64)]) + ('z', np.float64)], align=True) REAL_DTYPE_AS_STR = "double" REAL3_DTYPE_AS_STR = "double3" REAL_DTYPE_AS_CTYPE = ctypes.c_double diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu index 9b04e60..b9a84c2 100644 --- a/cuslines/generate_streamlines_cuda.cu +++ b/cuslines/generate_streamlines_cuda.cu @@ -1208,7 +1208,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, const REAL_T *__restrict__ dataf, const REAL_T *__restrict__ metric_map, const typename ModelCtx::type* __restrict__ ctx, - const int samplm_nr, + const int samplm_nr, const REAL3_T *__restrict__ sphere_vertices, const int2 *__restrict__ sphere_edges, const int num_edges, @@ -1589,7 +1589,7 @@ __global__ void genStreamlinesMerge_k( const REAL_T *__restrict__ dataf, const REAL_T *__restrict__ metric_map, const typename ModelCtx::type* __restrict__ ctx, - const int samplm_nr, + const int samplm_nr, const REAL3_T *__restrict__ sphere_vertices, const int2 *__restrict__ sphere_edges, const int num_edges, From 785e4219d534bc4036452230a958267514ad9d85 Mon Sep 17 00:00:00 2001 From: 36000 Date: Tue, 6 Jan 2026 17:46:45 -0800 Subject: [PATCH 25/31] move the cuda c stuff into their own folder --- cuslines/{ => cuda_c}/cudamacro.h | 0 cuslines/{ => cuda_c}/cuwsort.cuh | 0 cuslines/{ => cuda_c}/disc.h | 0 cuslines/{ => cuda_c}/generate_streamlines_cuda.cu | 0 cuslines/{ => cuda_c}/globals.h | 0 cuslines/{ => cuda_c}/ptt.cu | 0 cuslines/{ => cuda_c}/ptt.cuh | 0 cuslines/{ => cuda_c}/utils.cu | 0 cuslines/cuda_python/cu_direction_getters.py | 2 +- 9 files changed, 1 insertion(+), 1 deletion(-) rename cuslines/{ => cuda_c}/cudamacro.h (100%) rename cuslines/{ => cuda_c}/cuwsort.cuh (100%) rename cuslines/{ => cuda_c}/disc.h (100%) rename cuslines/{ => cuda_c}/generate_streamlines_cuda.cu (100%) rename cuslines/{ => cuda_c}/globals.h (100%) rename cuslines/{ => cuda_c}/ptt.cu (100%) rename cuslines/{ => cuda_c}/ptt.cuh (100%) rename cuslines/{ => cuda_c}/utils.cu (100%) diff --git a/cuslines/cudamacro.h b/cuslines/cuda_c/cudamacro.h similarity index 100% rename from cuslines/cudamacro.h rename to cuslines/cuda_c/cudamacro.h diff --git a/cuslines/cuwsort.cuh b/cuslines/cuda_c/cuwsort.cuh similarity index 100% rename from cuslines/cuwsort.cuh rename to cuslines/cuda_c/cuwsort.cuh diff --git a/cuslines/disc.h b/cuslines/cuda_c/disc.h similarity index 100% rename from cuslines/disc.h rename to cuslines/cuda_c/disc.h diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/cuda_c/generate_streamlines_cuda.cu similarity index 100% rename from cuslines/generate_streamlines_cuda.cu rename to cuslines/cuda_c/generate_streamlines_cuda.cu diff --git a/cuslines/globals.h b/cuslines/cuda_c/globals.h similarity index 100% rename from cuslines/globals.h rename to cuslines/cuda_c/globals.h diff --git a/cuslines/ptt.cu b/cuslines/cuda_c/ptt.cu similarity index 100% rename from cuslines/ptt.cu rename to cuslines/cuda_c/ptt.cu diff --git a/cuslines/ptt.cuh b/cuslines/cuda_c/ptt.cuh similarity index 100% rename from cuslines/ptt.cuh rename to cuslines/cuda_c/ptt.cuh diff --git a/cuslines/utils.cu b/cuslines/cuda_c/utils.cu similarity index 100% rename from cuslines/utils.cu rename to cuslines/cuda_c/utils.cu diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index 5e03600..d1b9e28 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ 
b/cuslines/cuda_python/cu_direction_getters.py @@ -78,7 +78,7 @@ def compile_program(self, debug: bool = False): # I think this is reasonable dev = Device() dev.set_current() - cuda_path = cuslines_cuda.joinpath("generate_streamlines_cuda.cu") + cuda_path = cuslines_cuda.joinpath("cuda_c/generate_streamlines_cuda.cu") with open(cuda_path, "r") as f: prog = Program(f.read(), code_type="c++", options=program_options) self.module = prog.compile( From 136e9aaa0c5a918d7aa30ec7eec89526e2bfb575 Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 7 Jan 2026 12:53:56 -0800 Subject: [PATCH 26/31] Fix boot, refactor boot, other bfs --- README.md | 2 +- cuslines/cuda_c/boot.cu | 1066 +++++++++++++++++ cuslines/cuda_c/generate_streamlines_cuda.cu | 1074 +----------------- cuslines/cuda_c/globals.h | 29 +- cuslines/cuda_c/tracking_helpers.cu | 290 +++++ cuslines/cuda_c/utils.cu | 138 +-- cuslines/cuda_python/cu_direction_getters.py | 66 +- cuslines/cuda_python/cu_tractography.py | 3 +- cuslines/cuda_python/cutils.py | 3 - run_gpu_streamlines.py | 12 +- 10 files changed, 1449 insertions(+), 1234 deletions(-) create mode 100644 cuslines/cuda_c/boot.cu create mode 100644 cuslines/cuda_c/tracking_helpers.cu diff --git a/README.md b/README.md index a1e98fa..9e0275d 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ Destroy GPUTracker... Note that if you experience memory errors, you can adjust the `--chunk-size` flag. -To run on more seeds, we suggest enabling the `--trx` flag in the GPU script to not get bottlenecked by writing files. +To run on more seeds, we suggest setting the `--write-method trx` flag in the GPU script to not get bottlenecked by writing files. ## Running on AWS with Docker First, set up an AWS instance with GPU and ssh into it (we recommend a P3 instance with at least 1 V100 16 GB GPU and a Deep Learning AMI Ubuntu 18.04 v 33.0.). 
Then do the following: diff --git a/cuslines/cuda_c/boot.cu b/cuslines/cuda_c/boot.cu new file mode 100644 index 0000000..133c43d --- /dev/null +++ b/cuslines/cuda_c/boot.cu @@ -0,0 +1,1066 @@ +//#define USE_FIXED_PERMUTATION +#ifdef USE_FIXED_PERMUTATION +//__device__ const int fixedPerm[] = {44, 47, 53, 0, 3, 3, 39, 9, 19, 21, 50, 36, 23, +// 6, 24, 24, 12, 1, 38, 39, 23, 46, 24, 17, 37, 25, +// 13, 8, 9, 20, 51, 16, 51, 5, 15, 47, 0, 18, 35, +// 24, 49, 51, 29, 19, 19, 14, 39, 32, 1, 9, 32, 31, +// 10, 52, 23}; +__device__ const int fixedPerm[] = { + 47, 117, 67, 103, 9, 21, 36, 87, 70, 88, 140, 58, 39, 87, 88, 81, 25, 77, + 72, 9, 148, 115, 79, 82, 99, 29, 147, 147, 142, 32, 9, 127, 32, 31, 114, 28, + 34, 128, 128, 53, 133, 38, 17, 79, 132, 105, 42, 31, 120, 1, 65, 57, 35, 102, + 119, 11, 82, 91, 128, 142, 99, 53, 140, 121, 84, 68, 6, 47, 127, 131, 100, 78, + 143, 148, 23, 141, 117, 85, 48, 49, 69, 95, 94, 0, 113, 36, 48, 93, 131, 98, + 42, 112, 149, 127, 0, 138, 114, 43, 127, 23, 130, 121, 98, 62, 123, 82, 148, 50, + 14, 41, 58, 36, 10, 86, 43, 104, 11, 2, 51, 80, 32, 128, 38, 19, 42, 115, + 77, 30, 24, 125, 2, 3, 94, 107, 13, 112, 40, 72, 19, 95, 72, 67, 61, 14, + 96, 4, 139, 86, 121, 109}; +#endif + +template +__device__ VAL_T avgMask(const int mskLen, + const int *__restrict__ mask, + const VAL_T *__restrict__ data) { + + const int tidx = threadIdx.x; + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + int __myCnt = 0; + VAL_T __mySum = 0; + + for(int i = tidx; i < mskLen; i += BDIM_X) { + if(mask[i]) { + __myCnt++; + __mySum += data[i]; + } + } + + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + __mySum += __shfl_xor_sync(WMASK, __mySum, i, BDIM_X); + __myCnt += __shfl_xor_sync(WMASK, __myCnt, i, BDIM_X); + } + + return __mySum/__myCnt; + +} + +template< + int BDIM_X, + typename LEN_T, + typename MSK_T, + typename VAL_T> +__device__ LEN_T maskGet(const LEN_T n, + const MSK_T *__restrict__ mask, + const VAL_T *__restrict__ plain, + VAL_T *__restrict__ masked) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const int __laneMask = (1 << tidx)-1; + + int woff = 0; + for(int j = 0; j < n; j += BDIM_X) { + + const int __act = (j+tidx < n) ? !mask[j+tidx] : 0; + const int __msk = __ballot_sync(WMASK, __act); + + const int toff = __popc(__msk & __laneMask); + if (__act) { + masked[woff+toff] = plain[j+tidx]; + } + woff += __popc(__msk); + } + return woff; +} + +template< + int BDIM_X, + typename LEN_T, + typename MSK_T, + typename VAL_T> +__device__ void maskPut(const LEN_T n, + const MSK_T *__restrict__ mask, + const VAL_T *__restrict__ masked, + VAL_T *__restrict__ plain) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const int __laneMask = (1 << tidx)-1; + + int woff = 0; + for(int j = 0; j < n; j += BDIM_X) { + + const int __act = (j+tidx < n) ? 
!mask[j+tidx] : 0; + const int __msk = __ballot_sync(WMASK, __act); + + const int toff = __popc(__msk & __laneMask); + if (__act) { + plain[j+tidx] = masked[woff+toff]; + } + woff += __popc(__msk); + } + return; +} + +template +__device__ int closest_peak_d(const REAL_T max_angle, + const REAL3_T direction, //dir + const int npeaks, + const REAL3_T *__restrict__ peaks, + REAL3_T *__restrict__ peak) {// dirs, + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + //const REAL_T cos_similarity = COS(MAX_ANGLE_P); + const REAL_T cos_similarity = COS(max_angle); +#if 0 + if (!threadIdx.y && !tidx) { + printf("direction: (%f, %f, %f)\n", + direction.x, direction.y, direction.z); + } + __syncwarp(WMASK); +#endif + REAL_T cpeak_dot = 0; + int cpeak_idx = -1; + for(int j = 0; j < npeaks; j += BDIM_X) { + if (j+tidx < npeaks) { +#if 0 + if (!threadIdx.y && !tidx) { + printf("j+tidx: %d, peaks[j+tidx]: (%f, %f, %f)\n", + j+tidx, peaks[j+tidx].x, peaks[j+tidx].y, peaks[j+tidx].z); + } +#endif + const REAL_T dot = direction.x*peaks[j+tidx].x+ + direction.y*peaks[j+tidx].y+ + direction.z*peaks[j+tidx].z; + + if (FABS(dot) > FABS(cpeak_dot)) { + cpeak_dot = dot; + cpeak_idx = j+tidx; + } + } + } +#if 0 + if (!threadIdx.y && !tidx) { + printf("cpeak_idx: %d, cpeak_dot: %f\n", cpeak_idx, cpeak_dot); + } + __syncwarp(WMASK); +#endif + + #pragma unroll + for(int j = BDIM_X/2; j; j /= 2) { + + const REAL_T dot = __shfl_xor_sync(WMASK, cpeak_dot, j, BDIM_X); + const int idx = __shfl_xor_sync(WMASK, cpeak_idx, j, BDIM_X); + if (FABS(dot) > FABS(cpeak_dot)) { + cpeak_dot = dot; + cpeak_idx = idx; + } + } +#if 0 + if (!threadIdx.y && !tidx) { + printf("cpeak_idx: %d, cpeak_dot: %f, cos_similarity: %f\n", cpeak_idx, cpeak_dot, cos_similarity); + } + __syncwarp(WMASK); +#endif + if (cpeak_idx >= 0) { + if (cpeak_dot >= cos_similarity) { + peak[0] = peaks[cpeak_idx]; + return 1; + } + if (cpeak_dot <= -cos_similarity) { + peak[0] = MAKE_REAL3(-peaks[cpeak_idx].x, + -peaks[cpeak_idx].y, + -peaks[cpeak_idx].z); + return 1; + } + } + return 0; +} + +template +__device__ void ndotp_d(const int N, + const int M, + const VAL_T *__restrict__ srcV, + const VAL_T *__restrict__ srcM, + VAL_T *__restrict__ dstV) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + //#pragma unroll + for(int i = 0; i < N; i++) { + + VAL_T __tmp = 0; + + //#pragma unroll + for(int j = 0; j < M; j += BDIM_X) { + if (j+tidx < M) { + __tmp += srcV[j+tidx]*srcM[i*M + j+tidx]; + } + } + #pragma unroll + for(int j = BDIM_X/2; j; j /= 2) { +#if 0 + __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); +#else + __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); +#endif + } + // values could be held by BDIM_X threads and written + // together every BDIM_X iterations... 
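+        // At this point the shuffle reduction above has folded the
+        // BDIM_X partial sums, so lane 0 of each sub-warp holds the
+        // full dot product of srcV with row i of srcM; only that
+        // lane writes the result.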
+ + if (tidx == 0) { + dstV[i] = __tmp; + } + } + return; +} + + +template +__device__ void ndotp_log_opdt_d(const int N, + const int M, + const VAL_T *__restrict__ srcV, + const VAL_T *__restrict__ srcM, + VAL_T *__restrict__ dstV) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const VAL_T ONEP5 = static_cast(1.5); + + //#pragma unroll + for(int i = 0; i < N; i++) { + + VAL_T __tmp = 0; + + //#pragma unroll + for(int j = 0; j < M; j += BDIM_X) { + if (j+tidx < M) { + const VAL_T v = srcV[j+tidx]; + __tmp += -LOG(v)*(ONEP5+LOG(v))*v * srcM[i*M + j+tidx]; + } + } + #pragma unroll + for(int j = BDIM_X/2; j; j /= 2) { +#if 0 + __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); +#else + __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); +#endif + } + // values could be held by BDIM_X threads and written + // together every BDIM_X iterations... + + if (tidx == 0) { + dstV[i] = __tmp; + } + } + return; +} + +template +__device__ void ndotp_log_csa_d(const int N, + const int M, + const VAL_T *__restrict__ srcV, + const VAL_T *__restrict__ srcM, + VAL_T *__restrict__ dstV) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + // Clamp values + constexpr VAL_T min = .001; + constexpr VAL_T max = .999; + + //#pragma unroll + for(int i = 0; i < N; i++) { + + VAL_T __tmp = 0; + + //#pragma unroll + for(int j = 0; j < M; j += BDIM_X) { + if (j+tidx < M) { + const VAL_T v = MIN(MAX(srcV[j+tidx], min), max); + __tmp += LOG(-LOG(v)) * srcM[i*M + j+tidx]; + } + } + #pragma unroll + for(int j = BDIM_X/2; j; j /= 2) { +#if 0 + __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); +#else + __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); +#endif + } + // values could be held by BDIM_X threads and written + // together every BDIM_X iterations... 
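+        // Same reduction pattern as ndotp_d, except each sample v
+        // was first mapped to -log(v)*(1.5+log(v))*v, the OPDT
+        // transform applied to the normalized signal before it is
+        // projected onto the rows of the fit matrix.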
+ + if (tidx == 0) { + dstV[i] = __tmp; + } + } + return; +} + + +template +__device__ void fit_opdt(const int delta_nr, + const int hr_side, + const REAL_T *__restrict__ delta_q, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ __msk_data_sh, + REAL_T *__restrict__ __h_sh, + REAL_T *__restrict__ __r_sh) { + const int tidx = threadIdx.x; + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + ndotp_log_opdt_d(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh); + ndotp_d (delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh); + __syncwarp(WMASK); + #pragma unroll + for(int j = tidx; j < delta_nr; j += BDIM_X) { + __r_sh[j] -= __h_sh[j]; + } + __syncwarp(WMASK); +} + +template +__device__ void fit_csa(const int delta_nr, + const int hr_side, + const REAL_T *__restrict__ fit_matrix, + const REAL_T *__restrict__ __msk_data_sh, + REAL_T *__restrict__ __r_sh) { + const int tidx = threadIdx.x; + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + constexpr REAL _n0_const = 0.28209479177387814; // .5 / sqrt(pi) + ndotp_log_csa_d(delta_nr, hr_side, __msk_data_sh, fit_matrix, __r_sh); + __syncwarp(WMASK); + if (tidx == 0) { + __r_sh[0] = _n0_const; + } + __syncwarp(WMASK); +} + +template +__device__ void fit_model_coef(const int delta_nr, // delta_nr is number of ODF directions + const int hr_side, // hr_side is number of data directions + const REAL_T *__restrict__ delta_q, + const REAL_T *__restrict__ delta_b, // these are fit matrices the model can use, different for each model + const REAL_T *__restrict__ __msk_data_sh, // __msk_data_sh is the part of the data currently being operated on by this block + REAL_T *__restrict__ __h_sh, // these last two are modifications to the coefficients that will be returned + REAL_T *__restrict__ __r_sh) { + switch(MODEL_T) { + case OPDT: + fit_opdt(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh); + break; + case CSA: + fit_csa(delta_nr, hr_side, delta_q, __msk_data_sh, __r_sh); + break; + default: + printf("FATAL: Invalid Model Type.\n"); + break; + } +} + +template +__device__ int get_direction_boot_d( + curandStatePhilox4_32_10_t *st, + const REAL_T max_angle, + const REAL_T min_signal, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + REAL3_T dir, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask) + // but not clear if it will never be needed so + // we'll keep it here for now... 
+ const REAL3_T point, + const REAL_T *__restrict__ H, + const REAL_T *__restrict__ R, + // model unused + // max_angle, pmf_threshold from global defines + // b0s_mask already passed + // min_signal from global defines + const int delta_nr, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ delta_q, // fit_matrix + const int samplm_nr, + const REAL_T *__restrict__ sampling_matrix, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + REAL3_T *__restrict__ dirs) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const int n32dimt = ((dimt+31)/32)*32; + + extern REAL_T __shared__ __sh[]; + + REAL_T *__vox_data_sh = reinterpret_cast(__sh); + REAL_T *__msk_data_sh = __vox_data_sh + BDIM_Y*n32dimt; + + REAL_T *__r_sh = __msk_data_sh + BDIM_Y*n32dimt; + REAL_T *__h_sh = __r_sh + BDIM_Y*MAX(n32dimt, samplm_nr); + + __vox_data_sh += tidy*n32dimt; + __msk_data_sh += tidy*n32dimt; + + __r_sh += tidy*MAX(n32dimt, samplm_nr); + __h_sh += tidy*MAX(n32dimt, samplm_nr); + + // compute hr_side (may be passed from python) + int hr_side = 0; + for(int j = tidx; j < dimt; j += BDIM_X) { + hr_side += !b0s_mask[j] ? 1 : 0; + } + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + hr_side += __shfl_xor_sync(WMASK, hr_side, i, BDIM_X); + } + + #pragma unroll + for(int i = 0; i < NATTEMPTS; i++) { + + const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, -1, dataf, point, __vox_data_sh); + + const int nmsk = maskGet(dimt, b0s_mask, __vox_data_sh, __msk_data_sh); + + //if (!tidx && !threadIdx.y && !blockIdx.x) { + // + // printf("interp of %f, %f, %f\n", point.x, point.y, point.z); + // printf("hr_side: %d\n", hr_side); + // printArray("vox_data", 6, dimt, __vox_data_sh[tidy]); + // printArray("msk_data", 6, nmsk, __msk_data_sh[tidy]); + //} + //break; + + __syncwarp(WMASK); + + if (rv == 0) { + + ndotp_d(hr_side, hr_side, __msk_data_sh, R, __r_sh); + //__syncwarp(); + //printArray("__r", 5, hr_side*hr_side, R); + //printArray("__r_sh", 6, hr_side, __r_sh[tidy]); + + ndotp_d(hr_side, hr_side, __msk_data_sh, H, __h_sh); + //__syncwarp(); + //printArray("__h_sh", 6, hr_side, __h_sh[tidy]); + + __syncwarp(WMASK); + + for(int j = 0; j < hr_side; j += BDIM_X) { + if (j+tidx < hr_side) { +#ifdef USE_FIXED_PERMUTATION + const int srcPermInd = fixedPerm[j+tidx]; +#else + const int srcPermInd = curand(st) % hr_side; +// if (srcPermInd < 0 || srcPermInd >= hr_side) { +// printf("srcPermInd: %d\n", srcPermInd); +// } +#endif + __h_sh[j+tidx] += __r_sh[srcPermInd]; + //__h_sh[j+tidx] += __r_sh[j+tidx]; + } + } + __syncwarp(WMASK); + + //printArray("h+perm(r):", 6, hr_side, __h_sh[tidy]); + //__syncwarp(); + + // vox_data[dwi_mask] = masked_data + maskPut(dimt, b0s_mask, __h_sh, __vox_data_sh); + __syncwarp(WMASK); + + //printArray("vox_data[dwi_mask]:", 6, dimt, __vox_data_sh[tidy]); + //__syncwarp(); + + for(int j = tidx; j < dimt; j += BDIM_X) { + //__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]); + __vox_data_sh[j] = MAX(min_signal, __vox_data_sh[j]); + } + __syncwarp(WMASK); + + const REAL_T denom = avgMask(dimt, b0s_mask, __vox_data_sh); + + for(int j = tidx; j < dimt; j += BDIM_X) { + __vox_data_sh[j] /= denom; + } + __syncwarp(); + + //if (!tidx && !threadIdx.y && !blockIdx.x) { + // printf("denom: %f\n", denom); + //} + ////break; + //if (!tidx && !threadIdx.y && !blockIdx.x) 
{ + // + // printf("__vox_data_sh:\n"); + // printArray("vox_data", 6, dimt, __vox_data_sh[tidy]); + //} + //break; + + maskGet(dimt, b0s_mask, __vox_data_sh, __msk_data_sh); + __syncwarp(WMASK); + + fit_model_coef(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh); + + // __r_sh[tidy] <- python 'coef' + + ndotp_d(samplm_nr, delta_nr, __r_sh, sampling_matrix, __h_sh); + + // __h_sh[tidy] <- python 'pmf' + } else { + #pragma unroll + for(int j = tidx; j < samplm_nr; j += BDIM_X) { + __h_sh[j] = 0; + } + // __h_sh[tidy] <- python 'pmf' + } + __syncwarp(WMASK); +#if 0 + if (!threadIdx.y && threadIdx.x == 0) { + for(int j = 0; j < samplm_nr; j++) { + printf("pmf[%d]: %f\n", j, __h_sh[tidy][j]); + } + } + //return; +#endif + const REAL_T abs_pmf_thr = PMF_THRESHOLD_P*max_d(samplm_nr, __h_sh, REAL_MIN); + __syncwarp(WMASK); + + #pragma unroll + for(int j = tidx; j < samplm_nr; j += BDIM_X) { + const REAL_T __v = __h_sh[j]; + if (__v < abs_pmf_thr) { + __h_sh[j] = 0; + } + } + __syncwarp(WMASK); +#if 0 + if (!threadIdx.y && threadIdx.x == 0) { + printf("abs_pmf_thr: %f\n", abs_pmf_thr); + for(int j = 0; j < samplm_nr; j++) { + printf("pmfNORM[%d]: %f\n", j, __h_sh[tidy][j]); + } + } + //return; +#endif +#if 0 + if init: + directions = peak_directions(pmf, sphere)[0] + return directions + else: + peaks = peak_directions(pmf, sphere)[0] + if (len(peaks) > 0): + return closest_peak(directions, peaks, cos_similarity) +#endif + const int ndir = peak_directions_d(__h_sh, dirs, + sphere_vertices, + sphere_edges, + num_edges, + samplm_nr, + reinterpret_cast(__r_sh), // reuse __r_sh as shInd in func which is large enough + relative_peak_thres, + min_separation_angle); + if (NATTEMPTS == 1) { // init=True... + return ndir; // and dirs; + } else { // init=False... 
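+            // Tracking mode: of the ndir candidate peaks, keep the
+            // one closest to the incoming direction, accepted only
+            // when |dot| >= cos(max_angle) (see closest_peak_d
+            // above); if no peak qualifies, fall through and draw a
+            // fresh bootstrap sample, up to NATTEMPTS times in total.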
+ if (ndir > 0) { + /* + if (!threadIdx.y && threadIdx.x == 0 && ndir > 1) { + printf("NATTEMPTS=5 and ndir: %d!!!\n", ndir); + } + */ + REAL3_T peak; + const int foundPeak = closest_peak_d(max_angle, dir, ndir, dirs, &peak); + __syncwarp(WMASK); + if (foundPeak) { + if (tidx == 0) { + dirs[0] = peak; + } + return 1; + } + } + } + } + return 0; +} + +template +__global__ void getNumStreamlinesBoot_k( + const ModelType model_type, + const REAL_T max_angle, + const REAL_T min_signal, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + const long long rndSeed, + const int nseed, + const REAL3_T *__restrict__ seeds, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const REAL_T *__restrict__ H, + const REAL_T *__restrict__ R, + const int delta_nr, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ delta_q, + const int *__restrict__ b0s_mask, // change to int + const int samplm_nr, + const REAL_T *__restrict__ sampling_matrix, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + REAL3_T *__restrict__ shDir0, + int *slineOutOff) { + + const int tidx = threadIdx.x; + const int slid = blockIdx.x*blockDim.y + threadIdx.y; + const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; + + if (slid >= nseed) { + return; + } + + REAL3_T seed = seeds[slid]; + // seed = lin_mat*seed + offset + + REAL3_T *__restrict__ __shDir = shDir0+slid*samplm_nr; + + // const int hr_side = dimt-1; + + curandStatePhilox4_32_10_t st; + //curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X) + curand_init(rndSeed, gid, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X) + // elements of the same sequence + // python: + //directions = get_direction(None, dataf, dwi_mask, sphere, s, H, R, model, max_angle, + // pmf_threshold, b0s_mask, min_signal, fit_matrix, + // sampling_matrix, init=True) + + //if (!tidx && !threadIdx.y && !blockIdx.x) { + // printf("seed: %f, %f, %f\n", seed.x, seed.y, seed.z); + //} + + int ndir; + switch(model_type) { + case OPDT: + ndir = get_direction_boot_d( + &st, + max_angle, + min_signal, + relative_peak_thres, + min_separation_angle, + MAKE_REAL3(0,0,0), + dimx, dimy, dimz, dimt, dataf, + b0s_mask /* !dwi_mask */, + seed, + H, R, + // model unused + // max_angle, pmf_threshold from global defines + // b0s_mask already passed + // min_signal from global defines + delta_nr, + delta_b, delta_q, // fit_matrix + samplm_nr, + sampling_matrix, + sphere_vertices, + sphere_edges, + num_edges, + __shDir); + break; + case CSA: + ndir = get_direction_boot_d( + &st, + max_angle, + min_signal, + relative_peak_thres, + min_separation_angle, + MAKE_REAL3(0,0,0), + dimx, dimy, dimz, dimt, dataf, + b0s_mask /* !dwi_mask */, + seed, + H, R, + // model unused + // max_angle, pmf_threshold from global defines + // b0s_mask already passed + // min_signal from global defines + delta_nr, + delta_b, delta_q, // fit_matrix + samplm_nr, + sampling_matrix, + sphere_vertices, + sphere_edges, + num_edges, + __shDir); + break; + default: + printf("FATAL: Invalid Model Type.\n"); + break; + } + + if (tidx == 0) { + slineOutOff[slid] = ndir; + } + + return; +} + +template +__device__ int tracker_boot_d( + curandStatePhilox4_32_10_t *st, + const REAL_T max_angle, + const REAL_T tc_threshold, + const REAL_T step_size, + const REAL_T relative_peak_thres, + const REAL_T 
min_separation_angle, + REAL3_T seed, + REAL3_T first_step, + REAL3_T voxel_size, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const REAL_T *__restrict__ metric_map, + const int samplm_nr, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + /*BOOT specific params*/ + const REAL_T min_signal, + const int delta_nr, + const REAL_T *__restrict__ H, + const REAL_T *__restrict__ R, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ delta_q, + const REAL_T *__restrict__ sampling_matrix, + const int *__restrict__ b0s_mask, + /*BOOT specific params*/ + int *__restrict__ nsteps, + REAL3_T *__restrict__ streamline) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + int tissue_class = TRACKPOINT; + + REAL3_T point = seed; + REAL3_T direction = first_step; + __shared__ REAL3_T __sh_new_dir[BDIM_Y]; + + if (tidx == 0) { + streamline[0] = point; +#if 0 + if (threadIdx.y == 1) { + printf("streamline[0]: %f, %f, %f\n", point.x, point.y, point.z); + } +#endif + } + __syncwarp(WMASK); + + int step_frac = 1; + + int i; + for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) { + int ndir = get_direction_boot_d( + st, + max_angle, + min_signal, + relative_peak_thres, + min_separation_angle, + direction, + dimx, dimy, dimz, dimt, dataf, + b0s_mask /* !dwi_mask */, + point, + H, R, + delta_nr, + delta_b, delta_q, // fit_matrix + samplm_nr, + sampling_matrix, + sphere_vertices, + sphere_edges, + num_edges, + __sh_new_dir + tidy); + __syncwarp(WMASK); + direction = __sh_new_dir[tidy]; + __syncwarp(WMASK); + + if (ndir == 0) { + break; + } + + point.x += (direction.x / voxel_size.x) * (step_size / step_frac); + point.y += (direction.y / voxel_size.y) * (step_size / step_frac); + point.z += (direction.z / voxel_size.z) * (step_size / step_frac); + + if ((tidx == 0) && ((i % step_frac) == 0)){ + streamline[i/step_frac] = point; + } + __syncwarp(WMASK); + + tissue_class = check_point_d(tc_threshold, point, dimx, dimy, dimz, metric_map); + + if (tissue_class == ENDPOINT || + tissue_class == INVALIDPOINT || + tissue_class == OUTSIDEIMAGE) { + break; + } + } + nsteps[0] = i/step_frac; + if (((i % step_frac) != 0) && i < step_frac*(MAX_SLINE_LEN - 1)){ + nsteps[0]++; + if (tidx == 0) { + streamline[nsteps[0]] = point; + } + } + + return tissue_class; +} + +template +__global__ void genStreamlinesMergeBoot_k( + const REAL_T max_angle, + const REAL_T tc_threshold, + const REAL_T step_size, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + const long long rndSeed, + const int rndOffset, + const int nseed, + const REAL3_T *__restrict__ seeds, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const REAL_T *__restrict__ metric_map, + const int samplm_nr, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + /*BOOT specific params*/ + const REAL_T min_signal, + const int delta_nr, + const REAL_T *__restrict__ H, + const REAL_T *__restrict__ R, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ delta_q, + const REAL_T *__restrict__ sampling_matrix, + const int *__restrict__ b0s_mask, + /*BOOT specific params*/ + const int *__restrict__ slineOutOff, + REAL3_T *__restrict__ shDir0, + int 
*__restrict__ slineSeed, + int *__restrict__ slineLen, + REAL3_T *__restrict__ sline) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int slid = blockIdx.x*blockDim.y + threadIdx.y; + + const int lid = (tidy*BDIM_X + tidx) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + curandStatePhilox4_32_10_t st; + // const int gbid = blockIdx.y*gridDim.x + blockIdx.x; + const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; + //curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X) + curand_init(rndSeed, gid+1, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X) + // elements of the same sequence + if (slid >= nseed) { + return; + } + + REAL3_T seed = seeds[slid]; + + int ndir = slineOutOff[slid+1]-slineOutOff[slid]; +#if 0 + if (threadIdx.y == 0 && threadIdx.x == 0) { + printf("%s: ndir: %d\n", __func__, ndir); + for(int i = 0; i < ndir; i++) { + printf("__shDir[%d][%d]: (%f, %f, %f)\n", + tidy, i, __shDir[tidy][i].x, __shDir[tidy][i].y, __shDir[tidy][i].z); + } + } +#endif + __syncwarp(WMASK); + + int slineOff = slineOutOff[slid]; + + for(int i = 0; i < ndir; i++) { + REAL3_T first_step = shDir0[slid*samplm_nr + i]; + + REAL3_T *__restrict__ currSline = sline + slineOff*MAX_SLINE_LEN*2; + + if (tidx == 0) { + slineSeed[slineOff] = slid; + } +#if 0 + if (threadIdx.y == 0 && threadIdx.x == 0) { + printf("calling trackerF from: (%f, %f, %f)\n", first_step.x, first_step.y, first_step.z); + } +#endif + + int stepsB; + const int tissue_classB = tracker_boot_d( + &st, + max_angle, + tc_threshold, + step_size, + relative_peak_thres, + min_separation_angle, + seed, + MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z), + MAKE_REAL3(1, 1, 1), + dimx, dimy, dimz, dimt, dataf, + metric_map, + samplm_nr, + sphere_vertices, + sphere_edges, + num_edges, + min_signal, + delta_nr, + H, + R, + delta_b, + delta_q, + sampling_matrix, + b0s_mask, + &stepsB, + currSline); + + // reverse backward sline + for(int j = 0; j < stepsB/2; j += BDIM_X) { + if (j+tidx < stepsB/2) { + const REAL3_T __p = currSline[j+tidx]; + currSline[j+tidx] = currSline[stepsB-1 - (j+tidx)]; + currSline[stepsB-1 - (j+tidx)] = __p; + } + } + + int stepsF; + const int tissue_classF = tracker_boot_d( + &st, + max_angle, + tc_threshold, + step_size, + relative_peak_thres, + min_separation_angle, + seed, + first_step, + MAKE_REAL3(1, 1, 1), + dimx, dimy, dimz, dimt, dataf, + metric_map, + samplm_nr, + sphere_vertices, + sphere_edges, + num_edges, + min_signal, + delta_nr, + H, + R, + delta_b, + delta_q, + sampling_matrix, + b0s_mask, + &stepsF, + currSline + stepsB-1); + if (tidx == 0) { + slineLen[slineOff] = stepsB-1+stepsF; + } + + slineOff += 1; +#if 0 + if (threadIdx.y == 0 && threadIdx.x == 0) { + printf("%s: stepsF: %d, tissue_classF: %d\n", __func__, stepsF, tissue_classF); + } + __syncwarp(WMASK); +#endif + //if (/* !return_all || */0 && + // tissue_classF != ENDPOINT && + // tissue_classF != OUTSIDEIMAGE) { + // continue; + //} + //if (/* !return_all || */ 0 && + // tissue_classB != ENDPOINT && + // tissue_classB != OUTSIDEIMAGE) { + // continue; + //} + } + return; +} diff --git a/cuslines/cuda_c/generate_streamlines_cuda.cu b/cuslines/cuda_c/generate_streamlines_cuda.cu index b9a84c2..f5629e0 100644 --- a/cuslines/cuda_c/generate_streamlines_cuda.cu +++ b/cuslines/cuda_c/generate_streamlines_cuda.cu @@ -29,13 +29,13 @@ #include #include -#include "cudamacro.h" /* for 
time() */ #include "globals.h" - #include "cuwsort.cuh" #include "ptt.cuh" #include "utils.cu" +#include "tracking_helpers.cu" +#include "boot.cu" #include "ptt.cu" #define MAX_NUM_DIR (128) @@ -45,630 +45,6 @@ #define MAX_DIMS (8) #define MAX_STR_LEN (256) -using namespace cuwsort; - -//#define USE_FIXED_PERMUTATION -#ifdef USE_FIXED_PERMUTATION -//__device__ const int fixedPerm[] = {44, 47, 53, 0, 3, 3, 39, 9, 19, 21, 50, 36, 23, -// 6, 24, 24, 12, 1, 38, 39, 23, 46, 24, 17, 37, 25, -// 13, 8, 9, 20, 51, 16, 51, 5, 15, 47, 0, 18, 35, -// 24, 49, 51, 29, 19, 19, 14, 39, 32, 1, 9, 32, 31, -// 10, 52, 23}; -__device__ const int fixedPerm[] = { - 47, 117, 67, 103, 9, 21, 36, 87, 70, 88, 140, 58, 39, 87, 88, 81, 25, 77, - 72, 9, 148, 115, 79, 82, 99, 29, 147, 147, 142, 32, 9, 127, 32, 31, 114, 28, - 34, 128, 128, 53, 133, 38, 17, 79, 132, 105, 42, 31, 120, 1, 65, 57, 35, 102, - 119, 11, 82, 91, 128, 142, 99, 53, 140, 121, 84, 68, 6, 47, 127, 131, 100, 78, - 143, 148, 23, 141, 117, 85, 48, 49, 69, 95, 94, 0, 113, 36, 48, 93, 131, 98, - 42, 112, 149, 127, 0, 138, 114, 43, 127, 23, 130, 121, 98, 62, 123, 82, 148, 50, - 14, 41, 58, 36, 10, 86, 43, 104, 11, 2, 51, 80, 32, 128, 38, 19, 42, 115, - 77, 30, 24, 125, 2, 3, 94, 107, 13, 112, 40, 72, 19, 95, 72, 67, 61, 14, - 96, 4, 139, 86, 121, 109}; -#endif - -template -__device__ void ndotp_d(const int N, - const int M, - const VAL_T *__restrict__ srcV, - const VAL_T *__restrict__ srcM, - VAL_T *__restrict__ dstV) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - //#pragma unroll - for(int i = 0; i < N; i++) { - - VAL_T __tmp = 0; - - //#pragma unroll - for(int j = 0; j < M; j += BDIM_X) { - if (j+tidx < M) { - __tmp += srcV[j+tidx]*srcM[i*M + j+tidx]; - } - } - #pragma unroll - for(int j = BDIM_X/2; j; j /= 2) { -#if 0 - __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); -#else - __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); -#endif - } - // values could be held by BDIM_X threads and written - // together every BDIM_X iterations... - - if (tidx == 0) { - dstV[i] = __tmp; - } - } - return; -} - - -template -__device__ void ndotp_log_opdt_d(const int N, - const int M, - const VAL_T *__restrict__ srcV, - const VAL_T *__restrict__ srcM, - VAL_T *__restrict__ dstV) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const VAL_T ONEP5 = static_cast(1.5); - - //#pragma unroll - for(int i = 0; i < N; i++) { - - VAL_T __tmp = 0; - - //#pragma unroll - for(int j = 0; j < M; j += BDIM_X) { - if (j+tidx < M) { - const VAL_T v = srcV[j+tidx]; - __tmp += -LOG(v)*(ONEP5+LOG(v))*v * srcM[i*M + j+tidx]; - } - } - #pragma unroll - for(int j = BDIM_X/2; j; j /= 2) { -#if 0 - __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); -#else - __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); -#endif - } - // values could be held by BDIM_X threads and written - // together every BDIM_X iterations... 
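// [editor's note] The shuffle loop above is the standard warp-level tree
// reduction used throughout these helpers: the stride halves every pass, so
// after log2(BDIM_X) steps the group's partial sums have collapsed onto its
// first lane, which is why only tidx == 0 stores dstV[i] afterwards. A
// minimal self-contained sketch of the idiom, assuming BDIM_X is a power of
// two no larger than the 32-lane warp (warp_group_sum is an illustrative
// name, not part of this codebase):

template <int BDIM_X>
__device__ float warp_group_sum(float v, const unsigned mask) {
    #pragma unroll
    for (int j = BDIM_X / 2; j; j /= 2) {
        v += __shfl_down_sync(mask, v, j, BDIM_X); // lane i adds lane i+j
    }
    return v; // the full sum is valid on the first lane of each group
}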
- - if (tidx == 0) { - dstV[i] = __tmp; - } - } - return; -} - -template -__device__ void ndotp_log_csa_d(const int N, - const int M, - const VAL_T *__restrict__ srcV, - const VAL_T *__restrict__ srcM, - VAL_T *__restrict__ dstV) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - // Clamp values - constexpr VAL_T min = .001; - constexpr VAL_T max = .999; - - //#pragma unroll - for(int i = 0; i < N; i++) { - - VAL_T __tmp = 0; - - //#pragma unroll - for(int j = 0; j < M; j += BDIM_X) { - if (j+tidx < M) { - const VAL_T v = MIN(MAX(srcV[j+tidx], min), max); - __tmp += LOG(-LOG(v)) * srcM[i*M + j+tidx]; - } - } - #pragma unroll - for(int j = BDIM_X/2; j; j /= 2) { -#if 0 - __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); -#else - __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); -#endif - } - // values could be held by BDIM_X threads and written - // together every BDIM_X iterations... - - if (tidx == 0) { - dstV[i] = __tmp; - } - } - return; -} - - -template -__device__ void fit_opdt(const int delta_nr, - const int hr_side, - const REAL_T *__restrict__ delta_q, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ __msk_data_sh, - REAL_T *__restrict__ __h_sh, - REAL_T *__restrict__ __r_sh) { - const int tidx = threadIdx.x; - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - ndotp_log_opdt_d(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh); - ndotp_d (delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh); - __syncwarp(WMASK); - #pragma unroll - for(int j = tidx; j < delta_nr; j += BDIM_X) { - __r_sh[j] -= __h_sh[j]; - } - __syncwarp(WMASK); -} - -template -__device__ void fit_csa(const int delta_nr, - const int hr_side, - const REAL_T *__restrict__ fit_matrix, - const REAL_T *__restrict__ __msk_data_sh, - REAL_T *__restrict__ __r_sh) { - const int tidx = threadIdx.x; - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - constexpr REAL _n0_const = 0.28209479177387814; // .5 / sqrt(pi) - ndotp_log_csa_d(delta_nr, hr_side, __msk_data_sh, fit_matrix, __r_sh); - __syncwarp(WMASK); - if (tidx == 0) { - __r_sh[0] = _n0_const; - } - __syncwarp(WMASK); -} - -template -__device__ void fit_model_coef(const int delta_nr, // delta_nr is number of ODF directions - const int hr_side, // hr_side is number of data directions - const REAL_T *__restrict__ delta_q, - const REAL_T *__restrict__ delta_b, // these are fit matrices the model can use, different for each model - const REAL_T *__restrict__ __msk_data_sh, // __msk_data_sh is the part of the data currently being operated on by this block - REAL_T *__restrict__ __h_sh, // these last two are modifications to the coefficients that will be returned - REAL_T *__restrict__ __r_sh) { - switch(MODEL_T) { - case OPDT: - fit_opdt(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh); - break; - case CSA: - fit_csa(delta_nr, hr_side, delta_q, __msk_data_sh, __r_sh); - break; - default: - printf("FATAL: Invalid Model Type.\n"); - break; - } -} - -template -__device__ VAL_T max_mask_transl_d(const int n, - const LEN_T *__restrict__ srcMsk, - const VAL_T *__restrict__ srcVal, - const VAL_T offset, - const VAL_T minVal) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = 
((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - VAL_T __m = minVal; - - for(int i = tidx; i < n; i += BDIM_X) { - const LEN_T sel = srcMsk[i]; - if (sel > 0) { - __m = MAX(__m, srcVal[i]+offset); - } - } - - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); - __m = MAX(__m, __tmp); - } - - return __m; -} - -template -__device__ VAL_T min_d(const int n, const VAL_T *__restrict__ src, const VAL_T maxVal) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - VAL_T __m = maxVal; - - for(int i = tidx; i < n; i += BDIM_X) { - __m = MIN(__m, src[i]); - } - - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); - __m = MIN(__m, __tmp); - } - - return __m; -} - -template -__device__ VAL_T avgMask(const int mskLen, - const int *__restrict__ mask, - const VAL_T *__restrict__ data) { - - const int tidx = threadIdx.x; - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - int __myCnt = 0; - VAL_T __mySum = 0; - - for(int i = tidx; i < mskLen; i += BDIM_X) { - if(mask[i]) { - __myCnt++; - __mySum += data[i]; - } - } - - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - __mySum += __shfl_xor_sync(WMASK, __mySum, i, BDIM_X); - __myCnt += __shfl_xor_sync(WMASK, __myCnt, i, BDIM_X); - } - - return __mySum/__myCnt; - -} - -template -__device__ int peak_directions_d(const REAL_T *__restrict__ odf, - REAL3_T *__restrict__ dirs, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - int samplm_nr, - int *__restrict__ __shInd, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const unsigned int lmask = (1 << lid)-1; - -// __shared__ int __shInd[BDIM_Y][SAMPLM_NR]; - - #pragma unroll - for(int j = tidx; j < samplm_nr; j += BDIM_X) { - __shInd[j] = 0; - } - - REAL_T odf_min = min_d(samplm_nr, odf, REAL_MAX); - odf_min = MAX(0, odf_min); - - __syncwarp(WMASK); - - // local_maxima() + _compare_neighbors() - // selecting only the indices corrisponding to maxima Ms - // such that M-odf_min >= relative_peak_thres - //#pragma unroll - for(int j = 0; j < num_edges; j += BDIM_X) { - if (j+tidx < num_edges) { - const int u_ind = sphere_edges[j+tidx].x; - const int v_ind = sphere_edges[j+tidx].y; - - //if (u_ind >= NUM_EDGES || v_ind >= NUM_EDGES) { ERROR; } - - const REAL_T u_val = odf[u_ind]; - const REAL_T v_val = odf[v_ind]; - - //if (u_val != u_val || v_val != v_val) { ERROR_NANs; } - - // only check that they are not equal - //if (u_val != v_val) { - // __shInd[tidy][u_val < v_val ? u_ind : v_ind] = -1; // benign race conditions... 
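// [editor's note] A hedged scalar sketch of what the atomic edge sweep below
// computes (ind[] standing in for __shInd): for every sphere edge (u, v) the
// endpoint with the smaller ODF value can never be a local maximum and is
// forced to -1, while the larger endpoint is flagged as having beaten a
// neighbour. Only vertices left strictly positive survive as candidate
// peaks. The races in the concurrent version are benign because the two
// updates commute on the values involved (-1 | 1 == -1).
//
//   for (int e = 0; e < num_edges; e++) {   // serial equivalent
//       const int u = sphere_edges[e].x, v = sphere_edges[e].y;
//       if      (odf[u] < odf[v]) { ind[u] = -1; ind[v] |= 1; }
//       else if (odf[v] < odf[u]) { ind[v] = -1; ind[u] |= 1; }
//   }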
- //} - if (u_val < v_val) { - atomicExch(__shInd+u_ind, -1); - atomicOr( __shInd+v_ind, 1); - } else if (v_val < u_val) { - atomicExch(__shInd+v_ind, -1); - atomicOr( __shInd+u_ind, 1); - } - } - } - __syncwarp(WMASK); - - const REAL_T compThres = relative_peak_thres*max_mask_transl_d(samplm_nr, __shInd, odf, -odf_min, REAL_MIN); -#if 1 -/* - if (!tidy && !tidx) { - for(int j = 0; j < SAMPLM_NR; j++) { - printf("local_max[%d]: %d (%f)\n", j, __shInd[tidy][j], odf[j]); - } - printf("maxMax with offset %f: %f\n", -odf_min, compThres); - } - __syncwarp(WMASK); -*/ - // compact indices of positive values to the right - int n = 0; - - for(int j = 0; j < samplm_nr; j += BDIM_X) { - - const int __v = (j+tidx < samplm_nr) ? __shInd[j+tidx] : -1; - const int __keep = (__v > 0) && ((odf[j+tidx]-odf_min) >= compThres); - const int __msk = __ballot_sync(WMASK, __keep); - -//__syncwarp(WMASK); // unnecessary - if (__keep) { - const int myoff = __popc(__msk & lmask); - __shInd[n + myoff] = j+tidx; - } - n += __popc(__msk); -//__syncwarp(WMASK); // should be unnecessary - } - __syncwarp(WMASK); -/* - if (!tidy && !tidx) { - for(int j = 0; j < n; j++) { - printf("local_max_compact[%d]: %d\n", j, __shInd[tidy][j]); - } - } - __syncwarp(WMASK); -*/ - - // sort local maxima indices - if (n < BDIM_X) { - REAL_T k = REAL_MIN; - int v = 0; - if (tidx < n) { - v = __shInd[tidx]; - k = odf[v]; - } - warp_sort<32, BDIM_X, WSORT_DIR_DEC>(&k, &v); - __syncwarp(WMASK); - - if (tidx < n) { - __shInd[tidx] = v; - } - } else { - // ERROR !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - } - __syncwarp(WMASK); - - // __shInd[tidy][] contains the indices in odf correspoding to - // normalized maxima NOT sorted! - if (n != 0) { - // remove_similar_vertices() - // PRELIMINARY INEFFICIENT, SINGLE TH, IMPLEMENTATION - if (tidx == 0) { - const REAL_T cos_similarity = COS(min_separation_angle); - - dirs[0] = sphere_vertices[__shInd[0]]; - - int k = 1; - for(int i = 1; i < n; i++) { - - const REAL3_T abc = sphere_vertices[__shInd[i]]; - - int j = 0; - for(; j < k; j++) { - const REAL_T cos = FABS(abc.x*dirs[j].x+ - abc.y*dirs[j].y+ - abc.z*dirs[j].z); - if (cos > cos_similarity) { - break; - } - } - if (j == k) { - dirs[k++] = abc; - } - } - n = k; - } - n = __shfl_sync(WMASK, n, 0, BDIM_X); - __syncwarp(WMASK); - - } -/* - if (!tidy && !tidx) { - for(int j = 0; j < n; j++) { - printf("local_max_compact_uniq[%d]: %d\n", j, __shInd[tidy][j]); - } - } - __syncwarp(WMASK); -*/ -#else - const int indMax = max_d(__shInd[tidy], -1); - if (indMax != -1) { - __ret = MAKE_REAL3(sphere_vertices[indMax][0], - sphere_vertices[indMax][1], - sphere_vertices[indMax][2]); - } -#endif - return n; -} - -template -__device__ int closest_peak_d(const REAL_T max_angle, - const REAL3_T direction, //dir - const int npeaks, - const REAL3_T *__restrict__ peaks, - REAL3_T *__restrict__ peak) {// dirs, - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - //const REAL_T cos_similarity = COS(MAX_ANGLE_P); - const REAL_T cos_similarity = COS(max_angle); -#if 0 - if (!threadIdx.y && !tidx) { - printf("direction: (%f, %f, %f)\n", - direction.x, direction.y, direction.z); - } - __syncwarp(WMASK); -#endif - REAL_T cpeak_dot = 0; - int cpeak_idx = -1; - for(int j = 0; j < npeaks; j += BDIM_X) { - if (j+tidx < npeaks) { -#if 0 - if (!threadIdx.y && !tidx) { - printf("j+tidx: %d, peaks[j+tidx]: (%f, %f, %f)\n", - j+tidx, peaks[j+tidx].x, peaks[j+tidx].y, 
peaks[j+tidx].z); - } -#endif - const REAL_T dot = direction.x*peaks[j+tidx].x+ - direction.y*peaks[j+tidx].y+ - direction.z*peaks[j+tidx].z; - - if (FABS(dot) > FABS(cpeak_dot)) { - cpeak_dot = dot; - cpeak_idx = j+tidx; - } - } - } -#if 0 - if (!threadIdx.y && !tidx) { - printf("cpeak_idx: %d, cpeak_dot: %f\n", cpeak_idx, cpeak_dot); - } - __syncwarp(WMASK); -#endif - - #pragma unroll - for(int j = BDIM_X/2; j; j /= 2) { - - const REAL_T dot = __shfl_xor_sync(WMASK, cpeak_dot, j, BDIM_X); - const int idx = __shfl_xor_sync(WMASK, cpeak_idx, j, BDIM_X); - if (FABS(dot) > FABS(cpeak_dot)) { - cpeak_dot = dot; - cpeak_idx = idx; - } - } -#if 0 - if (!threadIdx.y && !tidx) { - printf("cpeak_idx: %d, cpeak_dot: %f, cos_similarity: %f\n", cpeak_idx, cpeak_dot, cos_similarity); - } - __syncwarp(WMASK); -#endif - if (cpeak_idx >= 0) { - if (cpeak_dot >= cos_similarity) { - peak[0] = peaks[cpeak_idx]; - return 1; - } - if (cpeak_dot <= -cos_similarity) { - peak[0] = MAKE_REAL3(-peaks[cpeak_idx].x, - -peaks[cpeak_idx].y, - -peaks[cpeak_idx].z); - return 1; - } - } - return 0; -} - -template -__device__ LEN_T maskGet(const LEN_T n, - const MSK_T *__restrict__ mask, - const VAL_T *__restrict__ plain, - VAL_T *__restrict__ masked) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const int __laneMask = (1 << tidx)-1; - - int woff = 0; - for(int j = 0; j < n; j += BDIM_X) { - - const int __act = (j+tidx < n) ? !mask[j+tidx] : 0; - const int __msk = __ballot_sync(WMASK, __act); - - const int toff = __popc(__msk & __laneMask); - if (__act) { - masked[woff+toff] = plain[j+tidx]; - } - woff += __popc(__msk); - } - return woff; -} - -template -__device__ void maskPut(const LEN_T n, - const MSK_T *__restrict__ mask, - const VAL_T *__restrict__ masked, - VAL_T *__restrict__ plain) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const int __laneMask = (1 << tidx)-1; - - int woff = 0; - for(int j = 0; j < n; j += BDIM_X) { - - const int __act = (j+tidx < n) ? !mask[j+tidx] : 0; - const int __msk = __ballot_sync(WMASK, __act); - - const int toff = __popc(__msk & __laneMask); - if (__act) { - plain[j+tidx] = masked[woff+toff]; - } - woff += __popc(__msk); - } - return; -} - template -__device__ int get_direction_boot_d( - curandStatePhilox4_32_10_t *st, - const REAL_T max_angle, - const REAL_T min_signal, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle, - REAL3_T dir, - const int dimx, - const int dimy, - const int dimz, - const int dimt, - const REAL_T *__restrict__ dataf, - const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask) - // but not clear if it will never be needed so - // we'll keep it here for now... 
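// [editor's note] H and R realize a residual bootstrap: H*data reconstructs
// the model-fitted signal and R*data the per-direction residuals, so each
// of the NATTEMPTS iterations below resamples a plausible signal as fit
// plus randomly permuted residuals (the __h_sh[j] += __r_sh[srcPermInd]
// step), refits the model, and searches the resulting pmf for peaks.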
- const REAL3_T point, - const REAL_T *__restrict__ H, - const REAL_T *__restrict__ R, - // model unused - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - const int delta_nr, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ delta_q, // fit_matrix - const int samplm_nr, - const REAL_T *__restrict__ sampling_matrix, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - REAL3_T *__restrict__ dirs) { - - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const int n32dimt = ((dimt+31)/32)*32; - - extern REAL_T __shared__ __sh[]; - - REAL_T *__vox_data_sh = reinterpret_cast(__sh); - REAL_T *__msk_data_sh = __vox_data_sh + BDIM_Y*n32dimt; - - REAL_T *__r_sh = __msk_data_sh + BDIM_Y*n32dimt; - REAL_T *__h_sh = __r_sh + BDIM_Y*MAX(n32dimt, samplm_nr); - - __vox_data_sh += tidy*n32dimt; - __msk_data_sh += tidy*n32dimt; - - __r_sh += tidy*MAX(n32dimt, samplm_nr); - __h_sh += tidy*MAX(n32dimt, samplm_nr); - - // compute hr_side (may be passed from python) - int hr_side = 0; - for(int j = tidx; j < dimt; j += BDIM_X) { - hr_side += !b0s_mask[j] ? 1 : 0; - } - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - hr_side += __shfl_xor_sync(WMASK, hr_side, i, BDIM_X); - } - - #pragma unroll - for(int i = 0; i < NATTEMPTS; i++) { - - const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, -1, dataf, point, __vox_data_sh); - - const int nmsk = maskGet(dimt, b0s_mask, __vox_data_sh, __msk_data_sh); - - //if (!tidx && !threadIdx.y && !blockIdx.x) { - // - // printf("interp of %f, %f, %f\n", point.x, point.y, point.z); - // printf("hr_side: %d\n", hr_side); - // printArray("vox_data", 6, dimt, __vox_data_sh[tidy]); - // printArray("msk_data", 6, nmsk, __msk_data_sh[tidy]); - //} - //break; - - __syncwarp(WMASK); - - if (rv == 0) { - - ndotp_d(hr_side, hr_side, __msk_data_sh, R, __r_sh); - //__syncwarp(); - //printArray("__r", 5, hr_side*hr_side, R); - //printArray("__r_sh", 6, hr_side, __r_sh[tidy]); - - ndotp_d(hr_side, hr_side, __msk_data_sh, H, __h_sh); - //__syncwarp(); - //printArray("__h_sh", 6, hr_side, __h_sh[tidy]); - - __syncwarp(WMASK); - - for(int j = 0; j < hr_side; j += BDIM_X) { - if (j+tidx < hr_side) { -#ifdef USE_FIXED_PERMUTATION - const int srcPermInd = fixedPerm[j+tidx]; -#else - const int srcPermInd = curand(st) % hr_side; -// if (srcPermInd < 0 || srcPermInd >= hr_side) { -// printf("srcPermInd: %d\n", srcPermInd); -// } -#endif - __h_sh[j+tidx] += __r_sh[srcPermInd]; - //__h_sh[j+tidx] += __r_sh[j+tidx]; - } - } - __syncwarp(WMASK); - - //printArray("h+perm(r):", 6, hr_side, __h_sh[tidy]); - //__syncwarp(); - - // vox_data[dwi_mask] = masked_data - maskPut(dimt, b0s_mask, __h_sh, __vox_data_sh); - __syncwarp(WMASK); - - //printArray("vox_data[dwi_mask]:", 6, dimt, __vox_data_sh[tidy]); - //__syncwarp(); - - for(int j = tidx; j < dimt; j += BDIM_X) { - //__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]); - __vox_data_sh[j] = MAX(min_signal, __vox_data_sh[j]); - } - __syncwarp(WMASK); - - const REAL_T denom = avgMask(dimt, b0s_mask, __vox_data_sh); - - for(int j = tidx; j < dimt; j += BDIM_X) { - __vox_data_sh[j] /= denom; - } - __syncwarp(); - - //if (!tidx && !threadIdx.y && !blockIdx.x) { - // printf("denom: %f\n", denom); - //} - ////break; - //if (!tidx && !threadIdx.y && !blockIdx.x) 
{ - // - // printf("__vox_data_sh:\n"); - // printArray("vox_data", 6, dimt, __vox_data_sh[tidy]); - //} - //break; - - maskGet(dimt, b0s_mask, __vox_data_sh, __msk_data_sh); - __syncwarp(WMASK); - - fit_model_coef(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh); - - // __r_sh[tidy] <- python 'coef' - - ndotp_d(samplm_nr, delta_nr, __r_sh, sampling_matrix, __h_sh); - - // __h_sh[tidy] <- python 'pmf' - } else { - #pragma unroll - for(int j = tidx; j < samplm_nr; j += BDIM_X) { - __h_sh[j] = 0; - } - // __h_sh[tidy] <- python 'pmf' - } - __syncwarp(WMASK); -#if 0 - if (!threadIdx.y && threadIdx.x == 0) { - for(int j = 0; j < samplm_nr; j++) { - printf("pmf[%d]: %f\n", j, __h_sh[tidy][j]); - } - } - //return; -#endif - const REAL_T abs_pmf_thr = PMF_THRESHOLD_P*max_d(samplm_nr, __h_sh, REAL_MIN); - __syncwarp(WMASK); - - #pragma unroll - for(int j = tidx; j < samplm_nr; j += BDIM_X) { - const REAL_T __v = __h_sh[j]; - if (__v < abs_pmf_thr) { - __h_sh[j] = 0; - } - } - __syncwarp(WMASK); -#if 0 - if (!threadIdx.y && threadIdx.x == 0) { - printf("abs_pmf_thr: %f\n", abs_pmf_thr); - for(int j = 0; j < samplm_nr; j++) { - printf("pmfNORM[%d]: %f\n", j, __h_sh[tidy][j]); - } - } - //return; -#endif -#if 0 - if init: - directions = peak_directions(pmf, sphere)[0] - return directions - else: - peaks = peak_directions(pmf, sphere)[0] - if (len(peaks) > 0): - return closest_peak(directions, peaks, cos_similarity) -#endif - const int ndir = peak_directions_d(__h_sh, dirs, - sphere_vertices, - sphere_edges, - num_edges, - samplm_nr, - reinterpret_cast(__r_sh), // reuse __r_sh as shInd in func which is large enough - relative_peak_thres, - min_separation_angle); - if (NATTEMPTS == 1) { // init=True... - return ndir; // and dirs; - } else { // init=False... - if (ndir > 0) { - /* - if (!threadIdx.y && threadIdx.x == 0 && ndir > 1) { - printf("NATTEMPTS=5 and ndir: %d!!!\n", ndir); - } - */ - REAL3_T peak; - const int foundPeak = closest_peak_d(max_angle, dir, ndir, dirs, &peak); - __syncwarp(WMASK); - if (foundPeak) { - if (tidx == 0) { - dirs[0] = peak; - } - return 1; - } - } - } - } - return 0; -} - -enum {OUTSIDEIMAGE, INVALIDPOINT, TRACKPOINT, ENDPOINT}; - -template -__device__ int check_point_d(const REAL_T tc_threshold, - const REAL3_T point, - const int dimx, - const int dimy, - const int dimz, - const REAL_T *__restrict__ metric_map) { - - const int tidy = threadIdx.y; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - __shared__ REAL_T __shInterpOut[BDIM_Y]; - - const int rv = trilinear_interp_d(dimx, dimy, dimz, 1, 0, metric_map, point, __shInterpOut+tidy); - __syncwarp(WMASK); -#if 0 - if (threadIdx.y == 1 && threadIdx.x == 0) { - printf("__shInterpOut[tidy]: %f, TC_THRESHOLD_P: %f\n", __shInterpOut[tidy], TC_THRESHOLD_P); - } -#endif - if (rv != 0) { - return OUTSIDEIMAGE; - } - //return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT; - return (__shInterpOut[tidy] > tc_threshold) ? 
TRACKPOINT : ENDPOINT; -} - template::type* __restrict__ ctx, const int samplm_nr, const REAL3_T *__restrict__ sphere_vertices, const int2 *__restrict__ sphere_edges, @@ -1229,11 +331,6 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, if (tidx == 0) { streamline[0] = point; -#if 0 - if (threadIdx.y == 1) { - printf("streamline[0]: %f, %f, %f\n", point.x, point.y, point.z); - } -#endif } __syncwarp(WMASK); @@ -1277,30 +374,6 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, point, sphere_vertices, __sh_new_dir + tidy); - } else { - // call get_direction_boot_d() with NATTEMPTS=5 - ndir = get_direction_boot_d( - st, - max_angle, - ctx->min_signal, - relative_peak_thres, - min_separation_angle, - direction, - dimx, dimy, dimz, dimt, dataf, - ctx->b0s_mask /* !dwi_mask */, - point, - ctx->H, ctx->R, - ctx->delta_nr, - ctx->delta_b, ctx->delta_q, // fit_matrix - samplm_nr, - ctx->sampling_matrix, - sphere_vertices, - sphere_edges, - num_edges, - __sh_new_dir + tidy); } __syncwarp(WMASK); direction = __sh_new_dir[tidy]; @@ -1315,9 +388,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, } //return; #endif - //point.x += (direction.x / voxel_size.x) * STEP_SIZE_P; - //point.y += (direction.y / voxel_size.y) * STEP_SIZE_P; - //point.z += (direction.z / voxel_size.z) * STEP_SIZE_P; + point.x += (direction.x / voxel_size.x) * (step_size / step_frac); point.y += (direction.y / voxel_size.y) * (step_size / step_frac); point.z += (direction.z / voxel_size.z) * (step_size / step_frac); @@ -1382,136 +453,6 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st, return tissue_class; } -template -__global__ void getNumStreamlinesBoot_k( - const ModelType model_type, - const REAL_T max_angle, - const REAL_T min_signal, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle, - const long long rndSeed, - const int nseed, - const REAL3_T *__restrict__ seeds, - const int dimx, - const int dimy, - const int dimz, - const int dimt, - const REAL_T *__restrict__ dataf, - const REAL_T *__restrict__ H, - const REAL_T *__restrict__ R, - const int delta_nr, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ delta_q, - const int *__restrict__ b0s_mask, // change to int - const int samplm_nr, - const REAL_T *__restrict__ sampling_matrix, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - REAL3_T *__restrict__ shDir0, - int *slineOutOff) { - - const int tidx = threadIdx.x; - const int slid = blockIdx.x*blockDim.y + threadIdx.y; - const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; - - if (slid >= nseed) { - return; - } - - REAL3_T seed = seeds[slid]; - // seed = lin_mat*seed + offset - - REAL3_T *__restrict__ __shDir = shDir0+slid*samplm_nr; - - // const int hr_side = dimt-1; - - curandStatePhilox4_32_10_t st; - //curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X) - curand_init(rndSeed, gid, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X) - // elements of the same sequence - // python: - //directions = get_direction(None, dataf, dwi_mask, sphere, s, H, R, model, max_angle, - // pmf_threshold, b0s_mask, min_signal, fit_matrix, - // sampling_matrix, init=True) - - //if (!tidx && !threadIdx.y && !blockIdx.x) { - // printf("seed: %f, %f, %f\n", seed.x, seed.y, seed.z); - //} - - int ndir; - switch(model_type) { - case OPDT: - ndir = get_direction_boot_d( - &st, - max_angle, - 
min_signal, - relative_peak_thres, - min_separation_angle, - MAKE_REAL3(0,0,0), - dimx, dimy, dimz, dimt, dataf, - b0s_mask /* !dwi_mask */, - seed, - H, R, - // model unused - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - delta_nr, - delta_b, delta_q, // fit_matrix - samplm_nr, - sampling_matrix, - sphere_vertices, - sphere_edges, - num_edges, - __shDir); - break; - case CSA: - ndir = get_direction_boot_d( - &st, - max_angle, - min_signal, - relative_peak_thres, - min_separation_angle, - MAKE_REAL3(0,0,0), - dimx, dimy, dimz, dimt, dataf, - b0s_mask /* !dwi_mask */, - seed, - H, R, - // model unused - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - delta_nr, - delta_b, delta_q, // fit_matrix - samplm_nr, - sampling_matrix, - sphere_vertices, - sphere_edges, - num_edges, - __shDir); - break; - default: - printf("FATAL: Invalid Model Type.\n"); - break; - } - - if (tidx == 0) { - slineOutOff[slid] = ndir; - } - - return; -} - template -__global__ void genStreamlinesMerge_k( +__global__ void genStreamlinesMergeProb_k( const REAL_T max_angle, const REAL_T tc_threshold, const REAL_T step_size, @@ -1588,7 +529,6 @@ __global__ void genStreamlinesMerge_k( const int dimt, const REAL_T *__restrict__ dataf, const REAL_T *__restrict__ metric_map, - const typename ModelCtx::type* __restrict__ ctx, const int samplm_nr, const REAL3_T *__restrict__ sphere_vertices, const int2 *__restrict__ sphere_edges, @@ -1689,8 +629,7 @@ __global__ void genStreamlinesMerge_k( MAKE_REAL3(1, 1, 1), dimx, dimy, dimz, dimt, dataf, metric_map, - ctx, - samplm_nr, + samplm_nr, sphere_vertices, sphere_edges, num_edges, @@ -1724,8 +663,7 @@ __global__ void genStreamlinesMerge_k( MAKE_REAL3(1, 1, 1), dimx, dimy, dimz, dimt, dataf, metric_map, - ctx, - samplm_nr, + samplm_nr, sphere_vertices, sphere_edges, num_edges, diff --git a/cuslines/cuda_c/globals.h b/cuslines/cuda_c/globals.h index b9f8211..71bcd73 100644 --- a/cuslines/cuda_c/globals.h +++ b/cuslines/cuda_c/globals.h @@ -98,33 +98,6 @@ enum ModelType { PTT = 3, }; -struct NoCtx {}; - -template -struct BootCtx { - REAL_T min_signal; - int delta_nr; - const REAL_T* H; - const REAL_T* R; - const REAL_T* delta_b; - const REAL_T* delta_q; - const REAL_T* sampling_matrix; - const int* b0s_mask; -}; - -template -struct ModelCtx { - using type = NoCtx; -}; - -template -struct ModelCtx { - using type = BootCtx; -}; - -template -struct ModelCtx { - using type = BootCtx; -}; +enum {OUTSIDEIMAGE, INVALIDPOINT, TRACKPOINT, ENDPOINT}; #endif diff --git a/cuslines/cuda_c/tracking_helpers.cu b/cuslines/cuda_c/tracking_helpers.cu new file mode 100644 index 0000000..21d5f67 --- /dev/null +++ b/cuslines/cuda_c/tracking_helpers.cu @@ -0,0 +1,290 @@ + +using namespace cuwsort; + +template +__device__ REAL_T interpolation_helper_d(const REAL_T*__restrict__ dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) { + REAL_T __tmp = 0; + #pragma unroll + for (int i = 0; i < 2; i++) { + #pragma unroll + for (int j = 0; j < 2; j++) { + #pragma unroll + for (int k = 0; k < 2; k++) { + __tmp += wgh[0][i] * wgh[1][j] * wgh[2][k] * + dataf[coo[0][i] * dimy * dimz * dimt + + coo[1][j] * dimz * dimt + + coo[2][k] * dimt + + t]; + } + } + } + return __tmp; +} + +template +__device__ int trilinear_interp_d(const int dimx, + const int dimy, + const int dimz, + const int dimt, + int dimt_idx, // If -1, get all + const REAL_T 
*__restrict__ dataf, + const REAL3_T point, + REAL_T *__restrict__ __vox_data) { + const REAL_T HALF = static_cast(0.5); + + // all thr compute the same here + if (point.x < -HALF || point.x+HALF >= dimx || + point.y < -HALF || point.y+HALF >= dimy || + point.z < -HALF || point.z+HALF >= dimz) { + return -1; + } + + long long coo[3][2]; + REAL_T wgh[3][2]; // could use just one... + + const REAL_T ONE = static_cast(1.0); + + const REAL3_T fl = MAKE_REAL3(FLOOR(point.x), + FLOOR(point.y), + FLOOR(point.z)); + + wgh[0][1] = point.x - fl.x; + wgh[0][0] = ONE-wgh[0][1]; + coo[0][0] = MAX(0, fl.x); + coo[0][1] = MIN(dimx-1, coo[0][0]+1); + + wgh[1][1] = point.y - fl.y; + wgh[1][0] = ONE-wgh[1][1]; + coo[1][0] = MAX(0, fl.y); + coo[1][1] = MIN(dimy-1, coo[1][0]+1); + + wgh[2][1] = point.z - fl.z; + wgh[2][0] = ONE-wgh[2][1]; + coo[2][0] = MAX(0, fl.z); + coo[2][1] = MIN(dimz-1, coo[2][0]+1); + + if (dimt_idx == -1) { + for (int t = threadIdx.x; t < dimt; t += BDIM_X) { + __vox_data[t] = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, t); + } + } else { + *__vox_data = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, dimt_idx); + } + + // if (threadIdx.x == 0) { + // printf("point: %f, %f, %f\n", point.x, point.y, point.z); + // printf("dimt_idx: %d\n", dimt_idx); + // // for(int i = 0; i < dimt; i++) { + // // printf("__vox_data[%d]: %f\n", i, __vox_data[i]); + // // } + // } + return 0; +} + +template +__device__ int check_point_d(const REAL_T tc_threshold, + const REAL3_T point, + const int dimx, + const int dimy, + const int dimz, + const REAL_T *__restrict__ metric_map) { + + const int tidy = threadIdx.y; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + __shared__ REAL_T __shInterpOut[BDIM_Y]; + + const int rv = trilinear_interp_d(dimx, dimy, dimz, 1, 0, metric_map, point, __shInterpOut+tidy); + __syncwarp(WMASK); +#if 0 + if (threadIdx.y == 1 && threadIdx.x == 0) { + printf("__shInterpOut[tidy]: %f, TC_THRESHOLD_P: %f\n", __shInterpOut[tidy], TC_THRESHOLD_P); + } +#endif + if (rv != 0) { + return OUTSIDEIMAGE; + } + //return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT; + return (__shInterpOut[tidy] > tc_threshold) ? 
TRACKPOINT : ENDPOINT; +} + +template +__device__ int peak_directions_d(const REAL_T *__restrict__ odf, + REAL3_T *__restrict__ dirs, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + int samplm_nr, + int *__restrict__ __shInd, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const unsigned int lmask = (1 << lid)-1; + +// __shared__ int __shInd[BDIM_Y][SAMPLM_NR]; + + #pragma unroll + for(int j = tidx; j < samplm_nr; j += BDIM_X) { + __shInd[j] = 0; + } + + REAL_T odf_min = min_d(samplm_nr, odf, REAL_MAX); + odf_min = MAX(0, odf_min); + + __syncwarp(WMASK); + + // local_maxima() + _compare_neighbors() + // selecting only the indices corrisponding to maxima Ms + // such that M-odf_min >= relative_peak_thres + //#pragma unroll + for(int j = 0; j < num_edges; j += BDIM_X) { + if (j+tidx < num_edges) { + const int u_ind = sphere_edges[j+tidx].x; + const int v_ind = sphere_edges[j+tidx].y; + + //if (u_ind >= NUM_EDGES || v_ind >= NUM_EDGES) { ERROR; } + + const REAL_T u_val = odf[u_ind]; + const REAL_T v_val = odf[v_ind]; + + //if (u_val != u_val || v_val != v_val) { ERROR_NANs; } + + // only check that they are not equal + //if (u_val != v_val) { + // __shInd[tidy][u_val < v_val ? u_ind : v_ind] = -1; // benign race conditions... + //} + if (u_val < v_val) { + atomicExch(__shInd+u_ind, -1); + atomicOr( __shInd+v_ind, 1); + } else if (v_val < u_val) { + atomicExch(__shInd+v_ind, -1); + atomicOr( __shInd+u_ind, 1); + } + } + } + __syncwarp(WMASK); + + const REAL_T compThres = relative_peak_thres*max_mask_transl_d(samplm_nr, __shInd, odf, -odf_min, REAL_MIN); +#if 1 +/* + if (!tidy && !tidx) { + for(int j = 0; j < SAMPLM_NR; j++) { + printf("local_max[%d]: %d (%f)\n", j, __shInd[tidy][j], odf[j]); + } + printf("maxMax with offset %f: %f\n", -odf_min, compThres); + } + __syncwarp(WMASK); +*/ + // compact indices of positive values to the right + int n = 0; + + for(int j = 0; j < samplm_nr; j += BDIM_X) { + + const int __v = (j+tidx < samplm_nr) ? __shInd[j+tidx] : -1; + const int __keep = (__v > 0) && ((odf[j+tidx]-odf_min) >= compThres); + const int __msk = __ballot_sync(WMASK, __keep); + +//__syncwarp(WMASK); // unnecessary + if (__keep) { + const int myoff = __popc(__msk & lmask); + __shInd[n + myoff] = j+tidx; + } + n += __popc(__msk); +//__syncwarp(WMASK); // should be unnecessary + } + __syncwarp(WMASK); +/* + if (!tidy && !tidx) { + for(int j = 0; j < n; j++) { + printf("local_max_compact[%d]: %d\n", j, __shInd[tidy][j]); + } + } + __syncwarp(WMASK); +*/ + + // sort local maxima indices + if (n < BDIM_X) { + REAL_T k = REAL_MIN; + int v = 0; + if (tidx < n) { + v = __shInd[tidx]; + k = odf[v]; + } + warp_sort<32, BDIM_X, WSORT_DIR_DEC>(&k, &v); + __syncwarp(WMASK); + + if (tidx < n) { + __shInd[tidx] = v; + } + } else { + // ERROR !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + } + __syncwarp(WMASK); + + // __shInd[tidy][] contains the indices in odf correspoding to + // normalized maxima NOT sorted! 
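// [editor's note] The single-thread loop below is a greedy angular
// de-duplication (the CUDA counterpart of remove_similar_vertices()): the
// strongest peak is always kept, and each further candidate, visited in
// decreasing ODF order, survives only if its absolute dot product with
// every direction accepted so far is <= cos(min_separation_angle). The
// FABS() folds antipodal sphere vertices onto the same peak. Roughly:
//
//   keep dirs[0];
//   for each remaining candidate c:
//       if max_j |dot(c, dirs[j])| <= cos(min_separation_angle):
//           append c to dirs;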
+ if (n != 0) { + // remove_similar_vertices() + // PRELIMINARY INEFFICIENT, SINGLE TH, IMPLEMENTATION + if (tidx == 0) { + const REAL_T cos_similarity = COS(min_separation_angle); + + dirs[0] = sphere_vertices[__shInd[0]]; + + int k = 1; + for(int i = 1; i < n; i++) { + + const REAL3_T abc = sphere_vertices[__shInd[i]]; + + int j = 0; + for(; j < k; j++) { + const REAL_T cos = FABS(abc.x*dirs[j].x+ + abc.y*dirs[j].y+ + abc.z*dirs[j].z); + if (cos > cos_similarity) { + break; + } + } + if (j == k) { + dirs[k++] = abc; + } + } + n = k; + } + n = __shfl_sync(WMASK, n, 0, BDIM_X); + __syncwarp(WMASK); + + } +/* + if (!tidy && !tidx) { + for(int j = 0; j < n; j++) { + printf("local_max_compact_uniq[%d]: %d\n", j, __shInd[tidy][j]); + } + } + __syncwarp(WMASK); +*/ +#else + const int indMax = max_d(__shInd[tidy], -1); + if (indMax != -1) { + __ret = MAKE_REAL3(sphere_vertices[indMax][0], + sphere_vertices[indMax][1], + sphere_vertices[indMax][2]); + } +#endif + return n; +} diff --git a/cuslines/cuda_c/utils.cu b/cuslines/cuda_c/utils.cu index 93b1190..8c5afe1 100644 --- a/cuslines/cuda_c/utils.cu +++ b/cuslines/cuda_c/utils.cu @@ -22,6 +22,62 @@ __device__ VAL_T max_d(const int n, const VAL_T *__restrict__ src, const VAL_T m return __m; } +template +__device__ VAL_T max_mask_transl_d(const int n, + const LEN_T *__restrict__ srcMsk, + const VAL_T *__restrict__ srcVal, + const VAL_T offset, + const VAL_T minVal) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + VAL_T __m = minVal; + + for(int i = tidx; i < n; i += BDIM_X) { + const LEN_T sel = srcMsk[i]; + if (sel > 0) { + __m = MAX(__m, srcVal[i]+offset); + } + } + + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); + __m = MAX(__m, __tmp); + } + + return __m; +} + +template +__device__ VAL_T min_d(const int n, const VAL_T *__restrict__ src, const VAL_T maxVal) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + VAL_T __m = maxVal; + + for(int i = tidx; i < n; i += BDIM_X) { + __m = MIN(__m, src[i]); + } + + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); + __m = MIN(__m, __tmp); + } + + return __m; +} + template __device__ void prefix_sum_sh_d(REAL_T *num_sh, int __len) { const int tidx = threadIdx.x; @@ -80,85 +136,3 @@ __device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) { printArrayAlways(name, ncol, n, arr); } } - -template -__device__ REAL_T interpolation_helper_d(const REAL_T*__restrict__ dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) { - REAL_T __tmp = 0; - #pragma unroll - for (int i = 0; i < 2; i++) { - #pragma unroll - for (int j = 0; j < 2; j++) { - #pragma unroll - for (int k = 0; k < 2; k++) { - __tmp += wgh[0][i] * wgh[1][j] * wgh[2][k] * - dataf[coo[0][i] * dimy * dimz * dimt + - coo[1][j] * dimz * dimt + - coo[2][k] * dimt + - t]; - } - } - } - return __tmp; -} - -template -__device__ int trilinear_interp_d(const int dimx, - const int dimy, - const int dimz, - const int dimt, - int dimt_idx, // If -1, get all - const REAL_T *__restrict__ dataf, - const REAL3_T point, - REAL_T *__restrict__ __vox_data) { - const REAL_T HALF = static_cast(0.5); - - // all thr compute the same here - if 
(point.x < -HALF || point.x+HALF >= dimx || - point.y < -HALF || point.y+HALF >= dimy || - point.z < -HALF || point.z+HALF >= dimz) { - return -1; - } - - long long coo[3][2]; - REAL_T wgh[3][2]; // could use just one... - - const REAL_T ONE = static_cast(1.0); - - const REAL3_T fl = MAKE_REAL3(FLOOR(point.x), - FLOOR(point.y), - FLOOR(point.z)); - - wgh[0][1] = point.x - fl.x; - wgh[0][0] = ONE-wgh[0][1]; - coo[0][0] = MAX(0, fl.x); - coo[0][1] = MIN(dimx-1, coo[0][0]+1); - - wgh[1][1] = point.y - fl.y; - wgh[1][0] = ONE-wgh[1][1]; - coo[1][0] = MAX(0, fl.y); - coo[1][1] = MIN(dimy-1, coo[1][0]+1); - - wgh[2][1] = point.z - fl.z; - wgh[2][0] = ONE-wgh[2][1]; - coo[2][0] = MAX(0, fl.z); - coo[2][1] = MIN(dimz-1, coo[2][0]+1); - - if (dimt_idx == -1) { - for (int t = threadIdx.x; t < dimt; t += BDIM_X) { - __vox_data[t] = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, t); - } - } else { - *__vox_data = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, dimt_idx); - } - - // if (threadIdx.x == 0) { - // printf("point: %f, %f, %f\n", point.x, point.y, point.z); - // printf("dimt_idx: %d\n", dimt_idx); - // // for(int i = 0; i < dimt; i++) { - // // printf("__vox_data[%d]: %f\n", i, __vox_data[i]); - // // } - // } - return 0; -} diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index d1b9e28..3383d1a 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -1,6 +1,5 @@ import numpy as np from abc import ABC, abstractmethod -import ctypes import logging from importlib.resources import files from time import time @@ -10,7 +9,7 @@ from cuda.core import Device, LaunchConfig, Program, launch, ProgramOptions from cuda.pathfinder import find_nvidia_header_directory from cuda.cccl import get_include_paths -from cuda.bindings import runtime +from cuda.bindings import runtime, driver from cuda.bindings.runtime import cudaMemcpyKind from cuslines.cuda_python.cutils import ( @@ -22,7 +21,6 @@ ModelType, THR_X_SL, BLOCK_Y, - REAL_DTYPE_AS_CTYPE, ) logger = logging.getLogger("GPUStreamlines") @@ -47,7 +45,7 @@ def compile_program(self, debug: bool = False): start_time = time() logger.info("Compiling GPUStreamlines") - cuslines_cuda = files("cuslines") + cuslines_cuda = files("cuslines").joinpath("cuda_c") if debug: program_opts = { @@ -78,7 +76,7 @@ def compile_program(self, debug: bool = False): # I think this is reasonable dev = Device() dev.set_current() - cuda_path = cuslines_cuda.joinpath("cuda_c/generate_streamlines_cuda.cu") + cuda_path = cuslines_cuda.joinpath("generate_streamlines_cuda.cu") with open(cuda_path, "r") as f: prog = Program(f.read(), code_type="c++", options=program_options) self.module = prog.compile( @@ -90,18 +88,6 @@ def compile_program(self, debug: bool = False): logger.info("GPUStreamlines compiled successfully in %.2f seconds", time() - start_time) -class _BootCtx(ctypes.Structure): - _fields_ = [ - ("min_signal", REAL_DTYPE_AS_CTYPE), - ("delta_nr", ctypes.c_int32), - ("H", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - ("R", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - ("delta_b", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - ("delta_q", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - ("sampling_matrix", ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - ("b0s_mask", ctypes.POINTER(ctypes.c_int32))] - - class BootDirectionGetter(GPUDirectionGetter): def __init__( self, @@ -120,6 +106,8 @@ def __init__( else: raise ValueError(f"Invalid model_type {model_type}, must be one of 'OPDT', 
'CSA'") + checkCudaErrors(driver.cuInit(0)) + self.H = np.ascontiguousarray(H, dtype=REAL_DTYPE) self.R = np.ascontiguousarray(R, dtype=REAL_DTYPE) self.delta_b = np.ascontiguousarray(delta_b, dtype=REAL_DTYPE) @@ -128,7 +116,6 @@ def __init__( self.min_signal = REAL_DTYPE(min_signal) self.sampling_matrix = np.ascontiguousarray(sampling_matrix, dtype=REAL_DTYPE) self.b0s_mask = np.ascontiguousarray(b0s_mask, dtype=np.int32) - self.ctx_h = [] self.H_d = [] self.R_d = [] @@ -136,10 +123,9 @@ def __init__( self.delta_q_d = [] self.b0s_mask_d = [] self.sampling_matrix_d = [] - self.ctx_d = [] self.getnum_kernel_name = f"getNumStreamlinesBoot_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" - self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},{model_type.upper()},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMergeBoot_k<{THR_X_SL},{BLOCK_Y},{model_type.upper()},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" self.compile_program() @classmethod @@ -232,19 +218,6 @@ def allocate_on_gpu(self, n): self.sampling_matrix_d.append( checkCudaErrors(runtime.cudaMalloc( REAL_SIZE*self.sampling_matrix.size))) - self.ctx_d.append( - checkCudaErrors(runtime.cudaMalloc( - ctypes.sizeof(_BootCtx)))) - self.ctx_h.append(_BootCtx( - min_signal=self.min_signal, - delta_nr=self.delta_nr, - H=ctypes.cast(self.H_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - R=ctypes.cast(self.R_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - delta_b=ctypes.cast(self.delta_b_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - delta_q=ctypes.cast(self.delta_q_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - sampling_matrix=ctypes.cast(self.sampling_matrix_d[n], ctypes.POINTER(REAL_DTYPE_AS_CTYPE)), - b0s_mask=ctypes.cast(self.b0s_mask_d[n], ctypes.POINTER(ctypes.c_int32)) - )) checkCudaErrors(runtime.cudaMemcpy( self.H_d[n], @@ -276,11 +249,6 @@ def allocate_on_gpu(self, n): self.sampling_matrix.ctypes.data, REAL_SIZE*self.sampling_matrix.size, cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.ctx_d[n], - ctypes.addressof(self.ctx_h[n]), - ctypes.sizeof(_BootCtx), - cudaMemcpyKind.cudaMemcpyHostToDevice)) def deallocate_on_gpu(self, n): if self.H_d[n]: @@ -295,8 +263,6 @@ def deallocate_on_gpu(self, n): checkCudaErrors(runtime.cudaFree(self.b0s_mask_d[n])) if self.sampling_matrix_d[n]: checkCudaErrors(runtime.cudaFree(self.sampling_matrix_d[n])) - if self.ctx_d[n]: - checkCudaErrors(runtime.cudaFree(self.ctx_d[n])) def _shared_mem_bytes(self, sp): return REAL_SIZE*BLOCK_Y*2*( @@ -312,8 +278,9 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.streams[n], config, ker, self.model_type, sp.gpu_tracker.max_angle, - sp.gpu_tracker.min_separation_angle, + self.min_signal, sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.min_separation_angle, sp.gpu_tracker.rng_seed, nseeds_gpu, sp.seeds_d[n], @@ -358,11 +325,18 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.dimt, sp.gpu_tracker.dataf_d[n], sp.gpu_tracker.metric_map_d[n], - self.ctx_d[n], sp.gpu_tracker.samplm_nr, sp.gpu_tracker.sphere_vertices_d[n], sp.gpu_tracker.sphere_edges_d[n], sp.gpu_tracker.nedges, + self.min_signal, + self.delta_nr, + self.H_d[n], + self.R_d[n], + self.delta_b_d[n], + self.delta_q_d[n], + self.sampling_matrix_d[n], + self.b0s_mask_d[n], sp.slinesOffs_d[n], sp.shDirTemp0_d[n], sp.slineSeed_d[n], @@ -373,8 +347,9 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): class 
ProbDirectionGetter(GPUDirectionGetter): def __init__(self): + checkCudaErrors(driver.cuInit(0)) self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" - self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},PROB,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PROB,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" self.compile_program() def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): @@ -427,7 +402,6 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.dimt, sp.gpu_tracker.dataf_d[n], sp.gpu_tracker.metric_map_d[n], - int(0), sp.gpu_tracker.samplm_nr, sp.gpu_tracker.sphere_vertices_d[n], sp.gpu_tracker.sphere_edges_d[n], @@ -440,11 +414,11 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): ) - class PttDirectionGetter(ProbDirectionGetter): def __init__(self): + checkCudaErrors(driver.cuInit(0)) self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" - self.genstreamlines_kernel_name = f"genStreamlinesMerge_k<{THR_X_SL},{BLOCK_Y},PTT,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PTT,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" self.compile_program() def _shared_mem_bytes(self, sp): diff --git a/cuslines/cuda_python/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py index 1d34adc..d9f94f9 100644 --- a/cuslines/cuda_python/cu_tractography.py +++ b/cuslines/cuda_python/cu_tractography.py @@ -1,4 +1,4 @@ -from cuda.bindings import driver, runtime +from cuda.bindings import runtime from cuda.bindings.runtime import cudaMemcpyKind # TODO: consider cuda core over cuda bindings @@ -116,7 +116,6 @@ def __init__( self.rng_offset = int(rng_offset) self.chunk_size = int(chunk_size) - checkCudaErrors(driver.cuInit(0)) avail = checkCudaErrors(runtime.cudaGetDeviceCount()) if self.ngpus > avail: raise RuntimeError(f"Requested {self.ngpus} GPUs but only {avail} available") diff --git a/cuslines/cuda_python/cutils.py b/cuslines/cuda_python/cutils.py index 2fd688e..4d0e313 100644 --- a/cuslines/cuda_python/cutils.py +++ b/cuslines/cuda_python/cutils.py @@ -1,7 +1,6 @@ from cuda.bindings import driver, nvrtc import numpy as np -import ctypes from enum import IntEnum @@ -22,7 +21,6 @@ class ModelType(IntEnum): ('z', np.float32)], align=True) REAL_DTYPE_AS_STR = "float" REAL3_DTYPE_AS_STR = "float3" - REAL_DTYPE_AS_CTYPE = ctypes.c_float elif REAL_SIZE == 8: REAL_DTYPE = np.float64 REAL3_DTYPE = np.dtype([('x', np.float64), @@ -30,7 +28,6 @@ class ModelType(IntEnum): ('z', np.float64)], align=True) REAL_DTYPE_AS_STR = "double" REAL3_DTYPE_AS_STR = "double3" - REAL_DTYPE_AS_CTYPE = ctypes.c_double else: raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") BLOCK_Y = THR_X_BL//THR_X_SL diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index 06c61c6..57053fe 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -72,7 +72,7 @@ #Get Gradient values def get_gtab(fbval, fbvec): bvals, bvecs = read_bvals_bvecs(fbval, fbvec) - gtab = gradient_table(bvals, bvecs) + gtab = gradient_table(bvals=bvals, bvecs=bvecs) return gtab def get_img(ep2_seq): @@ -115,7 +115,8 @@ def get_img(ep2_seq): if not all(arg == 'hardi' for arg in [args.nifti_file, args.bvals, args.bvecs, args.mask_nifti, args.roi_nifti]): raise 
ValueError("If any of the arguments is 'hardi', all must be 'hardi'") # Get Stanford HARDI data - hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi') + hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames( + name='stanford_hardi') csf, gm, wm = read_stanford_pve_maps() wm_data = wm.get_fdata() @@ -139,7 +140,7 @@ def get_img(ep2_seq): tenmodel = dti.TensorModel(gtab, fit_method='WLS') print('Fitting Tensor') -tenfit = tenmodel.fit(data, mask) +tenfit = tenmodel.fit(data, mask=mask) print('Computing anisotropy measures (FA,MD,RGB)') FA = tenfit.fa @@ -220,6 +221,7 @@ def get_img(ep2_seq): ts = time.time() streamline_generator = LocalTracking(dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size) sft = StatefulTractogram(streamline_generator, img, Space.VOX) + n_sls = len(sft.streamlines) te = time.time() else: with GPUTracker( @@ -240,10 +242,12 @@ def get_img(ep2_seq): ts = time.time() if args.output_prefix and write_method == "trx": trx_file = gpu_tracker.generate_trx(seed_mask, img) + n_sls = len(trx_file.streamlines) else: sft = gpu_tracker.generate_sft(seed_mask, img) + n_sls = len(sft.streamlines) te = time.time() -print("Generated {} streamlines from {} seeds, time: {} s".format(len(sft.streamlines), +print("Generated {} streamlines from {} seeds, time: {} s".format(n_sls, seed_mask.shape[0], te-ts)) From d008d039e061a8c67b24750c301fcd9d6cfcc068 Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 7 Jan 2026 12:54:34 -0800 Subject: [PATCH 27/31] ruff --- cuslines/cuda_python/__init__.py | 4 +- cuslines/cuda_python/cu_direction_getters.py | 224 +++++++++++-------- cuslines/cuda_python/cu_propagate_seeds.py | 177 ++++++++------- cuslines/cuda_python/cu_tractography.py | 98 ++++---- cuslines/cuda_python/cutils.py | 26 ++- 5 files changed, 315 insertions(+), 214 deletions(-) diff --git a/cuslines/cuda_python/__init__.py b/cuslines/cuda_python/__init__.py index d0b42d4..fd05c1e 100644 --- a/cuslines/cuda_python/__init__.py +++ b/cuslines/cuda_python/__init__.py @@ -2,12 +2,12 @@ from .cu_direction_getters import ( ProbDirectionGetter, PttDirectionGetter, - BootDirectionGetter + BootDirectionGetter, ) __all__ = [ "GPUTracker", "ProbDirectionGetter", "PttDirectionGetter", - "BootDirectionGetter" + "BootDirectionGetter", ] diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index 3383d1a..b659445 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -55,9 +55,7 @@ def compile_program(self, debug: bool = False): "lineinfo": True, } else: - program_opts = { - "ptxas_options": ["-O3"] - } + program_opts = {"ptxas_options": ["-O3"]} program_options = ProgramOptions( name="cuslines", @@ -68,8 +66,10 @@ def compile_program(self, debug: bool = False): str(cuslines_cuda), find_nvidia_header_directory("cudart"), find_nvidia_header_directory("curand"), - get_include_paths().libcudacxx], - **program_opts) + get_include_paths().libcudacxx, + ], + **program_opts, + ) # Here we assume all devices are the same, # so we compile once for any current device. 
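# [editor's note] The single compilation assumes a homogeneous multi-GPU
# node: the module is built once for the architecture of whichever device is
# current and then launched on every GPU. A machine mixing compute
# capabilities would need one compile per distinct architecture.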
@@ -84,27 +84,33 @@ def compile_program(self, debug: bool = False): name_expressions=( self.getnum_kernel_name, self.genstreamlines_kernel_name, - )) - logger.info("GPUStreamlines compiled successfully in %.2f seconds", time() - start_time) + ), + ) + logger.info( + "GPUStreamlines compiled successfully in %.2f seconds", time() - start_time + ) class BootDirectionGetter(GPUDirectionGetter): def __init__( - self, - model_type: str, - min_signal: float, - H: np.ndarray, - R: np.ndarray, - delta_b: np.ndarray, - delta_q: np.ndarray, - sampling_matrix: np.ndarray, - b0s_mask: np.ndarray): + self, + model_type: str, + min_signal: float, + H: np.ndarray, + R: np.ndarray, + delta_b: np.ndarray, + delta_q: np.ndarray, + sampling_matrix: np.ndarray, + b0s_mask: np.ndarray, + ): if model_type.upper() == "OPDT": self.model_type = int(ModelType.OPDT) elif model_type.upper() == "CSA": self.model_type = int(ModelType.CSA) else: - raise ValueError(f"Invalid model_type {model_type}, must be one of 'OPDT', 'CSA'") + raise ValueError( + f"Invalid model_type {model_type}, must be one of 'OPDT', 'CSA'" + ) checkCudaErrors(driver.cuInit(0)) @@ -129,11 +135,15 @@ def __init__( self.compile_program() @classmethod - def from_dipy_opdt(cls, gtab, sphere, - sh_order_max=6, - full_basis=False, - sh_lambda=0.006, - min_signal=1): + def from_dipy_opdt( + cls, + gtab, + sphere, + sh_order_max=6, + full_basis=False, + sh_lambda=0.006, + min_signal=1, + ): sampling_matrix, _, _ = shm.real_sh_descoteaux( sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False ) @@ -160,15 +170,19 @@ def from_dipy_opdt(cls, gtab, sphere, delta_b=delta_b, delta_q=delta_q, sampling_matrix=sampling_matrix, - b0s_mask=gtab.b0s_mask + b0s_mask=gtab.b0s_mask, ) @classmethod - def from_dipy_csa(cls, gtab, sphere, - sh_order_max=6, - full_basis=False, - sh_lambda=0.006, - min_signal=1): + def from_dipy_csa( + cls, + gtab, + sphere, + sh_order_max=6, + full_basis=False, + sh_lambda=0.006, + min_signal=1, + ): sampling_matrix, _, _ = shm.real_sh_descoteaux( sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False ) @@ -196,59 +210,73 @@ def from_dipy_csa(cls, gtab, sphere, delta_b=delta_b, delta_q=delta_q, sampling_matrix=sampling_matrix, - b0s_mask=gtab.b0s_mask + b0s_mask=gtab.b0s_mask, ) def allocate_on_gpu(self, n): - self.H_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.H.size))) - self.R_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.R.size))) + self.H_d.append(checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.H.size))) + self.R_d.append(checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.R.size))) self.delta_b_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.delta_b.size))) + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.delta_b.size)) + ) self.delta_q_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.delta_q.size))) + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.delta_q.size)) + ) self.b0s_mask_d.append( - checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes*self.b0s_mask.size))) + checkCudaErrors(runtime.cudaMalloc(np.int32().nbytes * self.b0s_mask.size)) + ) self.sampling_matrix_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.sampling_matrix.size))) + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.sampling_matrix.size)) + ) - checkCudaErrors(runtime.cudaMemcpy( - self.H_d[n], - self.H.ctypes.data, - REAL_SIZE*self.H.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - 
checkCudaErrors(runtime.cudaMemcpy( - self.R_d[n], - self.R.ctypes.data, - REAL_SIZE*self.R.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.delta_b_d[n], - self.delta_b.ctypes.data, - REAL_SIZE*self.delta_b.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.delta_q_d[n], - self.delta_q.ctypes.data, - REAL_SIZE*self.delta_q.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.b0s_mask_d[n], - self.b0s_mask.ctypes.data, - np.int32().nbytes*self.b0s_mask.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.sampling_matrix_d[n], - self.sampling_matrix.ctypes.data, - REAL_SIZE*self.sampling_matrix.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors( + runtime.cudaMemcpy( + self.H_d[n], + self.H.ctypes.data, + REAL_SIZE * self.H.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.R_d[n], + self.R.ctypes.data, + REAL_SIZE * self.R.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.delta_b_d[n], + self.delta_b.ctypes.data, + REAL_SIZE * self.delta_b.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.delta_q_d[n], + self.delta_q.ctypes.data, + REAL_SIZE * self.delta_q.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.b0s_mask_d[n], + self.b0s_mask.ctypes.data, + np.int32().nbytes * self.b0s_mask.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.sampling_matrix_d[n], + self.sampling_matrix.ctypes.data, + REAL_SIZE * self.sampling_matrix.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) def deallocate_on_gpu(self, n): if self.H_d[n]: @@ -265,9 +293,16 @@ def deallocate_on_gpu(self, n): checkCudaErrors(runtime.cudaFree(self.sampling_matrix_d[n])) def _shared_mem_bytes(self, sp): - return REAL_SIZE*BLOCK_Y*2*( - sp.gpu_tracker.n32dimt + max(sp.gpu_tracker.n32dimt, sp.gpu_tracker.samplm_nr)) + \ - np.int32().nbytes*BLOCK_Y*sp.gpu_tracker.samplm_nr + return ( + REAL_SIZE + * BLOCK_Y + * 2 + * ( + sp.gpu_tracker.n32dimt + + max(sp.gpu_tracker.n32dimt, sp.gpu_tracker.samplm_nr) + ) + + np.int32().nbytes * BLOCK_Y * sp.gpu_tracker.samplm_nr + ) def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): ker = self.module.get_kernel(self.getnum_kernel_name) @@ -275,7 +310,9 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) launch( - sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.streams[n], + config, + ker, self.model_type, sp.gpu_tracker.max_angle, self.min_signal, @@ -301,7 +338,8 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.sphere_edges_d[n], sp.gpu_tracker.nedges, sp.shDirTemp0_d[n], - sp.slinesOffs_d[n]) + sp.slinesOffs_d[n], + ) def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): ker = self.module.get_kernel(self.genstreamlines_kernel_name) @@ -309,14 +347,16 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) launch( - sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.streams[n], + config, + ker, sp.gpu_tracker.max_angle, sp.gpu_tracker.tc_threshold, sp.gpu_tracker.step_size, sp.gpu_tracker.relative_peak_thresh, 
sp.gpu_tracker.min_separation_angle, sp.gpu_tracker.rng_seed, - sp.gpu_tracker.rng_offset + n*nseeds_gpu, + sp.gpu_tracker.rng_offset + n * nseeds_gpu, nseeds_gpu, sp.seeds_d[n], sp.gpu_tracker.dimx, @@ -341,7 +381,7 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.shDirTemp0_d[n], sp.slineSeed_d[n], sp.slineLen_d[n], - sp.sline_d[n] + sp.sline_d[n], ) @@ -354,12 +394,16 @@ def __init__(self): def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): ker = self.module.get_kernel(self.getnum_kernel_name) - shared_memory = REAL_SIZE*BLOCK_Y*sp.gpu_tracker.n32dimt + \ - np.int32().nbytes*BLOCK_Y*sp.gpu_tracker.n32dimt + shared_memory = ( + REAL_SIZE * BLOCK_Y * sp.gpu_tracker.n32dimt + + np.int32().nbytes * BLOCK_Y * sp.gpu_tracker.n32dimt + ) config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) launch( - sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.streams[n], + config, + ker, sp.gpu_tracker.max_angle, sp.gpu_tracker.relative_peak_thresh, sp.gpu_tracker.min_separation_angle, @@ -375,7 +419,8 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.gpu_tracker.sphere_edges_d[n], sp.gpu_tracker.nedges, sp.shDirTemp0_d[n], - sp.slinesOffs_d[n]) + sp.slinesOffs_d[n], + ) def _shared_mem_bytes(self, sp): return REAL_SIZE * BLOCK_Y * sp.gpu_tracker.n32dimt @@ -386,14 +431,16 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) launch( - sp.gpu_tracker.streams[n], config, ker, + sp.gpu_tracker.streams[n], + config, + ker, sp.gpu_tracker.max_angle, sp.gpu_tracker.tc_threshold, sp.gpu_tracker.step_size, sp.gpu_tracker.relative_peak_thresh, sp.gpu_tracker.min_separation_angle, sp.gpu_tracker.rng_seed, - sp.gpu_tracker.rng_offset + n*nseeds_gpu, + sp.gpu_tracker.rng_offset + n * nseeds_gpu, nseeds_gpu, sp.seeds_d[n], sp.gpu_tracker.dimx, @@ -410,7 +457,7 @@ def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): sp.shDirTemp0_d[n], sp.slineSeed_d[n], sp.slineLen_d[n], - sp.sline_d[n] + sp.sline_d[n], ) @@ -423,4 +470,3 @@ def __init__(self): def _shared_mem_bytes(self, sp): return 0 - diff --git a/cuslines/cuda_python/cu_propagate_seeds.py b/cuslines/cuda_python/cu_propagate_seeds.py index 72037c6..b8991e5 100644 --- a/cuslines/cuda_python/cu_propagate_seeds.py +++ b/cuslines/cuda_python/cu_propagate_seeds.py @@ -16,16 +16,15 @@ THR_X_BL, DEV_PTR, div_up, - checkCudaErrors) + checkCudaErrors, +) logger = logging.getLogger("GPUStreamlines") class SeedBatchPropagator: - def __init__( - self, - gpu_tracker): + def __init__(self, gpu_tracker): self.gpu_tracker = gpu_tracker self.ngpus = gpu_tracker.ngpus @@ -44,53 +43,71 @@ def __init__( def _switch_device(self, n): checkCudaErrors(runtime.cudaSetDevice(n)) - nseeds_gpu = min( - self.nseeds_per_gpu, max(0, self.nseeds - n * self.nseeds_per_gpu)) - block = (THR_X_SL, THR_X_BL//THR_X_SL, 1) - grid = (div_up(nseeds_gpu, THR_X_BL//THR_X_SL), 1, 1) + nseeds_gpu = min( + self.nseeds_per_gpu, max(0, self.nseeds - n * self.nseeds_per_gpu) + ) + block = (THR_X_SL, THR_X_BL // THR_X_SL, 1) + grid = (div_up(nseeds_gpu, THR_X_BL // THR_X_SL), 1, 1) return nseeds_gpu, block, grid def _get_sl_buffer_size(self, n): - return REAL_SIZE*2*3*MAX_SLINE_LEN*self.nSlines[n].astype(np.int64) + return REAL_SIZE * 2 * 3 * MAX_SLINE_LEN * self.nSlines[n].astype(np.int64) def _allocate_seed_memory(self, seeds): # Move seeds to GPU for ii in range(self.ngpus): nseeds_gpu, _, _ = self._switch_device(ii) - self.seeds_d[ii] = 
checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*3*nseeds_gpu)) - seeds_host = np.ascontiguousarray(seeds[ - ii*self.nseeds_per_gpu:ii*self.nseeds_per_gpu+nseeds_gpu], - dtype=REAL_DTYPE) - checkCudaErrors(runtime.cudaMemcpy( - self.seeds_d[ii], - seeds_host.ctypes.data, - REAL_SIZE*3*nseeds_gpu, - cudaMemcpyKind.cudaMemcpyHostToDevice)) + self.seeds_d[ii] = checkCudaErrors( + runtime.cudaMalloc(REAL_SIZE * 3 * nseeds_gpu) + ) + seeds_host = np.ascontiguousarray( + seeds[ii * self.nseeds_per_gpu : ii * self.nseeds_per_gpu + nseeds_gpu], + dtype=REAL_DTYPE, + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.seeds_d[ii], + seeds_host.ctypes.data, + REAL_SIZE * 3 * nseeds_gpu, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) for ii in range(self.ngpus): nseeds_gpu, block, grid = self._switch_device(ii) # Streamline offsets - self.slinesOffs_d[ii] = checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes * (nseeds_gpu + 1))) + self.slinesOffs_d[ii] = checkCudaErrors( + runtime.cudaMalloc(np.int32().nbytes * (nseeds_gpu + 1)) + ) # Initial directions from each seed - self.shDirTemp0_d[ii] = checkCudaErrors(runtime.cudaMalloc( - REAL3_DTYPE.itemsize * self.gpu_tracker.samplm_nr * grid[0] * block[1])) - - def _cumsum_offsets(self): # TODO: performance: do this on device? not crucial for performance now + self.shDirTemp0_d[ii] = checkCudaErrors( + runtime.cudaMalloc( + REAL3_DTYPE.itemsize + * self.gpu_tracker.samplm_nr + * grid[0] + * block[1] + ) + ) + + def _cumsum_offsets( + self, + ): # TODO: performance: do this on device? not crucial for performance now for ii in range(self.ngpus): nseeds_gpu, _, _ = self._switch_device(ii) - if (nseeds_gpu == 0): + if nseeds_gpu == 0: self.nSlines[ii] = 0 continue slinesOffs_h = np.empty(nseeds_gpu + 1, dtype=np.int32) - checkCudaErrors(runtime.cudaMemcpy( - slinesOffs_h.ctypes.data, - self.slinesOffs_d[ii], - slinesOffs_h.nbytes, - cudaMemcpyKind.cudaMemcpyDeviceToHost)) + checkCudaErrors( + runtime.cudaMemcpy( + slinesOffs_h.ctypes.data, + self.slinesOffs_d[ii], + slinesOffs_h.nbytes, + cudaMemcpyKind.cudaMemcpyDeviceToHost, + ) + ) __pval = slinesOffs_h[0] slinesOffs_h[0] = 0 @@ -100,24 +117,29 @@ def _cumsum_offsets(self): # TODO: performance: do this on device? 
not crucial __pval = __cval self.nSlines[ii] = int(slinesOffs_h[nseeds_gpu]) - checkCudaErrors(runtime.cudaMemcpy( - self.slinesOffs_d[ii], - slinesOffs_h.ctypes.data, - slinesOffs_h.nbytes, - cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors( + runtime.cudaMemcpy( + self.slinesOffs_d[ii], + slinesOffs_h.ctypes.data, + slinesOffs_h.nbytes, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) def _allocate_tracking_memory(self): for ii in range(self.ngpus): self._switch_device(ii) - self.slineSeed_d[ii] = checkCudaErrors(runtime.cudaMalloc( - self.nSlines[ii] * np.int32().nbytes)) - checkCudaErrors(runtime.cudaMemset( - self.slineSeed_d[ii], - -1, - self.nSlines[ii] * np.int32().nbytes)) + self.slineSeed_d[ii] = checkCudaErrors( + runtime.cudaMalloc(self.nSlines[ii] * np.int32().nbytes) + ) + checkCudaErrors( + runtime.cudaMemset( + self.slineSeed_d[ii], -1, self.nSlines[ii] * np.int32().nbytes + ) + ) - if self.nSlines[ii] > EXCESS_ALLOC_FACT*self.nSlines_old[ii]: + if self.nSlines[ii] > EXCESS_ALLOC_FACT * self.nSlines_old[ii]: self.slines[ii] = 0 self.sline_lens[ii] = 0 gc.collect() @@ -127,42 +149,48 @@ def _allocate_tracking_memory(self): if not self.slines[ii]: self.slines[ii] = np.empty( - (EXCESS_ALLOC_FACT*self.nSlines[ii], MAX_SLINE_LEN*2, 3), - dtype=REAL_DTYPE) + (EXCESS_ALLOC_FACT * self.nSlines[ii], MAX_SLINE_LEN * 2, 3), + dtype=REAL_DTYPE, + ) if not self.sline_lens[ii]: self.sline_lens[ii] = np.empty( - EXCESS_ALLOC_FACT*self.nSlines[ii], - dtype=np.int32) + EXCESS_ALLOC_FACT * self.nSlines[ii], dtype=np.int32 + ) for ii in range(self.ngpus): self._switch_device(ii) buffer_size = self._get_sl_buffer_size(ii) - self.slineLen_d[ii] = checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes * self.nSlines[ii])) - self.sline_d[ii] = checkCudaErrors(runtime.cudaMalloc( - buffer_size)) + self.slineLen_d[ii] = checkCudaErrors( + runtime.cudaMalloc(np.int32().nbytes * self.nSlines[ii]) + ) + self.sline_d[ii] = checkCudaErrors(runtime.cudaMalloc(buffer_size)) def _cleanup(self): for ii in range(self.ngpus): self._switch_device(ii) - checkCudaErrors(runtime.cudaMemcpyAsync( - self.slines[ii], - self.sline_d[ii], - self._get_sl_buffer_size(ii), - cudaMemcpyKind.cudaMemcpyDeviceToHost, - self.gpu_tracker.streams[ii])) - checkCudaErrors(runtime.cudaMemcpyAsync( - self.sline_lens[ii], - self.slineLen_d[ii], - np.int32().nbytes*self.nSlines[ii], - cudaMemcpyKind.cudaMemcpyDeviceToHost, - self.gpu_tracker.streams[ii])) + checkCudaErrors( + runtime.cudaMemcpyAsync( + self.slines[ii], + self.sline_d[ii], + self._get_sl_buffer_size(ii), + cudaMemcpyKind.cudaMemcpyDeviceToHost, + self.gpu_tracker.streams[ii], + ) + ) + checkCudaErrors( + runtime.cudaMemcpyAsync( + self.sline_lens[ii], + self.slineLen_d[ii], + np.int32().nbytes * self.nSlines[ii], + cudaMemcpyKind.cudaMemcpyDeviceToHost, + self.gpu_tracker.streams[ii], + ) + ) for ii in range(self.ngpus): self._switch_device(ii) - checkCudaErrors(runtime.cudaStreamSynchronize( - self.gpu_tracker.streams[ii])) + checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii])) checkCudaErrors(runtime.cudaFree(self.seeds_d[ii])) checkCudaErrors(runtime.cudaFree(self.slineSeed_d[ii])) checkCudaErrors(runtime.cudaFree(self.slinesOffs_d[ii])) @@ -179,30 +207,30 @@ def _cleanup(self): # May be better to do in cuda code directly def propagate(self, seeds): self.nseeds = len(seeds) - self.nseeds_per_gpu = (self.nseeds + self.gpu_tracker.ngpus - 1) // self.gpu_tracker.ngpus + self.nseeds_per_gpu = ( + self.nseeds + self.gpu_tracker.ngpus - 
1 + ) // self.gpu_tracker.ngpus self._allocate_seed_memory(seeds) for ii in range(self.ngpus): nseeds_gpu, block, grid = self._switch_device(ii) - if (nseeds_gpu == 0): + if nseeds_gpu == 0: continue self.gpu_tracker.dg.getNumStreamlines(ii, nseeds_gpu, block, grid, self) for ii in range(self.ngpus): - checkCudaErrors(runtime.cudaStreamSynchronize( - self.gpu_tracker.streams[ii])) + checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii])) self._cumsum_offsets() self._allocate_tracking_memory() for ii in range(self.ngpus): nseeds_gpu, block, grid = self._switch_device(ii) - if (nseeds_gpu == 0): + if nseeds_gpu == 0: continue self.gpu_tracker.dg.generateStreamlines(ii, nseeds_gpu, block, grid, self) for ii in range(self.ngpus): - checkCudaErrors(runtime.cudaStreamSynchronize( - self.gpu_tracker.streams[ii])) + checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii])) self._cleanup() @@ -223,9 +251,8 @@ def _yield_slines(): for jj in range(self.nSlines[ii]): npts = this_len[jj] - yield np.asarray( - this_sls[jj], - dtype=REAL_DTYPE)[:npts] + yield np.asarray(this_sls[jj], dtype=REAL_DTYPE)[:npts] + return _yield_slines() def as_array_sequence(self): diff --git a/cuslines/cuda_python/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py index d9f94f9..92f34c0 100644 --- a/cuslines/cuda_python/cu_tractography.py +++ b/cuslines/cuda_python/cu_tractography.py @@ -14,7 +14,7 @@ ) from cuslines.cuda_python.cu_direction_getters import ( GPUDirectionGetter, - BootDirectionGetter + BootDirectionGetter, ) from cuslines.cuda_python.cu_propagate_seeds import SeedBatchPropagator @@ -32,6 +32,7 @@ # SCIL streamline reduction onboard GPU # Remove small/long streamlines on gpu + class GPUTracker: def __init__( self, @@ -118,7 +119,9 @@ def __init__( avail = checkCudaErrors(runtime.cudaGetDeviceCount()) if self.ngpus > avail: - raise RuntimeError(f"Requested {self.ngpus} GPUs but only {avail} available") + raise RuntimeError( + f"Requested {self.ngpus} GPUs but only {avail} available" + ) logger.info("Creating GPUTracker with %d GPUs...", self.ngpus) @@ -130,8 +133,7 @@ def __init__( self.streams = [] self.managed_data = [] - self.seed_propagator = SeedBatchPropagator( - gpu_tracker=self) + self.seed_propagator = SeedBatchPropagator(gpu_tracker=self) self._allocated = False def __enter__(self): @@ -145,46 +147,64 @@ def _allocate(self): for ii in range(self.ngpus): checkCudaErrors(runtime.cudaSetDevice(ii)) self.streams.append( - checkCudaErrors(runtime.cudaStreamCreateWithFlags( - runtime.cudaStreamNonBlocking))) + checkCudaErrors( + runtime.cudaStreamCreateWithFlags(runtime.cudaStreamNonBlocking) + ) + ) for ii in range(self.ngpus): checkCudaErrors(runtime.cudaSetDevice(ii)) # TODO: performance: dataf could be managed or texture memory instead? 
self.dataf_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.dataf.size))) + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.dataf.size)) + ) self.metric_map_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.metric_map.size))) + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.metric_map.size)) + ) self.sphere_vertices_d.append( - checkCudaErrors(runtime.cudaMalloc( - REAL_SIZE*self.sphere_vertices.size))) + checkCudaErrors( + runtime.cudaMalloc(REAL_SIZE * self.sphere_vertices.size) + ) + ) self.sphere_edges_d.append( - checkCudaErrors(runtime.cudaMalloc( - np.int32().nbytes*self.sphere_edges.size))) - - checkCudaErrors(runtime.cudaMemcpy( - self.dataf_d[ii], - self.dataf.ctypes.data, - REAL_SIZE*self.dataf.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.metric_map_d[ii], - self.metric_map.ctypes.data, - REAL_SIZE*self.metric_map.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.sphere_vertices_d[ii], - self.sphere_vertices.ctypes.data, - REAL_SIZE*self.sphere_vertices.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) - checkCudaErrors(runtime.cudaMemcpy( - self.sphere_edges_d[ii], - self.sphere_edges.ctypes.data, - np.int32().nbytes*self.sphere_edges.size, - cudaMemcpyKind.cudaMemcpyHostToDevice)) + checkCudaErrors( + runtime.cudaMalloc(np.int32().nbytes * self.sphere_edges.size) + ) + ) + + checkCudaErrors( + runtime.cudaMemcpy( + self.dataf_d[ii], + self.dataf.ctypes.data, + REAL_SIZE * self.dataf.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.metric_map_d[ii], + self.metric_map.ctypes.data, + REAL_SIZE * self.metric_map.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.sphere_vertices_d[ii], + self.sphere_vertices.ctypes.data, + REAL_SIZE * self.sphere_vertices.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.sphere_edges_d[ii], + self.sphere_edges.ctypes.data, + np.int32().nbytes * self.sphere_edges.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) self.dg.allocate_on_gpu(ii) self._allocated = True @@ -211,7 +231,7 @@ def _divide_chunks(self, seeds): global_chunk_sz = self.chunk_size * self.ngpus nchunks = (seeds.shape[0] + global_chunk_sz - 1) // global_chunk_sz return global_chunk_sz, nchunks - + def generate_sft(self, seeds, ref_img): global_chunk_sz, nchunks = self._divide_chunks(seeds) buffer_size = 0 @@ -228,8 +248,7 @@ def generate_sft(self, seeds, ref_img): seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz].shape[0] ) array_sequence = ArraySequence( - (item for gen in generators for item in gen), - buffer_size // MEGABYTE + (item for gen in generators for item in gen), buffer_size // MEGABYTE ) return StatefulTractogram(array_sequence, ref_img, Space.VOX) @@ -259,7 +278,8 @@ def generate_trx(self, seeds, ref_img): ) tractogram = Tractogram( self.seed_propagator.as_array_sequence(), - affine_to_rasmm=ref_img.affine) + affine_to_rasmm=ref_img.affine, + ) tractogram.to_world() sls = tractogram.streamlines diff --git a/cuslines/cuda_python/cutils.py b/cuslines/cuda_python/cutils.py index 4d0e313..db4115a 100644 --- a/cuslines/cuda_python/cutils.py +++ b/cuslines/cuda_python/cutils.py @@ -13,26 +13,28 @@ class ModelType(IntEnum): PROB = 2 PTT = 3 + REAL3_SIZE = 3 * REAL_SIZE if REAL_SIZE == 4: REAL_DTYPE = np.float32 - REAL3_DTYPE = np.dtype([('x', np.float32), - ('y', np.float32), - 
('z', np.float32)], align=True) + REAL3_DTYPE = np.dtype( + [("x", np.float32), ("y", np.float32), ("z", np.float32)], align=True + ) REAL_DTYPE_AS_STR = "float" REAL3_DTYPE_AS_STR = "float3" elif REAL_SIZE == 8: REAL_DTYPE = np.float64 - REAL3_DTYPE = np.dtype([('x', np.float64), - ('y', np.float64), - ('z', np.float64)], align=True) + REAL3_DTYPE = np.dtype( + [("x", np.float64), ("y", np.float64), ("z", np.float64)], align=True + ) REAL_DTYPE_AS_STR = "double" REAL3_DTYPE_AS_STR = "double3" else: raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") -BLOCK_Y = THR_X_BL//THR_X_SL +BLOCK_Y = THR_X_BL // THR_X_SL DEV_PTR = object + def _cudaGetErrorEnum(error): if isinstance(error, driver.CUresult): err, name = driver.cuGetErrorName(error) @@ -40,11 +42,16 @@ def _cudaGetErrorEnum(error): elif isinstance(error, nvrtc.nvrtcResult): return nvrtc.nvrtcGetErrorString(error)[1] else: - raise RuntimeError('Unknown error type: {}'.format(error)) + raise RuntimeError("Unknown error type: {}".format(error)) + def checkCudaErrors(result): if result[0].value: - raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) + raise RuntimeError( + "CUDA error code={}({})".format( + result[0].value, _cudaGetErrorEnum(result[0]) + ) + ) if len(result) == 1: return None elif len(result) == 2: @@ -52,5 +59,6 @@ def checkCudaErrors(result): else: return result[1:] + def div_up(a, b): return (a + b - 1) // b From aa8e45fb2b9b883d053f9b46e6de008fbc660b0b Mon Sep 17 00:00:00 2001 From: John Kruper <36000@users.noreply.github.com> Date: Wed, 7 Jan 2026 13:04:30 -0800 Subject: [PATCH 28/31] Fix run_gpu_streamlines.py from copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- run_gpu_streamlines.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index 57053fe..0d6c447 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -196,20 +196,20 @@ def get_img(ep2_seq): model = ConstrainedSphericalDeconvModel(gtab, response, sh_order=args.sh_order) fit = model.fit(data, mask=(FA >= args.fa_threshold)) data = fit.odf(sphere).clip(min=0) - if args.model == "ptt": + if args.dg == "ptt": if args.device == "cpu": dg = cpu_PTTDirectionGetter() else: # Set FOD to 0 outside mask for probing data[FA < args.fa_threshold, :] = 0 dg = PttDirectionGetter() - elif args.model == "prob": + elif args.dg == "prob": if args.device == "cpu": dg = cpu_ProbDirectionGetter() else: dg = ProbDirectionGetter() else: - raise ValueError("Unknown model type: {}".format(args.model)) + raise ValueError("Unknown direction getter type: {}".format(args.dg)) # Setup direction getter args if args.device == "cpu": From cddceb00ba417f622efd33f87e9a26f22141a057 Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 7 Jan 2026 13:05:14 -0800 Subject: [PATCH 29/31] cleanup --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index a392cc6..ae15d6a 100644 --- a/setup.py +++ b/setup.py @@ -8,8 +8,6 @@ def defines_to_python(src, dst): src = Path(src) dst = Path(dst) - defines = {} - INT_DEFINE = re.compile( r"#define\s+(\w+)\s+\(?\s*([0-9]+)\s*\)?" 
) From cfaa93206205ba86817918621545a4f82fa09593 Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 7 Jan 2026 14:23:51 -0800 Subject: [PATCH 30/31] use logging here --- cuslines/cuda_python/cu_tractography.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuslines/cuda_python/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py index 92f34c0..9c24cd7 100644 --- a/cuslines/cuda_python/cu_tractography.py +++ b/cuslines/cuda_python/cu_tractography.py @@ -290,7 +290,7 @@ def generate_trx(self, seeds, ref_img): new_offsets_idx > trx_file.header["NB_STREAMLINES"] or new_sls_data_idx > trx_file.header["NB_VERTICES"] ): - print("TRX resizing...") + logger.info("TRX resizing...") trx_file.resize( nb_streamlines=new_offsets_idx * 2, nb_vertices=new_sls_data_idx * 2, From e6f01baebf4095212c016adf57d057dcebdd269c Mon Sep 17 00:00:00 2001 From: 36000 Date: Wed, 7 Jan 2026 14:52:09 -0800 Subject: [PATCH 31/31] abstract class correction --- cuslines/cuda_python/cu_direction_getters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py index b659445..617f893 100644 --- a/cuslines/cuda_python/cu_direction_getters.py +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -32,7 +32,7 @@ def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): pass @abstractmethod - def generateStreamlines(self): + def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): pass def allocate_on_gpu(self, n):
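
(The one-line signature fix in the final patch restores the contract between the base class and its overrides: @abstractmethod only forces subclasses to define the name, so the abstract signature itself is the documentation of record and should match the concrete methods. A minimal sketch of the corrected shape, assuming only what cu_direction_getters.py already shows, with bodies elided:)

    from abc import ABC, abstractmethod

    class GPUDirectionGetter(ABC):
        @abstractmethod
        def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
            """Launch the streamline-generation kernel on GPU n."""

    class BootDirectionGetter(GPUDirectionGetter):
        def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
            # Concrete override now matches the abstract signature exactly.
            ...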