From 45279bbacb30f2335089877a018725761e73c25d Mon Sep 17 00:00:00 2001 From: Habib Rehman Date: Mon, 26 Jan 2026 02:18:11 -0500 Subject: [PATCH 1/4] Added SPLIT, tentatively works --- .../split_correction/config.vsh.yaml | 54 +++++++++++ .../split_correction/script.R | 95 +++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 src/methods_expression_correction/split_correction/config.vsh.yaml create mode 100644 src/methods_expression_correction/split_correction/script.R diff --git a/src/methods_expression_correction/split_correction/config.vsh.yaml b/src/methods_expression_correction/split_correction/config.vsh.yaml new file mode 100644 index 00000000..7137954b --- /dev/null +++ b/src/methods_expression_correction/split_correction/config.vsh.yaml @@ -0,0 +1,54 @@ +__merge__: /src/api/comp_method_expression_correction.yaml + +name: split_correction +label: "SPLIT" +summary: "Correct doublet/misegmented cells using SPLIT" +description: "SPLIT (Spatial Purification of Layered Intracellular Transcripts) is a novel method that integrates snRNA-seq with RCTD deconvolution to enhance signal purity. SPLIT effectively resolves mixed transcriptomic signals, improving background correction and cell-type resolution." +links: + documentation: "https://github.com/bdsc-tds/SPLIT" + repository: "https://github.com/bdsc-tds/SPLIT" +references: + doi: "10.1101/2025.04.23.649965" + +# arguments: +# - name: --celltype_key +# required: false +# direction: input +# type: string +# default: cell_type + +resources: + - type: r_script + path: script.R + +engines: + - type: docker + image: openproblems/base_r:1 + setup: + #- type: docker + # run: | + # apt-get update && apt-get install -y wget + - type: r + bioc: [anndataR, rhdf5, devtools, scater] + #- type: r + # bioc: [SummarizedExperiment,SingleCellExperiment,SpatialExperiment] + # bioc_force_install: true + - type: docker + run: | + Rscript -e "BiocManager::install('SingleCellExperiment', type = 'source', force = TRUE, ask = FALSE); options(timeout = 600000000); devtools::install_github('dmcable/spacexr', build_vignettes = FALSE); devtools::install_github('bdsc-tds/SPLIT')" + + # SingleCellExperiment part can probably be left out again in the future. It currently fixes a bug described in these issues: + # https://github.com/drighelli/SpatialExperiment/issues/171 + # https://github.com/satijalab/seurat/issues/9889 + # The reinstall of SingleCellExperiment triggers the correct re-install of SpatialExperiment. + + # Is there a better way to install an r package from github? + # The 6 million timeout thing stops it from breaking + + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [ hightime, highcpu, highmem ] \ No newline at end of file diff --git a/src/methods_expression_correction/split_correction/script.R b/src/methods_expression_correction/split_correction/script.R new file mode 100644 index 00000000..8a92c119 --- /dev/null +++ b/src/methods_expression_correction/split_correction/script.R @@ -0,0 +1,95 @@ +library(spacexr) +library(Matrix) +library(SingleCellExperiment) +library(anndataR) +library(SPLIT) +library(Seurat) +library(scuttle) + +## VIASH START +par <- list( + "input_spatial_with_cell_types" = "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/spatial_aggregated_counts.h5ad", + "input_scrnaseq_reference"= "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad", + "output" = "task_ist_preprocessing/tmp/split_corrected.h5ad" +) + +meta <- list( + 'cpus': 4, +) + +## VIASH END + +# Read the input h5ad file and convert to SingleCellExperiment and Seurat +sce <- read_h5ad(par$input_spatial_with_cell_types, as = "SingleCellExperiment") +xe <- read_h5ad(par$input_spatial_with_cell_types, as = "Seurat") + +# Extract spatial coordinates and counts matrix +centroid_x <- colData(sce)$centroid_x +centroid_y <- colData(sce)$centroid_y +coords <- data.frame(centroid_x, centroid_y) +counts <- assay(sce,"counts") +rownames(coords) <- colData(sce)$cell_id +puck <- SpatialRNA(coords, counts) + +# Read reference scrnaseq +ref <- read_h5ad(par$input_scrnaseq_reference, as = "SingleCellExperiment") + +#filter reference cell types to those with >25 cells +valid_celltypes <- names(table(colData(ref)$cell_type))[table(colData(ref)$cell_type) >= 25] +filtered_ref <- ref[,colData(ref)$cell_type %in% valid_celltypes] + +ref_counts <- assay(filtered_ref, "counts") +# factor to drop filtered cell types +colData(filtered_ref)$cell_type <- factor(colData(filtered_ref)$cell_type) +cell_types <- colData(filtered_ref)$cell_type +names(cell_types) <- colnames(ref_counts) +reference <- Reference(ref_counts, cell_types, min_UMI = 0) + +# check cores +cores <- 1 +if ("cpus" %in% names(meta) && !is.null(meta$cpus)) cores <- meta$cpus +cat(sprintf("Number of cores: %s\n", cores)) + +# Run the algorithm +myRCTD <- create.RCTD(puck, reference, max_cores = cores) +myRCTD <- run.RCTD(myRCTD, doublet_mode = "doublet") + +# Extract results +# results <- myRCTD@results +# spatial_cell_types <- results$results_df$first_type +# # Include None Spatial cell type for the "reject" cells +# levels(spatial_cell_types) <- c(levels(spatial_cell_types), "None_sp") +# spatial_cell_types[results$results_df$spot_class == "reject"] <- "None_sp" +# names(spatial_cell_types) <- rownames(results$results_df) + +# # +# colData(sce)$cell_type <- "None_sp" +# colData(sce)[names(spatial_cell_types),"cell_type"] <- as.character(spatial_cell_types) + + +# Post-process RCTD output +RCTD <- SPLIT::run_post_process_RCTD(myRCTD) + +# Run SPLIT purification +res_split <- SPLIT::purify( + counts = GetAssayData(xe, assay = 'RNA', layer = 'counts'), # or any gene x cells counts matrix + rctd = RCTD, + DO_purify_singlets = TRUE # optional +) + +# create corrected counts layer in original SingleCell object +assay(sce, "corrected_counts") <- assay(sce, "counts") +assay(sce, "corrected_counts")[rownames(res_split$purified_counts), colnames(res_split$purified_counts)] <- res_split$purified_counts + +#log normalized +size_factors <- librarySizeFactors(assay(sce, "corrected_counts")) + +size_factors[size_factors == 0] <- 1 #why + +#print(min(size_factors)) +assay(sce, "normalized") <- assay(logNormCounts(sce, size_factors=size_factors, assay.type = "corrected_counts"),"logcounts") + +# Write the final object to h5ad format +# set to 'w', is this ok? +dir.create(dirname(par$output), showWarnings = FALSE, recursive = TRUE) +write_h5ad(sce, par$output, mode = "w") From 6080a68c75c6816e1771338ea86368d4c9b6b70a Mon Sep 17 00:00:00 2001 From: Habib Rehman Date: Mon, 2 Feb 2026 00:23:56 -0500 Subject: [PATCH 2/4] Fixed filtering and container for SPLIT --- .../split_correction/config.vsh.yaml | 13 ++--- .../split_correction/script.R | 51 +++++++++++-------- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/src/methods_expression_correction/split_correction/config.vsh.yaml b/src/methods_expression_correction/split_correction/config.vsh.yaml index 7137954b..3c865d11 100644 --- a/src/methods_expression_correction/split_correction/config.vsh.yaml +++ b/src/methods_expression_correction/split_correction/config.vsh.yaml @@ -10,12 +10,13 @@ links: references: doi: "10.1101/2025.04.23.649965" -# arguments: -# - name: --celltype_key -# required: false -# direction: input -# type: string -# default: cell_type +arguments: + - name: --keep_all_cells + required: false + direction: input + type: boolean + default: false + description: Whether to keep cells with 0 counts (may cause errors if set to TRUE) resources: - type: r_script diff --git a/src/methods_expression_correction/split_correction/script.R b/src/methods_expression_correction/split_correction/script.R index 8a92c119..1678d3ec 100644 --- a/src/methods_expression_correction/split_correction/script.R +++ b/src/methods_expression_correction/split_correction/script.R @@ -10,7 +10,8 @@ library(scuttle) par <- list( "input_spatial_with_cell_types" = "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/spatial_aggregated_counts.h5ad", "input_scrnaseq_reference"= "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad", - "output" = "task_ist_preprocessing/tmp/split_corrected.h5ad" + "output" = "task_ist_preprocessing/tmp/split_corrected.h5ad", + "keep_all_cells" = FALSE, ) meta <- list( @@ -23,25 +24,32 @@ meta <- list( sce <- read_h5ad(par$input_spatial_with_cell_types, as = "SingleCellExperiment") xe <- read_h5ad(par$input_spatial_with_cell_types, as = "Seurat") +# filter out 0 cells +if (!par$keep_all_cells) { + cat("Filtering cells with 0 counts\n") + sce <- sce[, colSums(counts(sce)) > 0] + xe <- subset(xe, subset = nCount_RNA > 0) +} + # Extract spatial coordinates and counts matrix centroid_x <- colData(sce)$centroid_x centroid_y <- colData(sce)$centroid_y coords <- data.frame(centroid_x, centroid_y) -counts <- assay(sce,"counts") +counts <- assay(sce, "counts") rownames(coords) <- colData(sce)$cell_id puck <- SpatialRNA(coords, counts) # Read reference scrnaseq ref <- read_h5ad(par$input_scrnaseq_reference, as = "SingleCellExperiment") -#filter reference cell types to those with >25 cells +#filter reference cell types to those with >25 cells (minimum for RCTD) valid_celltypes <- names(table(colData(ref)$cell_type))[table(colData(ref)$cell_type) >= 25] filtered_ref <- ref[,colData(ref)$cell_type %in% valid_celltypes] ref_counts <- assay(filtered_ref, "counts") # factor to drop filtered cell types -colData(filtered_ref)$cell_type <- factor(colData(filtered_ref)$cell_type) -cell_types <- colData(filtered_ref)$cell_type +colData(filtered_ref)$cell_type <- factor(colData(filtered_ref)$cell_type) +cell_types <- colData(filtered_ref)$cell_type names(cell_types) <- colnames(ref_counts) reference <- Reference(ref_counts, cell_types, min_UMI = 0) @@ -51,45 +59,44 @@ if ("cpus" %in% names(meta) && !is.null(meta$cpus)) cores <- meta$cpus cat(sprintf("Number of cores: %s\n", cores)) # Run the algorithm +cat("Running RCTD\n") myRCTD <- create.RCTD(puck, reference, max_cores = cores) myRCTD <- run.RCTD(myRCTD, doublet_mode = "doublet") -# Extract results +# Get the "spot_class" annotation from RCTD +# cat("Saving RCTD spot_class\n") # results <- myRCTD@results -# spatial_cell_types <- results$results_df$first_type -# # Include None Spatial cell type for the "reject" cells -# levels(spatial_cell_types) <- c(levels(spatial_cell_types), "None_sp") -# spatial_cell_types[results$results_df$spot_class == "reject"] <- "None_sp" -# names(spatial_cell_types) <- rownames(results$results_df) - -# # -# colData(sce)$cell_type <- "None_sp" -# colData(sce)[names(spatial_cell_types),"cell_type"] <- as.character(spatial_cell_types) - +# rctd_spot_class <- results$results_df$spot_class +# names(rctd_spot_class) <- rownames(results$results_df) +# colData(sce)$RCTD_class <- "not_included" +# colData(sce)[names(rctd_spot_class),"RCTD_class"] <- as.character(rctd_spot_class) # Post-process RCTD output RCTD <- SPLIT::run_post_process_RCTD(myRCTD) # Run SPLIT purification +cat("Running SPLIT\n") res_split <- SPLIT::purify( counts = GetAssayData(xe, assay = 'RNA', layer = 'counts'), # or any gene x cells counts matrix rctd = RCTD, DO_purify_singlets = TRUE # optional ) + # create corrected counts layer in original SingleCell object +cat("Normalizing counts\n") + +# First copy in counts assay(sce, "corrected_counts") <- assay(sce, "counts") + +# Then, replace only the updated cells assay(sce, "corrected_counts")[rownames(res_split$purified_counts), colnames(res_split$purified_counts)] <- res_split$purified_counts -#log normalized +# Library size normalization - see note in resolVI size_factors <- librarySizeFactors(assay(sce, "corrected_counts")) - -size_factors[size_factors == 0] <- 1 #why - -#print(min(size_factors)) assay(sce, "normalized") <- assay(logNormCounts(sce, size_factors=size_factors, assay.type = "corrected_counts"),"logcounts") # Write the final object to h5ad format -# set to 'w', is this ok? +cat("Writing to h5ad\n") dir.create(dirname(par$output), showWarnings = FALSE, recursive = TRUE) write_h5ad(sce, par$output, mode = "w") From 9eb1755abe95a49d06cd5cd044ac0042aed6ab01 Mon Sep 17 00:00:00 2001 From: LouisK92 Date: Mon, 23 Feb 2026 17:41:08 +0100 Subject: [PATCH 3/4] Save uncorrected counts in split script --- .../split_correction/config.vsh.yaml | 14 ++++------- .../split_correction/script.R | 25 +++++++++++-------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/methods_expression_correction/split_correction/config.vsh.yaml b/src/methods_expression_correction/split_correction/config.vsh.yaml index 3c865d11..882841fb 100644 --- a/src/methods_expression_correction/split_correction/config.vsh.yaml +++ b/src/methods_expression_correction/split_correction/config.vsh.yaml @@ -26,14 +26,11 @@ engines: - type: docker image: openproblems/base_r:1 setup: - #- type: docker - # run: | - # apt-get update && apt-get install -y wget + - type: docker + run: | + apt-get update - type: r bioc: [anndataR, rhdf5, devtools, scater] - #- type: r - # bioc: [SummarizedExperiment,SingleCellExperiment,SpatialExperiment] - # bioc_force_install: true - type: docker run: | Rscript -e "BiocManager::install('SingleCellExperiment', type = 'source', force = TRUE, ask = FALSE); options(timeout = 600000000); devtools::install_github('dmcable/spacexr', build_vignettes = FALSE); devtools::install_github('bdsc-tds/SPLIT')" @@ -43,8 +40,7 @@ engines: # https://github.com/satijalab/seurat/issues/9889 # The reinstall of SingleCellExperiment triggers the correct re-install of SpatialExperiment. - # Is there a better way to install an r package from github? - # The 6 million timeout thing stops it from breaking + # Using a large timeout here to reduce failures during GitHub package installation. - type: native @@ -52,4 +48,4 @@ runners: - type: executable - type: nextflow directives: - label: [ hightime, highcpu, highmem ] \ No newline at end of file + label: [ hightime, highcpu, highmem ] diff --git a/src/methods_expression_correction/split_correction/script.R b/src/methods_expression_correction/split_correction/script.R index 1678d3ec..5210aab1 100644 --- a/src/methods_expression_correction/split_correction/script.R +++ b/src/methods_expression_correction/split_correction/script.R @@ -7,12 +7,12 @@ library(Seurat) library(scuttle) ## VIASH START -par <- list( - "input_spatial_with_cell_types" = "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/spatial_aggregated_counts.h5ad", - "input_scrnaseq_reference"= "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad", - "output" = "task_ist_preprocessing/tmp/split_corrected.h5ad", - "keep_all_cells" = FALSE, -) +par <- list( + "input_spatial_with_cell_types" = "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/spatial_with_celltypes.h5ad", + "input_scrnaseq_reference"= "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad", + "output" = "task_ist_preprocessing/tmp/split_corrected.h5ad", + "keep_all_cells" = FALSE, +) meta <- list( 'cpus': 4, @@ -83,11 +83,14 @@ res_split <- SPLIT::purify( ) -# create corrected counts layer in original SingleCell object -cat("Normalizing counts\n") - -# First copy in counts -assay(sce, "corrected_counts") <- assay(sce, "counts") +# create corrected counts layer in original SingleCell object +cat("Normalizing counts\n") + +# Preserve original normalized values before overwriting with corrected normalization +assay(sce, "normalized_uncorrected") <- assay(sce, "normalized") + +# First copy in counts +assay(sce, "corrected_counts") <- assay(sce, "counts") # Then, replace only the updated cells assay(sce, "corrected_counts")[rownames(res_split$purified_counts), colnames(res_split$purified_counts)] <- res_split$purified_counts From 6f76c6c0588a65fa085bd039170edbd7c66559c1 Mon Sep 17 00:00:00 2001 From: LouisK92 Date: Mon, 23 Feb 2026 17:50:32 +0100 Subject: [PATCH 4/4] Rename split_correction to split and add method to workflow and scripts --- scripts/run_benchmark/config.yaml | 0 scripts/run_benchmark/run_full_local.sh | 1 + scripts/run_benchmark/run_full_seqeracloud.sh | 1 + scripts/run_benchmark/run_test_local.sh | 1 + scripts/run_benchmark/run_test_seqeracloud.sh | 1 + .../{split_correction => split}/config.vsh.yaml | 2 +- .../{split_correction => split}/script.R | 0 src/workflows/run_benchmark/config.vsh.yaml | 3 ++- src/workflows/run_benchmark/main.nf | 3 ++- 9 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 scripts/run_benchmark/config.yaml rename src/methods_expression_correction/{split_correction => split}/config.vsh.yaml (96%) rename src/methods_expression_correction/{split_correction => split}/script.R (100%) diff --git a/scripts/run_benchmark/config.yaml b/scripts/run_benchmark/config.yaml new file mode 100644 index 00000000..e69de29b diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 1464a6df..2d828361 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -65,6 +65,7 @@ expression_correction_methods: - no_correction # - gene_efficiency_correction # - resolvi_correction + # - split method_parameters_yaml: /tmp/method_params.yaml HERE diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index d006f7e0..9d930046 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -57,6 +57,7 @@ expression_correction_methods: - no_correction - gene_efficiency_correction - resolvi_correction + - split method_parameters_yaml: /tmp/method_params.yaml HERE diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh index 686326b9..2957d152 100755 --- a/scripts/run_benchmark/run_test_local.sh +++ b/scripts/run_benchmark/run_test_local.sh @@ -60,6 +60,7 @@ expression_correction_methods: - no_correction # - gene_efficiency_correction # - resolvi_correction + # - split method_parameters_yaml: /tmp/method_params.yaml HERE diff --git a/scripts/run_benchmark/run_test_seqeracloud.sh b/scripts/run_benchmark/run_test_seqeracloud.sh index 39ab8c5e..de9a52e5 100755 --- a/scripts/run_benchmark/run_test_seqeracloud.sh +++ b/scripts/run_benchmark/run_test_seqeracloud.sh @@ -56,6 +56,7 @@ expression_correction_methods: - no_correction - gene_efficiency_correction - resolvi_correction + - split #method_parameters_yaml: /tmp/method_params.yaml HERE diff --git a/src/methods_expression_correction/split_correction/config.vsh.yaml b/src/methods_expression_correction/split/config.vsh.yaml similarity index 96% rename from src/methods_expression_correction/split_correction/config.vsh.yaml rename to src/methods_expression_correction/split/config.vsh.yaml index 882841fb..18a3fabb 100644 --- a/src/methods_expression_correction/split_correction/config.vsh.yaml +++ b/src/methods_expression_correction/split/config.vsh.yaml @@ -1,6 +1,6 @@ __merge__: /src/api/comp_method_expression_correction.yaml -name: split_correction +name: split label: "SPLIT" summary: "Correct doublet/misegmented cells using SPLIT" description: "SPLIT (Spatial Purification of Layered Intracellular Transcripts) is a novel method that integrates snRNA-seq with RCTD deconvolution to enhance signal purity. SPLIT effectively resolves mixed transcriptomic signals, improving background correction and cell-type resolution." diff --git a/src/methods_expression_correction/split_correction/script.R b/src/methods_expression_correction/split/script.R similarity index 100% rename from src/methods_expression_correction/split_correction/script.R rename to src/methods_expression_correction/split/script.R diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index c25e39ce..665d540f 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -104,7 +104,7 @@ argument_groups: A list of expression correction methods to run. type: string multiple: true - default: "no_correction:gene_efficiency_correction:resolvi_correction" + default: "no_correction:gene_efficiency_correction:resolvi_correction:split" - name: Method parameters description: | Use these arguments to control the parameter sets that are run for each @@ -175,6 +175,7 @@ dependencies: - name: methods_expression_correction/no_correction - name: methods_expression_correction/gene_efficiency_correction - name: methods_expression_correction/resolvi_correction + - name: methods_expression_correction/split - name: methods_data_aggregation/aggregate_spatial_data - name: metrics/similarity - name: metrics/quality diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 6783873d..6ab1f6d3 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -414,7 +414,8 @@ workflow run_wf { expr_corr_methods = [ no_correction, gene_efficiency_correction, - resolvi_correction + resolvi_correction, + split ] expr_corr_ch = cta_ch