diff --git a/.Rbuildignore b/.Rbuildignore index 20d145b..ac35c14 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -11,3 +11,4 @@ ^CRAN-SUBMISSION$ ^[\.]?air\.toml$ ^\.vscode$ +^\.clang-format$ diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..d93a09b --- /dev/null +++ b/.clang-format @@ -0,0 +1,6 @@ +BasedOnStyle: Google +IndentWidth: 2 +ColumnLimit: 100 +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false diff --git a/R/read.r b/R/read.r index a4cf657..777a453 100644 --- a/R/read.r +++ b/R/read.r @@ -64,11 +64,16 @@ as_granges <- function(x) { #' Read data from bigBed files. #' +#' Columns are automatically typed based on the autoSql schema embedded +#' in the bigBed file. Integer types (`uint`, `int`) become R integers, +#' floating point types (`float`, `double`) become R doubles, and all +#' other types (including array types like `int[blockCount]`) remain +#' as character strings. +#' #' @param bbfile filename for bigBed file #' @param chrom read data for specific chromosome #' @param start start position for data #' @param end end position for data -#' @param convert convert bigBed values to individual columns #' #' @return \code{tibble} #' @@ -87,8 +92,7 @@ read_bigbed <- function( bbfile, chrom = NULL, start = NULL, - end = NULL, - convert = TRUE + end = NULL ) { if (!file.exists(bbfile)) { stop("File does not exist: ", bbfile) @@ -99,42 +103,5 @@ read_bigbed <- function( } res <- read_bigbed_cpp(bbfile, chrom, start, end) - - if (!convert) { - return(as_tibble(res)) - } - - vals <- do.call(rbind, strsplit(res[["value"]], "\t")) - # merge chrom, start, end with new values - res_new <- cbind(res[, 1:3], vals) - - fnames <- bigbed_sql_fields(bbfile) - # drop chrom, start, end - fnames <- fnames[-(1:3)] - - colnames(res_new) <- c("chrom", "start", "end", fnames) - return(as_tibble(res_new)) -} - -#' @examples -#' bb <- system.file("extdata", "test.bb", package = "cpp11bigwig") -#' bigbed_sql_fields(bb) -#' -#' @noRd -bigbed_sql_fields <- function(bbfile) { - res <- bigbed_sql_cpp(bbfile) - - # parse the autoSql - lines <- unlist(strsplit(res, "\n")) - fields <- lines[grep(";", lines)] - - unlist( - lapply( - fields, - function(line) { - field <- sub("^\\s*\\S+\\s+(\\S+);.*", "\\1", line) - field - } - ) - ) + as_tibble(res) } diff --git a/README.md b/README.md index 1b2d65b..5fdbce6 100644 --- a/README.md +++ b/README.md @@ -53,9 +53,9 @@ bb <- system.file("extdata", "test.bb", package = "cpp11bigwig") read_bigbed(bb) #> # A tibble: 3 × 12 #> chrom start end name score strand thickStart thickEnd reserved blockCount -#> -#> 1 chr1 4.80e6 4.84e6 test… 1 + 4797973 4836816 1 9 -#> 2 chr10 4.85e6 4.88e6 diff… 1 + 4848118 4880877 1 6 -#> 3 chr20 5.07e6 5.15e6 negs… 1 - 5073253 5152630 1 14 +#> +#> 1 chr1 4.80e6 4.84e6 test… 1 + 4797973 4836816 1 9 +#> 2 chr10 4.85e6 4.88e6 diff… 1 + 4848118 4880877 1 6 +#> 3 chr20 5.07e6 5.15e6 negs… 1 - 5073253 5152630 1 14 #> # ℹ 2 more variables: blockSizes , chromStarts ``` diff --git a/cpp11bigwig.Rproj b/cpp11bigwig.Rproj deleted file mode 100644 index b7ef353..0000000 --- a/cpp11bigwig.Rproj +++ /dev/null @@ -1,21 +0,0 @@ -Version: 1.0 -ProjectId: be6d8c3c-3585-47f8-a49e-d97805615fb2 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: knitr -LaTeX: pdfLaTeX - -StripTrailingWhitespace: Yes - -BuildType: Package -PackageUseDevtools: Yes -PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace diff --git a/man/read_bigbed.Rd b/man/read_bigbed.Rd index 04bc984..cd07980 100644 --- a/man/read_bigbed.Rd +++ b/man/read_bigbed.Rd @@ -4,7 +4,7 @@ \alias{read_bigbed} \title{Read data from bigBed files.} \usage{ -read_bigbed(bbfile, chrom = NULL, start = NULL, end = NULL, convert = TRUE) +read_bigbed(bbfile, chrom = NULL, start = NULL, end = NULL) } \arguments{ \item{bbfile}{filename for bigBed file} @@ -14,14 +14,16 @@ read_bigbed(bbfile, chrom = NULL, start = NULL, end = NULL, convert = TRUE) \item{start}{start position for data} \item{end}{end position for data} - -\item{convert}{convert bigBed values to individual columns} } \value{ \code{tibble} } \description{ -Read data from bigBed files. +Columns are automatically typed based on the autoSql schema embedded +in the bigBed file. Integer types (\code{uint}, \code{int}) become R integers, +floating point types (\code{float}, \code{double}) become R doubles, and all +other types (including array types like \code{int[blockCount]}) remain +as character strings. } \examples{ bb <- system.file("extdata", "test.bb", package = "cpp11bigwig") diff --git a/src/cpp11bigwig.cpp b/src/cpp11bigwig.cpp index d5f1b08..8287867 100644 --- a/src/cpp11bigwig.cpp +++ b/src/cpp11bigwig.cpp @@ -3,186 +3,309 @@ using namespace cpp11; +#include +#include #include #include "libBigWig/bigWig.h" +// R type categories for autoSql types +enum class RType { Integer, Double, String }; + +// Determine R type from autoSql type string +RType autosql_to_rtype(const std::string& type) { + // Strip array notation like int[blockCount] + std::string base_type = type; + size_t bracket_pos = type.find('['); + if (bracket_pos != std::string::npos) { + // Array types stay as strings (comma-separated values) + return RType::String; + } + + if (type == "uint" || type == "int") { + return RType::Integer; + } else if (type == "float" || type == "double") { + return RType::Double; + } + return RType::String; +} + +// Parse autoSql schema to extract field names and types +// Returns vector of pairs: (name, type) +std::vector> parse_autosql(const std::string& sql) { + std::vector> fields; + + std::istringstream stream(sql); + std::string line; + + // Match lines like: " uint chromStart; ..." or " string name; ..." + std::regex field_regex(R"(^\s*(\S+)\s+(\S+);)"); + + while (std::getline(stream, line)) { + std::smatch match; + if (std::regex_search(line, match, field_regex)) { + std::string type = match[1].str(); + std::string name = match[2].str(); + fields.push_back({name, type}); + } + } + + return fields; +} + +// Split a string by delimiter +std::vector split_string(const std::string& str, char delim) { + std::vector tokens; + std::istringstream stream(str); + std::string token; + while (std::getline(stream, token, delim)) { + tokens.push_back(token); + } + return tokens; +} + [[cpp11::register]] writable::data_frame read_bigwig_cpp(std::string bwfname, sexp chrom, sexp start, sexp end) { + const char* bwfile = bwfname.c_str(); - const char* bwfile = bwfname.c_str() ; - - bigWigFile_t *bwf = NULL; + bigWigFile_t* bwf = NULL; // NULL can be a CURL callback. see libBigWig demos - bwf = bwOpen(bwfile, NULL, "r") ; + bwf = bwOpen(bwfile, NULL, "r"); if (!bwf) - stop("Failed to open file: '%s'\n", bwfname.c_str()) ; + stop("Failed to open file: '%s'\n", bwfname.c_str()); - std::vector chroms ; - std::vector starts ; - std::vector ends ; + std::vector chroms; + std::vector starts; + std::vector ends; std::vector vals; - bwOverlappingIntervals_t *intervals = NULL ; + bwOverlappingIntervals_t* intervals = NULL; - int nchrom = bwf->cl->nKeys ; - for (int nc = 0; nccl->chrom[nc] ; - std::string bw_chrom_c(bw_chrom) ; + int nchrom = bwf->cl->nKeys; + for (int nc = 0; nc < nchrom; ++nc) { + char* bw_chrom = bwf->cl->chrom[nc]; + std::string bw_chrom_c(bw_chrom); if (!Rf_isNull(chrom)) { - std::string r_chrom = as_cpp(chrom) ; - if (r_chrom != bw_chrom_c) continue ; + std::string r_chrom = as_cpp(chrom); + if (r_chrom != bw_chrom_c) + continue; } // set maximum boundaries if start / end are not specified - int bw_start = Rf_isNull(start) ? 0 : as_cpp(start) ; - int bw_end = Rf_isNull(end) ? bwf->cl->len[nc] : as_cpp(end) ; + int bw_start = Rf_isNull(start) ? 0 : as_cpp(start); + int bw_end = Rf_isNull(end) ? bwf->cl->len[nc] : as_cpp(end); - intervals = bwGetValues(bwf, bw_chrom, bw_start, bw_end, 0) ; + intervals = bwGetValues(bwf, bw_chrom, bw_start, bw_end, 0); if (!intervals) - stop("Failed to retreived intervals for chrom `%s`\n", bw_chrom) ; - - int nint = intervals->l ; + stop("Failed to retreived intervals for chrom `%s`\n", bw_chrom); - for(int i=0; il; - int start = intervals->start[i] ; - int end = start + 1 ; - float val = intervals->value[i] ; + for (int i = 0; i < nint; ++i) { + int start = intervals->start[i]; + int end = start + 1; + float val = intervals->value[i]; if (i == 0) { - chroms.push_back(bw_chrom_c) ; - starts.push_back(start) ; - ends.push_back(end) ; - vals.push_back(val) ; + chroms.push_back(bw_chrom_c); + starts.push_back(start); + ends.push_back(end); + vals.push_back(val); } else { if (start == ends.back() && val == vals.back()) { - ends.back() = end ; + ends.back() = end; } else { - chroms.push_back(bw_chrom_c) ; - starts.push_back(start) ; - ends.push_back(end) ; - vals.push_back(val) ; + chroms.push_back(bw_chrom_c); + starts.push_back(start); + ends.push_back(end); + vals.push_back(val); } } } - bwDestroyOverlappingIntervals(intervals) ; + bwDestroyOverlappingIntervals(intervals); } - bwClose(bwf) ; - bwCleanup() ; + bwClose(bwf); + bwCleanup(); - return writable::data_frame({ - "chrom"_nm = chroms, - "start"_nm = starts, - "end"_nm = ends, - "value"_nm = vals - }) ; + return writable::data_frame( + {"chrom"_nm = chroms, "start"_nm = starts, "end"_nm = ends, "value"_nm = vals}); } [[cpp11::register]] writable::data_frame read_bigbed_cpp(std::string bbfname, sexp chrom, sexp start, sexp end) { - - const char* bbfile = bbfname.c_str() ; - bigWigFile_t *bbf = NULL; + const char* bbfile = bbfname.c_str(); + bigWigFile_t* bbf = NULL; // NULL can be a CURL callback. see libBigWig demos - bbf = bbOpen(bbfile, NULL) ; + bbf = bbOpen(bbfile, NULL); if (!bbf) - stop("Failed to open file: '%s'\n", bbfname.c_str()) ; + stop("Failed to open file: '%s'\n", bbfname.c_str()); - std::vector chroms ; - std::vector starts ; - std::vector ends ; - std::vector vals; + // Get autoSql schema and parse field info + char* sql = bbGetSQL(bbf); + std::string sql_str(sql); + free(sql); - bbOverlappingEntries_t *intervals ; + auto fields = parse_autosql(sql_str); - int nchrom = bbf->cl->nKeys ; - for (int nc = 0; nc 3 ? fields.size() - 3 : 0; - char* bb_chrom = bbf->cl->chrom[nc] ; - std::string bb_chrom_c(bb_chrom) ; + // Determine R types for extra fields + std::vector field_rtypes; + std::vector field_names; + for (size_t i = 3; i < fields.size(); ++i) { + field_names.push_back(fields[i].first); + field_rtypes.push_back(autosql_to_rtype(fields[i].second)); + } - if (!Rf_isNull(chrom)) { - std::string r_chrom = as_cpp(chrom) ; - if (r_chrom != bb_chrom_c) continue ; - } + // Storage for coordinate columns + std::vector chroms; + std::vector starts; + std::vector ends; - // set maximum boundaries if start / end are not specified - int bb_start = Rf_isNull(start) ? 0 : as_cpp(start) ; - int bb_end = Rf_isNull(end) ? bbf->cl->len[nc] : as_cpp(end) ; + // Storage for typed extra columns + std::vector> int_cols(num_extra_fields); + std::vector> dbl_cols(num_extra_fields); + std::vector> str_cols(num_extra_fields); - intervals = bbGetOverlappingEntries(bbf, bb_chrom, bb_start, bb_end, 1) ; + bbOverlappingEntries_t* intervals; - if (!intervals) - stop("Failed to retreived intervals for chrom `%s`\n", bb_chrom) ; + int nchrom = bbf->cl->nKeys; + for (int nc = 0; nc < nchrom; ++nc) { + char* bb_chrom = bbf->cl->chrom[nc]; + std::string bb_chrom_c(bb_chrom); - int nint = intervals->l ; + if (!Rf_isNull(chrom)) { + std::string r_chrom = as_cpp(chrom); + if (r_chrom != bb_chrom_c) + continue; + } - for(int i=0; i(start); + int bb_end = Rf_isNull(end) ? bbf->cl->len[nc] : as_cpp(end); - int start = intervals->start[i] ; - int end = intervals->end[i] ; // Use actual end coordinate from bigBed - std::string val = intervals->str[i] ; + intervals = bbGetOverlappingEntries(bbf, bb_chrom, bb_start, bb_end, 1); - if (i == 0) { - chroms.push_back(bb_chrom_c) ; - starts.push_back(start) ; - ends.push_back(end) ; - vals.push_back(val) ; - } else { - if (start == ends.back() && val == vals.back()) { - ends.back() = end ; - } else { - chroms.push_back(bb_chrom_c) ; - starts.push_back(start) ; - ends.push_back(end) ; - vals.push_back(val) ; + if (!intervals) + stop("Failed to retrieve intervals for chrom `%s`\n", bb_chrom); + + int nint = intervals->l; + + for (int i = 0; i < nint; ++i) { + int ivl_start = intervals->start[i]; + int ivl_end = intervals->end[i]; + std::string val_str = intervals->str[i]; + + chroms.push_back(bb_chrom_c); + starts.push_back(ivl_start); + ends.push_back(ivl_end); + + // Parse tab-separated extra fields + std::vector tokens = split_string(val_str, '\t'); + + for (size_t j = 0; j < num_extra_fields; ++j) { + std::string token = (j < tokens.size()) ? tokens[j] : ""; + + switch (field_rtypes[j]) { + case RType::Integer: + if (token.empty()) { + int_cols[j].push_back(NA_INTEGER); + } else { + try { + int_cols[j].push_back(std::stoi(token)); + } catch (...) { + int_cols[j].push_back(NA_INTEGER); + } + } + break; + case RType::Double: + if (token.empty()) { + dbl_cols[j].push_back(NA_REAL); + } else { + try { + dbl_cols[j].push_back(std::stod(token)); + } catch (...) { + dbl_cols[j].push_back(NA_REAL); + } + } + break; + case RType::String: + str_cols[j].push_back(token); + break; } } } - bbDestroyOverlappingEntries(intervals) ; + bbDestroyOverlappingEntries(intervals); } - bwClose(bbf) ; - bwCleanup() ; + bwClose(bbf); + bwCleanup(); + + // Build the data frame with named columns + writable::list result; + writable::strings col_names; + + // Add coordinate columns + result.push_back(writable::strings(chroms.begin(), chroms.end())); + col_names.push_back("chrom"); + + result.push_back(writable::integers(starts.begin(), starts.end())); + col_names.push_back("start"); + + result.push_back(writable::integers(ends.begin(), ends.end())); + col_names.push_back("end"); + + // Add typed extra columns + for (size_t j = 0; j < num_extra_fields; ++j) { + switch (field_rtypes[j]) { + case RType::Integer: + result.push_back(writable::integers(int_cols[j].begin(), int_cols[j].end())); + break; + case RType::Double: + result.push_back(writable::doubles(dbl_cols[j].begin(), dbl_cols[j].end())); + break; + case RType::String: + result.push_back(writable::strings(str_cols[j].begin(), str_cols[j].end())); + break; + } + col_names.push_back(field_names[j]); + } - return writable::data_frame({ - "chrom"_nm = chroms, - "start"_nm = starts, - "end"_nm = ends, - "value"_nm = vals - }) ; + result.attr("names") = col_names; + result.attr("class") = writable::strings({"data.frame"}); + result.attr("row.names") = writable::integers({NA_INTEGER, -static_cast(chroms.size())}); + + return as_cpp(result); } [[cpp11::register]] std::string bigbed_sql_cpp(std::string bbfname) { + const char* bbfile = bbfname.c_str(); - const char* bbfile = bbfname.c_str() ; - - bigWigFile_t *bbf = NULL; + bigWigFile_t* bbf = NULL; // NULL can be a CURL callback. see libBigWig demos - bbf = bwOpen(bbfile, NULL, "r") ; + bbf = bwOpen(bbfile, NULL, "r"); if (!bbf) - stop("Failed to open file: '%s'\n", bbfname.c_str()) ; + stop("Failed to open file: '%s'\n", bbfname.c_str()); - char* sql = bbGetSQL(bbf) ; + char* sql = bbGetSQL(bbf); - std::string sql_str(sql) ; - free(sql) ; + std::string sql_str(sql); + free(sql); - bwClose(bbf) ; + bwClose(bbf); - return(sql_str) ; + return (sql_str); } diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R index 7618dc2..2f78340 100644 --- a/tests/testthat/test-read.R +++ b/tests/testthat/test-read.R @@ -95,3 +95,36 @@ test_that("read_bigbed correctly parses interval coordinates", { expect_true(all(region_data$chrom == "chr1")) expect_true(all(region_data$start >= 4797000 | region_data$end <= 4798000)) }) + +test_that("read_bigbed coerces columns based on autoSql types", { + bb_file <- test_path("data/test.bb") + bb_data <- read_bigbed(bb_file) + + # Coordinate columns should be integers + expect_type(bb_data$start, "integer") + expect_type(bb_data$end, "integer") + + # String columns should be character + expect_type(bb_data$chrom, "character") + expect_type(bb_data$name, "character") + + # uint/int columns should be integer (based on BED12 autoSql schema) + expect_type(bb_data$score, "integer") + expect_type(bb_data$thickStart, "integer") + expect_type(bb_data$thickEnd, "integer") + expect_type(bb_data$reserved, "integer") + expect_type(bb_data$blockCount, "integer") + + # char[1] should be character + expect_type(bb_data$strand, "character") + + # Array types (int[blockCount]) should remain character (comma-separated) + expect_type(bb_data$blockSizes, "character") + expect_type(bb_data$chromStarts, "character") + + # Verify integer values are correct (not NA or corrupted) + expect_true(all(bb_data$score >= 0)) + expect_true(all(bb_data$blockCount > 0)) + expect_true(all(bb_data$thickStart >= bb_data$start)) + expect_true(all(bb_data$thickEnd <= bb_data$end)) +})