Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -106,5 +106,6 @@ Authors@R: c(
person(given="@badasahog", role="ctb", comment="GitHub user"),
person("Vinit", "Thakur", role="ctb"),
person("Mukul", "Kumar", role="ctb"),
person("Ildikó", "Czeller", role="ctb")
person("Ildikó", "Czeller", role="ctb"),
person("Manmita", "Das", role="ctb")
)
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@

1. `nafill()`, `setnafill()` extended to work on logical vectors (part of [#3992](https://github.com/Rdatatable/data.table/issues/3992)). Thanks @jangorecki for the request and @MichaelChirico for the PR.

2. `[,showProgress=]` and `options(datatable.showProgress)` now accept an integer to control the progress bar update interval in seconds, allowing finer control over progress reporting frequency; `TRUE` uses the default 3-second interval, [#6514](https://github.com/Rdatatable/data.table/issues/6514). Thanks @ethanbsmith for the report and @ben-schwen for the PR.

3. `tables()` can now optionally report `data.table` objects stored one level deep inside list objects when `list_search=TRUE`, with `list_len_threshold` to avoid scanning extremely long lists, [#2606](https://github.com/Rdatatable/data.table/issues/2606).

### Notes

1. {data.table} now depends on R 3.5.0 (2018).
Expand All @@ -32,6 +36,8 @@

3. `fread("file://...")` works for file URIs with spaces, [#7550](https://github.com/Rdatatable/data.table/issues/7550). Thanks @aitap for the report and @MichaelChirico for the PR.

4. `sum(<int64 column>)` by group is correct with missing entries and GForce activated ([#7571](https://github.com/Rdatatable/data.table/issues/7571)). Thanks to @rweberc for the report and @manmita for the fix. The issue was caused by a faulty early `break` that spilled between groups, and resulted in silently incorrect results!

## data.table [v1.18.0](https://github.com/Rdatatable/data.table/milestone/37?closed=1) 23 December 2025

### BREAKING CHANGE
Expand Down
4 changes: 2 additions & 2 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ replace_dot_alias = function(e) {
if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which)
if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.")
if (!with && missing(j)) stopf("j must be provided when with=FALSE")
if (!missing(by) && !isTRUEorFALSE(showProgress)) stopf("%s must be TRUE or FALSE", "showProgress")
if (!missing(by) && !(isTRUEorFALSE(showProgress) || (is.numeric(showProgress) && length(showProgress)==1L && showProgress >= 0))) stopf("showProgress must be TRUE, FALSE, or a single non-negative number") # nocov
irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency.
notjoin = FALSE
rightcols = leftcols = integer()
Expand Down Expand Up @@ -1972,7 +1972,7 @@ replace_dot_alias = function(e) {
}
ans = c(g, ans)
} else {
ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose, showProgress)
ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose, as.integer(showProgress))
}
# unlock any locked data.table components of the answer, #4159
# MAX_DEPTH prevents possible infinite recursion from truly recursive object, #4173
Expand Down
94 changes: 82 additions & 12 deletions R/tables.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,28 +19,98 @@ type_size = function(DT) {
}

tables = function(mb=type_size, order.col="NAME", width=80L,
env=parent.frame(), silent=FALSE, index=FALSE)
env=parent.frame(), silent=FALSE, index=FALSE,
list_search=FALSE, list_len_threshold=100000L)
{
# Prints name, size and colnames of all data.tables in the calling environment by default
mb_name = as.character(substitute(mb))
if (isTRUE(mb)) { mb=type_size; mb_name="type_size" }
names = ls(envir=env, all.names=TRUE) # include "hidden" objects (starting with .)
obj = mget(names, envir=env) # doesn't copy; mget is ok with ... unlike get, #5197
w = which(vapply_1b(obj, is.data.table))
if (!length(w)) {

list_info = NULL
if (list_search) {
is_list = vapply_1b(obj, is.list)
dt_mask = logical(length(obj))
if (length(w)) dt_mask[w] = TRUE
is_df = vapply_1b(obj, is.data.frame)
list_candidates = which(is_list & !dt_mask & !is_df)

if (length(list_candidates)) {
n_extra = 0L
list_locations = vector("list", length(list_candidates))
for (j in seq_along(list_candidates)) {
obj_idx = list_candidates[j]
L = obj[[obj_idx]]
if (!is.list(L)) next
lenL = length(L)
if (is.finite(list_len_threshold) && lenL > list_len_threshold) next
elem_is_dt = vapply_1b(L, is.data.table)
idx = which(elem_is_dt)
if (length(idx)) {
list_locations[[j]] = idx
n_extra = n_extra + length(idx)
}
}

if (n_extra) {
list_info = data.table(NAME=character(n_extra), NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list())
extra_row = 0L
for (j in seq_along(list_candidates)) {
obj_idx = list_candidates[j]
idx = list_locations[[j]]
if (is.null(idx) || !length(idx)) next
L = obj[[obj_idx]]
obj_name = names[obj_idx]
elem_names = names(L)
for (k in idx) {
extra_row = extra_row + 1L
DT = L[[k]]
if (!is.null(elem_names) && length(elem_names) >= k && nzchar(elem_names[k])) {
nm = paste0(obj_name, "$", elem_names[k])
} else {
nm = paste0(obj_name, "[[", k, "]]")
}
set(list_info, extra_row, "NAME", nm)
set(list_info, extra_row, "NROW", nrow(DT))
set(list_info, extra_row, "NCOL", ncol(DT))
if (is.function(mb)) set(list_info, extra_row, "MB", as.integer(mb(DT)/1048576L))
if (!is.null(tt<-names(DT))) set(list_info, extra_row, "COLS", tt)
if (!is.null(tt<-key(DT))) set(list_info, extra_row, "KEY", tt)
if (index && !is.null(tt<-indices(DT))) set(list_info, extra_row, "INDICES", tt)
}
}
}
}
}

info = NULL
if (length(w)) {
info = data.table(NAME=names[w], NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list())
for (i in seq_along(w)) { # avoid rbindlist(lapply(DT_names)) in case of a large number of tables
DT = obj[[w[i]]]
set(info, i, "NROW", nrow(DT))
set(info, i, "NCOL", ncol(DT))
if (is.function(mb)) set(info, i, "MB", as.integer(mb(DT)/1048576L)) # i.e. 1024**2
if (!is.null(tt<-names(DT))) set(info, i, "COLS", tt) # TODO: don't need these if()s when #5526 is done
if (!is.null(tt<-key(DT))) set(info, i, "KEY", tt)
if (index && !is.null(tt<-indices(DT))) set(info, i, "INDICES", tt)
}
}

if (!is.null(list_info) && nrow(list_info)) {
if (is.null(info)) {
info = list_info
} else {
info = rbind(info, list_info)
}
}

if (is.null(info) || !nrow(info)) {
if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env))
return(invisible(data.table(NULL)))
}
info = data.table(NAME=names[w], NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list())
for (i in seq_along(w)) { # avoid rbindlist(lapply(DT_names)) in case of a large number of tables
DT = obj[[w[i]]]
set(info, i, "NROW", nrow(DT))
set(info, i, "NCOL", ncol(DT))
if (is.function(mb)) set(info, i, "MB", as.integer(mb(DT)/1048576L)) # i.e. 1024**2
if (!is.null(tt<-names(DT))) set(info, i, "COLS", tt) # TODO: don't need these if()s when #5526 is done
if (!is.null(tt<-key(DT))) set(info, i, "KEY", tt)
if (index && !is.null(tt<-indices(DT))) set(info, i, "INDICES", tt)
}
if (!is.function(mb)) info[,MB:=NULL]
if (!index) info[,INDICES:=NULL]
if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col)
Expand Down
68 changes: 68 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -21978,3 +21978,71 @@ local({
test(2357.1, fread(f), DT)
test(2357.2, fread(paste0("file://", f)), DT)
})

#7571 issue for na.rm on int64
if (test_bit64) local({
# integer64 + GForce grouped sum with na.rm = FALSE
# Example 1 from issue: ids 1:8, 9, 9; three leading NAs then 4:10
dt_short = data.table(
id = c(1:8, 9, 9),
value = c(rep(NA_integer64_, 3L), as.integer64(4:10))
)
test(2358.1, options=c(datatable.optimize=2L),
dt_short[, sum(value, na.rm = FALSE), by = id]$V1,
as.integer64(c(NA, NA, NA, 4:8, 19))
)

# Example 2 from issue: ids in pairs, same values; checks multi-row groups
dt_short2 = data.table(
id = rep(1:5, each = 2L),
value = c(rep(NA_integer64_, 3L), as.integer64(4:10))
)
test(2358.2, options=c(datatable.optimize=2L),
dt_short2[, sum(value, na.rm = FALSE), by = id]$V1,
as.integer64(c(NA, NA, 11, 15, 19))
)

# Test mean for integer64 with NA
dt_mean = data.table(
id = c(1,1,2,2,3,3),
value = as.integer64(c(NA, NA, NA, 20000000, 5, 3))
)
test(2358.3, options=c(datatable.optimize=2L),
dt_mean[, mean(value, na.rm=FALSE), by = id]$V1,
c(NA, NA, 4)
)

# GForce sum vs base::sum for integer64
DT = data.table(id = sample(letters, 1000, TRUE), value = as.integer64(sample(c(1:100, NA), 1000, TRUE)))
gforce = DT[, .(gforce_sum = sum(value)), by=id]
base = DT[, .(true_sum = base::sum(value)), by=id]
merged = merge(gforce, base, by="id", all=TRUE)
test(2358.4, options=c(datatable.optimize=2L),
merged$gforce_sum, merged$true_sum
)

# GForce mean vs base::mean for integer64
DTm = data.table(id = sample(letters, 1000, TRUE), value = as.integer64(sample(c(1:100, NA), 1000, TRUE)))
gforce_m = DTm[, .(gforce_mean = mean(value)), by=id]
base_m = DTm[, .(true_mean = base::mean(value)), by=id]
merged_m = merge(gforce_m, base_m, by="id", all=TRUE)
test(2358.5, options=c(datatable.optimize=2L),
merged$gforce_mean, merged$true_mean
)
})

#2606 tables() list_search finds nested data.tables in lists
xenv2 = new.env()
xenv2$L = list(
data.table(a = 1L),
data.table(a = 1:2)
)
test(2360.1,
tables(env = xenv2, silent = TRUE, list_search = TRUE)[, .(NAME, NROW, NCOL)],
data.table(NAME = c("L[[1]]", "L[[2]]"), NROW = c(1L, 2L), NCOL = c(1L, 1L))
)
test(2360.2,
nrow(tables(env = xenv2, silent = TRUE, list_search = TRUE, list_len_threshold = 1L)),
0L
)
rm(xenv2)
2 changes: 1 addition & 1 deletion man/data.table.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac

\item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. For more details see \href{../doc/datatable-programming.html}{\code{vignette("datatable-programming")}}. }

\item{showProgress}{ \code{TRUE} shows progress indicator with estimated time to completion for lengthy "by" operations. }
\item{showProgress}{ \code{TRUE} (default when \code{interactive()}) shows a progress indicator with estimated time to completion for lengthy "by" operations, updating every 3 seconds. An integer value controls the update interval in seconds (minimum 3). \code{FALSE} disables the progress indicator. }
}
\details{
\code{data.table} builds on base \R functionality to reduce 2 types of time:\cr
Expand Down
11 changes: 6 additions & 5 deletions src/dogroups.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,10 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
SEXP SDall = PROTECT(findVar(install(".SDall"), env)); nprotect++; // PROTECT for rchk
SEXP SD = PROTECT(findVar(install(".SD"), env)); nprotect++;

const bool showProgress = LOGICAL(showProgressArg)[0]==1 && ngrp > 1; // showProgress only if more than 1 group
int updateTime = INTEGER(showProgressArg)[0];
const bool showProgress = updateTime > 0 && ngrp > 1; // showProgress only if more than 1 group
double startTime = (showProgress) ? wallclock() : 0; // For progress printing, startTime is set at the beginning
double nextTime = (showProgress) ? startTime+3 : 0; // wait 3 seconds before printing progress
double nextTime = (showProgress) ? startTime + MAX(updateTime, 3) : 0; // wait at least 3 seconds before starting to print progress

hashtab * specials = hash_create(3 + ngrpcols + xlength(SDall)); // .I, .N, .GRP plus columns of .BY plus SDall
PROTECT(specials->prot); nprotect++;
Expand Down Expand Up @@ -451,17 +452,17 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
// could potentially refactor to use fread's progress() function, however we would lose some information in favor of simplicity.
double now;
if (showProgress && (now=wallclock())>=nextTime) {
// # nocov start. Requires long-running test case
double avgTimePerGroup = (now-startTime)/(i+1);
int ETA = (int)(avgTimePerGroup*(ngrp-i-1));
if (hasPrinted || ETA >= 0) {
// # nocov start. Requires long-running test case
if (verbose && !hasPrinted) Rprintf(_("\n"));
Rprintf("\r"); // # notranslate. \r is not internationalizable
Rprintf(_("Processed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds."), i+1, ngrp, 100.0*(i+1)/ngrp, (int)(now-startTime), ETA);
// # nocov end
}
nextTime = now+1;
nextTime = now+updateTime;
hasPrinted = true;
// # nocov end
}
ansloc += maxn;
if (firstalloc) {
Expand Down
12 changes: 6 additions & 6 deletions src/gsumm.c
Original file line number Diff line number Diff line change
Expand Up @@ -502,13 +502,13 @@ SEXP gsum(SEXP x, SEXP narmArg)
const int64_t *my_gx = gx + b*batchSize + pos;
const uint16_t *my_low = low + b*batchSize + pos;
for (int i=0; i<howMany; i++) {
const int64_t elem = my_gx[i];
if (elem!=INT64_MIN) {
_ans[my_low[i]] += elem;
} else {
_ans[my_low[i]] = INT64_MIN;
break;
if (_ans[my_low[i]] == INT64_MIN) continue;
const int64_t b = my_gx[i];
if (b == INT64_MIN) {
if (!narm) _ans[my_low[i]] = INT64_MIN;
continue;
}
_ans[my_low[i]] += b;
}
}
}
Expand Down