From 45dc9e708c8db7b45b3004823c51746166a3b87e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 25 Oct 2025 18:43:10 +0200 Subject: [PATCH 01/21] add requires_utf8 argument to tests --- R/test.data.table.R | 25 +++++- inst/tests/tests.Rraw | 187 ++++++++++++++++++++++++------------------ 2 files changed, 131 insertions(+), 81 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 6e264c871f..af7283e9bd 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -361,7 +361,7 @@ gc_mem = function() { # nocov end } -test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL) { +test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,requires_utf8=FALSE) { if (!is.null(env)) { old = Sys.getenv(names(env), names=TRUE, unset=NA) to_unset = !lengths(env) @@ -375,6 +375,29 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no Sys.unsetenv(names(old)[!is_preset]) }, add=TRUE) } + # Check UTF-8 requirement + if (requires_utf8) { + utf8_available = l10n_info()$`UTF-8` || { + lc_ctype = Sys.getlocale('LC_CTYPE') + lc_wantctype = 'en_US.UTF-8' + # Japanese multibyte characters require utf8. As of 2025, we're likely to be already running in a UTF-8 locale, but if not, try this setlocale() call as a last chance. + # Unfortunately, there is no guaranteed, portable way of switching to UTF-8 US English. + # Avoid the warning upon possible failure, #7210. + lc_newctype = suppressWarnings(Sys.setlocale('LC_CTYPE', lc_wantctype)) + if (identical(lc_newctype, lc_wantctype)) { + on.exit(Sys.setlocale('LC_CTYPE', lc_ctype), add=TRUE) + TRUE + } else FALSE + } + if (!utf8_available) { + last_utf8_skip = get0("last_utf8_skip", parent.frame(), ifnotfound=0, inherits=TRUE) + if (num - last_utf8_skip >= 1) { + catf("Test %s skipped because it needs a UTF-8 locale.\n", num) + } + assign("last_utf8_skip", num, parent.frame(), inherits=TRUE) + return(invisible(TRUE)) + } + } # Usage: # i) tests that x equals y when both x and y are supplied, the most common usage # ii) tests that x is TRUE when y isn't supplied diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 609977b991..9edc00a7d3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3568,7 +3568,28 @@ DT[,`:=`(last.x=tail(x,1L),last.x1=tail(x1,1L)),by=y] test(1086, class(DT$last.x), c("POSIXct", "POSIXt")) test(1087, class(DT$last.x1), "ITime") -# Tests 1088-1093 were non-ASCII. Now in DtNonAsciiTests +# chmatch on 'unknown' encoding (e.g. 
as.character(as.symbol("\u00E4")) )falling back to match, #2538 and #4818 +x1 <- c("al\u00E4", "ala", "\u00E4allc", "coep") +x2 <- c("ala", "al\u00E4") +test(1088.1, requires_utf8=TRUE, chmatch(x1, x2), match(x1, x2)) # should not fallback to "match" +test(1088.2, requires_utf8=TRUE, x1 %chin% x2, x1 %in% x2) +# change x1 to symbol to character +x3 <- unlist(lapply(x1, function(x) as.character(as.name(x))), use.names=FALSE) +test(1089.1, requires_utf8=TRUE, chmatch(x3, x2), match(x3, x2)) # should fallback to match in "x" +test(1089.2, requires_utf8=TRUE, x3 %chin% x2, x3 %in% x2) # should fallback to match in "x" +# change x2 to symbol to character +x4 <- unlist(lapply(x2, function(x) as.character(as.name(x))), use.names=FALSE) +test(1090.1, requires_utf8=TRUE, chmatch(x1,x4), match(x1, x4)) # should fallback to match in "table" +test(1090.2, requires_utf8=TRUE, x1 %chin% x4, x1 %in% x4) +# both are symbols to characters +test(1091.1, requires_utf8=TRUE, chmatch(x3, x4), match(x3, x4)) # should fallback to "match" in "x" as well. +test(1091.2, requires_utf8=TRUE, x3 %chin% x4, x3 %in% x4) +# for completness, include test from #2528 of non ascii LHS of := (it could feasibly fail in future due to something other than chmatch) + +DT = data.table(pas = c(1:5, NA, 6:10), good = c(1:10, NA)) +setnames(DT, "pas", "p\u00E4s") +test(1092, requires_utf8=TRUE, eval(parse(text="DT[is.na(p\u00E4s), p\u00E4s := 99L]")), data.table("p\u00E4s" = c(1:5, 99L, 6:10), good = c(1:10,NA))) +test(1093, requires_utf8=TRUE, eval(parse(text="DT[, p\u00E4s := 34L]")), data.table("p\u00E4s" = 34L, good=c(1:10,NA))) # print of unnamed DT with >20 <= 100 rows, #97 (RF#4934) DT <- data.table(x=1:25, y=letters[1:25]) @@ -4320,7 +4341,10 @@ test(1162.24, is.sorted(rep(NA_character_, 2))) x <- character(0) test(1163, last(x), character(0)) -# Test 1164 was a non-ASCII test, now in DtNonAsciiTests +# Bug fix for #5159 - chmatch and character encoding (for some reason this seems to pass the test on a mac as well) +a<-c("a","\u00E4","\u00DF","z") +au<-iconv(a,"UTF8","latin1") +test(1164.1, requires_utf8=TRUE, chmatch(a, au), match(a, au)) # Bug fix for #73 - segfault when rbindlist on empty data.tables x <- as.data.table(BOD) @@ -4606,7 +4630,22 @@ test(1228.4, class(DT), class(DT[, sum(b), by=a])) test(1228.5, class(DT), class(DT[a>1, sum(b), by=a])) test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a])) -# test 1229 was non-ASCII, now in package DtNonAsciiTests +# savetl_init error after error, in v1.9.2, thanks Arun +DT <- data.table(x=1:5, y=10:6) +test(1229.1, DT[forderv(DT, -1)], error="non-existing column") +test(1229.2, setkey(DT), data.table(x=1:5, y=10:6, key="x,y")) +# umlaut in column names (red herring I think, but testing anyway) +sentEx = data.table(abend = c(1, 1, 0, 0, 2), + aber = c(0, 1, 0, 0, 0), + "\u00FCber" = c(1, 0, 0, 0, 0), + "\u00FCberall" = c(0, 0, 0, 0, 0), + "\u00FCberlegt" = c(0, 0, 0, 0, 0), + ID = structure(c(1L, 1L, 2L, 2L, 2L), .Label = c("0019", "0021"), class = "factor"), + abgeandert = c(1, 1, 1, 0, 0), + abgebildet = c(0, 0, 1, 1, 0), + abgelegt = c(0, 0, 0, 0, 3)) +test(1229.3, requires_utf8=TRUE, sentEx[, lapply(.SD, sum), by=ID], data.table(ID=factor(c("0019","0021")), abend=c(2,2), aber=c(1,0), "\u00FCber"=c(1,0), + "\u00FCberall"=c(0,0), "\u00FCberlegt" = c(0,0), abgeandert=c(2,1), abgebildet = c(0,2), abgelegt=c(0,3))) # Test that ad hoc by detects if ordered and dogroups switches to memcpy if contiguous, #1050 DT = data.table(a=1:3,b=1:6,key="a") @@ -17655,11 +17694,7 @@ 
test(2194.5, endsWithAny(NA_character_, 'a'), FALSE) test(2194.6, endsWithAny(character(), 'a'), error="Internal error.*types or lengths incorrect") # file used in encoding tests txt = readLines(testDir("issue_563_fread.txt")) -local(if (eval(utf8_check_expr)) { - test(2194.7, endsWithAny(txt, 'B'), error="Internal error.*types or lengths incorrect") # txt is length 5 -} else { - cat("Test 2194.7 skipped because it needs a UTF-8 locale.\n") -}) +test(2194.7, requires_utf8=TRUE, endsWithAny(txt, 'B'), error="Internal error.*types or lengths incorrect") # txt is length 5 test(2194.8, endsWith('abcd', 'd'), error="Internal error.*use endsWithAny") # uniqueN(x, by=character()) was internal error, #4594 @@ -18641,59 +18676,55 @@ test(2252.2, dt[, let(b=2L)], error = "\\[ was called on a data.table.*not data. rm(.datatable.aware) # tests for trunc.char handling wide characters #5096 -local(if (eval(utf8_check_expr)) { - accented_a = "\u0061\u0301" - ja_ichi = "\u4E00" - ja_ni = "\u4E8C" - ja_ko = "\u3053" - ja_n = "\u3093" - dots = "..." - clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output - # Tests for combining character latin a and acute accent, single row - DT = data.table(strrep(accented_a, 4L)) - test(2253.01, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L)) - test(2253.02, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots)) - test(2253.03, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots)) - # Tests for full-width japanese character ichi, single row - DT = data.table(strrep(ja_ichi, 4L)) - test(2253.04, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L)) - test(2253.05, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots)) - test(2253.06, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots)) - # Tests for multiple, different length combining character rows - DT = data.table(strrep(accented_a, 1L:4L)) - test(2253.07, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 1:4L)) - test(2253.08, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(accented_a, 1:3), paste0(strrep(accented_a, 3L), dots))) - test(2253.09, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(accented_a, rep(paste0(accented_a, dots), 3L))) - # Tests for multiple, different length full-width characters - DT = data.table(strrep(ja_ichi, 1L:4L)) - test(2253.10, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(ja_ichi, 1:4L)) - test(2253.11, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(ja_ichi, 1:3), paste0(strrep(ja_ichi, 3L), dots))) - test(2253.12, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(ja_ichi, rep(paste0(ja_ichi, dots), 3L))) - # Tests for combined characters, multiple columns - DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa") - test(2253.13, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")) - test(2253.14, 
options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")) - test(2253.15, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2), paste0(strrep(ja_ko, 2), dots) , strrep(accented_a, 2), "aa...")) - test(2253.16, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, paste0(ja_ni, dots), paste0(ja_ko, dots), paste0(accented_a, dots), "a...")) - # Tests for multiple columns, multiple rows - DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3)) - test(2253.17, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), - c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)), - paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)), - paste(strrep(ja_ko, 3L), strrep(ja_n, 4L), strrep(accented_a, 3L)))) - test(2253.18, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), - c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)), - paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)), - paste(strrep(ja_ko, 3L), paste0(strrep(ja_n, 3L), dots), strrep(accented_a, 3L)))) - test(2253.19, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), - c(paste0(ja_ko, " ", paste0(ja_n, dots), " ", paste0(accented_a, dots)), - paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "), - paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "))) - # test for data.table with NA, #6441 - test(2253.20, options=list(datatable.prettyprint.char = 1L), data.table(a = c("abc", NA)), output=" a\n1: a...\n2: ") -} else { - cat("Tests 2253* skipped because they need a UTF-8 locale.\n") -}) +accented_a = "\u0061\u0301" +ja_ichi = "\u4E00" +ja_ni = "\u4E8C" +ja_ko = "\u3053" +ja_n = "\u3093" +dots = "..." 
+clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output +# Tests for combining character latin a and acute accent, single row +DT = data.table(strrep(accented_a, 4L)) +test(2253.01, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L)) +test(2253.02, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots)) +test(2253.03, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots)) +# Tests for full-width japanese character ichi, single row +DT = data.table(strrep(ja_ichi, 4L)) +test(2253.04, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L)) +test(2253.05, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots)) +test(2253.06, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots)) +# Tests for multiple, different length combining character rows +DT = data.table(strrep(accented_a, 1L:4L)) +test(2253.07, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 1:4L)) +test(2253.08, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(accented_a, 1:3), paste0(strrep(accented_a, 3L), dots))) +test(2253.09, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(accented_a, rep(paste0(accented_a, dots), 3L))) +# Tests for multiple, different length full-width characters +DT = data.table(strrep(ja_ichi, 1L:4L)) +test(2253.10, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(ja_ichi, 1:4L)) +test(2253.11, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(ja_ichi, 1:3), paste0(strrep(ja_ichi, 3L), dots))) +test(2253.12, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(ja_ichi, rep(paste0(ja_ichi, dots), 3L))) +# Tests for combined characters, multiple columns +DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa") +test(2253.13, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")) +test(2253.14, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")) +test(2253.15, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2), paste0(strrep(ja_ko, 2), dots) , strrep(accented_a, 2), "aa...")) +test(2253.16, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, paste0(ja_ni, dots), paste0(ja_ko, dots), paste0(accented_a, dots), "a...")) +# Tests for multiple columns, multiple rows +DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3)) +test(2253.17, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 4L), 
gsub(clean_regex, "", capture.output(print(DT))[-1L]), + c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)), + paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)), + paste(strrep(ja_ko, 3L), strrep(ja_n, 4L), strrep(accented_a, 3L)))) +test(2253.18, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), + c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)), + paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)), + paste(strrep(ja_ko, 3L), paste0(strrep(ja_n, 3L), dots), strrep(accented_a, 3L)))) +test(2253.19, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), + c(paste0(ja_ko, " ", paste0(ja_n, dots), " ", paste0(accented_a, dots)), + paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "), + paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "))) +# test for data.table with NA, #6441 +test(2253.20, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), data.table(a = c("abc", NA)), output=" a\n1: a...\n2: ") # allow 1-D matrix in j for consistency, #783 DT=data.table(a = rep(1:2, 3), b = 1:6) @@ -20830,21 +20861,17 @@ x = data.table(a=1, b=2L) y = data.table(c=1.5, d=1L) test(2297.31, y[x, on=.(c == a, d == a), nomatch=NULL], output="Empty data.table (0 rows and 3 cols): c,d,b") -local(if (eval(utf8_check_expr)) { - # rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452 - x = data.table(a = 1, b = 2, c = 3) - y = data.table(x = 4, y = 5, z = 6) - # a-umlaut, o-umlaut, u-umlaut - setnames(x , c("\u00e4", "\u00f6", "\u00fc")) - setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")) - test(2298.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) - test(2298.2, rbindlist(list(y,x), use.names=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(6,1))) - set(y, j="\u00e4", value=NULL) - test(2298.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table("\u00e4"=c(1,NA), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) - test(2298.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(NA,1))) -} else { - cat("Tests 2298.* skipped because they need a UTF-8 locale.\n") -}) +# rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452 +x = data.table(a = 1, b = 2, c = 3) +y = data.table(x = 4, y = 5, z = 6) +# a-umlaut, o-umlaut, u-umlaut +setnames(x , c("\u00e4", "\u00f6", "\u00fc")) +setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")) +test(2298.1, requires_utf8=TRUE, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) +test(2298.2, requires_utf8=TRUE, rbindlist(list(y,x), use.names=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(6,1))) +set(y, j="\u00e4", value=NULL) +test(2298.3, requires_utf8=TRUE, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table("\u00e4"=c(1,NA), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) +test(2298.4, requires_utf8=TRUE, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(NA,1))) # #6592: printing nested single-column frames test(2299.01, format_list_item(data.frame(a=1)), output="") @@ -21615,13 +21642,13 @@ if (base::getRversion() >= "4.3.0") { ## follow up of #7213, see #7321 } # fwrite: allow dec=',' with single 
column, #7227 -test(2337.1, fwrite(data.table(1), dec=","), NULL) +test(2337.1, fwrite(data.table(1), dec=","), output = "V1\n1") if (base::getRversion() >= "4.0.0") { # rely on stopifnot(named = ...) for correct message test(2337.2, fwrite(data.table(0.1, 0.2), dec=",", sep=","), error = "dec and sep must be distinct") } -test(2337.3, is.null(fwrite(data.table(c(0.1, 0.2)), dec=",", sep="\t"))) -test(2337.4, is.null(fwrite(data.table(a=numeric(), b=numeric()), dec=",", sep=","))) -test(2337.5, is.null(fwrite(data.table(a=numeric()), dec=",", sep=","))) +test(2337.3, fwrite(data.table(c(0.1, 0.2)), dec=",", sep="\t"), output = "V1\n0,1\n0,2") +test(2337.4, fwrite(data.table(a=numeric(), b=numeric()), dec=",", sep=","), output = "a,b") +test(2337.5, fwrite(data.table(a=numeric()), dec=",", sep=","), output = "a") # 2864 force decimal points for whole numbers in numeric columns dd = data.table(x=c(1, 2, 3)) From e9ecf6900384cd4c402b88c53e23fcc20cf9e60b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 25 Oct 2025 20:48:00 +0200 Subject: [PATCH 02/21] fix warnings of sys.source --- R/test.data.table.R | 25 ++++--- inst/tests/tests.Rraw | 149 +++++++++++++++++++++++------------------- 2 files changed, 93 insertions(+), 81 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index af7283e9bd..61078a2996 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -361,6 +361,8 @@ gc_mem = function() { # nocov end } +utf8_check = function(test_str) identical(test_str, enc2native(test_str)) + test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,requires_utf8=FALSE) { if (!is.null(env)) { old = Sys.getenv(names(env), names=TRUE, unset=NA) @@ -376,23 +378,18 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no }, add=TRUE) } # Check UTF-8 requirement - if (requires_utf8) { - utf8_available = l10n_info()$`UTF-8` || { - lc_ctype = Sys.getlocale('LC_CTYPE') - lc_wantctype = 'en_US.UTF-8' - # Japanese multibyte characters require utf8. As of 2025, we're likely to be already running in a UTF-8 locale, but if not, try this setlocale() call as a last chance. - # Unfortunately, there is no guaranteed, portable way of switching to UTF-8 US English. - # Avoid the warning upon possible failure, #7210. 
- lc_newctype = suppressWarnings(Sys.setlocale('LC_CTYPE', lc_wantctype)) - if (identical(lc_newctype, lc_wantctype)) { - on.exit(Sys.setlocale('LC_CTYPE', lc_ctype), add=TRUE) - TRUE - } else FALSE + if (!isFALSE(requires_utf8)) { + # Test string with common UTF-8 symbols that appear in tests: ñ (test 2266), ü (test 1229.3), ん (Japanese) + # If requires_utf8 is TRUE, use the default test string; if it's a string, use that string + if (isTRUE(requires_utf8)) { + test_str = "a\u00F1o \u00FCber \u3093" + } else { + test_str = requires_utf8 } - if (!utf8_available) { + if (!utf8_check(test_str)) { last_utf8_skip = get0("last_utf8_skip", parent.frame(), ifnotfound=0, inherits=TRUE) if (num - last_utf8_skip >= 1) { - catf("Test %s skipped because it needs a UTF-8 locale.\n", num) + catf("Test %s skipped because required UTF-8 symbols cannot be represented in native encoding.\n", num) } assign("last_utf8_skip", num, parent.frame(), inherits=TRUE) return(invisible(TRUE)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9edc00a7d3..58f5687b09 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3569,27 +3569,30 @@ test(1086, class(DT$last.x), c("POSIXct", "POSIXt")) test(1087, class(DT$last.x1), "ITime") # chmatch on 'unknown' encoding (e.g. as.character(as.symbol("\u00E4")) )falling back to match, #2538 and #4818 -x1 <- c("al\u00E4", "ala", "\u00E4allc", "coep") -x2 <- c("ala", "al\u00E4") -test(1088.1, requires_utf8=TRUE, chmatch(x1, x2), match(x1, x2)) # should not fallback to "match" -test(1088.2, requires_utf8=TRUE, x1 %chin% x2, x1 %in% x2) +x1 = c("al\u00E4", "ala", "\u00E4allc", "coep") +x2 = c("ala", "al\u00E4") +tstc = function(y) unlist(lapply(y, function(x) as.character(as.name(x))), use.names=FALSE) +test(1088.1, requires_utf8="\u00E4", chmatch(x1, x2), match(x1, x2)) # should not fallback to "match" +test(1088.2, requires_utf8="\u00E4", x1 %chin% x2, x1 %in% x2) # change x1 to symbol to character -x3 <- unlist(lapply(x1, function(x) as.character(as.name(x))), use.names=FALSE) -test(1089.1, requires_utf8=TRUE, chmatch(x3, x2), match(x3, x2)) # should fallback to match in "x" -test(1089.2, requires_utf8=TRUE, x3 %chin% x2, x3 %in% x2) # should fallback to match in "x" +test(1089.1, requires_utf8="\u00E4", chmatch(tstc(x1), x2), match(tstc(x1), x2)) # should fallback to match in "x" +test(1089.2, requires_utf8="\u00E4", tstc(x1) %chin% x2, tstc(x1) %in% x2) # should fallback to match in "x" # change x2 to symbol to character -x4 <- unlist(lapply(x2, function(x) as.character(as.name(x))), use.names=FALSE) -test(1090.1, requires_utf8=TRUE, chmatch(x1,x4), match(x1, x4)) # should fallback to match in "table" -test(1090.2, requires_utf8=TRUE, x1 %chin% x4, x1 %in% x4) +test(1090.1, requires_utf8="\u00E4", chmatch(x1,tstc(x2)), match(x1, tstc(x2))) # should fallback to match in "table" +test(1090.2, requires_utf8="\u00E4", x1 %chin% tstc(x2), x1 %in% tstc(x2)) # both are symbols to characters -test(1091.1, requires_utf8=TRUE, chmatch(x3, x4), match(x3, x4)) # should fallback to "match" in "x" as well. -test(1091.2, requires_utf8=TRUE, x3 %chin% x4, x3 %in% x4) +test(1091.1, requires_utf8="\u00E4", chmatch(tstc(x1), tstc(x2)), match(tstc(x1), tstc(x2))) # should fallback to "match" in "x" as well. 
+test(1091.2, requires_utf8="\u00E4", tstc(x1) %chin% tstc(x2), tstc(x1) %in% tstc(x2)) # for completness, include test from #2528 of non ascii LHS of := (it could feasibly fail in future due to something other than chmatch) -DT = data.table(pas = c(1:5, NA, 6:10), good = c(1:10, NA)) -setnames(DT, "pas", "p\u00E4s") -test(1092, requires_utf8=TRUE, eval(parse(text="DT[is.na(p\u00E4s), p\u00E4s := 99L]")), data.table("p\u00E4s" = c(1:5, 99L, 6:10), good = c(1:10,NA))) -test(1093, requires_utf8=TRUE, eval(parse(text="DT[, p\u00E4s := 34L]")), data.table("p\u00E4s" = 34L, good=c(1:10,NA))) +local(if (utf8_check("\u00E4")) { +eval(parse(text=' + DT = data.table(pas = c(1:5, NA, 6:10), good = c(1:10, NA)) + setnames(DT, "pas", "p\u00E4s") + test(1092, requires_utf8="\u00E4", eval(parse(text="DT[is.na(p\u00E4s), p\u00E4s := 99L]")), data.table("p\u00E4s" = c(1:5, 99L, 6:10), good = c(1:10,NA))) + test(1093, requires_utf8="\u00E4", eval(parse(text="DT[, p\u00E4s := 34L]")), data.table("p\u00E4s" = 34L, good=c(1:10,NA))) +')) +} else cat("Tests 1092+1093 skipped because required UTF-8 symbols cannot be represented in native encoding.\n")) # print of unnamed DT with >20 <= 100 rows, #97 (RF#4934) DT <- data.table(x=1:25, y=letters[1:25]) @@ -4343,8 +4346,8 @@ test(1163, last(x), character(0)) # Bug fix for #5159 - chmatch and character encoding (for some reason this seems to pass the test on a mac as well) a<-c("a","\u00E4","\u00DF","z") -au<-iconv(a,"UTF8","latin1") -test(1164.1, requires_utf8=TRUE, chmatch(a, au), match(a, au)) + au<-iconv(a,"UTF8","latin1") + test(1164.1, requires_utf8=c("\u00E4", "\u00DF"), chmatch(a, au), match(a, au)) # Bug fix for #73 - segfault when rbindlist on empty data.tables x <- as.data.table(BOD) @@ -4634,18 +4637,24 @@ test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a])) DT <- data.table(x=1:5, y=10:6) test(1229.1, DT[forderv(DT, -1)], error="non-existing column") test(1229.2, setkey(DT), data.table(x=1:5, y=10:6, key="x,y")) -# umlaut in column names (red herring I think, but testing anyway) -sentEx = data.table(abend = c(1, 1, 0, 0, 2), - aber = c(0, 1, 0, 0, 0), - "\u00FCber" = c(1, 0, 0, 0, 0), - "\u00FCberall" = c(0, 0, 0, 0, 0), - "\u00FCberlegt" = c(0, 0, 0, 0, 0), - ID = structure(c(1L, 1L, 2L, 2L, 2L), .Label = c("0019", "0021"), class = "factor"), - abgeandert = c(1, 1, 1, 0, 0), - abgebildet = c(0, 0, 1, 1, 0), - abgelegt = c(0, 0, 0, 0, 3)) -test(1229.3, requires_utf8=TRUE, sentEx[, lapply(.SD, sum), by=ID], data.table(ID=factor(c("0019","0021")), abend=c(2,2), aber=c(1,0), "\u00FCber"=c(1,0), - "\u00FCberall"=c(0,0), "\u00FCberlegt" = c(0,0), abgeandert=c(2,1), abgebildet = c(0,2), abgelegt=c(0,3))) +# umlaut in column names (red herring I think, but testing anyway +local(if (utf8_check("\u00e4\u00f6\u00fc")) { + eval(parse(text = ' + sentEx = data.table(abend = c(1, 1, 0, 0, 2), + aber = c(0, 1, 0, 0, 0), + "\u00FCber" = c(1, 0, 0, 0, 0), + "\u00FCberall" = c(0, 0, 0, 0, 0), + "\u00FCberlegt" = c(0, 0, 0, 0, 0), + ID = structure(c(1L, 1L, 2L, 2L, 2L), .Label = c("0019", "0021"), class = "factor"), + abgeandert = c(1, 1, 1, 0, 0), + abgebildet = c(0, 0, 1, 1, 0), + abgelegt = c(0, 0, 0, 0, 3)) + test(1229.3, sentEx[, lapply(.SD, sum), by=ID], data.table(ID=factor(c("0019","0021")), abend=c(2,2), aber=c(1,0), "\u00FCber"=c(1,0), + "\u00FCberall"=c(0,0), "\u00FCberlegt" = c(0,0), abgeandert=c(2,1), abgebildet = c(0,2), abgelegt=c(0,3))) + ')) +} else { + cat("Tests 1229.3 skipped because required UTF-8 symbols cannot be represented in native 
encoding.\n") +}) # Test that ad hoc by detects if ordered and dogroups switches to memcpy if contiguous, #1050 DT = data.table(a=1:3,b=1:6,key="a") @@ -7938,10 +7947,8 @@ test(1547, foo(1L, 5L, a=2L, "c"), c("2", "c")) # Fix for encoding issues in windows, #563 f = testDir("issue_563_fread.txt") -ans1 <- fread(f, sep=",", header=TRUE) -ans2 <- fread(f, sep=",", header=TRUE, encoding="UTF-8") -test(1548.1, unique(unlist(lapply(ans1, Encoding))), "unknown") -test(1548.2, unique(unlist(lapply(ans2, Encoding))), "UTF-8") +test(1548.1, requires_utf8=TRUE, unique(unlist(lapply(fread(f, sep=",", header=TRUE), Encoding))), "unknown") +test(1548.2, requires_utf8=TRUE, unique(unlist(lapply(fread(f, sep=",", header=TRUE, encoding="UTF-8"), Encoding))), "UTF-8") # 1549 moved to benchmark.Rraw, #5517 @@ -17693,8 +17700,9 @@ test(2194.4, endsWithAny(letters, 'e'), error="Internal error.*types or lengths test(2194.5, endsWithAny(NA_character_, 'a'), FALSE) test(2194.6, endsWithAny(character(), 'a'), error="Internal error.*types or lengths incorrect") # file used in encoding tests -txt = readLines(testDir("issue_563_fread.txt")) -test(2194.7, requires_utf8=TRUE, endsWithAny(txt, 'B'), error="Internal error.*types or lengths incorrect") # txt is length 5 +needed_chars = "\u0105\u017E\u016B\u012F\u0173\u0117\u0161\u0119" +txt = parse(text='readLines(testDir("issue_563_fread.txt"))') +test(2194.7, requires_utf8=needed_chars, endsWithAny(eval(txt), 'B'), error="Internal error.*types or lengths incorrect") # txt is length 5 test(2194.8, endsWith('abcd', 'd'), error="Internal error.*use endsWithAny") # uniqueN(x, by=character()) was internal error, #4594 @@ -18681,50 +18689,51 @@ ja_ichi = "\u4E00" ja_ni = "\u4E8C" ja_ko = "\u3053" ja_n = "\u3093" +nc = paste0(accented_a, ja_ichi, ja_ni, ja_ko, ja_n) dots = "..." 
clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output # Tests for combining character latin a and acute accent, single row DT = data.table(strrep(accented_a, 4L)) -test(2253.01, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L)) -test(2253.02, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots)) -test(2253.03, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots)) +test(2253.01, requires_utf8=nc, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L)) +test(2253.02, requires_utf8=nc, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots)) +test(2253.03, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots)) # Tests for full-width japanese character ichi, single row DT = data.table(strrep(ja_ichi, 4L)) -test(2253.04, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L)) -test(2253.05, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots)) -test(2253.06, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots)) +test(2253.04, requires_utf8=nc, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L)) +test(2253.05, requires_utf8=nc, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots)) +test(2253.06, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots)) # Tests for multiple, different length combining character rows DT = data.table(strrep(accented_a, 1L:4L)) -test(2253.07, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 1:4L)) -test(2253.08, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(accented_a, 1:3), paste0(strrep(accented_a, 3L), dots))) -test(2253.09, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(accented_a, rep(paste0(accented_a, dots), 3L))) +test(2253.07, requires_utf8=nc, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 1:4L)) +test(2253.08, requires_utf8=nc, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(accented_a, 1:3), paste0(strrep(accented_a, 3L), dots))) +test(2253.09, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(accented_a, rep(paste0(accented_a, dots), 3L))) # Tests for multiple, different length full-width characters DT = data.table(strrep(ja_ichi, 1L:4L)) -test(2253.10, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(ja_ichi, 1:4L)) -test(2253.11, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(ja_ichi, 1:3), paste0(strrep(ja_ichi, 3L), dots))) -test(2253.12, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", 
capture.output(print(DT))[-1L]), c(ja_ichi, rep(paste0(ja_ichi, dots), 3L))) +test(2253.10, requires_utf8=nc, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(ja_ichi, 1:4L)) +test(2253.11, requires_utf8=nc, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(ja_ichi, 1:3), paste0(strrep(ja_ichi, 3L), dots))) +test(2253.12, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(ja_ichi, rep(paste0(ja_ichi, dots), 3L))) # Tests for combined characters, multiple columns DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa") -test(2253.13, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")) -test(2253.14, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")) -test(2253.15, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2), paste0(strrep(ja_ko, 2), dots) , strrep(accented_a, 2), "aa...")) -test(2253.16, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, paste0(ja_ni, dots), paste0(ja_ko, dots), paste0(accented_a, dots), "a...")) +test(2253.13, requires_utf8=nc, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")) +test(2253.14, requires_utf8=nc, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")) +test(2253.15, requires_utf8=nc, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2), paste0(strrep(ja_ko, 2), dots) , strrep(accented_a, 2), "aa...")) +test(2253.16, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, paste0(ja_ni, dots), paste0(ja_ko, dots), paste0(accented_a, dots), "a...")) # Tests for multiple columns, multiple rows DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3)) -test(2253.17, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), +test(2253.17, requires_utf8=nc, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)), paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)), paste(strrep(ja_ko, 3L), strrep(ja_n, 4L), strrep(accented_a, 3L)))) -test(2253.18, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), +test(2253.18, requires_utf8=nc, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)), paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)), paste(strrep(ja_ko, 3L), paste0(strrep(ja_n, 3L), dots), strrep(accented_a, 3L)))) 
-test(2253.19, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), +test(2253.19, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(paste0(ja_ko, " ", paste0(ja_n, dots), " ", paste0(accented_a, dots)), paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "), paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "))) # test for data.table with NA, #6441 -test(2253.20, requires_utf8=TRUE, options=list(datatable.prettyprint.char = 1L), data.table(a = c("abc", NA)), output=" a\n1: a...\n2: ") +test(2253.20, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), data.table(a = c("abc", NA)), output=" a\n1: a...\n2: ") # allow 1-D matrix in j for consistency, #783 DT=data.table(a = rep(1:2, 3), b = 1:6) @@ -20861,17 +20870,23 @@ x = data.table(a=1, b=2L) y = data.table(c=1.5, d=1L) test(2297.31, y[x, on=.(c == a, d == a), nomatch=NULL], output="Empty data.table (0 rows and 3 cols): c,d,b") -# rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452 -x = data.table(a = 1, b = 2, c = 3) -y = data.table(x = 4, y = 5, z = 6) -# a-umlaut, o-umlaut, u-umlaut -setnames(x , c("\u00e4", "\u00f6", "\u00fc")) -setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")) -test(2298.1, requires_utf8=TRUE, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) -test(2298.2, requires_utf8=TRUE, rbindlist(list(y,x), use.names=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(6,1))) -set(y, j="\u00e4", value=NULL) -test(2298.3, requires_utf8=TRUE, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table("\u00e4"=c(1,NA), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) -test(2298.4, requires_utf8=TRUE, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(NA,1))) +local(if (utf8_check("\u00e4\u00f6\u00fc")) { + # rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452 + x = data.table(a = 1, b = 2, c = 3) + y = data.table(x = 4, y = 5, z = 6) + # a-umlaut, o-umlaut, u-umlaut + eval(parse(text = ' + setnames(x , c("\u00e4", "\u00f6", "\u00fc")) + setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")) + test(2298.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) + test(2298.2, rbindlist(list(y,x), use.names=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(6,1))) + set(y, j="\u00e4", value=NULL) + test(2298.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table("\u00e4"=c(1,NA), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) + test(2298.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(NA,1))) + ')) +} else { + cat("Tests 2298.* skipped because they need a UTF-8 locale.\n") +}) # #6592: printing nested single-column frames test(2299.01, format_list_item(data.frame(a=1)), output="") From 8161adf24ff9ccf326674af5bb5961f327605cfb Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 25 Oct 2025 20:53:19 +0200 Subject: [PATCH 03/21] change typos --- R/test.data.table.R | 8 +------- inst/tests/tests.Rraw | 6 +++--- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 61078a2996..91656d8576 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -379,13 +379,7 @@ test = 
function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no } # Check UTF-8 requirement if (!isFALSE(requires_utf8)) { - # Test string with common UTF-8 symbols that appear in tests: ñ (test 2266), ü (test 1229.3), ん (Japanese) - # If requires_utf8 is TRUE, use the default test string; if it's a string, use that string - if (isTRUE(requires_utf8)) { - test_str = "a\u00F1o \u00FCber \u3093" - } else { - test_str = requires_utf8 - } + test_str = if (isTRUE(requires_utf8)) "\u00F1\u00FC\u3093" else requires_utf8 if (!utf8_check(test_str)) { last_utf8_skip = get0("last_utf8_skip", parent.frame(), ifnotfound=0, inherits=TRUE) if (num - last_utf8_skip >= 1) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 58f5687b09..d6c3adace6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4346,8 +4346,8 @@ test(1163, last(x), character(0)) # Bug fix for #5159 - chmatch and character encoding (for some reason this seems to pass the test on a mac as well) a<-c("a","\u00E4","\u00DF","z") - au<-iconv(a,"UTF8","latin1") - test(1164.1, requires_utf8=c("\u00E4", "\u00DF"), chmatch(a, au), match(a, au)) +au<-iconv(a,"UTF8","latin1") +test(1164.1, requires_utf8=c("\u00E4", "\u00DF"), chmatch(a, au), match(a, au)) # Bug fix for #73 - segfault when rbindlist on empty data.tables x <- as.data.table(BOD) @@ -4653,7 +4653,7 @@ local(if (utf8_check("\u00e4\u00f6\u00fc")) { "\u00FCberall"=c(0,0), "\u00FCberlegt" = c(0,0), abgeandert=c(2,1), abgebildet = c(0,2), abgelegt=c(0,3))) ')) } else { - cat("Tests 1229.3 skipped because required UTF-8 symbols cannot be represented in native encoding.\n") + cat("Test 1229.3 skipped because required UTF-8 symbols cannot be represented in native encoding.\n") }) # Test that ad hoc by detects if ordered and dogroups switches to memcpy if contiguous, #1050 From 1a963900326a83d2030b467448c10dc794005ecb Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 25 Oct 2025 21:25:04 +0200 Subject: [PATCH 04/21] register utf8_check function --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d6c3adace6..26cec19b74 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -78,6 +78,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { test = data.table:::test uniqlengths = data.table:::uniqlengths uniqlist = data.table:::uniqlist + utf8_check = data.table:::utf8_check warningf = data.table:::warningf which_ = data.table:::which_ which.first = data.table:::which.first From 0e82cf0befe386c5ea3c71df8a31884f32693326 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 25 Oct 2025 22:04:05 +0200 Subject: [PATCH 05/21] add documentation and NEWS --- NEWS.md | 2 ++ man/test.Rd | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 3face7519b..db69c7ef80 100644 --- a/NEWS.md +++ b/NEWS.md @@ -363,6 +363,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T 7. In rare situations a data.table object may lose its internal attribute that holds a self-reference. New helper function `.selfref.ok()` tests just that. It is only intended for technical use cases. See manual for examples. +8. `test()` gains new argument `requires_utf8` to skip tests when UTF-8 support is not available, [#7336](https://github.com/Rdatatable/data.table/issues/7336). Thanks @MichaelChirico for the suggestion and @ben-schwen for the implementation. 
+ ## data.table [v1.17.8](https://github.com/Rdatatable/data.table/milestone/41) (6 July 2025) 1. Internal functions used to signal errors are now marked as non-returning, silencing a compiler warning about potentially unchecked allocation failure. Thanks to Prof. Brian D. Ripley for the report and @aitap for the fix, [#7070](https://github.com/Rdatatable/data.table/pull/7070). diff --git a/man/test.Rd b/man/test.Rd index 594040aca9..fcbbab3f79 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -8,7 +8,7 @@ test(num, x, y = TRUE, error = NULL, warning = NULL, message = NULL, output = NULL, notOutput = NULL, ignore.warning = NULL, - options = NULL, env = NULL) + options = NULL, env = NULL, requires_utf8 = FALSE) } \arguments{ \item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. } @@ -22,6 +22,7 @@ test(num, x, y = TRUE, \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } \item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } +\item{requires_utf8}{ \code{FALSE} (default), \code{TRUE}, or a character string. When set, the test is skipped if UTF-8 characters cannot be represented in the native encoding. Use \code{TRUE} for default UTF-8 test characters or provide a custom string of test characters. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. 
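For reviewers, a minimal usage sketch to go with the man/test.Rd entry above. The test numbers (9999.x) and the strings passed here are invented for illustration only and are not tests added by this patch series; the behaviour described follows the skip logic and the built-in check string introduced in patches 01-03:

  x = c("al\u00E4", "ala")
  # skipped, with a console note, when "\u00E4" cannot be represented in the native encoding
  test(9999.1, requires_utf8="\u00E4", chmatch(x, rev(x)), match(x, rev(x)))
  # requires_utf8=TRUE falls back to the built-in check string "\u00F1\u00FC\u3093"
  test(9999.2, requires_utf8=TRUE, toupper("a"), "A")

When the check fails, test() returns invisible(TRUE) without evaluating its arguments, so skipped tests do not count as failures.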
From fa5967f054ed6d45eeb77f8ccffd194e1fcf0501 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 25 Oct 2025 22:35:06 +0200 Subject: [PATCH 06/21] add nocov for region that only hits on non UTF8 systems --- R/test.data.table.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/test.data.table.R b/R/test.data.table.R index 91656d8576..e3148220ba 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -381,12 +381,14 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no if (!isFALSE(requires_utf8)) { test_str = if (isTRUE(requires_utf8)) "\u00F1\u00FC\u3093" else requires_utf8 if (!utf8_check(test_str)) { + # nocov start last_utf8_skip = get0("last_utf8_skip", parent.frame(), ifnotfound=0, inherits=TRUE) if (num - last_utf8_skip >= 1) { catf("Test %s skipped because required UTF-8 symbols cannot be represented in native encoding.\n", num) } assign("last_utf8_skip", num, parent.frame(), inherits=TRUE) return(invisible(TRUE)) + # nocov end } } # Usage: From 05b3923ee817b136dcbed9fddb54379fa2071c30 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 30 Dec 2025 18:25:38 +0100 Subject: [PATCH 07/21] be more specific in NEWS --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index d9776abd5f..4f030ed850 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,7 +24,7 @@ 3. Vignettes are now built using `litedown` instead of `knitr`, [#6394](https://github.com/Rdatatable/data.table/issues/6394). Thanks @jangorecki for the suggestion and @ben-schwen and @aitap for the implementation. -4. `test()` gains new argument `requires_utf8` to skip tests when UTF-8 support is not available, [#7336](https://github.com/Rdatatable/data.table/issues/7336). Thanks @MichaelChirico for the suggestion and @ben-schwen for the implementation. +4. The data.table test suite is a bit more robust to lacking UTF-8 support via a new `requires_utf8` argument to `test()` to skip tests when UTF-8 support is not available, [#7336](https://github.com/Rdatatable/data.table/issues/7336). Thanks @MichaelChirico for the suggestion and @ben-schwen for the implementation. ### BUG FIXES From 47a1aa00c2fd1b6ab4835e78b8239ae56c31448a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 30 Dec 2025 18:25:56 +0100 Subject: [PATCH 08/21] use combine instead of paste --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 28a3eb45d8..d20c567042 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18699,7 +18699,7 @@ ja_ichi = "\u4E00" ja_ni = "\u4E8C" ja_ko = "\u3053" ja_n = "\u3093" -nc = paste0(accented_a, ja_ichi, ja_ni, ja_ko, ja_n) +nc = c(accented_a, ja_ichi, ja_ni, ja_ko, ja_n) dots = "..." 
clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output # Tests for combining character latin a and acute accent, single row From e11d36d331f1ae8fd12721294905d933a01af62d Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 30 Dec 2025 18:27:03 +0100 Subject: [PATCH 09/21] use vector form --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d20c567042..1795d22d73 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17700,7 +17700,7 @@ test(2194.4, endsWithAny(letters, 'e'), error="Internal error.*types or lengths test(2194.5, endsWithAny(NA_character_, 'a'), FALSE) test(2194.6, endsWithAny(character(), 'a'), error="Internal error.*types or lengths incorrect") # file used in encoding tests -needed_chars = "\u0105\u017E\u016B\u012F\u0173\u0117\u0161\u0119" +needed_chars = c("\u0105", "\u017E", "\u016B", "\u012F", "\u0173", "\u0117", "\u0161", "\u0119") txt = parse(text='readLines(testDir("issue_563_fread.txt"))') test(2194.7, requires_utf8=needed_chars, endsWithAny(eval(txt), 'B'), error="Internal error.*types or lengths incorrect") # txt is length 5 test(2194.8, endsWith('abcd', 'd'), error="Internal error.*use endsWithAny") From 18c1722b28ae174b6eea6f277d98ce945f9b2cc5 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 3 Jan 2026 13:22:05 +0100 Subject: [PATCH 10/21] use local --- inst/tests/tests.Rraw | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1795d22d73..e21d264e5f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3571,20 +3571,24 @@ test(1086, class(DT$last.x), c("POSIXct", "POSIXt")) test(1087, class(DT$last.x1), "ITime") # chmatch on 'unknown' encoding (e.g. as.character(as.symbol("\u00E4")) )falling back to match, #2538 and #4818 +local({ x1 = c("al\u00E4", "ala", "\u00E4allc", "coep") x2 = c("ala", "al\u00E4") -tstc = function(y) unlist(lapply(y, function(x) as.character(as.name(x))), use.names=FALSE) -test(1088.1, requires_utf8="\u00E4", chmatch(x1, x2), match(x1, x2)) # should not fallback to "match" -test(1088.2, requires_utf8="\u00E4", x1 %chin% x2, x1 %in% x2) -# change x1 to symbol to character -test(1089.1, requires_utf8="\u00E4", chmatch(tstc(x1), x2), match(tstc(x1), x2)) # should fallback to match in "x" -test(1089.2, requires_utf8="\u00E4", tstc(x1) %chin% x2, tstc(x1) %in% x2) # should fallback to match in "x" -# change x2 to symbol to character -test(1090.1, requires_utf8="\u00E4", chmatch(x1,tstc(x2)), match(x1, tstc(x2))) # should fallback to match in "table" -test(1090.2, requires_utf8="\u00E4", x1 %chin% tstc(x2), x1 %in% tstc(x2)) -# both are symbols to characters -test(1091.1, requires_utf8="\u00E4", chmatch(tstc(x1), tstc(x2)), match(tstc(x1), tstc(x2))) # should fallback to "match" in "x" as well. 
-test(1091.2, requires_utf8="\u00E4", tstc(x1) %chin% tstc(x2), tstc(x1) %in% tstc(x2)) +if (utf8_check(c(x1,x2))) { + tstc = function(y) unlist(lapply(y, function(x) as.character(as.name(x))), use.names=FALSE) + test(1088.1, chmatch(x1, x2), match(x1, x2)) # should not fallback to "match" + test(1088.2, x1 %chin% x2, x1 %in% x2) + # change x1 to symbol to character + test(1089.1, chmatch(tstc(x1), x2), match(tstc(x1), x2)) # should fallback to match in "x" + test(1089.2, tstc(x1) %chin% x2, tstc(x1) %in% x2) # should fallback to match in "x" + # change x2 to symbol to character + test(1090.1, chmatch(x1,tstc(x2)), match(x1, tstc(x2))) # should fallback to match in "table" + test(1090.2, x1 %chin% tstc(x2), x1 %in% tstc(x2)) + # both are symbols to characters + test(1091.1, chmatch(tstc(x1), tstc(x2)), match(tstc(x1), tstc(x2))) # should fallback to "match" in "x" as well. + test(1091.2, tstc(x1) %chin% tstc(x2), tstc(x1) %in% tstc(x2)) +} else cat("Tests 1088-1091 skipped because required UTF-8 symbols cannot be represented in native encoding.\n") +}) # for completness, include test from #2528 of non ascii LHS of := (it could feasibly fail in future due to something other than chmatch) local(if (utf8_check("\u00E4")) { From b4d49ee8df0627e7e48803a14719a9e21b0490b7 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 3 Jan 2026 13:28:54 +0100 Subject: [PATCH 11/21] explain eval parse for utf8 check --- R/test.data.table.R | 5 +++++ inst/tests/tests.Rraw | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 99b48abc3c..ea34d6c8bc 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -370,6 +370,11 @@ gc_mem = function() { # nocov end } +# Check if UTF-8 symbols can be represented in native encoding +# R's parser requires symbol names (PRINTNAME in LANGSXP) to be in native encoding. In non-UTF-8 +# locales, parsing Unicode escapes like \u00FC fails with a warning and substitutes . +# Tests using requires_utf8 are skipped when UTF-8 cannot be represented. Using eval(parse(text=...)) +# defers parsing to runtime, allowing the encoding check to run first and avoid source() warnings. 
utf8_check = function(test_str) identical(test_str, enc2native(test_str)) test = function(num, x, y=TRUE, diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e21d264e5f..cfe952453d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3592,7 +3592,7 @@ if (utf8_check(c(x1,x2))) { # for completness, include test from #2528 of non ascii LHS of := (it could feasibly fail in future due to something other than chmatch) local(if (utf8_check("\u00E4")) { -eval(parse(text=' +eval(parse(text=' # eval(parse()) defers parsing to runtime; see utf8_check description DT = data.table(pas = c(1:5, NA, 6:10), good = c(1:10, NA)) setnames(DT, "pas", "p\u00E4s") test(1092, requires_utf8="\u00E4", eval(parse(text="DT[is.na(p\u00E4s), p\u00E4s := 99L]")), data.table("p\u00E4s" = c(1:5, 99L, 6:10), good = c(1:10,NA))) @@ -4645,7 +4645,7 @@ test(1229.1, DT[forderv(DT, -1)], error="non-existing column") test(1229.2, setkey(DT), data.table(x=1:5, y=10:6, key="x,y")) # umlaut in column names (red herring I think, but testing anyway local(if (utf8_check("\u00e4\u00f6\u00fc")) { - eval(parse(text = ' + eval(parse(text = ' # eval(parse()) defers parsing to runtime; see utf8_check description sentEx = data.table(abend = c(1, 1, 0, 0, 2), aber = c(0, 1, 0, 0, 0), "\u00FCber" = c(1, 0, 0, 0, 0), @@ -20908,7 +20908,7 @@ local(if (utf8_check("\u00e4\u00f6\u00fc")) { x = data.table(a = 1, b = 2, c = 3) y = data.table(x = 4, y = 5, z = 6) # a-umlaut, o-umlaut, u-umlaut - eval(parse(text = ' + eval(parse(text = ' # eval(parse()) defers parsing to runtime; see utf8_check description setnames(x , c("\u00e4", "\u00f6", "\u00fc")) setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")) test(2298.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) From 9f19186fa288e23b02143cb78e5adb3a0e4a909a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 3 Jan 2026 13:36:43 +0100 Subject: [PATCH 12/21] remove nested eval parse --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index cfe952453d..09d16af221 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3595,8 +3595,8 @@ local(if (utf8_check("\u00E4")) { eval(parse(text=' # eval(parse()) defers parsing to runtime; see utf8_check description DT = data.table(pas = c(1:5, NA, 6:10), good = c(1:10, NA)) setnames(DT, "pas", "p\u00E4s") - test(1092, requires_utf8="\u00E4", eval(parse(text="DT[is.na(p\u00E4s), p\u00E4s := 99L]")), data.table("p\u00E4s" = c(1:5, 99L, 6:10), good = c(1:10,NA))) - test(1093, requires_utf8="\u00E4", eval(parse(text="DT[, p\u00E4s := 34L]")), data.table("p\u00E4s" = 34L, good=c(1:10,NA))) + test(1092, DT[is.na(p\u00E4s), p\u00E4s := 99L], data.table("p\u00E4s" = c(1:5, 99L, 6:10), good = c(1:10,NA))) + test(1093, DT[, p\u00E4s := 34L], data.table("p\u00E4s" = 34L, good=c(1:10,NA))) ')) } else cat("Tests 1092+1093 skipped because required UTF-8 symbols cannot be represented in native encoding.\n")) From 96db2c149653c9565921e7b4eb07fa30cdbcaa4c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 3 Jan 2026 13:42:49 +0100 Subject: [PATCH 13/21] coding style --- inst/tests/tests.Rraw | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 09d16af221..346b031a89 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4351,8 +4351,8 @@ x <- character(0) test(1163, 
last(x), character(0)) # Bug fix for #5159 - chmatch and character encoding (for some reason this seems to pass the test on a mac as well) -a<-c("a","\u00E4","\u00DF","z") -au<-iconv(a,"UTF8","latin1") +a = c("a","\u00E4","\u00DF","z") +au = iconv(a,"UTF8","latin1") test(1164.1, requires_utf8=c("\u00E4", "\u00DF"), chmatch(a, au), match(a, au)) # Bug fix for #73 - segfault when rbindlist on empty data.tables @@ -4640,7 +4640,7 @@ test(1228.5, class(DT), class(DT[a>1, sum(b), by=a])) test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a])) # savetl_init error after error, in v1.9.2, thanks Arun -DT <- data.table(x=1:5, y=10:6) +DT = data.table(x=1:5, y=10:6) test(1229.1, DT[forderv(DT, -1)], error="non-existing column") test(1229.2, setkey(DT), data.table(x=1:5, y=10:6, key="x,y")) # umlaut in column names (red herring I think, but testing anyway From 5fcaaa56c84908ffa71f7e7b06a00d97bc1fdf2a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 3 Jan 2026 13:51:35 +0100 Subject: [PATCH 14/21] use local instead of multiple requires_utf8 --- inst/tests/tests.Rraw | 92 ++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c8b155ef13..2d21804f49 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18698,56 +18698,60 @@ test(2252.2, dt[, let(b=2L)], error = "\\[ was called on a data.table.*not data. rm(.datatable.aware) # tests for trunc.char handling wide characters #5096 +local({ accented_a = "\u0061\u0301" ja_ichi = "\u4E00" ja_ni = "\u4E8C" ja_ko = "\u3053" ja_n = "\u3093" nc = c(accented_a, ja_ichi, ja_ni, ja_ko, ja_n) -dots = "..." -clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output -# Tests for combining character latin a and acute accent, single row -DT = data.table(strrep(accented_a, 4L)) -test(2253.01, requires_utf8=nc, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L)) -test(2253.02, requires_utf8=nc, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots)) -test(2253.03, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots)) -# Tests for full-width japanese character ichi, single row -DT = data.table(strrep(ja_ichi, 4L)) -test(2253.04, requires_utf8=nc, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L)) -test(2253.05, requires_utf8=nc, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots)) -test(2253.06, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots)) -# Tests for multiple, different length combining character rows -DT = data.table(strrep(accented_a, 1L:4L)) -test(2253.07, requires_utf8=nc, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 1:4L)) -test(2253.08, requires_utf8=nc, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(accented_a, 1:3), paste0(strrep(accented_a, 3L), dots))) -test(2253.09, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(accented_a, rep(paste0(accented_a, dots), 3L))) -# Tests for multiple, different length full-width characters -DT = data.table(strrep(ja_ichi, 1L:4L)) -test(2253.10, requires_utf8=nc, options=list(datatable.prettyprint.char = 4L), 
gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(ja_ichi, 1:4L)) -test(2253.11, requires_utf8=nc, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(ja_ichi, 1:3), paste0(strrep(ja_ichi, 3L), dots))) -test(2253.12, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(ja_ichi, rep(paste0(ja_ichi, dots), 3L))) -# Tests for combined characters, multiple columns -DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa") -test(2253.13, requires_utf8=nc, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")) -test(2253.14, requires_utf8=nc, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")) -test(2253.15, requires_utf8=nc, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2), paste0(strrep(ja_ko, 2), dots) , strrep(accented_a, 2), "aa...")) -test(2253.16, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, paste0(ja_ni, dots), paste0(ja_ko, dots), paste0(accented_a, dots), "a...")) -# Tests for multiple columns, multiple rows -DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3)) -test(2253.17, requires_utf8=nc, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), - c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)), - paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)), - paste(strrep(ja_ko, 3L), strrep(ja_n, 4L), strrep(accented_a, 3L)))) -test(2253.18, requires_utf8=nc, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), - c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)), - paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)), - paste(strrep(ja_ko, 3L), paste0(strrep(ja_n, 3L), dots), strrep(accented_a, 3L)))) -test(2253.19, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), - c(paste0(ja_ko, " ", paste0(ja_n, dots), " ", paste0(accented_a, dots)), - paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "), - paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "))) -# test for data.table with NA, #6441 -test(2253.20, requires_utf8=nc, options=list(datatable.prettyprint.char = 1L), data.table(a = c("abc", NA)), output=" a\n1: a...\n2: ") +if (utf8_check(nc)) { + dots = "..." 
+ clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output + # Tests for combining character latin a and acute accent, single row + DT = data.table(strrep(accented_a, 4L)) + test(2253.01, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L)) + test(2253.02, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots)) + test(2253.03, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots)) + # Tests for full-width japanese character ichi, single row + DT = data.table(strrep(ja_ichi, 4L)) + test(2253.04, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L)) + test(2253.05, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots)) + test(2253.06, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots)) + # Tests for multiple, different length combining character rows + DT = data.table(strrep(accented_a, 1L:4L)) + test(2253.07, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(accented_a, 1:4L)) + test(2253.08, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(accented_a, 1:3), paste0(strrep(accented_a, 3L), dots))) + test(2253.09, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(accented_a, rep(paste0(accented_a, dots), 3L))) + # Tests for multiple, different length full-width characters + DT = data.table(strrep(ja_ichi, 1L:4L)) + test(2253.10, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), strrep(ja_ichi, 1:4L)) + test(2253.11, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(strrep(ja_ichi, 1:3), paste0(strrep(ja_ichi, 3L), dots))) + test(2253.12, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c(ja_ichi, rep(paste0(ja_ichi, dots), 3L))) + # Tests for combined characters, multiple columns + DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa") + test(2253.13, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")) + test(2253.14, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa")) + test(2253.15, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, strrep(ja_ni, 2), paste0(strrep(ja_ko, 2), dots) , strrep(accented_a, 2), "aa...")) + test(2253.16, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], paste("1:", ja_ichi, paste0(ja_ni, dots), paste0(ja_ko, dots), paste0(accented_a, dots), "a...")) + # Tests for multiple columns, multiple rows + DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3)) + test(2253.17, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), + c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)), + paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)), + paste(strrep(ja_ko, 3L), strrep(ja_n, 4L), strrep(accented_a, 3L)))) + test(2253.18, 
options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), + c(paste0(ja_ko, " ", strrep(ja_n, 2L), " ", strrep(accented_a, 3L)), + paste0(strrep(ja_ko, 2L), " ", strrep(ja_n, 3L), " ", strrep(accented_a, 3L)), + paste(strrep(ja_ko, 3L), paste0(strrep(ja_n, 3L), dots), strrep(accented_a, 3L)))) + test(2253.19, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), + c(paste0(ja_ko, " ", paste0(ja_n, dots), " ", paste0(accented_a, dots)), + paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "), + paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "))) + # test for data.table with NA, #6441 + test(2253.20, options=list(datatable.prettyprint.char = 1L), data.table(a = c("abc", NA)), output=" a\n1: a...\n2: ") +} else cat("Tests 2253.* skipped because required UTF-8 symbols cannot be represented in native encoding.\n") +}) # allow 1-D matrix in j for consistency, #783 DT=data.table(a = rep(1:2, 3), b = 1:6) From d48b26953f461f893f54ad2b7f3437fd36158607 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 3 Jan 2026 13:54:04 +0100 Subject: [PATCH 15/21] restore cat --- inst/tests/tests.Rraw | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2d21804f49..8606ec219c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18750,7 +18750,8 @@ if (utf8_check(nc)) { paste0(c(ja_ko, ja_n, accented_a), dots, collapse=" "))) # test for data.table with NA, #6441 test(2253.20, options=list(datatable.prettyprint.char = 1L), data.table(a = c("abc", NA)), output=" a\n1: a...\n2: ") -} else cat("Tests 2253.* skipped because required UTF-8 symbols cannot be represented in native encoding.\n") +} else { + cat("Tests 2253* skipped because they need a UTF-8 locale.\n") }) # allow 1-D matrix in j for consistency, #783 From 8d8597213be383c4c00965987b78613fb9ad39f5 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 3 Jan 2026 14:06:00 +0100 Subject: [PATCH 16/21] fix bracket --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8606ec219c..884818bee6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18752,7 +18752,7 @@ if (utf8_check(nc)) { test(2253.20, options=list(datatable.prettyprint.char = 1L), data.table(a = c("abc", NA)), output=" a\n1: a...\n2: ") } else { cat("Tests 2253* skipped because they need a UTF-8 locale.\n") -}) +}}) # allow 1-D matrix in j for consistency, #783 DT=data.table(a = rep(1:2, 3), b = 1:6) From c9c843f0689714988c84344f7ebdf1c7ae3e9c5e Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 10 Jan 2026 23:35:05 -0800 Subject: [PATCH 17/21] nit: typo --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4d7ef4c170..d517dfde83 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3589,7 +3589,7 @@ if (utf8_check(c(x1,x2))) { test(1091.2, tstc(x1) %chin% tstc(x2), tstc(x1) %in% tstc(x2)) } else cat("Tests 1088-1091 skipped because required UTF-8 symbols cannot be represented in native encoding.\n") }) -# for completness, include test from #2528 of non ascii LHS of := (it could feasibly fail in future due to something other than chmatch) +# for completeness, include test from #2528 of non ascii LHS of := (it could feasibly fail in future due to something other than chmatch) local(if 
(utf8_check("\u00E4")) { eval(parse(text=' # eval(parse()) defers parsing to runtime; see utf8_check description From 6d10dc977e9fb259220e017075096b856a49b306 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 10 Jan 2026 23:37:13 -0800 Subject: [PATCH 18/21] nit: match ')' --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d517dfde83..2606a4c80c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4644,7 +4644,7 @@ test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a])) DT = data.table(x=1:5, y=10:6) test(1229.1, DT[forderv(DT, -1)], error="non-existing column") test(1229.2, setkey(DT), data.table(x=1:5, y=10:6, key="x,y")) -# umlaut in column names (red herring I think, but testing anyway +# umlaut in column names (red herring I think, but testing anyway) local(if (utf8_check("\u00e4\u00f6\u00fc")) { eval(parse(text = ' # eval(parse()) defers parsing to runtime; see utf8_check description sentEx = data.table(abend = c(1, 1, 0, 0, 2), From d4c3fa8a5230879bf766b2faf57db9805a3259cf Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 10 Jan 2026 23:41:18 -0800 Subject: [PATCH 19/21] prefer passing characters individually --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2606a4c80c..2c1d782c51 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -20909,7 +20909,7 @@ x = data.table(a=1, b=2L) y = data.table(c=1.5, d=1L) test(2297.31, y[x, on=.(c == a, d == a), nomatch=NULL], output="Empty data.table (0 rows and 3 cols): c,d,b") -local(if (utf8_check("\u00e4\u00f6\u00fc")) { +local(if (utf8_check(c("\u00e4", "\u00f6", "\u00fc"))) { # rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452 x = data.table(a = 1, b = 2, c = 3) y = data.table(x = 4, y = 5, z = 6) From 0d352ef67c799ffd22f41b104fc3dc2243fa0139 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 11 Jan 2026 17:04:34 +0100 Subject: [PATCH 20/21] simplify check --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2c1d782c51..1fb831c2bc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4645,7 +4645,7 @@ DT = data.table(x=1:5, y=10:6) test(1229.1, DT[forderv(DT, -1)], error="non-existing column") test(1229.2, setkey(DT), data.table(x=1:5, y=10:6, key="x,y")) # umlaut in column names (red herring I think, but testing anyway) -local(if (utf8_check("\u00e4\u00f6\u00fc")) { +local(if (utf8_check("\u00fc")) { eval(parse(text = ' # eval(parse()) defers parsing to runtime; see utf8_check description sentEx = data.table(abend = c(1, 1, 0, 0, 2), aber = c(0, 1, 0, 0, 0), From 1d18cff17f7ce62f267d5c1bf7c9d2a877b9769b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 11 Jan 2026 17:10:54 +0100 Subject: [PATCH 21/21] add comment about default test_str --- R/test.data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index ea34d6c8bc..282d71bca0 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -396,7 +396,7 @@ test = function(num, x, y=TRUE, } # Check UTF-8 requirement if (!isFALSE(requires_utf8)) { - test_str = if (isTRUE(requires_utf8)) "\u00F1\u00FC\u3093" else requires_utf8 + test_str = if (isTRUE(requires_utf8)) "\u00F1\u00FC\u3093" else requires_utf8 # the default test_str are UTF-8 symbols 
we found over time; TODO: harden this default
     if (!utf8_check(test_str)) {
       # nocov start
       last_utf8_skip = get0("last_utf8_skip", parent.frame(), ifnotfound=0, inherits=TRUE)