Merge branch 'master' into cbind-merge-list-prep

Rdatatable · Sep 27, 2024 · db210a8 · db210a8
2 parents 707d5cf + fcc9de2
commit db210a8
Show file tree

Hide file tree

Showing 94 changed files with 11,920 additions and 533 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -16,6 +16,7 @@
 ^\.devcontainer$
 ^\.graphics$
 ^\.github$
+^\.vscode$
 ^\.zed$
 
 ^\.gitlab-ci\.yml$

diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R
@@ -1,5 +1,28 @@
+# fread() Date-column performance tests.
+# Test case adapted from https://github.com/Rdatatable/data.table/issues/6105#issue-2268691745 which is where the issue was reported.
+# https://github.com/Rdatatable/data.table/pull/6107 fixed performance across 3 ways to specify a column as Date, and we test each individually.
+extra.args.6107 <- c(
+  "colClasses=list(Date='date')",
+  "colClasses='Date'",
+  "select=list(Date='date')")
+# One atime test per argument spelling: the setup and the two reference commits are shared,
+# only the timed fread() call differs, so that expression is spliced in afterwards.
+extra.test.list <- lapply(extra.args.6107, function(fread.arg) {
+  test.6107 <- atime::atime_test(
+    setup = {
+      set.seed(1)
+      DT = data.table(date=.Date(sample(20000, N, replace=TRUE)))
+      tmp_csv = tempfile()
+      fwrite(DT, tmp_csv)
+    },
+    Slow = "e9087ce9860bac77c51467b19e92cf4b72ca78c7", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/a77e8c22e44e904835d7b34b047df2eff069d1f2) of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
+    Fast = "a77e8c22e44e904835d7b34b047df2eff069d1f2") # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
+  # Turn the textual argument into the language object that is actually timed.
+  timed.call <- sprintf("data.table::fread(tmp_csv, %s)", fread.arg)
+  test.6107$expr <- str2lang(timed.call)
+  test.6107
+})
+# Test names embed the argument spelling so each variant is reported separately.
+names(extra.test.list) <- sprintf("fread(%s) improved in #6107", extra.args.6107)
+
 # A list of performance tests.
 #
+# See documentation in https://github.com/Rdatatable/data.table/wiki/Performance-testing for best practices.
+#
 # Each entry in this list corresponds to a performance test and contains a sublist with three mandatory arguments:
 # - N: A numeric sequence of data sizes to vary.
 # - setup: An expression evaluated for every data size before measuring time/memory.
@@ -17,6 +40,8 @@
 # @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information.
 # nolint start: undesirable_operator_linter. ':::' needed+appropriate here.
 test.list <- atime::atime_test_list(
+  # Common N and pkg.edit.fun are defined here, and inherited in all test cases below which do not re-define them.
+  N = as.integer(10^seq(1, 7, by=0.25)),
   # A function to customize R package metadata and source files to facilitate version-specific installation and testing.
   #
   # This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R)
@@ -73,10 +98,9 @@ test.list <- atime::atime_test_list(
       paste0('useDynLib(', new.Package_))
   },
 
-  # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311
-  # Fixed in: https://github.com/Rdatatable/data.table/pull/4440
+  # Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
   "shallow regression fixed in #4440" = atime::atime_test(
-    N = 10^seq(3,8),
     setup = {
       set.seed(1L)
       dt <- data.table(a = sample.int(N))
@@ -87,29 +111,27 @@ test.list <- atime::atime_test_list(
     Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression
     Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440)
 
-  # Test based on: https://github.com/Rdatatable/data.table/issues/5424
-  # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491
-  # Fixed in: https://github.com/Rdatatable/data.table/pull/5463
+  # Test based on https://github.com/Rdatatable/data.table/issues/5424
+  # Performance regression introduced from a commit in https://github.com/Rdatatable/data.table/pull/4491
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/5463#issue-1373642456 which is the fix PR.
   "memrecycle regression fixed in #5463" = atime::atime_test(
-    N = 10^seq(3, 8),
     setup = {
-      n <- N/100
+      bigN <- N*100
       set.seed(2L)
       dt <- data.table(
-        g = sample(seq_len(n), N, TRUE),
-        x = runif(N),
+        g = sample(seq_len(N), bigN, TRUE),
+        x = runif(bigN),
         key = "g")
       dt_mod <- copy(dt)
     },
     expr = data.table:::`[.data.table`(dt_mod, , N := .N, by = g),
-    Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits)
-    Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits)
-    Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits)
+    Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue
+    Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue
+    Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5463/commits) that fixed the regression
 
-  # Issue reported in: https://github.com/Rdatatable/data.table/issues/5426
-  # To be fixed in: https://github.com/Rdatatable/data.table/pull/5427
+  # Issue reported in https://github.com/Rdatatable/data.table/issues/5426
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/5427#issue-1323678063 which is the fix PR.
   "setDT improved in #5427" = atime::atime_test(
-    N = 10^seq(1, 7),
     setup = {
       L <- replicate(N, 1, simplify = FALSE)
       setDT(L)
@@ -118,43 +140,73 @@ test.list <- atime::atime_test_list(
       data.table:::setattr(L, "class", NULL)
       data.table:::setDT(L)
     },
-    Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801)
-    Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15"), # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits)
+    Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
+    Fast = "af48a805e7a5026a0c2d0a7fd9b587fea5cfa3c4"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
 
-  # Issue reported in: https://github.com/Rdatatable/data.table/issues/4200
-  # To be fixed in: https://github.com/Rdatatable/data.table/pull/4558
+  # Test case adapted from https://github.com/Rdatatable/data.table/issues/4200#issuecomment-645980224 which is where the issue was reported.
+  # Fixed in https://github.com/Rdatatable/data.table/pull/4558
   "DT[by] fixed in #4558" = atime::atime_test(
-    N = 10^seq(1, 20),
     setup = {
       d <- data.table(
-        id3 = sample(c(seq.int(N*0.9), sample( N*0.9, N*0.1, TRUE))),
+        id = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
         v1 = sample(5L, N, TRUE),
         v2 = sample(5L, N, TRUE)
       )
     },
-    expr = {
-      expr=data.table:::`[.data.table`(d, , max(v1) - min(v2), by = id3)
-    },
+    expr = data.table:::`[.data.table`(d, , max(v1) - min(v2), by = id),
     Before = "7a9eaf62ede487625200981018d8692be8c6f134", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/515de90a6068911a148e54343a3503043b8bb87c) in the PR (https://github.com/Rdatatable/data.table/pull/4164/commits) that introduced the regression
     Regression = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/15f0598b9828d3af2eb8ddc9b38e0356f42afe4f) in the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
     Fixed = "f750448a2efcd258b3aba57136ee6a95ce56b302"), # Second commit of the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
 
-  # Issue with sorting again when already sorted: https://github.com/Rdatatable/data.table/issues/4498
-  # Fixed in: https://github.com/Rdatatable/data.table/pull/4501
+  # Issue with sorting again when already sorted, as reported in https://github.com/Rdatatable/data.table/issues/4498
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/4501#issue-625311918 which is the fix PR.
   "DT[,.SD] improved in #4501" = atime::atime_test(
-    N = 10^seq(1, 10, by=0.5),
     setup = {
       set.seed(1)
       L = as.data.table(as.character(rnorm(N, 1, 0.5)))
       setkey(L, V1)
     },
     ## New DT can safely retain key.
-    expr = {
-      data.table:::`[.data.table`(L, , .SD)
-    },
+    expr = data.table:::`[.data.table`(L, , .SD),
     Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461", # Close-to-last merge commit in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
     Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
     Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
 
-  NULL)
+  # Test case adapted from https://github.com/Rdatatable/data.table/issues/6286#issue-2412141289 which is where the issue was reported.
+  # Fixed in https://github.com/Rdatatable/data.table/pull/6296
+  "DT[by,verbose=TRUE] improved in #6296" = atime::atime_test(
+    setup = {
+      dt = data.table(a = 1:N)
+      dt_mod <- copy(dt)
+    },
+    expr = data.table:::`[.data.table`(dt_mod, , 1, by = a, verbose = TRUE),
+    Slow = "a01f00f7438daf4612280d6886e6929fa8c8f76e", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/fc0c1e76408c34a8482f16f7421d262c7f1bde32) in the PR (https://github.com/Rdatatable/data.table/pull/6296/commits) that fixes the issue
+    Fast = "f248bbe6d1204dfc8def62328788eaadcc8e17a1"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6296) that fixes the issue
+
+  # Test case adapted from https://github.com/Rdatatable/data.table/issues/5492#issue-1416598382 which is where the issue was reported,
+  # and from https://github.com/Rdatatable/data.table/pull/5493#issue-1416656788 which is the fix PR.
+  "transform improved in #5493" = atime::atime_test(
+    setup = {
+      df <- data.frame(x = runif(N))
+      dt <- as.data.table(df)
+    },
+    expr = data.table:::transform.data.table(dt, y = round(x)),
+    Slow = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/93ce3ce1373bf733ebd2036e2883d2ffe377ab58) in the PR (https://github.com/Rdatatable/data.table/pull/5493/commits) that fixes the issue
+    Fast = "2d1a0575f87cc50e90f64825c30d7a6cb6b05dd7"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5493) that fixes the issue
+
+  # Test case created directly using the atime code below (not adapted from any other benchmark), based on the issue/fix PR https://github.com/Rdatatable/data.table/pull/5054#issue-930603663 "melt should be more efficient when there are missing input columns."
+  "melt improved in #5054" = atime::atime_test(
+    setup = {
+      DT <- as.data.table(as.list(1:N))
+      measure.vars <- lapply(1:N, function(i) {
+        x = rep(NA, N)
+        x[i] = i
+        x
+      })  
+    },
+    expr = data.table:::melt(DT, measure.vars = measure.vars),
+    Slow = "fd24a3105953f7785ea7414678ed8e04524e6955", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/ed72e398df76a0fcfd134a4ad92356690e4210ea) of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
+    Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
+
+  tests=extra.test.list)
 # nolint end: undesirable_operator_linter.
diff --git a/.ci/linters/po/msgfmt_linter.R b/.ci/linters/po/msgfmt_linter.R
@@ -0,0 +1,45 @@
+# Use msgfmt to check for untranslated/fuzzy messages, and for whether
+#   the implied .mo compiled form matches that which is already checked in.
+#
+# po_file: path to a .po translation file; an 'R-' prefix on the file name marks
+#   the R-level catalog, whose compiled form is named R-data.table.mo.
+# Stops with an error when msgfmt reports untranslated/fuzzy messages, when msgfmt
+#   produces no output, or when no compiled .mo exists for po_file.
+# Otherwise returns NULL invisibly; an MD5 mismatch is currently only reported as a note.
+msgfmt_linter <- function(po_file) {
+  mo_tmp <- tempfile()
+  on.exit(unlink(mo_tmp), add=TRUE)
+
+  res = system2("msgfmt", c("--statistics", po_file, "-o", mo_tmp), stdout=TRUE, stderr=TRUE)
+  if (any(grepl("untranslated message|fuzzy translation", res))) {
+    cat(sprintf("In %s, found incomplete translations:\n%s\n", po_file, paste(res, collapse="\n")))
+    stop("Please fix.")
+  }
+
+  # Derive the language from the file name only: stripping '^R-' from the whole path
+  #   misses the prefix as soon as po_file carries a directory (e.g. 'po/R-xx.po').
+  po_name = basename(po_file)
+  po_dir = dirname(po_file)
+  language = gsub("^R-|[.]po$", "", po_name)
+  mo_ref = sprintf(
+    "inst/%s/LC_MESSAGES/%sdata.table.mo",
+    if (po_dir == ".") language else paste(po_dir, language, sep="/"),
+    if (startsWith(po_name, "R-")) "R-" else ""
+  )
+
+  if (!file.exists(mo_ref)) {
+    stop(po_file, " has not been compiled as ", mo_ref, ". Please fix.")
+  }
+  # Without an output file md5sum() is NA and the comparison below would fail with
+  #   'missing value where TRUE/FALSE needed'; report what msgfmt said instead.
+  if (!file.exists(mo_tmp)) {
+    stop("msgfmt did not produce a .mo file for ", po_file, ":\n", paste(res, collapse="\n"))
+  }
+  if (identical(unname(tools::md5sum(mo_ref)), unname(tools::md5sum(mo_tmp)))) return(invisible())
+  # TODO(#6517): Re-activate this part of the check to ensure .mo is up to date.
+  cat(sprintf("Note: MD5 sum of msgfmt output for %s does not match %s.\n", po_file, mo_ref))
+  return(invisible())
+
+  # Everything below is intentionally unreachable until the TODO above is resolved.
+  # NB: file.mtime() will probably be wrong, it will reflect the check-out time of the git repo.
+  last_edit_time = system2("git",
+    c("log", "-1", '--format="%ad"', "--date=format:'%Y-%m-%d %H:%M:%S'", "--", mo_ref),
+    stdout=TRUE
+  )
+  cat(sprintf(
+    ".mo compilation %s of .po translation %s appears out of date! It was last updated %s\n",
+    mo_ref, po_file, last_edit_time
+  ))
+
+  unmo_tmp = tempfile()
+  unmo_ref = tempfile()
+  on.exit(unlink(c(unmo_tmp, unmo_ref)), add=TRUE)
+  system2("msgunfmt", c(mo_tmp, "-o", unmo_tmp))
+  system2("msgunfmt", c(mo_ref, "-o", unmo_ref))
+  cat("Here are the observed differences after converting back to .po:\n\n")
+  system2("diff", c(unmo_tmp, unmo_ref))
+  stop("Please fix.")
+}
diff --git a/.ci/linters/po/tools_check_linter.R b/.ci/linters/po/tools_check_linter.R
@@ -0,0 +1,7 @@
+# Run R's own .po consistency check (with strict plural checking) on po_file.
+# Prints the reported problems and stops if there are any; otherwise returns NULL invisibly.
+tools_check_linter = function(po_file) {
+  problems = tools::checkPoFile(po_file, strictPlural=TRUE)
+  if (NROW(problems) == 0L) return(invisible(NULL))
+  print(problems)
+  stop("Fix the above .po file issues.")
+}
diff --git a/.ci/linters/po/utf8_linter.R b/.ci/linters/po/utf8_linter.R
@@ -0,0 +1,4 @@
+# Require that po_file mentions 'charset=UTF-8' on at least one line.
+# Stops with an error naming the file otherwise; returns NULL invisibly on success.
+utf8_linter <- function(po_file) {
+  po_lines <- readLines(po_file)
+  declares_utf8 <- grepl("charset=UTF-8", po_lines, fixed=TRUE)
+  if (any(declares_utf8)) return(invisible(NULL))
+  stop("In ", po_file, ", please use charset=UTF-8.")
+}
diff --git a/.ci/linters/r/class1_linter.R b/.ci/linters/r/class1_linter.R
@@ -0,0 +1,10 @@
+# Linter for class(x)[1] / class(x)[1L].
+# The XPath matches a '[' whose left-hand side is a call to class() and whose
+#   index is the constant 1 or 1L, and reports the enclosing subset expression.
+class1_linter = lintr::make_linter_from_xpath(
+  xpath = "
+    //OP-LEFT-BRACKET[
+      preceding-sibling::expr/expr/SYMBOL_FUNCTION_CALL[text() = 'class']
+      and following-sibling::expr/NUM_CONST[text() = '1' or text() = '1L']
+    ]
+      /parent::expr
+  ",
+  lint_message = "Use class1(x) to get class(x)[1L], or classes1(x) to do so for a full list/data.table"
+)
diff --git a/.dev/cc.R b/.dev/cc.R
@@ -91,9 +91,9 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, path=Sys.getenv(
   if (clean) system("rm *.o *.so")
   OMP = if (omp) "openmp" else "no-openmp"
   if (debug) {
-    cmd = sprintf(R"(MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f% CFLAGS=-std=c99\ -O0\ -ggdb\ %s\ -pedantic' R CMD SHLIB -d -o data_table.so *.c)", CC, OMP, W32)
+    cmd = sprintf(R"(MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f% CFLAGS=-std=c11\ -O0\ -ggdb\ %s\ -pedantic' R CMD SHLIB -d -o data_table.so *.c)", CC, OMP, W32)
   } else {
-    cmd = sprintf(R"(MAKEFLAGS='-j CC=%s CFLAGS=-f%s\ -std=c99\ -O3\ -pipe\ -Wall\ -pedantic\ -Wstrict-prototypes\ -isystem\ /usr/share/R/include\ %s\ -fno-common' R CMD SHLIB -o data_table.so *.c)", CC, OMP, W32)
+    cmd = sprintf(R"(MAKEFLAGS='-j CC=%s CFLAGS=-f%s\ -std=c11\ -O3\ -pipe\ -Wall\ -pedantic\ -Wstrict-prototypes\ -isystem\ /usr/share/R/include\ %s\ -fno-common' R CMD SHLIB -o data_table.so *.c)", CC, OMP, W32)
     # the -isystem suppresses strict-prototypes warnings from R's headers, #5477. Look at the output to see what -I is and pass the same path to -isystem.
     # TODO add -Wextra too?
   }

diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml
@@ -56,21 +56,18 @@ jobs:
       - uses: r-lib/actions/setup-r@v2
       - name: Check translations
         run: |
-          setwd("po")
-          for (po_file in list.files(pattern = "[.]po$")) {
-            res = tools::checkPoFile(po_file, strictPlural=TRUE)
-            if (NROW(res)) { print(res); stop("Fix the above .po file issues.") }
-
-            if (!any(grepl("charset=UTF-8", readLines(po_file), fixed=TRUE)))
-              stop("In ", po_file, ", please use charset=UTF-8.")
-
-            res = system2("msgfmt", c("--statistics", po_file, "-o", tempfile()), stdout=TRUE, stderr=TRUE)
-            if (any(grepl("untranslated message|fuzzy translation", res))) {
-              cat(sprintf("In %s, found incomplete translations:\n%s\n", po_file, paste(res, collapse="\n")))
-              stop("Please fix.")
-            }
+          linter_env = new.env()
+          for (f in list.files('.ci/linters/po', full.names=TRUE)) sys.source(f, linter_env)
+          for (po_file in list.files(pattern = "[.]po$", full.names=TRUE)) {
+            # only pay attention to files edited in the current PR, otherwise we can get
+            #   a situation like after #6424 where some untranslated messages were added
+            #   as part of non-translation maintenance, but this GHA would go red repeatedly
+            #   until a translation is added or the blank/fuzzy translations removed. We'd
+            #   rather only have the failure on one PR, then ignore these files later.
+            diff_v_master = system2("git", c("diff", "master", po_file), stdout=TRUE)
+            if (!length(diff_v_master)) next
+            for (linter in ls(linter_env)) linter_env[[linter]](po_file)
           }
-          cat("All translation quality checks completed successfully!\n")
         shell: Rscript {0}
   lint-md:
     runs-on: ubuntu-latest

diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml
@@ -1,4 +1,4 @@
-name: Autocomment atime-based performance regression analysis on PRs
+name: atime performance tests
 
 on:
   pull_request:
@@ -15,10 +15,11 @@ on:
 
 jobs:
   comment:
+    if: github.repository == 'Rdatatable/data.table'
     runs-on: ubuntu-latest
     container: ghcr.io/iterative/cml:0-dvc2-base1
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
       repo_token: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: Anirban166/Autocomment-atime-results@v1.4.0
+      - uses: Anirban166/Autocomment-atime-results@v1.4.1