From f2f91a57dea1750be74f134689b262846f8c96ec Mon Sep 17 00:00:00 2001
From: Nick Christofides <118103879+NicChr@users.noreply.github.com>
Date: Mon, 23 Sep 2024 11:52:37 +0100
Subject: [PATCH] Bug fixes.

---
 NEWS.md          |   2 +-
 R/extras.R       |  69 ++++++++++++++++---
 R/scalars.R      |   4 +-
 R/utils.R        |   5 ++
 cran-comments.md |   7 +-
 man/extras.Rd    |  13 +---
 src/int64.cpp    |  56 +++++++++++----
 src/scalars.cpp  | 176 ++++++++++++++++-------------------------------
 8 files changed, 175 insertions(+), 157 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index ff2c91b..5c93d6a 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -22,7 +22,7 @@ supported in any sequence functions or in the 'set_math' functions.
 
 * `lag_` now uses `memmove` where possible.
 
-* Fixed an issue where `lag_(x)` was materialising x twice if x was an altrep 
+* Fixed an issue where `lag_(x)` was materialising x twice if x was an ALTREP 
 integer sequence.
 
 # cheapr 0.9.3 (29-Jul-2024)
diff --git a/R/extras.R b/R/extras.R
index 0d3f695..33247c7 100644
--- a/R/extras.R
+++ b/R/extras.R
@@ -126,13 +126,64 @@ cut_numeric <- function(x, breaks, labels = NULL, include.lowest = FALSE,
 }
 #' @export
 #' @rdname extras
-cut.integer64 <- function(x, breaks, labels = NULL, include.lowest = FALSE,
-                          right = TRUE, dig.lab = 3L, ordered_result = FALSE, ...){
-  cut_numeric(as.double(x), breaks = breaks,
-              labels = labels, include.lowest = include.lowest,
-              right = right, dig.lab = dig.lab, ordered_result = ordered_result,
-              ...)
-
+cut.integer64 <- function(x, ...){
+  cut_numeric(cpp_int64_to_numeric(x), ...)
+  # if (!is.numeric(x))
+  #   stop("'x' must be numeric")
+  # if (length(breaks) == 1L) {
+  #   if (is.na(breaks) || breaks < 2L)
+  #     stop("invalid number of intervals")
+  #   nb <- as.integer(breaks + 1)
+  #   dx <- diff(rx <- as.double(range(x, na.rm = TRUE)))
+  #
+  #   if (isTRUE(dx == 0)) {
+  #     dx <- if (rx[1L] != 0)
+  #       abs(rx[1L])
+  #     else 1
+  #     breaks <- seq.int(rx[1L] - dx/1000, rx[2L] + dx/1000,
+  #                       length.out = nb)
+  #   }
+  #   else {
+  #     breaks <- seq.int(rx[1L], rx[2L], length.out = nb)
+  #     breaks[c(1L, nb)] <- c(rx[1L] - dx/1000, rx[2L] +
+  #                              dx/1000)
+  #   }
+  # }
+  # else nb <- length(breaks <- sort(as.double(breaks)))
+  # if (anyDuplicated(breaks))
+  #   stop("'breaks' are not unique")
+  # codes.only <- FALSE
+  # if (is.null(labels)) {
+  #   for (dig in dig.lab:max(12L, dig.lab)) {
+  #     ch.br <- cpp_format_numeric_as_int64(breaks)
+  #     ch.br[na_find(ch.br)] <- "NA"
+  #     if (ok <- all(ch.br[-1L] != ch.br[-nb]))
+  #       break
+  #   }
+  #   labels <- if (ok)
+  #     paste0(if (right)
+  #       "("
+  #       else "[", ch.br[-nb], ",", ch.br[-1L], if (right)
+  #         "]"
+  #       else ")")
+  #   else paste0("Range_", seq_len(nb - 1L))
+  #   if (ok && include.lowest) {
+  #     if (right)
+  #       substr(labels[1L], 1L, 1L) <- "["
+  #     else substring(labels[nb - 1L], nchar(labels[nb -
+  #                                                    1L], "c")) <- "]"
+  #   }
+  # }
+  # else if (is.logical(labels) && !labels)
+  #   codes.only <- TRUE
+  # else if (length(labels) != nb - 1L)
+  #   stop("number of intervals and length of 'labels' differ")
+  # code <- .bincode(cpp_int64_to_numeric(x), breaks, right, include.lowest)
+  # if (!codes.only) {
+  #   levels(code) <- as.character(labels)
+  #   class(code) <- c(if (ordered_result) "ordered" else character(0), "factor")
+  # }
+  # code
 }
 #' @export
 #' @rdname extras
@@ -180,8 +231,8 @@ deframe_ <- function(x){
 }
 #' @export
 #' @rdname extras
-sample_ <- function(x, size = cpp_vec_length(x), replace = FALSE, prob = NULL){
-  sset(x, sample.int(cpp_vec_length(x), size, replace, prob))
+sample_ <- function(x, size = vector_length(x), replace = FALSE, prob = NULL){
+  sset(x, sample.int(vector_length(x), size, replace, prob))
 }
 #' @export
 #' @rdname extras
diff --git a/R/scalars.R b/R/scalars.R
index 008ae53..edb94e9 100644
--- a/R/scalars.R
+++ b/R/scalars.R
@@ -107,9 +107,9 @@ na_count <- num_na
 #' @export
 na_find <- function(x, invert = FALSE){
   if (invert){
-    which_na(x)
-  } else {
     which_not_na(x)
+  } else {
+    which_na(x)
   }
 }
 #' @rdname scalars
diff --git a/R/utils.R b/R/utils.R
index e984195..69c04dd 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -155,6 +155,11 @@ as.integer.integer64 <- function(x, ...){
 
 #' @exportS3Method collapse::fsum
 fsum.integer64 <- function(x, g = NULL, ...){
+
+  ## The statement is here because
+  ## when there are groups, collapse::fsum
+  ## errors when there is a 32-bit int overflow
+
   if (is.null(g)){
     collapse::fsum(cpp_int64_to_numeric(x), ...)
   } else {
diff --git a/cran-comments.md b/cran-comments.md
index 34d3e31..d504ce9 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,15 +1,14 @@
-* Updated to version 0.9.3
-
-* Have fixed the issues listed at https://cran.r-project.org/web/checks/check_results_cheapr.html
+* Updated to version 0.9.5
 
 * Checked and passed using rhub v2.0.0 in the following environments:
 
 ## Test environments
 - R-hubv2 windows (R-devel)
 - R-hubv2 linux (R-devel)
-- R-hubv2 macos (R-devel)
+- R-hubv2 macos-arm64 (R-devel)
 - R-hubv2 clang-asan
 - R-hubv2 ubuntu-gcc12
+- R-hubv2 ubuntu-clang
 
 Additionally checked on win-builder.r-project.org:
 
diff --git a/man/extras.Rd b/man/extras.Rd
index 890142b..03c936d 100644
--- a/man/extras.Rd
+++ b/man/extras.Rd
@@ -30,16 +30,7 @@ cut_numeric(
   ...
 )
 
-\method{cut}{integer64}(
-  x,
-  breaks,
-  labels = NULL,
-  include.lowest = FALSE,
-  right = TRUE,
-  dig.lab = 3L,
-  ordered_result = FALSE,
-  ...
-)
+\method{cut}{integer64}(x, ...)
 
 x \%in_\% table
 
@@ -49,7 +40,7 @@ enframe_(x, name = "name", value = "value")
 
 deframe_(x)
 
-sample_(x, size = cpp_vec_length(x), replace = FALSE, prob = NULL)
+sample_(x, size = vector_length(x), replace = FALSE, prob = NULL)
 
 val_insert(x, value, n = NULL, prop = NULL)
 
diff --git a/src/int64.cpp b/src/int64.cpp
index e482782..85e4319 100644
--- a/src/int64.cpp
+++ b/src/int64.cpp
@@ -51,6 +51,48 @@ SEXP cpp_int64_to_double(SEXP x){
   return out;
 }
 
+// Can all numbers be safely converted to 32-bit int?
+
+bool cpp_all_integerable(SEXP x, int shift = 0){
+  R_xlen_t n = Rf_xlength(x);
+  bool out = true;
+
+  switch (TYPEOF(x)){
+  case LGLSXP:
+  case INTSXP: {
+    break;
+  }
+  case REALSXP: {
+    if (is_int64(x)){
+    long long int *p_x = INTEGER64_PTR(x);
+    long long int int_max = integer_max_;
+    long long int shift_ = shift;
+    for (R_xlen_t i = 0; i < n; ++i){
+      if (!cheapr_is_na_int64(p_x[i]) && ( (std::llabs(p_x[i]) + shift_) > int_max )){
+        out = false;
+        break;
+      }
+    }
+  } else {
+    double *p_x = REAL(x);
+    double int_max = integer_max_;
+    double shift_ = shift;
+    for (R_xlen_t i = 0; i < n; ++i){
+      if (!cheapr_is_na_dbl(p_x[i]) && ( (std::fabs(p_x[i]) + shift_) > int_max )){
+        out = false;
+        break;
+      }
+    }
+  }
+  break;
+  }
+  default: {
+    Rf_error("%s cannot handle an object of type %s", __func__, Rf_type2char(TYPEOF(x)));
+  }
+  }
+  return out;
+}
+
 // Convert 64-bit integer to int if possible, otherwise double
 
 [[cpp11::register]]
@@ -58,19 +100,7 @@ SEXP cpp_int64_to_numeric(SEXP x){
   if (!is_int64(x)){
     Rf_error("x must be an integer64");
   }
-  R_xlen_t n = Rf_xlength(x);
-
-  long long *p_x = INTEGER64_PTR(x);
-
-  bool maybe_int = true;
-  long long int int_max = integer_max_;
-  for (R_xlen_t i = 0; i < n; ++i){
-    if (!cheapr_is_na_int64(p_x[i]) && std::llabs(p_x[i]) > int_max){
-     maybe_int = false;
-     break;
-    }
-  }
-  return maybe_int ? cpp_int64_to_int(x) : cpp_int64_to_double(x);
+  return cpp_all_integerable(x, 0) ? cpp_int64_to_int(x) : cpp_int64_to_double(x);
 }
 
 // The reverse operation
diff --git a/src/scalars.cpp b/src/scalars.cpp
index 7d50d17..a1cc75b 100644
--- a/src/scalars.cpp
+++ b/src/scalars.cpp
@@ -1,5 +1,44 @@
 #include "cheapr_cpp.h"
 
+bool is_scalar_na(SEXP x){
+  if (Rf_xlength(x) != 1){
+    Rf_error("x must be a scalar value");
+  }
+  switch(TYPEOF(x)){
+  case LGLSXP:
+  case INTSXP: {
+    return cheapr_is_na_int(Rf_asInteger(x));
+  }
+  case REALSXP: {
+    if (is_int64(x)){
+    return cheapr_is_na_int64(INTEGER64_PTR(x)[0]);
+  } else {
+    return cheapr_is_na_dbl(Rf_asReal(x));
+  }
+  }
+  case STRSXP: {
+    return cheapr_is_na_str(Rf_asChar(x));
+  }
+  case CPLXSXP: {
+    return cheapr_is_na_cplx(Rf_asComplex(x));
+  }
+  case RAWSXP: {
+    return false;
+  }
+  default: {
+    Rf_error("%s cannot handle an object of type %s", __func__, Rf_type2char(TYPEOF(x)));
+  }
+  }
+}
+
+// Doesn't properly handle integer64
+bool implicit_na_coercion(SEXP x, int target_type){
+  SEXP target = Rf_protect(Rf_coerceVector(x, target_type));
+  bool out = !is_scalar_na(x) && is_scalar_na(target);
+  Rf_unprotect(1);
+  return out;
+}
+
 R_xlen_t scalar_count(SEXP x, SEXP value, bool recursive){
   if (cpp_vec_length(value) != 1){
     Rf_error("value must be a vector of length 1");
@@ -10,8 +49,7 @@ R_xlen_t scalar_count(SEXP x, SEXP value, bool recursive){
   bool do_parallel = n >= 100000;
   int n_cores = do_parallel ? num_cores() : 1;
 
-  SEXP val_is_na = Rf_protect(cpp_is_na(value));
-  ++NP;
+  SEXP val_is_na = Rf_protect(cpp_is_na(value)); ++NP;
   if (Rf_length(val_is_na) == 1 && LOGICAL(val_is_na)[0]){
     Rf_unprotect(NP);
     return na_count(x, recursive);
@@ -24,14 +62,8 @@ R_xlen_t scalar_count(SEXP x, SEXP value, bool recursive){
   }
   case LGLSXP:
   case INTSXP: {
-    Rf_protect(value = Rf_coerceVector(value, INTSXP));
-    ++NP;
-    if (Rf_length(value) != 1){
-      Rf_unprotect(NP);
-      Rf_error("value must be of length 1");
-    }
-    // Basically if the coercion produces NA the count is 0.
-    if (INTEGER(value)[0] == NA_INTEGER) break;
+    if (implicit_na_coercion(value, TYPEOF(x))) break;
+    Rf_protect(value = Rf_coerceVector(value, INTSXP)); ++NP;
     int val = Rf_asInteger(value);
     int *p_x = INTEGER(x);
     if (do_parallel){
@@ -44,14 +76,8 @@ R_xlen_t scalar_count(SEXP x, SEXP value, bool recursive){
     break;
   }
   case REALSXP: {
-    Rf_protect(value = Rf_coerceVector(value, REALSXP));
-    ++NP;
-    if (Rf_length(value) != 1){
-      Rf_unprotect(NP);
-      Rf_error("value must be of length 1");
-    }
-    // Basically if the coercion produces NA the count is 0.
-    if (REAL(value)[0] != REAL(value)[0]) break;
+    if (implicit_na_coercion(value, TYPEOF(x))) break;
+    Rf_protect(value = Rf_coerceVector(value, REALSXP)); ++NP;
     double val = Rf_asReal(value);
     double *p_x = REAL(x);
     if (do_parallel){
@@ -64,16 +90,9 @@ R_xlen_t scalar_count(SEXP x, SEXP value, bool recursive){
     break;
   }
   case STRSXP: {
-    Rf_protect(value = Rf_coerceVector(value, STRSXP));
-    ++NP;
-    if (Rf_length(value) != 1){
-      Rf_unprotect(NP);
-      Rf_error("value must be of length 1");
-    }
-    // Basically if the coercion produces NA the count is 0.
-    if (STRING_ELT(value, 0) == NA_STRING) break;
-    SEXP val = Rf_protect(Rf_asChar(value));
-    ++NP;
+    if (implicit_na_coercion(value, TYPEOF(x))) break;
+    Rf_protect(value = Rf_coerceVector(value, STRSXP)); ++NP;
+    SEXP val = Rf_protect(Rf_asChar(value)); ++NP;
     const SEXP *p_x = STRING_PTR_RO(x);
     if (do_parallel){
 #pragma omp parallel for simd num_threads(n_cores) reduction(+:count)
@@ -96,12 +115,6 @@ R_xlen_t scalar_count(SEXP x, SEXP value, bool recursive){
   default: {
     Rf_unprotect(NP);
     Rf_error("%s cannot handle an object of type %s", __func__, Rf_type2char(TYPEOF(x)));
-    // SEXP is_equal = Rf_protect(cpp11::package("base")["=="](x, value));
-    // ++NP;
-    // SEXP r_true = Rf_protect(Rf_ScalarLogical(true));
-    // ++NP;
-    // count = scalar_count(is_equal, r_true, true);
-    // break;
   }
   }
   Rf_unprotect(NP);
@@ -156,6 +169,10 @@ SEXP cpp_val_replace(SEXP x, SEXP value, SEXP replace, bool recursive){
   }
   case LGLSXP:
   case INTSXP: {
+    if (implicit_na_coercion(value, TYPEOF(x))){
+    out = x;
+    break;
+  }
     SEXP temp = Rf_protect(Rf_allocVector(INTSXP, 0)); ++NP;
     int *p_out = INTEGER(temp);
     Rf_protect(value = Rf_coerceVector(value, INTSXP)); ++NP;
@@ -181,6 +198,10 @@ SEXP cpp_val_replace(SEXP x, SEXP value, SEXP replace, bool recursive){
     break;
   }
   case REALSXP: {
+    if (implicit_na_coercion(value, TYPEOF(x))){
+    out = x;
+    break;
+  }
     if (is_int64(x)){
     SEXP temp = Rf_protect(Rf_allocVector(REALSXP, 0)); ++NP;
     long long *p_out = INTEGER64_PTR(temp);
@@ -253,6 +274,10 @@ SEXP cpp_val_replace(SEXP x, SEXP value, SEXP replace, bool recursive){
   break;
   }
   case STRSXP: {
+    if (implicit_na_coercion(value, TYPEOF(x))){
+    out = x;
+    break;
+  }
     Rf_protect(value = Rf_coerceVector(value, STRSXP)); ++NP;
     Rf_protect(replace = Rf_coerceVector(replace, STRSXP)); ++NP;
     SEXP val = Rf_protect(Rf_asChar(value)); ++NP;
@@ -297,86 +322,3 @@ SEXP cpp_val_replace(SEXP x, SEXP value, SEXP replace, bool recursive){
   Rf_unprotect(NP);
   return out;
 }
-// SEXP cpp_val_replace(SEXP x, SEXP value, SEXP replace, bool set){
-//
-//   //TO-DO: Make sure this can't work on ALTREP
-//
-//   int NP = 0;
-//   R_xlen_t n = Rf_xlength(x);
-//   if (Rf_length(value) != 1){
-//     Rf_error("value must be a vector of length 1");
-//   }
-//   if (Rf_length(replace) != 1){
-//     Rf_error("replace must be a vector of length 1");
-//   }
-//   if (Rf_isVectorList(x)){
-//     Rf_error("x must not be a list");
-//   }
-//   bool val_is_na = cpp_any_na(value, true);
-//   if (val_is_na && !cpp_any_na(x, true)){
-//     Rf_unprotect(NP);
-//     return x;
-//   }
-//   SEXP out;
-//   switch ( TYPEOF(x) ){
-//   case NILSXP: {
-//     out = Rf_protect(R_NilValue);
-//     ++NP;
-//     break;
-//   }
-//   case LGLSXP:
-//   case INTSXP: {
-//     Rf_protect(value = Rf_coerceVector(value, INTSXP));
-//     ++NP;
-//     Rf_protect(replace = Rf_coerceVector(replace, INTSXP));
-//     ++NP;
-//     int val = Rf_asInteger(value);
-//     int repl = Rf_asInteger(replace);
-//     int *p_x = INTEGER(x);
-//     out = Rf_protect(set ? x : Rf_duplicate(x));
-//     ++NP;
-//     int *p_out = INTEGER(out);
-//     for (R_xlen_t i = 0; i < n; ++i) if (p_x[i] == val) p_out[i] = repl;
-//     break;
-//   }
-//   case REALSXP: {
-//     Rf_protect(value = Rf_coerceVector(value, REALSXP));
-//     ++NP;
-//     Rf_protect(replace = Rf_coerceVector(replace, REALSXP));
-//     ++NP;
-//     double val = Rf_asReal(value);
-//     double repl = Rf_asReal(replace);
-//     double *p_x = REAL(x);
-//     out = Rf_protect(set ? x : Rf_duplicate(x));
-//     ++NP;
-//     double *p_out = REAL(out);
-//     if (val_is_na){
-//       for (R_xlen_t i = 0; i < n; ++i) if (p_x[i] != p_x[i]) p_out[i] = repl;
-//     } else {
-//       for (R_xlen_t i = 0; i < n; ++i) if (p_x[i] == val) p_out[i] = repl;
-//     }
-//     break;
-//   }
-//   case STRSXP: {
-//     Rf_protect(value = Rf_coerceVector(value, STRSXP));
-//     ++NP;
-//     Rf_protect(replace = Rf_coerceVector(replace, STRSXP));
-//     ++NP;
-//     SEXP val = Rf_protect(Rf_asChar(value));
-//     ++NP;
-//     SEXP repl = Rf_protect(Rf_asChar(replace));
-//     ++NP;
-//     const SEXP *p_x = STRING_PTR_RO(x);
-//     out = Rf_protect(set ? x : Rf_duplicate(x));
-//     ++NP;
-//     for (R_xlen_t i = 0; i < n; ++i) if (p_x[i] == val) SET_STRING_ELT(out, i, repl);
-//     break;
-//   }
-//   default: {
-//     Rf_unprotect(NP);
-//     Rf_error("%s cannot handle an object of type %s", __func__, Rf_type2char(TYPEOF(x)));
-//   }
-//   }
-//   Rf_unprotect(NP);
-//   return out;
-// }