From 8dc7e1498115d1db6ca83b76c33e76ab006125a4 Mon Sep 17 00:00:00 2001 From: Nick Christofides <118103879+NicChr@users.noreply.github.com> Date: Thu, 19 Sep 2024 17:05:49 +0100 Subject: [PATCH] Further bit64 support. --- R/cpp11.R | 4 + src/cheapr_cpp.h | 3 + src/cpp11.cpp | 8 ++ src/gcd.cpp | 242 ++++++++++++++++++++++++++++---------- src/nas.cpp | 12 +- src/utils.cpp | 57 ++++++--- src/which.cpp | 4 +- tests/testthat/test-gcd.R | 10 +- 8 files changed, 247 insertions(+), 93 deletions(-) diff --git a/R/cpp11.R b/R/cpp11.R index 05f2dab..b82dbd7 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -232,6 +232,10 @@ cpp_int64_to_double <- function(x) { .Call(`_cheapr_cpp_int64_to_double`, x) } +cpp_numeric_to_int64 <- function(x) { + .Call(`_cheapr_cpp_numeric_to_int64`, x) +} + cpp_format_double_as_int64 <- function(x) { .Call(`_cheapr_cpp_format_double_as_int64`, x) } diff --git a/src/cheapr_cpp.h b/src/cheapr_cpp.h index 06048d2..78ff0a1 100644 --- a/src/cheapr_cpp.h +++ b/src/cheapr_cpp.h @@ -14,6 +14,9 @@ #ifndef VECTOR_PTR_RO #define VECTOR_PTR_RO(x) ((const SEXP*) DATAPTR_RO(x)) #endif +#ifndef INTEGER64_PTR +#define INTEGER64_PTR(x) ((long long*) REAL(x)) +#endif #ifdef _OPENMP #include diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 8a39a3b..7d108c7 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -412,6 +412,13 @@ extern "C" SEXP _cheapr_cpp_int64_to_double(SEXP x) { END_CPP11 } // utils.cpp +SEXP cpp_numeric_to_int64(SEXP x); +extern "C" SEXP _cheapr_cpp_numeric_to_int64(SEXP x) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_numeric_to_int64(cpp11::as_cpp>(x))); + END_CPP11 +} +// utils.cpp SEXP cpp_format_double_as_int64(SEXP x); extern "C" SEXP _cheapr_cpp_format_double_as_int64(SEXP x) { BEGIN_CPP11 @@ -479,6 +486,7 @@ static const R_CallMethodDef CallEntries[] = { {"_cheapr_cpp_matrix_row_na_counts", (DL_FUNC) &_cheapr_cpp_matrix_row_na_counts, 1}, {"_cheapr_cpp_new_list", (DL_FUNC) &_cheapr_cpp_new_list, 2}, {"_cheapr_cpp_num_na", (DL_FUNC) &_cheapr_cpp_num_na, 2}, + {"_cheapr_cpp_numeric_to_int64", (DL_FUNC) &_cheapr_cpp_numeric_to_int64, 1}, {"_cheapr_cpp_r_unnested_length", (DL_FUNC) &_cheapr_cpp_r_unnested_length, 1}, {"_cheapr_cpp_row_na_counts", (DL_FUNC) &_cheapr_cpp_row_na_counts, 1}, {"_cheapr_cpp_sequence", (DL_FUNC) &_cheapr_cpp_sequence, 3}, diff --git a/src/gcd.cpp b/src/gcd.cpp index 624a1e0..726183d 100644 --- a/src/gcd.cpp +++ b/src/gcd.cpp @@ -5,6 +5,11 @@ template T cpp_sign(T x) { return (x > 0) - (x < 0); } +#define CHEAPR_INT_TO_INT64(x) ((long long int) (x == NA_INTEGER ? NA_INTEGER64 : x)) +#define CHEAPR_DBL_TO_INT64(x) ((long long int) (x != x ? NA_INTEGER64 : x)) +#define CHEAPR_INT64_TO_INT(x) ((int) (x == NA_INTEGER64 ? NA_INTEGER : x)) +#define CHEAPR_INT64_TO_DBL(x) ((double) (x == NA_INTEGER64 ? NA_REAL : x)) + [[cpp11::register]] double cpp_gcd2(double x, double y, double tol, bool na_rm){ double zero = 0.0; @@ -68,6 +73,41 @@ int cpp_gcd2_int(int x, int y, bool na_rm){ return x; } +long long int cpp_gcd2_int64(long long int x, long long int y, bool na_rm){ + long long int zero = 0; + bool has_na = ( x == NA_INTEGER64 || y == NA_INTEGER64 ); + if (!na_rm && has_na){ + return NA_INTEGER64; + } + if (na_rm && has_na){ + if (x == NA_INTEGER64){ + return y; + } else { + return x; + } + } + // GCD(0,0)=0 + if (x == zero && y == zero){ + return zero; + } + // GCD(a,0)=a + if (x == zero){ + return y; + } + // GCD(a,0)=a + if (y == zero){ + return x; + } + long long int r; + // Taken from number theory lecture notes + while(y != zero){ + r = x % y; + x = y; + y = r; + } + return x; +} + [[cpp11::register]] double cpp_lcm2(double x, double y, double tol, bool na_rm){ if (na_rm && ( !(x == x) || !(y == y) )){ @@ -79,6 +119,29 @@ double cpp_lcm2(double x, double y, double tol, bool na_rm){ return ( std::fabs(x) / cpp_gcd2(x, y, tol, true) ) * std::fabs(y); } +long long int cpp_lcm2_int64(long long int x, long long int y, bool na_rm){ + int num_nas = (x == NA_INTEGER64) + (y == NA_INTEGER64); + if ( num_nas >= 1 ){ + if (na_rm && num_nas == 1){ + return (x == NA_INTEGER64 ? y : x); + } else { + return NA_INTEGER64; + } + } + if (x == 0 && y == 0){ + return 0; + } + // 64-bit integer overflow check + // Make sure not to divide by zero! + + long long res = std::llabs(x) / cpp_gcd2_int64(x, y, false); + if (y != 0 && (std::llabs(res) > (LLONG_MAX / std::llabs(y)))){ + Rf_error("64-bit integer overflow, please use doubles"); + } else { + return (res * std::llabs(y)); + } +} + double cpp_lcm2_int(int x, int y, bool na_rm){ int num_nas = (x == NA_INTEGER) + (y == NA_INTEGER); if ( num_nas >= 1 ){ @@ -100,56 +163,82 @@ SEXP cpp_gcd(SEXP x, double tol, bool na_rm, bool break_early, bool round){ Rf_error("tol must be >= 0 and < 1"); } int NP = 0; - int n = Rf_length(x); + R_xlen_t n = Rf_xlength(x); - if (Rf_isReal(x) && Rf_inherits(x, "integer64")){ - Rf_protect(x = cpp_int64_to_double(x)); ++NP; - } switch(TYPEOF(x)){ case LGLSXP: case INTSXP: { int *p_x = INTEGER(x); - SEXP out = Rf_protect(Rf_allocVector(INTSXP, std::min(n, 1))); ++NP; - int *p_out = INTEGER(out); - int gcd = p_x[0]; - double agcd; // A double because you cant do abs(NA_integer_) - for (int i = 1; i < n; ++i) { - gcd = cpp_gcd2_int(gcd, p_x[i], na_rm); - agcd = std::fabs(gcd); - if ((agcd > 0.0 && agcd <= 1.0) || (!na_rm && (gcd == NA_INTEGER))){ - break; + SEXP out = Rf_protect(Rf_allocVector(INTSXP, n == 0 ? 0 : 1)); ++NP; + if (n > 0){ + int gcd = p_x[0]; + int agcd; + for (R_xlen_t i = 1; i < n; ++i) { + gcd = cpp_gcd2_int(gcd, p_x[i], na_rm); + if (gcd == NA_INTEGER){ + if (!na_rm) break; + } else { + agcd = std::abs(gcd); + if (agcd > 0 && agcd == 1){ + break; + } + } } + INTEGER(out)[0] = gcd; } - p_out[0] = gcd; Rf_unprotect(NP); return out; } default: { + if (is_int64(x)){ + SEXP out = Rf_protect(Rf_allocVector(REALSXP, n == 0 ? 0 : 1)); ++NP; + if (n > 0){ + long long int *p_x = INTEGER64_PTR(x); + long long int gcd = p_x[0]; + long long int agcd; + for (R_xlen_t i = 1; i < n; ++i) { + gcd = cpp_gcd2_int64(gcd, p_x[i], na_rm); + if (gcd == NA_INTEGER64){ + if (!na_rm) break; + } else { + agcd = std::abs(gcd); + if (agcd > 0 && agcd == 1){ + break; + } + } + } + REAL(out)[0] = CHEAPR_INT64_TO_DBL(gcd); + } + Rf_unprotect(NP); + return out; + } else { double *p_x = REAL(x); - SEXP out = Rf_protect(Rf_allocVector(REALSXP, std::min(n, 1))); ++NP; - double *p_out = REAL(out); - double gcd = p_x[0]; - double agcd; - for (int i = 1; i < n; ++i) { - gcd = cpp_gcd2(gcd, p_x[i], tol, na_rm); - agcd = std::fabs(gcd); - if ((!na_rm && !(gcd == gcd))){ - break; + SEXP out = Rf_protect(Rf_allocVector(REALSXP, n == 0 ? 0 : 1)); ++NP; + if (n > 0){ + double gcd = p_x[0]; + double agcd; + for (R_xlen_t i = 1; i < n; ++i) { + gcd = cpp_gcd2(gcd, p_x[i], tol, na_rm); + agcd = std::fabs(gcd); + if ((!na_rm && !(gcd == gcd))){ + break; + } + if (break_early && agcd > 0.0 && agcd < (tol + tol)){ + gcd = tol * cpp_sign(gcd); + break; + } } - if (break_early && agcd > 0.0 && agcd < (tol + tol)){ - gcd = tol * cpp_sign(gcd); - break; + if (round && tol > 0){ + double factor = std::pow(10, std::ceil(std::fabs(std::log10(tol))) + 1); + gcd = std::round(gcd * factor) / factor; } + REAL(out)[0] = gcd; } - if (round && tol > 0){ - double factor = std::pow(10, std::ceil(std::fabs(std::log10(tol))) + 1); - gcd = std::round(gcd * factor) / factor; - } - p_out[0] = gcd; Rf_unprotect(NP); return out; } } + } } // Lowest common multiple using GCD Euclidean algorithm @@ -159,60 +248,83 @@ SEXP cpp_lcm(SEXP x, double tol, bool na_rm){ if (tol < 0 || tol >= 1){ Rf_error("tol must be >= 0 and < 1"); } - int n = Rf_length(x); + R_xlen_t n = Rf_xlength(x); int NP = 0; - if (Rf_isReal(x) && Rf_inherits(x, "integer64")){ - Rf_protect(x = cpp_int64_to_double(x)); ++NP; - } switch(TYPEOF(x)){ case LGLSXP: case INTSXP: { int *p_x = INTEGER(x); - SEXP out = Rf_protect(Rf_allocVector(REALSXP, std::min(n, 1))); ++NP; - double *p_out = REAL(out); - double lcm = p_x[0]; - if (p_x[0] == NA_INTEGER){ - lcm = NA_REAL; - } - int lcm_int = p_x[0]; - double int_max = integer_max_; - for (int i = 1; i < n; ++i) { - if (!na_rm && !(lcm == lcm)){ - lcm = NA_REAL; - break; + + SEXP out; + + if (n > 0){ + + // Initialise first value as lcm + long long int lcm = CHEAPR_INT_TO_INT64(p_x[0]); + + for (R_xlen_t i = 1; i < n; ++i) { + if (!na_rm && lcm == NA_INTEGER64){ + break; + } + lcm = cpp_lcm2_int64(lcm, CHEAPR_INT_TO_INT64(p_x[i]), na_rm); } - lcm = cpp_lcm2_int(lcm_int, p_x[i], na_rm); - if (std::fabs(lcm) > int_max){ - Rf_warning("Integer overflow, returning NA"); - lcm = NA_REAL; - break; + bool is_short = lcm == NA_INTEGER64 || (std::llabs(lcm) <= integer_max_); + out = Rf_protect(Rf_allocVector(is_short ? INTSXP : REALSXP, 1)); ++NP; + if (is_short){ + int temp = CHEAPR_INT64_TO_INT(lcm); + INTEGER(out)[0] = temp; + } else { + double temp = CHEAPR_INT64_TO_DBL(lcm); + REAL(out)[0] = temp; } - lcm_int = (lcm == lcm) ? lcm : NA_INTEGER; + } else { + out = Rf_protect(Rf_allocVector(INTSXP, 0)); ++NP; } - p_out[0] = lcm; - Rf_protect(out = Rf_coerceVector(out, INTSXP)); ++NP; Rf_unprotect(NP); return out; } default: { + if (is_int64(x)){ + long long *p_x = INTEGER64_PTR(x); + + SEXP out = Rf_protect(Rf_allocVector(REALSXP, n == 0 ? 0 : 1)); ++NP; + + if (n > 0){ + // Initialise first value as lcm + long long int lcm = p_x[0]; + + for (R_xlen_t i = 1; i < n; ++i) { + if (!na_rm && lcm == NA_INTEGER64){ + break; + } + lcm = cpp_lcm2_int64(lcm, p_x[i], na_rm); + } + double temp = CHEAPR_INT64_TO_DBL(lcm); + REAL(out)[0] = temp; + } + Rf_unprotect(NP); + return out; + } else { double *p_x = REAL(x); - SEXP out = Rf_protect(Rf_allocVector(REALSXP, std::min(n, 1))); ++NP; - double *p_out = REAL(out); - double lcm = p_x[0]; - for (int i = 1; i < n; ++i) { - if (!na_rm && !(lcm == lcm)){ - lcm = NA_REAL; - break; + SEXP out = Rf_protect(Rf_allocVector(REALSXP, n == 0 ? 0 : 1)); ++NP; + if (n > 0){ + double lcm = p_x[0]; + for (R_xlen_t i = 1; i < n; ++i) { + if (!na_rm && !(lcm == lcm)){ + lcm = NA_REAL; + break; + } + lcm = cpp_lcm2(lcm, p_x[i], tol, na_rm); + if (lcm == R_PosInf || lcm == R_NegInf) break; } - lcm = cpp_lcm2(lcm, p_x[i], tol, na_rm); - if (lcm == R_PosInf || lcm == R_NegInf) break; + REAL(out)[0] = lcm; } - p_out[0] = lcm; Rf_unprotect(NP); return out; } } + } } // Vectorised binary gcd diff --git a/src/nas.cpp b/src/nas.cpp index a1ab3b7..65bd2ae 100644 --- a/src/nas.cpp +++ b/src/nas.cpp @@ -51,7 +51,7 @@ R_xlen_t na_count(SEXP x, bool recursive){ } case REALSXP: { if (is_int64(x)){ - long long *p_x = (long long *)REAL(x); + long long *p_x = INTEGER64_PTR(x); if (do_parallel){ #pragma omp parallel for simd num_threads(n_cores) reduction(+:count) CHEAPR_COUNT_NA(cheapr_is_na_int64); @@ -141,7 +141,7 @@ bool cpp_any_na(SEXP x, bool recursive){ } case REALSXP: { if (is_int64(x)){ - long long *p_x = (long long *) REAL(x); + long long *p_x = INTEGER64_PTR(x); CHEAPR_ANY_NA(cheapr_is_na_int64); } else { double *p_x = REAL(x); @@ -203,7 +203,7 @@ bool cpp_all_na(SEXP x, bool return_true_on_empty, bool recursive){ } case REALSXP: { if (is_int64(x)){ - long long *p_x = (long long *) REAL(x); + long long *p_x = INTEGER64_PTR(x); CHEAPR_ALL_NA(cheapr_is_na_int64); } else { double *p_x = REAL(x); @@ -277,7 +277,7 @@ SEXP cpp_is_na(SEXP x){ out = Rf_protect(Rf_allocVector(LGLSXP, n)); int *p_out = LOGICAL(out); if (is_int64(x)){ - long long *p_x = (long long *) REAL(x); + long long *p_x = INTEGER64_PTR(x); if (n_cores > 1){ OMP_PARALLEL_FOR_SIMD CHEAPR_VEC_IS_NA(cheapr_is_na_int64); @@ -725,7 +725,7 @@ SEXP cpp_matrix_row_na_counts(SEXP x){ } case REALSXP: { if (is_int64(x)){ - long long *p_x = (long long *) REAL(x); + long long *p_x = INTEGER64_PTR(x); #pragma omp for for (R_xlen_t i = 0; i < n; ++i){ #pragma omp atomic @@ -797,7 +797,7 @@ SEXP cpp_matrix_col_na_counts(SEXP x){ } case REALSXP: { if (is_int64(x)){ - long long *p_x = (long long *) REAL(x); + long long *p_x = INTEGER64_PTR(x); #pragma omp for for (R_xlen_t i = 0; i < n; ++i){ #pragma omp atomic diff --git a/src/utils.cpp b/src/utils.cpp index 4600428..9d881ef 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -69,6 +69,8 @@ bool is_int64(SEXP x){ return Rf_isReal(x) && Rf_inherits(x, "integer64"); } +// We almost never want to convert back to 32-bit int + [[cpp11::register]] SEXP cpp_int64_to_double(SEXP x){ if (!is_int64(x)){ @@ -79,7 +81,7 @@ SEXP cpp_int64_to_double(SEXP x){ SEXP out = Rf_protect(Rf_allocVector(REALSXP, n)); double *p_out = REAL(out); - long long *p_x = (long long *) REAL(x); + long long *p_x = INTEGER64_PTR(x); for (R_xlen_t i = 0; i < n; ++i){ p_out[i] = cheapr_is_na_int64(p_x[i]) ? NA_REAL : (double) p_x[i]; @@ -88,21 +90,44 @@ SEXP cpp_int64_to_double(SEXP x){ return out; } -// The reverse operation but don't need this -// SEXP cpp_double_to_int64(SEXP x){ -// R_xlen_t n = Rf_xlength(x); -// -// SEXP out = Rf_protect(Rf_allocVector(REALSXP, n)); -// long long *p_out = (long long *) REAL(out); -// double *p_x = REAL(x); -// -// for (R_xlen_t i = 0; i < n; ++i){ -// p_out[i] = cheapr_is_na_dbl(p_x[i]) ? NA_INTEGER64 : (long long) p_x[i]; -// } -// Rf_classgets(out, Rf_mkString("integer64")); -// Rf_unprotect(1); -// return out; -// } +// The reverse operation + +[[cpp11::register]] +SEXP cpp_numeric_to_int64(SEXP x){ + + if (is_int64(x)){ + return x; + } + + R_xlen_t n = Rf_xlength(x); + + switch (TYPEOF(x)){ + case INTSXP: { + SEXP out = Rf_protect(Rf_allocVector(REALSXP, n)); + long long *p_out = INTEGER64_PTR(out); + int *p_x = INTEGER(x); + + for (R_xlen_t i = 0; i < n; ++i){ + p_out[i] = cheapr_is_na_int(p_x[i]) ? NA_INTEGER64 : (long long) p_x[i]; + } + Rf_classgets(out, Rf_mkString("integer64")); + Rf_unprotect(1); + return out; + } + default: { + SEXP out = Rf_protect(Rf_allocVector(REALSXP, n)); + long long *p_out = INTEGER64_PTR(out); + double *p_x = REAL(x); + + for (R_xlen_t i = 0; i < n; ++i){ + p_out[i] = cheapr_is_na_dbl(p_x[i]) ? NA_INTEGER64 : (long long) p_x[i]; + } + Rf_classgets(out, Rf_mkString("integer64")); + Rf_unprotect(1); + return out; + } + } +} // Found here stackoverflow.com/questions/347949 template diff --git a/src/which.cpp b/src/which.cpp index 2379755..256ff51 100644 --- a/src/which.cpp +++ b/src/which.cpp @@ -229,7 +229,7 @@ SEXP cpp_which_na(SEXP x){ case REALSXP: { R_xlen_t count = na_count(x, true); if (Rf_inherits(x, "integer64")){ - long long *p_x = (long long *) REAL(x); + long long *p_x = INTEGER64_PTR(x); if (is_short){ int out_size = count; SEXP out = Rf_protect(Rf_allocVector(INTSXP, out_size)); @@ -391,7 +391,7 @@ SEXP cpp_which_not_na(SEXP x){ case REALSXP: { R_xlen_t count = na_count(x, true); if (Rf_inherits(x, "integer64")){ - long long *p_x = (long long *) REAL(x); + long long *p_x = INTEGER64_PTR(x); if (is_short){ int out_size = n - count; SEXP out = Rf_protect(Rf_allocVector(INTSXP, out_size)); diff --git a/tests/testthat/test-gcd.R b/tests/testthat/test-gcd.R index 65d8012..0fd4d96 100644 --- a/tests/testthat/test-gcd.R +++ b/tests/testthat/test-gcd.R @@ -151,10 +151,12 @@ test_that("signs", { }) test_that("Overflow", { - expect_equal(scm(1:22), 232792560) - expect_equal(scm(-(1:22)), -232792560) - expect_warning(scm(1:23)) - expect_warning(scm(-(1:23))) + expect_equal(scm(1:30), 2329089562800) + expect_equal(scm(-(1:30)), -2329089562800) + expect_error(scm(1:50)) + expect_error(scm(-(1:50))) + expect_equal(scm(as.double(1:50)), 18523376382441352270484866200) + expect_equal(scm(-as.double(1:50)), -18523376382441352270484866200) }) test_that("Binary gcd and scm", {