diff --git a/DESCRIPTION b/DESCRIPTION index 45375b9..abb258f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,9 +14,10 @@ BugReports: https://github.com/dselivanov/LSHR/issues VignetteBuilder: knitr SystemRequirements: C++11 Depends: - Matrix, methods Imports: + Matrix, + bit64, data.table(>= 1.9.10), magrittr (>= 1.5), Rcpp (>= 0.10.3), diff --git a/NAMESPACE b/NAMESPACE index 7a96e8b..c3b1ad8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ import(futile.logger) import(magrittr) import(methods) importFrom(Rcpp,evalCpp) +importFrom(bit64,integer64) importFrom(ggplot2,aes) importFrom(ggplot2,geom_line) importFrom(ggplot2,ggplot) diff --git a/R/LSHR.R b/R/LSHR.R index ba17116..57603d6 100644 --- a/R/LSHR.R +++ b/R/LSHR.R @@ -7,6 +7,7 @@ #' @import magrittr #' @import methods #' @import Matrix +#' @importFrom bit64 integer64 #' @import futile.logger #' @importFrom Rcpp evalCpp #' @importFrom ggplot2 ggplot geom_line aes scale_color_discrete diff --git a/R/RcppExports.R b/R/RcppExports.R index c1ab7c8..6226e65 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -2,26 +2,26 @@ # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 hashfun_1 <- function(vec) { - .Call('LSHR_hashfun_1', PACKAGE = 'LSHR', vec) + .Call('_LSHR_hashfun_1', PACKAGE = 'LSHR', vec) } hashfun_2 <- function(vec) { - .Call('LSHR_hashfun_2', PACKAGE = 'LSHR', vec) + .Call('_LSHR_hashfun_2', PACKAGE = 'LSHR', vec) } get_minhash_matrix <- function(unique_shingles_length, hashfun_number, seed) { - .Call('LSHR_get_minhash_matrix', PACKAGE = 'LSHR', unique_shingles_length, hashfun_number, seed) + .Call('_LSHR_get_minhash_matrix', PACKAGE = 'LSHR', unique_shingles_length, hashfun_number, seed) } sign_bit <- function(x) { - .Call('LSHR_sign_bit', PACKAGE = 'LSHR', x) + .Call('_LSHR_sign_bit', PACKAGE = 'LSHR', x) } hash_signatures <- function(m, bands_number, rows_per_band) { - .Call('LSHR_hash_signatures', PACKAGE = 'LSHR', m, bands_number, rows_per_band) + .Call('_LSHR_hash_signatures', PACKAGE = 'LSHR', m, bands_number, rows_per_band) } project_spmat <- function(m, n, hash_fun_id_offest, n_threads = 0L) { - .Call('LSHR_project_spmat', PACKAGE = 'LSHR', m, n, hash_fun_id_offest, n_threads) + .Call('_LSHR_project_spmat', PACKAGE = 'LSHR', m, n, hash_fun_id_offest, n_threads) } diff --git a/R/cosine.R b/R/cosine.R index 223ab44..450453f 100644 --- a/R/cosine.R +++ b/R/cosine.R @@ -13,8 +13,8 @@ get_similar_pairs_cosine <- function(X, bands_number, rows_per_band, seed = 1L, verbose = FALSE, mc.cores = 1, n_band_join = bands_number, ...) { lsh_start = Sys.time() - PACK_BITS = 32L - stopifnot(rows_per_band <= 32L) + PACK_BITS = 64L + stopifnot(rows_per_band <= PACK_BITS) if(inherits(X, "sparseMatrix")) if(!inherits(X, "dgRMatrix")) { diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 5fda54c..60e1024 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -7,7 +7,7 @@ using namespace Rcpp; // hashfun_1 Rcpp::IntegerVector hashfun_1(IntegerVector vec); -RcppExport SEXP LSHR_hashfun_1(SEXP vecSEXP) { +RcppExport SEXP _LSHR_hashfun_1(SEXP vecSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -18,7 +18,7 @@ END_RCPP } // hashfun_2 Rcpp::IntegerVector hashfun_2(IntegerVector vec); -RcppExport SEXP LSHR_hashfun_2(SEXP vecSEXP) { +RcppExport SEXP _LSHR_hashfun_2(SEXP vecSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -29,7 +29,7 @@ END_RCPP } // get_minhash_matrix IntegerVector get_minhash_matrix(uint32_t unique_shingles_length, uint32_t hashfun_number, uint32_t seed); -RcppExport SEXP LSHR_get_minhash_matrix(SEXP unique_shingles_lengthSEXP, SEXP hashfun_numberSEXP, SEXP seedSEXP) { +RcppExport SEXP _LSHR_get_minhash_matrix(SEXP unique_shingles_lengthSEXP, SEXP hashfun_numberSEXP, SEXP seedSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -42,7 +42,7 @@ END_RCPP } // sign_bit IntegerMatrix sign_bit(NumericMatrix x); -RcppExport SEXP LSHR_sign_bit(SEXP xSEXP) { +RcppExport SEXP _LSHR_sign_bit(SEXP xSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -53,7 +53,7 @@ END_RCPP } // hash_signatures IntegerMatrix hash_signatures(IntegerMatrix m, int bands_number, int rows_per_band); -RcppExport SEXP LSHR_hash_signatures(SEXP mSEXP, SEXP bands_numberSEXP, SEXP rows_per_bandSEXP) { +RcppExport SEXP _LSHR_hash_signatures(SEXP mSEXP, SEXP bands_numberSEXP, SEXP rows_per_bandSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -65,8 +65,8 @@ BEGIN_RCPP END_RCPP } // project_spmat -IntegerVector project_spmat(const S4& m, int n, int hash_fun_id_offest, int n_threads); -RcppExport SEXP LSHR_project_spmat(SEXP mSEXP, SEXP nSEXP, SEXP hash_fun_id_offestSEXP, SEXP n_threadsSEXP) { +SEXP project_spmat(const S4& m, int n, int hash_fun_id_offest, int n_threads); +RcppExport SEXP _LSHR_project_spmat(SEXP mSEXP, SEXP nSEXP, SEXP hash_fun_id_offestSEXP, SEXP n_threadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -78,3 +78,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } + +static const R_CallMethodDef CallEntries[] = { + {"_LSHR_hashfun_1", (DL_FUNC) &_LSHR_hashfun_1, 1}, + {"_LSHR_hashfun_2", (DL_FUNC) &_LSHR_hashfun_2, 1}, + {"_LSHR_get_minhash_matrix", (DL_FUNC) &_LSHR_get_minhash_matrix, 3}, + {"_LSHR_sign_bit", (DL_FUNC) &_LSHR_sign_bit, 1}, + {"_LSHR_hash_signatures", (DL_FUNC) &_LSHR_hash_signatures, 3}, + {"_LSHR_project_spmat", (DL_FUNC) &_LSHR_project_spmat, 4}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_LSHR(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/random_projection.cpp b/src/random_projection.cpp index 2f3ca6b..cdeb34f 100644 --- a/src/random_projection.cpp +++ b/src/random_projection.cpp @@ -29,7 +29,7 @@ int omp_thread_count() { } // [[Rcpp::export]] -IntegerVector project_spmat(const S4 &m, int n, int hash_fun_id_offest, int n_threads = 0) { +SEXP project_spmat(const S4 &m, int n, int hash_fun_id_offest, int n_threads = 0) { int num_threads = n_threads; if(num_threads == 0) num_threads = omp_thread_count(); IntegerVector dims = m.slot("Dim"); @@ -42,8 +42,9 @@ IntegerVector project_spmat(const S4 &m, int n, int hash_fun_id_offest, int n_th NumericVector XX = m.slot("x"); double *X = XX.begin(); - IntegerVector res(N); - int *res_ptr = res.begin(); + // IntegerVector res(N); + NumericVector res(N); + unsigned long long *res_ptr = (unsigned long long*)dataptr(res); #ifdef _OPENMP #pragma omp parallel for num_threads(num_threads) @@ -52,7 +53,7 @@ IntegerVector project_spmat(const S4 &m, int n, int hash_fun_id_offest, int n_th uint32_t h1, h2; int p1 = P[i]; int p2 = P[i + 1]; - vector row(32); + vector row(64); float x; for(int k = p1; k < p2; k++) { int j = J[k]; @@ -67,15 +68,16 @@ IntegerVector project_spmat(const S4 &m, int n, int hash_fun_id_offest, int n_th row[hh] += ((int)h * x); } } - std::bitset<32> bitrow; + std::bitset<64> bitrow; for(int hh = 0; hh < n; hh++) { if(row[hh] < 0) bitrow[hh] = 0; else bitrow[hh] = 1; } - res_ptr[i] = bitrow.to_ulong(); + res_ptr[i] = bitrow.to_ullong(); } + res.attr("class") = "integer64"; return res; }