Skip to content

Commit

Permalink
initial support for 64-bit signatures in random projections - related…
Browse files Browse the repository at this point in the history
… to #12. Specialized 32-bit implementation to be explored.
  • Loading branch information
dselivanov committed Sep 25, 2017
1 parent f1d5c95 commit 19604d5
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 22 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ BugReports: https://github.com/dselivanov/LSHR/issues
VignetteBuilder: knitr
SystemRequirements: C++11
Depends:
Matrix,
methods
Imports:
Matrix,
bit64,
data.table(>= 1.9.10),
magrittr (>= 1.5),
Rcpp (>= 0.10.3),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import(futile.logger)
import(magrittr)
import(methods)
importFrom(Rcpp,evalCpp)
importFrom(bit64,integer64)
importFrom(ggplot2,aes)
importFrom(ggplot2,geom_line)
importFrom(ggplot2,ggplot)
Expand Down
1 change: 1 addition & 0 deletions R/LSHR.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#' @import magrittr
#' @import methods
#' @import Matrix
#' @importFrom bit64 integer64
#' @import futile.logger
#' @importFrom Rcpp evalCpp
#' @importFrom ggplot2 ggplot geom_line aes scale_color_discrete
Expand Down
12 changes: 6 additions & 6 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,26 @@
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

# Thin R wrapper around the compiled routine registered as `_LSHR_hashfun_1`.
# `vec` is forwarded unchanged to the native code and the value produced by
# `.Call()` is returned.
hashfun_1 <- function(vec) {
    # Only the underscore-prefixed symbol is invoked: the additional call to the
    # unprefixed 'LSHR_hashfun_1' symbol predates the rename and does not match
    # the name registered in src/RcppExports.cpp.
    .Call('_LSHR_hashfun_1', PACKAGE = 'LSHR', vec)
}

# Thin R wrapper around the compiled routine registered as `_LSHR_hashfun_2`.
# `vec` is forwarded unchanged to the native code and the value produced by
# `.Call()` is returned.
hashfun_2 <- function(vec) {
    # Only the underscore-prefixed symbol is invoked: the additional call to the
    # unprefixed 'LSHR_hashfun_2' symbol predates the rename and does not match
    # the name registered in src/RcppExports.cpp.
    .Call('_LSHR_hashfun_2', PACKAGE = 'LSHR', vec)
}

# Thin R wrapper around the compiled routine registered as
# `_LSHR_get_minhash_matrix`. The three arguments are forwarded, in order, to
# the native code (declared there as uint32_t values) and the value produced by
# `.Call()` is returned.
get_minhash_matrix <- function(unique_shingles_length, hashfun_number, seed) {
    # Only the underscore-prefixed symbol is invoked: the additional call to the
    # unprefixed 'LSHR_get_minhash_matrix' symbol predates the rename and does
    # not match the name registered in src/RcppExports.cpp.
    .Call('_LSHR_get_minhash_matrix', PACKAGE = 'LSHR', unique_shingles_length, hashfun_number, seed)
}

# Thin R wrapper around the compiled routine registered as `_LSHR_sign_bit`.
# `x` is forwarded unchanged to the native code (declared there as taking a
# NumericMatrix) and the value produced by `.Call()` is returned.
sign_bit <- function(x) {
    # Only the underscore-prefixed symbol is invoked: the additional call to the
    # unprefixed 'LSHR_sign_bit' symbol predates the rename and does not match
    # the name registered in src/RcppExports.cpp.
    .Call('_LSHR_sign_bit', PACKAGE = 'LSHR', x)
}

# Thin R wrapper around the compiled routine registered as
# `_LSHR_hash_signatures`. `m`, `bands_number` and `rows_per_band` are
# forwarded, in order, to the native code and the value produced by `.Call()`
# is returned.
hash_signatures <- function(m, bands_number, rows_per_band) {
    # Only the underscore-prefixed symbol is invoked: the additional call to the
    # unprefixed 'LSHR_hash_signatures' symbol predates the rename and does not
    # match the name registered in src/RcppExports.cpp.
    .Call('_LSHR_hash_signatures', PACKAGE = 'LSHR', m, bands_number, rows_per_band)
}

# Thin R wrapper around the compiled routine registered as
# `_LSHR_project_spmat`. `m`, `n`, `hash_fun_id_offest` and `n_threads`
# (default 0L) are forwarded, in order, to the native code and the value
# produced by `.Call()` is returned.
project_spmat <- function(m, n, hash_fun_id_offest, n_threads = 0L) {
    # Only the underscore-prefixed symbol is invoked: the additional call to the
    # unprefixed 'LSHR_project_spmat' symbol predates the rename and does not
    # match the name registered in src/RcppExports.cpp.
    .Call('_LSHR_project_spmat', PACKAGE = 'LSHR', m, n, hash_fun_id_offest, n_threads)
}

4 changes: 2 additions & 2 deletions R/cosine.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
get_similar_pairs_cosine <- function(X, bands_number, rows_per_band, seed = 1L, verbose = FALSE,
mc.cores = 1, n_band_join = bands_number, ...) {
lsh_start = Sys.time()
PACK_BITS = 32L
stopifnot(rows_per_band <= 32L)
PACK_BITS = 64L
stopifnot(rows_per_band <= PACK_BITS)

if(inherits(X, "sparseMatrix"))
if(!inherits(X, "dgRMatrix")) {
Expand Down
29 changes: 22 additions & 7 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ using namespace Rcpp;

// hashfun_1
Rcpp::IntegerVector hashfun_1(IntegerVector vec);
RcppExport SEXP LSHR_hashfun_1(SEXP vecSEXP) {
RcppExport SEXP _LSHR_hashfun_1(SEXP vecSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand All @@ -18,7 +18,7 @@ END_RCPP
}
// hashfun_2
Rcpp::IntegerVector hashfun_2(IntegerVector vec);
RcppExport SEXP LSHR_hashfun_2(SEXP vecSEXP) {
RcppExport SEXP _LSHR_hashfun_2(SEXP vecSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand All @@ -29,7 +29,7 @@ END_RCPP
}
// get_minhash_matrix
IntegerVector get_minhash_matrix(uint32_t unique_shingles_length, uint32_t hashfun_number, uint32_t seed);
RcppExport SEXP LSHR_get_minhash_matrix(SEXP unique_shingles_lengthSEXP, SEXP hashfun_numberSEXP, SEXP seedSEXP) {
RcppExport SEXP _LSHR_get_minhash_matrix(SEXP unique_shingles_lengthSEXP, SEXP hashfun_numberSEXP, SEXP seedSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand All @@ -42,7 +42,7 @@ END_RCPP
}
// sign_bit
IntegerMatrix sign_bit(NumericMatrix x);
RcppExport SEXP LSHR_sign_bit(SEXP xSEXP) {
RcppExport SEXP _LSHR_sign_bit(SEXP xSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand All @@ -53,7 +53,7 @@ END_RCPP
}
// hash_signatures
IntegerMatrix hash_signatures(IntegerMatrix m, int bands_number, int rows_per_band);
RcppExport SEXP LSHR_hash_signatures(SEXP mSEXP, SEXP bands_numberSEXP, SEXP rows_per_bandSEXP) {
RcppExport SEXP _LSHR_hash_signatures(SEXP mSEXP, SEXP bands_numberSEXP, SEXP rows_per_bandSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand All @@ -65,8 +65,8 @@ BEGIN_RCPP
END_RCPP
}
// project_spmat
IntegerVector project_spmat(const S4& m, int n, int hash_fun_id_offest, int n_threads);
RcppExport SEXP LSHR_project_spmat(SEXP mSEXP, SEXP nSEXP, SEXP hash_fun_id_offestSEXP, SEXP n_threadsSEXP) {
SEXP project_spmat(const S4& m, int n, int hash_fun_id_offest, int n_threads);
RcppExport SEXP _LSHR_project_spmat(SEXP mSEXP, SEXP nSEXP, SEXP hash_fun_id_offestSEXP, SEXP n_threadsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand All @@ -78,3 +78,18 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}

// Registration table for the package's .Call routines. Each row gives the
// symbol name, the address of the matching wrapper, and the number of SEXP
// arguments that wrapper takes. The all-null row terminates the table.
static const R_CallMethodDef CallEntries[] = {
    {"_LSHR_hashfun_1",          reinterpret_cast<DL_FUNC>(&_LSHR_hashfun_1),          1},
    {"_LSHR_hashfun_2",          reinterpret_cast<DL_FUNC>(&_LSHR_hashfun_2),          1},
    {"_LSHR_get_minhash_matrix", reinterpret_cast<DL_FUNC>(&_LSHR_get_minhash_matrix), 3},
    {"_LSHR_sign_bit",           reinterpret_cast<DL_FUNC>(&_LSHR_sign_bit),           1},
    {"_LSHR_hash_signatures",    reinterpret_cast<DL_FUNC>(&_LSHR_hash_signatures),    3},
    {"_LSHR_project_spmat",      reinterpret_cast<DL_FUNC>(&_LSHR_project_spmat),      4},
    {nullptr, nullptr, 0}
};

// Initialisation entry point for the LSHR shared library (named following R's
// R_init_<package> convention). Registers the routines listed in CallEntries
// and then switches off dynamic symbol lookup for this DLL.
RcppExport void R_init_LSHR(DllInfo *dll) {
    // Only the .Call table is supplied; the other three routine tables are null.
    R_registerRoutines(dll, nullptr, CallEntries, nullptr, nullptr);
    R_useDynamicSymbols(dll, FALSE);
}
14 changes: 8 additions & 6 deletions src/random_projection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ int omp_thread_count() {
}

// [[Rcpp::export]]
IntegerVector project_spmat(const S4 &m, int n, int hash_fun_id_offest, int n_threads = 0) {
SEXP project_spmat(const S4 &m, int n, int hash_fun_id_offest, int n_threads = 0) {
int num_threads = n_threads;
if(num_threads == 0) num_threads = omp_thread_count();
IntegerVector dims = m.slot("Dim");
Expand All @@ -42,8 +42,9 @@ IntegerVector project_spmat(const S4 &m, int n, int hash_fun_id_offest, int n_th
NumericVector XX = m.slot("x");
double *X = XX.begin();

IntegerVector res(N);
int *res_ptr = res.begin();
// IntegerVector res(N);
NumericVector res(N);
unsigned long long *res_ptr = (unsigned long long*)dataptr(res);

#ifdef _OPENMP
#pragma omp parallel for num_threads(num_threads)
Expand All @@ -52,7 +53,7 @@ IntegerVector project_spmat(const S4 &m, int n, int hash_fun_id_offest, int n_th
uint32_t h1, h2;
int p1 = P[i];
int p2 = P[i + 1];
vector<float> row(32);
vector<float> row(64);
float x;
for(int k = p1; k < p2; k++) {
int j = J[k];
Expand All @@ -67,15 +68,16 @@ IntegerVector project_spmat(const S4 &m, int n, int hash_fun_id_offest, int n_th
row[hh] += ((int)h * x);
}
}
std::bitset<32> bitrow;
std::bitset<64> bitrow;
for(int hh = 0; hh < n; hh++) {
if(row[hh] < 0)
bitrow[hh] = 0;
else
bitrow[hh] = 1;
}
res_ptr[i] = bitrow.to_ulong();
res_ptr[i] = bitrow.to_ullong();
}
res.attr("class") = "integer64";
return res;
}

0 comments on commit 19604d5

Please sign in to comment.