From a19c9854ddd07f530309e0e8deb28b02c1368671 Mon Sep 17 00:00:00 2001 From: James McLaughlin Date: Thu, 31 Oct 2024 16:29:56 +0000 Subject: [PATCH] remove blank ids, remove fasthash to fix building on mac --- 02_assign_ids/grebi_assign_ids/Cargo.toml | 1 - 02_assign_ids/grebi_assign_ids/src/main.rs | 21 ++- .../grebi_extract_identifiers/Cargo.toml | 1 - .../grebi_extract_identifiers/src/main.rs | 1 + .../grebi_identifiers2groups/Cargo.toml | 1 - .../grebi_superclasses2types/Cargo.toml | 1 - Cargo.lock | 139 +----------------- grebi_shared/src/lib.rs | 15 -- 8 files changed, 25 insertions(+), 155 deletions(-) diff --git a/02_assign_ids/grebi_assign_ids/Cargo.toml b/02_assign_ids/grebi_assign_ids/Cargo.toml index af59944..4118d17 100644 --- a/02_assign_ids/grebi_assign_ids/Cargo.toml +++ b/02_assign_ids/grebi_assign_ids/Cargo.toml @@ -7,7 +7,6 @@ edition = "2021" serde_json = { version = "1.0.108", features=["preserve_order"] } grebi_shared = { path = "../../grebi_shared" } csv = "1.3.0" -fasthash = "0.4.0" lmdb-zero = "0.4.4" bloomfilter = "1.0.13" jemallocator = "0.5.4" diff --git a/02_assign_ids/grebi_assign_ids/src/main.rs b/02_assign_ids/grebi_assign_ids/src/main.rs index efa1afd..88d0854 100644 --- a/02_assign_ids/grebi_assign_ids/src/main.rs +++ b/02_assign_ids/grebi_assign_ids/src/main.rs @@ -12,8 +12,6 @@ use clap::Parser; use grebi_shared::find_strings; use grebi_shared::load_groups_txt::load_id_to_group_mapping; -use grebi_shared::check_id; - #[derive(clap::Parser, Debug)] #[command(author, version, about, long_about = None)] @@ -200,7 +198,10 @@ fn get_ids<'a, 'b>(json:&mut JsonParser<'a>, ids:&'b mut BTreeSet<&'a [u8]>) { json.end_array(); } else if json.peek().kind == JsonTokenType::StartString { let id = json.string(); - ids.insert(id.clone()); + if check_id(&id) { + ids.insert(id.clone()); + } + } else if json.peek().kind == JsonTokenType::StartObject { // maybe a reification json.begin_object(); @@ -218,3 +219,17 @@ fn get_ids<'a, 'b>(json:&mut JsonParser<'a>, ids:&'b mut BTreeSet<&'a [u8]>) { } } + +// Duplicated in grebi_extract_identifiers +fn check_id(id:&[u8]) -> bool { + if id.len() >= 16 { + // long numeric ID is prob a UUID and fine + return true; + } + for c in id { + if !c.is_ascii_digit() { + return true; + } + } + return false; +} diff --git a/02_assign_ids/grebi_extract_identifiers/Cargo.toml b/02_assign_ids/grebi_extract_identifiers/Cargo.toml index 7e33761..f26b4d6 100644 --- a/02_assign_ids/grebi_extract_identifiers/Cargo.toml +++ b/02_assign_ids/grebi_extract_identifiers/Cargo.toml @@ -8,7 +8,6 @@ clap = { version = "4.4.11", features = ["derive"] } serde_json = { version = "1.0.108", features=["preserve_order"] } grebi_shared = { path = "../../grebi_shared" } csv = "1.3.0" -fasthash = "0.4.0" lmdb-zero = "0.4.4" jemallocator = "0.5.4" diff --git a/02_assign_ids/grebi_extract_identifiers/src/main.rs b/02_assign_ids/grebi_extract_identifiers/src/main.rs index c0a3a80..b133b25 100644 --- a/02_assign_ids/grebi_extract_identifiers/src/main.rs +++ b/02_assign_ids/grebi_extract_identifiers/src/main.rs @@ -129,6 +129,7 @@ fn write_ids(k:&[u8], json:&mut JsonParser, writer:&mut BufWriter bool { if id.len() >= 16 { // long numeric ID is prob a UUID and fine diff --git a/02_assign_ids/grebi_identifiers2groups/Cargo.toml b/02_assign_ids/grebi_identifiers2groups/Cargo.toml index fdef46d..a72066c 100644 --- a/02_assign_ids/grebi_identifiers2groups/Cargo.toml +++ b/02_assign_ids/grebi_identifiers2groups/Cargo.toml @@ -7,7 +7,6 @@ edition = "2021" serde_json = { version = "1.0.108", features=["preserve_order"] } grebi_shared = { path = "../../grebi_shared" } csv = "1.3.0" -fasthash = "0.4.0" lmdb-zero = "0.4.4" bloomfilter = "1.0.13" jemallocator = "0.5.4" diff --git a/02_assign_ids/grebi_superclasses2types/Cargo.toml b/02_assign_ids/grebi_superclasses2types/Cargo.toml index 1f8dade..fd8d49c 100644 --- a/02_assign_ids/grebi_superclasses2types/Cargo.toml +++ b/02_assign_ids/grebi_superclasses2types/Cargo.toml @@ -7,7 +7,6 @@ edition = "2021" serde_json = { version = "1.0.108", features=["preserve_order"] } grebi_shared = { path = "../../grebi_shared" } csv = "1.3.0" -fasthash = "0.4.0" lmdb-zero = "0.4.4" bloomfilter = "1.0.13" jemallocator = "0.5.4" diff --git a/Cargo.lock b/Cargo.lock index ada220c..b86eb5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,7 +24,7 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "once_cell", "version_check", "zerocopy", @@ -93,12 +93,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "autocfg" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" - [[package]] name = "bindgen" version = "0.69.4" @@ -200,12 +194,6 @@ dependencies = [ "nom", ] -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - [[package]] name = "cfg-if" version = "1.0.0" @@ -293,7 +281,7 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -380,28 +368,6 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" -[[package]] -name = "fasthash" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "032213946b4eaae09117ec63f020322b78ca7a31d8aa2cf64df3032e1579690f" -dependencies = [ - "cfg-if 0.1.10", - "fasthash-sys", - "num-traits", - "seahash", - "xoroshiro128", -] - -[[package]] -name = "fasthash-sys" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6de941abfe2e715cdd34009d90546f850597eb69ca628ddfbf616e53dda28f8" -dependencies = [ - "gcc", -] - [[package]] name = "flate2" version = "1.0.28" @@ -422,12 +388,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "fuchsia-cprng" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" - [[package]] name = "fxhash" version = "0.2.1" @@ -459,7 +419,7 @@ version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "js-sys", "libc", "wasi", @@ -479,7 +439,6 @@ dependencies = [ "bloomfilter", "clap", "csv", - "fasthash", "grebi_shared", "jemallocator", "lmdb-zero", @@ -492,7 +451,6 @@ version = "0.1.0" dependencies = [ "clap", "csv", - "fasthash", "grebi_shared", "jemallocator", "lmdb-zero", @@ -506,7 +464,6 @@ dependencies = [ "bloomfilter", "clap", "csv", - "fasthash", "fxhash", "grebi_shared", "hashbrown", @@ -705,7 +662,6 @@ dependencies = [ "bloomfilter", "clap", "csv", - "fasthash", "grebi_shared", "jemallocator", "lmdb-zero", @@ -873,7 +829,7 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "windows-targets", ] @@ -988,15 +944,6 @@ dependencies = [ "minimal-lexical", ] -[[package]] -name = "num-traits" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" -dependencies = [ - "autocfg", -] - [[package]] name = "once_cell" version = "1.19.0" @@ -1078,43 +1025,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "rand" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" -dependencies = [ - "fuchsia-cprng", - "libc", - "rand_core 0.3.1", - "rdrand", - "winapi", -] - -[[package]] -name = "rand_core" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" -dependencies = [ - "rand_core 0.4.2", -] - -[[package]] -name = "rand_core" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" - -[[package]] -name = "rdrand" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" -dependencies = [ - "rand_core 0.3.1", -] - [[package]] name = "regex" version = "1.10.2" @@ -1220,12 +1130,6 @@ version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" -[[package]] -name = "seahash" -version = "3.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58f57ca1d128a43733fd71d583e837b1f22239a37ebea09cde11d8d9a9080f47" - [[package]] name = "serde" version = "1.0.197" @@ -1277,7 +1181,7 @@ version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "cpufeatures", "digest", ] @@ -1603,7 +1507,7 @@ version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ed0d4f68a3015cc185aff4db9506a015f4b96f95303897bfa23f846db54064e" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "wasm-bindgen-macro", ] @@ -1657,28 +1561,6 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "323f4da9523e9a669e1eaf9c6e763892769b1d38c623913647bfdc1532fe4549" -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows-sys" version = "0.52.0" @@ -1745,15 +1627,6 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" -[[package]] -name = "xoroshiro128" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0eeda34baec49c4f1eb2c04d59b761582fd6330010f9330ca696ca1a355dfcd" -dependencies = [ - "rand", -] - [[package]] name = "zerocopy" version = "0.7.32" diff --git a/grebi_shared/src/lib.rs b/grebi_shared/src/lib.rs index f56d604..289a60b 100644 --- a/grebi_shared/src/lib.rs +++ b/grebi_shared/src/lib.rs @@ -7,21 +7,6 @@ pub mod slice_materialised_edge; pub mod load_metadata_mapping_table; pub mod load_groups_txt; -pub fn check_id(k:&[u8], id:&[u8]) -> bool { - if id.len() >= 16 { - // long numeric ID is prob a UUID and fine - return true; - } - for c in id { - if !c.is_ascii_digit() { - return true; - } - } - // also triggers for blank IDs - eprintln!("Found unprefixed numeric ID {} for identifier property {}. Unqualified numbers like this as identifiers are ambiguous and may cause incorrect equivalences.", String::from_utf8_lossy(id), String::from_utf8_lossy(k)); - return false; -} - // get the id without parsing json pub fn get_id<'a>(json:&'a [u8])->&'a [u8] {