Skip to content

Commit

Permalink
add chembl
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesamcl committed Jul 31, 2024
1 parent f8e5f08 commit 4a927ca
Show file tree
Hide file tree
Showing 15 changed files with 236 additions and 77 deletions.
39 changes: 0 additions & 39 deletions 00_fetch_data/chembl/export.py

This file was deleted.

4 changes: 2 additions & 2 deletions 01_ingest/grebi_ingest_gwas/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ fn main() {
.trim(csv::Trim::All)
.from_reader(reader);

if args.filename.starts_with("gwas-catalog-associations") {
if args.filename.contains("gwas-catalog-associations") {
eprintln!("GWAS ingest: writing associations");
write_associations(&mut csv_reader, &mut output_nodes, &args.datasource_name);
} else if args.filename.starts_with("gwas-catalog-studies") {
} else if args.filename.contains("gwas-catalog-studies") {
eprintln!("GWAS ingest: writing studies");
write_studies(&mut csv_reader, &mut output_nodes, &args.datasource_name);
} else {
Expand Down
13 changes: 13 additions & 0 deletions 01_ingest/grebi_ingest_sqlite/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[package]
name = "grebi_ingest_sqlite"
version = "0.1.0"
edition = "2021"

[dependencies]
Inflector = "0.11.4"
clap = { version = "4.4.11", features = ["derive"] }
hex = "0.4.3"
rusqlite = "0.31.0"
serde_json = { version = "1.0.108", features=["preserve_order"] }
jemallocator = "0.5.4"

152 changes: 152 additions & 0 deletions 01_ingest/grebi_ingest_sqlite/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
use rusqlite::{params, Connection, Result};
use serde_json::{json, Value};
use inflector::Inflector;
use std::collections::HashMap;
use clap::Parser;

// Register jemalloc (from the `jemallocator` crate) as this binary's global
// allocator, replacing the default system allocator for every heap allocation.
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

// Command-line arguments for the SQLite ingest, parsed by clap's derive API.
// Plain `//` comments are used on purpose: `///` doc comments on a clap
// struct or its fields are picked up as help text.
#[derive(clap::Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {

// Name of the datasource; `main` lowercases it and uses it as the prefix
// for every emitted type, property name and identifier.
#[arg(long)]
datasource_name: String,

// Path of the SQLite database file; `main` passes it to `Connection::open`.
#[arg(long)]
filename: String,

}

/// Dumps every user table of a SQLite database to stdout as newline-delimited JSON.
///
/// For each row one JSON object is printed containing:
/// * one `"<prefix>:<column>"` property per non-NULL column, where `<prefix>` is the
///   lowercased `--datasource-name`; foreign-key columns are rewritten to the
///   identifier of the referenced row (`"<prefix>:<singular target table>:<value>"`),
/// * `"grebi:type"` set to `"<prefix>:<singular table name>"`,
/// * `"id"`, the list of identifiers collected for the row.
///
/// Progress information is written to stderr.
///
/// # Errors
/// Returns the first `rusqlite` error hit while opening the database, inspecting the
/// schema, or reading rows.
fn main() -> Result<()> {
    let args = Args::parse();
    let conn = Connection::open(&args.filename)?;
    let prefix = args.datasource_name.to_lowercase();

    let schema_info = get_schema_info(&conn)?;
    let foreign_keys = get_foreign_keys(&conn)?;

    for (table, columns) in &schema_info {
        // SQLite's own bookkeeping tables are not part of the dataset.
        if table.starts_with("sqlite_") {
            continue;
        }

        // Computed once per table instead of once per cell.
        let singular = table.to_singular();
        let grebi_type = format!("{}:{}", prefix, singular);

        eprintln!("--- Reading table: {} => {}", table, grebi_type);

        // Propagate schema errors instead of panicking: `main` already returns `Result`.
        let primary_keys = get_primary_keys(&conn, table)?;
        eprintln!("\tcolumns: {:?}", columns);
        eprintln!("\tprimary keys: {:?}", primary_keys);

        // Per-column metadata resolved up front so the row loop does no lookups:
        // (output property name, is primary key, singular name of the FK target table).
        let column_plan: Vec<(String, bool, Option<String>)> = columns
            .iter()
            .map(|column| {
                let fk_target = foreign_keys
                    .get(&(table.clone(), column.clone()))
                    .map(|(to_table, _)| to_table.to_singular());
                (
                    format!("{}:{}", prefix, column),
                    primary_keys.contains(column),
                    fk_target,
                )
            })
            .collect();

        // Quote the identifier (doubling embedded quotes) so table names containing
        // spaces, punctuation or SQL keywords do not break the statement.
        let quoted_table = format!("\"{}\"", table.replace('"', "\"\""));
        let mut stmt = conn.prepare(&format!("SELECT * FROM {}", quoted_table))?;
        let mut rows = stmt.query([])?;

        while let Some(row) = rows.next()? {
            let mut json_obj = json!({});
            let mut ids = Vec::new();

            for (idx, (col_name, is_pk, fk_target)) in column_plan.iter().enumerate() {
                // Every SQLite storage class is rendered as a string; NULLs are omitted.
                let v = match row.get::<_, rusqlite::types::Value>(idx)? {
                    rusqlite::types::Value::Null => continue,
                    rusqlite::types::Value::Integer(i) => i.to_string(),
                    rusqlite::types::Value::Real(r) => r.to_string(),
                    rusqlite::types::Value::Text(t) => t,
                    rusqlite::types::Value::Blob(b) => hex::encode(b),
                };

                if *is_pk {
                    ids.push(format!("{}:{}:{}", prefix, singular, v));
                }

                match fk_target {
                    Some(target) => {
                        // Same `<prefix>:<table>:<value>` layout as the primary-key
                        // identifier above; previously the copy pushed to `ids` had the
                        // prefix and table swapped, so it never matched the reference
                        // stored in the property.
                        // NOTE(review): adding the referenced row's identifier to this
                        // row's own `id` list aliases the two rows — confirm that this is
                        // intended for every foreign key and not only for 1:1 tables.
                        let target_id = format!("{}:{}:{}", prefix, target, v);
                        json_obj[col_name.as_str()] = json!(target_id);
                        ids.push(target_id);
                    }
                    None => {
                        json_obj[col_name.as_str()] = json!(v);
                    }
                }
            }

            json_obj["grebi:type"] = json!(grebi_type);
            json_obj["id"] = json!(ids);

            println!(
                "{}",
                serde_json::to_string(&json_obj)
                    .expect("a serde_json::Value with string keys always serializes")
            );
        }
    }
    Ok(())
}

/// Lists every table recorded in `sqlite_master` together with its column names.
///
/// Returns a map from table name to that table's column names, in the order
/// reported by `PRAGMA table_info`.
///
/// # Errors
/// Propagates any `rusqlite` error raised while preparing or running the queries.
fn get_schema_info(conn: &Connection) -> Result<HashMap<String, Vec<String>>> {
    let mut schema_info = HashMap::new();
    let mut stmt = conn.prepare("SELECT name FROM sqlite_master WHERE type='table'")?;
    let tables = stmt.query_map(params![], |row| row.get::<_, String>(0))?;

    for table in tables {
        let table = table?;

        // Quote the identifier (doubling embedded quotes) so names containing spaces,
        // punctuation or SQL keywords are still accepted by the PRAGMA.
        let quoted_table = format!("\"{}\"", table.replace('"', "\"\""));
        let mut col_stmt = conn.prepare(&format!("PRAGMA table_info({})", quoted_table))?;

        // Column 1 of `table_info` is the column name; stop at the first error.
        let columns = col_stmt
            .query_map(params![], |row| row.get::<_, String>(1))?
            .collect::<Result<Vec<String>>>()?;

        schema_info.insert(table, columns);
    }
    Ok(schema_info)
}

/// Returns the names of the columns that make up `table`'s primary key.
///
/// Columns are returned in the order `PRAGMA table_info` lists them. The result is
/// empty when the table declares no primary key.
///
/// # Errors
/// Propagates any `rusqlite` error raised while preparing or running the PRAGMA.
fn get_primary_keys(conn: &Connection, table: &str) -> Result<Vec<String>> {
    // Quote the identifier (doubling embedded quotes) so names containing spaces,
    // punctuation or SQL keywords are still accepted by the PRAGMA.
    let quoted_table = format!("\"{}\"", table.replace('"', "\"\""));
    let mut stmt = conn.prepare(&format!("PRAGMA table_info({})", quoted_table))?;

    let col_info = stmt.query_map(params![], |row| {
        let name: String = row.get(1)?;
        // Column 5 (`pk`) is 0 for non-key columns and the 1-based position within
        // the primary key otherwise, so it is read as an integer rather than a bool.
        let pk_position: i64 = row.get(5)?;
        Ok((name, pk_position))
    })?;

    let mut primary_keys = Vec::new();
    for col in col_info {
        let (name, pk_position) = col?;
        if pk_position != 0 {
            primary_keys.push(name);
        }
    }
    Ok(primary_keys)
}

/// Collects every foreign-key constraint declared in the database.
///
/// Returns a map from `(table, column)` to `(referenced table, referenced column)`.
/// The referenced column is an empty string when the constraint does not name one
/// (`REFERENCES parent` with no column list, i.e. the parent's primary key).
///
/// # Errors
/// Propagates any `rusqlite` error raised while preparing or running the queries.
fn get_foreign_keys(conn: &Connection) -> Result<HashMap<(String, String), (String, String)>> {
    let mut foreign_keys = HashMap::new();
    let mut stmt = conn.prepare("SELECT name FROM sqlite_master WHERE type='table'")?;
    let tables = stmt.query_map(params![], |row| row.get::<_, String>(0))?;

    for table in tables {
        let table = table?;

        // Quote the identifier (doubling embedded quotes) so names containing spaces,
        // punctuation or SQL keywords are still accepted by the PRAGMA.
        let quoted_table = format!("\"{}\"", table.replace('"', "\"\""));
        let mut fk_stmt = conn.prepare(&format!("PRAGMA foreign_key_list({})", quoted_table))?;

        // `foreign_key_list` columns: 2 = referenced table, 3 = local column,
        // 4 = referenced column.
        let fk_info = fk_stmt.query_map(params![], |row| {
            let from: String = row.get(3)?;
            let to_table: String = row.get(2)?;
            // NULL when the constraint references the parent's primary key implicitly;
            // reading it as a plain `String` would abort the whole ingest.
            let to: Option<String> = row.get(4)?;
            Ok((from, to_table, to.unwrap_or_default()))
        })?;

        for fk in fk_info {
            let (from, to_table, to) = fk?;
            foreign_keys.insert((table.clone(), from), (to_table, to));
        }
    }
    Ok(foreign_keys)
}

1 change: 0 additions & 1 deletion 02_assign_ids/grebi_assign_ids/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ serde_json = { version = "1.0.108", features=["preserve_order"] }
grebi_shared = { path = "../../grebi_shared" }
csv = "1.3.0"
fasthash = "0.4.0"
rusqlite = "0.30.0"
lmdb-zero = "0.4.4"
bloomfilter = "1.0.13"
jemallocator = "0.5.4"
Expand Down
1 change: 0 additions & 1 deletion 02_assign_ids/grebi_identifiers2groups/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ serde_json = { version = "1.0.108", features=["preserve_order"] }
grebi_shared = { path = "../../grebi_shared" }
csv = "1.3.0"
fasthash = "0.4.0"
rusqlite = "0.30.0"
lmdb-zero = "0.4.4"
bloomfilter = "1.0.13"
clap = { version = "4.4.11", features = ["derive"] }
Expand Down
1 change: 0 additions & 1 deletion 02_assign_ids/grebi_identifiers2groups/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
use std::collections::{HashSet, HashMap, BTreeMap};
use std::{env, io};
use csv;
use rusqlite::Connection;
use bloomfilter::Bloom;
use clap::Parser;
use std::io::{BufRead, BufReader };
Expand Down
51 changes: 32 additions & 19 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ members = [
"01_ingest/grebi_ingest_json",
"01_ingest/grebi_ingest_reactome",
"01_ingest/grebi_ingest_kgx_edges",
"01_ingest/grebi_ingest_sqlite",
"01_ingest/grebi_normalise_prefixes",
"02_assign_ids/grebi_extract_identifiers",
"02_assign_ids/grebi_identifiers2groups",
Expand Down
13 changes: 13 additions & 0 deletions configs/datasource_configs/chembl.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"name": "ChEMBL",
"enabled": true,
"ingests": [
{
"ingest_files": ["./00_fetch_data/chembl/chembl_34/chembl_34_sqlite/chembl_34.db"],
"ingest_script": "./target/release/grebi_ingest_sqlite",
"stdin": false,
"ingest_args": [
]
}
]
}
Loading

0 comments on commit 4a927ca

Please sign in to comment.