Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: use docm snapshot data #533

Merged
merged 1 commit into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 0 additions & 29 deletions server/lib/genome/importers/api_importers/docm/api_client.rb

This file was deleted.

97 changes: 0 additions & 97 deletions server/lib/genome/importers/api_importers/docm/importer.rb

This file was deleted.

95 changes: 95 additions & 0 deletions server/lib/genome/importers/file_importers/docm.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
module Genome
  module Importers
    module FileImporters
      module Docm
        # Builds DoCM (Database of Curated Mutations) claims from a directory of
        # snapshot CSV files.
        #
        # Files read, relative to the snapshot directory. Every file is parsed
        # with a header row and columns are accessed by position:
        #
        #   drug_claim.csv                      0: drug claim name
        #   gene_claim.csv                      0: gene claim name
        #   interaction_claim.csv               0: drug claim name, 1: gene claim name
        #   interaction_claim_attributes.csv    0: attribute name, 1: attribute value,
        #                                       2: drug claim name, 3: gene claim name
        #   interaction_claim_publications.csv  0: publication identifier (PMID in the
        #                                       docm.sql export), 2: drug claim name,
        #                                       3: gene claim name
        #
        # The column order mirrors the SELECT lists of the export queries kept
        # alongside this importer in docm.sql.
        class Importer < Genome::Importers::Base
          # Snapshot directory used when the caller does not supply one.
          DEFAULT_TSV_ROOT = 'lib/data/docm/'.freeze

          # NOTE(review): @file_path is never assigned in this class, so this
          # reader returns nil unless a superclass sets it. Kept for interface
          # compatibility.
          attr_reader :file_path

          # @param tsv_root_path [String, nil] directory holding the snapshot CSVs;
          #   nil selects DEFAULT_TSV_ROOT. A trailing slash is optional.
          def initialize(tsv_root_path = nil)
            @tsv_root = tsv_root_path.nil? ? DEFAULT_TSV_ROOT : tsv_root_path
            @source_db_name = 'DoCM'
            # Lookup tables filled while importing:
            #   @drug_claims / @gene_claims  claim name => claim object
            #   @interaction_claims          [gene claim, drug claim] => interaction claim
            @drug_claims = {}
            @gene_claims = {}
            @interaction_claims = {}
          end

          # Creates drug claims, gene claims and then interaction claims.
          # Interaction claims are resolved through the drug/gene lookup tables,
          # so they must be created last.
          def create_claims
            create_drug_claims
            create_gene_claims
            create_interaction_claims
          end

          private

          # Creates (once per importer instance) the Source record describing
          # DoCM, then attaches the 'interaction' source type and saves it.
          def create_new_source
            @source ||= Source.create(
              {
                base_url: 'http://docm.info/',
                site_url: 'http://docm.info/',
                citation: 'Ainscough BJ, Griffith M, Coffman AC, Wagner AH, Kunisaki J, Choudhary MN, McMichael JF, Fulton RS, Wilson RK, Griffith OL, Mardis ER. DoCM: a database of curated mutations in cancer. Nat Methods. 2016 Sep 29;13(10):806-7. doi: 10.1038/nmeth.4000. PMID: 27684579; PMCID: PMC5317181.',
                citation_short: 'Ainscough BJ, et al. DoCM: a database of curated mutations in cancer. Nat Methods. 2016 Sep 29;13(10):806-7.',
                pmid: '27684579',
                pmcid: 'PMC5317181',
                doi: '10.1038/nmeth.4000',
                source_db_version: '2024-10-02',
                source_trust_level_id: SourceTrustLevel.EXPERT_CURATED,
                # Shorthand for `source_db_name: source_db_name`; the reader is
                # not defined in this class (presumably inherited from Base).
                source_db_name:,
                full_name: 'Database of Curated Mutations',
                license: License::CC_BY_4_0,
                license_link: 'https://github.com/griffithlab/docm/blob/c8d2a8723f505689074d07841931475b9b7e914c/app/views/static/about.html.haml#L86'
              }
            )
            @source.source_types << SourceType.find_by(type: 'interaction')
            @source.save
          end

          # Creates one drug claim per row of drug_claim.csv and indexes it by name.
          def create_drug_claims
            each_snapshot_row('drug_claim.csv') do |row|
              @drug_claims[row[0]] = create_drug_claim(row[0])
            end
          end

          # Creates one gene claim per row of gene_claim.csv (NCBI name
          # nomenclature) and indexes it by name.
          def create_gene_claims
            each_snapshot_row('gene_claim.csv') do |row|
              @gene_claims[row[0]] = create_gene_claim(row[0], GeneNomenclature::NCBI_NAME)
            end
          end

          # Creates the interaction claims, then their attributes and
          # publications. Rows that reference an unknown drug, gene or
          # interaction are skipped.
          def create_interaction_claims
            load_interaction_claims
            load_interaction_claim_attributes
            load_interaction_claim_publications
          end

          # Creates an interaction claim for every drug/gene pair listed in
          # interaction_claim.csv whose drug and gene claims are both known.
          def load_interaction_claims
            each_snapshot_row('interaction_claim.csv') do |row|
              gene_claim = @gene_claims[row[1]]
              drug_claim = @drug_claims[row[0]]
              next if gene_claim.nil? || drug_claim.nil?

              @interaction_claims[[gene_claim, drug_claim]] = create_interaction_claim(gene_claim, drug_claim)
            end
          end

          # Attaches the attributes in interaction_claim_attributes.csv to their
          # interaction claims.
          def load_interaction_claim_attributes
            each_snapshot_row('interaction_claim_attributes.csv') do |row|
              interaction_claim = interaction_claim_for(row[2], row[3])
              # Skip rows with no matching interaction claim instead of passing
              # nil to the helper.
              next if interaction_claim.nil?

              create_interaction_claim_attribute(interaction_claim, row[0], row[1])
            end
          end

          # Attaches the publications in interaction_claim_publications.csv to
          # their interaction claims.
          def load_interaction_claim_publications
            each_snapshot_row('interaction_claim_publications.csv') do |row|
              interaction_claim = interaction_claim_for(row[2], row[3])
              # Skip rows with no matching interaction claim instead of passing
              # nil to the helper.
              next if interaction_claim.nil?

              create_interaction_claim_publication(interaction_claim, row[0])
            end
          end

          # Resolves the interaction claim created for the given drug and gene
          # claim names; returns nil when the drug, gene or pairing is unknown.
          def interaction_claim_for(drug_name, gene_name)
            gene_claim = @gene_claims[gene_name]
            drug_claim = @drug_claims[drug_name]
            return if gene_claim.nil? || drug_claim.nil?

            @interaction_claims[[gene_claim, drug_claim]]
          end

          # Yields each data row (header row excluded) of a snapshot CSV file.
          def each_snapshot_row(filename, &block)
            CSV.foreach(snapshot_path(filename), headers: true, col_sep: ',', &block)
          end

          # Builds the path of a snapshot file. Works whether or not the
          # configured root ends with a slash; an empty root keeps the path
          # relative to the working directory, as plain concatenation would.
          def snapshot_path(filename)
            root = @tsv_root.to_s
            return filename if root.empty?

            File.join(root, filename)
          end
        end
      end
    end
  end
end
41 changes: 41 additions & 0 deletions server/lib/genome/importers/file_importers/docm.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
-- Extracting this data directly from the final DoCM dump is tricky;
-- instead, these queries produce CSV dumps from the last DoCM extraction stored in DGIdb.

-- gene claim
-- Name and nomenclature of every gene claim attached to the DoCM source.
-- The filter column is qualified with the sources alias so it cannot become
-- ambiguous and matches the other queries in this file.
SELECT gc.name, gc.nomenclature
FROM gene_claims gc
LEFT JOIN sources s ON gc.source_id = s.id
WHERE s.source_db_name = 'DoCM';

-- drug claim
-- Name and nomenclature of every drug claim attached to the DoCM source.
-- The WHERE clause already discards rows without a source, so an inner join
-- returns the same rows as the outer join would.
SELECT
    dc.name,
    dc.nomenclature
FROM drug_claims AS dc
INNER JOIN sources AS s
    ON s.id = dc.source_id
WHERE s.source_db_name = 'DoCM';

-- interaction claim
-- One row per DoCM interaction claim: drug claim name, then gene claim name.
-- Drug and gene claims stay outer-joined, so a claim with a missing drug or
-- gene still appears with a NULL name.
SELECT
    dc.name,
    gc.name
FROM interaction_claims AS ic
INNER JOIN sources AS s
    ON s.id = ic.source_id
LEFT JOIN drug_claims AS dc
    ON dc.id = ic.drug_claim_id
LEFT JOIN gene_claims AS gc
    ON gc.id = ic.gene_claim_id
WHERE s.source_db_name = 'DoCM';

-- interaction claim attributes
-- One row per attribute of a DoCM interaction claim: attribute name, attribute
-- value, drug claim name, gene claim name.
-- Starting from the attributes table with inner joins returns the same rows as
-- a right join, because the source filter removes attributes that have no
-- matching DoCM interaction claim.
SELECT
    ica.name,
    ica.value,
    dc.name,
    gc.name
FROM interaction_claim_attributes AS ica
INNER JOIN interaction_claims AS ic
    ON ic.id = ica.interaction_claim_id
INNER JOIN sources AS s
    ON s.id = ic.source_id
LEFT JOIN drug_claims AS dc
    ON dc.id = ic.drug_claim_id
LEFT JOIN gene_claims AS gc
    ON gc.id = ic.gene_claim_id
WHERE s.source_db_name = 'DoCM';

-- interaction claim publications
-- One row per publication linked to a DoCM interaction claim: PMID, citation,
-- drug claim name, gene claim name.
-- Starting from the link table with inner joins returns the same rows as a
-- right join, because the source filter removes links that have no matching
-- DoCM interaction claim. Publications stay outer-joined.
SELECT
    p.pmid,
    p.citation,
    dc.name,
    gc.name
FROM interaction_claims_publications AS icp
INNER JOIN interaction_claims AS ic
    ON ic.id = icp.interaction_claim_id
INNER JOIN sources AS s
    ON s.id = ic.source_id
LEFT JOIN drug_claims AS dc
    ON dc.id = ic.drug_claim_id
LEFT JOIN gene_claims AS gc
    ON gc.id = ic.gene_claim_id
LEFT JOIN publications AS p
    ON p.id = icp.publication_id
WHERE s.source_db_name = 'DoCM';