diff --git a/server/lib/genome/importers/api_importers/docm/api_client.rb b/server/lib/genome/importers/api_importers/docm/api_client.rb
deleted file mode 100644
index 8b765b8f..00000000
--- a/server/lib/genome/importers/api_importers/docm/api_client.rb
+++ /dev/null
@@ -1,29 +0,0 @@
-module Genome; module Importers; module ApiImporters; module Docm
-  class ApiClient
-    def variants
-      variants = JSON.parse(get_page(variant_url))
-    end
-
-    def variant_url
-      'http://docm.info/api/v1/variants.json'
-    end
-
-    def get_page(url)
-      uri = URI(url)
-
-      uri.query = URI.encode_www_form(docm_params)
-
-      req = Net::HTTP::Get.new(uri)
-      resp = Net::HTTP.start(uri.host, uri.port) { |http| http.read_timeout = 1000; http.request(req)}
-      if resp.code != '200'
-        raise StandardError.new('Failed HTTP request')
-      end
-
-      resp.body
-    end
-
-    def docm_params
-      {'detailed_view' => true}
-    end
-  end
-end; end; end; end
diff --git a/server/lib/genome/importers/api_importers/docm/importer.rb b/server/lib/genome/importers/api_importers/docm/importer.rb
deleted file mode 100644
index f2412e67..00000000
--- a/server/lib/genome/importers/api_importers/docm/importer.rb
+++ /dev/null
@@ -1,97 +0,0 @@
-module Genome; module Importers; module ApiImporters; module Docm
-  class Importer < Genome::Importers::Base
-    attr_reader :new_version
-
-    def initialize
-      @source_db_name = 'DoCM'
-    end
-
-    def create_claims
-      create_interaction_claims
-    end
-
-    private
-
-    def create_interaction_claims
-      api_client = ApiClient.new
-      api_client.variants.each do |variant|
-        interaction_information = parse_interaction_information(variant)
-        interaction_information.each do |interaction_info|
-          gc = create_gene_claim(variant['gene'], GeneNomenclature::NCBI_NAME)
-          dc = create_drug_claim(interaction_info['Therapeutic Context'].upcase,
-                                 DrugNomenclature::PRIMARY_NAME)
-          ic = create_interaction_claim(gc, dc)
-          create_interaction_claim_attributes(ic, interaction_info)
-          create_interaction_claim_publications(ic, variant['diseases'])
-          create_interaction_claim_link(ic, 'DoCM Website', 'http://docm.info')
-        end
-      end
-      backfill_publication_information
-    end
-
-    def parse_interaction_information(variant)
-      return [] unless variant.include?('meta')
-
-      drug_data = variant['meta'].find { |table| table.include?('Drug Interaction Data') }
-      return [] unless drug_data.present?
-
-      fields = drug_data['Drug Interaction Data']['fields']
-      rows = drug_data['Drug Interaction Data']['rows']
-      Set.new.tap do |interaction_information|
-        rows.each do |row|
-          row_hash = {}
-          fields.zip(row).each do |name, value|
-            row_hash[name] = value
-          end
-          row_hash['Therapeutic Context'].split(/,|\+|plus/).each do |drug|
-            info = row_hash.dup
-            info['Therapeutic Context'] = drug
-            interaction_information.add(info) if valid_drug?(drug)
-          end
-        end
-      end
-    end
-
-    def valid_drug?(drug_name)
-      %w[inhib HER3 TKI anti BRAF radio BH3].none? { |name| drug_name.include?(name) }
-    end
-
-    def create_interaction_claim_attributes(interaction_claim, interaction_info)
-      {
-        InteractionAttributeName::APPROVAL_STATUS => 'Status',
-        InteractionAttributeName::PATHWAY => 'Pathway',
-        InteractionAttributeName::VARIANT_EFFECT => 'Effect'
-      }.each do |name, interaction_info_key|
-        create_interaction_claim_attribute(interaction_claim, name, interaction_info[interaction_info_key])
-      end
-    end
-
-    def create_interaction_claim_publications(interaction_claim, diseases)
-      diseases.each do |disease|
-        create_interaction_claim_publication(interaction_claim, disease['source_pubmed_id'])
-      end
-    end
-
-    def create_new_source
-      @source ||= Source.create(
-        {
-          base_url: 'http://docm.info/',
-          site_url: 'http://docm.info/',
-          citation: 'Ainscough BJ, Griffith M, Coffman AC, Wagner AH, Kunisaki J, Choudhary MN, McMichael JF, Fulton RS, Wilson RK, Griffith OL, Mardis ER. DoCM: a database of curated mutations in cancer. Nat Methods. 2016 Sep 29;13(10):806-7. doi: 10.1038/nmeth.4000. PMID: 27684579; PMCID: PMC5317181.',
-          citation_short: 'Ainscough BJ, et al. DoCM: a database of curated mutations in cancer. Nat Methods. 2016 Sep 29;13(10):806-7.',
-          pmid: '27684579',
-          pmcid: 'PMC5317181',
-          doi: '10.1038/nmeth.4000',
-          source_db_version: set_current_date_version,
-          source_trust_level_id: SourceTrustLevel.EXPERT_CURATED,
-          source_db_name: source_db_name,
-          full_name: 'Database of Curated Mutations',
-          license: License::CC_BY_4_0,
-          license_link: 'http://www.docm.info/about'
-        }
-      )
-      @source.source_types << SourceType.find_by(type: 'interaction')
-      @source.save
-    end
-  end
-end; end; end; end
diff --git a/server/lib/genome/importers/file_importers/docm.rb b/server/lib/genome/importers/file_importers/docm.rb
new file mode 100644
index 00000000..695d1912
--- /dev/null
+++ b/server/lib/genome/importers/file_importers/docm.rb
@@ -0,0 +1,109 @@
+module Genome
+  module Importers
+    module FileImporters
+      module Docm
+        # Loads DoCM claims from CSV dumps (drug_claim.csv, gene_claim.csv,
+        # interaction_claim.csv, interaction_claim_attributes.csv and
+        # interaction_claim_publications.csv) found under a root directory.
+        class Importer < Genome::Importers::Base
+          # Directory read when no root path is supplied.
+          DEFAULT_TSV_ROOT = 'lib/data/docm/'.freeze
+
+          # NOTE(review): nothing in this class assigns @file_path; the reader is
+          # kept only so existing callers keep working -- confirm it is needed.
+          attr_reader :file_path
+
+          # @param tsv_root_path [String, nil] directory holding the CSV dumps;
+          #   nil (or no argument) selects DEFAULT_TSV_ROOT
+          def initialize(tsv_root_path = nil)
+            @tsv_root = tsv_root_path || DEFAULT_TSV_ROOT
+            @source_db_name = 'DoCM'
+            @drug_claims = {}
+            @gene_claims = {}
+            @interaction_claims = {}
+          end
+
+          # Drug and gene claims are created first because interaction rows are
+          # resolved against the claims cached by those two steps.
+          def create_claims
+            create_drug_claims
+            create_gene_claims
+            create_interaction_claims
+          end
+
+          private
+
+          def create_new_source
+            @source ||= Source.create(
+              {
+                base_url: 'http://docm.info/',
+                site_url: 'http://docm.info/',
+                citation: 'Ainscough BJ, Griffith M, Coffman AC, Wagner AH, Kunisaki J, Choudhary MN, McMichael JF, Fulton RS, Wilson RK, Griffith OL, Mardis ER. DoCM: a database of curated mutations in cancer. Nat Methods. 2016 Sep 29;13(10):806-7. doi: 10.1038/nmeth.4000. PMID: 27684579; PMCID: PMC5317181.',
+                citation_short: 'Ainscough BJ, et al. DoCM: a database of curated mutations in cancer. Nat Methods. 2016 Sep 29;13(10):806-7.',
+                pmid: '27684579',
+                pmcid: 'PMC5317181',
+                doi: '10.1038/nmeth.4000',
+                source_db_version: '2024-10-02',
+                source_trust_level_id: SourceTrustLevel.EXPERT_CURATED,
+                source_db_name:,
+                full_name: 'Database of Curated Mutations',
+                license: License::CC_BY_4_0,
+                license_link: 'https://github.com/griffithlab/docm/blob/c8d2a8723f505689074d07841931475b9b7e914c/app/views/static/about.html.haml#L86'
+              }
+            )
+            @source.source_types << SourceType.find_by(type: 'interaction')
+            @source.save
+          end
+
+          # Yields every data row (header row excluded) of a CSV file under the
+          # root; File.join tolerates a root with or without a trailing slash.
+          def each_docm_row(file_name, &block)
+            CSV.foreach(File.join(@tsv_root, file_name), headers: true, col_sep: ',', &block)
+          end
+
+          # Returns the cached interaction claim for the drug and gene names at
+          # the given column positions, or nil when either claim or the
+          # interaction itself was never created.
+          def docm_interaction_claim(row, drug_index:, gene_index:)
+            gc = @gene_claims[row[gene_index]]
+            dc = @drug_claims[row[drug_index]]
+            @interaction_claims[[gc, dc]] unless gc.nil? || dc.nil?
+          end
+
+          def create_drug_claims
+            each_docm_row('drug_claim.csv') do |row|
+              @drug_claims[row[0]] = create_drug_claim(row[0])
+            end
+          end
+
+          def create_gene_claims
+            each_docm_row('gene_claim.csv') do |row|
+              @gene_claims[row[0]] = create_gene_claim(row[0], GeneNomenclature::NCBI_NAME)
+            end
+          end
+
+          # Column positions follow the SELECT lists in docm.sql: drug name
+          # before gene name, after any claim-specific columns.
+          def create_interaction_claims
+            each_docm_row('interaction_claim.csv') do |row|
+              gc = @gene_claims[row[1]]
+              dc = @drug_claims[row[0]]
+              next if gc.nil? || dc.nil?
+
+              @interaction_claims[[gc, dc]] = create_interaction_claim(gc, dc)
+            end
+            each_docm_row('interaction_claim_attributes.csv') do |row|
+              ic = docm_interaction_claim(row, drug_index: 2, gene_index: 3)
+              # Skip rows whose interaction was never created instead of passing nil on.
+              create_interaction_claim_attribute(ic, row[0], row[1]) unless ic.nil?
+            end
+            each_docm_row('interaction_claim_publications.csv') do |row|
+              ic = docm_interaction_claim(row, drug_index: 2, gene_index: 3)
+              create_interaction_claim_publication(ic, row[0]) unless ic.nil?
+            end
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/server/lib/genome/importers/file_importers/docm.sql b/server/lib/genome/importers/file_importers/docm.sql
new file mode 100644
index 00000000..e14077b9
--- /dev/null
+++ b/server/lib/genome/importers/file_importers/docm.sql
@@ -0,0 +1,41 @@
+-- It's a little tricky to grab data from the final DOCM dump;
+-- instead, these queries help us produce dumps from our last extraction in DGIdb
+
+-- gene claim
+SELECT gc.name, gc.nomenclature
+FROM gene_claims gc
+    LEFT JOIN sources s on gc.source_id = s.id
+WHERE s.source_db_name = 'DoCM';
+
+-- drug claim
+SELECT dc.name, dc.nomenclature
+FROM drug_claims dc
+    LEFT JOIN sources s on dc.source_id = s.id
+WHERE s.source_db_name = 'DoCM';
+
+-- interaction claim
+SELECT dc.name, gc.name
+FROM interaction_claims ic
+    LEFT JOIN sources s on ic.source_id = s.id
+    LEFT JOIN drug_claims dc on ic.drug_claim_id = dc.id
+    LEFT JOIN gene_claims gc on ic.gene_claim_id = gc.id
+WHERE s.source_db_name = 'DoCM';
+
+-- interaction claim attributes
+SELECT ica.name, ica.value, dc.name, gc.name
+FROM interaction_claims ic
+    LEFT JOIN sources s on ic.source_id = s.id
+    LEFT JOIN drug_claims dc on ic.drug_claim_id = dc.id
+    LEFT JOIN gene_claims gc on ic.gene_claim_id = gc.id
+    RIGHT JOIN interaction_claim_attributes ica on ic.id = ica.interaction_claim_id
+WHERE s.source_db_name = 'DoCM';
+
+-- interaction claim publications
+SELECT p.pmid, p.citation, dc.name, gc.name
+FROM interaction_claims ic
+    LEFT JOIN sources s on ic.source_id = s.id
+    LEFT JOIN drug_claims dc on ic.drug_claim_id = dc.id
+    LEFT JOIN gene_claims gc on ic.gene_claim_id = gc.id
+    RIGHT JOIN interaction_claims_publications icp ON icp.interaction_claim_id = ic.id
+    LEFT JOIN publications p ON p.id = icp.publication_id
+WHERE s.source_db_name = 'DoCM';