From 5387c3b901ae216b01be120441126838631b6d3f Mon Sep 17 00:00:00 2001 From: Susanna Kiwala Date: Thu, 16 May 2024 08:57:12 -0500 Subject: [PATCH] Update NCIt updater to use TSV file --- server/Gemfile | 3 + server/Gemfile.lock | 2 + server/app/jobs/update_nci_thesaurus.rb | 2 +- .../app/lib/importer/nci_thesaurus_mirror.rb | 56 ++++++------------- 4 files changed, 23 insertions(+), 40 deletions(-) diff --git a/server/Gemfile b/server/Gemfile index 046a1fa9f..63b23dc55 100644 --- a/server/Gemfile +++ b/server/Gemfile @@ -31,6 +31,9 @@ gem 'scenic', '~>1.5.4' #entrez symbol downloads gem 'net-ftp', '~>0.3.3' +#NCIt term download +gem 'rubyzip', '~>2.3.2' + #higher performance json encoding gem 'oj', '~> 3.16.3' diff --git a/server/Gemfile.lock b/server/Gemfile.lock index f248f4f38..3c04ab473 100644 --- a/server/Gemfile.lock +++ b/server/Gemfile.lock @@ -417,6 +417,7 @@ GEM ffi (~> 1.12) ruby2_keywords (0.0.5) ruby_dig (0.0.2) + rubyzip (2.3.2) sanitize (6.0.2) crass (~> 1.0.2) nokogiri (>= 1.12.0) @@ -574,6 +575,7 @@ DEPENDENCIES rack-mini-profiler (~> 2.0) rails (~> 7.1) rinku (~> 2.0.6) + rubyzip (~> 2.3.2) sanitize (~> 6.0.2) sass-rails (>= 6) scenic (~> 1.5.4) diff --git a/server/app/jobs/update_nci_thesaurus.rb b/server/app/jobs/update_nci_thesaurus.rb index 50c290228..bdefd86d6 100644 --- a/server/app/jobs/update_nci_thesaurus.rb +++ b/server/app/jobs/update_nci_thesaurus.rb @@ -34,7 +34,7 @@ def remove_download end def latest_ncit_path - "https://stars.renci.org/var/NCIt/ncit.obo" + "https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Thesaurus.FLAT.zip" end end diff --git a/server/app/lib/importer/nci_thesaurus_mirror.rb b/server/app/lib/importer/nci_thesaurus_mirror.rb index d1a4ece59..8368c9a7d 100644 --- a/server/app/lib/importer/nci_thesaurus_mirror.rb +++ b/server/app/lib/importer/nci_thesaurus_mirror.rb @@ -1,14 +1,25 @@ +require "csv" +require "zip" + module Importer class NciThesaurusMirror attr_reader :parser, :version def initialize(path, version = 
Time.now.utc.iso8601) - @parser = Obo::Parser.new(path) + zip_file = Zip::File.open(path) + entry = zip_file.glob('*.txt').first + csv_text = entry.get_input_stream.read + @parser = CSV.parse( + csv_text, + col_sep: "\t", + liberal_parsing: true, + headers: ['code', 'concept_iri', 'parents', 'synonyms', 'definition', 'display_name', 'concept_status', 'semantic_type', 'concept_in_subset'], + ) @version = version end def import - parser.elements.each do |elem| + parser.each do |elem| if valid_entry?(elem) create_object_from_entry(elem) end @@ -16,33 +27,19 @@ def import end def valid_entry?(entry) - semantic_types = semantic_types(entry) - obsolete_concepts = obsolete_concepts(entry) - (entry['id'].present? && entry['name'].present? && entry.respond_to?(:name) && entry.name == 'Term' && - (semantic_types & valid_semantic_types).length > 0 && - (obsolete_concepts & ['Obsolete_Concept']).length == 0) - end - - def semantic_types(entry) - matcher = /^NCIT:P106 "(?<semantic_type>.+)"/ - entry['property_value'].map { |s| s.match(matcher) }.compact.map { |s| s[:semantic_type] } + valid_semantic_types.include?(entry['semantic_type']) && entry['concept_status'].nil? 
end def valid_semantic_types ['Pharmacologic Substance', 'Pharmacological Substance', 'Clinical Drug', 'Therapeutic or Preventive Procedure', 'Hazardous or Poisonous Substance'] end - def obsolete_concepts(entry) - matcher = /^NCIT:P310 "(?<obsolete_concept>.+)"/ - entry['property_value'].map { |s| s.match(matcher) }.compact.map { |s| s[:obsolete_concept] } - end - def create_object_from_entry(entry) - name = Therapy.capitalize_name(entry['name']) - ncit_id = entry['id'].sub('NCIT:', '') + synonyms = entry['synonyms'].split('|').map{|s| Therapy.capitalize_name(s)} + name = synonyms.shift() + ncit_id = entry['code'] therapy = ::Therapy.where(ncit_id: ncit_id).first_or_initialize therapy.name = name - synonyms = process_synonyms(entry['synonym']).uniq synonyms.each do |syn| therapy_alias = ::TherapyAlias.where(name: syn).first_or_create if !therapy.therapy_aliases.map{|a| a.name.downcase}.include?(syn.downcase) && !(syn.downcase == therapy.name.downcase) @@ -51,25 +48,6 @@ def create_object_from_entry(entry) end therapy.save end - - def process_synonyms(synonym_element) - vals = if synonym_element.blank? - [] - elsif synonym_element.is_a?(String) - [extract_synonym(synonym_element)] - elsif synonym_element.is_a?(Array) - synonym_element.map { |s| extract_synonym(s) } - end - vals.compact - end - - def extract_synonym(value) - if match_data = value.match(/^"(?<name>.+)" EXACT \[.*\]/) - Therapy.capitalize_name(match_data[:name]) - else - nil - end - end end end