From f2e541b8114d9259fd028a032ddddb9b65f540e2 Mon Sep 17 00:00:00 2001 From: Ronald Tse Date: Fri, 17 Apr 2020 18:37:44 +0800 Subject: [PATCH] Update term parsing to fix math and usageInfo --- lib/iev/termbase/nested_term_builder.rb | 31 ++++--- lib/iev/termbase/term_builder.rb | 118 ++++++++++++++++-------- 2 files changed, 95 insertions(+), 54 deletions(-) diff --git a/lib/iev/termbase/nested_term_builder.rb b/lib/iev/termbase/nested_term_builder.rb index 0a6292d..5b359db 100644 --- a/lib/iev/termbase/nested_term_builder.rb +++ b/lib/iev/termbase/nested_term_builder.rb @@ -22,7 +22,7 @@ def self.build(options) attr_reader :data, :status, :options def term_attributes - @term_attributes ||= data.to_s.split("; ") + @term_attributes ||= HTMLEntities.new(:expanded).decode(data.to_s) || "" end def build_nested_term @@ -43,14 +43,14 @@ def term_status end def extract_gender - genders = term_attributes.grep(/[m|f|n]$|[m|f|n] pl/) + genders = term_attributes.match(/\s([m|f|n])$|^([m|f|n])[\s,]?|([m|f|n]) (pl)/) - unless genders.empty? + if genders plurality = "singular" - genders = genders.first.split(" ") - plurality = "plural" if genders.size > 1 && genders[1] == "pl" + gender = genders[1..3].join('') + plurality = "plural" if genders.size > 1 && genders[4] == "pl" - { "gender" => genders[0], "plurality" => plurality } + { "gender" => gender, "plurality" => plurality } end end @@ -65,29 +65,32 @@ def parts_hash end def extract_geographical_area - term_attributes.grep(/[A-Z]{2}$/).first + area = term_attributes.match(/([A-Z]{2})$/) + if area && area.size > 1 + area[1] + end end def extract_part_of_speach parts_regex = /noun|名詞|verb|動詞|Adjektiv|adj|形容詞|형용사/ - part_of_speaches = term_attributes.grep(parts_regex) + part_of_speaches = term_attributes.match(parts_regex) - unless part_of_speaches.empty? - part_of_speach = part_of_speaches.first + if part_of_speaches + part_of_speach = part_of_speaches[1] parts_hash[part_of_speach] || part_of_speach end end def extract_usage_info - usage_info = term_attributes.grep(/</).first + usage_info = term_attributes.match(/<(.*?)>/) - if usage_info - usage_info.gsub(/<(.*?)>/, '\1') + if usage_info && usage_info.size > 1 + usage_info[1].strip end end def extract_prefix - term_attributes.grep(/Präfix|prefix|préfixe|接尾語|접두사|przedrostek|prefixo|词头/).empty? ? nil : true + term_attributes.match(/Präfix|prefix|préfixe|接尾語|접두사|przedrostek|prefixo|词头/) ? true : nil end end end diff --git a/lib/iev/termbase/term_builder.rb b/lib/iev/termbase/term_builder.rb index 290a17b..6d7e2de 100644 --- a/lib/iev/termbase/term_builder.rb +++ b/lib/iev/termbase/term_builder.rb @@ -2,6 +2,7 @@ require "iev/termbase/iso_639_code" require "iev/termbase/nested_term_builder" require 'mathml2asciimath' +require 'relaton_bib' module Iev module Termbase @@ -188,6 +189,10 @@ def nested_term Iev::Termbase::NestedTermBuilder end + def text_to_asciimath(text) + html_entities_to_asciimath(HTMLEntities.new(:expanded).decode(text)) + end + def html_to_asciimath(input) return input if input.nil? || input.empty? @@ -196,11 +201,11 @@ def html_to_asciimath(input) to_asciimath.css('i').each do |math_element| # puts "HTML MATH!! #{math_element.to_xml}" # puts "HTML MATH!! #{math_element.text}" - decoded = HTMLEntities.new.decode(math_element.text) + decoded = text_to_asciimath(math_element.text) case decoded.length when 1..12 - # puts "(#{math_element.text} to => #{HTMLEntities.new.decode(math_element.text)})" - math_element.replace "stem:[#{decoded.gsub(/[&;]/, '')}]" + # puts "(#{math_element.text} to => #{decoded})" + math_element.replace "stem:[#{decoded}]" when 0 math_element.remove else @@ -213,7 +218,7 @@ def html_to_asciimath(input) when 0 math_element.remove else - math_element.replace "~#{math_element.text}~" + math_element.replace "~#{text_to_asciimath(math_element.text)}~" end end @@ -222,7 +227,7 @@ def html_to_asciimath(input) when 0 math_element.remove else - math_element.replace "^#{math_element.text}^" + math_element.replace "^#{text_to_asciimath(math_element.text)}^" end end @@ -243,19 +248,57 @@ def html_to_asciimath(input) x.replace "`#{x.text}`" end - html_entities_to_asciimath( + html_entities_to_stem( to_asciimath.children.to_s.gsub(/\]stem:\[/, '').gsub(/<\/?[uo]l>/, '') ) end def html_entities_to_asciimath(x) + x.gsub("α", "alpha"). + gsub("β", "beta"). + gsub("γ", "gamma"). + gsub("Γ", "Gamma"). + gsub("δ", "delta"). + gsub("Δ", "Delta"). + gsub("ε", "epsilon"). + gsub("ϵ", "varepsilon"). + gsub("ζ", "zeta"). + gsub("η", "eta"). + gsub("θ", "theta"). + gsub("Θ", "Theta"). + gsub("ϑ", "vartheta"). + gsub("ι", "iota"). + gsub("κ", "kappa"). + gsub("λ", "lambda"). + gsub("Λ", "Lambda"). + gsub("μ", "mu"). + gsub("ν", "nu"). + gsub("ξ", "xi"). + gsub("Ξ", "Xi"). + gsub("π", "pi"). + gsub("Π", "Pi"). + gsub("ρ", "rho"). + gsub("β", "beta"). + gsub("σ", "sigma"). + gsub("Σ", "Sigma"). + gsub("τ", "tau"). + gsub("υ", "upsilon"). + gsub("φ", "phi"). + gsub("Φ", "Phi"). + gsub("ϕ", "varphi"). + gsub("χ", "chi"). + gsub("ψ", "psi"). + gsub("Ψ", "Psi"). + gsub("ω", "omega") + end + + def html_entities_to_stem(x) x.gsub("α", "stem:[alpha]"). gsub("β", "stem:[beta]"). gsub("γ", "stem:[gamma]"). gsub("Γ", "stem:[Gamma]"). gsub("δ", "stem:[delta]"). gsub("Δ", "stem:[Delta]"). - gsub("Δ", "stem:[Delta]"). gsub("ε", "stem:[epsilon]"). gsub("ϵ", "stem:[varepsilon]"). gsub("ζ", "stem:[zeta]"). @@ -301,8 +344,8 @@ def mathml_to_asciimath(input) # to_asciimath.remove_namespaces! to_asciimath.css('math').each do |math_element| - asciimath = MathML2AsciiMath.m2a(math_element.to_xml).strip - # puts "ASCIIMATH!! #{asciimath}" + asciimath = MathML2AsciiMath.m2a(text_to_asciimath(math_element.to_xml)).strip + # puts"ASCIIMATH!! #{asciimath}" if asciimath.empty? math_element.remove @@ -325,37 +368,32 @@ def extract_definition_value def extract_authoritative_source source = find_value_for("SOURCE") - if source - begin - # source = "ISO/IEC GUIDE 99:2007 1.26" - raw_ref = source.match(/\A[^,\()]+/).to_s - - clean_ref = raw_ref. - sub(";", ":"). - sub(/\u2011/, "-"). - sub(/IEC\sIEEE/, "IEC/IEEE"). - sub(/\d\.[\d\.]+/, ""). - strip - - clause = source. - gsub(clean_ref, ""). - gsub(/\A,?\s+/,""). - strip - - item = RelatonDb.instance.fetch(clean_ref) - - src = {} - src["ref"] = clean_ref - src["clause"] = clause unless clause.empty? - src["link"] = item.url if item - src - rescue RelatonBib::RequestError => e - warn e.message - src - end - else - source - end + return nil if source.nil? + + # source = "ISO/IEC GUIDE 99:2007 1.26" + raw_ref = source.match(/\A[^,\()]+/).to_s + + clean_ref = raw_ref. + sub(";", ":"). + sub(/\u2011/, "-"). + sub(/IEC\sIEEE/, "IEC/IEEE"). + sub(/\d\.[\d\.]+/, ""). + strip + + clause = source. + gsub(clean_ref, ""). + gsub(/\A,?\s+/,""). + strip + + item = ::Iev::Termbase::RelatonDb.instance.fetch(clean_ref) + + src = {} + src["ref"] = clean_ref + src["clause"] = clause unless clause.empty? + src["link"] = item.url if item + src + rescue ::RelatonBib::RequestError => e + warn e.message end SIMG_PATH_REGEX = ""