Skip to content

Commit

Permalink
Update term parsing to fix math and usageInfo
Browse files Browse the repository at this point in the history
  • Loading branch information
ronaldtse committed Apr 17, 2020
1 parent 5d652b1 commit f2e541b
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 54 deletions.
31 changes: 17 additions & 14 deletions lib/iev/termbase/nested_term_builder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def self.build(options)
attr_reader :data, :status, :options

# Returns `data` as one String with HTML entities decoded (expanded entity
# set), memoized in @term_attributes.
#
# The value has to be a String, not an Array of "; "-separated pieces:
# every extract_* helper in this class runs a regular expression over the
# whole text via String#match.
#
# @return [String] decoded attribute text ("" when there is nothing to decode)
def term_attributes
  @term_attributes ||= HTMLEntities.new(:expanded).decode(data.to_s) || ""
end

def build_nested_term
Expand All @@ -43,14 +43,14 @@ def term_status
end

# Extracts grammatical gender and number from the attribute text.
#
# A gender marker is a standalone `m`, `f` or `n` that is either
#   * directly followed by " pl" (plural),
#   * at the end of a line, preceded by whitespace, or
#   * at the start of a line, followed by whitespace, ",", ";" or end of line.
#
# @return [Hash{String=>String}, nil] `{"gender" => "m"|"f"|"n",
#   "plurality" => "singular"|"plural"}`, or nil when no marker is found
def extract_gender
  # The plural alternative is listed first so that a leading "m pl" is not
  # consumed by the start-of-line alternative and reported as singular.
  # `[mfn]` (not `[m|f|n]`) keeps a literal "|" from counting as a gender,
  # and the lookahead stops words such as "noun" from matching on their
  # first letter.
  found = term_attributes.match(/\b([mfn]) (pl)\b|\s([mfn])$|^([mfn])(?=[\s,;]|$)/)
  return unless found

  {
    "gender" => found[1] || found[3] || found[4],
    "plurality" => found[2] ? "plural" : "singular",
  }
end

Expand All @@ -65,29 +65,32 @@ def parts_hash
end

# Extracts a trailing two-letter upper-case code from the attribute text.
#
# NOTE(review): presumably this is a country/region code such as "US";
# the pattern itself only requires two capitals at the end of a line.
#
# @return [String, nil] the two capitals, or nil when the text does not end
#   with them
def extract_geographical_area
  # String#[] with a capture index returns nil on no match, so no explicit
  # MatchData guard is needed.
  term_attributes[/([A-Z]{2})$/, 1]
end

# Finds the first part-of-speech label in the attribute text and maps it
# through `parts_hash`, falling back to the raw label when the hash has no
# entry for it.
#
# @return [Object, nil] the `parts_hash` value for the label, the label
#   itself when unmapped, or nil when no label is present
def extract_part_of_speach
  parts_regex = /noun|名詞|verb|動詞|Adjektiv|adj|形容詞|형용사/

  # The pattern has no capture group, so the label is the whole match
  # (index 0); asking for capture 1 would always give nil.
  part_of_speach = term_attributes[parts_regex]
  return unless part_of_speach

  parts_hash[part_of_speach] || part_of_speach
end

# Extracts the text enclosed in the first "<...>" pair of the attribute
# text, with surrounding whitespace removed.
#
# The angle brackets are matched literally; this relies on term_attributes
# returning entity-decoded text rather than "&lt;"/"&gt;" sequences.
#
# @return [String, nil] the stripped bracket contents, or nil when there is
#   no "<...>" pair
def extract_usage_info
  usage_info = term_attributes[/<(.*?)>/, 1]
  usage_info.strip if usage_info
end

# Tells whether the attribute text carries one of the known "prefix" labels.
#
# NOTE(review): 接尾語 is the Japanese word for "suffix" (prefix is 接頭語);
# confirm against the source data before changing the pattern.
#
# @return [true, nil] true when a label is present, nil otherwise
def extract_prefix
  true if term_attributes =~ /Präfix|prefix|préfixe|接尾語|접두사|przedrostek|prefixo|词头/
end
end
end
Expand Down
118 changes: 78 additions & 40 deletions lib/iev/termbase/term_builder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
require "iev/termbase/iso_639_code"
require "iev/termbase/nested_term_builder"
require 'mathml2asciimath'
require 'relaton_bib'

module Iev
module Termbase
Expand Down Expand Up @@ -188,6 +189,10 @@ def nested_term
Iev::Termbase::NestedTermBuilder
end

# Decodes the HTML entities in +text+ (expanded entity set) and passes the
# decoded string through html_entities_to_asciimath.
#
# @param text [String] text that may contain HTML entities
# @return [String] result of html_entities_to_asciimath on the decoded text
def text_to_asciimath(text)
  decoder = HTMLEntities.new(:expanded)
  decoded_text = decoder.decode(text)
  html_entities_to_asciimath(decoded_text)
end

def html_to_asciimath(input)
return input if input.nil? || input.empty?

Expand All @@ -196,11 +201,11 @@ def html_to_asciimath(input)
to_asciimath.css('i').each do |math_element|
# puts "HTML MATH!! #{math_element.to_xml}"
# puts "HTML MATH!! #{math_element.text}"
decoded = HTMLEntities.new.decode(math_element.text)
decoded = text_to_asciimath(math_element.text)
case decoded.length
when 1..12
# puts "(#{math_element.text} to => #{HTMLEntities.new.decode(math_element.text)})"
math_element.replace "stem:[#{decoded.gsub(/[&;]/, '')}]"
# puts "(#{math_element.text} to => #{decoded})"
math_element.replace "stem:[#{decoded}]"
when 0
math_element.remove
else
Expand All @@ -213,7 +218,7 @@ def html_to_asciimath(input)
when 0
math_element.remove
else
math_element.replace "~#{math_element.text}~"
math_element.replace "~#{text_to_asciimath(math_element.text)}~"
end
end

Expand All @@ -222,7 +227,7 @@ def html_to_asciimath(input)
when 0
math_element.remove
else
math_element.replace "^#{math_element.text}^"
math_element.replace "^#{text_to_asciimath(math_element.text)}^"
end
end

Expand All @@ -243,19 +248,57 @@ def html_to_asciimath(input)
x.replace "`#{x.text}`"
end

html_entities_to_asciimath(
html_entities_to_stem(
to_asciimath.children.to_s.gsub(/\]stem:\[/, '').gsub(/<\/?[uo]l>/, '')
)
end

# Replaces named Greek-letter HTML entities with their bare names, e.g.
# "&alpha;" becomes "alpha" and "&Gamma;" becomes "Gamma".
#
# Matching is case-sensitive and limited to the names listed below; any
# other entity (for example "&amp;" or "&Omega;") is left untouched.
#
# @param x [String] text that may contain Greek-letter entities
# @return [String] a new string with the listed entities replaced
def html_entities_to_asciimath(x)
  # One pass over the string instead of one gsub per entity. Each name is
  # listed once; the match is anchored by "&" and ";", so a short name such
  # as "eta" cannot match inside a longer one such as "theta".
  x.gsub(/&(
    alpha|beta|gamma|Gamma|delta|Delta|epsilon|varepsilon|zeta|eta|
    theta|Theta|vartheta|iota|kappa|lambda|Lambda|mu|nu|xi|Xi|pi|Pi|
    rho|sigma|Sigma|tau|upsilon|phi|Phi|varphi|chi|psi|Psi|omega
  );/x, '\1')
end

def html_entities_to_stem(x)
x.gsub("&alpha;", "stem:[alpha]").
gsub("&beta;", "stem:[beta]").
gsub("&gamma;", "stem:[gamma]").
gsub("&Gamma;", "stem:[Gamma]").
gsub("&delta;", "stem:[delta]").
gsub("&Delta;", "stem:[Delta]").
gsub("&Delta;", "stem:[Delta]").
gsub("&epsilon;", "stem:[epsilon]").
gsub("&varepsilon;", "stem:[varepsilon]").
gsub("&zeta;", "stem:[zeta]").
Expand Down Expand Up @@ -301,8 +344,8 @@ def mathml_to_asciimath(input)
# to_asciimath.remove_namespaces!

to_asciimath.css('math').each do |math_element|
asciimath = MathML2AsciiMath.m2a(math_element.to_xml).strip
# puts "ASCIIMATH!! #{asciimath}"
asciimath = MathML2AsciiMath.m2a(text_to_asciimath(math_element.to_xml)).strip
# puts"ASCIIMATH!! #{asciimath}"

if asciimath.empty?
math_element.remove
Expand All @@ -325,37 +368,32 @@ def extract_definition_value
# Builds the authoritative-source hash from the "SOURCE" value.
#
# The text before the first ",", "(" or ")" is taken as the reference. It
# is normalized (";" to ":", U+2011 to "-", "IEC IEEE" to "IEC/IEEE") and
# its first dotted number is removed. Whatever remains of the source once
# the cleaned reference is removed becomes the clause.
#
# @return [Hash{String=>String}, nil] hash with "ref", plus "clause" when
#   non-empty and "link" when the lookup returns an item; nil when there is
#   no SOURCE value or the lookup raises RelatonBib::RequestError (the
#   error message is written with `warn`)
def extract_authoritative_source
  source = find_value_for("SOURCE")
  return nil if source.nil?

  # source = "ISO/IEC GUIDE 99:2007 1.26"
  raw_ref = source.match(/\A[^,\()]+/).to_s

  clean_ref = raw_ref.
    sub(";", ":").
    sub(/\u2011/, "-").
    sub(/IEC\sIEEE/, "IEC/IEEE").
    sub(/\d\.[\d\.]+/, "").
    strip

  clause = source.
    gsub(clean_ref, "").
    gsub(/\A,?\s+/, "").
    strip

  # Looked up exactly once; the item is only used for its URL.
  item = ::Iev::Termbase::RelatonDb.instance.fetch(clean_ref)

  src = {}
  src["ref"] = clean_ref
  src["clause"] = clause unless clause.empty?
  src["link"] = item.url if item
  src
rescue ::RelatonBib::RequestError => e
  warn e.message
end

SIMG_PATH_REGEX = "<simg .*\\/\\$file\\/([\\d\\-\\w\.]+)>"
Expand Down

0 comments on commit f2e541b

Please sign in to comment.