diff --git a/dwc_agent.gemspec b/dwc_agent.gemspec index a010409..244ec32 100644 --- a/dwc_agent.gemspec +++ b/dwc_agent.gemspec @@ -7,7 +7,7 @@ Gem::Specification.new do |s| s.name = 'dwc_agent' s.version = DwcAgent::Version.version s.license = 'MIT' - s.date = '2024-11-07' + s.date = '2024-11-08' s.summary = "Parse Darwin Core agent terms such as recordedBy and identifiedBy" s.description = "Parses the typically messy content in Darwin Core terms that contain people names" s.authors = ["David P. Shorthouse"] diff --git a/lib/dwc_agent/cleaner.rb b/lib/dwc_agent/cleaner.rb index 0c6ae6d..6f1df2d 100644 --- a/lib/dwc_agent/cleaner.rb +++ b/lib/dwc_agent/cleaner.rb @@ -2,17 +2,25 @@ module DwcAgent class Cleaner + @defaults = { + blacklist: BLACKLIST, + given_blacklist: GIVEN_BLACKLIST, + family_blacklist: FAMILY_BLACKLIST, + particles: PARTICLES + } + class << self + attr_reader :defaults + def instance Thread.current[:dwc_agent_cleaner] ||= new end end - def initialize - @blacklist = BLACKLIST - @given_blacklist = GIVEN_BLACKLIST - @family_blacklist = FAMILY_BLACKLIST - @particles = PARTICLES + attr_reader :options + + def initialize(options = {}) + @options = self.class.defaults.merge(options) end def default @@ -35,7 +43,7 @@ def clean(parsed_namae) end if parsed_namae.given && - @given_blacklist.any?{ |s| s.casecmp(parsed_namae.given) == 0 } + options[:given_blacklist].any?{ |s| s.casecmp(parsed_namae.given) == 0 } return end @@ -55,7 +63,7 @@ def clean(parsed_namae) return default end - if parsed_namae.display_order =~ @blacklist + if parsed_namae.display_order =~ options[:blacklist] return default end @@ -113,7 +121,7 @@ def clean(parsed_namae) end if parsed_namae.family && - @family_blacklist.any?{ |s| s.casecmp(parsed_namae.family) == 0 } + options[:family_blacklist].any?{ |s| s.casecmp(parsed_namae.family) == 0 } return default end @@ -140,7 +148,7 @@ def clean(parsed_namae) if !family.nil? && given.nil? && !particle.nil? && - !@particles.include?(particle.downcase) + !options[:particles].include?(particle.downcase) given = particle.sub(/[a-z]\./, &:upcase).sub(/^(.)/) { $1.capitalize } particle = nil end @@ -161,11 +169,11 @@ def clean(parsed_namae) return default end - if !family.nil? && @family_blacklist.any?{ |s| s.casecmp(family) == 0 } + if !family.nil? && options[:family_blacklist].any?{ |s| s.casecmp(family) == 0 } return default end - if !given.nil? && @given_blacklist.any?{ |s| s.casecmp(given) == 0 } + if !given.nil? && options[:given_blacklist].any?{ |s| s.casecmp(given) == 0 } return default end diff --git a/lib/dwc_agent/parser.rb b/lib/dwc_agent/parser.rb index 394c0a9..ca11dcb 100644 --- a/lib/dwc_agent/parser.rb +++ b/lib/dwc_agent/parser.rb @@ -2,27 +2,33 @@ module DwcAgent class Parser + @defaults = { + prefer_comma_as_separator: true, + separator: SPLIT_BY, + title: TITLE, + appellation: APPELLATION, + suffix: SUFFIX, + strip_out_regex: Regexp.new(STRIP_OUT.to_s), + tidy_remains_regex: Regexp.new(POST_STRIP_TIDY.to_s), + char_subs_regex: Regexp.new([CHAR_SUBS.keys.join].to_s), + phrase_subs_regex: Regexp.new(PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s), + residual_terminators_regex: Regexp.new(SPLIT_BY.to_s + %r{\s*\z}.to_s), + separators: SEPARATORS.map{|k,v| [ Regexp.new(k), v] } + } + class << self + attr_reader :defaults + def instance Thread.current[:dwc_agent_parser] ||= new end end - def initialize - options = { - prefer_comma_as_separator: true, - separator: SPLIT_BY, - title: TITLE, - appellation: APPELLATION, - suffix: SUFFIX - } - @namae = Namae::Parser.new(options) - @strip_out_regex = Regexp.new STRIP_OUT.to_s - @tidy_remains_regex = Regexp.new POST_STRIP_TIDY.to_s - @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s - @phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s - @residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s - @separators = SEPARATORS.map{|k,v| [ Regexp.new(k), v] } + attr_reader :options, :namae + + def initialize(options = {}) + @options = self.class.defaults.merge(options) + @namae = Namae::Parser.new(@options) end # Parses the passed-in string and returns a list of names. @@ -31,14 +37,14 @@ def initialize # @return [Array] the list of parsed names def parse(name) return [] if name.nil? || name == "" - name.gsub!(@strip_out_regex, ' ') - name.gsub!(@tidy_remains_regex, '') - name.gsub!(Regexp.union(@char_subs_regex, @phrase_subs_regex), CHAR_SUBS.merge(PHRASE_SUBS)) - @separators.each{|k| name.gsub!(k[0], k[1])} - name.gsub!(@residual_terminators_regex, '') + name.gsub!(options[:strip_out_regex], ' ') + name.gsub!(options[:tidy_remains_regex], '') + name.gsub!(Regexp.union(options[:char_subs_regex], options[:phrase_subs_regex]), CHAR_SUBS.merge(PHRASE_SUBS)) + options[:separators].each{|k| name.gsub!(k[0], k[1])} + name.gsub!(options[:residual_terminators_regex], '') name.squeeze!(' ') name.strip! - @namae.parse(name) + namae.parse(name) end end diff --git a/lib/dwc_agent/version.rb b/lib/dwc_agent/version.rb index 28e33f7..d826d3b 100644 --- a/lib/dwc_agent/version.rb +++ b/lib/dwc_agent/version.rb @@ -3,8 +3,8 @@ module DwcAgent class Version MAJOR = 3 - MINOR = 2 - PATCH = 1 + MINOR = 3 + PATCH = 0 BUILD = 0 def self.version