diff --git a/CHANGELOG.md b/CHANGELOG.md index 8861fe0f5..81b874be9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,12 +34,12 @@ Versioning](https://semver.org/spec/v2.0.0.html). - documentation for adding new ingest formats to Datura - byebug gem for debugging - instructions for installing Javascript Runtime files for Saxon -- API schema can either be 1.0 or 2.0 (which includes nested fields); 1.0 will be run by default unless 2.0 is specified. Add the following to `public.yml` or `private.yml` in the data repo: +- API schema can either be the original 1.0 or the newly updated 2.0 (which includes new fields including nested fields); 1.0 will be run by default unless 2.0 is specified. Add the following to `public.yml` or `private.yml` in the data repo: ``` api_version: '2.0' ``` See new schema (2.0) documentation [here](https://github.com/CDRH/datura/docs/schema_v2.md) -- schema validation with API version 2.0, invalidly constructed documents will not post +- schema validation with API version 2.0: invalidly constructed documents will not post - authentication with Elasticesarch 8.5; add the following to `public.yml` or `private.yml` in the data repo: ``` es_user: username @@ -47,24 +47,32 @@ See new schema (2.0) documentation [here](https://github.com/CDRH/datura/docs/sc ``` - field overrides for new fields in the new API schema - functionality to transform EAD files and post them to elasticsearch +- functionality to transform PDF files (including text and metadata) and post them to elasticsearch +- limiting `text` field to a specific limit: `text_limit` in `public.yml` or `private.yml` +- configuration options related to Elasticsearch, including `es_schema_override` and `es_schema_path` to change the location of the Elasticsearch schema +- more detailed errors including a stack trace ### Changed - update ruby to 3.1.2 - date_standardize now relies on strftime instead of manual zero padding for month, day - minor corrections to documentation - XPath: "text" is 
now ingested as an array and will be displayed delimitted by spaces +- "text" field now includes "notes" XPath +- refactored posting script (`Datura.run`) - refactored command line methods into elasticsearch library - refactored and moved date_standardize and date_display helper methods -- Nokogiri methods `get_text` and `get_list` on TEI now return nil rather than empty strings or arrays if there are no matches +- Nokogiri methods `get_text` and `get_list` on TEI now return nil rather than empty strings or arrays if there are no matches. Fields have been changed to check for these nil values ### Migration - check to make sure "text" xpath is doing desired behavior - use Elasticsearch 8.5 or higher and add authentication as described above if security is enabled. See [dev docs instructions](https://github.com/CDRH/cdrh_dev_docs/blob/update_elasticsearch_documentation/publishing/2_basic_requirements.md#downloading-elasticsearch). - upgrade data repos to Ruby 3.1.2 +- - add api version to config as described above - make sure fields are consistent with the api schema, many have been renamed or changed in format -- add nil checks with get_text and get_list methods +- add nil checks with get_text and get_list methods as needed - add EadToES overrides if ingesting EAD files +- add `byebug` and `pdf-reader` to Gemfile in repos based on Datura - if overriding the `read_csv` method in `lib/datura/file_type.rb`, the hash must be prefixed with ** (`**{}`). 
## [v0.2.0-beta](https://github.com/CDRH/datura/compare/v0.1.6...v0.2.0-beta) - 2020-08-17 - Altering field and xpath behavior, adds get_elements diff --git a/lib/config/public.yml b/lib/config/public.yml index 11b42716c..7fefc6c85 100644 --- a/lib/config/public.yml +++ b/lib/config/public.yml @@ -9,22 +9,20 @@ # the collection specific configuration files: # (config/public.yml and config/private.yml) - ################### # Defaults # ################### default: - # SCRIPT POWER # recommend this be increased in private.yml # on more powerful systems to improve runtime threads: 5 # LOGGING - log_old_number: 1 # number of log files before beginning to erase - log_size: 32768000 # size of log file in bytes - log_level: Logger::INFO # available levels: UNKNOWN, FATAL, ERROR, WARN, INFO, DEBUG + log_old_number: 1 # number of log files before beginning to erase + log_size: 32768000 # size of log file in bytes + log_level: Logger::INFO # available levels: UNKNOWN, FATAL, ERROR, WARN, INFO, DEBUG # ELASTICSEARCH SCHEMA CONFIGURATION # if es_schema_override is false, datura is base directory @@ -40,14 +38,17 @@ default: api_version: "1.0" # NOTE: es_schema option is set later as combination of above # es_schema_override, es_schema_path, and api_version + # ES currently has a limited character size for keyword fields of 1000000 + # exceeding this limit (generally in text field) will cause errors when searching + text_limit: 900000 # RESOURCE LOCATIONS - data_base: https://cdrhmedia.unl.edu # xml, csv, html snippets, etc - media_base: https://cdrhmedia.unl.edu # images, audio, video - es_index: override_to_set_index # elasticsearch index name - es_path: http://localhost:9200 # elasticsearch path (recommend override) - solr_core: override_to_set_core # solr core name - solr_path: http://localhost:8983/solr # solr path (recommend override) + data_base: https://cdrhmedia.unl.edu # xml, csv, html snippets, etc + media_base: https://cdrhmedia.unl.edu # images, audio, video + 
es_index: override_to_set_index # elasticsearch index name + es_path: http://localhost:9200 # elasticsearch path (recommend override) + solr_core: override_to_set_core # solr core name + solr_path: http://localhost:8983/solr # solr path (recommend override) # OUTPUT LOCATION # default is [environment]/output/[file_type] @@ -92,7 +93,6 @@ default: development: data_base: https://cdrhdev1.unl.edu/media - ################## # Production # ################## diff --git a/lib/datura/parser_options/post.rb b/lib/datura/parser_options/post.rb index c3dd1218d..b6e48880f 100644 --- a/lib/datura/parser_options/post.rb +++ b/lib/datura/parser_options/post.rb @@ -22,12 +22,12 @@ def self.post_params # default to no restricted format options["format"] = nil - opts.on( '-f', '--format [input]', 'Supported formats (csv, html, pdf, tei, vra, webs)') do |input| + opts.on( '-f', '--format [input]', 'Supported formats (csv, html, ead, pdf, tei, vra, webs)') do |input| if %w[authority annotations].include?(input) puts "'authority' and 'annotations' are invalid formats".red puts "Please select a supported format or rename your custom format" exit - elsif !%w[csv html pdf tei vra webs].include?(input) + elsif !%w[csv ead html pdf tei vra webs].include?(input) puts "Caution: Requested custom format #{input}.".red puts "See FileCustom class for implementation instructions" end diff --git a/lib/datura/to_es/csv_to_es/fields.rb b/lib/datura/to_es/csv_to_es/fields.rb index 9adc59692..616470280 100644 --- a/lib/datura/to_es/csv_to_es/fields.rb +++ b/lib/datura/to_es/csv_to_es/fields.rb @@ -187,7 +187,7 @@ def text text_all += text_additional text_all = text_all.compact - Datura::Helpers.normalize_space(text_all.join(" ")) + Datura::Helpers.normalize_space(text_all.join(" "))[0..@options["text_limit"]] end # override and add by collection as needed diff --git a/lib/datura/to_es/ead_to_es/fields.rb b/lib/datura/to_es/ead_to_es/fields.rb index dfca070f7..95b89976e 100644 --- 
a/lib/datura/to_es/ead_to_es/fields.rb +++ b/lib/datura/to_es/ead_to_es/fields.rb @@ -77,7 +77,9 @@ def data_type def date(before=true) datestr = get_text(@xpaths["date"]) - return Datura::Helpers.date_standardize(datestr, before) + if datestr + return Datura::Helpers.date_standardize(datestr, before) + end end def date_display @@ -210,7 +212,9 @@ def text text = [] @xpaths.keys.each do |xpath| body = get_text(@xpaths[xpath]) - text << body + if body + text << body + end end text # TODO: do we need to preserve tags like in text? if so, turn get_text to true diff --git a/lib/datura/to_es/ead_to_es_items/fields.rb b/lib/datura/to_es/ead_to_es_items/fields.rb index 6eaf9c718..7764e083b 100644 --- a/lib/datura/to_es/ead_to_es_items/fields.rb +++ b/lib/datura/to_es/ead_to_es_items/fields.rb @@ -39,7 +39,7 @@ def creator_sort end def collection - "whitman-finding_aid_manuscripts" + "#{@options["collection"]}_items" end def collection_desc @@ -104,9 +104,11 @@ def format def get_id # doc = id doc = get_text(@xpaths["identifier"]) - if doc == "" + if !doc title = get_text(@xpaths["file"]) - return "#{@filename}_#{title}" + if title + return "#{@filename}_#{title}" + end end return "#{@filename}_#{doc}" end @@ -212,11 +214,13 @@ def text # means no worrying about handling spacing between words text = [] body = get_text(@xpaths["text"]) - text << body + if body + text << body + end # TODO: do we need to preserve tags like in text? 
if so, turn get_text to true # text << CommonXml.convert_tags_in_string(body) text += text_additional - return Datura::Helpers.normalize_space(text.join(" ")) + return Datura::Helpers.normalize_space(text.join(" "))[0..@options["text_limit"]] end def text_additional diff --git a/lib/datura/to_es/html_to_es/fields.rb b/lib/datura/to_es/html_to_es/fields.rb index 725ac4c4d..babe8adfc 100644 --- a/lib/datura/to_es/html_to_es/fields.rb +++ b/lib/datura/to_es/html_to_es/fields.rb @@ -152,9 +152,11 @@ def text # means no worrying about handling spacing between words text = [] body = get_text(@xpaths["text"]) - text << body + if body + text << body + end text += text_additional - Datura::Helpers.normalize_space(text.join(" ")) + Datura::Helpers.normalize_space(text.join(" "))[0..@options["text_limit"]] end def text_additional @@ -278,7 +280,7 @@ def keywords4 get_text(@xpaths["keywords4"]) end - def keywords4 + def keywords5 get_text(@xpaths["keywords5"]) end diff --git a/lib/datura/to_es/pdf_to_es/fields.rb b/lib/datura/to_es/pdf_to_es/fields.rb index b8ce90f52..1c3bfe4f2 100644 --- a/lib/datura/to_es/pdf_to_es/fields.rb +++ b/lib/datura/to_es/pdf_to_es/fields.rb @@ -187,7 +187,7 @@ def text end text_all += text_additional text_all = text_all.compact - Datura::Helpers.normalize_space(text_all.join(" "))[0..999999] + Datura::Helpers.normalize_space(text_all.join(" "))[0..@options["text_limit"]] end # override and add by collection as needed @@ -319,4 +319,7 @@ def has_relation def has_source end + def uri_html + end + end diff --git a/lib/datura/to_es/tei_to_es/fields.rb b/lib/datura/to_es/tei_to_es/fields.rb index 9b435cb72..960fd1929 100644 --- a/lib/datura/to_es/tei_to_es/fields.rb +++ b/lib/datura/to_es/tei_to_es/fields.rb @@ -205,11 +205,13 @@ def text # means no worrying about handling spacing between words text_all = [] body = get_text(@xpaths["text"], keep_tags: false, delimiter: '') - text_all << body + if body + text_all << body + end # TODO: do we need to 
preserve tags like in text? if so, turn get_text to true # text_all << CommonXml.convert_tags_in_string(body) text_all += text_additional - Datura::Helpers.normalize_space(text_all.join(" ")) + Datura::Helpers.normalize_space(text_all.join(" "))[0..@options["text_limit"]] end def text_additional diff --git a/lib/datura/to_es/tei_to_es/tei_to_es_personography.rb b/lib/datura/to_es/tei_to_es/tei_to_es_personography.rb index 7e4ff79be..4d3d43de2 100644 --- a/lib/datura/to_es/tei_to_es/tei_to_es_personography.rb +++ b/lib/datura/to_es/tei_to_es/tei_to_es_personography.rb @@ -16,7 +16,9 @@ def category def creator creators = get_list(@xpaths["creators"], false, @parent_xml) - creators.map { |c| { "name" => c } } + if creators + creators.map { |c| { "name" => c } } + end end def creators diff --git a/lib/datura/to_es/vra_to_es/fields.rb b/lib/datura/to_es/vra_to_es/fields.rb index 0f62b94d0..bdd3c4fd3 100644 --- a/lib/datura/to_es/vra_to_es/fields.rb +++ b/lib/datura/to_es/vra_to_es/fields.rb @@ -52,7 +52,9 @@ def data_type def date(before=true) datestr = get_list(@xpaths["date"]).first - Datura::Helpers.date_standardize(datestr, before) + if datestr + Datura::Helpers.date_standardize(datestr, before) + end end def date_display @@ -116,16 +118,16 @@ def person # subject element if get_text("@type", xml: p) == "personalName" { - id: nil, - name: get_text(".", xml: p), - role: nil + "id" => nil, + "name" => get_text(".", xml: p), + "role" => nil } # agent element else { - id: nil, - name: get_text("name", xml: p), - role: get_text("role", xml: p) + "id" => nil, + "name" => get_text("name", xml: p), + "role" => get_text("role", xml: p) } end end @@ -191,11 +193,13 @@ def text # handling separate fields in array # means no worrying about handling spacing between words text_all = [] - text_all << get_text(@xpaths["text"]) + if get_text(@xpaths["text"]) + text_all << get_text(@xpaths["text"]) + end # TODO: do we need to preserve tags like in text? 
if so, turn get_text to true # text_all << CommonXml.convert_tags_in_string(body) text_all += text_additional - Datura::Helpers.normalize_space(text_all.join(" ")) + Datura::Helpers.normalize_space(text_all.join(" "))[0..@options["text_limit"]] end def text_additional diff --git a/lib/datura/to_es/vra_to_es/vra_to_es_personography.rb b/lib/datura/to_es/vra_to_es/vra_to_es_personography.rb index 8d8f904b1..cab0c5591 100644 --- a/lib/datura/to_es/vra_to_es/vra_to_es_personography.rb +++ b/lib/datura/to_es/vra_to_es/vra_to_es_personography.rb @@ -13,7 +13,9 @@ def category def creator creators = get_list(@xpaths["creators"], xml: @parent_xml) - creators.map { |c| { "name" => c } } + if creators + creators.map { |c| { "name" => c } } + end end def creator_sort diff --git a/lib/datura/to_es/webs_to_es/fields.rb b/lib/datura/to_es/webs_to_es/fields.rb index 90e91d70d..8721e4462 100644 --- a/lib/datura/to_es/webs_to_es/fields.rb +++ b/lib/datura/to_es/webs_to_es/fields.rb @@ -162,9 +162,11 @@ def text # means no worrying about handling spacing between words text = [] body = get_text(@xpaths["text"]) - text << body + if body + text << body + end text += text_additional - Datura::Helpers.normalize_space(text.join(" ")) + Datura::Helpers.normalize_space(text.join(" "))[0..@options["text_limit"]] end def text_additional