Skip to content

Wikibase to Solr #13: constructing Solr objects

Human Experience Systems LLC edited this page May 17, 2023 · 1 revision

$solr_OBJECTS is a global hash, keyed by Wikibase item ID, that is populated with key-value data extracted from the Wikibase export data. Each entry maps a Solr field name to an array of unique values.

RUBY FUNCTIONS

require 'json'
require 'csv'
require 'date'
require 'time'
require 'optparse'

CONSTANTS

# Wikibase property IDs ("P1" through "P47") referenced when reading claims and
# qualifiers from the export. The constant names presumably mirror the property
# labels in the DS Wikibase -- verify against the Wikibase instance.
ID_DS_ID                                 = 'P1'
ID_MANUSCRIPT_HOLDING                    = 'P2'
ID_DESCRIBED_MANUSCRIPT                  = 'P3'
ID_HOLDING_INSTITUTION_IN_AUTHORITY_FILE = 'P4'
ID_HOLDING_INSTITUTION_AS_RECORDED       = 'P5'
ID_HOLDING_STATUS                        = 'P6'
ID_INSTITUTIONAL_ID                      = 'P7'
ID_SHELFMARK                             = 'P8'
ID_LINK_TO_INSTITUTIONAL_RECORD          = 'P9'
ID_TITLE_AS_RECORDED                     = 'P10'
ID_STANDARD_TITLE                        = 'P11'
ID_UNIFORM_TITLE_AS_RECORDED             = 'P12'
ID_IN_ORIGINAL_SCRIPT                    = 'P13'
ID_ASSOCIATED_NAME_AS_RECORDED           = 'P14'
ID_ROLE_IN_AUTHORITY_FILE                = 'P15'
ID_INSTANCE_OF                           = 'P16'
ID_NAME_IN_AUTHORITY_FILE                = 'P17'
ID_GENRE_AS_RECORDED                     = 'P18'
ID_SUBJECT_AS_RECORDED                   = 'P19'
ID_TERM_IN_AUTHORITY_FILE                = 'P20'
ID_LANGUAGE_AS_RECORDED                  = 'P21'
ID_LANGUAGE_IN_AUTHORITY_FILE            = 'P22'
ID_PRODUCTION_DATE_AS_RECORDED           = 'P23'
ID_PRODUCTION_CENTURY_IN_AUTHORITY_FILE  = 'P24'
ID_CENTURY                               = 'P25'
ID_DATED                                 = 'P26'
ID_PRODUCTION_PLACE_AS_RECORDED          = 'P27'
ID_PLACE_IN_AUTHORITY_FILE               = 'P28'
ID_PHYSICAL_DESCRIPTION                  = 'P29'
ID_MATERIAL_AS_RECORDED                  = 'P30'
ID_MATERIAL_IN_AUTHORITY_FILE            = 'P31'
ID_NOTE                                  = 'P32'
ID_ACKNOWLEDGEMENTS                      = 'P33'
ID_DATE_ADDED                            = 'P34'
ID_DATE_LAST_UPDATED                     = 'P35'
ID_LATEST_DATE                           = 'P36'
ID_EARLIEST_DATE                         = 'P37'
ID_START_TIME                            = 'P38'
ID_END_TIME                              = 'P39'
ID_EXTERNAL_IDENTIFIER                   = 'P40'
ID_IIIF_MANIFEST                         = 'P41'
ID_WIKIDATA_QID                          = 'P42'
ID_VIAF_ID                               = 'P43'
ID_EXTERNAL_URI                          = 'P44'
ID_EQUIVALENT_PROPERTY                   = 'P45'
ID_FORMATTER_URL                         = 'P46'
ID_SUBCLASS_OF                           = 'P47'

HELPER METHODS

##
# For either an Item claim, a claim property, or a qualifier property, return
# the value specified by 'type'. This method works on nested hashes with the
# structure:
#
#   data['mainsnak']['datavalue']['value']
#
# or
#
#   data['datavalue']['value']
#
# If 'type' == 'value', return the result of the ['datavalue']['value'] chain
# unchanged (an empty Hash when the chain is absent); otherwise, return the
# entry stored under the key 'type' inside that value.
#
# Any string will work for `type`. The only special 'type' is `value`, which
# returns whatever is stored under the 'value' key. The property value types in
# the DS Wikibase JSON are:
#
#   'entity-type'
#   'numeric-id'
#   'id'
#   'time'
#   'timezone'
#   'before'
#   'after'
#   'precision'
#   'calendarmodel'
#
# @param [Hash] data item or claim property or qualifier property
# @param [String] type the value type to be returned
# @return [Hash,String,Object,nil] the nested data specified by type; nil when
#   `data` is not a Hash, or when a sub-field is requested from a value that is
#   not a Hash (e.g. a plain string value)
def get_value_by_type(data, type)
  return unless data.is_a?(Hash)

  # If `data` has a 'mainsnak', the 'datavalue' => 'value' chain lives inside
  # it; otherwise we assume `data` itself carries the 'datavalue' => 'value'
  # chain (the qualifier shape).
  snak = data['mainsnak'] || data

  # Be safe anyway: a non-Hash 'mainsnak' cannot be dug into
  return unless snak.is_a?(Hash)

  # Example snak:
  # {"snaktype"=>"value", "property"=>"P16", "datavalue"=>{"value"=>{"entity-type"=>"item", "numeric-id"=>3, "id"=>"Q3"}, "type"=>"wikibase-entityid"}, "datatype"=>"wikibase-item"}

  # Fall back to an empty Hash so callers asking for 'value' never receive nil
  value = snak.dig('datavalue', 'value') || {}

  return value if type == 'value'

  # Only Hash values have sub-fields. Indexing a String value with `type` would
  # perform a substring search (String#[]) and could return `type` itself.
  value[type] if value.is_a?(Hash)
end

##
# Return the 'numeric-id' of the first 'instance of' (ID_INSTANCE_OF, "P16")
# claim in a claims hash, e.g. 17 when the claim points at Q17. Returns nil
# when `claims` is not a Hash or has no (or an empty) "P16" entry.
#
# Example:
#
#  JSON structure:
#
#      "claims":
#      {
#          "P16":
#          [
#              {
#                  "mainsnak":
#                  {
#                      "snaktype": "value",
#                      "property": "P16",
#                      "datavalue":
#                      {
#                          "value":
#                          {
#                              "entity-type": "item",
#                              "numeric-id": 17,
#                              "id": "Q17"
#                          },
#                          "type": "wikibase-entityid"
#                      },
#                      "datatype": "wikibase-item"
#                  },
#                  "type": "statement",
#                  "id": "Q18$37029DB4-8D1C-4F47-BCBB-26F0C41F1046",
#                  "rank": "normal"
#              }
#          ],
#          // ... etc. ...
#      },
#
#     instance_of = get_first_instance_of(claims) # => 17
#
# @param [Hash] claims the item's "claims" hash (property ID => array of claims)
# @return [Object,nil] whatever get_value_by_type yields for 'numeric-id'
def get_first_instance_of(claims)
  return unless claims.instance_of?(Hash)

  instance_claims = claims[ID_INSTANCE_OF]
  return if !instance_claims || instance_claims.empty?

  # each claim property is an array; only the first entry is consulted
  get_value_by_type(instance_claims.first, 'numeric-id')
end

# Return the 'value' of the first ID_WIKIDATA_QID claim, as extracted by
# get_value_by_type, or nil when has_wikidata_id reports no such claim.
def get_first_wikidata_id(claims)
  if has_wikidata_id(claims)
    # each claim property is an array; only the first entry is consulted
    get_value_by_type(claims[ID_WIKIDATA_QID].first, 'value')
  end
end

# Report whether a claims hash carries at least one ID_WIKIDATA_QID claim.
# Returns true when it does and nil (not false) in every other case.
def has_wikidata_id(claims)
  if claims.instance_of?(Hash)
    wikidata_claims = claims[ID_WIKIDATA_QID]
    true if wikidata_claims && !wikidata_claims.empty?
  end
end

# Return the 'value' of the first ID_EXTERNAL_URI claim, as extracted by
# get_value_by_type, or nil when has_external_uri reports no such claim.
def get_first_external_uri(claims)
  if has_external_uri(claims)
    # each claim property is an array; only the first entry is consulted
    get_value_by_type(claims[ID_EXTERNAL_URI].first, 'value')
  end
end

# Report whether a claims hash carries at least one ID_EXTERNAL_URI claim.
# Returns true when it does and nil (not false) in every other case.
def has_external_uri(claims)
  if claims.instance_of?(Hash)
    uri_claims = claims[ID_EXTERNAL_URI]
    true if uri_claims && !uri_claims.empty?
  end
end

# Resolve a Wikibase datavalue hash to the plain value used for the Solr item.
#
# * Hash with an "id": the label recorded for that ID in $item_LABELS
#   (nil when no label was recorded).
# * Hash with a "time" (and no "id"): the raw "time" value.
# * Anything else, including non-Hash input: nil.
#
# Only $item_LABELS is consulted; URIs are resolved separately by get_hash_uri.
#
# @param [Object] value the 'datavalue' => 'value' entry of a claim or qualifier
# @return [Object,nil] the label, the time value, or nil
def get_hash_value(value)
  return unless value.is_a?(Hash)

  if value["id"]
    $item_LABELS[value["id"]]
  elsif value["time"]
    value["time"]
  end
end

# Resolve a Wikibase datavalue hash to the URI recorded for its entity.
#
# Returns the $item_URIS entry for the hash's "id" (nil when none was
# recorded). Hashes without an "id" -- including "time" values -- and non-Hash
# input yield nil.
#
# Only $item_URIS is consulted; labels are resolved separately by get_hash_value.
#
# @param [Object] value the 'datavalue' => 'value' entry of a claim or qualifier
# @return [Object,nil] the stored URI, or nil
def get_hash_uri(value)
  return unless value.is_a?(Hash) && value["id"]

  $item_URIS[value["id"]]
end

# Prepare a value for storage in a Solr field: Arrays and Hashes are serialized
# to a JSON string, and every resulting String is Unicode-normalized
# (String#unicode_normalize with its default form). Other values pass through.
def solr_format(value)
  serialized =
    case value
    when Array, Hash then JSON.generate(value)
    else value
    end
  return serialized unless serialized.is_a?(String)

  serialized.unicode_normalize
end

# Record `value` (after solr_format) under $solr_OBJECTS[id][fieldname],
# creating the per-id hash and per-field array on first use. A value already
# present in the field is not added again.
#
# Returns the field's array when the value was appended, nil when it was a
# duplicate.
def solr_create(id, fieldname, value)
  entry = solr_format(value)
  field_values = (($solr_OBJECTS[id] ||= {})[fieldname] ||= [])
  return if field_values.include?(entry)

  field_values << entry
end

INPUT / OUTPUT CONFIGURATION

# The Wikibase JSON export is expected next to this script.
dir = File.dirname(__FILE__)
importJSONfile = File.expand_path('export-dev-0302.json', dir)

LOAD DATA

# Parse the whole export into memory; the loops below iterate it with `each`
# and read "id" and "claims" from every element.
data = JSON.load_file(importJSONfile)

CREATE LOOKUP ARRAYS

# Lookup tables keyed by Wikibase item ID: label text and Linked Data URI.
$item_LABELS = Hash.new
$item_URIS   = Hash.new

POPULATE LOOKUP ARRAYS

## Loop through every item from the Wikibase JSON export to populate item_LABELS and item_URIS
## Loop through every item from the Wikibase JSON export to populate $item_LABELS and $item_URIS
data.each do |item|

  ## item.keys = ["type", "id", "labels", "descriptions", "aliases", "claims", "sitelinks", "lastrevid"]

  ## Retrieve the item ID (value)
  item_ID = item["id"]

  ## Retrieve the item claims (hash of property ID => array of claims)
  item_CLAIMS = item["claims"]

  ## Unlikely, but if there are no claims, skip to the next item. The nil check
  ## comes first so an item without a "claims" key is skipped instead of raising.
  next if item_CLAIMS.nil? || item_CLAIMS.empty?

  ## Retrieve the ID_INSTANCE_OF (deep dig into claims via get_first_instance_of method)
  item_INSTANCE_OF = get_first_instance_of item_CLAIMS
  next if item_INSTANCE_OF.nil?

  # Wikibase items with an ID_INSTANCE_OF = Q4-Q17 contain "lookup values" that we want to use when constructing the Solr item
  next unless item_INSTANCE_OF.between?(4, 17)

  # Construct reference hashes for filling in Q-entity values in the main loop

  # Labels are the text string values associated with every item, which we often use in the Solr item values.
  # An item without an English label (or whose "labels" is not a hash) is simply left out of the lookup.
  item_LABELS_HASH = item["labels"]
  item_LABEL = item_LABELS_HASH.dig("en", "value") if item_LABELS_HASH.is_a?(Hash)
  $item_LABELS[item_ID] = item_LABEL if item_LABEL

  # URIs are the Linked Data entity URLs, generally terms from Linked Data authorities such as VIAF
  item_EXTERNAL_URI = get_first_external_uri(item_CLAIMS)
  $item_URIS[item_ID] = item_EXTERNAL_URI if item_EXTERNAL_URI.is_a?(String)

  # Wikidata ID properties are not stored with the full URL, so we prepend a base URL to the stored value.
  # When both are present, the Wikidata URL replaces the external URI stored above.
  item_WIKIDATA_ID = get_first_wikidata_id(item_CLAIMS)
  $item_URIS[item_ID] = "https://www.wikidata.org/wiki/#{item_WIKIDATA_ID}" if item_WIKIDATA_ID.is_a?(String)
end

CONSTRUCT SOLR OBJECTS

# Solr documents under construction, keyed by Wikibase ID; filled by solr_create.
$solr_OBJECTS = Hash.new

## Loop through every item from the Wikibase JSON export to generate Solr items
## Loop through every item from the Wikibase JSON export to generate Solr items.
## At this stage only "qid_meta" is written to $solr_OBJECTS; the property and
## qualifier values are resolved into local variables but not stored yet.
data.each do |item|

  ## item.keys = ["type", "id", "labels", "descriptions", "aliases", "claims", "sitelinks", "lastrevid"]

  ## Retrieve the item ID (value)
  item_ID = item["id"]

  ## Store Wikibase item ID (e.g. Q942) in the Solr document for reference
  solr_create item_ID, "qid_meta", item_ID

  ## Retrieve the item claims (hash of property ID => array of claims)
  item_CLAIMS = item["claims"]

  ## Unlikely, but if there are no claims, skip to the next item. The nil check
  ## comes first so an item without a "claims" key is skipped instead of raising.
  next if item_CLAIMS.nil? || item_CLAIMS.empty?

  ## Retrieve the ID_INSTANCE_OF (deep dig into claims via get_first_instance_of method)
  item_INSTANCE_OF = get_first_instance_of item_CLAIMS
  next if item_INSTANCE_OF.nil?

  # Wikibase items with an ID_INSTANCE_OF = Q1-Q3 contain the manuscript data that we want to use when constructing the Solr item
  next unless item_INSTANCE_OF.between?(1, 3)

  # Wikibase item claims hash contains an arbitrary list of property IDs (P1-P47),
  # each with an array holding zero, one, or many values
  item_CLAIMS.each do |propertyID, item_PROPERTY_ARRAY|

    # Skip ahead if there is no array for this property
    next if item_PROPERTY_ARRAY.nil?

    # Loop through each instance of a property
    item_PROPERTY_ARRAY.each do |propertyInstance|

      # Nothing can be read from a missing instance
      next if propertyInstance.nil?

      # Retrieve the actual text string value (or Q-entity reference) that we use for the Solr item
      item_PROPERTY_VALUE = propertyInstance.dig "mainsnak", "datavalue", "value"

      # When the retrieved value is a hash, that means we have to dig one level further to retrieve the value we want
      # Translation logic for PROPERTY values that are not a text string in datavalue-value is slightly different than for qualifiers
      if item_PROPERTY_VALUE.is_a?(Hash)
        if item_PROPERTY_VALUE["id"]
          # When the hash has an "id" field, we want that (it is the Q-id, e.g. Q1, Q942)
          item_PROPERTY_VALUE = item_PROPERTY_VALUE["id"]
        elsif item_PROPERTY_VALUE["time"]
          # When the hash has a "time" field, we want that value
          item_PROPERTY_VALUE = get_hash_value(item_PROPERTY_VALUE)
        end
      end

      # Each property ID may be further described by a hash of qualifiers, which are properties
      item_PROPERTY_QUALIFIERS = propertyInstance.dig "qualifiers"

      # Skip ahead if there are no qualifiers
      next if item_PROPERTY_QUALIFIERS.nil?

      # Loop through each qualifier ID in the qualifier hash
      item_PROPERTY_QUALIFIERS.each do |qualifier, qualifierArray| # qualifier => qualifierArray

        # Each qualifier may have multiple instances of data within it, e.g. multiple authors
        qualifierArray.each do |qualifierInstance|

          # Retrieve the value that we use for the Solr item
          item_PROPERTY_QUALIFIER_VALUE = qualifierInstance&.dig "datavalue", "value"

          # When the retrieved value is a hash, that means we have to dig one level further to retrieve the value we want
          # Translation logic for QUALIFIER values that are not a text string in datavalue-value is slightly different than for properties
          next unless item_PROPERTY_QUALIFIER_VALUE.is_a?(Hash)

          if item_PROPERTY_QUALIFIER_VALUE["id"]
            # Entity reference: resolve the ID, its URI, and finally its label
            item_PROPERTY_QUALIFIER_VALUE_ID = item_PROPERTY_QUALIFIER_VALUE["id"]
            item_PROPERTY_QUALIFIER_VALUE_URI = get_hash_uri(item_PROPERTY_QUALIFIER_VALUE)
            item_PROPERTY_QUALIFIER_VALUE = get_hash_value(item_PROPERTY_QUALIFIER_VALUE)
          elsif item_PROPERTY_QUALIFIER_VALUE["time"]
            # `elsif`, not a second `if`: after the "id" branch the variable holds a
            # label String or nil, so indexing it with "time" would either raise or
            # do a substring match and overwrite the label.
            item_PROPERTY_QUALIFIER_VALUE = get_hash_value(item_PROPERTY_QUALIFIER_VALUE)
          end
        end
      end
    end
  end

end

# Pretty-print the constructed Solr objects to standard output for inspection.
pp($solr_OBJECTS)

SAMPLE OUTPUT

{"Q1"=>{"qid_meta"=>["Q1"]},
"Q2"=>{"qid_meta"=>["Q2"]},
"Q3"=>{"qid_meta"=>["Q3"]},
"Q4"=>{"qid_meta"=>["Q4"]},
"Q5"=>{"qid_meta"=>["Q5"]},
"Q6"=>{"qid_meta"=>["Q6"]},
"Q7"=>{"qid_meta"=>["Q7"]},
"Q8"=>{"qid_meta"=>["Q8"]},
"Q9"=>{"qid_meta"=>["Q9"]},
"Q10"=>{"qid_meta"=>["Q10"]},
"Q11"=>{"qid_meta"=>["Q11"]},
"Q12"=>{"qid_meta"=>["Q12"]},
"Q13"=>{"qid_meta"=>["Q13"]},
"Q14"=>{"qid_meta"=>["Q14"]},
"Q15"=>{"qid_meta"=>["Q15"]},
"Q16"=>{"qid_meta"=>["Q16"]},
"Q17"=>{"qid_meta"=>["Q17"]},
"P1"=>{"qid_meta"=>["P1"]},
"P2"=>{"qid_meta"=>["P2"]},
"P3"=>{"qid_meta"=>["P3"]},
"P4"=>{"qid_meta"=>["P4"]},
"P5"=>{"qid_meta"=>["P5"]},
"P6"=>{"qid_meta"=>["P6"]},
"P7"=>{"qid_meta"=>["P7"]},
"P8"=>{"qid_meta"=>["P8"]},
"P9"=>{"qid_meta"=>["P9"]},
"P10"=>{"qid_meta"=>["P10"]},
"P11"=>{"qid_meta"=>["P11"]},
"P12"=>{"qid_meta"=>["P12"]},
"P13"=>{"qid_meta"=>["P13"]},
"P14"=>{"qid_meta"=>["P14"]},
"P15"=>{"qid_meta"=>["P15"]},
"P16"=>{"qid_meta"=>["P16"]}