Skip to content

Commit

Permalink
Showing 24 changed files with 264 additions and 43 deletions.
2 changes: 2 additions & 0 deletions docs/de/migration.md
Original file line number Diff line number Diff line change
@@ -155,6 +155,8 @@ Diese URLs sollten gültig und auflösbar sein.
Der Host-Teil dieser URLs wird aus der Anfrage generiert.

## Migration der Mets-Datei
**Achtung** Bitte erstellen Sie vorher ein Backup. In einigen Fällen sind die mets-Dateien inkonsistent, d. h. die gespeicherten Werte und Vokabularreferenzen stimmen nicht überein. Das Migrationsskript verwendet die Vokabularreferenzen, um die richtigen Werte zu finden. Wenn die Vokabularreferenzen falsch und die Werte richtig sind, wird die Migration die Datenintegrität korrumpieren!

Dieser Schritt kann nur durchgeführt werden, wenn die Migration der Vokabulardaten erfolgreich abgeschlossen wurde!

Wenn die Datei `migration.csv` vorhanden ist, führen Sie den folgenden Befehl in der aktivierten Python-Umgebung aus:
2 changes: 2 additions & 0 deletions docs/en/migration.md
Original file line number Diff line number Diff line change
@@ -127,6 +127,8 @@ blau,123
This file maps all record values to the corresponding record IDs in the reference vocabulary.

## Mets file migration
**Caution:** Please create a backup beforehand. In some cases the mets files are inconsistent, i.e. the stored values and the vocabulary references do not match. The migration script uses the vocabulary references to find the correct values. If the vocabulary references are wrong and the values are correct, the migration will corrupt the data!

This step can only be done after the vocabulary data migration has been successfully completed!

With the `migration.csv` file present, run the following command in the activated Python environment:
24 changes: 18 additions & 6 deletions migration/lib/api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import requests
import json
import sys

SCHEMA_INSERTION_URL = 'http://{{HOST}}:{{PORT}}/api/v1/schemas'
SCHEMA_LOOKUP_URL = 'http://{{HOST}}:{{PORT}}/api/v1/schemas/{{SCHEMA_ID}}'
@@ -61,7 +62,16 @@ def query(self, url, obj=None, method='POST'):
response = requests.request(method, url=url, headers=HEADERS, data=payload)
try:
# Check for success
if response.status_code // 100 != 2:
if response.status_code == 401 or response.status_code == 403:
error_msg = f'API call was not successful, reason: Authentification'
logging.critical(error_msg)
sys.exit(1)
raise Exception(error_msg)
if response.status_code == 404:
error_msg = f'API call was not successful, reason: Entity not found {url}'
logging.warning(error_msg)
raise Exception(error_msg)
elif response.status_code // 100 != 2:
error_msg = f'API call was not successful, reason:\n{extract_error_from_response(response)}'
logging.warning(error_msg)
raise Exception(error_msg)
@@ -111,18 +121,20 @@ def insert_record(self, record):
result = self.query(url, record)
return result['id']

def find_record(self, ctx, vocabulary_id, search_term):
def find_record(self, ctx, vocabulary_id, search_term, search_field=None):
url = self.urls[RECORD_SEARCH].replace('{{VOCABULARY_ID}}', str(vocabulary_id)).replace('{{SEARCH_TERM}}', search_term)
result = self.query(url, obj=None, method='GET')
if not '_embedded' in result:
raise Exception(f'Record search for search term "{search_term}" has no results')
raise Exception(f'Record search in vocabulary "{vocabulary_id}" for search term "{search_term}" has no results')
results = result['_embedded']['vocabularyRecordList']
# Filter for exact searches
results = [r for r in results if ctx.record_contains_value(r, search_term)]
results = [r for r in results if ctx.record_contains_value(r, search_term, search_field=search_field)]

if len(results) == 0:
raise Exception(f'Record search for search term "{search_term}" has no results')
raise Exception(f'Record search in vocabulary "{vocabulary_id}" for search term "{search_term}" has no results')
elif len(results) > 1:
raise Exception(f'Record search for search term "{search_term}" has no unique result, {len(results)} records found')
ids = [r['id'] for r in results]
raise Exception(f'Record search in vocabulary "{vocabulary_id}" for search term "{search_term}" has no unique result, {len(results)} records found: {ids}')

return results[0]['id']

34 changes: 28 additions & 6 deletions migration/lib/mets_context.py
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@
RECORD_PATTERN = re.compile('^(\\d+).*$')

class Context:
def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix):
def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix, trust, enable_relation_vocabulary_column_logic):
self.api = api
self.dry = dry
self.verbose = verbose
@@ -15,12 +15,24 @@ def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, map
self.mapping_file = mapping_file
self.preferred_mets_main_value_language = preferred_mets_main_value_language
self.manual_id_fix = manual_id_fix
self.trust = trust
self.enable_relation_vocabulary_column_logic = enable_relation_vocabulary_column_logic
self.vocabulary_name_id_map = {}
self.vocabulary_id_name_map = {}
self.vocabulary_id_map = {}
self.record_id_map = {}
self.vocabulary_id_schema_id_map = {}
self.schema_id_main_field_id_map = {}

def find_vocabulary_by_name(self, identifier):
    """Return the vocabulary ID stored for the vocabulary name *identifier*.

    The lookup is served from ``self.vocabulary_name_id_map``.

    Raises:
        Exception: if the name is unknown and ``self.continue_on_error`` is
            falsy.
        KeyError: if the name is unknown and ``self.continue_on_error`` is
            truthy -- the problem is logged first, but the final dictionary
            access still fails.
    """
    known = identifier in self.vocabulary_name_id_map
    if not known:
        error = f'Vocabulary name "{identifier}" not found'
        if not self.continue_on_error:
            raise Exception(error)
        logging.error(error)
    return self.vocabulary_name_id_map[identifier]

def lookup_vocabulary_name(self, identifier):
if not identifier in self.vocabulary_id_name_map:
error = f'Vocabulary name not found for vocabulary with ID {identifier}'
@@ -69,12 +81,22 @@ def retrieve_main_field_id(self, schema_id):
self.schema_id_main_field_id_map[schema_id] = main_definitions[0]['id']
return self.schema_id_main_field_id_map[schema_id]

def record_contains_value(self, record, value):
def record_contains_value(self, record, value, search_field=None):
field_id = None
if search_field != None:
vocabulary = self.api.lookup_vocabulary(record['vocabularyId'])
schema = self.api.lookup_schema(vocabulary['schemaId'])
ids = [d['id'] for d in schema['definitions'] if d['name'] == search_field]
if len(ids) != 1:
logging.critical(f'Non unique "{search_field}" fields found: {ids}!')
sys.exit(1)
field_id = ids[0]
for f in record['fields']:
for v in f['values']:
for t in v['translations']:
if t['value'] == value:
return True
if field_id == None or f['definitionId'] == field_id:
for v in f['values']:
for t in v['translations']:
if t['value'] == value:
return True
return False

def extract_language_values(self, field):
119 changes: 112 additions & 7 deletions migration/lib/mets_manipulator.py
Original file line number Diff line number Diff line change
@@ -40,14 +40,14 @@ def process_mets_file(self):
self.ctx.log_processed(self.file_path)

def process_node(self, node):
if self.is_vocabulary_reference(node) and not self.is_already_migrated(node):
self.process_vocabulary_reference(node)
if self.ctx.dry:
dump_node(node)
if self.is_manual_id_reference(node):
self.process_manual_id_reference(node)
if self.ctx.dry:
dump_node(node)
elif self.is_vocabulary_reference(node) and not self.is_already_migrated(node):
self.process_vocabulary_reference(node)
if self.ctx.dry:
dump_node(node)
for child in node:
self.process_node(child)

@@ -67,6 +67,14 @@ def generate_record_uri(self, record_id):
return self.record_endpoint.replace('{{ID}}', str(record_id))

def process_vocabulary_reference(self, node):
    """Dispatch *node* to the handler selected by ``self.ctx.trust``.

    ``'ID'`` delegates to ``process_vocabulary_reference_by_id`` and
    ``'Value'`` to ``process_vocabulary_reference_by_value``.

    Raises:
        Exception: if ``self.ctx.trust`` is neither ``'ID'`` nor ``'Value'``.
    """
    source = self.ctx.trust
    if source == 'ID':
        self.process_vocabulary_reference_by_id(node)
        return
    if source == 'Value':
        self.process_vocabulary_reference_by_value(node)
        return
    raise Exception(f'Unknown trust source \"{self.ctx.trust}\"')

def process_vocabulary_reference_by_id(self, node):
try:
# Extract old vocabulary and record ids
valueURI = node.attrib['valueURI']
@@ -132,18 +140,115 @@ def process_vocabulary_reference(self, node):
error = f'Unable to retrieve vocabulary and record id from valueURI: {valueURI}\n\t\t{e}'
logging.debug(error)
self.ctx.log_issue(self.file_path, error)

def process_vocabulary_reference_by_value(self, node):
    """Re-link *node* to a migrated record by searching for the node's text.

    The vocabulary is resolved from the node's ``authority`` attribute
    (``geonames`` references are left untouched). The record is then searched
    by ``node.text`` and the ``authority``, ``authorityURI`` and ``valueURI``
    attributes are rewritten; ``self.changed`` is set on success.

    When ``self.ctx.enable_relation_vocabulary_column_logic`` is set and the
    vocabulary name contains ``'Relationship'``, the sibling element named
    ``RelationEntityType`` is used to decide which vocabulary column
    (``'Relationship type'`` or ``'Reverse relationship'``) the value should
    live in. The value is searched in that column first and in the opposite
    column as a fallback; the node text is then replaced with the record's
    value from the expected column. If the direction cannot be determined
    (the entity type does not occur exactly once in the vocabulary name), the
    value is searched across all fields and the node text is left as is.

    Problems are reported via logging and ``self.ctx.log_issue``; the method
    itself does not raise for lookup failures. An ambiguous schema definition
    still terminates the process via ``sys.exit(1)``.
    """
    # Pre-bind so the handler below can always format its message, even when
    # the `authority` attribute itself is missing.
    vocabulary_name = None
    try:
        vocabulary_name = node.attrib['authority']

        if vocabulary_name == 'geonames':
            return
        vocabulary_id = self.ctx.find_vocabulary_by_name(vocabulary_name)
    except Exception as e:
        error = f'Unable to retrieve vocabulary by name: {vocabulary_name}\n\t\t{e}'
        logging.debug(error)
        self.ctx.log_issue(self.file_path, error)
        return

    value = node.text
    try:
        search_field = None
        inverse_search_field = None
        if self.ctx.enable_relation_vocabulary_column_logic and 'Relationship' in vocabulary_name:
            parent = node.getparent()
            if parent is None:
                logging.warning('No parent found!')
                dump_node(node)
                return

            # First sibling carrying name="RelationEntityType"; siblings
            # without a `name` attribute are skipped instead of raising.
            entity_type = next(
                (sibling.text for sibling in parent
                 if sibling.attrib.get('name') == 'RelationEntityType'),
                None,
            )
            if entity_type is None:
                raise Exception('No "RelationEntityType" sibling with a value found')

            if vocabulary_name.count(entity_type) == 1:
                # Find out relation direction
                separator_position = vocabulary_name.index('-')
                entity_type_position = vocabulary_name.index(entity_type)

                # The relation vocabulary is specified from `A->B`. If the
                # relation references an entity of type `A`, it is itself of
                # type `B`, so the second column (`Reverse relationship`)
                # holds the expected value.
                if entity_type_position < separator_position:
                    search_field = 'Reverse relationship'
                    inverse_search_field = 'Relationship type'
                else:
                    search_field = 'Relationship type'
                    inverse_search_field = 'Reverse relationship'

        if search_field is None:
            # Plain vocabulary, or a relation whose direction could not be
            # determined: search all fields and keep the node text.
            new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=None)
        else:
            try:
                new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=search_field)
            except Exception:
                # Only ordinary failures trigger the fallback; SystemExit and
                # KeyboardInterrupt must not be swallowed here.
                new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=inverse_search_field)
            old_value = node.text
            record_data = self.ctx.api.lookup_record(new_record_id)

            v = self.ctx.api.lookup_vocabulary(record_data['vocabularyId'])
            s = self.ctx.api.lookup_schema(v['schemaId'])
            # We need the value of the column that we originally searched for
            ids = [d['id'] for d in s['definitions'] if d['name'] == search_field]
            if len(ids) != 1:
                logging.critical(f'Non unique "{search_field}" fields found: {ids}!')
                sys.exit(1)

            field_data = [f for f in record_data['fields'] if f['definitionId'] == ids[0]]
            if len(field_data) != 1:
                logging.critical(f'Record [{new_record_id}] has no unique search column entry field')
                sys.exit(1)

            translated_main_values = self.ctx.extract_language_values(field_data[0])
            new_value = self.ctx.extract_preferred_language(translated_main_values)

            # Only report a correction when the text actually changes.
            if new_value != old_value:
                logging.warning(f'Relation is saved in the wrong direction, correct direction found and corrected: "{old_value}" -> "{new_value}"')
            node.text = new_value

        # Set all attributes accordingly
        node.attrib['authority'] = vocabulary_name
        node.attrib['authorityURI'] = self.generate_vocabulary_uri(vocabulary_id)
        node.attrib['valueURI'] = self.generate_record_uri(new_record_id)

        self.changed = True
    except Exception as e:
        error = f'Unable to find record by value: {value}\n\t\t{e}'
        logging.error(error)
        self.ctx.log_issue(self.file_path, error)

def process_manual_id_reference(self, node):
try:
if node.text == None:
return
record_id_old = int(node.text)
record_id_new = self.ctx.lookup_record_id(record_id_old)
node.text = str(record_id_new)

if 'authority' in node.attrib or 'authorityURI' in node.attrib or 'valueURI' in node.attrib:
record = self.ctx.api.lookup_record(record_id_new)
vocabulary = self.ctx.api.lookup_vocabulary(record['vocabularyId'])
node.attrib['authority'] = vocabulary['name']
node.attrib['authorityURI'] = self.generate_vocabulary_uri(vocabulary['id'])
node.attrib['valueURI'] = self.generate_record_uri(record_id_new)

self.changed = True
except Exception as e:
msg = f'Unable to read ID {node.text}!'
logging.critical(msg)
raise Exception(msg)
logging.warn(msg)
#raise Exception(msg)

def dump_node(node):
attributes = ' '.join(f'{k}="{v}"' for k, v in node.attrib.items())
logging.info(f'<{node.tag} {attributes} />')
value = node.text
logging.info(f'<{node.tag} {attributes}>{value}</{node.tag}>')
2 changes: 2 additions & 0 deletions migration/lib/mets_migrator.py
Original file line number Diff line number Diff line change
@@ -43,6 +43,8 @@ def load_mapping_file(self):

if not vocabulary_id_new in self.ctx.vocabulary_id_name_map:
self.ctx.vocabulary_id_name_map[vocabulary_id_new] = vocabulary_name
if not vocabulary_name in self.ctx.vocabulary_name_id_map:
self.ctx.vocabulary_name_id_map[vocabulary_name] = vocabulary_id_new
if not vocabulary_id_old in self.ctx.vocabulary_id_map:
self.ctx.vocabulary_id_map[vocabulary_id_old] = vocabulary_id_new
if not record_id_old in self.ctx.record_id_map:
4 changes: 3 additions & 1 deletion migration/metadata-migrator.py
Original file line number Diff line number Diff line change
@@ -14,7 +14,7 @@ def main():
args.vocabulary_server_port,
args.vocabulary_server_token
)
ctx = Context(api, args.dry, args.verbose, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix)
ctx = Context(api, args.dry, args.verbose, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix, args.trust, args.enable_relation_vocabulary_column_logic)

try:
migrator = MetsMigrator(ctx)
@@ -39,6 +39,8 @@ def parse_args():
parser.add_argument('--vocabulary-server-port', type=str, default='8081', help='vocabulary server port')
parser.add_argument('--vocabulary-server-token', type=str, default=None, help='vocabulary server security token')
parser.add_argument('--preferred-mets-main-value-language', type=str, default='eng', help='Default language to use for mets value writing, if present and prior value invalid')
parser.add_argument('--trust', required=False, type=str, default='ID', help='Set the data source to trust for the migration. Possible values are: "ID" and "Value". If "ID" is set, the record ID is parsed from the valueURI and used to find the migrated record. If "Value" is set, the XML elements value is used to find the newly migrated record by value. Defaults to "ID".')
parser.add_argument('--enable-relation-vocabulary-column-logic', required=False, default=False, action='store_const', const=True, help='Activate relationship vocabulary correct column finding logic (reverse vs non-reverse, artist dictionary)')
parser.add_argument('--manual-id-fix', type=str, default=None, help='Manually fix the record ID of elements whose name attribute matches this parameter. Caution, this must not be executed twice!')
parser.add_argument('--log', required=False, default='INFO', help='logger level (possible values are: NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL)')
parser.add_argument('--verbose', required=False, default=False, action='store_const', const=True, help='verbose output')
4 changes: 2 additions & 2 deletions module-core/pom.xml
Original file line number Diff line number Diff line change
@@ -10,7 +10,7 @@
</parent>
<groupId>io.goobi.vocabulary</groupId>
<artifactId>vocabulary-server-core</artifactId>
<version>1.1.8</version>
<version>1.1.9</version>
<name>Vocabulary-Server-Core</name>
<description>Spring Boot based RESTful web service for vocabulary management</description>
<packaging>jar</packaging>
@@ -35,7 +35,7 @@
<dependency>
<groupId>io.goobi.vocabulary</groupId>
<artifactId>vocabulary-server-exchange</artifactId>
<version>1.1.8</version>
<version>1.1.9</version>
<scope>compile</scope>
</dependency>

Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package io.goobi.vocabulary.api;

import io.goobi.vocabulary.api.assemblers.FieldDefinitionAssembler;
import io.goobi.vocabulary.exchange.FieldDefinition;
import io.goobi.vocabulary.service.manager.FieldDefinitionDTOManager;
import org.springframework.hateoas.EntityModel;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

/**
 * REST controller serving single field definitions below {@code /api/v1}.
 */
@RestController
@RequestMapping("/api/v1")
public class FieldDefinitionController {
    private final FieldDefinitionDTOManager manager;
    private final FieldDefinitionAssembler assembler;

    /**
     * Creates the controller with its collaborators.
     *
     * @param manager   looks up field definitions by ID
     * @param assembler wraps a field definition into an {@link EntityModel}
     */
    public FieldDefinitionController(FieldDefinitionDTOManager manager, FieldDefinitionAssembler assembler) {
        this.assembler = assembler;
        this.manager = manager;
    }

    /**
     * Handles {@code GET /fieldDefinitions/{id}}.
     *
     * @param id ID of the requested field definition
     * @return the field definition wrapped by the assembler
     */
    @GetMapping("/fieldDefinitions/{id}")
    public EntityModel<FieldDefinition> one(@PathVariable long id) {
        final FieldDefinition definition = manager.get(id);
        return assembler.toModel(definition);
    }
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package io.goobi.vocabulary.api;

import io.goobi.vocabulary.maintenance.selfcheck.SelfCheckResult;
import io.goobi.vocabulary.monitoring.SelfCheckResult;
import io.goobi.vocabulary.service.manager.MaintenanceManager;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package io.goobi.vocabulary.api;

import io.goobi.vocabulary.maintenance.MonitoringResult;
import io.goobi.vocabulary.monitoring.MonitoringResult;
import io.goobi.vocabulary.service.manager.MaintenanceManager;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.GetMapping;
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package io.goobi.vocabulary.api.assemblers;

import io.goobi.vocabulary.api.FieldDefinitionController;
import io.goobi.vocabulary.api.FieldTypeController;
import io.goobi.vocabulary.api.VocabularySchemaController;
import io.goobi.vocabulary.exchange.FieldDefinition;
import org.springframework.hateoas.EntityModel;
import org.springframework.hateoas.server.RepresentationModelAssembler;
import org.springframework.stereotype.Component;

import static org.springframework.hateoas.server.mvc.WebMvcLinkBuilder.linkTo;
import static org.springframework.hateoas.server.mvc.WebMvcLinkBuilder.methodOn;

/**
 * Wraps a {@link FieldDefinition} into an {@link EntityModel} with a self link
 * and a {@code schema} link.
 */
@Component
public class FieldDefinitionAssembler implements RepresentationModelAssembler<FieldDefinition, EntityModel<FieldDefinition>> {
    /**
     * Builds the representation model for a field definition.
     *
     * @param entity field definition to wrap
     * @return the entity with a self link and a link to its schema
     */
    @Override
    public EntityModel<FieldDefinition> toModel(FieldDefinition entity) {
        return EntityModel.of(entity,
                // The self link must point at the field definition endpoint itself,
                // not at the field type endpoint.
                linkTo(methodOn(FieldDefinitionController.class).one(entity.getId())).withSelfRel(),
                linkTo(methodOn(VocabularySchemaController.class).one(entity.getSchemaId())).withRel("schema")
        );
    }
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package io.goobi.vocabulary.maintenance;
package io.goobi.vocabulary.service.maintenance;

import lombok.Getter;
import org.apache.commons.lang3.StringUtils;
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package io.goobi.vocabulary.service.manager;

import io.goobi.vocabulary.exception.EntityNotFoundException;
import io.goobi.vocabulary.exchange.FieldDefinition;
import io.goobi.vocabulary.repositories.FieldDefinitionRepository;
import io.goobi.vocabulary.service.exchange.DTOMapper;
import org.springframework.stereotype.Service;

/**
 * Service that loads field definitions from the repository and maps them to
 * their exchange representation.
 */
@Service
public class FieldDefinitionDTOManager {
    private final FieldDefinitionRepository fieldDefinitionRepository;
    private final DTOMapper modelMapper;

    /**
     * Creates the manager with its collaborators.
     *
     * @param fieldDefinitionRepository repository queried by ID
     * @param modelMapper               mapper producing the DTO
     */
    public FieldDefinitionDTOManager(FieldDefinitionRepository fieldDefinitionRepository, DTOMapper modelMapper) {
        this.modelMapper = modelMapper;
        this.fieldDefinitionRepository = fieldDefinitionRepository;
    }

    /**
     * Looks up a field definition by ID.
     *
     * @param id ID of the field definition
     * @return the mapped field definition
     * @throws EntityNotFoundException if the repository has no entry for {@code id}
     */
    public FieldDefinition get(long id) {
        final var stored = fieldDefinitionRepository.findById(id)
                .orElseThrow(() -> new EntityNotFoundException(FieldDefinition.class, id));
        return modelMapper.toDTO(stored);
    }
}
Original file line number Diff line number Diff line change
@@ -2,14 +2,14 @@

import com.fasterxml.jackson.databind.ObjectMapper;
import io.goobi.vocabulary.exception.VocabularyException;
import io.goobi.vocabulary.maintenance.FlywayInformation;
import io.goobi.vocabulary.maintenance.ManifestReader;
import io.goobi.vocabulary.maintenance.MonitoringResult;
import io.goobi.vocabulary.maintenance.MonitoringState;
import io.goobi.vocabulary.maintenance.VersionInformation;
import io.goobi.vocabulary.maintenance.VersionsCollection;
import io.goobi.vocabulary.maintenance.selfcheck.SelfCheckResult;
import io.goobi.vocabulary.maintenance.selfcheck.ValidationResult;
import io.goobi.vocabulary.monitoring.FlywayInformation;
import io.goobi.vocabulary.service.maintenance.ManifestReader;
import io.goobi.vocabulary.monitoring.MonitoringResult;
import io.goobi.vocabulary.monitoring.MonitoringState;
import io.goobi.vocabulary.monitoring.VersionInformation;
import io.goobi.vocabulary.monitoring.VersionsCollection;
import io.goobi.vocabulary.monitoring.SelfCheckResult;
import io.goobi.vocabulary.monitoring.ValidationResult;
import io.goobi.vocabulary.model.jpa.FieldTypeEntity;
import io.goobi.vocabulary.model.jpa.VocabularyEntity;
import io.goobi.vocabulary.model.jpa.VocabularyRecordEntity;
2 changes: 1 addition & 1 deletion module-exchange/pom.xml
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>io.goobi.vocabulary</groupId>
<artifactId>vocabulary-server-exchange</artifactId>
<version>1.1.8</version>
<version>1.1.9</version>
<name>Vocabulary Exchange</name>
<description>Vocabulary data exchange classes</description>
<packaging>jar</packaging>
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package io.goobi.vocabulary.maintenance;
package io.goobi.vocabulary.monitoring;

import java.util.Date;

Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package io.goobi.vocabulary.maintenance;
package io.goobi.vocabulary.monitoring;

import io.goobi.vocabulary.maintenance.selfcheck.SelfCheckResult;
import io.goobi.vocabulary.monitoring.SelfCheckResult;

/**
 * Bundles the monitoring state, the version information, the Flyway
 * information and the self-check result into one value.
 */
public record MonitoringResult(MonitoringState monitoring, VersionsCollection versions, FlywayInformation flyway, SelfCheckResult selfCheck) {
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package io.goobi.vocabulary.maintenance;
package io.goobi.vocabulary.monitoring;

/**
 * Carries two textual state values: one for the database and one for the
 * self-check.
 */
public record MonitoringState(String database, String selfCheck) {
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package io.goobi.vocabulary.maintenance.selfcheck;
package io.goobi.vocabulary.monitoring;

import java.util.Date;

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package io.goobi.vocabulary.maintenance.selfcheck;
package io.goobi.vocabulary.monitoring;

import java.util.List;

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package io.goobi.vocabulary.maintenance;
package io.goobi.vocabulary.monitoring;

/**
 * Pairs a version string with a hash string.
 */
public record VersionInformation(String version, String hash) {
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package io.goobi.vocabulary.maintenance;
package io.goobi.vocabulary.monitoring;

/**
 * Holds the {@link VersionInformation} of the core component.
 */
public record VersionsCollection(VersionInformation core) {
}
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>io.goobi.vocabulary</groupId>
<artifactId>vocabulary-server</artifactId>
<version>1.1.8</version>
<version>1.1.9</version>
<name>Vocabulary-Server</name>
<packaging>pom</packaging>
<description>RESTful webservice for vocabulary management</description>

0 comments on commit 8d81e1e

Please sign in to comment.