Skip to content

Commit

Permalink
Remove --no-parents CLI argument, use "save_parent_id" in property fi…
Browse files Browse the repository at this point in the history
…le to specify which node(s) need to have their parent's ID saved in them to prevent unwanted merging of nodes.
  • Loading branch information
n2iw committed Apr 30, 2021
1 parent a3aab5e commit eb380ed
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 22 deletions.
3 changes: 3 additions & 0 deletions config/props-icdc-pmvp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ Properties:
reuse_nodes:
- registration

save_parent_id:
- demographic

visit_date_in_nodes:
vital_signs: date_of_vital_signs
physical_exam: date_of_examination
Expand Down
32 changes: 16 additions & 16 deletions data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def validate_files(self, cheat_mode, file_list, max_violations):
self.log.info('Cheat mode enabled, all validations skipped!')
return True

def load(self, file_list, cheat_mode, dry_run, loading_mode, wipe_db, max_violations, no_parents,
def load(self, file_list, cheat_mode, dry_run, loading_mode, wipe_db, max_violations,
split=False, no_backup=True, backup_folder="/", neo4j_uri=None):
if not self.check_files(file_list):
return False
Expand Down Expand Up @@ -213,14 +213,14 @@ def load(self, file_list, cheat_mode, dry_run, loading_mode, wipe_db, max_violat
with self.driver.session() as session:
# Split Transactions enabled
if split:
self._load_all(session, file_list, loading_mode, no_parents, split, wipe_db)
self._load_all(session, file_list, loading_mode, split, wipe_db)

# Split Transactions Disabled
else:
# Data updates transaction
tx = session.begin_transaction()
try:
self._load_all(tx, file_list, loading_mode, no_parents, split, wipe_db)
self._load_all(tx, file_list, loading_mode, split, wipe_db)
tx.commit()
except Exception as e:
tx.rollback()
Expand Down Expand Up @@ -249,14 +249,14 @@ def load(self, file_list, cheat_mode, dry_run, loading_mode, wipe_db, max_violat
return {NODES_CREATED: self.nodes_created, RELATIONSHIP_CREATED: self.relationships_created,
NODES_DELETED: self.nodes_deleted, RELATIONSHIP_DELETED: self.relationships_deleted}

def _load_all(self, tx, file_list, loading_mode, no_parents, split, wipe_db):
def _load_all(self, tx, file_list, loading_mode, split, wipe_db):
if wipe_db:
self.wipe_db(tx, split)
for txt in file_list:
self.load_nodes(tx, txt, loading_mode, no_parents, split)
self.load_nodes(tx, txt, loading_mode, split)
if loading_mode != DELETE_MODE:
for txt in file_list:
self.load_relationships(tx, txt, loading_mode, no_parents, split)
self.load_relationships(tx, txt, loading_mode, split)

# Remove extra spaces at the beginning and end of the keys and values
@staticmethod
Expand All @@ -270,7 +270,7 @@ def cleanup_node(node):
# Add uuid to nodes if one does not exist
# Add parent id(s)
# Add extra properties for "value with unit" properties
def prepare_node(self, node, no_parents):
def prepare_node(self, node):
obj = self.cleanup_node(node)

node_type = obj.get(NODE_TYPE, None)
Expand Down Expand Up @@ -324,7 +324,7 @@ def prepare_node(self, node, no_parents):
for key, value in obj.items():
obj2[key] = value
# Add parent id field(s) into node
if self.schema.is_parent_pointer(key) and not no_parents and obj[NODE_TYPE] not in self.schema.props.reuse_nodes:
if obj[NODE_TYPE] in self.schema.props.save_parent_id and self.schema.is_parent_pointer(key):
header = key.split('.')
if len(header) > 2:
self.log.warning('Column header "{}" has multiple periods!'.format(key))
Expand Down Expand Up @@ -365,7 +365,7 @@ def get_signature(self, node):
return '{{ {} }}'.format(', '.join(result))

# Validate all cases exist in a data (TSV/TXT) file
def validate_cases_exist_in_file(self, file_name, max_violations, no_parents):
def validate_cases_exist_in_file(self, file_name, max_violations):
if not self.driver or not isinstance(self.driver, Driver):
self.log.error('Invalid Neo4j Python Driver!')
return False
Expand All @@ -378,7 +378,7 @@ def validate_cases_exist_in_file(self, file_name, max_violations, no_parents):
validation_failed = False
violations = 0
for org_obj in reader:
obj = self.prepare_node(org_obj, no_parents)
obj = self.prepare_node(org_obj)
line_num += 1
# Validate parent exist
if CASE_ID in obj:
Expand All @@ -394,7 +394,7 @@ def validate_cases_exist_in_file(self, file_name, max_violations, no_parents):
return not validation_failed

# Validate all parents exist in a data (TSV/TXT) file
def validate_parents_exist_in_file(self, file_name, max_violations, no_parents):
def validate_parents_exist_in_file(self, file_name, max_violations):
validation_failed = True
if not self.driver or not isinstance(self.driver, Driver):
self.log.error('Invalid Neo4j Python Driver!')
Expand All @@ -409,7 +409,7 @@ def validate_parents_exist_in_file(self, file_name, max_violations, no_parents):
violations = 0
for org_obj in reader:
line_num += 1
obj = self.prepare_node(org_obj, no_parents)
obj = self.prepare_node(org_obj)
results = self.collect_relationships(obj, session, False, line_num)
relationships = results[RELATIONSHIPS]
provided_parents = results[PROVIDED_PARENTS]
Expand Down Expand Up @@ -581,7 +581,7 @@ def delete_single_node(self, session, node):
return (nodes_deleted, relationship_deleted)

# load file
def load_nodes(self, session, file_name, loading_mode, no_parents, split=False):
def load_nodes(self, session, file_name, loading_mode, split=False):
if loading_mode == NEW_MODE:
action_word = 'Loading new'
elif loading_mode == UPSERT_MODE:
Expand Down Expand Up @@ -611,7 +611,7 @@ def load_nodes(self, session, file_name, loading_mode, no_parents, split=False):
for org_obj in reader:
line_num += 1
transaction_counter += 1
obj = self.prepare_node(org_obj, no_parents)
obj = self.prepare_node(org_obj)
node_type = obj[NODE_TYPE]
node_id = self.schema.get_id(obj)
if not node_id:
Expand Down Expand Up @@ -772,7 +772,7 @@ def remove_old_relationship(self, session, node_type, node, relationship):
if not del_result:
self.log.error('Delete old relationship failed!')

def load_relationships(self, session, file_name, loading_mode, no_parents, split=False):
def load_relationships(self, session, file_name, loading_mode, split=False):
if loading_mode == NEW_MODE:
action_word = 'Loading new'
elif loading_mode == UPSERT_MODE:
Expand All @@ -798,7 +798,7 @@ def load_relationships(self, session, file_name, loading_mode, no_parents, split
for org_obj in reader:
line_num += 1
transaction_counter += 1
obj = self.prepare_node(org_obj, no_parents)
obj = self.prepare_node(org_obj)
node_type = obj[NODE_TYPE]
results = self.collect_relationships(obj, tx, True, line_num)
relationships = results[RELATIONSHIPS]
Expand Down
6 changes: 1 addition & 5 deletions loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def parse_arguments():
parser.add_argument('-m', '--mode', help='Loading mode', choices=[UPSERT_MODE, NEW_MODE, DELETE_MODE],
default=UPSERT_MODE)
parser.add_argument('--dataset', help='Dataset directory')
parser.add_argument('--no-parents', help='Does not save parent IDs in children', action='store_true')
parser.add_argument('--split-transactions', help='Creates a separate transaction for each file',
action='store_true')
return parser.parse_args()
Expand Down Expand Up @@ -164,9 +163,6 @@ def process_arguments(args, log):
if not config.max_violations:
config.max_violations = 10

if args.no_parents:
config.no_parents = args.no_parents

return config


Expand Down Expand Up @@ -225,7 +221,7 @@ def main():
loader = DataLoader(driver, schema, plugins)

loader.load(file_list, config.cheat_mode, config.dry_run, config.loading_mode, config.wipe_db,
config.max_violations, config.no_parents, split=config.split_transactions,
config.max_violations, split=config.split_transactions,
no_backup=config.no_backup, neo4j_uri=config.neo4j_uri, backup_folder=config.backup_folder)

if driver:
Expand Down
2 changes: 1 addition & 1 deletion props.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def __init__(self, file_name):
self.domain = props.get('domain', 'Unknown.domain.nci.nih.gov')
self.rel_prop_delimiter = props.get('rel_prop_delimiter', '$')
self.indexes = props.get('indexes', [])
self.reuse_nodes = props.get('reuse_nodes', [])
self.save_parent_id = props.get('save_parent_id', [])
else:
msg = f'Can NOT open file: "{file_name}"'
self.log.error(msg)
Expand Down

0 comments on commit eb380ed

Please sign in to comment.