From 78a23014a180aed4d6c5a1be259f97c1b2288a24 Mon Sep 17 00:00:00 2001 From: Austin Mueller Date: Thu, 17 Sep 2020 09:15:11 -0400 Subject: [PATCH] Bento616 Added capability to split transactions by file using the --split-transactions argument --- config/data-loader-config.example.yml | 4 +++- docs/data-loader.md | 6 ++++++ loader.py | 13 +++++++++++-- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/config/data-loader-config.example.yml b/config/data-loader-config.example.yml index addffe59..f0bca9c8 100644 --- a/config/data-loader-config.example.yml +++ b/config/data-loader-config.example.yml @@ -30,8 +30,10 @@ Config: no_confirmation: false # Max violations to display, default is 10, can be overridden by -M/--max-violations argument max_violations: 10 - #Disable saving parent IDs in children + # Disable saving parent IDs in children no_parents: false + # Split the loading transaction into separate transactions for each file + split_transactions: false # S3 bucket name, if you are loading from an S3 bucket, can be overridden by -b/--bucket argument s3_bucket: diff --git a/docs/data-loader.md b/docs/data-loader.md index 95261812..95775f6c 100644 --- a/docs/data-loader.md +++ b/docs/data-loader.md @@ -63,6 +63,7 @@ An example configuration file can be found in ````config/data-loader-config.exam * ````no_confirmation````: Automatically confirms any confirmation prompts that are displayed during the data loading * ````max_violations````: The maximum number of violations (per data file) to be displayed in the console output during data loading * ````no_parents````: Does not save parent node IDs in children nodes +* ````split_transactions````: Splits the database load operations into separate transactions for each file * ````s3_bucket````: The name of the S3 bucket containing the data to be loaded * ````s3_folder````: The name of the S3 folder containing the data to be loaded * ````loading_mode````: The loading mode to be used @@ -157,6 +158,11 @@ All of 
command line arguments can be specified in the configuration file. If an * Command : ````--no-parents```` * Not Required * Default Value : ````false```` +* **Enable Split Transactions Mode** + * Creates a separate database transaction for each file while loading + * Command : ````--split-transactions```` + * Not Required + * Default Value : ````false```` * **Dataset Directory** * The directory containing the data to be loaded, a temporary directory if loading from an S3 bucket * Command : ````--dataset ```` diff --git a/loader.py b/loader.py index d833d545..293e1e02 100755 --- a/loader.py +++ b/loader.py @@ -49,7 +49,8 @@ def parse_arguments(): default=UPSERT_MODE) parser.add_argument('--dataset', help='Dataset directory') parser.add_argument('--no-parents', help='Does not save parent IDs in children', action='store_true') - + parser.add_argument('--split-transactions', help='Creates a separate transaction for each file', + action='store_true') return parser.parse_args() @@ -93,10 +94,16 @@ def process_arguments(args, log): sys.exit(1) # Conditionally Required Fields + if args.split_transactions: + config.split_transactions = args.split_transactions if args.no_backup: config.no_backup = args.no_backup if args.backup_folder: config.backup_folder = args.backup_folder + if config.split_transactions and config.no_backup: + log.error('--split-transaction and --no-backup cannot both be enabled, a backup is required when running' + ' in split transactions mode') + sys.exit(1) if not config.backup_folder and not config.no_backup: log.error('Backup folder not specified! 
A backup folder is required unless the --no-backup argument is used') sys.exit(1) @@ -163,6 +170,8 @@ def process_arguments(args, log): if args.no_parents: config.no_parents = args.no_parents + + return config @@ -262,7 +271,7 @@ def main(): loader = DataLoader(driver, schema, visit_creator) loader.load(file_list, config.cheat_mode, config.dry_run, config.loading_mode, config.wipe_db, - config.max_violations, config.no_parents) + config.max_violations, config.no_parents, split=config.split_transactions) if driver: driver.close()