From d967f99b22919412fe8da11530b08e566dbebb9d Mon Sep 17 00:00:00 2001 From: Yury Liavitski Date: Fri, 22 Jan 2021 11:11:41 +0100 Subject: [PATCH] Added config to skip rows --- .gitignore | 2 ++ README.md | 21 +++++++++++++++++++++ tap_gsheets/__init__.py | 16 +++++++++++----- tap_gsheets/gsheet_loader.py | 14 +++++++------- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index d9eb9ac..345418f 100644 --- a/.gitignore +++ b/.gitignore @@ -104,3 +104,5 @@ state.json *.iml +.idea/ +cred.json diff --git a/README.md b/README.md index 4a93c63..fe515a5 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,27 @@ Process several sheets and worksheets in that sheets at a time: } ``` +Specify the row number (1-based) to start processing from, in case you want to skip some unnecessary rows. The default number is 1. + +```hocon +{ # config.conf + sheets = [ + { + name = "Investor Loans", + start_from_row = 5, + worksheets = [ + PageA, + PageB, + PageC + ] + }, + { + # ... + } + ] +} +``` + # Overriding configuration The configurations in the file can be overriden with the command line parameter `--overrides`, which takes configuration overrides in a JSON string and applies them over the passed diff --git a/tap_gsheets/__init__.py b/tap_gsheets/__init__.py index 3bcf5d9..d0649d1 100644 --- a/tap_gsheets/__init__.py +++ b/tap_gsheets/__init__.py @@ -32,6 +32,11 @@ def sync(config): for sheet in sheets: sheet_name = sheet["name"] + start_from_row = 1 + + if "start_from_row" in sheet: + start_from_row = sheet["start_from_row"] + if "worksheets" in sheet: worksheets = sheet["worksheets"] else: @@ -41,14 +46,14 @@ def sync(config): try: if len(worksheets) > 0: for worksheet in worksheets: - process_worksheet(gsheets_loader, sheet_name, worksheet, config) + process_worksheet(gsheets_loader, sheet_name, worksheet, start_from_row, config) else: - process_worksheet(gsheets_loader, sheet_name, None, config) + process_worksheet(gsheets_loader, sheet_name, None, start_from_row, config) except Exception as e: LOGGER.error(f"Can't process a worksheet {sheet_name} because of:\n{e}", ) -def process_worksheet(gsheets_loader, sheet_name, worksheet, config): +def process_worksheet(gsheets_loader, sheet_name, worksheet, start_from_row, config): if worksheet is None: name_with_worksheet = sheet_name else: @@ -59,8 +64,9 @@ def process_worksheet(gsheets_loader, sheet_name, worksheet, config): else: stream_name = tableize(parameterize(name_with_worksheet)) - schema = gsheets_loader.get_schema(sheet_name, worksheet) - records = gsheets_loader.get_records_as_json(sheet_name, worksheet) + schema = gsheets_loader.get_schema(sheet_name, worksheet, start_from_row) + + records = gsheets_loader.get_records_as_json(sheet_name, worksheet, start_from_row) # additional data transformations column_mapping = None diff --git a/tap_gsheets/gsheet_loader.py b/tap_gsheets/gsheet_loader.py index f68b375..2dec0ed 100644 --- a/tap_gsheets/gsheet_loader.py +++ b/tap_gsheets/gsheet_loader.py @@ -25,7 +25,7 @@ def __init__(self, config): self.sheet_name = None self.spreadsheet = None - def get_data(self, sheet_name, worksheet_name): + def get_data(self, sheet_name, worksheet_name, start_from_row): # reset cache in case of switching to another sheet if self.sheet_name is None or self.sheet_name != sheet_name: del self.data @@ -41,15 +41,15 @@ def get_data(self, sheet_name, worksheet_name): if worksheet_name not in self.data: sheet = self.spreadsheet.worksheet(worksheet_name) - self.data[worksheet_name] = sheet.get_all_records() - self.headers[worksheet_name] = sheet.row_values(1) + self.data[worksheet_name] = sheet.get_all_records(head=start_from_row) + self.headers[worksheet_name] = sheet.row_values(start_from_row) - def get_records_as_json(self, sheet_name, worksheet_name): - self.get_data(sheet_name, worksheet_name) + def get_records_as_json(self, sheet_name, worksheet_name, start_from_row): + self.get_data(sheet_name, worksheet_name, start_from_row) return self.data[worksheet_name] - def get_schema(self, sheet_name, worksheet_name): - self.get_data(sheet_name, worksheet_name) + def get_schema(self, sheet_name, worksheet_name, start_from_row): + self.get_data(sheet_name, worksheet_name, start_from_row) # add object to schema builder so he can infer schema builder = SchemaBuilder()