diff --git a/README.md b/README.md index fe515a5..a9546cf 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,48 @@ Process several sheets and worksheets in that sheets at a time: } ``` +By default, values recognized as dates are emitted in the "%Y-%m-%d %H:%M:%S" format. To use a different format, set specific_date_format for the sheet: + +```hocon +{ # config.conf + sheets = [ + { + name = "Investor Loans", + worksheets = [ + PageA, + PageB, + PageC + ], + specific_date_format = "%Y-%m-%d" + }, + { + # ... + } + ] +} +``` + +By default, values that look like dates are processed as dates. To disable this for a sheet, set date_processing to False: + +```hocon +{ # config.conf + sheets = [ + { + name = "Investor Loans", + worksheets = [ + PageA, + PageB, + PageC + ], + date_processing = False + }, + { + # ... + } + ] +} +``` + Specify the row number (1-based) to start processing from, in case you want to skip some unnecessary rows. The default number is 1. ```hocon diff --git a/tap_gsheets/__init__.py b/tap_gsheets/__init__.py index d0649d1..b0e46d8 100644 --- a/tap_gsheets/__init__.py +++ b/tap_gsheets/__init__.py @@ -6,6 +6,7 @@ from pyhocon import ConfigFactory from inflection import parameterize, tableize, underscore import argparse +from dateutil import parser LOGGER = singer.get_logger() @@ -18,6 +19,8 @@ def sync(config): # read config sheets = [] + date_processing = True + if 'sheet_name' in config: # one-sheet single page config sheet = { @@ -42,18 +45,24 @@ def sync(config): else: worksheets = [] + if "specific_date_format" in sheet: + gsheet_loader.specific_date_format = sheet["specific_date_format"] + + if 'date_processing' in sheet and not sheet['date_processing']: + date_processing = False + # noinspection PyBroadException try: if len(worksheets) > 0: for worksheet in worksheets: - process_worksheet(gsheets_loader, sheet_name, worksheet, start_from_row, config) + process_worksheet(gsheets_loader, sheet_name, worksheet, start_from_row, config, date_processing) else: - 
def run_date_processing(records):
    """Rewrite date-like field values of each record, in place, as formatted strings.

    Every truthy field value is passed to ``parser.parse`` (dateutil). A value
    that parses is replaced by its ``strftime`` rendering in
    ``gsheet_loader.specific_date_format``; a timezone-aware result is first
    converted to naive UTC. A value that cannot be parsed is left unchanged.

    Args:
        records: iterable of mutable mappings (field name -> cell value).
            The mappings are modified in place.

    Returns:
        None.
    """
    # The format is a module-level setting; read it once instead of per field.
    date_format = gsheet_loader.specific_date_format

    for record in records:
        unparsed = 0
        for field in record:
            value = record[field]
            if not value:
                # Empty cells are skipped and are NOT counted as parse failures.
                continue
            try:
                parsed = parser.parse(value)
                offset = parsed.utcoffset()
                if offset is not None:
                    # local = UTC + offset, so UTC = local - offset.
                    parsed = parsed.replace(tzinfo=None) - offset
                record[field] = parsed.strftime(date_format)
            except (TypeError, ValueError, OverflowError):
                # TypeError: non-string cell (e.g. a number); ValueError: not a
                # date; OverflowError: documented by dateutil for out-of-range
                # numeric components. In all cases keep the original value.
                unparsed += 1
        # A record in which every field failed to parse ends the whole pass.
        # NOTE(review): presumably an early exit for sheets without date
        # columns - confirm. Because empty cells are not counted, a record
        # containing any empty cell never triggers it.
        if len(record) == unparsed:
            break
class CustomDateTime(TypedSchemaStrategy):
    """Schema strategy for strings written in the configured date format.

    A value matches when ``datetime.datetime.strptime`` accepts it under the
    module-level ``specific_date_format``. Matching values are described in
    the generated schema as ``string`` with ``format: date-time``.
    """
    JS_TYPE = 'string'
    PYTHON_TYPE = (str, str)

    def __init__(self, node_class):
        super().__init__(node_class)
        # Value written to the schema's "format" key by to_schema().
        self.format = "date-time"
        # Initialised here; not read anywhere else in this class.
        self.timestamp = None

    @classmethod
    def match_object(cls, obj):
        """Return True when *obj* parses under ``specific_date_format``, else False."""
        # The parent check is invoked but its result is not used; the
        # decision rests solely on strptime below.
        super().match_object(obj)
        try:
            datetime.datetime.strptime(obj, f"{specific_date_format}")
        except (TypeError, ValueError):
            # TypeError: obj is not a string; ValueError: wrong format.
            return False
        return True

    def to_schema(self):
        """Return the parent's schema with ``type`` and ``format`` set."""
        result = super().to_schema()
        result.update(type=self.JS_TYPE, format=self.format)
        return result


class CustomSchemaBuilder(SchemaBuilder):
    """SchemaBuilder that registers CustomDateTime as an extra strategy."""
    EXTRA_STRATEGIES = (CustomDateTime,)