Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added config to skip rows #1

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,5 @@ state.json

*.iml

.idea/
cred.json
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,27 @@ Process several sheets and worksheets in that sheets at a time:
}
```

Set `start_from_row` to the 1-based number of the row to start processing from, in case you want to skip unnecessary rows above it. That row is read as the header row and the rows below it as records. The default is 1.

```hocon
{ # config.conf
sheets = [
{
name = "Investor Loans",
start_from_row = 5,
worksheets = [
PageA,
PageB,
PageC
]
},
{
# ...
}
]
}
```

# Overriding configuration
The configurations in the file can be overridden with the command-line parameter `--overrides`,
which takes configuration overrides in a JSON string and applies them over the passed
Expand Down
16 changes: 11 additions & 5 deletions tap_gsheets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def sync(config):
for sheet in sheets:
sheet_name = sheet["name"]

start_from_row = 1

if "start_from_row" in sheet:
start_from_row = sheet["start_from_row"]

if "worksheets" in sheet:
worksheets = sheet["worksheets"]
else:
Expand All @@ -41,14 +46,14 @@ def sync(config):
try:
if len(worksheets) > 0:
for worksheet in worksheets:
process_worksheet(gsheets_loader, sheet_name, worksheet, config)
process_worksheet(gsheets_loader, sheet_name, worksheet, start_from_row, config)
else:
process_worksheet(gsheets_loader, sheet_name, None, config)
process_worksheet(gsheets_loader, sheet_name, None, start_from_row, config)
except Exception as e:
LOGGER.error(f"Can't process a worksheet {sheet_name} because of:\n{e}", )


def process_worksheet(gsheets_loader, sheet_name, worksheet, config):
def process_worksheet(gsheets_loader, sheet_name, worksheet, start_from_row, config):
if worksheet is None:
name_with_worksheet = sheet_name
else:
Expand All @@ -59,8 +64,9 @@ def process_worksheet(gsheets_loader, sheet_name, worksheet, config):
else:
stream_name = tableize(parameterize(name_with_worksheet))

schema = gsheets_loader.get_schema(sheet_name, worksheet)
records = gsheets_loader.get_records_as_json(sheet_name, worksheet)
schema = gsheets_loader.get_schema(sheet_name, worksheet, start_from_row)

records = gsheets_loader.get_records_as_json(sheet_name, worksheet, start_from_row)

# additional data transformations
column_mapping = None
Expand Down
14 changes: 7 additions & 7 deletions tap_gsheets/gsheet_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(self, config):
self.sheet_name = None
self.spreadsheet = None

def get_data(self, sheet_name, worksheet_name):
def get_data(self, sheet_name, worksheet_name, start_from_row):
# reset cache in case of switching to another sheet
if self.sheet_name is None or self.sheet_name != sheet_name:
del self.data
Expand All @@ -41,15 +41,15 @@ def get_data(self, sheet_name, worksheet_name):

if worksheet_name not in self.data:
sheet = self.spreadsheet.worksheet(worksheet_name)
self.data[worksheet_name] = sheet.get_all_records()
self.headers[worksheet_name] = sheet.row_values(1)
self.data[worksheet_name] = sheet.get_all_records(head=start_from_row)
self.headers[worksheet_name] = sheet.row_values(start_from_row)

def get_records_as_json(self, sheet_name, worksheet_name):
self.get_data(sheet_name, worksheet_name)
def get_records_as_json(self, sheet_name, worksheet_name, start_from_row):
self.get_data(sheet_name, worksheet_name, start_from_row)
return self.data[worksheet_name]

def get_schema(self, sheet_name, worksheet_name):
self.get_data(sheet_name, worksheet_name)
def get_schema(self, sheet_name, worksheet_name, start_from_row):
self.get_data(sheet_name, worksheet_name, start_from_row)

# add object to schema builder so he can infer schema
builder = SchemaBuilder()
Expand Down