Skip to content

Commit

Permalink
Merge pull request #1 from heliocentrist/allow-skipping-rows
Browse files Browse the repository at this point in the history
Added config to skip rows
  • Loading branch information
yury-liavitski-miro authored Jan 25, 2021
2 parents e918e8b + d967f99 commit 550ed94
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 12 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,5 @@ state.json

*.iml

.idea/
cred.json
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,27 @@ Process several sheets and worksheets in that sheets at a time:
}
```

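Specify the row number (1-based) to start processing from, in case you want to skip unnecessary rows at the top of a sheet. The row at that position is read as the header row, and the rows below it are read as records. The setting applies to every worksheet listed for that sheet. The default is 1.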

```hocon
{ # config.conf
sheets = [
{
name = "Investor Loans",
start_from_row = 5,
worksheets = [
PageA,
PageB,
PageC
]
},
{
# ...
}
]
}
```

# Overriding configuration
The configurations in the file can be overridden with the command line parameter `--overrides`,
which takes configuration overrides in a JSON string and applies them over the passed
Expand Down
16 changes: 11 additions & 5 deletions tap_gsheets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def sync(config):
for sheet in sheets:
sheet_name = sheet["name"]

start_from_row = 1

if "start_from_row" in sheet:
start_from_row = sheet["start_from_row"]

if "worksheets" in sheet:
worksheets = sheet["worksheets"]
else:
Expand All @@ -41,14 +46,14 @@ def sync(config):
try:
if len(worksheets) > 0:
for worksheet in worksheets:
process_worksheet(gsheets_loader, sheet_name, worksheet, config)
process_worksheet(gsheets_loader, sheet_name, worksheet, start_from_row, config)
else:
process_worksheet(gsheets_loader, sheet_name, None, config)
process_worksheet(gsheets_loader, sheet_name, None, start_from_row, config)
except Exception as e:
LOGGER.error(f"Can't process a worksheet {sheet_name} because of:\n{e}", )


def process_worksheet(gsheets_loader, sheet_name, worksheet, config):
def process_worksheet(gsheets_loader, sheet_name, worksheet, start_from_row, config):
if worksheet is None:
name_with_worksheet = sheet_name
else:
Expand All @@ -59,8 +64,9 @@ def process_worksheet(gsheets_loader, sheet_name, worksheet, config):
else:
stream_name = tableize(parameterize(name_with_worksheet))

schema = gsheets_loader.get_schema(sheet_name, worksheet)
records = gsheets_loader.get_records_as_json(sheet_name, worksheet)
schema = gsheets_loader.get_schema(sheet_name, worksheet, start_from_row)

records = gsheets_loader.get_records_as_json(sheet_name, worksheet, start_from_row)

# additional data transformations
column_mapping = None
Expand Down
14 changes: 7 additions & 7 deletions tap_gsheets/gsheet_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(self, config):
self.sheet_name = None
self.spreadsheet = None

def get_data(self, sheet_name, worksheet_name):
def get_data(self, sheet_name, worksheet_name, start_from_row):
# reset cache in case of switching to another sheet
if self.sheet_name is None or self.sheet_name != sheet_name:
del self.data
Expand All @@ -41,15 +41,15 @@ def get_data(self, sheet_name, worksheet_name):

if worksheet_name not in self.data:
sheet = self.spreadsheet.worksheet(worksheet_name)
self.data[worksheet_name] = sheet.get_all_records()
self.headers[worksheet_name] = sheet.row_values(1)
self.data[worksheet_name] = sheet.get_all_records(head=start_from_row)
self.headers[worksheet_name] = sheet.row_values(start_from_row)

def get_records_as_json(self, sheet_name, worksheet_name):
self.get_data(sheet_name, worksheet_name)
def get_records_as_json(self, sheet_name, worksheet_name, start_from_row):
self.get_data(sheet_name, worksheet_name, start_from_row)
return self.data[worksheet_name]

def get_schema(self, sheet_name, worksheet_name):
self.get_data(sheet_name, worksheet_name)
def get_schema(self, sheet_name, worksheet_name, start_from_row):
self.get_data(sheet_name, worksheet_name, start_from_row)

# add object to schema builder so he can infer schema
builder = SchemaBuilder()
Expand Down

0 comments on commit 550ed94

Please sign in to comment.