Skip to content

Commit

Permalink
Merge pull request #38 from sebastianswms/recursive-subdirectory
Browse files: browse the repository at this point in the history
feat: Add recursive directory matching.
  • Loading branch information
visch authored Jul 20, 2023
2 parents ab84604 + 45f1294 commit 338e031
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 14 deletions.
5 changes: 3 additions & 2 deletions tap_universal_file/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,15 +97,16 @@ def get_files(

file_dict_list = []

- for file in self.filesystem.ls(self.config["filepath"], detail=True):
+ for file_path in self.filesystem.find(self.config["filepath"]):
+ file = self.filesystem.info(file_path)
if (
file["type"] == "directory" # Ignore nested folders.
or file["size"] == 0 # Ignore empty files.
or ( # Ignore files not matching the configured file_regex
"file_regex" in self.config
and not re.match(
self.config["file_regex"],
- Path(file["name"]).name,
+ file["name"],
)
)
):
Expand Down
24 changes: 12 additions & 12 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def execute_tap(config: dict = None):
# Run standard built-in tap tests from the SDK on a simple csv.

sample_config = base_file_config.copy()
- sample_config.update({"file_regex": "fruit_records\\.csv"})
+ sample_config.update({"file_regex": "^.*fruit_records\\.csv"})

TestTapUniversalFile = get_tap_test_class(
tap_class=TapUniversalFile,
Expand All @@ -90,7 +90,7 @@ def execute_tap(config: dict = None):
def test_sdc_fields_present():
modified_config = base_file_config.copy()
modified_config.update(
- {"file_regex": "^fruit_records\\.csv$", "additional_info": True}
+ {"file_regex": "^.*fruit_records\\.csv$", "additional_info": True}
)
messages = execute_tap(modified_config)
properties = messages["schema_messages"][0]["schema"]["properties"]
Expand All @@ -101,7 +101,7 @@ def test_sdc_fields_present():
def test_delimited_execution():
modified_config = base_file_config.copy()
modified_config.update(
- {"file_type": "delimited", "file_regex": "^fruit_records\\.csv$"},
+ {"file_type": "delimited", "file_regex": "^.*fruit_records\\.csv$"},
)
execute_tap(modified_config)

Expand All @@ -111,7 +111,7 @@ def test_jsonl_execution():
modified_config.update(
{
"file_type": "jsonl",
- "file_regex": "^employees\\.jsonl$",
+ "file_regex": "^.*\\/employees\\.jsonl$",
"jsonl_sampling_strategy": "first",
"jsonl_type_coercion_strategy": "string",
},
Expand All @@ -124,7 +124,7 @@ def test_avro_execution():
modified_config.update(
{
"file_type": "avro",
- "file_regex": "^athletes\\.avro$",
+ "file_regex": "^.*athletes\\.avro$",
"avro_type_coercion_strategy": "convert",
},
)
Expand All @@ -135,7 +135,7 @@ def test_s3_execution():
s3_config = {
"protocol": "s3",
"filepath": "derek-tap-filetesting/2023",
- "file_regex": "airtravel\\.csv$",
+ "file_regex": "^.*airtravel\\.csv$",
}
execute_tap(s3_config)

Expand All @@ -144,7 +144,7 @@ def test_compression_execution():
modified_config = base_file_config.copy()
modified_config.update(
{
- "file_regex": "fruit_records",
+ "file_regex": "^.*fruit_records",
"compression": "detect",
"delimited_delimiter": ",",
},
Expand All @@ -156,7 +156,7 @@ def test_header_footer_execution():
modified_config = base_file_config.copy()
modified_config.update(
{
- "file_regex": "^cats\\.csv$",
+ "file_regex": "^.*cats\\.csv$",
"delimited_header_skip": 3,
"delimited_footer_skip": 3,
},
Expand All @@ -167,7 +167,7 @@ def test_header_footer_execution():
def test_malformed_delimited_fail():
modified_config = base_file_config.copy()
modified_config.update(
- {"file_regex": "^cats\\.csv$", "delimited_error_handling": "fail"}
+ {"file_regex": "^.*cats\\.csv$", "delimited_error_handling": "fail"}
)
with pytest.raises(RuntimeError, match="^Error processing.*"):
execute_tap(modified_config)
Expand All @@ -176,7 +176,7 @@ def test_malformed_delimited_fail():
def test_malformed_delimited_ignore():
modified_config = base_file_config.copy()
modified_config.update(
- {"file_regex": "^cats\\.csv$", "delimited_error_handling": "ignore"}
+ {"file_regex": "^.*cats\\.csv$", "delimited_error_handling": "ignore"}
)
execute_tap(modified_config)

Expand All @@ -185,7 +185,7 @@ def test_malformed_jsonl_fail():
modified_config = base_file_config.copy()
modified_config.update(
{
- "file_regex": "^malformed_employees\\.jsonl$",
+ "file_regex": "^.*malformed_employees\\.jsonl$",
"file_type": "jsonl",
"jsonl_error_handling": "fail",
"jsonl_type_coercion_strategy": "string",
Expand All @@ -199,7 +199,7 @@ def test_malformed_jsonl_ignore():
modified_config = base_file_config.copy()
modified_config.update(
{
- "file_regex": "^malformed_employees\\.jsonl$",
+ "file_regex": "^.*malformed_employees\\.jsonl$",
"file_type": "jsonl",
"jsonl_error_handling": "ignore",
"jsonl_type_coercion_strategy": "string",
Expand Down

0 comments on commit 338e031

Please sign in to comment.