diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 71b1d90..2d0933f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ repos:
       )$
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.6.3
+  rev: v0.6.4
   hooks:
   - id: ruff
     args: [--fix, --exit-non-zero-on-fix]
diff --git a/README.md b/README.md
index 39bbe35..67f1040 100644
--- a/README.md
+++ b/README.md
@@ -15,14 +15,22 @@ Note: This tap currently does not support incremental state.
 
 ## Settings
 
-| Setting | Required | Default | Description |
-|:--------------------|:--------:|:-------:|:------------|
-| files | False | None | An array of csv file stream settings. |
-| csv_files_definition| False | None | A path to the JSON file holding an array of file settings. |
-| add_metadata_columns| False | False | When True, add the metadata columns (`_sdc_source_file`, `_sdc_source_file_mtime`, `_sdc_source_lineno`) to output. |
+| Setting              | Required | Default | Description                                                                                                         |
+| :------------------- | :------- | :------ | :------------------------------------------------------------------------------------------------------------------ |
+| files                | False    | None    | An array of csv file stream settings                                                                                |
+| filesystem           | False    | local   | The filesystem to use for reading files                                                                             |
+| csv_files_definition | False    | None    | A path to the JSON file holding an array of file settings                                                           |
+| add_metadata_columns | False    | False   | When True, add the metadata columns (`_sdc_source_file`, `_sdc_source_file_mtime`, `_sdc_source_lineno`) to output |
 
 A full list of supported settings and capabilities is available by running: `tap-csv --about`
 
+The `filesystem` setting can be used to specify the filesystem to use for reading files. The following filesystems are supported:
+
+- `local`, the default, for reading files from the local filesystem.
+- [`ftp`](#ftp), for reading files from an FTP server.
+- [`github`](#github), for reading files from a GitHub repository.
+- [`dropbox`](#dropbox), for reading files from a Dropbox account.
+
 The `config.json` contains an array called `files` that consists of dictionary objects detailing each destination table to be passed to Singer. Each of those entries contains:
 * `entity`: The entity name to be passed to singer (i.e. the table)
 * `path`: Local path to the file to be ingested. Note that this may be a directory, in which case all files in that directory and any of its subdirectories will be recursively processed
@@ -81,6 +89,106 @@ Optionally, the files definition can be provided by an external json file:
 ]
 ```
 
+### Filesystem settings
+
+#### FTP
+
+| Setting      | Required | Default | Description             |
+| :----------- | :------- | :------ | :---------------------- |
+| ftp          | False    | None    | FTP connection settings |
+| ftp.host     | True     | None    | FTP server host         |
+| ftp.port     | False    | 21      | FTP server port         |
+| ftp.username | False    | None    | FTP username            |
+| ftp.password | False    | None    | FTP password            |
+| ftp.encoding | False    | utf-8   | FTP server encoding     |
+
+#### GitHub
+
+| Setting         | Required | Default | Description                                                  |
+| :-------------- | :------- | :------ | :----------------------------------------------------------- |
+| github          | False    | None    | GitHub connection settings                                   |
+| github.org      | True     | None    | GitHub organization or user where the repository is located  |
+| github.repo     | True     | None    | GitHub repository                                            |
+| github.username | False    | None    | GitHub username                                              |
+| github.token    | False    | None    | GitHub token                                                 |
+
+<details>
Example configuration +

+ +```json +{ + "add_metadata_columns": true, + "filesystem": "github", + "github": { + "org": "MeltanoLabs", + "repo": "tap-csv" + }, + "files": [ + { + "entity": "alphabet", + "path": "tap_csv/tests/data/alphabet.csv", + "keys": [ + "col1" + ] + } + ] +} +``` + +

+
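+
+Under the hood, the tap hands the chosen `filesystem` name and its options
+object directly to [`fsspec`](https://filesystem-spec.readthedocs.io/). As a
+rough sketch (not part of the tap's public API), the GitHub example above
+resolves to something like:
+
+```python
+import fsspec
+
+# "github" is the value of the `filesystem` setting; the keyword arguments
+# come from the `github` options object in the config.
+fs = fsspec.filesystem("github", org="MeltanoLabs", repo="tap-csv")
+
+# Each configured `path` is then read through this filesystem object.
+with fs.open("tap_csv/tests/data/alphabet.csv", mode="r") as f:
+    print(f.readline())
+```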
+ + +#### Dropbox + +| Setting | Required | Default | Description | +| :------------ | :------- | :------ | :-------------------------- | +| dropbox | False | None | Dropbox connection settings | +| dropbox.token | True | None | Dropbox token | + +The token needs the `files.content.read` scope: + +[![Dropbox scopes](img/dropbox_scopes.png)](https://www.dropbox.com/developers/apps) + +
Example configuration +

+ +```json +{ + "add_metadata_columns": true, + "filesystem": "dropbox", + "dropbox": { + "token": "...." + }, + "files": [ + { + "entity": "alphabet", + "path": "/alphabet.csv", + "keys": [ + "col1" + ] + } + ] +} +``` + +

+
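+
+Note that every filesystem other than `local` must be accompanied by its
+matching options object, otherwise the tap aborts during discovery. A
+simplified sketch of the check performed in `TapCSV.discover_streams` (the
+real code raises the SDK's `ConfigValidationError`):
+
+```python
+filesystem = config.get("filesystem", "local")
+
+# A remote filesystem without its options object (e.g. "dropbox" selected
+# but no "dropbox" key in the config) is rejected before streams are built.
+if filesystem != "local" and filesystem not in config:
+    raise ValueError(f"Missing filesystem options for {filesystem}")
+```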
+
+### Built-in Singer SDK settings
+
+The following settings are supported by the Singer SDK and are automatically handled by the tap:
+
+| Setting              | Required | Default | Description |
+| :------------------- | :------- | :------ | :---------- |
+| stream_maps          | False    | None    | Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html). |
+| stream_map_config    | False    | None    | User-defined config values to be used within map expressions. |
+| faker_config         | False    | None    | Config for the [`Faker`](https://faker.readthedocs.io/en/master/) instance variable `fake` used within map expressions. Only applicable if the plugin specifies `faker` as an additional dependency (through the `singer-sdk` `faker` extra or directly). |
+| faker_config.seed    | False    | None    | Value to seed the Faker generator for deterministic output: https://faker.readthedocs.io/en/master/#seeding-the-generator |
+| faker_config.locale  | False    | None    | One or more LCID locale strings to produce localized output for: https://faker.readthedocs.io/en/master/#localization |
+| flattening_enabled   | False    | None    | 'True' to enable schema flattening and automatically expand nested properties. |
+| flattening_max_depth | False    | None    | The max depth to flatten schemas. |
+
 ## Installation
 
 ```bash
diff --git a/img/dropbox_scopes.png b/img/dropbox_scopes.png
new file mode 100644
index 0000000..e8f4d86
Binary files /dev/null and b/img/dropbox_scopes.png differ
diff --git a/meltano.yml b/meltano.yml
index 57e0343..c18b189 100644
--- a/meltano.yml
+++ b/meltano.yml
@@ -18,7 +18,69 @@ plugins:
         keys:
         - col1
       add_metadata_columns: false
+    settings_group_validation:
+    - [ftp.host]
+    - [github.org, github.repo]
+    - [dropbox.token]
     settings:
+    - name: filesystem
+      kind: options
+      options:
+      - label: Local Filesystem
+        value: local
+      - label: FTP
+        value: ftp
+      - label: GitHub
+        value: github
+      - label: Dropbox
+        value: dropbox
+
+    # FTP settings
+    - name: ftp.host
+      label: FTP Host
+      description: Hostname of the FTP server
+      kind: string
+    - name: ftp.port
+      label: FTP Port
+      description: Port of the FTP server
+      kind: integer
+    - name: ftp.username
+      label: FTP Username
+      description: Username for the FTP server
+      kind: string
+    - name: ftp.password
+      label: FTP Password
+      description: Password for the FTP server
+      kind: password
+      sensitive: true
+    - name: ftp.encoding
+      label: FTP Encoding
+      description: Encoding for the FTP server
+      kind: string
+
+    # GitHub settings
+    - name: github.org
+      label: GitHub Organization
+      description: Organization name on GitHub
+      kind: string
+    - name: github.repo
+      label: GitHub Repository
+      description: Repository name on GitHub
+      kind: string
+    - name: github.username
+      label: GitHub Username
+      description: Username for GitHub
+      kind: string
+    - name: github.token
+      label: GitHub Token
+      description: Token for GitHub
+      kind: password
+      sensitive: true
+
+    # Dropbox settings
+    - name: dropbox.token
+      label: Dropbox Token
+      description: Token for Dropbox
+      kind: password
+      sensitive: true
+
     - name: files
       description: Array of objects containing keys - `entity`, `path`, `keys`, `encoding` (Optional), `delimiter` (Optional), `doublequote` (Optional), `escapechar` (Optional), `quotechar` (Optional), `skipinitialspace` (Optional), `strict` (Optional)
      kind: 
array @@ -30,6 +92,7 @@ plugins: - name: add_metadata_columns description: When True, add the metadata columns (`_sdc_source_file`, `_sdc_source_file_mtime`, `_sdc_source_lineno`) to output. kind: boolean + loaders: - name: target-jsonl variant: andyh1203 diff --git a/poetry.lock b/poetry.lock index 9e30457..f8c5dde 100644 --- a/poetry.lock +++ b/poetry.lock @@ -83,13 +83,13 @@ files = [ [[package]] name = "certifi" -version = "2024.7.4" +version = "2024.8.30" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, - {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, + {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, + {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, ] [[package]] @@ -300,6 +300,38 @@ files = [ [package.extras] toml = ["tomli"] +[[package]] +name = "dropbox" +version = "12.0.2" +description = "Official Dropbox API Client" +optional = true +python-versions = "*" +files = [ + {file = "dropbox-12.0.2-py2-none-any.whl", hash = "sha256:4b8207a9f4afd33726ec886c0d223f4bbc42fe649b87718690a24704f5e24c0c"}, + {file = "dropbox-12.0.2-py3-none-any.whl", hash = "sha256:c5b7e9c2668adb6b12dcecd84342565dc50f7d35ab6a748d155cb79040979d1c"}, + {file = "dropbox-12.0.2.tar.gz", hash = "sha256:50057fd5ad5fcf047f542dfc6747a896e7ef982f1b5f8500daf51f3abd609962"}, +] + +[package.dependencies] +requests = ">=2.16.2" +six = ">=1.12.0" +stone = ">=2,<3.3.3" + +[[package]] +name = "dropboxdrivefs" +version = "1.4.1" +description = "Dropbox implementation for fsspec module" +optional = true +python-versions = ">=3.5" +files = [ + {file = "dropboxdrivefs-1.4.1.tar.gz", hash = "sha256:6f3c6061d045813553ce91ed0e2b682f1d70bec74011943c92b3181faacefd34"}, +] + +[package.dependencies] +dropbox = "*" +fsspec = "*" +requests = "*" + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -333,6 +365,45 @@ six = ">=1.10,<2.0" [package.extras] scandir = ["scandir (>=1.5,<2.0)"] +[[package]] +name = "fsspec" +version = "2024.6.1" +description = "File-system specification" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fsspec-2024.6.1-py3-none-any.whl", hash = "sha256:3cb443f8bcd2efb31295a5b9fdb02aee81d8452c80d28f97a6d0959e6cee101e"}, + {file = "fsspec-2024.6.1.tar.gz", hash = "sha256:fad7d7e209dd4c1208e3bbfda706620e0da5142bebbd9c384afb95b07e798e49"}, +] + +[package.extras] +abfs = ["adlfs"] +adl = ["adlfs"] +arrow = ["pyarrow (>=1)"] +dask = ["dask", "distributed"] +dev = ["pre-commit", "ruff"] +doc = ["numpydoc", "sphinx", "sphinx-design", "sphinx-rtd-theme", "yarl"] +dropbox = ["dropbox", "dropboxdrivefs", "requests"] +full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] +fuse = ["fusepy"] +gcs = ["gcsfs"] +git = ["pygit2"] +github = ["requests"] +gs = ["gcsfs"] +gui = ["panel"] +hdfs = ["pyarrow (>=1)"] +http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)"] +libarchive = ["libarchive-c"] +oci = ["ocifs"] +s3 = ["s3fs"] +sftp = ["paramiko"] +smb = ["smbprotocol"] +ssh = ["paramiko"] +test = ["aiohttp 
(!=4.0.0a0,!=4.0.0a1)", "numpy", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "requests"] +test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,test]", "moto[server] (>4,<5)", "pytest-timeout", "xarray"] +test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"] +tqdm = ["tqdm"] + [[package]] name = "greenlet" version = "3.0.3" @@ -922,19 +993,23 @@ files = [ [[package]] name = "setuptools" -version = "73.0.1" +version = "74.1.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-73.0.1-py3-none-any.whl", hash = "sha256:b208925fcb9f7af924ed2dc04708ea89791e24bde0d3020b27df0e116088b34e"}, - {file = "setuptools-73.0.1.tar.gz", hash = "sha256:d59a3e788ab7e012ab2c4baed1b376da6366883ee20d7a5fc426816e3d7b1193"}, + {file = "setuptools-74.1.1-py3-none-any.whl", hash = "sha256:fc91b5f89e392ef5b77fe143b17e32f65d3024744fba66dc3afe07201684d766"}, + {file = "setuptools-74.1.1.tar.gz", hash = "sha256:2353af060c06388be1cecbf5953dcdb1f38362f87a2356c480b6b4d5fcfc8847"}, ] [package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.5.2)"] core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.text (>=3.7)", "more-itertools (>=8.8)", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.11.*)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (<0.4)", "pytest-ruff (>=0.2.1)", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.11.*)", "pytest-mypy"] [[package]] name = 
"simpleeval" @@ -1120,60 +1195,60 @@ files = [ [[package]] name = "sqlalchemy" -version = "2.0.32" +version = "2.0.34" description = "Database Abstraction Library" optional = false python-versions = ">=3.7" files = [ - {file = "SQLAlchemy-2.0.32-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0c9045ecc2e4db59bfc97b20516dfdf8e41d910ac6fb667ebd3a79ea54084619"}, - {file = "SQLAlchemy-2.0.32-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1467940318e4a860afd546ef61fefb98a14d935cd6817ed07a228c7f7c62f389"}, - {file = "SQLAlchemy-2.0.32-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5954463675cb15db8d4b521f3566a017c8789222b8316b1e6934c811018ee08b"}, - {file = "SQLAlchemy-2.0.32-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:167e7497035c303ae50651b351c28dc22a40bb98fbdb8468cdc971821b1ae533"}, - {file = "SQLAlchemy-2.0.32-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b27dfb676ac02529fb6e343b3a482303f16e6bc3a4d868b73935b8792edb52d0"}, - {file = "SQLAlchemy-2.0.32-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bf2360a5e0f7bd75fa80431bf8ebcfb920c9f885e7956c7efde89031695cafb8"}, - {file = "SQLAlchemy-2.0.32-cp310-cp310-win32.whl", hash = "sha256:306fe44e754a91cd9d600a6b070c1f2fadbb4a1a257b8781ccf33c7067fd3e4d"}, - {file = "SQLAlchemy-2.0.32-cp310-cp310-win_amd64.whl", hash = "sha256:99db65e6f3ab42e06c318f15c98f59a436f1c78179e6a6f40f529c8cc7100b22"}, - {file = "SQLAlchemy-2.0.32-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:21b053be28a8a414f2ddd401f1be8361e41032d2ef5884b2f31d31cb723e559f"}, - {file = "SQLAlchemy-2.0.32-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b178e875a7a25b5938b53b006598ee7645172fccafe1c291a706e93f48499ff5"}, - {file = "SQLAlchemy-2.0.32-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:723a40ee2cc7ea653645bd4cf024326dea2076673fc9d3d33f20f6c81db83e1d"}, - {file = "SQLAlchemy-2.0.32-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:295ff8689544f7ee7e819529633d058bd458c1fd7f7e3eebd0f9268ebc56c2a0"}, - {file = "SQLAlchemy-2.0.32-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:49496b68cd190a147118af585173ee624114dfb2e0297558c460ad7495f9dfe2"}, - {file = "SQLAlchemy-2.0.32-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:acd9b73c5c15f0ec5ce18128b1fe9157ddd0044abc373e6ecd5ba376a7e5d961"}, - {file = "SQLAlchemy-2.0.32-cp311-cp311-win32.whl", hash = "sha256:9365a3da32dabd3e69e06b972b1ffb0c89668994c7e8e75ce21d3e5e69ddef28"}, - {file = "SQLAlchemy-2.0.32-cp311-cp311-win_amd64.whl", hash = "sha256:8bd63d051f4f313b102a2af1cbc8b80f061bf78f3d5bd0843ff70b5859e27924"}, - {file = "SQLAlchemy-2.0.32-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6bab3db192a0c35e3c9d1560eb8332463e29e5507dbd822e29a0a3c48c0a8d92"}, - {file = "SQLAlchemy-2.0.32-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:19d98f4f58b13900d8dec4ed09dd09ef292208ee44cc9c2fe01c1f0a2fe440e9"}, - {file = "SQLAlchemy-2.0.32-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cd33c61513cb1b7371fd40cf221256456d26a56284e7d19d1f0b9f1eb7dd7e8"}, - {file = "SQLAlchemy-2.0.32-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d6ba0497c1d066dd004e0f02a92426ca2df20fac08728d03f67f6960271feec"}, - {file = "SQLAlchemy-2.0.32-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2b6be53e4fde0065524f1a0a7929b10e9280987b320716c1509478b712a7688c"}, - {file = "SQLAlchemy-2.0.32-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:916a798f62f410c0b80b63683c8061f5ebe237b0f4ad778739304253353bc1cb"}, - {file = "SQLAlchemy-2.0.32-cp312-cp312-win32.whl", hash = "sha256:31983018b74908ebc6c996a16ad3690301a23befb643093fcfe85efd292e384d"}, - {file = "SQLAlchemy-2.0.32-cp312-cp312-win_amd64.whl", hash = "sha256:4363ed245a6231f2e2957cccdda3c776265a75851f4753c60f3004b90e69bfeb"}, - {file = "SQLAlchemy-2.0.32-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b8afd5b26570bf41c35c0121801479958b4446751a3971fb9a480c1afd85558e"}, - {file = "SQLAlchemy-2.0.32-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c750987fc876813f27b60d619b987b057eb4896b81117f73bb8d9918c14f1cad"}, - {file = "SQLAlchemy-2.0.32-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ada0102afff4890f651ed91120c1120065663506b760da4e7823913ebd3258be"}, - {file = "SQLAlchemy-2.0.32-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:78c03d0f8a5ab4f3034c0e8482cfcc415a3ec6193491cfa1c643ed707d476f16"}, - {file = "SQLAlchemy-2.0.32-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:3bd1cae7519283ff525e64645ebd7a3e0283f3c038f461ecc1c7b040a0c932a1"}, - {file = "SQLAlchemy-2.0.32-cp37-cp37m-win32.whl", hash = "sha256:01438ebcdc566d58c93af0171c74ec28efe6a29184b773e378a385e6215389da"}, - {file = "SQLAlchemy-2.0.32-cp37-cp37m-win_amd64.whl", hash = "sha256:4979dc80fbbc9d2ef569e71e0896990bc94df2b9fdbd878290bd129b65ab579c"}, - {file = "SQLAlchemy-2.0.32-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c742be912f57586ac43af38b3848f7688863a403dfb220193a882ea60e1ec3a"}, - {file = "SQLAlchemy-2.0.32-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:62e23d0ac103bcf1c5555b6c88c114089587bc64d048fef5bbdb58dfd26f96da"}, - {file = "SQLAlchemy-2.0.32-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:251f0d1108aab8ea7b9aadbd07fb47fb8e3a5838dde34aa95a3349876b5a1f1d"}, - {file = "SQLAlchemy-2.0.32-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef18a84e5116340e38eca3e7f9eeaaef62738891422e7c2a0b80feab165905f"}, - {file = "SQLAlchemy-2.0.32-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:3eb6a97a1d39976f360b10ff208c73afb6a4de86dd2a6212ddf65c4a6a2347d5"}, - {file = "SQLAlchemy-2.0.32-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0c1c9b673d21477cec17ab10bc4decb1322843ba35b481585facd88203754fc5"}, - {file = "SQLAlchemy-2.0.32-cp38-cp38-win32.whl", hash = "sha256:c41a2b9ca80ee555decc605bd3c4520cc6fef9abde8fd66b1cf65126a6922d65"}, - {file = "SQLAlchemy-2.0.32-cp38-cp38-win_amd64.whl", hash = "sha256:8a37e4d265033c897892279e8adf505c8b6b4075f2b40d77afb31f7185cd6ecd"}, - {file = "SQLAlchemy-2.0.32-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:52fec964fba2ef46476312a03ec8c425956b05c20220a1a03703537824b5e8e1"}, - {file = "SQLAlchemy-2.0.32-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:328429aecaba2aee3d71e11f2477c14eec5990fb6d0e884107935f7fb6001632"}, - {file = "SQLAlchemy-2.0.32-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85a01b5599e790e76ac3fe3aa2f26e1feba56270023d6afd5550ed63c68552b3"}, - {file = "SQLAlchemy-2.0.32-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aaf04784797dcdf4c0aa952c8d234fa01974c4729db55c45732520ce12dd95b4"}, - {file = "SQLAlchemy-2.0.32-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4488120becf9b71b3ac718f4138269a6be99a42fe023ec457896ba4f80749525"}, - {file = "SQLAlchemy-2.0.32-cp39-cp39-musllinux_1_2_x86_64.whl", hash = 
"sha256:14e09e083a5796d513918a66f3d6aedbc131e39e80875afe81d98a03312889e6"}, - {file = "SQLAlchemy-2.0.32-cp39-cp39-win32.whl", hash = "sha256:0d322cc9c9b2154ba7e82f7bf25ecc7c36fbe2d82e2933b3642fc095a52cfc78"}, - {file = "SQLAlchemy-2.0.32-cp39-cp39-win_amd64.whl", hash = "sha256:7dd8583df2f98dea28b5cd53a1beac963f4f9d087888d75f22fcc93a07cf8d84"}, - {file = "SQLAlchemy-2.0.32-py3-none-any.whl", hash = "sha256:e567a8793a692451f706b363ccf3c45e056b67d90ead58c3bc9471af5d212202"}, - {file = "SQLAlchemy-2.0.32.tar.gz", hash = "sha256:c1b88cc8b02b6a5f0efb0345a03672d4c897dc7d92585176f88c67346f565ea8"}, + {file = "SQLAlchemy-2.0.34-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:95d0b2cf8791ab5fb9e3aa3d9a79a0d5d51f55b6357eecf532a120ba3b5524db"}, + {file = "SQLAlchemy-2.0.34-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:243f92596f4fd4c8bd30ab8e8dd5965afe226363d75cab2468f2c707f64cd83b"}, + {file = "SQLAlchemy-2.0.34-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9ea54f7300553af0a2a7235e9b85f4204e1fc21848f917a3213b0e0818de9a24"}, + {file = "SQLAlchemy-2.0.34-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:173f5f122d2e1bff8fbd9f7811b7942bead1f5e9f371cdf9e670b327e6703ebd"}, + {file = "SQLAlchemy-2.0.34-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:196958cde924a00488e3e83ff917be3b73cd4ed8352bbc0f2989333176d1c54d"}, + {file = "SQLAlchemy-2.0.34-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bd90c221ed4e60ac9d476db967f436cfcecbd4ef744537c0f2d5291439848768"}, + {file = "SQLAlchemy-2.0.34-cp310-cp310-win32.whl", hash = "sha256:3166dfff2d16fe9be3241ee60ece6fcb01cf8e74dd7c5e0b64f8e19fab44911b"}, + {file = "SQLAlchemy-2.0.34-cp310-cp310-win_amd64.whl", hash = "sha256:6831a78bbd3c40f909b3e5233f87341f12d0b34a58f14115c9e94b4cdaf726d3"}, + {file = "SQLAlchemy-2.0.34-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7db3db284a0edaebe87f8f6642c2b2c27ed85c3e70064b84d1c9e4ec06d5d84"}, + {file = "SQLAlchemy-2.0.34-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:430093fce0efc7941d911d34f75a70084f12f6ca5c15d19595c18753edb7c33b"}, + {file = "SQLAlchemy-2.0.34-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79cb400c360c7c210097b147c16a9e4c14688a6402445ac848f296ade6283bbc"}, + {file = "SQLAlchemy-2.0.34-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb1b30f31a36c7f3fee848391ff77eebdd3af5750bf95fbf9b8b5323edfdb4ec"}, + {file = "SQLAlchemy-2.0.34-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8fddde2368e777ea2a4891a3fb4341e910a056be0bb15303bf1b92f073b80c02"}, + {file = "SQLAlchemy-2.0.34-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:80bd73ea335203b125cf1d8e50fef06be709619eb6ab9e7b891ea34b5baa2287"}, + {file = "SQLAlchemy-2.0.34-cp311-cp311-win32.whl", hash = "sha256:6daeb8382d0df526372abd9cb795c992e18eed25ef2c43afe518c73f8cccb721"}, + {file = "SQLAlchemy-2.0.34-cp311-cp311-win_amd64.whl", hash = "sha256:5bc08e75ed11693ecb648b7a0a4ed80da6d10845e44be0c98c03f2f880b68ff4"}, + {file = "SQLAlchemy-2.0.34-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:53e68b091492c8ed2bd0141e00ad3089bcc6bf0e6ec4142ad6505b4afe64163e"}, + {file = "SQLAlchemy-2.0.34-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bcd18441a49499bf5528deaa9dee1f5c01ca491fc2791b13604e8f972877f812"}, + {file = "SQLAlchemy-2.0.34-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:165bbe0b376541092bf49542bd9827b048357f4623486096fc9aaa6d4e7c59a2"}, + {file = 
"SQLAlchemy-2.0.34-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3330415cd387d2b88600e8e26b510d0370db9b7eaf984354a43e19c40df2e2b"}, + {file = "SQLAlchemy-2.0.34-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:97b850f73f8abbffb66ccbab6e55a195a0eb655e5dc74624d15cff4bfb35bd74"}, + {file = "SQLAlchemy-2.0.34-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7cee4c6917857fd6121ed84f56d1dc78eb1d0e87f845ab5a568aba73e78adf83"}, + {file = "SQLAlchemy-2.0.34-cp312-cp312-win32.whl", hash = "sha256:fbb034f565ecbe6c530dff948239377ba859420d146d5f62f0271407ffb8c580"}, + {file = "SQLAlchemy-2.0.34-cp312-cp312-win_amd64.whl", hash = "sha256:707c8f44931a4facd4149b52b75b80544a8d824162602b8cd2fe788207307f9a"}, + {file = "SQLAlchemy-2.0.34-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:24af3dc43568f3780b7e1e57c49b41d98b2d940c1fd2e62d65d3928b6f95f021"}, + {file = "SQLAlchemy-2.0.34-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e60ed6ef0a35c6b76b7640fe452d0e47acc832ccbb8475de549a5cc5f90c2c06"}, + {file = "SQLAlchemy-2.0.34-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:413c85cd0177c23e32dee6898c67a5f49296640041d98fddb2c40888fe4daa2e"}, + {file = "SQLAlchemy-2.0.34-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:25691f4adfb9d5e796fd48bf1432272f95f4bbe5f89c475a788f31232ea6afba"}, + {file = "SQLAlchemy-2.0.34-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:526ce723265643dbc4c7efb54f56648cc30e7abe20f387d763364b3ce7506c82"}, + {file = "SQLAlchemy-2.0.34-cp37-cp37m-win32.whl", hash = "sha256:13be2cc683b76977a700948411a94c67ad8faf542fa7da2a4b167f2244781cf3"}, + {file = "SQLAlchemy-2.0.34-cp37-cp37m-win_amd64.whl", hash = "sha256:e54ef33ea80d464c3dcfe881eb00ad5921b60f8115ea1a30d781653edc2fd6a2"}, + {file = "SQLAlchemy-2.0.34-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:43f28005141165edd11fbbf1541c920bd29e167b8bbc1fb410d4fe2269c1667a"}, + {file = "SQLAlchemy-2.0.34-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b68094b165a9e930aedef90725a8fcfafe9ef95370cbb54abc0464062dbf808f"}, + {file = "SQLAlchemy-2.0.34-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a1e03db964e9d32f112bae36f0cc1dcd1988d096cfd75d6a588a3c3def9ab2b"}, + {file = "SQLAlchemy-2.0.34-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:203d46bddeaa7982f9c3cc693e5bc93db476ab5de9d4b4640d5c99ff219bee8c"}, + {file = "SQLAlchemy-2.0.34-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ae92bebca3b1e6bd203494e5ef919a60fb6dfe4d9a47ed2453211d3bd451b9f5"}, + {file = "SQLAlchemy-2.0.34-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:9661268415f450c95f72f0ac1217cc6f10256f860eed85c2ae32e75b60278ad8"}, + {file = "SQLAlchemy-2.0.34-cp38-cp38-win32.whl", hash = "sha256:895184dfef8708e15f7516bd930bda7e50ead069280d2ce09ba11781b630a434"}, + {file = "SQLAlchemy-2.0.34-cp38-cp38-win_amd64.whl", hash = "sha256:6e7cde3a2221aa89247944cafb1b26616380e30c63e37ed19ff0bba5e968688d"}, + {file = "SQLAlchemy-2.0.34-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:dbcdf987f3aceef9763b6d7b1fd3e4ee210ddd26cac421d78b3c206d07b2700b"}, + {file = "SQLAlchemy-2.0.34-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ce119fc4ce0d64124d37f66a6f2a584fddc3c5001755f8a49f1ca0a177ef9796"}, + {file = "SQLAlchemy-2.0.34-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a17d8fac6df9835d8e2b4c5523666e7051d0897a93756518a1fe101c7f47f2f0"}, + {file = 
"SQLAlchemy-2.0.34-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ebc11c54c6ecdd07bb4efbfa1554538982f5432dfb8456958b6d46b9f834bb7"}, + {file = "SQLAlchemy-2.0.34-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2e6965346fc1491a566e019a4a1d3dfc081ce7ac1a736536367ca305da6472a8"}, + {file = "SQLAlchemy-2.0.34-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:220574e78ad986aea8e81ac68821e47ea9202b7e44f251b7ed8c66d9ae3f4278"}, + {file = "SQLAlchemy-2.0.34-cp39-cp39-win32.whl", hash = "sha256:b75b00083e7fe6621ce13cfce9d4469c4774e55e8e9d38c305b37f13cf1e874c"}, + {file = "SQLAlchemy-2.0.34-cp39-cp39-win_amd64.whl", hash = "sha256:c29d03e0adf3cc1a8c3ec62d176824972ae29b67a66cbb18daff3062acc6faa8"}, + {file = "SQLAlchemy-2.0.34-py3-none-any.whl", hash = "sha256:7286c353ee6475613d8beff83167374006c6b3e3f0e6491bfe8ca610eb1dec0f"}, + {file = "sqlalchemy-2.0.34.tar.gz", hash = "sha256:10d8f36990dd929690666679b0f42235c159a7051534adb135728ee52828dd22"}, ] [package.dependencies] @@ -1205,6 +1280,22 @@ postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"] pymysql = ["pymysql"] sqlcipher = ["sqlcipher3_binary"] +[[package]] +name = "stone" +version = "3.3.1" +description = "Stone is an interface description language (IDL) for APIs." +optional = true +python-versions = "*" +files = [ + {file = "stone-3.3.1-py2-none-any.whl", hash = "sha256:cd2f7f9056fc39b16c8fd46a26971dc5ccd30b5c2c246566cd2c0dd27ff96609"}, + {file = "stone-3.3.1-py3-none-any.whl", hash = "sha256:e15866fad249c11a963cce3bdbed37758f2e88c8ff4898616bc0caeb1e216047"}, + {file = "stone-3.3.1.tar.gz", hash = "sha256:4ef0397512f609757975f7ec09b35639d72ba7e3e17ce4ddf399578346b4cb50"}, +] + +[package.dependencies] +ply = ">=3.4" +six = ">=1.12.0" + [[package]] name = "tomli" version = "2.0.1" @@ -1252,15 +1343,33 @@ files = [ {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] +[[package]] +name = "universal-pathlib" +version = "0.2.3" +description = "pathlib api extended to use fsspec backends" +optional = false +python-versions = ">=3.8" +files = [ + {file = "universal_pathlib-0.2.3-py3-none-any.whl", hash = "sha256:4e0f6f85ca7ce66aec866a9c5f32d4a081858d2e58c70a6be84fb2df1f2806d3"}, + {file = "universal_pathlib-0.2.3.tar.gz", hash = "sha256:22f5f289fedec4b663956596742652e2177bca24661b6c8a173f5974cfee0052"}, +] + +[package.dependencies] +fsspec = ">=2022.1.0,<2024.3.1 || >2024.3.1" + +[package.extras] +dev = ["adlfs", "aiohttp", "cheroot", "gcsfs", "moto[s3,server] (<5)", "mypy (==1.10.0)", "packaging", "pydantic", "pydantic-settings", "pylint (==2.17.4)", "pytest (==8.0.0)", "pytest-cov (==4.1.0)", "pytest-mock (==3.12.0)", "pytest-mypy-plugins (==3.1.2)", "pytest-sugar (==0.9.7)", "requests", "s3fs", "smbprotocol", "webdav4[fsspec]", "wsgidav"] +tests = ["mypy (==1.10.0)", "packaging", "pylint (==2.17.4)", "pytest (==8.0.0)", "pytest-cov (==4.1.0)", "pytest-mock (==3.12.0)", "pytest-mypy-plugins (==3.1.2)", "pytest-sugar (==0.9.7)"] + [[package]] name = "urllib3" -version = "1.26.19" +version = "1.26.20" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, - {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, + {file = "urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e"}, + {file = "urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32"}, ] [package.extras] @@ -1287,7 +1396,10 @@ enabler = ["pytest-enabler (>=2.2)"] test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] type = ["pytest-mypy"] +[extras] +dropbox = ["dropboxdrivefs"] + [metadata] lock-version = "2.0" python-versions = ">=3.8" -content-hash = "2dea44463346309438678e637d394d2ee1450ddcc4b53bea71e7aa39f32d984f" +content-hash = "dc24ddf88fb5ceb48c0cc09fc9f8ee7482a5e277bd23452d046a635fe8251792" diff --git a/pyproject.toml b/pyproject.toml index 9198d92..182797f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,16 @@ license = "Apache-2.0" [tool.poetry.dependencies] python = ">=3.8" +fsspec = "~=2024.6.1" singer-sdk = "~=0.40.0" +typing-extensions = { version = "~=4.12.2", python = "<3.12" } +universal-pathlib = "~=0.2.3" + +# Extras +dropboxdrivefs = {version = "~=1.4.1", optional = true} + +[tool.poetry.extras] +dropbox = ["dropboxdrivefs"] [tool.poetry.group.dev.dependencies] coverage = ">=7.2" @@ -40,17 +49,28 @@ ignore = [ "D213", ] select = [ - "A", - "B", - "W", - "D", - "COM", - "I", - # "PTH", - "PERF", - "RUF", - "TCH", - "UP", + "F", # Pyflakes + "E", # pycodestyle (errors) + "W", # pycodestyle (warnings) + "D", # pydocstyle + "UP", # pyupgrade + "I", # isort + "B", # flake8-bugbear + "A", # flake8-builtins + "COM", # flake8-commas + "EM", # flake8-errmsg + "LOG", # flake8-logging + "G", # flake8-logging-format + "PIE", # flake8-pie + "RSE", # flake8-raise + "RET", # flake8-return + "SLF", # flake8-self + "SIM", # flake8-simplify + "TCH", # flake8-type-checking + "ERA", # eradicate + "ARG", # flake8-unused-arguments + "PERF", # Perflint + "RUF", # Ruff-specific rules ] [tool.ruff.lint.isort] diff --git a/tap_csv/client.py b/tap_csv/client.py index cae3892..ece4fbc 100644 --- a/tap_csv/client.py +++ b/tap_csv/client.py @@ -3,12 +3,18 @@ from __future__ import annotations import csv -import os +import sys import typing as t -from datetime import datetime, timezone +from functools import cached_property from singer_sdk import typing as th -from singer_sdk.streams import Stream + +from .file_stream import FileStream + +if sys.version_info < (3, 12): + from typing_extensions import override +else: + from typing import override if t.TYPE_CHECKING: from singer_sdk.helpers.types import Context @@ -18,19 +24,19 @@ SDC_SOURCE_FILE_MTIME_COLUMN = "_sdc_source_file_mtime" -class CSVStream(Stream): +class CSVStream(FileStream): """Stream class for CSV streams.""" - file_paths: list[str] = [] # noqa: RUF012 - header: list[str] = [] # noqa: RUF012 - - def __init__(self, *args, **kwargs): - """Init CSVStram.""" - # cache file_config so we dont need to go iterating the config list again later - self.file_config = kwargs.pop("file_config") + def __init__(self, *args: t.Any, **kwargs: t.Any): + """Initialize CSVStream.""" super().__init__(*args, **kwargs) - def 
get_records(self, context: Context | None) -> t.Iterable[dict]:
+    @cached_property
+    def primary_keys(self) -> t.Sequence[str] | None:
+        """Return the primary keys for the stream."""
+        return self.file_config.get("keys", [])
+
+    def get_records(self, context: Context | None) -> t.Iterable[dict]:  # noqa: ARG002
         """Return a generator of row-type dictionary objects.
 
         The optional `context` argument is used to identify a specific slice of the
@@ -38,9 +44,16 @@ def get_records(self, context: Context | None) -> t.Iterable[dict]:
         require partitioning and should ignore the `context` argument.
         """
         for file_path in self.get_file_paths():
-            file_last_modified = datetime.fromtimestamp(
-                os.path.getmtime(file_path), timezone.utc
-            )
+            self.logger.info("Reading file at %s", file_path)
+            try:
+                file_last_modified = self.fs.modified(file_path)
+            except NotImplementedError:
+                self.logger.warning(
+                    "Filesystem implementation for %s does not support modified time, "
+                    "skipping",
+                    self.fs.protocol,
+                )
+                file_last_modified = None
 
             file_lineno = -1
 
@@ -50,62 +63,59 @@ def get_records(self, context: Context | None) -> t.Iterable[dict]:
             if not file_lineno:
                 continue
 
-            if self.config.get("add_metadata_columns", False):
+            if self.include_metadata_columns:
                 row = [file_path, file_last_modified, file_lineno, *row]
 
-            yield dict(zip(self.header, row))
+            yield dict(zip(self.get_all_field_names(), row))
 
-    def _get_recursive_file_paths(self, file_path: str) -> list:
-        file_paths = []
+    @cached_property
+    def metadata_fields(self) -> tuple[str, ...]:
+        """Generate metadata column names."""
+        return (
+            SDC_SOURCE_FILE_COLUMN,
+            SDC_SOURCE_FILE_MTIME_COLUMN,
+            SDC_SOURCE_LINENO_COLUMN,
+        )
 
-        for dirpath, _, filenames in os.walk(file_path):
-            for filename in filenames:
-                file_path = os.path.join(dirpath, filename)
-                if self.is_valid_filename(file_path):
-                    file_paths.append(file_path)
+    def get_metadata_columns_schemas(self) -> t.Generator[tuple[str, dict], None, None]:
+        """Get schema for metadata columns."""
+        yield SDC_SOURCE_FILE_COLUMN, {"type": "string"}
+        yield (
+            SDC_SOURCE_FILE_MTIME_COLUMN,
+            {"type": ["string", "null"], "format": "date-time"},
+        )
+        yield SDC_SOURCE_LINENO_COLUMN, {"type": "integer"}
 
-        return file_paths
+    @override
+    @cached_property
+    def field_names(self) -> list[str]:
+        """Return a sequence of field names for the stream."""
+        for file_path in self.get_file_paths():
+            for row in self.get_rows(file_path):
+                return row
 
-    def get_file_paths(self) -> list:
-        """Return a list of file paths to read.
+        return []
 
-        This tap accepts file names and directories so it will detect
-        directories and iterate files inside.
-        """
-        # Cache file paths so we dont have to iterate multiple times
-        if self.file_paths:
-            return self.file_paths
-
-        file_path = self.file_config["path"]
-        if not os.path.exists(file_path):
-            raise Exception(f"File path does not exist {file_path}")
-
-        file_paths = []
-        if os.path.isdir(file_path):
-            clean_file_path = os.path.normpath(file_path) + os.sep
-            file_paths = self._get_recursive_file_paths(clean_file_path)
-        elif self.is_valid_filename(file_path):
-            file_paths.append(file_path)
-
-        if not file_paths:
-            raise RuntimeError(
-                f"Stream '{self.name}' has no acceptable files. \
-                    See warning for more detail."
-            )
-        self.file_paths = file_paths
-        return file_paths
+    @override
+    def get_schema(self) -> dict:
+        """Return the schema for a particular file stream."""
+        props = [th.Property(col, th.StringType) for col in self.field_names]
+        return th.PropertiesList(*props).to_dict()
 
+    @override
     def is_valid_filename(self, file_path: str) -> bool:
         """Return a boolean of whether the file includes CSV extension."""
         is_valid = True
         if file_path[-4:] != ".csv":
             is_valid = False
-            self.logger.warning(f"Skipping non-csv file '{file_path}'")
             self.logger.warning(
-                "Please provide a CSV file that ends with '.csv'; e.g. 'users.csv'"
+                "Skipping non-csv file '%s', please provide a CSV file that ends with "
+                "'.csv'; e.g. 'users.csv'",
+                file_path,
             )
         return is_valid
 
+    @override
     def get_rows(self, file_path: str) -> t.Iterable[list]:
         """Return a generator of the rows in a particular CSV file."""
         encoding = self.file_config.get("encoding", None)
@@ -118,42 +128,5 @@ def get_rows(self, file_path: str) -> t.Iterable[list]:
             skipinitialspace=self.file_config.get("skipinitialspace", False),
             strict=self.file_config.get("strict", False),
         )
-        with open(file_path, encoding=encoding) as f:
+        with self.fs.open(file_path, mode="r", encoding=encoding) as f:
             yield from csv.reader(f, dialect="tap_dialect")
-
-    @property
-    def schema(self) -> dict:
-        """Return dictionary of record schema.
-
-        Dynamically detect the json schema for the stream.
-        This is evaluated prior to any records being retrieved.
-        """
-        properties: list[th.Property] = []
-        self.primary_keys = self.file_config.get("keys", [])
-
-        for file_path in self.get_file_paths():
-            for header in self.get_rows(file_path):  # noqa: B007
-                break
-            break
-
-        properties.extend(th.Property(column, th.StringType()) for column in header)
-        # If enabled, add file's metadata to output
-        if self.config.get("add_metadata_columns", False):
-            header = [
-                SDC_SOURCE_FILE_COLUMN,
-                SDC_SOURCE_FILE_MTIME_COLUMN,
-                SDC_SOURCE_LINENO_COLUMN,
-                *header,
-            ]
-
-            properties.extend(
-                (
-                    th.Property(SDC_SOURCE_FILE_COLUMN, th.StringType),
-                    th.Property(SDC_SOURCE_FILE_MTIME_COLUMN, th.DateTimeType),
-                    th.Property(SDC_SOURCE_LINENO_COLUMN, th.IntegerType),
-                )
-            )
-        # Cache header for future use
-        self.header = header
-
-        return th.PropertiesList(*properties).to_dict()
diff --git a/tap_csv/file_stream.py b/tap_csv/file_stream.py
new file mode 100644
index 0000000..4e81510
--- /dev/null
+++ b/tap_csv/file_stream.py
@@ -0,0 +1,132 @@
+"""Stream class for file-based streams."""
+
+from __future__ import annotations
+
+import abc
+import os
+import typing as t
+from functools import cached_property
+
+import fsspec
+from singer_sdk.streams import Stream
+
+
+class FileStreamError(Exception):
+    """Exception for file stream errors."""
+
+
+class FileStream(Stream, metaclass=abc.ABCMeta):
+    """Abstract class for file streams."""
+
+    def __init__(self, filesystem: str, *args, options: dict[str, t.Any], **kwargs):
+        """Initialize FileStream."""
+        # Cache file_config so we don't need to go iterating the config list again later
+        self.file_config = kwargs.pop("file_config")
+        self.fs: fsspec.AbstractFileSystem = fsspec.filesystem(filesystem, **options)
+        self._file_paths: list[str] = []
+
+        super().__init__(*args, **kwargs)
+
+    def _get_recursive_file_paths(self, file_path: str) -> list:
+        file_paths = []
+
+        for dirpath, _, filenames in self.fs.walk(file_path):
+            for filename in filenames:
+                file_path = os.path.join(dirpath, filename)
+                if self.is_valid_filename(file_path):
+                    file_paths.append(file_path)
+
+        return file_paths
+
+    def get_file_paths(self) -> list[str]:
+        """Return a list of file paths to read.
+
+        This tap accepts file names and directories so it will detect
+        directories and iterate files inside.
+        """
+        # Cache file paths so we don't have to iterate multiple times
+        if self._file_paths:
+            return self._file_paths
+
+        file_path = self.file_config["path"]
+        if not self.fs.exists(file_path):
+            errmsg = f"File path does not exist {file_path}"
+            raise FileStreamError(errmsg)
+
+        file_paths = []
+        if self.fs.isdir(file_path):
+            clean_file_path = os.path.normpath(file_path) + os.sep
+            file_paths = self._get_recursive_file_paths(clean_file_path)
+
+        elif self.is_valid_filename(file_path):
+            file_paths.append(file_path)
+
+        if not file_paths:
+            msg = f"Stream '{self.name}' has no acceptable files"
+            raise FileStreamError(msg)
+
+        self._file_paths = file_paths
+
+        return self._file_paths
+
+    def get_all_field_names(self) -> list[str]:
+        """Return a list of all field names, including metadata columns.
+
+        If metadata columns are enabled, they will be **prepended** to the list.
+        """
+        fields = list(self.field_names)
+        if self.include_metadata_columns:
+            fields = [*self.metadata_fields, *fields]
+
+        return fields
+
+    @cached_property
+    def include_metadata_columns(self) -> bool:
+        """Return a boolean of whether to include metadata columns."""
+        return self.config.get("add_metadata_columns", False)
+
+    @property
+    def metadata_fields(self) -> t.Iterable[str]:
+        """Get an iterable of metadata column names."""
+        return []
+
+    def get_metadata_columns_schemas(self) -> t.Iterable[tuple[str, dict]]:
+        """Get an iterable of metadata column names and schemata."""
+        return []
+
+    @property
+    @abc.abstractmethod
+    def field_names(self) -> t.Sequence[str]:
+        """A sequence of field names for the stream."""
+        ...
+
+    @abc.abstractmethod
+    def get_schema(self) -> dict:
+        """Return the schema for a particular file stream."""
+        ...
+
+    @abc.abstractmethod
+    def is_valid_filename(self, file_path: str) -> bool:
+        """Return a boolean of whether the file name is valid for the format."""
+        ...
+
+    @abc.abstractmethod
+    def get_rows(self, file_path: str) -> t.Iterable[list]:
+        """Return a generator of the rows in a particular file."""
+        ...
+
+    @cached_property
+    def schema(self) -> dict:
+        """Return dictionary of record schema.
+
+        Dynamically detect the json schema for the stream.
+        This is evaluated prior to any records being retrieved.
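+        Subclasses provide the format-specific schema via `get_schema()`, and
+        the metadata column schemas are merged in here when enabled.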
+ """ + _schema = self.get_schema() + + # If enabled, add file's metadata to output + if self.include_metadata_columns: + metadata_schema = self.get_metadata_columns_schemas() + _schema["properties"].update(dict(metadata_schema)) + + return _schema diff --git a/tap_csv/filesystem_config.py b/tap_csv/filesystem_config.py new file mode 100644 index 0000000..c522b86 --- /dev/null +++ b/tap_csv/filesystem_config.py @@ -0,0 +1,87 @@ +"""JSON Schema for each filesystem configuration.""" + +from __future__ import annotations + +from singer_sdk import typing as th # JSON schema typing helpers + +FTP = th.Property( + "ftp", + th.ObjectType( + th.Property( + "host", + th.StringType, + required=True, + description="FTP server host", + ), + th.Property( + "port", + th.IntegerType, + default=21, + description="FTP server port", + ), + th.Property( + "username", + th.StringType, + description="FTP username", + ), + th.Property( + "password", + th.StringType, + secret=True, + description="FTP password", + ), + th.Property( + "encoding", + th.StringType, + default="utf-8", + description="FTP server encoding", + ), + ), + description="FTP connection settings", +) + +GITHUB = th.Property( + "github", + th.ObjectType( + th.Property( + "org", + th.StringType, + required=True, + description=("GitHub organization or user where the repository is located"), + ), + th.Property( + "repo", + th.StringType, + required=True, + description="GitHub repository", + ), + th.Property( + "username", + th.StringType, + required=False, + description="GitHub username", + ), + th.Property( + "token", + th.StringType, + required=False, + secret=True, + description="GitHub token", + ), + ), + description="GitHub connection settings", +) + +DROPBOX = th.Property( + "dropbox", + th.ObjectType( + th.Property( + "token", + th.StringType, + secret=True, + required=True, + description="Dropbox token", + ), + ), + description="Dropbox connection settings", +) diff --git a/tap_csv/tap.py b/tap_csv/tap.py index 5508a40..43abb44 100644 --- a/tap_csv/tap.py +++ b/tap_csv/tap.py @@ -4,14 +4,14 @@ import json import os -from typing import List from singer_sdk import Stream, Tap from singer_sdk import typing as th # JSON schema typing helpers +from singer_sdk.exceptions import ConfigValidationError from singer_sdk.helpers._classproperty import classproperty from singer_sdk.helpers.capabilities import TapCapabilities -from tap_csv.client import CSVStream +from . import client, filesystem_config class TapCSV(Tap): @@ -38,12 +38,28 @@ class TapCSV(Tap): th.Property("strict", th.BooleanType, required=False), ) ), - description="An array of csv file stream settings.", + description="An array of csv file stream settings", ), + th.Property( + "filesystem", + th.StringType, + required=False, + default="local", + description="The filesystem to use for reading files", + allowed_values=[ + "local", + "ftp", + "github", + "dropbox", + ], + ), + filesystem_config.FTP, + filesystem_config.GITHUB, + filesystem_config.DROPBOX, th.Property( "csv_files_definition", th.StringType, - description="A path to the JSON file holding an array of file settings.", + description="A path to the JSON file holding an array of file settings", ), th.Property( "add_metadata_columns", @@ -52,14 +68,14 @@ class TapCSV(Tap): default=False, description=( "When True, add the metadata columns (`_sdc_source_file`, " - "`_sdc_source_file_mtime`, `_sdc_source_lineno`) to output." 
+ "`_sdc_source_file_mtime`, `_sdc_source_lineno`) to output" ), ), ).to_dict() @classproperty def capabilities(self) -> list[TapCapabilities]: - """Get tap capabilites.""" + """Get tap capabilities.""" return [ TapCapabilities.CATALOG, TapCapabilities.DISCOVER, @@ -78,7 +94,7 @@ def get_file_configs(self) -> list[dict]: with open(csv_files_definition) as f: csv_files = json.load(f) else: - self.logger.error(f"tap-csv: '{csv_files_definition}' file not found") + self.logger.error("tap-csv: '%s' file not found", csv_files_definition) exit(1) if not csv_files: self.logger.error("No CSV file definitions found.") @@ -87,11 +103,20 @@ def get_file_configs(self) -> list[dict]: def discover_streams(self) -> list[Stream]: """Return a list of discovered streams.""" + filesystem = self.config["filesystem"] + + if filesystem != "local" and filesystem not in self.config: + msg = "Misconfigured filesystem" + errors = [f"Missing filesystem options for {filesystem}"] + raise ConfigValidationError(msg, errors=errors) + return [ - CSVStream( + client.CSVStream( tap=self, name=file_config.get("entity"), file_config=file_config, + filesystem=filesystem, + options=self.config.get(filesystem, {}), ) for file_config in self.get_file_configs() ] diff --git a/tap_csv/tests/test_core.py b/tap_csv/tests/test_core.py deleted file mode 100644 index cdf5c02..0000000 --- a/tap_csv/tests/test_core.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Tests standard tap features using the built-in SDK tests library.""" - -from __future__ import annotations - -import os - -from singer_sdk.testing import get_standard_tap_tests - -from tap_csv.tap import TapCSV - - -# Run standard built-in tap tests from the SDK: -def test_standard_tap_tests(): - """Run standard tap tests from the SDK.""" - test_data_dir = os.path.dirname(os.path.abspath(__file__)) - SAMPLE_CONFIG = { - "files": [ - { - "entity": "test", - "path": f"{test_data_dir}/data/alphabet.csv", - "keys": [], - } - ] - } - tests = get_standard_tap_tests(TapCSV, config=SAMPLE_CONFIG) - for test in tests: - test() - - -# Run standard built-in tap tests from the SDK, with different encoding: -def test_standard_tap_tests_encoding(): - """Run standard built-in tap tests from the SDK, with different encoding.""" - test_data_dir = os.path.dirname(os.path.abspath(__file__)) - SAMPLE_CONFIG = { - "files": [ - { - "entity": "test", - "path": f"{test_data_dir}/data/alphabet_encoding.csv", - "keys": [], - "encoding": "latin1", - } - ] - } - tests = get_standard_tap_tests(TapCSV, config=SAMPLE_CONFIG) - for test in tests: - test() - - -# Run standard built-in tap tests from the SDK, with different CSV dialect settings: -def test_standard_tap_tests_csv_dialect(): - """Run standard built-in tap tests from the SDK. - - With different CSV dialect settings. 
- """ - test_data_dir = os.path.dirname(os.path.abspath(__file__)) - SAMPLE_CONFIG = { - "files": [ - { - "entity": "test", - "path": f"{test_data_dir}/data/alphabet_encoding.csv", - "keys": [], - "delimiter": ",", - "doublequote": True, - "escapechar": "^", - "quotechar": '"', - "skipinitialspace": True, - "strict": True, - } - ] - } - tests = get_standard_tap_tests(TapCSV, config=SAMPLE_CONFIG) - for test in tests: - test() - - -# Run standard built-in tap tests from the SDK, with metadata columns included: -def test_standard_tap_tests_metadata_cols(): - """Run standard tap tests from the SDK, with metadata columns included.""" - test_data_dir = os.path.dirname(os.path.abspath(__file__)) - SAMPLE_CONFIG = { - "add_metadata_columns": True, - "files": [ - { - "entity": "test", - "path": f"{test_data_dir}/data/alphabet.csv", - "keys": [], - } - ], - } - tests = get_standard_tap_tests(TapCSV, config=SAMPLE_CONFIG) - for test in tests: - test() diff --git a/tap_csv/tests/__init__.py b/tests/__init__.py similarity index 100% rename from tap_csv/tests/__init__.py rename to tests/__init__.py diff --git a/tap_csv/tests/data/alphabet.csv b/tests/data/alphabet.csv similarity index 100% rename from tap_csv/tests/data/alphabet.csv rename to tests/data/alphabet.csv diff --git a/tap_csv/tests/data/alphabet_encoding.csv b/tests/data/alphabet_encoding.csv similarity index 100% rename from tap_csv/tests/data/alphabet_encoding.csv rename to tests/data/alphabet_encoding.csv diff --git a/tap_csv/tests/data/subfolder1/alphabet.csv b/tests/data/subfolder1/alphabet.csv similarity index 100% rename from tap_csv/tests/data/subfolder1/alphabet.csv rename to tests/data/subfolder1/alphabet.csv diff --git a/tap_csv/tests/data/subfolder1/subfolder2/alphabet.csv b/tests/data/subfolder1/subfolder2/alphabet.csv similarity index 100% rename from tap_csv/tests/data/subfolder1/subfolder2/alphabet.csv rename to tests/data/subfolder1/subfolder2/alphabet.csv diff --git a/tap_csv/tests/test_client.py b/tests/test_client.py similarity index 79% rename from tap_csv/tests/test_client.py rename to tests/test_client.py index cc597d9..11b80a7 100644 --- a/tap_csv/tests/test_client.py +++ b/tests/test_client.py @@ -4,7 +4,7 @@ import os -from tap_csv.tap import CSVStream, TapCSV +from tap_csv import client, tap def test_get_file_paths_recursively(): @@ -21,10 +21,12 @@ def test_get_file_paths_recursively(): ] } - stream = CSVStream( - tap=TapCSV(config=SAMPLE_CONFIG, catalog={}, state={}), + stream = client.CSVStream( + tap=tap.TapCSV(config=SAMPLE_CONFIG, catalog={}, state={}), name="test_recursive", file_config=SAMPLE_CONFIG.get("files")[0], + filesystem="local", + options={}, ) assert stream.get_file_paths() == [ f"{test_data_dir}/data/subfolder1/alphabet.csv", diff --git a/tests/test_core.py b/tests/test_core.py new file mode 100644 index 0000000..08f8d0b --- /dev/null +++ b/tests/test_core.py @@ -0,0 +1,74 @@ +"""Tests standard tap features using the built-in SDK tests library.""" + +from __future__ import annotations + +import os + +from singer_sdk.testing import get_tap_test_class + +from tap_csv.tap import TapCSV + +TEST_DATA_DIR = os.path.dirname(os.path.abspath(__file__)) + +TestTapCSV = get_tap_test_class( + TapCSV, + config={ + "files": [ + { + "entity": "test", + "path": f"{TEST_DATA_DIR}/data/alphabet.csv", + "keys": [], + }, + ], + }, +) + + +TestTapCSVEncoding = get_tap_test_class( + TapCSV, + config={ + "files": [ + { + "entity": "test", + "path": f"{TEST_DATA_DIR}/data/alphabet_encoding.csv", + "keys": [], + 
"encoding": "latin1", + }, + ], + }, +) + + +TestCSVDialect = get_tap_test_class( + TapCSV, + config={ + "files": [ + { + "entity": "test", + "path": f"{TEST_DATA_DIR}/data/alphabet_encoding.csv", + "keys": [], + "delimiter": ",", + "doublequote": True, + "escapechar": "^", + "quotechar": '"', + "skipinitialspace": True, + "strict": True, + }, + ], + }, +) + + +TestTapCSVMetadataCols = get_tap_test_class( + TapCSV, + config={ + "add_metadata_columns": True, + "files": [ + { + "entity": "test", + "path": f"{TEST_DATA_DIR}/data/alphabet.csv", + "keys": [], + }, + ], + }, +) diff --git a/tox.ini b/tox.ini index 4694ab8..20d7f77 100644 --- a/tox.ini +++ b/tox.ini @@ -32,4 +32,4 @@ commands = poetry install -v poetry run ruff check --diff tap_csv/ poetry run ruff format --check tap_csv - poetry run mypy tap_csv --exclude='tap_csv/tests' --ignore-missing-imports + poetry run mypy . --exclude='tests' --ignore-missing-imports