|
14 | 14 | # KIND, either express or implied. See the License for the
|
15 | 15 | # specific language governing permissions and limitations
|
16 | 16 | # under the License.
|
| 17 | +import gzip |
| 18 | +import os |
17 | 19 |
|
18 | 20 | import numpy as np
|
19 | 21 | import pyarrow as pa
|
20 | 22 | import pyarrow.dataset as ds
|
21 | 23 | import pytest
|
22 |
| -import gzip |
23 | 24 |
|
24 | 25 | from datafusion import udf
|
25 | 26 |
|
@@ -154,6 +155,56 @@ def test_register_dataset(ctx, tmp_path):
|
154 | 155 | assert result.to_pydict() == {"cnt": [100]}
|
155 | 156 |
|
156 | 157 |
|
def test_register_json(ctx, tmp_path):
    """Register JSON sources in several ways and check that each is queryable.

    Exercises registration from the fixture path, from the same path wrapped
    in ``str()``, with a bounded ``schema_infer_max_records``, from a
    gzip-compressed copy, and with an explicitly supplied schema. Finally
    verifies that an unsupported ``file_compression_type`` raises
    ``ValueError``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    json_path = os.path.join(here, "data_test_context", "data.json")
    compressed_path = tmp_path / "data.json.gz"

    # Write a gzip-compressed copy of the fixture into the temp directory.
    with open(json_path, "rb") as src, gzip.open(compressed_path, "wb") as dst:
        dst.writelines(src)

    ctx.register_json("json", json_path)
    ctx.register_json("json1", str(json_path))
    ctx.register_json("json2", json_path, schema_infer_max_records=10)
    ctx.register_json(
        "json_gzip",
        compressed_path,
        file_extension="gz",
        file_compression_type="gzip",
    )

    # This table is registered against the directory containing this test
    # file, with a caller-provided schema instead of an inferred one.
    explicit_schema = pa.schema(
        [
            ("some_int", pa.int16()),
            ("some_bytes", pa.string()),
            ("some_floats", pa.float32()),
        ]
    )
    ctx.register_json("json3", here, schema=explicit_schema)

    expected_tables = {"json", "json1", "json2", "json3", "json_gzip"}
    assert ctx.tables() == expected_tables

    # Every table backed by the fixture file must report the same count.
    for table in ("json", "json1", "json2", "json_gzip"):
        batches = ctx.sql(f'SELECT COUNT("B") AS cnt FROM {table}').collect()
        counted = pa.Table.from_batches(batches)
        assert counted.to_pydict() == {"cnt": [3]}

    json3_batches = ctx.sql("SELECT * FROM json3").collect()
    json3_table = pa.Table.from_batches(json3_batches)
    assert json3_table.schema == explicit_schema

    # An unknown compression type must be rejected at registration time.
    with pytest.raises(
        ValueError,
        match="file_compression_type must one of: gzip, bz2, xz, zstd",
    ):
        ctx.register_json("json4", compressed_path, file_compression_type="rar")
| 206 | + |
| 207 | + |
157 | 208 | def test_execute(ctx, tmp_path):
|
158 | 209 | data = [1, 1, 2, 2, 3, 11, 12]
|
159 | 210 |
|
|
0 commit comments