Improved type annotation #7429


Open · saiden89 wants to merge 25 commits into main from type-annotation
Commits (25)
54f80c5  Add type hint for lock_file parameter in FileLock constructor (saiden89, Feb 24, 2025)
ec2c319  Add type hints for SqlDatasetReader and SqlDatasetWriter methods (saiden89, Feb 24, 2025)
7ede1f2  add type annotations (saiden89, Feb 24, 2025)
95e9831  fix type hints for cache_dir parameter and read method return type in… (saiden89, Feb 24, 2025)
f4e8893  Refactor type hints for cache_dir parameter and update read method re… (saiden89, Feb 24, 2025)
c2ab178  Update type hints for JsonDatasetReader and JsonDatasetWriter methods (saiden89, Feb 24, 2025)
950d2ae  improve type annotations (saiden89, Feb 24, 2025)
b1b41ad  improve type annotations (saiden89, Feb 24, 2025)
8414292  improve type annotations (saiden89, Feb 24, 2025)
a895a82  improve type hints (saiden89, Feb 24, 2025)
d975f59  improve type annotations (saiden89, Feb 26, 2025)
c9c01a6  improved type annotations (saiden89, Feb 26, 2025)
8858207  improved type annotations (saiden89, Feb 26, 2025)
af95bbf  improve type annotations (saiden89, Feb 26, 2025)
a6ab6cc  improve type annotations (saiden89, Feb 26, 2025)
382829a  improve type annotations (saiden89, Feb 27, 2025)
a09ce1d  improve type annotations (saiden89, Feb 27, 2025)
47e4c55  improve type annotations (saiden89, Feb 27, 2025)
81cb84f  improve type annotations (saiden89, Feb 27, 2025)
68ade8b  improve type annotations (saiden89, Feb 27, 2025)
b8a7f07  improve type annotations (saiden89, Feb 27, 2025)
efccfb2  format and lint (saiden89, Feb 27, 2025)
2ec4303  annotate with list instead of deprecated typing.List (saiden89, Feb 27, 2025)
992a88e  Merge branch 'main' into type-annotation (saiden89, Mar 6, 2025)
47e77fe  Merge branch 'main' into type-annotation (saiden89, Mar 11, 2025)
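
Commit 2ec4303 applies a specific modernization: since Python 3.9 (PEP 585), the built-in containers are themselves generic, and `typing.List`, `typing.Dict`, etc. are deprecated aliases. A minimal sketch of the substitution (the function below is hypothetical, not taken from the PR):

```python
from typing import List, Optional  # List imported only to show the old spelling


# Old style: deprecated typing.List alias
def pick_columns_old(names: Optional[List[str]]) -> List[str]:
    return [n for n in (names or []) if not n.startswith("_")]


# New style, as in commit 2ec4303: built-in list used as a generic (PEP 585)
def pick_columns_new(names: Optional[list[str]]) -> list[str]:
    return [n for n in (names or []) if not n.startswith("_")]
```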
431 changes: 230 additions & 201 deletions src/datasets/arrow_dataset.py

Large diffs are not rendered by default.

26 changes: 13 additions & 13 deletions src/datasets/arrow_writer.py
@@ -139,7 +139,7 @@ def __init__(
         type: Optional[FeatureType] = None,
         try_type: Optional[FeatureType] = None,
         optimized_int_type: Optional[FeatureType] = None,
-    ):
+    ) -> None:
         # assert type is None or try_type is None,
         if type is not None and try_type is not None:
             raise ValueError("You cannot specify both type and try_type")
@@ -345,7 +345,7 @@ def __init__(
         unit: str = "examples",
         embed_local_files: bool = False,
         storage_options: Optional[dict] = None,
-    ):
+    ) -> None:
         if path is None and stream is None:
             raise ValueError("At least one of path and stream must be provided.")
         if features is not None:
@@ -396,17 +396,17 @@ def __init__(
         self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
         self.hkey_record = []
 
-    def __len__(self):
+    def __len__(self) -> int:
         """Return the number of written and staged examples"""
         return self._num_examples + len(self.current_examples) + len(self.current_rows)
 
-    def __enter__(self):
+    def __enter__(self) -> object:
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
 
-    def close(self):
+    def close(self) -> None:
         # Try closing if opened; if closed: pyarrow.lib.ArrowInvalid: Invalid operation on closed file
         if self.pa_writer:  # it might be None
             try:
@@ -416,7 +416,7 @@ def close(self):
         if self._closable_stream and not self.stream.closed:
             self.stream.close()  # This also closes self.pa_writer if it is opened
 
-    def _build_writer(self, inferred_schema: pa.Schema):
+    def _build_writer(self, inferred_schema: pa.Schema) -> None:
         schema = self.schema
         inferred_features = Features.from_arrow_schema(inferred_schema)
         if self._features is not None:
@@ -462,7 +462,7 @@ def _build_metadata(info: DatasetInfo, fingerprint: Optional[str] = None) -> dict
         metadata["fingerprint"] = fingerprint
         return {"huggingface": json.dumps(metadata)}
 
-    def write_examples_on_file(self):
+    def write_examples_on_file(self) -> None:
         """Write stored examples from the write-pool of examples. It makes a table out of the examples and writes it."""
         if not self.current_examples:
             return
@@ -496,7 +496,7 @@ def write_examples_on_file(self):
         self.write_batch(batch_examples=batch_examples)
         self.current_examples = []
 
-    def write_rows_on_file(self):
+    def write_rows_on_file(self) -> None:
         """Write stored rows from the write-pool of rows. It concatenates the single-row tables and writes the resulting table."""
         if not self.current_rows:
             return
@@ -509,7 +509,7 @@ def write(
         example: dict[str, Any],
         key: Optional[Union[str, int, bytes]] = None,
         writer_batch_size: Optional[int] = None,
-    ):
+    ) -> None:
         """Add a given (Example,Key) pair to the write-pool of examples which is written to file.
 
         Args:
@@ -537,7 +537,7 @@ def write(
 
         self.write_examples_on_file()
 
-    def check_duplicate_keys(self):
+    def check_duplicate_keys(self) -> None:
         """Raises error if duplicates found in a batch"""
         tmp_record = set()
         for hash, key in self.hkey_record:
@@ -552,7 +552,7 @@
             else:
                 tmp_record.add(hash)
 
-    def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None):
+    def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None) -> None:
         """Add a given single-row Table to the write-pool of rows which is written to file.
 
         Args:
@@ -609,7 +609,7 @@ def write_batch(
         pa_table = pa.Table.from_arrays(arrays, schema=schema)
         self.write_table(pa_table, writer_batch_size)
 
-    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
+    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None) -> None:
         """Write a Table to file.
 
         Args:
@@ -627,7 +627,7 @@ def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None
         self._num_examples += pa_table.num_rows
         self.pa_writer.write_table(pa_table, writer_batch_size)
 
-    def finalize(self, close_stream=True):
+    def finalize(self, close_stream: bool = True) -> tuple[int, int]:
         self.write_rows_on_file()
         # In case current_examples < writer_batch_size, but user uses finalize()
         if self._check_duplicates:
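
Most of these annotations make the implicit `None` return explicit; the informative one is `finalize`, now typed `tuple[int, int]`. A hedged usage sketch, assuming only what the hunks above show; reading the returned pair as (num_examples, num_bytes) is an assumption based on common usage, not something the annotation states:

```python
from datasets.arrow_writer import ArrowWriter

# __enter__ returns the writer, so the context-manager form works; note the PR
# annotates it as `object`, so a type checker will not know `writer` is an
# ArrowWriter here (a "ArrowWriter" forward reference, or Self on Python 3.11+,
# would be more precise).
with ArrowWriter(path="demo.arrow") as writer:
    writer.write({"text": "hello"})
    writer.write({"text": "world"})
    # finalize() is now annotated `-> tuple[int, int]`; treating the pair as
    # (num_examples, num_bytes) is an assumption, as noted above.
    num_examples, num_bytes = writer.finalize()
```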
10 changes: 5 additions & 5 deletions src/datasets/builder.py
@@ -1066,10 +1066,10 @@ def _make_split_generators_kwargs(self, prepare_split_kwargs):
 
     def as_dataset(
         self,
-        split: Optional[Split] = None,
-        run_post_process=True,
+        split: Optional[Union[str, Split]] = None,
+        run_post_process: bool = True,
         verification_mode: Optional[Union[VerificationMode, str]] = None,
-        in_memory=False,
+        in_memory: bool = False,
     ) -> Union[Dataset, DatasetDict]:
         """Return a Dataset for the specified split.
 
@@ -1249,7 +1249,7 @@ def as_streaming_dataset(
         self,
         split: Optional[str] = None,
         base_path: Optional[str] = None,
-    ) -> Union[dict[str, IterableDataset], IterableDataset]:
+    ) -> Union[IterableDatasetDict, IterableDataset]:
         if is_remote_filesystem(self._fs):
             raise NotImplementedError(
                 f"Loading a streaming dataset cached in a {type(self._fs).__name__} is not supported yet."
@@ -1437,7 +1437,7 @@ def _prepare_split(
         self,
         split_generator: SplitGenerator,
         check_duplicate_keys: bool,
-        file_format="arrow",
+        file_format: str = "arrow",
         num_proc: Optional[int] = None,
         max_shard_size: Optional[Union[int, str]] = None,
     ):
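
The `as_dataset` change widens `split` to accept plain strings, matching how callers already use it, and `as_streaming_dataset` now advertises the public `IterableDatasetDict` rather than a bare dict. A sketch of the calling patterns these hints describe, assuming a small Hub dataset such as `rotten_tomatoes` purely for illustration:

```python
from datasets import Split, load_dataset_builder

builder = load_dataset_builder("rotten_tomatoes")  # any prepared builder works
builder.download_and_prepare()

# `split` is now Optional[Union[str, Split]], so both spellings type-check;
# `run_post_process` and `in_memory` are explicitly bool.
train = builder.as_dataset(split="train", in_memory=False)
test = builder.as_dataset(split=Split.TEST)

# With split=None, the annotated return Union[Dataset, DatasetDict] is a
# DatasetDict covering every split.
all_splits = builder.as_dataset()
```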