Improved type annotation #7429


Open · saiden89 wants to merge 25 commits into main from type-annotation
Commits (25)
54f80c5  Add type hint for lock_file parameter in FileLock constructor (saiden89, Feb 24, 2025)
ec2c319  Add type hints for SqlDatasetReader and SqlDatasetWriter methods (saiden89, Feb 24, 2025)
7ede1f2  add type annotations (saiden89, Feb 24, 2025)
95e9831  fix type hints for cache_dir parameter and read method return type in… (saiden89, Feb 24, 2025)
f4e8893  Refactor type hints for cache_dir parameter and update read method re… (saiden89, Feb 24, 2025)
c2ab178  Update type hints for JsonDatasetReader and JsonDatasetWriter methods (saiden89, Feb 24, 2025)
950d2ae  improve type annotations (saiden89, Feb 24, 2025)
b1b41ad  improve type annotations (saiden89, Feb 24, 2025)
8414292  improve type annotations (saiden89, Feb 24, 2025)
a895a82  improve type hints (saiden89, Feb 24, 2025)
d975f59  improve type annotations (saiden89, Feb 26, 2025)
c9c01a6  improved type annotations (saiden89, Feb 26, 2025)
8858207  improved type annotations (saiden89, Feb 26, 2025)
af95bbf  improve type annotations (saiden89, Feb 26, 2025)
a6ab6cc  improve type annotations (saiden89, Feb 26, 2025)
382829a  improve type annotations (saiden89, Feb 27, 2025)
a09ce1d  improve type annotations (saiden89, Feb 27, 2025)
47e4c55  improve type annotations (saiden89, Feb 27, 2025)
81cb84f  improve type annotations (saiden89, Feb 27, 2025)
68ade8b  improve type annotations (saiden89, Feb 27, 2025)
b8a7f07  improve type annotations (saiden89, Feb 27, 2025)
efccfb2  format and lint (saiden89, Feb 27, 2025)
2ec4303  annotate with list instead of deprecated typing.List (saiden89, Feb 27, 2025)
992a88e  Merge branch 'main' into type-annotation (saiden89, Mar 6, 2025)
47e77fe  Merge branch 'main' into type-annotation (saiden89, Mar 11, 2025)
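
Commit 2ec4303 applies a specific modernization: since Python 3.9 (PEP 585), the built-in containers are themselves generic, and `typing.List`, `typing.Dict`, etc. are deprecated aliases. A minimal sketch of the substitution (the function below is hypothetical, not taken from the PR):

```python
from typing import List, Optional  # List imported only to show the old spelling


# Old style: deprecated typing.List alias
def pick_columns_old(names: Optional[List[str]]) -> List[str]:
    return [n for n in (names or []) if not n.startswith("_")]


# New style, as in commit 2ec4303: built-in list used as a generic (PEP 585)
def pick_columns_new(names: Optional[list[str]]) -> list[str]:
    return [n for n in (names or []) if not n.startswith("_")]
```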
431 changes: 230 additions & 201 deletions src/datasets/arrow_dataset.py

Large diffs are not rendered by default.

26 changes: 13 additions & 13 deletions src/datasets/arrow_writer.py
@@ -139,7 +139,7 @@ def __init__(
         type: Optional[FeatureType] = None,
         try_type: Optional[FeatureType] = None,
         optimized_int_type: Optional[FeatureType] = None,
-    ):
+    ) -> None:
         # assert type is None or try_type is None,
         if type is not None and try_type is not None:
             raise ValueError("You cannot specify both type and try_type")
@@ -345,7 +345,7 @@ def __init__(
         unit: str = "examples",
         embed_local_files: bool = False,
         storage_options: Optional[dict] = None,
-    ):
+    ) -> None:
         if path is None and stream is None:
             raise ValueError("At least one of path and stream must be provided.")
         if features is not None:
@@ -396,17 +396,17 @@ def __init__(
         self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
         self.hkey_record = []
 
-    def __len__(self):
+    def __len__(self) -> int:
         """Return the number of written and staged examples"""
         return self._num_examples + len(self.current_examples) + len(self.current_rows)
 
-    def __enter__(self):
+    def __enter__(self) -> object:
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
 
-    def close(self):
+    def close(self) -> None:
         # Try closing if opened; if closed: pyarrow.lib.ArrowInvalid: Invalid operation on closed file
         if self.pa_writer:  # it might be None
             try:
@@ -416,7 +416,7 @@ def close(self):
         if self._closable_stream and not self.stream.closed:
             self.stream.close()  # This also closes self.pa_writer if it is opened
 
-    def _build_writer(self, inferred_schema: pa.Schema):
+    def _build_writer(self, inferred_schema: pa.Schema) -> None:
         schema = self.schema
         inferred_features = Features.from_arrow_schema(inferred_schema)
         if self._features is not None:
@@ -462,7 +462,7 @@ def _build_metadata(info: DatasetInfo, fingerprint: Optional[str] = None) -> dict
         metadata["fingerprint"] = fingerprint
         return {"huggingface": json.dumps(metadata)}
 
-    def write_examples_on_file(self):
+    def write_examples_on_file(self) -> None:
         """Write stored examples from the write-pool of examples. It makes a table out of the examples and writes it."""
         if not self.current_examples:
             return
@@ -496,7 +496,7 @@ def write_examples_on_file(self):
         self.write_batch(batch_examples=batch_examples)
         self.current_examples = []
 
-    def write_rows_on_file(self):
+    def write_rows_on_file(self) -> None:
         """Write stored rows from the write-pool of rows. It concatenates the single-row tables and writes the resulting table."""
         if not self.current_rows:
             return
@@ -509,7 +509,7 @@ def write(
         example: dict[str, Any],
         key: Optional[Union[str, int, bytes]] = None,
         writer_batch_size: Optional[int] = None,
-    ):
+    ) -> None:
         """Add a given (Example,Key) pair to the write-pool of examples which is written to file.
 
         Args:
@@ -537,7 +537,7 @@ def write(
 
         self.write_examples_on_file()
 
-    def check_duplicate_keys(self):
+    def check_duplicate_keys(self) -> None:
         """Raises error if duplicates found in a batch"""
         tmp_record = set()
         for hash, key in self.hkey_record:
@@ -552,7 +552,7 @@
             else:
                 tmp_record.add(hash)
 
-    def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None):
+    def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None) -> None:
         """Add a given single-row Table to the write-pool of rows which is written to file.
 
         Args:
@@ -609,7 +609,7 @@ def write_batch(
         pa_table = pa.Table.from_arrays(arrays, schema=schema)
         self.write_table(pa_table, writer_batch_size)
 
-    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
+    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None) -> None:
         """Write a Table to file.
 
         Args:
@@ -627,7 +627,7 @@ def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None
         self._num_examples += pa_table.num_rows
         self.pa_writer.write_table(pa_table, writer_batch_size)
 
-    def finalize(self, close_stream=True):
+    def finalize(self, close_stream: bool = True) -> tuple[int, int]:
         self.write_rows_on_file()
         # In case current_examples < writer_batch_size, but user uses finalize()
         if self._check_duplicates:
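
Most of these annotations make the implicit `None` return explicit; the informative one is `finalize`, now typed `tuple[int, int]`. A hedged usage sketch, assuming only what the hunks above show; reading the returned pair as (num_examples, num_bytes) is an assumption based on common usage, not something the annotation states:

```python
from datasets.arrow_writer import ArrowWriter

# __enter__ returns the writer, so the context-manager form works; note the PR
# annotates it as `object`, so a type checker will not know `writer` is an
# ArrowWriter here (a "ArrowWriter" forward reference, or Self on Python 3.11+,
# would be more precise).
with ArrowWriter(path="demo.arrow") as writer:
    writer.write({"text": "hello"})
    writer.write({"text": "world"})
    # finalize() is now annotated `-> tuple[int, int]`; treating the pair as
    # (num_examples, num_bytes) is an assumption, as noted above.
    num_examples, num_bytes = writer.finalize()
```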
10 changes: 5 additions & 5 deletions src/datasets/builder.py
@@ -1066,10 +1066,10 @@ def _make_split_generators_kwargs(self, prepare_split_kwargs):
 
     def as_dataset(
         self,
-        split: Optional[Split] = None,
-        run_post_process=True,
+        split: Optional[Union[str, Split]] = None,
+        run_post_process: bool = True,
         verification_mode: Optional[Union[VerificationMode, str]] = None,
-        in_memory=False,
+        in_memory: bool = False,
     ) -> Union[Dataset, DatasetDict]:
         """Return a Dataset for the specified split.
 
@@ -1249,7 +1249,7 @@ def as_streaming_dataset(
         self,
         split: Optional[str] = None,
         base_path: Optional[str] = None,
-    ) -> Union[dict[str, IterableDataset], IterableDataset]:
+    ) -> Union[IterableDatasetDict, IterableDataset]:
         if is_remote_filesystem(self._fs):
             raise NotImplementedError(
                 f"Loading a streaming dataset cached in a {type(self._fs).__name__} is not supported yet."
@@ -1437,7 +1437,7 @@ def _prepare_split(
         self,
         split_generator: SplitGenerator,
         check_duplicate_keys: bool,
-        file_format="arrow",
+        file_format: str = "arrow",
         num_proc: Optional[int] = None,
         max_shard_size: Optional[Union[int, str]] = None,
     ):
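
The `as_dataset` change widens `split` to accept plain strings, matching how callers already use it, and `as_streaming_dataset` now advertises the public `IterableDatasetDict` rather than a bare dict. A sketch of the calling patterns these hints describe, assuming a small Hub dataset such as `rotten_tomatoes` purely for illustration:

```python
from datasets import Split, load_dataset_builder

builder = load_dataset_builder("rotten_tomatoes")  # any prepared builder works
builder.download_and_prepare()

# `split` is now Optional[Union[str, Split]], so both spellings type-check;
# `run_post_process` and `in_memory` are explicitly bool.
train = builder.as_dataset(split="train", in_memory=False)
test = builder.as_dataset(split=Split.TEST)

# With split=None, the annotated return Union[Dataset, DatasetDict] is a
# DatasetDict covering every split.
all_splits = builder.as_dataset()
```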