
Commit

docs: Fix various instances of repeated words in docs and comments (#…
alexander-beedie authored Oct 29, 2024
1 parent 7a23e07 commit abe5139
Showing 30 changed files with 34 additions and 33 deletions.
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/mod.rs
@@ -189,7 +189,7 @@ pub trait Array: Send + Sync + dyn_clone::DynClone + 'static {
new
}

-/// Clones this [`Array`] with a new new assigned bitmap.
+/// Clones this [`Array`] with a new assigned bitmap.
/// # Panic
/// This function panics iff `validity.len() != self.len()`.
fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array>;
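The `with_validity` contract above is easier to see in use. A minimal sketch, assuming `polars-arrow`'s `Int32Array` alias and `Bitmap` type (not part of this diff):

```rust
use polars_arrow::array::{Array, Int32Array};
use polars_arrow::bitmap::Bitmap;

fn main() {
    // Three values, all valid to start with.
    let arr = Int32Array::from_slice([1, 2, 3]);

    // Assign a fresh validity bitmap: the middle value becomes null.
    // Per the doc comment, this panics iff bitmap length != array length.
    let validity = Bitmap::from_iter([true, false, true]);
    let masked: Box<dyn Array> = arr.with_validity(Some(validity));
    assert_eq!(masked.null_count(), 1);
}
```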
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/compute/cast/binary_to.rs
@@ -199,7 +199,7 @@ pub fn fixed_size_binary_to_binview(from: &FixedSizeBinaryArray) -> BinaryViewAr
// This is NOT equal to MAX_BYTES_PER_BUFFER because of integer division
let split_point = num_elements_per_buffer * size;

-// This is zero-copy for the buffer since split just increases the the data since
+// This is zero-copy for the buffer since split just increases the data since
let mut buffer = from.values().clone();
let mut buffers = Vec::with_capacity(num_buffers);
for _ in 0..num_buffers - 1 {
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/list/iterator.rs
@@ -152,7 +152,7 @@ impl ListChunked {
let (s, ptr) =
unsafe { unstable_series_container_and_ptr(name, inner_values.clone(), &iter_dtype) };

-// SAFETY: ptr belongs the the Series..
+// SAFETY: ptr belongs the Series..
unsafe {
AmortizedListIter::new(
self.len(),
2 changes: 1 addition & 1 deletion crates/polars-core/src/frame/row/mod.rs
@@ -206,7 +206,7 @@ pub fn rows_to_schema_first_non_null(
.iter_values()
.enumerate()
.filter_map(|(i, dtype)| {
-// double check struct and list types types
+// double check struct and list types
// nested null values can be wrongly inferred by front ends
match dtype {
DataType::Null | DataType::List(_) => Some(i),
2 changes: 1 addition & 1 deletion crates/polars-expr/src/expressions/ternary.rs
@@ -230,7 +230,7 @@ impl PhysicalExpr for TernaryExpr {
// * `zip_with` can be called directly with the series
// * mix of unit literals and AggregatedList
// * `zip_with` can be called with the flat values after the offsets
-//   have been been checked for alignment
+//   have been checked for alignment
let ac_target = non_literal_acs.first().unwrap();

let agg_state_out = match ac_target.agg_state() {
2 changes: 1 addition & 1 deletion crates/polars-expr/src/expressions/window.rs
@@ -754,7 +754,7 @@ where
unsafe { values.set_len(len) }
ChunkedArray::new_vec(ca.name().clone(), values).into_series()
} else {
-// We don't use a mutable bitmap as bits will have have race conditions!
+// We don't use a mutable bitmap as bits will have race conditions!
// A single byte might alias if we write from single threads.
let mut validity: Vec<bool> = vec![false; len];
let validity_ptr = validity.as_mut_ptr();
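For context on that comment: in a packed bitmap eight neighbouring values share one byte, so two threads setting adjacent bits both read-modify-write the same byte, a data race. One `bool` per value gives each index its own byte. A toy illustration, not the window code itself:

```rust
fn main() {
    let mut validity: Vec<bool> = vec![false; 8];
    let addr = validity.as_mut_ptr() as usize;

    // Each thread writes a distinct index; because every element is its
    // own byte, these writes cannot alias. Packed bits would share bytes.
    std::thread::scope(|s| {
        for i in 0..4 {
            s.spawn(move || unsafe { *(addr as *mut bool).add(i) = true });
        }
    });
    assert_eq!(validity.iter().filter(|v| **v).count(), 4);
}
```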
2 changes: 1 addition & 1 deletion crates/polars-ffi/src/version_0.rs
@@ -132,7 +132,7 @@ impl CallerContext {
self.bitflags |= 1 << k
}

-/// Parallelism is done by polars' main engine, the plugin should not run run its own parallelism.
+/// Parallelism is done by polars' main engine, the plugin should not run its own parallelism.
/// If this is `false`, the plugin could use parallelism without (much) contention with polars
/// parallelism strategies.
pub fn parallel(&self) -> bool {
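A hedged sketch of how a plugin might honour the `parallel` flag documented above; the `CallerContext` type is the one shown in this file, while the rayon fallback is purely illustrative:

```rust
use polars_ffi::version_0::CallerContext;
use rayon::prelude::*;

fn sum_values(values: &[f64], context: &CallerContext) -> f64 {
    if context.parallel() {
        // Polars' engine is already parallelising this call:
        // stay single-threaded to avoid contending with its scheduler.
        values.iter().sum()
    } else {
        // The engine is not parallel here; our own parallelism is fine.
        values.par_iter().sum()
    }
}
```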
2 changes: 1 addition & 1 deletion crates/polars-io/src/csv/write/write_impl/serializer.rs
@@ -689,7 +689,7 @@ pub(super) fn serializer_for<'a>(
quote_if_always!(decimal_serializer, scale.unwrap_or(0))
},
_ => {
-polars_bail!(ComputeError: "datatype {dtype} cannot be written to CSV\n\nConsider using JSON or or a binary format.")
+polars_bail!(ComputeError: "datatype {dtype} cannot be written to CSV\n\nConsider using JSON or a binary format.")
},
};
Ok(serializer)
2 changes: 1 addition & 1 deletion crates/polars-io/src/ipc/ipc_file.rs
@@ -1,6 +1,6 @@
//! # (De)serializing Arrows IPC format.
//!
-//! Arrow IPC is a [binary format format](https://arrow.apache.org/docs/python/ipc.html).
+//! Arrow IPC is a [binary format](https://arrow.apache.org/docs/python/ipc.html).
//! It is the recommended way to serialize and deserialize Polars DataFrames as this is most true
//! to the data schema.
//!
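A minimal round-trip through the format described above, assuming the `polars` facade crate with its `ipc` feature enabled (`IpcWriter`/`IpcReader` are the reader/writer pair this module backs):

```rust
use polars::prelude::*;
use std::fs::File;

fn roundtrip(df: &mut DataFrame) -> PolarsResult<DataFrame> {
    // Serialize: Arrow IPC preserves the schema exactly.
    let mut file = File::create("frame.ipc")?;
    IpcWriter::new(&mut file).finish(df)?;

    // Deserialize.
    let file = File::open("frame.ipc")?;
    IpcReader::new(file).finish()
}
```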
2 changes: 1 addition & 1 deletion crates/polars-io/src/ipc/ipc_stream.rs
@@ -1,6 +1,6 @@
//! # (De)serializing Arrows Streaming IPC format.
//!
-//! Arrow Streaming IPC is a [binary format format](https://arrow.apache.org/docs/python/ipc.html).
+//! Arrow Streaming IPC is a [binary format](https://arrow.apache.org/docs/python/ipc.html).
//! It used for sending an arbitrary length sequence of record batches.
//! The format must be processed from start to end, and does not support random access.
//! It is different than IPC, if you can't deserialize a file with `IpcReader::new`, it's probably an IPC Stream File.
2 changes: 1 addition & 1 deletion crates/polars-io/src/parquet/read/read_impl.rs
@@ -43,7 +43,7 @@ fn assert_dtypes(dtype: &ArrowDataType) {
// These should all be casted to the BinaryView / Utf8View variants
D::Utf8 | D::Binary | D::LargeUtf8 | D::LargeBinary => unreachable!(),

-// These should be casted to to Float32
+// These should be casted to Float32
D::Float16 => unreachable!(),

// This should have been converted to a LargeList
2 changes: 1 addition & 1 deletion crates/polars-lazy/src/frame/mod.rs
@@ -1349,7 +1349,7 @@ impl LazyFrame {
right_on: E,
args: JoinArgs,
) -> LazyFrame {
-// if any of the nodes reads from files we must activate this this plan as well.
+// if any of the nodes reads from files we must activate this plan as well.
if other.opt_state.contains(OptFlags::FILE_CACHING) {
self.opt_state |= OptFlags::FILE_CACHING;
}
@@ -144,7 +144,7 @@ fn estimate_unique_count(keys: &[Column], mut sample_size: usize) -> PolarsResul

if keys.len() == 1 {
// we sample as that will work also with sorted data.
-// not that sampling without replacement is very very expensive. don't do that.
+// not that sampling without replacement is *very* expensive. don't do that.
let s = keys[0].sample_n(sample_size, true, false, None).unwrap();
// fast multi-threaded way to get unique.
let groups = s.as_materialized_series().group_tuples(true, false)?;
2 changes: 1 addition & 1 deletion crates/polars-ops/src/frame/join/general.rs
@@ -56,7 +56,7 @@ pub fn _coalesce_full_join(
df_left: &DataFrame,
) -> DataFrame {
// No need to allocate the schema because we already
-// know for certain that the column name for left left is `name`
+// know for certain that the column name for left is `name`
// and for right is `name + suffix`
let schema_left = if keys_left == keys_right {
Schema::default()
8 changes: 4 additions & 4 deletions crates/polars-ops/src/frame/pivot/positioning.rs
@@ -240,13 +240,13 @@ pub(super) fn compute_col_idx(
let col_locations = match column_agg_physical.dtype() {
T::Int32 | T::UInt32 => {
let Some(BitRepr::Small(ca)) = column_agg_physical.bit_repr() else {
-polars_bail!(ComputeError: "Expected 32-bit bit representation to be available. This should never happen");
+polars_bail!(ComputeError: "Expected 32-bit representation to be available; this should never happen");
};
compute_col_idx_numeric(&ca)
},
T::Int64 | T::UInt64 => {
let Some(BitRepr::Large(ca)) = column_agg_physical.bit_repr() else {
-polars_bail!(ComputeError: "Expected 64-bit bit representation to be available. This should never happen");
+polars_bail!(ComputeError: "Expected 64-bit representation to be available; this should never happen");
};
compute_col_idx_numeric(&ca)
},
@@ -413,13 +413,13 @@ pub(super) fn compute_row_idx(
match index_agg_physical.dtype() {
T::Int32 | T::UInt32 => {
let Some(BitRepr::Small(ca)) = index_agg_physical.bit_repr() else {
-polars_bail!(ComputeError: "Expected 32-bit bit representation to be available. This should never happen");
+polars_bail!(ComputeError: "Expected 32-bit representation to be available; this should never happen");
};
compute_row_index(index, &ca, count, index_s.dtype())
},
T::Int64 | T::UInt64 => {
let Some(BitRepr::Large(ca)) = index_agg_physical.bit_repr() else {
-polars_bail!(ComputeError: "Expected 64-bit bit representation to be available. This should never happen");
+polars_bail!(ComputeError: "Expected 64-bit representation to be available; this should never happen");
};
compute_row_index(index, &ca, count, index_s.dtype())
},
2 changes: 1 addition & 1 deletion crates/polars-parquet/src/arrow/read/schema/mod.rs
@@ -40,7 +40,7 @@ impl Default for SchemaInferenceOptions {
///
/// # Error
/// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded,
-/// indicating that that the file's arrow metadata was incorrectly written.
+/// indicating that the file's arrow metadata was incorrectly written.
pub fn infer_schema(file_metadata: &FileMetadata) -> PolarsResult<ArrowSchema> {
infer_schema_with_options(file_metadata, &None)
}
2 changes: 1 addition & 1 deletion crates/polars-parquet/src/parquet/read/page/reader.rs
@@ -97,7 +97,7 @@ impl PageReader {
Self::new_with_page_meta(reader, column.into(), scratch, max_page_size)
}

-/// Create a a new [`PageReader`] with [`PageMetaData`].
+/// Create a new [`PageReader`] with [`PageMetaData`].
///
/// It assumes that the reader has been `sought` (`seek`) to the beginning of `column`.
pub fn new_with_page_meta(
@@ -8,7 +8,7 @@ use crate::pipeline::{morsels_per_sink, FORCE_OOC};
pub(super) struct OocState {
// OOC
// Stores available memory in the system at the start of this sink.
-// and stores the memory used by this this sink.
+// and stores the memory used by this sink.
mem_track: MemTracker,
// sort in-memory or out-of-core
pub(super) ooc: bool,
@@ -13,7 +13,7 @@ use crate::pipeline::morsels_per_sink;
pub(super) struct OocState {
// OOC
// Stores available memory in the system at the start of this sink.
-// and stores the memory used by this this sink.
+// and stores the memory used by this sink.
_mem_track: MemTracker,
// sort in-memory or out-of-core
pub(super) ooc: bool,
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-pipe/src/executors/sinks/joins/cross.rs
@@ -111,7 +111,7 @@ impl Operator for CrossJoinProbe {
_context: &PExecutionContext,
chunk: &DataChunk,
) -> PolarsResult<OperatorResult> {
-// Expected output is size**2, so this needs to be a a small number.
+// Expected output is size**2, so this needs to be a small number.
// However, if one of the DataFrames is much smaller than 250, we want
// to take rather more from the other DataFrame so we don't end up with
// overly small chunks.
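To make that sizing comment concrete: probing `a` rows against `b` rows emits `a * b` rows, so the chunk taken from one side must shrink as the other side grows. A hypothetical version of such a heuristic; the operator's real formula may differ:

```rust
// Aim for roughly 250 * 250 = 62_500 output rows per probe chunk.
fn probe_chunk_size(other_df_height: usize) -> usize {
    const TARGET: usize = 250;
    // When the other side is much smaller than 250, take more rows from
    // this side so the output chunks don't become overly small.
    (TARGET * TARGET).div_ceil(other_df_height.max(1))
}

fn main() {
    assert_eq!(probe_chunk_size(250), 250);  // balanced sides
    assert_eq!(probe_chunk_size(10), 6_250); // tiny other side -> take more
}
```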
2 changes: 1 addition & 1 deletion crates/polars-pipe/src/executors/sinks/sort/sink.rs
@@ -20,7 +20,7 @@ pub struct SortSink {
schema: SchemaRef,
chunks: Vec<DataFrame>,
// Stores available memory in the system at the start of this sink.
-// and stores the memory used by this this sink.
+// and stores the memory used by this sink.
mem_track: MemTracker,
// sort in-memory or out-of-core
ooc: bool,
2 changes: 1 addition & 1 deletion crates/polars-pipe/src/operators/chunks.rs
@@ -39,7 +39,7 @@ pub(crate) fn chunks_to_df_unchecked(chunks: Vec<DataChunk>) -> DataFrame {
///
/// The benefit of having a series of `DataFrame` that are e.g. 4MB each that
/// are then made contiguous is that you're not using a lot of memory (an extra
-/// 4MB), but you're still doing better than if you had a series of of 2KB
+/// 4MB), but you're still doing better than if you had a series of 2KB
/// `DataFrame`s.
///
/// Changing the `DataFrame` into contiguous chunks is the caller's
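A sketch of the buffering strategy this comment describes, assuming `polars_core`'s `accumulate_dataframes_vertical` helper and `DataFrame::estimated_size`; the pipe engine's actual sink differs in detail:

```rust
use polars_core::prelude::*;
use polars_core::utils::accumulate_dataframes_vertical;

/// Buffer small frames until ~4 MB, then emit one concatenated frame.
#[derive(Default)]
struct Accumulator {
    buffered: Vec<DataFrame>,
    bytes: usize,
}

impl Accumulator {
    fn push(&mut self, df: DataFrame) -> PolarsResult<Option<DataFrame>> {
        self.bytes += df.estimated_size();
        self.buffered.push(df);
        if self.bytes < 4 * 1024 * 1024 {
            // Keep buffering: making every tiny frame contiguous would
            // cost far more copies than one pass over ~4 MB.
            return Ok(None);
        }
        self.bytes = 0;
        let out = accumulate_dataframes_vertical(self.buffered.drain(..))?;
        Ok(Some(out))
    }
}
```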
2 changes: 1 addition & 1 deletion crates/polars-plan/src/dsl/functions/syntactic_sugar.rs
@@ -55,7 +55,7 @@ pub fn is_not_null(expr: Expr) -> Expr {
/// Casts the column given by `Expr` to a different type.
///
/// Follows the rules of Rust casting, with the exception that integers and floats can be cast to `DataType::Date` and
-/// `DataType::DateTime(_, _)`. A column consisting entirely of of `Null` can be cast to any type, regardless of the
+/// `DataType::DateTime(_, _)`. A column consisting entirely of `Null` can be cast to any type, regardless of the
/// nominal type of the column.
pub fn cast(expr: Expr, dtype: DataType) -> Expr {
Expr::Cast {
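The casting rules in that doc comment, exercised through the `polars` facade with the equivalent `Expr::cast` method (a sketch assuming the `lazy` and `temporal` features):

```rust
use polars::prelude::*;

fn main() -> PolarsResult<()> {
    // Integers cast to Date are interpreted as days since the epoch.
    let df = df!["days" => [0i32, 1, 2]]?;
    let out = df
        .lazy()
        .with_column(col("days").cast(DataType::Date).alias("date"))
        .collect()?;
    println!("{out}");

    // A column consisting entirely of nulls casts to any type.
    let nulls = Series::new_null("n".into(), 3);
    let _as_dates = nulls.cast(&DataType::Date)?;
    Ok(())
}
```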
@@ -141,7 +141,7 @@ pub fn optimize(root: Node, lp_arena: &mut Arena<IR>, expr_arena: &Arena<AExpr>)
// @NOTE: Pruning of re-assigned columns
//
// We checked if this expression output is also assigned by the input and
-// that that assignment is not used in the current WITH_COLUMNS.
+// that this assignment is not used in the current WITH_COLUMNS.
// Consequently, we are free to prune the input's assignment to the output.
//
// We immediately prune here to simplify the later code.
4 changes: 2 additions & 2 deletions crates/polars-plan/src/plans/optimizer/simplify_expr/mod.rs
@@ -152,7 +152,7 @@ impl OptimizationRule for SimplifyBooleanRule {
AExpr::Literal(LiteralValue::Boolean(true))
) && in_filter =>
{
-// Only in filter as we we might change the name from "literal"
+// Only in filter as we might change the name from "literal"
// to whatever lhs columns is.
return Ok(Some(expr_arena.get(*right).clone()));
},
@@ -210,7 +210,7 @@ impl OptimizationRule for SimplifyBooleanRule {
AExpr::Literal(LiteralValue::Boolean(false))
) && in_filter =>
{
-// Only in filter as we we might change the name from "literal"
+// Only in filter as we might change the name from "literal"
// to whatever lhs columns is.
return Ok(Some(expr_arena.get(*right).clone()));
},
2 changes: 1 addition & 1 deletion crates/polars-python/src/file.rs
@@ -25,7 +25,7 @@ pub struct PyFileLikeObject {
/// Wraps a `PyObject`, and implements read, seek, and write for it.
impl PyFileLikeObject {
/// Creates an instance of a `PyFileLikeObject` from a `PyObject`.
-/// To assert the object has the required methods methods,
+/// To assert the object has the required methods,
/// instantiate it with `PyFileLikeObject::require`
pub fn new(object: PyObject) -> Self {
PyFileLikeObject { inner: object }
2 changes: 1 addition & 1 deletion crates/polars-utils/src/vec.rs
@@ -20,7 +20,7 @@ impl<T> IntoRawParts<T> for Vec<T> {
}
}

-/// Fill current allocation if if > 0
+/// Fill current allocation if > 0
/// otherwise realloc
pub trait ResizeFaster<T: Copy> {
fn fill_or_alloc(&mut self, new_len: usize, value: T);
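A free-standing sketch of what `fill_or_alloc` promises per the doc comment above; illustrative only, the trait's real implementation lives in this file:

```rust
// If the vec already owns an allocation, reuse it and fill in place;
// otherwise allocate once with the requested value.
fn fill_or_alloc<T: Copy>(vec: &mut Vec<T>, new_len: usize, value: T) {
    if vec.capacity() == 0 {
        *vec = vec![value; new_len];
    } else {
        vec.clear();
        vec.resize(new_len, value);
    }
}

fn main() {
    let mut v: Vec<u8> = Vec::with_capacity(16);
    fill_or_alloc(&mut v, 8, 0xFF); // fills the existing allocation
    assert_eq!(v, vec![0xFF; 8]);
}
```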
1 change: 1 addition & 0 deletions py-polars/polars/convert/general.py
@@ -738,6 +738,7 @@ def _from_dataframe_repr(m: re.Match[str]) -> DataFrame:
if schema and data and (n_extend_cols := (len(schema) - len(data))) > 0:
empty_data = [None] * len(data[0])
data.extend((pl.Series(empty_data, dtype=String)) for _ in range(n_extend_cols))
+
for dtype in set(schema.values()):
if dtype in (List, Struct, Object):
msg = (
2 changes: 1 addition & 1 deletion py-polars/polars/dataframe/frame.py
@@ -1980,7 +1980,7 @@ def to_jax(
Create the Array on a specific GPU device:
->>> gpu_device = jax.devices("gpu")[1]) # doctest: +SKIP
+>>> gpu_device = jax.devices("gpu")[1] # doctest: +SKIP
>>> a = df.to_jax(device=gpu_device) # doctest: +SKIP
>>> a.device() # doctest: +SKIP
GpuDevice(id=1, process_index=0)
2 changes: 1 addition & 1 deletion py-polars/tests/unit/series/test_scatter.py
@@ -43,7 +43,7 @@ def test_scatter() -> None:
assert s.to_list() == ["a", "x", "x"]
assert s.scatter([0, 2], 0.12345).to_list() == ["0.12345", "x", "0.12345"]

-# set multiple values values
+# set multiple values
s = pl.Series(["z", "z", "z"])
assert s.scatter([0, 1], ["a", "b"]).to_list() == ["a", "b", "z"]
s = pl.Series([True, False, True])
