From 762c961faa733e8aca41debf6485dccf80436fe1 Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Sun, 20 Sep 2020 22:40:19 +0200 Subject: [PATCH 1/8] Accept coordinates with MultiIndex (solve issue #3008) --- xarray/core/coordinates.py | 43 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 846e4044a2c..7bc7fec4695 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -13,6 +13,7 @@ cast, ) +import numpy as np import pandas as pd from . import formatting, indexing @@ -106,9 +107,47 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: (dim,) = ordered_dims return self._data.get_index(dim) # type: ignore else: + from pandas.core.arrays.categorical import factorize_from_iterable + indexes = [self._data.get_index(k) for k in ordered_dims] # type: ignore - names = list(ordered_dims) - return pd.MultiIndex.from_product(indexes, names=names) + + # compute the sizes of the repeat and tile for the cartesian product + # (taken from pandas.core.reshape.util) + lenX = np.fromiter((len(index) for index in indexes), dtype=np.intp) + cumprodX = np.cumproduct(lenX) + + if cumprodX[-1] != 0: + # sizes of the repeats + b = cumprodX[-1] / cumprodX + else: + # if any factor is empty, the cartesian product is empty + b = np.zeros_like(cumprodX) + + # sizes of the tiles + a = np.roll(cumprodX, 1) + a[0] = 1 + + # loop over the indexes + # for each MultiIndex or Index compute the cartesian product of the codes + + code_list = [] + level_list = [] + names = [] + + for i, index in enumerate(indexes): + if isinstance(index, pd.MultiIndex): + codes, levels = index.codes, index.levels + else: + code, level = factorize_from_iterable(index) + codes = [code] + levels = [level] + + # compute the cartesian product + code_list += [np.tile(np.repeat(code, b[i]), a[i]) for code in codes] + level_list += levels + names += index.names + + return 
pd.MultiIndex(level_list, code_list, names=names) def update(self, other: Mapping[Hashable, Any]) -> None: other_vars = getattr(other, "variables", other) From 1472c92ae7b238b34927b9b2d721a806aa5f2b15 Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Sun, 20 Sep 2020 22:43:08 +0200 Subject: [PATCH 2/8] formatting --- xarray/core/coordinates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 7bc7fec4695..c5cf5558f25 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -141,7 +141,7 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: code, level = factorize_from_iterable(index) codes = [code] levels = [level] - + # compute the cartesian product code_list += [np.tile(np.repeat(code, b[i]), a[i]) for code in codes] level_list += levels From 31a698a2b7932c54872ba87fa28ef8d3b8990e2b Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Mon, 21 Sep 2020 21:02:23 +0200 Subject: [PATCH 3/8] change to pd.factorize and improve variable names --- xarray/core/coordinates.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index c5cf5558f25..5c27cdb6e7b 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -107,8 +107,6 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: (dim,) = ordered_dims return self._data.get_index(dim) # type: ignore else: - from pandas.core.arrays.categorical import factorize_from_iterable - indexes = [self._data.get_index(k) for k in ordered_dims] # type: ignore # compute the sizes of the repeat and tile for the cartesian product @@ -118,14 +116,14 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: if cumprodX[-1] != 0: # sizes of the repeats - b = cumprodX[-1] / cumprodX + repeat_counts = cumprodX[-1] / cumprodX else: # if any factor is empty, the cartesian product is 
empty - b = np.zeros_like(cumprodX) + repeat_counts = np.zeros_like(cumprodX) # sizes of the tiles - a = np.roll(cumprodX, 1) - a[0] = 1 + tile_counts = np.roll(cumprodX, 1) + tile_counts[0] = 1 # loop over the indexes # for each MultiIndex or Index compute the cartesian product of the codes @@ -138,12 +136,12 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: if isinstance(index, pd.MultiIndex): codes, levels = index.codes, index.levels else: - code, level = factorize_from_iterable(index) + code, level = pd.factorize(index) codes = [code] levels = [level] # compute the cartesian product - code_list += [np.tile(np.repeat(code, b[i]), a[i]) for code in codes] + code_list += [np.tile(np.repeat(code, repeat_counts[i]), tile_counts[i]) for code in codes] level_list += levels names += index.names From 4ea0b5337af7bb720b442833896a3b342a38c3aa Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Mon, 21 Sep 2020 21:02:45 +0200 Subject: [PATCH 4/8] add a test for multiindex --- xarray/tests/test_dataarray.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 5e0fe13ea52..08d7e3ccdd4 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3520,6 +3520,21 @@ def test_to_dataframe(self): with raises_regex(ValueError, "unnamed"): arr.to_dataframe() + def test_to_dataframe_multiindex(self): + # regression test for #3008 + arr_np = np.random.randn(4, 3) + + mindex = pd.MultiIndex.from_product([[1, 2], list('ab')], names=['A', 'B']) + + arr = DataArray(arr_np, [('MI', mindex), ('C', [5, 6, 7])], name="foo") + + actual = arr.to_dataframe() + assert_array_equal(actual['foo'].values, arr_np.flatten()) + assert_array_equal(actual.index.names, list('ABC')) + assert_array_equal(actual.index.levels[0], [1, 2]) + assert_array_equal(actual.index.levels[1], ['a', 'b']) + assert_array_equal(actual.index.levels[2], [5, 6, 7]) + def 
test_to_pandas_name_matches_coordinate(self): # coordinate with same name as array arr = DataArray([1, 2, 3], dims="x", name="x") From 1febc77f734b92ba17e6e30f4a0bab0c24222716 Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Mon, 21 Sep 2020 21:11:52 +0200 Subject: [PATCH 5/8] formatting --- xarray/tests/test_dataarray.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 08d7e3ccdd4..e2f8fc1c938 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3523,16 +3523,16 @@ def test_to_dataframe(self): def test_to_dataframe_multiindex(self): # regression test for #3008 arr_np = np.random.randn(4, 3) - - mindex = pd.MultiIndex.from_product([[1, 2], list('ab')], names=['A', 'B']) - arr = DataArray(arr_np, [('MI', mindex), ('C', [5, 6, 7])], name="foo") + mindex = pd.MultiIndex.from_product([[1, 2], list("ab")], names=["A", "B"]) + + arr = DataArray(arr_np, [("MI", mindex), ("C", [5, 6, 7])], name="foo") actual = arr.to_dataframe() - assert_array_equal(actual['foo'].values, arr_np.flatten()) - assert_array_equal(actual.index.names, list('ABC')) + assert_array_equal(actual["foo"].values, arr_np.flatten()) + assert_array_equal(actual.index.names, list("ABC")) assert_array_equal(actual.index.levels[0], [1, 2]) - assert_array_equal(actual.index.levels[1], ['a', 'b']) + assert_array_equal(actual.index.levels[1], ["a", "b"]) assert_array_equal(actual.index.levels[2], [5, 6, 7]) def test_to_pandas_name_matches_coordinate(self): From aa88f796ac51a2942aabeb36e9c12762f82bda37 Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Sun, 27 Sep 2020 09:57:59 +0200 Subject: [PATCH 6/8] change variable names and formatting --- xarray/core/coordinates.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 5c27cdb6e7b..08b7629bed9 100644 --- 
a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -111,18 +111,20 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: # compute the sizes of the repeat and tile for the cartesian product # (taken from pandas.core.reshape.util) - lenX = np.fromiter((len(index) for index in indexes), dtype=np.intp) - cumprodX = np.cumproduct(lenX) + index_lengths = np.fromiter( + (len(index) for index in indexes), dtype=np.intp + ) + cumprod_lengths = np.cumproduct(index_lengths) - if cumprodX[-1] != 0: + if cumprod_lengths[-1] != 0: # sizes of the repeats - repeat_counts = cumprodX[-1] / cumprodX + repeat_counts = cumprod_lengths[-1] / cumprod_lengths else: # if any factor is empty, the cartesian product is empty - repeat_counts = np.zeros_like(cumprodX) + repeat_counts = np.zeros_like(cumprod_lengths) # sizes of the tiles - tile_counts = np.roll(cumprodX, 1) + tile_counts = np.roll(cumprod_lengths, 1) tile_counts[0] = 1 # loop over the indexes @@ -141,7 +143,10 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: levels = [level] # compute the cartesian product - code_list += [np.tile(np.repeat(code, repeat_counts[i]), tile_counts[i]) for code in codes] + code_list += [ + np.tile(np.repeat(code, repeat_counts[i]), tile_counts[i]) + for code in codes + ] level_list += levels names += index.names From 144c618d6f19d31ca0593e53c3886d312837f983 Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Sun, 27 Sep 2020 09:58:17 +0200 Subject: [PATCH 7/8] add test for to_dataframe with 0 length index --- xarray/tests/test_dataarray.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index e2f8fc1c938..fcfef8ae217 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3535,6 +3535,18 @@ def test_to_dataframe_multiindex(self): assert_array_equal(actual.index.levels[1], ["a", "b"]) assert_array_equal(actual.index.levels[2], [5, 
6, 7]) + def test_to_dataframe_0length(self): + # regression test for #3008 + arr_np = np.random.randn(4, 0) + + mindex = pd.MultiIndex.from_product([[1, 2], list("ab")], names=["A", "B"]) + + arr = DataArray(arr_np, [("MI", mindex), ("C", [])], name="foo") + + actual = arr.to_dataframe() + assert len(actual) == 0 + assert_array_equal(actual.index.names, list("ABC")) + def test_to_pandas_name_matches_coordinate(self): # coordinate with same name as array arr = DataArray([1, 2, 3], dims="x", name="x") From 368c7d4e79dea8689bbd57cc7745ce1830e3e32e Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Mon, 5 Oct 2020 23:05:35 +0200 Subject: [PATCH 8/8] first attempt to align with tolerance --- xarray/core/alignment.py | 59 +++++++++++++++++++++++++++------- xarray/core/indexes.py | 52 ++++++++++++++++++++++++++++++ xarray/tests/test_dataarray.py | 33 ++++++++++++++++++- 3 files changed, 132 insertions(+), 12 deletions(-) diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 319c8a9a7a0..a1261c1e016 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -11,17 +11,32 @@ from .indexing import get_indexer_nd from .utils import is_dict_like, is_full_slice from .variable import IndexVariable, Variable +from .indexes import or_with_tolerance, and_with_tolerance, equal_indexes_with_tolerance if TYPE_CHECKING: from .dataarray import DataArray from .dataset import Dataset -def _get_joiner(join): +def _get_joiner(join, tolerance=0): if join == "outer": - return functools.partial(functools.reduce, operator.or_) + if tolerance == 0: + or_ = operator.or_ + else: + + def or_(x1, x2, tolerance=tolerance): + return or_with_tolerance(x1, x2, tolerance) + + return functools.partial(functools.reduce, or_) elif join == "inner": - return functools.partial(functools.reduce, operator.and_) + if tolerance == 0: + and_ = operator.and_ + else: + + def and_(x1, x2, tolerance=tolerance): + return and_with_tolerance(x1, x2, tolerance) + + return 
functools.partial(functools.reduce, and_) elif join == "left": return operator.itemgetter(0) elif join == "right": @@ -65,6 +80,7 @@ def align( indexes=None, exclude=frozenset(), fill_value=dtypes.NA, + tolerance=0, ): """ Given any number of Dataset and/or DataArray objects, returns new @@ -107,6 +123,8 @@ def align( Value to use for newly missing values. If a dict-like, maps variable names to fill values. Use a data array's name to refer to its values. + tolerance: numerical + Value used to check equality between the coordinate values with numerical tolerance. Returns ------- @@ -284,30 +302,41 @@ def align( # - It ensures it's possible to do operations that don't require alignment # on indexes with duplicate values (which cannot be reindexed with # pandas). This is useful, e.g., for overwriting such duplicate indexes. - joiner = _get_joiner(join) joined_indexes = {} for dim, matching_indexes in all_indexes.items(): if dim in indexes: index = utils.safe_cast_to_index(indexes[dim]) if ( - any(not index.equals(other) for other in matching_indexes) - or dim in unlabeled_dim_sizes - ): + not equal_indexes_with_tolerance(index, matching_indexes, tolerance) + ) or dim in unlabeled_dim_sizes: joined_indexes[dim] = index - else: + elif ( + any(not matching_indexes[0].equals(other) for other in matching_indexes[1:]) + or dim in unlabeled_dim_sizes + ): if ( - any( - not matching_indexes[0].equals(other) - for other in matching_indexes[1:] + not equal_indexes_with_tolerance( + matching_indexes[0], matching_indexes[1:], tolerance ) or dim in unlabeled_dim_sizes ): if join == "exact": raise ValueError(f"indexes along dimension {dim!r} are not equal") + # this logic could be moved out if _get_joiner is changed into _do_join(join, matching_index, tolerance) + if (tolerance > 0) and all( + index.is_numeric() for index in matching_indexes + ): + joiner = _get_joiner(join, tolerance) + else: + joiner = _get_joiner(join) index = joiner(matching_indexes) joined_indexes[dim] = index 
else: + # they are identical within tolerance, reindexing is necessary index = matching_indexes[0] + joined_indexes[dim] = index + else: + index = matching_indexes[0] # I believe this line is not useful... if dim in unlabeled_dim_sizes: unlabeled_sizes = unlabeled_dim_sizes[dim] @@ -336,6 +365,14 @@ def align( if not valid_indexers: # fast path for no reindexing necessary new_obj = obj.copy(deep=copy) + elif tolerance > 0: + new_obj = obj.reindex( + copy=copy, + fill_value=fill_value, + **valid_indexers, + tolerance=tolerance, + method="nearest", + ) else: new_obj = obj.reindex(copy=copy, fill_value=fill_value, **valid_indexers) new_obj.encoding = obj.encoding diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 84cf35d3b4f..13908117877 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -143,3 +143,55 @@ def propagate_indexes( new_indexes = None # type: ignore return new_indexes + + +def equal_indexes_with_tolerance( + index: pd.Index, others: Iterable[pd.Index], tolerance: float +) -> bool: + + return all(other.equals(index) for other in others) or ( + tolerance > 0 + and index.is_numeric() + and all(other.is_numeric() for other in others) + and all(np.allclose(index, other, atol=tolerance, rtol=0) for other in others) + ) + + +def unique_with_tolerance(x, tolerance): + # check uniqueness based on the difference between successive values + # this eliminates all the numbers that are within tolerance of the previous one. 
+    # a number can be removed even if the previous one was removed
+
+    x = np.sort(x)
+    # x is sorted
+    notclose = np.diff(x) > tolerance
+
+    unique = x[1:][notclose]
+    return np.insert(unique, 0, x[0])
+
+
+def or_with_tolerance(x1, x2, tolerance):
+    return unique_with_tolerance(np.concatenate((x1, x2)), tolerance=tolerance)
+
+
+def and_with_tolerance(x1, x2, tolerance):
+    # return values common in the two arrays (within some tolerance)
+
+    x = np.concatenate((x1, x2))
+
+    # sort x, remember the origin (x1 or x2)
+    ind = np.argsort(x)
+    x = x[ind]
+
+    # check for close values
+    close = np.diff(x) < tolerance
+
+    origin = ind < len(x1)
+    first = origin[:-1][close]  # where is the 'first' coming from
+    second = origin[1:][close]  # where is the 'second' coming from
+
+    intersect = (
+        first != second
+    )  # they must come from the two different arrays to be an intersection
+
+    return x[:-1][close][intersect]
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index fcfef8ae217..314c0eb391a 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -453,7 +453,7 @@ def test_equals_and_identical(self):
         assert not expected.identical(actual)
 
         actual = expected.copy()
-        actual["a"] = 100000
+        actual["a"] = 100_000
         assert not expected.equals(actual)
         assert not expected.identical(actual)
 
@@ -3372,6 +3372,37 @@ def test_align_without_indexes_errors(self):
             DataArray([1, 2], coords=[("x", [0, 1])]),
         )
 
+    def test_align_with_tolerance_when_nearly_equal_index(self):
+        da1 = DataArray([1, 2, 3], coords=[("x", [1.0001, 2.0004, 2.9999])])
+
+        da2 = DataArray([4, 5, 6], coords=[("x", [1, 2, 3])])
+
+        aligned_da1, aligned_da2 = align(da1, da2, tolerance=4e-4)
+
+        assert_identical(aligned_da1.x, da1.x)
+        assert_identical(aligned_da2.x, da1.x)
+
+    def test_align_with_tolerance_and_intersect(self):
+        da1 = DataArray([1, 2, 3, 4], coords=[("x", [1.0001, 1.5, 2.0004, 2.9999])])
+
+        da2 = DataArray([5, 6, 7, 8], coords=[("x", [1, 2, 3, 5])])
+
+        aligned_da1, aligned_da2 = align(da1, da2, tolerance=4e-4, join="inner")
+
+        assert_identical(aligned_da1.x, aligned_da2.x)
+        print(aligned_da1.x)
+        assert_array_equal(aligned_da1.x, np.array([1.0, 2.0, 2.9999]))
+
+    def test_align_with_tolerance_and_union(self):
+        da1 = DataArray([1, 2, 3, 4], coords=[("x", [1.0001, 1.5, 2.0004, 2.9999])])
+
+        da2 = DataArray([5, 6, 7, 8], coords=[("x", [1, 2, 3, 5])])
+
+        aligned_da1, aligned_da2 = align(da1, da2, tolerance=4e-4, join="outer")
+
+        assert_identical(aligned_da1.x, aligned_da2.x)
+        assert_array_equal(aligned_da1.x, np.array([1.0, 1.5, 2.0, 2.9999, 5.0]))
+
     def test_broadcast_arrays(self):
         x = DataArray([1, 2], coords=[("a", [-1, -2])], name="x")
         y = DataArray([1, 2], coords=[("b", [3, 4])], name="y")