Skip to content

Commit

Permalink
Merge pull request #1366 from jaladh-singhal/hdf-scalars-fix
Browse files Browse the repository at this point in the history
Fix duplicate appending of scalars in HDF writer and add an overwrite option
  • Loading branch information
andrewfullard authored Dec 4, 2020
2 parents 5ae8d7c + 077cfe7 commit 1249c4e
Show file tree
Hide file tree
Showing 9 changed files with 77 additions and 69 deletions.
14 changes: 7 additions & 7 deletions tardis/io/tests/test_HDFWriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(self, property):
def test_simple_write(tmpdir, attr):
    """Round-trip a scalar attribute through the HDF writer.

    Scalars are stored in a Series under ``<path>/<snake_name>/scalars``.
    """
    fname = str(tmpdir.mkdir("data").join("test.hdf"))
    actual = MockHDF(attr)
    # overwrite=True: the writer now refuses to clobber an existing file
    # unless explicitly told to.
    actual.to_hdf(fname, path="test", overwrite=True)
    expected = pd.read_hdf(fname, key="/test/mock_hdf/scalars")["property"]
    assert actual.property == expected

Expand All @@ -59,7 +59,7 @@ def test_simple_write(tmpdir, attr):
def test_complex_obj_write(tmpdir, attr):
    """Round-trip an array/series-valued attribute through the HDF writer."""
    fname = str(tmpdir.mkdir("data").join("test.hdf"))
    actual = MockHDF(attr)
    actual.to_hdf(fname, path="test", overwrite=True)
    expected = pd.read_hdf(fname, key="/test/mock_hdf/property").values
    assert_array_almost_equal(actual.property, expected)

Expand All @@ -76,7 +76,7 @@ def test_complex_obj_write(tmpdir, attr):
def test_MultiIndex_write(tmpdir):
    """Round-trip a pandas MultiIndex through the HDF writer."""
    fname = str(tmpdir.mkdir("data").join("test.hdf"))
    actual = MockHDF(mock_multiIndex)
    actual.to_hdf(fname, path="test", overwrite=True)
    expected = pd.read_hdf(fname, key="/test/mock_hdf/property")
    # The index is persisted in unstacked form; rebuild the MultiIndex
    # before comparing against the original.
    expected = pd.MultiIndex.from_tuples(expected.unstack().values)
    pdt.assert_almost_equal(actual.property, expected)
def test_quantity_objects_write(tmpdir, attr):
    """Round-trip an array-valued astropy Quantity through the HDF writer.

    Values are compared via ``.cgs.value``, i.e. the writer stores the
    quantity's numeric value in CGS units.
    """
    fname = str(tmpdir.mkdir("data").join("test.hdf"))
    attr_quantity = u.Quantity(attr, "g/cm**3")
    actual = MockHDF(attr_quantity)
    actual.to_hdf(fname, path="test", overwrite=True)
    expected = pd.read_hdf(fname, key="/test/mock_hdf/property")
    assert_array_almost_equal(actual.property.cgs.value, expected)

def test_scalar_quantity_objects_write(tmpdir, attr):
    """Round-trip a scalar astropy Quantity through the HDF writer.

    Scalar quantities land under ``.../scalars`` and are compared via
    ``.cgs.value``.
    """
    fname = str(tmpdir.mkdir("data").join("test.hdf"))
    attr_quantity = u.Quantity(attr, "g/cm**3")
    actual = MockHDF(attr_quantity)
    actual.to_hdf(fname, path="test", overwrite=True)
    expected = pd.read_hdf(fname, key="/test/mock_hdf/scalars/")["property"]
    assert_array_almost_equal(actual.property.cgs.value, expected)


def test_none_write(tmpdir):
fname = str(tmpdir.mkdir("data").join("test.hdf"))
actual = MockHDF(None)
actual.to_hdf(fname, path="test")
actual.to_hdf(fname, path="test", overwrite=True)
expected = pd.read_hdf(fname, key="/test/mock_hdf/scalars/")["property"]
if expected == "none":
expected = None
Expand All @@ -138,7 +138,7 @@ def test_objects_write(tmpdir, attr):
nested_object = MockHDF(np.array([4.0e14, 2, 2e14, 27.5]))
attr_quantity = u.Quantity(attr, "g/cm**3")
actual = MockClass(attr_quantity, nested_object)
actual.to_hdf(fname, path="test")
actual.to_hdf(fname, path="test", overwrite=True)
expected_property = pd.read_hdf(fname, key="/test/mock_class/property")
assert_array_almost_equal(actual.property.cgs.value, expected_property)
nested_property = pd.read_hdf(
Expand Down
2 changes: 1 addition & 1 deletion tardis/io/tests/test_config_reader.py
Original file line number Diff line number Diff line change
def test_config_hdf(hdf_file_path, tardis_config_verysimple):
    """Write a validated Configuration to HDF and read the config back."""
    expected = Configuration.from_config_dict(
        tardis_config_verysimple, validate=True, config_dirname="test"
    )
    expected.to_hdf(hdf_file_path, overwrite=True)
    actual = pd.read_hdf(hdf_file_path, key="/simulation/config")
    # Rebind `expected` to the serialized config for an apples-to-apples
    # comparison with what was read back from disk.
    expected = expected.get_properties()["config"]
    assert actual[0] == expected[0]
115 changes: 60 additions & 55 deletions tardis/io/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ def get_internal_data_path(fname):
def quantity_from_str(text):
"""
Convert a string to `astropy.units.Quantity`
Parameters
----------
text :
The string to convert to `astropy.units.Quantity`
Returns
-------
`astropy.units.Quantity`
Expand Down Expand Up @@ -76,7 +76,7 @@ def match(self, text):
----------
text :
A string to be passed to `target_type` for conversion.
Returns
-------
`True` if `text` can be converted to `target_type`.
Expand Down Expand Up @@ -196,10 +196,10 @@ def __new__(cls, *args, **kwargs):
return instance

@staticmethod
def to_hdf_util(path_or_buf, path, elements, complevel=9, complib="blosc"):
"""
A function to uniformly store TARDIS data
to an HDF file.
def to_hdf_util(
path_or_buf, path, elements, overwrite, complevel=9, complib="blosc"
):
"""A function to uniformly store TARDIS data to an HDF file.
Scalars will be stored in a Series under path/scalars
1D arrays will be stored under path/property_name as distinct Series
Expand All @@ -209,32 +209,40 @@ def to_hdf_util(path_or_buf, path, elements, complevel=9, complib="blosc"):
Parameters
----------
path_or_buf :
Path or buffer to the HDF store
path_or_buf : str or pandas.io.pytables.HDFStore
Path or buffer to the HDF file
path : str
Path inside the HDF store to store the `elements`
Path inside the HDF file to store the `elements`
elements : dict
A dict of property names and their values to be
stored.
Returns
-------
overwrite: bool
If the HDF file path already exists, whether to overwrite it or not
Notes
-----
`overwrite` option doesn't have any effect when `path_or_buf` is an
HDFStore because the user decides on the mode in which they have
opened the HDFStore ('r', 'w' or 'a').
"""
we_opened = False

try:
try: # when path_or_buf is a str, the HDFStore should get created
buf = pd.HDFStore(path_or_buf, complevel=complevel, complib=complib)
except TypeError as e: # Already a HDFStore
except TypeError as e:
if e.message == "Expected bytes, got HDFStore":
# when path_or_buf is an HDFStore buffer instead
buf = path_or_buf
else:
raise e
else: # path_or_buf was a string and we opened the HDFStore
we_opened = True
else: # path_or_buf was a str
if os.path.exists(path_or_buf) and not overwrite:
buf.close()
raise FileExistsError(
"The specified HDF file already exists. If you still want "
"to overwrite it, set option overwrite=True"
)

if not buf.is_open:
buf.open()
we_opened = True

scalars = {}
for key, value in elements.items():
Expand All @@ -253,25 +261,17 @@ def to_hdf_util(path_or_buf, path, elements, complevel=9, complib="blosc"):
pd.DataFrame(value).to_hdf(buf, os.path.join(path, key))
else:
pd.DataFrame(value).to_hdf(buf, os.path.join(path, key))
else:
else: # value is a TARDIS object like model, runner or plasma
try:
value.to_hdf(buf, path, name=key)
value.to_hdf(buf, path, name=key, overwrite=overwrite)
except AttributeError:
data = pd.DataFrame([value])
data.to_hdf(buf, os.path.join(path, key))

if scalars:
scalars_series = pd.Series(scalars)
pd.Series(scalars).to_hdf(buf, os.path.join(path, "scalars"))

# Unfortunately, with to_hdf we cannot append, so merge beforehand
scalars_path = os.path.join(path, "scalars")
try:
scalars_series = buf[scalars_path].append(scalars_series)
except KeyError: # no scalars in HDFStore
pass
scalars_series.to_hdf(buf, os.path.join(path, "scalars"))

if we_opened:
if buf.is_open:
buf.close()

def get_properties(self):
def convert_to_snake_case(s):
    """Convert a CamelCase identifier ``s`` to snake_case."""
    # Pass 1: insert an underscore before an uppercase letter that starts
    # a capitalized word (e.g. "HDFWriter" -> "HDF_Writer").
    partially_split = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", s)
    # Pass 2: insert an underscore between a lowercase letter or digit and
    # the following uppercase letter, then lowercase everything.
    fully_split = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", partially_split)
    return fully_split.lower()

def to_hdf(self, file_path, path="", name=None):
def to_hdf(self, file_path_or_buf, path="", name=None, overwrite=False):
"""
Parameters
----------
file_path : str
Path or buffer to the HDF store
file_path_or_buf : str or pandas.io.pytables.HDFStore
Path or buffer to the HDF file
path : str
Path inside the HDF store to store the `elements`
Path inside the HDF file to store the `elements`
name : str
Group inside the HDF store to which the `elements` need to be saved
Returns
-------
Group inside the HDF file to which the `elements` need to be saved
overwrite: bool
If the HDF file path already exists, whether to overwrite it or not
"""
if name is None:
try:
Expand All @@ -313,7 +312,7 @@ def to_hdf(self, file_path, path="", name=None):

data = self.get_properties()
buff_path = os.path.join(path, name)
self.to_hdf_util(file_path, buff_path, data)
self.to_hdf_util(file_path_or_buf, buff_path, data, overwrite)


class PlasmaWriterMixin(HDFWriterMixin):
Expand All @@ -338,30 +337,36 @@ def get_properties(self):
data.pop("nlte_data")
return data

def to_hdf(
    self,
    file_path_or_buf,
    path="",
    name=None,
    collection=None,
    overwrite=False,
):
    """Store the plasma's properties in an HDF file.

    Parameters
    ----------
    file_path_or_buf : str or pandas.io.pytables.HDFStore
        Path or buffer to the HDF file
    path : str
        Path inside the HDF file to store the `elements`
    name : str
        Group inside the HDF file to which the `elements` need to be saved
    collection :
        `None` or a `PlasmaPropertyCollection` of which members are
        the property types which will be stored. If `None` then
        all types of properties will be stored. This acts like a filter,
        for example if a value of `property_collections.basic_inputs` is
        given, only those input parameters will be stored to the HDF file.
    overwrite : bool
        If the HDF file path already exists, whether to overwrite it or not
    """
    # Remember the requested property filter before delegating to the
    # generic writer (presumably consumed by get_properties() — confirm).
    self.collection = collection
    super(PlasmaWriterMixin, self).to_hdf(
        file_path_or_buf, path, name, overwrite
    )


def download_from_url(url, dst):
Expand Down
2 changes: 1 addition & 1 deletion tardis/model/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def test_model_decay(simple_isotope_abundance):

@pytest.fixture(scope="module", autouse=True)
def to_hdf_buffer(hdf_file_path, simulation_verysimple):
    """Write the model to the shared HDF file once for the whole module."""
    simulation_verysimple.model.to_hdf(hdf_file_path, overwrite=True)

model_scalar_attrs = ['t_inner']

Expand Down
2 changes: 1 addition & 1 deletion tardis/model/tests/test_density.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

@pytest.fixture(scope="module", autouse=True)
def to_hdf_buffer(hdf_file_path, simulation_verysimple):
    """Write the homologous density to the shared HDF file for the module."""
    simulation_verysimple.model.homologous_density.to_hdf(
        hdf_file_path, overwrite=True
    )

def test_hdf_density_0(hdf_file_path, simulation_verysimple):
actual = simulation_verysimple.model.homologous_density.density_0
Expand Down
4 changes: 3 additions & 1 deletion tardis/montecarlo/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@

@pytest.fixture(scope="module", autouse=True)
def to_hdf_buffer(hdf_file_path, simulation_verysimple):
    """Write the runner to the shared HDF file once for the whole module."""
    simulation_verysimple.runner.to_hdf(
        hdf_file_path, name="runner", overwrite=True
    )


runner_properties = [
Expand Down
2 changes: 1 addition & 1 deletion tardis/montecarlo/tests/test_spectrum.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def compare_spectra(actual, desired):

@pytest.fixture(autouse=True)
def to_hdf_buffer(hdf_file_path, spectrum):
    """Write the spectrum to the HDF file before each test."""
    spectrum.to_hdf(hdf_file_path, name="spectrum", overwrite=True)


@pytest.mark.parametrize("attr", TARDISSpectrum.hdf_properties)
Expand Down
3 changes: 2 additions & 1 deletion tardis/plasma/tests/test_hdf_plasma.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

@pytest.fixture(scope="module", autouse=True)
def to_hdf_buffer(hdf_file_path, simulation_verysimple):
    """Write the plasma to the shared HDF file once for the whole module."""
    simulation_verysimple.plasma.to_hdf(hdf_file_path, overwrite=True)


plasma_properties_list = [
Expand Down Expand Up @@ -104,6 +104,7 @@ def to_hdf_collection_buffer(hdf_file_path, simulation_verysimple):
hdf_file_path,
name="collection",
collection=property_collections.basic_inputs,
overwrite=True,
)


Expand Down
2 changes: 1 addition & 1 deletion tardis/widgets/tests/test_shell_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def simulation_shell_info(simulation_verysimple):

@pytest.fixture(scope="class")
def hdf_shell_info(hdf_file_path, simulation_verysimple):
    """Save the simulation to HDF and return a shell-info view over it."""
    simulation_verysimple.to_hdf(hdf_file_path, overwrite=True)
    return HDFShellInfo(hdf_file_path)


Expand Down

0 comments on commit 1249c4e

Please sign in to comment.