Enhance LPdiag tool

* Add test coverage
* Add some type hints
* Clarify function names
* Remove some outdated comments
iiasa · Oct 30, 2023 · e6950bf · e6950bf
1 parent 8133c18
commit e6950bf
Show file tree

Hide file tree

Showing 3 changed files with 119 additions and 61 deletions.
diff --git a/message_ix/tests/tools/test_lpdiag.py b/message_ix/tests/tools/test_lpdiag.py
@@ -1,5 +1,7 @@
 import os
 
+import pytest
+
 from message_ix.tools.lp_diag.lp_diag import LPdiag
 
 
@@ -15,7 +17,7 @@ def test_aez():
     lp = LPdiag()
 
     # Read MPS, store the matrix in dataFrame
-    lp.rd_mps(file)
+    lp.read_mps(file)
 
     # Check that the matrix has the correct shape
     assert lp.mat.shape == (8895, 5)
@@ -49,7 +51,7 @@ def test_diet():
     lp = LPdiag()
 
     # Read MPS, store the matrix in dataFrame
-    lp.rd_mps(file)
+    lp.read_mps(file)
 
     # Check that the matrix has the correct shape
     assert lp.mat.shape == (39, 5)
@@ -83,7 +85,7 @@ def test_jg_korh():
     lp = LPdiag()
 
     # Read MPS, store the matrix in dataFrame
-    lp.rd_mps(file)
+    lp.read_mps(file)
 
     # Check that the matrix has the correct shape
     assert lp.mat.shape == (10, 5)
@@ -117,7 +119,7 @@ def test_lotfi():
     lp = LPdiag()
 
     # Read MPS, store the matrix in dataFrame
-    lp.rd_mps(file)
+    lp.read_mps(file)
 
     # Check that the matrix has the correct shape
     assert lp.mat.shape == (1086, 5)
@@ -137,3 +139,64 @@ def test_lotfi():
 
     # Check that sequence number of the goal function is not -1
     assert lp.gf_seq != -1
+
+
+# TODO: continue expanding tests
+# Mostly, this means calling the last functions defined in lp_diag.py, but some
+# lines also require special edge cases (mps files defined with 6 and 7 sections)
+def test_error_cases():
+    """Test error cases"""
+
+    # Read in the err_tst.mps file
+    file = os.path.join(
+        os.getcwd(), "message_ix", "tools", "lp_diag", "test_mps", "err_tst"
+    )
+    lp = LPdiag()
+
+    # Reading this MPS file is expected to raise an AssertionError
+    with pytest.raises(AssertionError):
+        lp.read_mps(file)
+
+
+def test_lpdiag_print_statistics():
+    """Test auxiliary stat function."""
+
+    # Read in the jg_korh.mps file
+    file = os.path.join(
+        os.getcwd(), "message_ix", "tools", "lp_diag", "test_mps", "jg_korh"
+    )
+    lp = LPdiag()
+
+    # Read MPS, store the matrix in dataFrame
+    lp.read_mps(file)
+
+    # Stats of matrix coeffs, incl. distrib. tails
+    lp.print_statistics(lo_tail=-7, up_tail=5)
+    # To get numbers of coeffs for each magnitude specify equal/overlapping tails:
+    lp.print_statistics(lo_tail=1, up_tail=0)
+
+    # The function only prints, so we can only ...
+    # Check that the matrix has the correct shape
+    assert lp.mat.shape == (10, 5)
+
+
+def test_lpdiag_locate_outliers():
+    """Test locating outliers."""
+
+    # Read in the lotfi.mps file
+    file = os.path.join(
+        os.getcwd(), "message_ix", "tools", "lp_diag", "test_mps", "lotfi"
+    )
+    lp = LPdiag()
+
+    # Read MPS, store the matrix in dataFrame
+    lp.read_mps(file)
+
+    # Test (lotfi) small-value outliers:
+    lp.locate_outliers(small=True, thresh=-1, max_rec=100)
+    # Test (lotfi) large-value outliers
+    lp.locate_outliers(small=False, thresh=2, max_rec=500)
+
+    # The function doesn't return anything, so we can only ...
+    # Check that the matrix has the correct shape
+    assert lp.mat.shape == (1086, 5)
diff --git a/message_ix/tools/lp_diag/lp_diag.py b/message_ix/tools/lp_diag/lp_diag.py
@@ -36,7 +36,7 @@ class LPdiag:
     """
 
     def __init__(self):
-        self.fname = "undefined"  # MPS file name, to be defined by rd_mps() call
+        self.fname = "undefined"  # MPS file name, to be defined by read_mps() call
         self.pname = "undefined"  # problem name
         self.id_rhs = False  # True, if rhs_id defined
         self.id_range = False  # True, if range_id defined
@@ -75,7 +75,7 @@ def __init__(self):
         #   columns=['seq_id', 'name', 'type', 'lo_bnd', 'up_bnd']
         # )
 
-    def rd_mps(self, fname):  # process the MPS file
+    def read_mps(self, fname):  # process the MPS file
         print(f"\nReading MPS-format file {fname}.")
         self.fname = fname
         sections = [
@@ -122,12 +122,12 @@ def rd_mps(self, fname):  # process the MPS file
                     elif n_section == 6:  # SOS section
                         pass  # SOS section not processed
                     # elif n_section == 7:  # end data
-                    #     raise Exception(
+                    #     raise RuntimeError(
                     #         "Unexpected execution flow; needs to be explored."
                     #     )
                     else:
                         print(f"MPS record {n_line}, section id {n_section}.")
-                        raise Exception(
+                        raise RuntimeError(
                             f"MPS line '{line}' (line {n_line}) misplaced,"
                             f" processing section {sections[n_section]}."
                         )
@@ -140,7 +140,7 @@ def rd_mps(self, fname):  # process the MPS file
                     elif n_section == 6:  # SOS
                         print(f"WARNING: Section {sections[n_section]} not processed.")
                     else:
-                        raise Exception(
+                        raise RuntimeError(
                             f"Should not come here, n_section = {n_section}."
                         )
 
@@ -184,16 +184,16 @@ def next_sec(self, n_exp, words, sections):
                 return n_exp  # n_sections equals to the expected: n_exp
             else:
                 print(f"section {words} found.")
-                raise Exception(
-                    f"Required MPS section {sections[n_exp]} undefined" " or misplaced."
+                raise NameError(
+                    f"Required MPS section {sections[n_exp]} undefined or misplaced."
                 )
         else:  # the found section does not follow the last processed section
             try:
                 n_section = sections.index(words[0])
-            except ValueError:
-                raise Exception(f"Unknown section: {words} (line {n_line}).")
+            except ValueError as e:
+                raise ValueError(f"Unknown section: {words} (line {n_line}).") from e
             if n_section < n_exp:
-                raise Exception(
+                raise AttributeError(
                     f"Section {words[0]} (line {n_line}) is misplaced or duplicated."
                 )
             return n_section
@@ -570,11 +570,11 @@ def add_bnd(self, words, n_line):
         if typ in bnd_type1:  # bound-types that require a value
             try:
                 val = float(words[pos_name + 1])
-            except ValueError:
-                print(
+            except ValueError as e:
+                raise ValueError(
                     f"BOUND value {words[pos_name + 1]} (line {n_line}) is not a "
                     "number."
-                )
+                ) from e
             at_pos = bnd_type1.get(typ)
             if at_pos == 3:  # set both bounds
                 attr[1] = attr[2] = val
@@ -588,12 +588,12 @@ def add_bnd(self, words, n_line):
             else:
                 attr[at_pos] = self.infty
         elif typ in bnd_type3:
-            raise Exception(
+            raise TypeError(
                 f"Bound type {typ} of integer var. (line {n_line}) not"
                 " processed yet."
             )
         else:
-            raise Exception(f"Unknown bound type {typ} (line {n_line}).")
+            raise TypeError(f"Unknown bound type {typ} (line {n_line}).")
         self.seq_col.update({col_seq: attr})  # store the updated col-attributes
         self.n_bounds += 1
 
@@ -673,9 +673,9 @@ def row_att(self, row_seq, row_name, row_type, sec_name, val=0.0):
             #     f" {attr}."
             # )
         else:  # update row attributes (used in RHS and ranges sections)
-            raise Exception(f"row_att() should not be called for {sec_name=}.")
+            raise SyntaxError(f"row_att() should not be called for {sec_name=}.")
 
-    def stat(self, lo_tail=-7, up_tail=6):
+    def print_statistics(self, lo_tail: int = -7, up_tail: int = 6):
         """Basic statistics of the matrix coefficients.
 
         Focus on distributions of magnitudes of non-zero coeff. represented by values
@@ -754,7 +754,7 @@ def stat(self, lo_tail=-7, up_tail=6):
                     f" {self.mat.loc[self.mat['log'] == val]['log'].count()}"
                 )
 
-    def out_loc(self, small=True, thresh=-7, max_rec=500):
+    def locate_outliers(self, small: bool = True, thresh: int = -7, max_rec: int = 500):
         """Locations of outliers, i.e., elements having small/large coeff values.
 
         Locations of outliers (in the term of the matrix coefficient values).
@@ -789,14 +789,14 @@ def out_loc(self, small=True, thresh=-7, max_rec=500):
         )  # sort the df with outliers ascending seq_id of rows
         df1.reset_index()
         col_out = []  # col_seq of outliers' cols
-        for n_rows, (indx, row) in enumerate(df1.iterrows()):
+        for n_rows, (_, row) in enumerate(df1.iterrows()):
             assert (
                 n_rows < max_rec
             ), "To process all requested coeffs modify the safety limit assertion."
-            row_seq, row_name = self.ent_inf(
+            row_seq, row_name = self.get_entity_info(
                 row, True
             )  # row seq_id and name of the current coeff.
-            col_seq, col_name = self.ent_inf(
+            col_seq, col_name = self.get_entity_info(
                 row, False
             )  # col seq_id and name of the current coeff.
             if col_seq not in col_out:
@@ -813,12 +813,12 @@ def out_loc(self, small=True, thresh=-7, max_rec=500):
             ]  # df with all elements
             # print(f'matrix elements in the same row:\n{df_row}')
             print(
-                f"\tRow {row_name} {self.ent_range(row_seq, True)} has"
+                f"\tRow {row_name} {self.get_entity_range(row_seq, True)} has"
                 f" {df_row_out['log'].count()} outlier-coeff. of magnitudes in"
                 f" [{df_row_out['log'].min()}, {df_row_out['log'].max()}]"
             )
             print(
-                f"\tRow {row_name} {self.ent_range(row_seq, True)} has"
+                f"\tRow {row_name} {self.get_entity_range(row_seq, True)} has"
                 f" {df_row_all['log'].count()} (all)-coeff. of magnitudes in"
                 f" [{df_row_all['log'].min()}, {df_row_all['log'].max()}]"
             )
@@ -827,7 +827,7 @@ def out_loc(self, small=True, thresh=-7, max_rec=500):
             # df with outliers in the same col:
             # df_col = df1.loc[df1['col'] == col_seq]
             # print(
-            #     f"\tCol {col_name} {self.ent_range(col_seq, False)} has "
+            #     f"\tCol {col_name} {self.get_entity_range(col_seq, False)} has "
             #     f"{df_col["log"].count()} outlier coeff. of magnitudes in "
             #     f"[{df_col["log"].min()}, {df_col["log"].max()}]"
             # )
@@ -842,12 +842,14 @@ def out_loc(self, small=True, thresh=-7, max_rec=500):
                 self.mat["col"] == col_seq
             ]  # df with elements in the same col
             print(
-                f"\tCol {col_name} {self.ent_range(col_seq, False)} has"
+                f"\tCol {col_name} {self.get_entity_range(col_seq, False)} has"
                 f" {df_col['log'].count()} coeff. of magnitudes in"
                 f" [{df_col['log'].min()}, {df_col['log'].max()}]"
             )
 
-    def ent_inf(self, mat_row, by_row=True) -> typing.Tuple[int, str]:
+    def get_entity_info(
+        self, mat_row: pd.Series, by_row: bool = True
+    ) -> typing.Tuple[int, str]:
         """Return info on the entity (either row or col) defining the selected matrix
         coefficient.
 
@@ -874,7 +876,7 @@ def ent_inf(self, mat_row, by_row=True) -> typing.Tuple[int, str]:
             name = self.seq_col.get(ent_seq)[0]
         return ent_seq, name
 
-    def ent_range(self, seq_id, by_row=True) -> str:
+    def get_entity_range(self, seq_id: int, by_row: bool = True) -> str:
         """Return formatted string representing ranges of feasible values of either a
         row or a column.
 
@@ -915,3 +917,4 @@ def ent_range(self, seq_id, by_row=True) -> str:
     def plot_hist(self):
         """Plot histograms."""
         # todo: might not be needed; therefore the implementation postponed
+        pass
diff --git a/message_ix/tools/lp_diag/lpdiag.py b/message_ix/tools/lp_diag/lpdiag.py
@@ -66,12 +66,9 @@ def read_args():
     work_dir = os.getcwd()
     print(f"work_dir: '{work_dir}'.")
     tstart = dt.now()
-    # print('Started at:', str(tstart))
 
     # Retrieve and assign arguments
     args = read_args()
-    # dir2 = os.getcwd()
-    # print(f"{dir2 =}")
     w_dir = args.wdir or "."
     prob_id = args.mps or "test_mps/aez"  # default MPS for testing
     # alternative specs of test-MPS commented below
@@ -84,18 +81,16 @@ def read_args():
         print(f"Changing work-directory to: {w_dir}.")
         try:
             os.chdir(w_dir)
-        except OSError:
-            print(f"Cannot change work-directory to: {w_dir}.")
-    # dir3 = os.getcwd()
-    # print(f"{dir3 =}")
+        except OSError as e:
+            raise OSError(f"Cannot change work-directory to: {w_dir}.") from e
     assert isfile(prob_id), (
         f"MPS file {prob_id} not accessible from the work-directory:\n'{work_dir}'."
         "\nTry to use the --wdir command option to set the work-directory."
     )
     assert access(prob_id, R_OK), f"MPS file {prob_id} is not readable."
 
     # Large (1+ GB) MPS files shall not be posted to GitHub.
-    # app was tested on two (1+ GB) MPSs posted by Oliver in /t/fricko on Feb 16, 2023:
+    # app was tested on two (1+ GB) MPSs posted by OFR in /t/fricko on Feb 16, 2023:
     # OFR_test_led_barrier.mps
     # baseline_barrier.mps
 
@@ -110,40 +105,37 @@ def read_args():
 
     default_stdout = sys.stdout
     if fn_outp:
-        # fn_out = "./" + repdir + prob_id + ".txt"  # file for redirected stdout
         print(f"Stdout redirected to: {fn_outp}")
         f_out = open(fn_outp, "w")
         sys.stdout = f_out
-    # else:  # defined to avoid warnings (only used when redir_stdo == True)
-    #     fn_out = "foo"
-    #     f_out = open(fn_out, "w")
 
     lp = LPdiag()  # LPdiag ctor
-    lp.rd_mps(prob_id)  # read MPS, store the matrix in dataFrame
-    lp.stat(lo_tail=-7, up_tail=5)  # stats of matrix coeffs, incl. distrib. tails
-    # to get numbers of coeffs for each magnitute specify equal/overlapping tails:
-    # lp.stat(lo_tail=0, up_tail=0)
-    lp.out_loc(small=True, thresh=-7, max_rec=100)  # locations of small-value outliers
-    lp.out_loc(small=False, thresh=6, max_rec=500)  # locations of large-value outliers
-    # lp.out_loc(small=True, thresh=-1, max_rec=100) # test (lotfi) small-value outliers
-    # lp.out_loc(small=False, thresh=2, max_rec=500) # test (lotfi) large-value outliers
-
-    tend = dt.now()
-    time_diff = tend - tstart
-    print("\nStarted at: ", str(tstart))
-    print("Finished at:", str(tend))
-    print(f"Wall-clock execution time: {time_diff.seconds} sec.")
+    lp.read_mps(prob_id)  # read MPS, store the matrix in dataFrame
+    lp.print_statistics(
+        lo_tail=-7, up_tail=5
+    )  # stats of matrix coeffs, incl. distrib. tails
+    # To get numbers of coeffs for each magnitude specify equal/overlapping tails:
+    # lp.print_statistics(lo_tail=0, up_tail=0)
+    lp.locate_outliers(
+        small=True, thresh=-7, max_rec=100
+    )  # locations of small-value outliers
+    lp.locate_outliers(
+        small=False, thresh=6, max_rec=500
+    )  # locations of large-value outliers
 
     if fn_outp:  # close the redirected output
         # noinspection PyUnboundLocalVariable
         f_out.close()
         sys.stdout = default_stdout
         print(f"\nRedirected stdout stored in {fn_outp}. Now writing to the console.")
-        print("\nStarted at: ", str(tstart))
-        print("Finished at:", str(tend))
-        print(f"Wall-clock execution time: {time_diff.seconds} sec.")
+
+    tend = dt.now()
+    time_diff = tend - tstart
+    print("\nStarted at: ", str(tstart))
+    print("Finished at:", str(tend))
+    print(f"Wall-clock execution time: {time_diff.seconds} sec.")
 
     # todo: TBD, if the MPS-standard should be observed; should it cause error or info
     #  in particular, range of values: 10^{-10} < abs(val) < 10^{10}
     # todo: naive scaling? might not be informative due to the later preprocessing
-    # todo: plots of distributions of coeffs, if indeed usefull
+    # todo: plots of distributions of coeffs, if indeed useful