GH-15572: Check if headers were set before setting them to null in single entry ARFF parsing [nocheck] (#15589)

* Do not let xgboost loading exception to go to void (#15560)
* Check if headers were set before setting them to null in single entry ARFF parsing
---------
Co-authored-by: Adam Valenta <[email protected]>
h2oai · Jun 20, 2023 · e6ca341 · e6ca341
1 parent 7bef068
commit e6ca341
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 1 deletion.
diff --git a/h2o-core/src/main/java/water/parser/ARFFParser.java b/h2o-core/src/main/java/water/parser/ARFFParser.java
@@ -108,7 +108,7 @@ static ParseSetup guessSetup(ByteVec bv, byte[] bits, byte sep, boolean singleQu
         }
         data[0] = determineTokens(datalines[0], sep, singleQuotes, escapechar);
         ncols = (ncols > 0) ? ncols : data[0].length;
-        labels = null;
+        labels = labels[0] == null ? null : labels;
       } else {                    // 2 or more lines
         if (sep == GUESS_SEP) {   // first guess the separator
           //FIXME if last line is incomplete, this logic fails

diff --git a/h2o-extensions/xgboost/src/main/java/hex/tree/xgboost/XGBoostExtension.java b/h2o-extensions/xgboost/src/main/java/hex/tree/xgboost/XGBoostExtension.java
@@ -104,6 +104,7 @@ private boolean initXgboost() {
       return true;
     } catch (IOException e) {
       // Ups no lib loaded or load failed
+      LOG.debug("Cause of the xgboost unsuccessful load", e);
       LOG.warn("Cannot initialize XGBoost backend! " + XGBOOST_MIN_REQUIREMENTS);
       return false;
     }

diff --git a/h2o-py/tests/testdir_parser/pyunit_parse_single_entry_arff.py b/h2o-py/tests/testdir_parser/pyunit_parse_single_entry_arff.py
@@ -0,0 +1,33 @@
+import os
+import sys
+import tempfile
+
+sys.path.insert(1, "../../")
+import h2o
+from tests import pyunit_utils
+
+
+def test_single_entry_arff_file():  # a truncated ARFF file must parse to the same column names as the full file
+    with open(pyunit_utils.locate("smalldata/junit/arff/iris.arff"), "r") as input_:
+        arff = input_.readlines()
+    data_start = arff.index("@DATA\n")
+    subsample = arff[:data_start + 2]  # everything through "@DATA" plus the single line that follows it
+    print(subsample[-3:])
+    fd, tmp = tempfile.mkstemp(".arff")
+    try:
+        with os.fdopen(fd, "w") as output:  # reuse mkstemp's already-open descriptor so it gets closed, not leaked
+            output.write(''.join(subsample))

+        train = h2o.import_file(pyunit_utils.locate("smalldata/junit/arff/iris.arff"))
+        test = h2o.import_file(tmp)

+        print(f"{train.columns} == {test.columns}: {train.columns == test.columns}")
+        assert train.columns == test.columns
+    finally:
+        os.unlink(tmp)  # always remove the temporary file, even when the import or the assertion fails
+
+
+if __name__ == "__main__":  # executed as a script: pass the test function to pyunit_utils.standalone_test
+    pyunit_utils.standalone_test(test_single_entry_arff_file)
+else:  # imported as a module: call the test function directly at import time
+    test_single_entry_arff_file()