From a568de8da124d2385b2f593d8b492d533664857d Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 20 May 2024 11:54:26 +0800 Subject: [PATCH 1/3] init Signed-off-by: Weichen Xu --- nlu/utils/environment/env_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/nlu/utils/environment/env_utils.py b/nlu/utils/environment/env_utils.py index 0540226e..06e6c49d 100644 --- a/nlu/utils/environment/env_utils.py +++ b/nlu/utils/environment/env_utils.py @@ -86,10 +86,14 @@ def try_import_streamlit(): def is_running_in_databricks(): """ Check if the currently running Python Process is running in Databricks or not - If any Environment Variable name contains 'DATABRICKS' this will return True, otherwise False""" - for k in os.environ.keys(): - if 'DATABRICKS' in k: - return True + """ + if "IS_IN_DATABRICKS_MODEL_SERVING_ENV" in os.environ: + # Serving container installs apache/spark, not databricks runtime. + return False + + if "DATABRICKS_RUNTIME_VERSION" in os.environ: + return True + return False From 780ca5990731ad6734a6131c3dac3e14ec8eedb3 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 21 May 2024 17:24:52 +0800 Subject: [PATCH 2/3] update Signed-off-by: Weichen Xu --- nlu/__init__.py | 2 +- nlu/pipe/pipeline.py | 4 ++-- nlu/pipe/utils/audio_data_conversion_utils.py | 12 ++++++------ nlu/pipe/utils/ocr_data_conversion_utils.py | 12 ++++++------ nlu/pipe/utils/pipe_utils.py | 8 ++++---- nlu/utils/environment/env_utils.py | 13 +++---------- 6 files changed, 22 insertions(+), 29 deletions(-) diff --git a/nlu/__init__.py b/nlu/__init__.py index 27876f47..10541a8f 100644 --- a/nlu/__init__.py +++ b/nlu/__init__.py @@ -325,7 +325,7 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline: If it is a component_list, load the component_list and return it. 
If it is a singular model_anno_obj, load it to the correct AnnotatorClass and NLU component_to_resolve and then generate pipeline for it """ - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): return load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) pipe = NLUPipeline() pipe.nlu_ref = request diff --git a/nlu/pipe/pipeline.py b/nlu/pipe/pipeline.py index b00f95a8..e6a1f3af 100644 --- a/nlu/pipe/pipeline.py +++ b/nlu/pipe/pipeline.py @@ -18,7 +18,7 @@ from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils from nlu.universe.feature_node_ids import NLP_NODE_IDS from nlu.universe.universes import Licenses -from nlu.utils.environment.env_utils import is_running_in_databricks, try_import_streamlit +from nlu.utils.environment.env_utils import is_running_in_databricks_runtime, try_import_streamlit logger = logging.getLogger('nlu') @@ -608,7 +608,7 @@ def viz(self, text_to_viz: str, viz_type='', labels_to_viz=None, viz_colors={}, from nlu.utils.environment.env_utils import install_and_import_package install_and_import_package('spark-nlp-display', import_name='sparknlp_display') if self.vanilla_transformer_pipe is None: self.fit() - is_databricks_env = is_running_in_databricks() + is_databricks_env = is_running_in_databricks_runtime() if return_html: is_databricks_env = True # self.configure_light_pipe_usage(1, force=True) from nlu.pipe.viz.vis_utils import VizUtils diff --git a/nlu/pipe/utils/audio_data_conversion_utils.py b/nlu/pipe/utils/audio_data_conversion_utils.py index afdf0777..3f692f09 100644 --- a/nlu/pipe/utils/audio_data_conversion_utils.py +++ b/nlu/pipe/utils/audio_data_conversion_utils.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd import pyspark -from johnsnowlabs.utils.env_utils import is_running_in_databricks +from johnsnowlabs.utils.env_utils import is_running_in_databricks_runtime from pyspark.sql.types import * logger = logging.getLogger('nlu') @@ -34,7 +34,7 @@ def validate_paths(data): @staticmethod def check_iterable_paths_are_valid(iterable_paths): """Validate for iterable data input if all elements point to file or jsl_folder""" - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): iterable_paths = [f'/dbfs{p}' for p in iterable_paths] paths_validness = [] for p in iterable_paths: @@ -86,18 +86,18 @@ def glob_files_of_accepted_type(paths, file_types): 1. paths point to a file which is suffixed with one of the accepted file_types, i.e. path/to/file.type 2. 
path points to a jsl_folder, in this case jsl_folder is recursively searched for valid files and accepted paths will be in return result """ - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): paths = [f'/dbfs{p}' for p in paths] accepted_file_paths = [] for p in paths: for t in file_types: t = t.lower() - if os.path.isfile(p) or is_running_in_databricks() and os.path.isfile(f'/dbfs{p}'): + if os.path.isfile(p) or is_running_in_databricks_runtime() and os.path.isfile(f'/dbfs{p}'): if p.lower().split('.')[-1] == t: - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): p = p.replace('/dbfs', '', 1) accepted_file_paths.append(p) - elif os.path.isdir(p) or is_running_in_databricks() and os.path.isdir(f'/dbfs{p}'): + elif os.path.isdir(p) or is_running_in_databricks_runtime() and os.path.isdir(f'/dbfs{p}'): accepted_file_paths += glob.glob(p + f'/**/*.{t}', recursive=True) else: print(f"Invalid path = {p} pointing neither to file or jsl_folder on this machine") diff --git a/nlu/pipe/utils/ocr_data_conversion_utils.py b/nlu/pipe/utils/ocr_data_conversion_utils.py index 44b4fd2a..b2aed5e5 100644 --- a/nlu/pipe/utils/ocr_data_conversion_utils.py +++ b/nlu/pipe/utils/ocr_data_conversion_utils.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd import pyspark -from johnsnowlabs.utils.env_utils import is_running_in_databricks +from johnsnowlabs.utils.env_utils import is_running_in_databricks_runtime logger = logging.getLogger('nlu') @@ -32,7 +32,7 @@ def validate_OCR_compatible_inputs(data): @staticmethod def check_iterable_paths_are_valid(iterable_paths): """Validate for iterable data input if all elements point to file or jsl_folder""" - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): iterable_paths = [f'/dbfs{p}' for p in iterable_paths] paths_validness = [] for p in iterable_paths: @@ -58,18 +58,18 @@ def glob_files_of_accepted_type(paths, file_types): 2. 
path points to a jsl_folder, in this case jsl_folder is recurisvely searched for valid files and accepted paths will be in return result """ - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): paths = [f'/dbfs{p}' for p in paths] accepted_file_paths = [] for p in paths: for t in file_types: t = t.lower() - if os.path.isfile(p) or is_running_in_databricks() and os.path.isfile(f'/dbfs{p}'): + if os.path.isfile(p) or is_running_in_databricks_runtime() and os.path.isfile(f'/dbfs{p}'): if p.lower().split('.')[-1] == t: - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): p = p.replace('/dbfs', '', 1) accepted_file_paths.append(p) - elif os.path.isdir(p) or is_running_in_databricks() and os.path.isdir(f'/dbfs{p}'): + elif os.path.isdir(p) or is_running_in_databricks_runtime() and os.path.isdir(f'/dbfs{p}'): accepted_file_paths += glob.glob(p + f'/*.{t.upper()}', recursive=True) + glob.glob(p + f'/*.{t}', recursive=True) else: diff --git a/nlu/pipe/utils/pipe_utils.py b/nlu/pipe/utils/pipe_utils.py index 9c80913a..43b7f161 100644 --- a/nlu/pipe/utils/pipe_utils.py +++ b/nlu/pipe/utils/pipe_utils.py @@ -18,7 +18,7 @@ from nlu.pipe.utils.component_utils import ComponentUtils from typing import List, Union, Dict from nlu.universe.annotator_class_universe import AnnoClassRef -from nlu.utils.environment.env_utils import is_running_in_databricks +from nlu.utils.environment.env_utils import is_running_in_databricks_runtime import os import glob import json @@ -140,12 +140,12 @@ def set_column_values_on_components_from_pretrained_pipe_from_disk_data(componen pipe_path = glob.glob(f'{pipe_path}*') if len(pipe_path) == 0: # try databricks env path - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): pipe_path = [f'dbfs:/root/cache_pretrained/{nlp_ref}_{lang}'] else: raise FileNotFoundError(f"Could not find downloaded Pipeline at path={pipe_path}") pipe_path = pipe_path[0] - if not os.path.exists(pipe_path) and not is_running_in_databricks(): + if not os.path.exists(pipe_path) and not is_running_in_databricks_runtime(): raise FileNotFoundError(f"Could not find downloaded Pipeline at path={pipe_path}") # Find HDD location of component_list and read out input/output cols @@ -155,7 +155,7 @@ def set_column_values_on_components_from_pretrained_pipe_from_disk_data(componen for c in component_list: model_name = c.model.uid.split('_')[0] - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): data = PipeUtils.get_json_data_for_pipe_model_at_stage_number_on_databricks(nlp_ref, lang, digit_str) else: data = PipeUtils.get_json_data_for_pipe_model_at_stage_number(pipe_path, digit_str) diff --git a/nlu/utils/environment/env_utils.py b/nlu/utils/environment/env_utils.py index 06e6c49d..955fa217 100644 --- a/nlu/utils/environment/env_utils.py +++ b/nlu/utils/environment/env_utils.py @@ -84,17 +84,10 @@ def try_import_streamlit(): print("You need to install Streamlit to run this functionality.") -def is_running_in_databricks(): - """ Check if the currently running Python Process is running in Databricks or not +def is_running_in_databricks_runtime(): + """ Check if the currently running Python Process is running in Databricks runtime or not """ - if "IS_IN_DATABRICKS_MODEL_SERVING_ENV" in os.environ: - # Serving container installs apache/spark, not databricks runtime. 
- return False - - if "DATABRICKS_RUNTIME_VERSION" in os.environ: - return True - - return False + return "DATABRICKS_RUNTIME_VERSION" in os.environ def install_and_import_package(pkg_name, version='', import_name=''): From 8564e7afec5857d9de7af4d1772aba1977eb6c2b Mon Sep 17 00:00:00 2001 From: C-K-Loan Date: Wed, 22 May 2024 00:39:03 +0200 Subject: [PATCH 3/3] version bump --- nlu/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nlu/__init__.py b/nlu/__init__.py index 33d289d0..a905b751 100644 --- a/nlu/__init__.py +++ b/nlu/__init__.py @@ -1,4 +1,4 @@ -__version__ = '5.3.1' +__version__ = '5.3.2' import nlu.utils.environment.env_utils as env_utils
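
Editor's note: below is a minimal, self-contained sketch of the environment check these patches converge on, reproduced outside the diffs for clarity. The `to_dbfs_local_path` helper is hypothetical and only illustrates the '/dbfs' prefixing pattern the callers in nlu/pipe/utils follow; it is not part of the patch series, and the comment about model serving containers restates the assumption made in PATCH 1/3.

    import os

    def is_running_in_databricks_runtime():
        """True only when running on a Databricks runtime cluster.

        The patch series assumes Databricks Model Serving containers run plain
        Apache Spark rather than the Databricks runtime, so they do not set
        DATABRICKS_RUNTIME_VERSION and this check returns False there. This
        replaces the old heuristic of scanning every environment variable name
        for the substring 'DATABRICKS'.
        """
        return "DATABRICKS_RUNTIME_VERSION" in os.environ

    def to_dbfs_local_path(path):
        # Hypothetical helper: callers in the patches prepend '/dbfs' to
        # user-supplied paths only when on a Databricks runtime cluster.
        return f'/dbfs{path}' if is_running_in_databricks_runtime() else path

    if __name__ == '__main__':
        print(is_running_in_databricks_runtime())
        print(to_dbfs_local_path('/root/cache_pretrained/my_pipe_en'))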