From a568de8da124d2385b2f593d8b492d533664857d Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 20 May 2024 11:54:26 +0800 Subject: [PATCH 1/3] init Signed-off-by: Weichen Xu --- nlu/utils/environment/env_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/nlu/utils/environment/env_utils.py b/nlu/utils/environment/env_utils.py index 0540226e..06e6c49d 100644 --- a/nlu/utils/environment/env_utils.py +++ b/nlu/utils/environment/env_utils.py @@ -86,10 +86,14 @@ def try_import_streamlit(): def is_running_in_databricks(): """ Check if the currently running Python Process is running in Databricks or not - If any Environment Variable name contains 'DATABRICKS' this will return True, otherwise False""" - for k in os.environ.keys(): - if 'DATABRICKS' in k: - return True + """ + if "IS_IN_DATABRICKS_MODEL_SERVING_ENV" in os.environ: + # Serving container installs apache/spark, not databricks runtime. + return False + + if "DATABRICKS_RUNTIME_VERSION" in os.environ: + return True + return False From 780ca5990731ad6734a6131c3dac3e14ec8eedb3 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 21 May 2024 17:24:52 +0800 Subject: [PATCH 2/3] update Signed-off-by: Weichen Xu --- nlu/__init__.py | 2 +- nlu/pipe/pipeline.py | 4 ++-- nlu/pipe/utils/audio_data_conversion_utils.py | 12 ++++++------ nlu/pipe/utils/ocr_data_conversion_utils.py | 12 ++++++------ nlu/pipe/utils/pipe_utils.py | 8 ++++---- nlu/utils/environment/env_utils.py | 13 +++---------- 6 files changed, 22 insertions(+), 29 deletions(-) diff --git a/nlu/__init__.py b/nlu/__init__.py index 27876f47..10541a8f 100644 --- a/nlu/__init__.py +++ b/nlu/__init__.py @@ -325,7 +325,7 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline: If it is a component_list, load the component_list and return it. 
If it is a singular model_anno_obj, load it to the correct AnnotatorClass and NLU component_to_resolve and then generate pipeline for it """ - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): return load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) pipe = NLUPipeline() pipe.nlu_ref = request diff --git a/nlu/pipe/pipeline.py b/nlu/pipe/pipeline.py index b00f95a8..e6a1f3af 100644 --- a/nlu/pipe/pipeline.py +++ b/nlu/pipe/pipeline.py @@ -18,7 +18,7 @@ from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils from nlu.universe.feature_node_ids import NLP_NODE_IDS from nlu.universe.universes import Licenses -from nlu.utils.environment.env_utils import is_running_in_databricks, try_import_streamlit +from nlu.utils.environment.env_utils import is_running_in_databricks_runtime, try_import_streamlit logger = logging.getLogger('nlu') @@ -608,7 +608,7 @@ def viz(self, text_to_viz: str, viz_type='', labels_to_viz=None, viz_colors={}, from nlu.utils.environment.env_utils import install_and_import_package install_and_import_package('spark-nlp-display', import_name='sparknlp_display') if self.vanilla_transformer_pipe is None: self.fit() - is_databricks_env = is_running_in_databricks() + is_databricks_env = is_running_in_databricks_runtime() if return_html: is_databricks_env = True # self.configure_light_pipe_usage(1, force=True) from nlu.pipe.viz.vis_utils import VizUtils diff --git a/nlu/pipe/utils/audio_data_conversion_utils.py b/nlu/pipe/utils/audio_data_conversion_utils.py index afdf0777..3f692f09 100644 --- a/nlu/pipe/utils/audio_data_conversion_utils.py +++ b/nlu/pipe/utils/audio_data_conversion_utils.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd import pyspark -from johnsnowlabs.utils.env_utils import is_running_in_databricks +from johnsnowlabs.utils.env_utils import is_running_in_databricks_runtime from pyspark.sql.types import * logger = logging.getLogger('nlu') @@ -34,7 +34,7 @@ def validate_paths(data): @staticmethod def check_iterable_paths_are_valid(iterable_paths): """Validate for iterable data input if all elements point to file or jsl_folder""" - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): iterable_paths = [f'/dbfs{p}' for p in iterable_paths] paths_validness = [] for p in iterable_paths: @@ -86,18 +86,18 @@ def glob_files_of_accepted_type(paths, file_types): 1. paths point to a file which is suffixed with one of the accepted file_types, i.e. path/to/file.type 2. 
path points to a jsl_folder, in this case jsl_folder is recursively searched for valid files and accepted paths will be in return result """ - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): paths = [f'/dbfs{p}' for p in paths] accepted_file_paths = [] for p in paths: for t in file_types: t = t.lower() - if os.path.isfile(p) or is_running_in_databricks() and os.path.isfile(f'/dbfs{p}'): + if os.path.isfile(p) or is_running_in_databricks_runtime() and os.path.isfile(f'/dbfs{p}'): if p.lower().split('.')[-1] == t: - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): p = p.replace('/dbfs', '', 1) accepted_file_paths.append(p) - elif os.path.isdir(p) or is_running_in_databricks() and os.path.isdir(f'/dbfs{p}'): + elif os.path.isdir(p) or is_running_in_databricks_runtime() and os.path.isdir(f'/dbfs{p}'): accepted_file_paths += glob.glob(p + f'/**/*.{t}', recursive=True) else: print(f"Invalid path = {p} pointing neither to file or jsl_folder on this machine") diff --git a/nlu/pipe/utils/ocr_data_conversion_utils.py b/nlu/pipe/utils/ocr_data_conversion_utils.py index 44b4fd2a..b2aed5e5 100644 --- a/nlu/pipe/utils/ocr_data_conversion_utils.py +++ b/nlu/pipe/utils/ocr_data_conversion_utils.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd import pyspark -from johnsnowlabs.utils.env_utils import is_running_in_databricks +from johnsnowlabs.utils.env_utils import is_running_in_databricks_runtime logger = logging.getLogger('nlu') @@ -32,7 +32,7 @@ def validate_OCR_compatible_inputs(data): @staticmethod def check_iterable_paths_are_valid(iterable_paths): """Validate for iterable data input if all elements point to file or jsl_folder""" - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): iterable_paths = [f'/dbfs{p}' for p in iterable_paths] paths_validness = [] for p in iterable_paths: @@ -58,18 +58,18 @@ def glob_files_of_accepted_type(paths, file_types): 2. 
path points to a jsl_folder, in this case jsl_folder is recurisvely searched for valid files and accepted paths will be in return result """ - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): paths = [f'/dbfs{p}' for p in paths] accepted_file_paths = [] for p in paths: for t in file_types: t = t.lower() - if os.path.isfile(p) or is_running_in_databricks() and os.path.isfile(f'/dbfs{p}'): + if os.path.isfile(p) or is_running_in_databricks_runtime() and os.path.isfile(f'/dbfs{p}'): if p.lower().split('.')[-1] == t: - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): p = p.replace('/dbfs', '', 1) accepted_file_paths.append(p) - elif os.path.isdir(p) or is_running_in_databricks() and os.path.isdir(f'/dbfs{p}'): + elif os.path.isdir(p) or is_running_in_databricks_runtime() and os.path.isdir(f'/dbfs{p}'): accepted_file_paths += glob.glob(p + f'/*.{t.upper()}', recursive=True) + glob.glob(p + f'/*.{t}', recursive=True) else: diff --git a/nlu/pipe/utils/pipe_utils.py b/nlu/pipe/utils/pipe_utils.py index 9c80913a..43b7f161 100644 --- a/nlu/pipe/utils/pipe_utils.py +++ b/nlu/pipe/utils/pipe_utils.py @@ -18,7 +18,7 @@ from nlu.pipe.utils.component_utils import ComponentUtils from typing import List, Union, Dict from nlu.universe.annotator_class_universe import AnnoClassRef -from nlu.utils.environment.env_utils import is_running_in_databricks +from nlu.utils.environment.env_utils import is_running_in_databricks_runtime import os import glob import json @@ -140,12 +140,12 @@ def set_column_values_on_components_from_pretrained_pipe_from_disk_data(componen pipe_path = glob.glob(f'{pipe_path}*') if len(pipe_path) == 0: # try databricks env path - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): pipe_path = [f'dbfs:/root/cache_pretrained/{nlp_ref}_{lang}'] else: raise FileNotFoundError(f"Could not find downloaded Pipeline at path={pipe_path}") pipe_path = pipe_path[0] - if not os.path.exists(pipe_path) and not is_running_in_databricks(): + if not os.path.exists(pipe_path) and not is_running_in_databricks_runtime(): raise FileNotFoundError(f"Could not find downloaded Pipeline at path={pipe_path}") # Find HDD location of component_list and read out input/output cols @@ -155,7 +155,7 @@ def set_column_values_on_components_from_pretrained_pipe_from_disk_data(componen for c in component_list: model_name = c.model.uid.split('_')[0] - if is_running_in_databricks(): + if is_running_in_databricks_runtime(): data = PipeUtils.get_json_data_for_pipe_model_at_stage_number_on_databricks(nlp_ref, lang, digit_str) else: data = PipeUtils.get_json_data_for_pipe_model_at_stage_number(pipe_path, digit_str) diff --git a/nlu/utils/environment/env_utils.py b/nlu/utils/environment/env_utils.py index 06e6c49d..955fa217 100644 --- a/nlu/utils/environment/env_utils.py +++ b/nlu/utils/environment/env_utils.py @@ -84,17 +84,10 @@ def try_import_streamlit(): print("You need to install Streamlit to run this functionality.") -def is_running_in_databricks(): - """ Check if the currently running Python Process is running in Databricks or not +def is_running_in_databricks_runtime(): + """ Check if the currently running Python Process is running in Databricks runtime or not """ - if "IS_IN_DATABRICKS_MODEL_SERVING_ENV" in os.environ: - # Serving container installs apache/spark, not databricks runtime. 
- return False - - if "DATABRICKS_RUNTIME_VERSION" in os.environ: - return True - - return False + return "DATABRICKS_RUNTIME_VERSION" in os.environ def install_and_import_package(pkg_name, version='', import_name=''): From 8564e7afec5857d9de7af4d1772aba1977eb6c2b Mon Sep 17 00:00:00 2001 From: C-K-Loan Date: Wed, 22 May 2024 00:39:03 +0200 Subject: [PATCH 3/3] version bump --- nlu/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nlu/__init__.py b/nlu/__init__.py index 33d289d0..a905b751 100644 --- a/nlu/__init__.py +++ b/nlu/__init__.py @@ -1,4 +1,4 @@ -__version__ = '5.3.1' +__version__ = '5.3.2' import nlu.utils.environment.env_utils as env_utils
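
Editor's note: below is a minimal, self-contained sketch of the environment check these patches converge on, reproduced outside the diffs for clarity. The `to_dbfs_local_path` helper is hypothetical and only illustrates the '/dbfs' prefixing pattern the callers in nlu/pipe/utils follow; it is not part of the patch series, and the comment about model serving containers restates the assumption made in PATCH 1/3.

    import os

    def is_running_in_databricks_runtime():
        """True only when running on a Databricks runtime cluster.

        The patch series assumes Databricks Model Serving containers run plain
        Apache Spark rather than the Databricks runtime, so they do not set
        DATABRICKS_RUNTIME_VERSION and this check returns False there. This
        replaces the old heuristic of scanning every environment variable name
        for the substring 'DATABRICKS'.
        """
        return "DATABRICKS_RUNTIME_VERSION" in os.environ

    def to_dbfs_local_path(path):
        # Hypothetical helper: callers in the patches prepend '/dbfs' to
        # user-supplied paths only when on a Databricks runtime cluster.
        return f'/dbfs{path}' if is_running_in_databricks_runtime() else path

    if __name__ == '__main__':
        print(is_running_in_databricks_runtime())
        print(to_dbfs_local_path('/root/cache_pretrained/my_pipe_en'))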