From 72ddc43f2a33e5de89cb9f5efe9a0eac9bf7fc9d Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Tue, 14 Jun 2022 13:53:26 +0200 Subject: [PATCH 01/18] introduce specific fast fetching behaviour for snowflake connector --- src/datajudge/db_access.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index a766f8f1..b7a2d14a 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -648,7 +648,15 @@ def get_column( if not aggregate_operator: selection = sa.select([column]) - result = engine.connect().execute(selection).scalars().all() + + if is_snowflake(engine): # check if we have a snowflake cursor + snowflake_cursor = engine.connect().connection.cursor() + + # note: this step requires pandas to be installed + result = snowflake_cursor.execute(str(selection)).fetch_pandas_all().values.ravel() + + else: + result = engine.connect().execute(selection).scalars().all() else: selection = sa.select([aggregate_operator(column)]) From 3d9194f448a5d0b37c3496bc2aea679d425e0157 Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Tue, 14 Jun 2022 14:07:29 +0200 Subject: [PATCH 02/18] formatting --- src/datajudge/db_access.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index b7a2d14a..862d8cde 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -653,7 +653,11 @@ def get_column( snowflake_cursor = engine.connect().connection.cursor() # note: this step requires pandas to be installed - result = snowflake_cursor.execute(str(selection)).fetch_pandas_all().values.ravel() + result = ( + snowflake_cursor.execute(str(selection)) + .fetch_pandas_all() + .values.ravel() + ) else: result = engine.connect().execute(selection).scalars().all() From 7788516ebfb031f1a3ebc14244a27060d290d341 Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Mon, 20 Jun 2022 21:07:00 +0200 Subject: [PATCH 03/18] 
convert to numpy array --- src/datajudge/db_access.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index 862d8cde..4c806bde 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -9,6 +9,7 @@ from typing import Callable, Sequence, final, overload import sqlalchemy as sa +from snowflake.connector.cursor import SnowflakeCursor from sqlalchemy.sql.expression import FromClause @@ -650,14 +651,12 @@ def get_column( selection = sa.select([column]) if is_snowflake(engine): # check if we have a snowflake cursor - snowflake_cursor = engine.connect().connection.cursor() + snowflake_cursor: SnowflakeCursor = engine.connect().connection.cursor() # note: this step requires pandas to be installed - result = ( - snowflake_cursor.execute(str(selection)) - .fetch_pandas_all() - .values.ravel() - ) + pa_table = snowflake_cursor.execute(str(selection)).fetch_arrow_all() + if pa_table: # snowflake connector returns NoneType when the table is empty + result = pa_table.column(0).to_numpy() else: result = engine.connect().execute(selection).scalars().all() From 10cadb54cb608abe8ce7f3122ced9aef05146a57 Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Mon, 20 Jun 2022 21:07:42 +0200 Subject: [PATCH 04/18] return empty list as result, otherwise --- src/datajudge/db_access.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index 4c806bde..b6c415ea 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -657,6 +657,8 @@ def get_column( pa_table = snowflake_cursor.execute(str(selection)).fetch_arrow_all() if pa_table: # snowflake connector returns NoneType when the table is empty result = pa_table.column(0).to_numpy() + else: + result = [] else: result = engine.connect().execute(selection).scalars().all() From e64c943ef87f49151f5757e98d365062462d50b2 Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Mon, 20 
Jun 2022 21:48:04 +0200 Subject: [PATCH 05/18] update comments --- src/datajudge/db_access.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index b6c415ea..573c6a9a 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -650,7 +650,8 @@ def get_column( if not aggregate_operator: selection = sa.select([column]) - if is_snowflake(engine): # check if we have a snowflake cursor + # snowflake-specific optimization + if is_snowflake(engine): snowflake_cursor: SnowflakeCursor = engine.connect().connection.cursor() # note: this step requires pandas to be installed From 6884f1a4c0fb12ef470059e3c47ed9b7c314f2df Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Tue, 21 Jun 2022 21:04:20 +0200 Subject: [PATCH 06/18] update comment --- src/datajudge/db_access.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index 573c6a9a..425163cf 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -654,7 +654,7 @@ def get_column( if is_snowflake(engine): snowflake_cursor: SnowflakeCursor = engine.connect().connection.cursor() - # note: this step requires pandas to be installed + # note: in addition to pyarrow, this currently requires pandas as well pa_table = snowflake_cursor.execute(str(selection)).fetch_arrow_all() if pa_table: # snowflake connector returns NoneType when the table is empty result = pa_table.column(0).to_numpy() From 56b0b12d50d2302455f961e768a1f9d287f5877a Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Tue, 21 Jun 2022 21:05:53 +0200 Subject: [PATCH 07/18] remove snowflake cursor type hint and therefore (guarded) import --- src/datajudge/db_access.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index 425163cf..514f6455 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -9,7 
+9,6 @@ from typing import Callable, Sequence, final, overload import sqlalchemy as sa -from snowflake.connector.cursor import SnowflakeCursor from sqlalchemy.sql.expression import FromClause @@ -652,7 +651,7 @@ def get_column( # snowflake-specific optimization if is_snowflake(engine): - snowflake_cursor: SnowflakeCursor = engine.connect().connection.cursor() + snowflake_cursor = engine.connect().connection.cursor() # note: in addition to pyarrow, this currently requires pandas as well pa_table = snowflake_cursor.execute(str(selection)).fetch_arrow_all() From 13c39c132a5b42c00da5f80bc0f3744246288e8b Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Tue, 21 Jun 2022 21:11:17 +0200 Subject: [PATCH 08/18] add pandas as (temporary) dependency --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 168d7c83..74816782 100644 --- a/environment.yml +++ b/environment.yml @@ -3,6 +3,7 @@ channels: - conda-forge - nodefaults dependencies: + - pandas - python>=3.8 - pytest - pytest-cov From 14b8023b2de1c075bc19ff8556f517c163f47b9d Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Thu, 23 Jun 2022 11:22:24 +0200 Subject: [PATCH 09/18] check pandas availability to prevent segmentation fault --- src/datajudge/db_access.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index 514f6455..f6d09eb8 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -1,6 +1,7 @@ from __future__ import annotations import functools +import importlib import json import operator from abc import ABC, abstractmethod @@ -11,6 +12,11 @@ import sqlalchemy as sa from sqlalchemy.sql.expression import FromClause +try: + pandas_available = importlib.import_module("pandas") is not None +except ModuleNotFoundError: + pandas_available = False + def is_mssql(engine: sa.engine.Engine) -> bool: return engine.name == "mssql" @@ -649,8 +655,8 @@ def get_column( if 
not aggregate_operator: selection = sa.select([column]) - # snowflake-specific optimization - if is_snowflake(engine): + # snowflake-specific optimization iff pandas is installed additionally + if is_snowflake(engine) and pandas_available: snowflake_cursor = engine.connect().connection.cursor() # note: in addition to pyarrow, this currently requires pandas as well From 443e8f37daff2fa07d8dbaaa2805cfd69fec4b1a Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Thu, 23 Jun 2022 11:29:29 +0200 Subject: [PATCH 10/18] single time warning message for snowflake users --- src/datajudge/db_access.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index f6d09eb8..a8ea82d3 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -16,6 +16,9 @@ pandas_available = importlib.import_module("pandas") is not None except ModuleNotFoundError: pandas_available = False + print( + "For snowflake users: `pandas` is not installed, that means optimized data loading is not available." 
+ ) def is_mssql(engine: sa.engine.Engine) -> bool: @@ -655,7 +658,7 @@ def get_column( if not aggregate_operator: selection = sa.select([column]) - # snowflake-specific optimization iff pandas is installed additionally + # snowflake-specific optimization iff pandas is installed if is_snowflake(engine) and pandas_available: snowflake_cursor = engine.connect().connection.cursor() From fd30f695194132f54b3717b560faa255d2cef09b Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Thu, 23 Jun 2022 13:42:51 +0200 Subject: [PATCH 11/18] print message only for snowflake users --- src/datajudge/db_access.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index a8ea82d3..25d4772e 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -13,12 +13,15 @@ from sqlalchemy.sql.expression import FromClause try: + snowflake_available = importlib.import_module("snowflake") is not None pandas_available = importlib.import_module("pandas") is not None -except ModuleNotFoundError: - pandas_available = False - print( - "For snowflake users: `pandas` is not installed, that means optimized data loading is not available." - ) +except ModuleNotFoundError as err: # ex.: 'snowflake' not found + snowflake_available = "snowflake" not in str(err) + pandas_available = "pandas" not in str(err) + if snowflake_available and not pandas_available: + print( + "For snowflake users: `pandas` is not installed, that means optimized data loading is not available." 
+ ) def is_mssql(engine: sa.engine.Engine) -> bool: From 985578d65363e9020384adff5f74c3f9230dcff8 Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Thu, 23 Jun 2022 17:36:58 +0200 Subject: [PATCH 12/18] simplify import statements (flake errors for unused import statements) --- src/datajudge/db_access.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index 25d4772e..03c57b77 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -1,7 +1,6 @@ from __future__ import annotations import functools -import importlib import json import operator from abc import ABC, abstractmethod @@ -13,8 +12,12 @@ from sqlalchemy.sql.expression import FromClause try: - snowflake_available = importlib.import_module("snowflake") is not None - pandas_available = importlib.import_module("pandas") is not None + import snowflake + + snowflake_available = True + import pandas + + pandas_available = True except ModuleNotFoundError as err: # ex.: 'snowflake' not found snowflake_available = "snowflake" not in str(err) pandas_available = "pandas" not in str(err) From c4e9be93ccb70eb19ecfd965563c83397cf64b6f Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Fri, 24 Jun 2022 09:23:04 +0200 Subject: [PATCH 13/18] simplify import statements --- src/datajudge/db_access.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index 03c57b77..04c81287 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -3,6 +3,7 @@ import functools import json import operator +import warnings from abc import ABC, abstractmethod from collections import Counter from dataclasses import dataclass @@ -12,19 +13,24 @@ from sqlalchemy.sql.expression import FromClause try: - import snowflake + import snowflake # noqa snowflake_available = True - import pandas +except ModuleNotFoundError: + snowflake_available = 
False + +try: + import pandas # noqa pandas_available = True -except ModuleNotFoundError as err: # ex.: 'snowflake' not found - snowflake_available = "snowflake" not in str(err) - pandas_available = "pandas" not in str(err) - if snowflake_available and not pandas_available: - print( - "For snowflake users: `pandas` is not installed, that means optimized data loading is not available." - ) +except ModuleNotFoundError: + pandas_available = False + + +if snowflake_available and not pandas_available: + warnings.warn( + "For snowflake users: `pandas` is not installed, that means optimized data loading is not available." + ) def is_mssql(engine: sa.engine.Engine) -> bool: From a7480774e93fe8b0a8e130591a41119a3831a9e2 Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Wed, 29 Jun 2022 17:08:37 +0200 Subject: [PATCH 14/18] clean up check for module import --- src/datajudge/db_access.py | 16 +++------------- src/datajudge/utils.py | 8 ++++++++ 2 files changed, 11 insertions(+), 13 deletions(-) create mode 100644 src/datajudge/utils.py diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index 04c81287..be4aeff8 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -11,20 +11,10 @@ import sqlalchemy as sa from sqlalchemy.sql.expression import FromClause +from utils import check_module_installed -try: - import snowflake # noqa - - snowflake_available = True -except ModuleNotFoundError: - snowflake_available = False - -try: - import pandas # noqa - - pandas_available = True -except ModuleNotFoundError: - pandas_available = False +snowflake_available = check_module_installed("snowflake") +pandas_available = check_module_installed("pandas") if snowflake_available and not pandas_available: diff --git a/src/datajudge/utils.py b/src/datajudge/utils.py new file mode 100644 index 00000000..3202a3f1 --- /dev/null +++ b/src/datajudge/utils.py @@ -0,0 +1,8 @@ +def check_module_installed(module_name: str) -> bool: + import importlib + + try: + mod = 
importlib.import_module(module_name) + return mod is not None + except ModuleNotFoundError: + return False From 764464cfcc297663469a938f5cd2675204105953 Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Wed, 29 Jun 2022 17:09:49 +0200 Subject: [PATCH 15/18] fix import --- src/datajudge/db_access.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index be4aeff8..9346c3f8 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -11,7 +11,8 @@ import sqlalchemy as sa from sqlalchemy.sql.expression import FromClause -from utils import check_module_installed + +from .utils import check_module_installed snowflake_available = check_module_installed("snowflake") pandas_available = check_module_installed("pandas") From fa1de3fe4a747ce36f0a0b932aa4d0b9f6f252ca Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Wed, 29 Jun 2022 20:45:35 +0200 Subject: [PATCH 16/18] add snowflake comment --- docs/source/installation.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 7311cf59..4a341318 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -12,3 +12,14 @@ or from a conda environment :: conda install datajudge -c conda-forge + + + +Snowflake +^^^^^^^^^ + +If your backend is ``snowflake`` and you are querying large datasets, +you can additionally install ``pandas`` to make use of very fast query loading +(up to 50x speedup for large datasets). + Note: The ``pandas`` requirement is a bug in the snowflake-python-connector + and will not be needed in the future. 
From 08fb1141a7c6b1e41fde65e0143eb211630dec0e Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Thu, 30 Jun 2022 10:38:52 +0200 Subject: [PATCH 17/18] easier import check --- src/datajudge/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datajudge/utils.py b/src/datajudge/utils.py index 3202a3f1..b9da85c8 100644 --- a/src/datajudge/utils.py +++ b/src/datajudge/utils.py @@ -2,7 +2,7 @@ def check_module_installed(module_name: str) -> bool: import importlib try: - mod = importlib.import_module(module_name) - return mod is not None + importlib.import_module(module_name) + return True except ModuleNotFoundError: return False From 9dafd5f35c17b60b6860d69dff3a18ae8f255c03 Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Thu, 30 Jun 2022 10:39:13 +0200 Subject: [PATCH 18/18] Update docs/source/installation.rst Co-authored-by: Ignacio Vergara Kausel --- docs/source/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 4a341318..991a80c5 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -22,4 +22,4 @@ If your backend is ``snowflake`` and you are querying large datasets, you can additionally install ``pandas`` to make use of very fast query loading (up to 50x speedup for large datasets). Note: The ``pandas`` requirement is a bug in the snowflake-python-connector - and will not be needed in the future. + and, hopefully, it will not be needed in the future.