Update Sqlglot and fix regression #26

Merged · 4 commits · Aug 5, 2024
10 changes: 10 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,10 @@
{
"python.analysis.extraPaths": [
"./defog_utils"
],
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
}
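
For readers unfamiliar with these keys: `python.analysis.extraPaths` points Pylance at the local `defog_utils` package, and the testing keys switch VS Code's test discovery to pytest over `tests/`. A hedged sketch of the roughly equivalent invocation outside the editor (illustrative, not part of the PR):

```python
# Roughly what the VS Code pytest settings above amount to: run pytest over tests/.
# Illustrative only; the PR itself does not add this script.
import pytest

raise SystemExit(pytest.main(["tests"]))
```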
10 changes: 6 additions & 4 deletions defog_utils/utils_db.py
@@ -234,7 +234,7 @@ def mk_delete_ddl(md: Dict[str, Any]) -> str:
# check if the contents are a dictionary of tables or a list of tables
is_schema = isinstance(contents, Dict)
break

if is_schema:
md_delete = ""
for schema, tables in md.items():
@@ -288,7 +288,9 @@ def fix_md(md: Dict[str, List[Dict[str, str]]]) -> Dict[str, List[Dict[str, str]
return md_new


def test_valid_md_sql(sql: str, md: dict, creds: Dict = None, conn = None, verbose: bool = False):
def test_valid_md_sql(
sql: str, md: dict, creds: Dict = None, conn=None, verbose: bool = False
):
"""
Test custom metadata and a sql query
This will perform the following steps:
@@ -299,7 +301,7 @@ def test_valid_md_sql(sql: str, md: dict, creds: Dict = None, conn = None, verbo
If provided with the variable `conn`, this reuses the same database connection
to avoid creating a new connection for each query. Otherwise it will connect
via psycopg2 using the credentials provided (note that creds should set db_name)
This will not manage `conn` in any way (eg closing `conn`) - it is left to
This will not manage `conn` in any way (eg closing `conn`) - it is left to
the caller to manage the connection.
Returns tuple of (sql_valid, md_valid, err_message)
"""
@@ -546,7 +548,7 @@ def parse_md(md_str: str) -> Dict[str, List[Dict[str, str]]]:
def get_table_names(md: str) -> List[str]:
"""
Given a string of metadata formatted as a series of
CREATE TABLE statements, return a list of table names in the same order as
CREATE TABLE statements, return a list of table names in the same order as
they appear in the metadata.
"""
table_names = []
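
A small illustrative call for `get_table_names`, with the expected output inferred from the docstring above rather than taken from this PR's tests:

```python
# Minimal sketch: two CREATE TABLE statements in, table names out, in order.
from defog_utils.utils_db import get_table_names

md_str = """CREATE TABLE acct_perf (
  account_id integer
);
CREATE TABLE acct_trx (
  id integer
);"""
print(get_table_names(md_str))  # expected: ['acct_perf', 'acct_trx']
```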
4 changes: 4 additions & 0 deletions defog_utils/utils_sql.py
@@ -429,6 +429,8 @@ def get_sql_features(
features.date_sub = True
elif isinstance(node, exp.DateTrunc) or isinstance(node, exp.TimestampTrunc):
features.date_trunc = True
elif isinstance(node, exp.StrToDate):

Contributor (author): this is the key regression fix after updating sqlglot

Member: Thank you for documenting and for the fix!

features.date_time_type_conversion = True
elif isinstance(node, exp.StrToTime):
features.date_time_type_conversion = True
elif isinstance(node, exp.Extract):
@@ -520,9 +522,11 @@ def is_date_or_time_str(s: str) -> bool:
m = re.match(date_or_time_pattern, s)
return bool(m)


def has_month_name(s: str) -> bool:
return bool(re.search(month_name_pattern, s, re.IGNORECASE))


def has_date_in_name(s: str) -> bool:
return bool(re.search(r"(year|quarter|month|week|day)", s))

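
To make the regression fix above concrete, here is a hedged sketch (not part of the diff) showing that recent sqlglot versions parse a string-to-date conversion into an `exp.StrToDate` node, which the new `elif` branches now map to `date_time_type_conversion`:

```python
# Illustrative only; assumes the sqlglot pinned below (25.x) and its MySQL dialect.
import sqlglot
from sqlglot import exp

ast = sqlglot.parse_one(
    "SELECT STR_TO_DATE('2024-08-05', '%Y-%m-%d')", dialect="mysql"
)
# get_sql_features walks the AST and flips date_time_type_conversion when it
# encounters an exp.StrToDate (or exp.StrToTime) node like this one.
print(ast.find(exp.StrToDate) is not None)  # expected: True
```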
7 changes: 4 additions & 3 deletions requirements.txt
@@ -1,4 +1,5 @@
numpy
psycopg2-binary
sqlglot
sqlparse
psycopg2-binary==2.9.9
sqlglot==25.8.1
sqlglotrs==0.2.8
sqlparse==0.5.1
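
As a quick, illustrative sanity check (not part of the PR), the installed sqlglot can be compared against the pin above:

```python
# With this requirements.txt installed, the printed version should be 25.8.1.
import sqlglot

print(sqlglot.__version__)
```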
199 changes: 177 additions & 22 deletions tests/test_utils_db.py
@@ -52,6 +52,76 @@ def test_mk_create_table_ddl(self):
)
self.assertEqual(mk_create_table_ddl(table_name, columns), expected_output)

def test_mk_create_table_ddl_spaces(self):
table_name = "table1"
columns = [
{
"data_type": "text",
"column_name": "Invoice Number",
"column_description": "Unique identifier for each invoice",
},
{
"data_type": "text",
"column_name": "Invoice Date",
"column_description": "Date when the invoice was issued",
},
{
"data_type": "text",
"column_name": "Sales Order#",
"column_description": "Sales order number associated with the invoice",
},
{
"data_type": "text",
"column_name": "Customer Name",
"column_description": "Name of the customer who made the purchase",
},
{
"data_type": "int",
"column_name": "Total with out GST",
"column_description": "Total amount of the invoice without including GST",
},
{
"data_type": "int",
"column_name": "Total",
"column_description": "Total amount of the invoice including GST",
},
{
"data_type": "text",
"column_name": "Status",
"column_description": "Current status of the invoice",
},
{
"data_type": "text",
"column_name": "Salesperson Name",
"column_description": "Name of the salesperson who handled the sale",
},
{
"data_type": "text",
"column_name": "Account Type",
"column_description": "Type of account associated with the invoice",
},
{
"data_type": "text",
"column_name": "Item Category",
"column_description": "Category of the item purchased",
},
]
expected_output = (
"CREATE TABLE table1 (\n"
' "Invoice Number" text, --Unique identifier for each invoice\n'
' "Invoice Date" text, --Date when the invoice was issued\n'
' "Sales Order#" text, --Sales order number associated with the invoice\n'
' "Customer Name" text, --Name of the customer who made the purchase\n'
' "Total with out GST" integer, --Total amount of the invoice without including GST\n'
" Total integer, --Total amount of the invoice including GST\n"
" Status text, --Current status of the invoice\n"
' "Salesperson Name" text, --Name of the salesperson who handled the sale\n'
' "Account Type" text, --Type of account associated with the invoice\n'
' "Item Category" text --Category of the item purchased\n'
");\n"
)
self.assertEqual(mk_create_table_ddl(table_name, columns), expected_output)


class TestMkCreateDDL(unittest.TestCase):
def test_mk_create_ddl(self):
@@ -398,24 +468,64 @@ def test_parse_md_2(self):
);"""
expected = {
"acct_trx": [
{"column_name": "trx_units", "data_type": "numeric(10,2)", "column_description": ""},
{"column_name": "asset_id", "data_type": "integer", "column_description": ""},
{"column_name": "trx_amount", "data_type": "numeric(10,2)", "column_description": ""},
{"column_name": "details", "data_type": "varchar(500)", "column_description": ""},
{"column_name": "id", "data_type": "integer", "column_description": "Primary key for acct_trx table, joinable with other tables"},
{"column_name": "settle_date", "data_type": "date", "column_description": "Date transaction settled"},
{"column_name": "symbol", "data_type": "varchar(10)", "column_description": ""},
{
"column_name": "trx_units",
"data_type": "numeric(10,2)",
"column_description": "",
},
{
"column_name": "asset_id",
"data_type": "integer",
"column_description": "",
},
{
"column_name": "trx_amount",
"data_type": "numeric(10,2)",
"column_description": "",
},
{
"column_name": "details",
"data_type": "varchar(500)",
"column_description": "",
},
{
"column_name": "id",
"data_type": "integer",
"column_description": "Primary key for acct_trx table, joinable with other tables",
},
{
"column_name": "settle_date",
"data_type": "date",
"column_description": "Date transaction settled",
},
{
"column_name": "symbol",
"data_type": "varchar(10)",
"column_description": "",
},
],
"acct_perf": [
{"column_name": "ytd_return", "data_type": "numeric(5,2)", "column_description": ""},
{"column_name": "acct_snapshot_date", "data_type": "text", "column_description": "format: yyyy-mm-dd"},
{"column_name": "account_id", "data_type": "integer", "column_description": "Primary key, foreign key to cust_acct table"},
{
"column_name": "ytd_return",
"data_type": "numeric(5,2)",
"column_description": "",
},
{
"column_name": "acct_snapshot_date",
"data_type": "text",
"column_description": "format: yyyy-mm-dd",
},
{
"column_name": "account_id",
"data_type": "integer",
"column_description": "Primary key, foreign key to cust_acct table",
},
],
}
md = parse_md(md_str)
print(md)
self.assertDictEqual(md, expected)

def test_parse_md_3(self):
md_str = """CREATE TABLE acct_trx (
trx_units numeric(10, 2),
@@ -433,18 +543,58 @@ def test_parse_md_3(self):
);"""
expected = {
"acct_trx": [
{"column_name": "trx_units", "data_type": "numeric(10, 2)", "column_description": ""},
{"column_name": "asset_id", "data_type": "integer", "column_description": ""},
{"column_name": "trx_amount", "data_type": "numeric(10, 2)", "column_description": ""},
{"column_name": "details", "data_type": "varchar(500)", "column_description": ""},
{"column_name": "id", "data_type": "integer", "column_description": "Primary key for acct_trx table, joinable with other tables"},
{"column_name": "settle_date", "data_type": "date", "column_description": "Date transaction settled"},
{"column_name": "symbol", "data_type": "varchar(10)", "column_description": ""},
{
"column_name": "trx_units",
"data_type": "numeric(10, 2)",
"column_description": "",
},
{
"column_name": "asset_id",
"data_type": "integer",
"column_description": "",
},
{
"column_name": "trx_amount",
"data_type": "numeric(10, 2)",
"column_description": "",
},
{
"column_name": "details",
"data_type": "varchar(500)",
"column_description": "",
},
{
"column_name": "id",
"data_type": "integer",
"column_description": "Primary key for acct_trx table, joinable with other tables",
},
{
"column_name": "settle_date",
"data_type": "date",
"column_description": "Date transaction settled",
},
{
"column_name": "symbol",
"data_type": "varchar(10)",
"column_description": "",
},
],
"acct_perf": [
{"column_name": "ytd_return", "data_type": "numeric(5, 2)", "column_description": ""},
{"column_name": "acct_snapshot_date", "data_type": "text", "column_description": "format: yyyy-mm-dd"},
{"column_name": "account_id", "data_type": "integer", "column_description": "Primary key, foreign key to cust_acct table"},
{
"column_name": "ytd_return",
"data_type": "numeric(5, 2)",
"column_description": "",
},
{
"column_name": "acct_snapshot_date",
"data_type": "text",
"column_description": "format: yyyy-mm-dd",
},
{
"column_name": "account_id",
"data_type": "integer",
"column_description": "Primary key, foreign key to cust_acct table",
},
],
}
md = parse_md(md_str)
@@ -624,7 +774,12 @@ def test_generate_aliases_with_reserved_keywords(self):
self.assertEqual(result, expected_result)

def test_generate_aliases_with_dots_and_underscores(self):
table_names = ["db.schema.table1", "db.schema.table2", "db.schema.table3", "_uncompressed___long_name_"]
table_names = [
"db.schema.table1",
"db.schema.table2",
"db.schema.table3",
"_uncompressed___long_name_",
]
result = generate_aliases(table_names)
print(result)
expected_result = "-- db.schema.table1 AS t1\n-- db.schema.table2 AS t2\n-- db.schema.table3 AS t3\n-- _uncompressed___long_name_ AS uln\n"