Skip to content

Commit

Permalink
Fix bench and update pandas and duckdb
Browse files Browse the repository at this point in the history
  • Loading branch information
auxten committed Jun 24, 2024
1 parent 3f39349 commit a663a35
Show file tree
Hide file tree
Showing 2 changed files with 282 additions and 200 deletions.
14 changes: 12 additions & 2 deletions tests/arrow_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import time
import chdb
import chdb.dataframe as cdf
import chdb.session as chs
import pandas as pd
import numpy as np
import pyarrow as pa
Expand Down Expand Up @@ -37,6 +38,8 @@

sql = "SELECT COUNT(DISTINCT UserID) FROM hits;"

# sql = "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;"

t = time.time()
# read parquet file into memory
with open(hits_0, "rb") as f:
Expand Down Expand Up @@ -65,6 +68,8 @@
# # hits[col] = hits[col].astype('string')
# hits[col] = hits[col].astype(str)

hits["Referer"] = hits["Referer"].astype(str)

# title = hits["Title"]
# title.values.data

Expand Down Expand Up @@ -216,17 +221,22 @@ def read(self, col_names, count):

reader = myReader(df_old)

sess = chs.Session()
# sess.query("set aggregation_memory_efficient_merge_threads=2;")

sql = sql.replace("STRLEN", "length")

def bench_chdb(i):
    """Run the benchmark query once through the module-level chDB session.

    Every occurrence of the substring ``hits`` in the module-level ``sql``
    text is replaced with ``Python(hits)``, and the statement is prefixed
    with a ``set aggregation_memory_efficient_merge_threads=3;`` statement
    before being handed to ``sess.query``.

    Args:
        i: Iteration index. Iteration 0 requests the "Debug" output format;
            every other iteration requests "DataFrame".

    Returns:
        Whatever ``sess.query`` returns for the requested output format.
    """
    # Named ``output_format`` so the ``format`` builtin is not shadowed.
    output_format = "Debug" if i == 0 else "DataFrame"

    # Alternative queries kept for manual experiments:
    # """ SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID)
    # FROM Python(reader) GROUP BY RegionID ORDER BY c DESC LIMIT 10""",
    # "SELECT COUNT(DISTINCT Title) FROM Python(reader);",
    ret = sess.query(
        "set aggregation_memory_efficient_merge_threads=3;"
        + sql.replace("hits", "Python(hits)"),
        output_format,
    )
    return ret
Expand Down
Loading

0 comments on commit a663a35

Please sign in to comment.