-
I'm working on a data analysis project in Python that uses Alibaba Cloud OSS. I'm looking for a universal method to read/write parquet files and have been experimenting with OpenDAL and fsspec/ossfs. During my tests, I found that OpenDAL is about 4-5 times slower than ossfs. While I really appreciate OpenDAL's vision, this performance gap is a significant obstacle to adoption for my use case. Here's the code I used for testing:
import opendal
import polars as pl
import ossfs
import time
from io import BytesIO
import pyarrow.dataset as ds
# Connection settings shared by both storage wrappers in this script.
# The values are placeholders; substitute real OSS settings before running.
oss_config = dict(
    endpoint="my-endpoint",
    key="my-key",
    secret="my-secret",
    bucket="my-bucket",
)
class myOperator:
    """Read and write parquet files on OSS through an ``opendal.Operator``.

    The operator is configured from the module-level ``oss_config`` dict.
    """

    def __init__(self) -> None:
        self.op = opendal.Operator(
            "oss",
            bucket=oss_config["bucket"],
            endpoint=oss_config["endpoint"],
            access_key_id=oss_config["key"],
            access_key_secret=oss_config["secret"],
        )

    def write(
        self,
        df: pl.DataFrame,
        file_name: str,
    ) -> None:
        """Encode *df* as parquet and store it under *file_name*.

        Args:
            df: Frame to serialize.
            file_name: Object path passed to ``Operator.open``.
        """
        with self.op.open(file_name, mode="wb") as file:
            # Encode into memory and push the bytes with a single write call,
            # the same strategy myOSSFileSystem.write uses, so the two timings
            # compare like with like. Handing the opendal file straight to
            # polars made polars warn about receiving a Python file object.
            buffer = BytesIO()
            df.write_parquet(buffer)
            file.write(buffer.getvalue())

    def read(
        self,
        file_name: str,
    ) -> None:
        """Load the parquet file at *file_name* and print its first rows.

        Args:
            file_name: Object path passed to ``Operator.open``.
        """
        with self.op.open(file_name, mode="rb") as file:
            # pl.read_parquet(file) failed with
            # "TypeError: 'opendal.File' object is not iterable", so pull the
            # whole object into memory and give polars a BytesIO instead.
            payload = file.read()
        read_df = pl.read_parquet(BytesIO(payload))
        print(f"read_df: {read_df.head(5)}")
class myOSSFileSystem:
    """Read and write parquet files on OSS through ``ossfs.OSSFileSystem``.

    The filesystem is configured from the module-level ``oss_config`` dict.
    """

    def __init__(self) -> None:
        settings = {
            name: oss_config[name]
            for name in ("bucket", "endpoint", "key", "secret")
        }
        self.fs = ossfs.OSSFileSystem(**settings)

    def _bucket_path(self, file_name: str) -> str:
        """Return *file_name* prefixed with the configured bucket name."""
        return f"{oss_config['bucket']}/{file_name}"

    def write(
        self,
        df: pl.DataFrame,
        file_name: str,
    ) -> None:
        """Encode *df* as parquet in memory, then write the bytes to *file_name*."""
        target = self._bucket_path(file_name)
        with self.fs.open(target, mode="wb") as sink:
            staging = BytesIO()
            df.write_parquet(staging)
            sink.write(staging.getvalue())

    def read(
        self,
        file_name: str,
    ) -> None:
        """Load the parquet file at *file_name* and print its last rows."""
        target = self._bucket_path(file_name)
        with self.fs.open(target, mode="rb") as source:
            read_df = pl.read_parquet(source)
            print(f"read_df: {read_df.tail(5)}")

    def read_ds(
        self,
        file_name: str,
    ) -> None:
        """Load *file_name* through a pyarrow dataset; the result is discarded."""
        parquet_dataset = ds.dataset(
            source=self._bucket_path(file_name),
            format="parquet",
            filesystem=self.fs,
        )
        lazy_frame = pl.scan_pyarrow_dataset(parquet_dataset)
        lazy_frame.collect()
# Small sample frame written by both backends.
df = pl.DataFrame(
    {
        "a": [1, 2, 3, 4, 5],
        "b": [5, 4, 3, 2, 1],
    }
)
# compare time cost of writing with opendal and ossfs
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations, whereas
# time.time() can jump when the system clock is adjusted.
# Each measurement deliberately includes construction of the client.
start = time.perf_counter()
op = myOperator()
op.write(df, "file_op.parquet")
print(f"opendal write cost: {time.perf_counter() - start}")
start2 = time.perf_counter()
# 🏆
fs = myOSSFileSystem()
fs.write(df, "file_fs.parquet")
print(f"ossfs write cost: {time.perf_counter() - start2}")
# # compare time cost of reading with opendal and ossfs
# start = time.time()
# op = myOperator()
# op.read("file_op.parquet")
# print(f"opendal read cost: {time.time() - start}")
# start2 = time.time()
# fs = myOSSFileSystem()
# fs.read("file_fs.parquet")
# print(f"ossfs read cost: {time.time() - start2}")
# start3 = time.time()
# # 🏆
# fs2 = myOSSFileSystem()
# fs2.read_ds("file_fs.parquet")
# print(f"ossfs-ds read cost: {time.time() - start3}")
The output:
Has anyone else encountered similar performance issues? Are there any optimizations or best practices I might be missing? I'd love to use OpenDAL, but the current speed difference is too significant for my project's needs.
Beta Was this translation helpful? Give feedback.
Replies: 2 comments 2 replies
-
Hi, thanks a lot for raising this question first. I'm surprised that opendal python is four times slower. Does ossfs have a cache feature where it writes to the local filesystem first and then uploads to OSS? I noticed that opendal uses […]. I will attempt to reproduce this test and determine what went wrong.
Beta Was this translation helpful? Give feedback.
-
Issue confirmed, I will convert this into an issue: #4901 |
Beta Was this translation helpful? Give feedback.
Issue confirmed, I will convert this into an issue: #4901