Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

serialize artifacts with pickle protocol 4 if possible #2243

Merged
Merged 4 commits on Feb 7, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 11 additions & 32 deletions metaflow/datastore/task_datastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def init_task(self):

@only_if_not_done
@require_mode("w")
def save_artifacts(self, artifacts_iter, force_v4=False, len_hint=0):
def save_artifacts(self, artifacts_iter, len_hint=0):
"""
Saves Metaflow Artifacts (Python objects) to the datastore and stores
any relevant metadata needed to retrieve them.
Expand All @@ -269,52 +269,31 @@ def save_artifacts(self, artifacts_iter, force_v4=False, len_hint=0):
artifacts : Iterator[(string, object)]
Iterator over the human-readable name of the object to save
and the object itself
force_v4 : boolean or Dict[string -> boolean]
Indicates whether the artifact should be pickled using the v4
version of pickle. If a single boolean, applies to all artifacts.
If a dictionary, applies to the object named only. Defaults to False
if not present or not specified
len_hint: integer
Estimated number of items in artifacts_iter
"""
artifact_names = []

def pickle_iter():
for name, obj in artifacts_iter:
do_v4 = (
force_v4 and force_v4
if isinstance(force_v4, bool)
else force_v4.get(name, False)
)
if do_v4:
encode_type = "gzip+pickle-v4"
if encode_type not in self._encodings:
raise DataException(
"Artifact *%s* requires a serialization encoding that "
"requires Python 3.4 or newer." % name
)
encode_type = "gzip+pickle-v4"
if encode_type in self._encodings:
try:
blob = pickle.dumps(obj, protocol=4)
except TypeError as e:
raise UnpicklableArtifactException(name)
raise UnpicklableArtifactException(name) from e
else:
try:
blob = pickle.dumps(obj, protocol=2)
encode_type = "gzip+pickle-v2"
except (SystemError, OverflowError):
encode_type = "gzip+pickle-v4"
if encode_type not in self._encodings:
raise DataException(
"Artifact *%s* is very large (over 2GB). "
"You need to use Python 3.4 or newer if you want to "
"serialize large objects." % name
)
try:
blob = pickle.dumps(obj, protocol=4)
except TypeError as e:
raise UnpicklableArtifactException(name)
except (SystemError, OverflowError) as e:
raise DataException(
"Artifact *%s* is very large (over 2GB). "
"You need to use Python 3.4 or newer if you want to "
"serialize large objects." % name
) from e
except TypeError as e:
raise UnpicklableArtifactException(name)
raise UnpicklableArtifactException(name) from e

self._info[name] = {
"size": len(blob),
Expand Down
Loading