Skip to content

Commit

Permalink
change default shard size to 1GB (#357)
Browse files Browse the repository at this point in the history
* change default shard size to 1 GB

The value was based on the benchmarks in b2ddcc0.

The exact speed improvement depends on the size of the models being
serialized and of the individual files in the model. Speed improvements
ranged from a 5% improvement to an 87% improvement.

Manifest size is also influenced by shard size (and the model /
individual file size). Increasing shard size from ~1MB to 1GB decreases
manifest size by 3 orders of magnitude (99.9% reduction).

Signed-off-by: Spencer Schrock <[email protected]>

* update shard test goldens

Signed-off-by: Spencer Schrock <[email protected]>

* use gigabyte instead of gibibyte

They're roughly equal in performance; any differences vary either from
model to model or from run to run. However, multiples of 1000 are slightly
easier for humans to visualize in things like shard names.

Signed-off-by: Spencer Schrock <[email protected]>

---------

Signed-off-by: Spencer Schrock <[email protected]>
  • Loading branch information
spencerschrock authored Feb 27, 2025
1 parent b2ddcc0 commit 4a610f7
Show file tree
Hide file tree
Showing 12 changed files with 45 additions and 45 deletions.
6 changes: 3 additions & 3 deletions benchmarks/exp_shard.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
from model_signing.signing import in_toto


KB: Final[int] = 1024
MB: Final[int] = 1024 * KB
GB: Final[int] = 1024 * MB
KB: Final[int] = 1000
MB: Final[int] = 1000 * KB
GB: Final[int] = 1000 * MB


def build_parser() -> argparse.ArgumentParser:
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,9 +198,9 @@ def build_parser() -> argparse.ArgumentParser:
)
param_groups.add_argument(
"--shard",
help="shard size (default: 1000000)",
help="shard size (default: 1000000000)",
type=int,
default=1000000,
default=1_000_000_000,
)

shard_group = parser.add_argument_group("Serialization modes")
Expand Down
12 changes: 6 additions & 6 deletions src/model_signing/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def _build_sharded_file_hasher_factory(
self,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 1048576,
shard_size: int = 1000000,
shard_size: int = 1_000_000_000,
) -> Callable[[pathlib.Path, int, int], file.ShardedFileHasher]:
"""Builds the hasher factory for a serialization by file shards.
Expand All @@ -194,7 +194,7 @@ def _build_sharded_file_hasher_factory(
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
shard_size: The size of a file shard. Default is 1 GB.
Returns:
The hasher factory that should be used by the active serialization
Expand Down Expand Up @@ -299,7 +299,7 @@ def set_serialize_by_file_shard_to_manifest(
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 1048576,
shard_size: int = 1000000,
shard_size: int = 1_000_000_000,
max_workers: Optional[int] = None,
allow_symlinks: bool = False,
) -> Self:
Expand All @@ -315,7 +315,7 @@ def set_serialize_by_file_shard_to_manifest(
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
shard_size: The size of a file shard. Default is 1 GB.
max_workers: Maximum number of workers to use in parallel. Default
is to defer to the `concurrent.futures` library.
allow_symlinks: Controls whether symbolic links are included. If a
Expand All @@ -340,7 +340,7 @@ def set_serialize_by_file_shard_to_digest(
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
merge_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 1048576,
shard_size: int = 1000000,
shard_size: int = 1_000_000_000,
max_workers: Optional[int] = None,
allow_symlinks: bool = False,
) -> Self:
Expand All @@ -357,7 +357,7 @@ def set_serialize_by_file_shard_to_digest(
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
shard_size: The size of a file shard. Default is 1 GB.
max_workers: Maximum number of workers to use in parallel. Default
is to defer to the `concurrent.futures` library.
allow_symlinks: Controls whether symbolic links are included. If a
Expand Down
4 changes: 2 additions & 2 deletions src/model_signing/hashing/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def __init__(
start: int,
end: int,
chunk_size: int = 1048576,
shard_size: int = 1000000,
shard_size: int = 1_000_000_000,
digest_name_override: Optional[str] = None,
):
"""Initializes an instance to hash a file with a specific `HashEngine`.
Expand All @@ -253,7 +253,7 @@ def __init__(
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
shard_size: The size of a file shard. Default is 1 GB.
digest_name_override: Optional string to allow overriding the
`digest_name` property to support shorter, standardized names.
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,22 @@
"shards": [
{
"digest": "6efa14bb03544fcb76045c55f25b9315b6eb5be2d8a85f703193a76b7874c6ff",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "d0/d1/d2/d3/d4/f0:0:16"
},
{
"digest": "a9bc149b70b9d325cd68d275d582cfdb98c0347d3ce54590aa6533368daed3d2",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "d0/d1/d2/d3/d4/f1:0:16"
},
{
"digest": "5f597e6a92d1324d9adbed43d527926d11d0131487baf315e65ae1ef3b1ca3c0",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "d0/d1/d2/d3/d4/f2:0:16"
},
{
"digest": "eaf677c35fec6b87889d9e4563d8bb65dcb9869ca0225697c9cc44cf49dca008",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "d0/d1/d2/d3/d4/f3:0:16"
}
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"shards": [
{
"digest": "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": ".:0:22"
}
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,52 +13,52 @@
"shards": [
{
"digest": "fdd8925354242a7fd1515e79534317b800015607a609cd306e0b4dcfe6c92249",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "d0/f00:0:23"
},
{
"digest": "e16940b5e44ce981150bda37c4ba95881a749a521b4a297c5cdf97bdcfe965e6",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "d0/f01:0:23"
},
{
"digest": "407822246ea8f9e26380842c3f4cd10d7b23e78f1fe7c74c293608682886a426",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "d0/f02:0:23"
},
{
"digest": "6a3b08b5df77c4d418ceee1ac136a9ad49fc7c41358b5e82c1176daccb21ff3f",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "d1/f10:0:23"
},
{
"digest": "a484b3d8ea5e99b75f9f123f9a42c882388693edc7d85d82ccba54834712cadf",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "d1/f11:0:23"
},
{
"digest": "8f577930f5f40c2c2133cb299d36f9527fde98c1608569017cae6b5bcd01abb3",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "d1/f12:0:23"
},
{
"digest": "997b37cc51f1ca1c7a270466607e26847429cd7264c30148c1b9352e224083fc",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "f0:0:24"
},
{
"digest": "c88a04d48353133fb065ba2c8ab369abab21395b9526aa20373ad828915fa7ae",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "f1:0:24"
},
{
"digest": "700e3ba5065d8dd47e41fd928ea086670d628f891ba363be0ca3c31d20d7d719",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "f2:0:24"
},
{
"digest": "912bcf5ebdf44dc7b4085b07940e0a81d157fba24b276e73fd911121d4544c4a",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "f3:0:24"
}
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"shards": [
{
"digest": "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b",
"algorithm": "file-sha256-1000000",
"algorithm": "file-sha256-1000000000",
"name": "symlink_file:0:22"
}
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"sha256": "6efa14bb03544fcb76045c55f25b9315b6eb5be2d8a85f703193a76b7874c6ff"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
},
{
Expand All @@ -16,7 +16,7 @@
"sha256": "a9bc149b70b9d325cd68d275d582cfdb98c0347d3ce54590aa6533368daed3d2"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
},
{
Expand All @@ -25,7 +25,7 @@
"sha256": "5f597e6a92d1324d9adbed43d527926d11d0131487baf315e65ae1ef3b1ca3c0"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
},
{
Expand All @@ -34,7 +34,7 @@
"sha256": "eaf677c35fec6b87889d9e4563d8bb65dcb9869ca0225697c9cc44cf49dca008"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
}
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"sha256": "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
}
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"sha256": "fdd8925354242a7fd1515e79534317b800015607a609cd306e0b4dcfe6c92249"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
},
{
Expand All @@ -16,7 +16,7 @@
"sha256": "e16940b5e44ce981150bda37c4ba95881a749a521b4a297c5cdf97bdcfe965e6"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
},
{
Expand All @@ -25,7 +25,7 @@
"sha256": "407822246ea8f9e26380842c3f4cd10d7b23e78f1fe7c74c293608682886a426"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
},
{
Expand All @@ -34,7 +34,7 @@
"sha256": "6a3b08b5df77c4d418ceee1ac136a9ad49fc7c41358b5e82c1176daccb21ff3f"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
},
{
Expand All @@ -43,7 +43,7 @@
"sha256": "a484b3d8ea5e99b75f9f123f9a42c882388693edc7d85d82ccba54834712cadf"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
},
{
Expand All @@ -52,7 +52,7 @@
"sha256": "8f577930f5f40c2c2133cb299d36f9527fde98c1608569017cae6b5bcd01abb3"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
},
{
Expand All @@ -61,7 +61,7 @@
"sha256": "997b37cc51f1ca1c7a270466607e26847429cd7264c30148c1b9352e224083fc"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
},
{
Expand All @@ -70,7 +70,7 @@
"sha256": "c88a04d48353133fb065ba2c8ab369abab21395b9526aa20373ad828915fa7ae"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
},
{
Expand All @@ -79,7 +79,7 @@
"sha256": "700e3ba5065d8dd47e41fd928ea086670d628f891ba363be0ca3c31d20d7d719"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
},
{
Expand All @@ -88,7 +88,7 @@
"sha256": "912bcf5ebdf44dc7b4085b07940e0a81d157fba24b276e73fd911121d4544c4a"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
}
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"sha256": "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b"
},
"annotations": {
"actual_hash_algorithm": "file-sha256-1000000"
"actual_hash_algorithm": "file-sha256-1000000000"
}
}
],
Expand Down

0 comments on commit 4a610f7

Please sign in to comment.