apps: Parse the docker store to obtain layer size and disk usage #285

Merged
2 commits merged on Jul 14, 2023
15 changes: 11 additions & 4 deletions apps/apps_publisher.py
@@ -1,6 +1,7 @@
# Copyright (c) 2020 Foundries.io
# SPDX-License-Identifier: Apache-2.0

import json
import logging
from tempfile import NamedTemporaryFile

@@ -15,11 +16,13 @@


class AppsPublisher:
def __init__(self, factory, publish_tool: str, archs: str, registry_host=DockerRegistryClient.DefaultRegistryHost):
def __init__(self, factory, publish_tool: str, archs: str,
registry_host=DockerRegistryClient.DefaultRegistryHost, layers_meta: dict = None):
self._factory = factory
self._publish_tool = publish_tool
self._archs = archs
self._registry_host = registry_host
self._layers_meta = layers_meta

self._image_base_url = '{}/{}'.format(registry_host, self._factory)
self._allowed_tags = ['${TAG}', 'latest']
@@ -83,6 +86,10 @@ def __publish(self, app: ComposeApps.App, tag: str):
app_base_url = self._image_base_url + '/' + app.name
self._app_tagged_url = app_base_url + ':app-' + tag
# TODO: Consider implementation of the "publish tool" in DockerRegistryClient
with NamedTemporaryFile(mode="w+") as f:
cmd_exe(self._publish_tool, '-d', f.name, self._app_tagged_url, self._archs, cwd=app.dir)
return app_base_url + '@' + f.read().strip()
with NamedTemporaryFile(mode="w+") as layers_meta_file:
json.dump(self._layers_meta, layers_meta_file)
layers_meta_file.flush()
with NamedTemporaryFile(mode="w+") as f:
cmd_exe(self._publish_tool, '-d', f.name, '-l', layers_meta_file.name,
self._app_tagged_url, self._archs, cwd=app.dir)
return app_base_url + '@' + f.read().strip()
4 changes: 4 additions & 0 deletions apps/build.sh
@@ -217,3 +217,7 @@ done
[ -d $HOME/.docker/manifests ] && mv $HOME/.docker/manifests /archive/manifests || echo 'no manifests to archive'

PYTHONPATH=${HERE}/.. python3 ${HERE}/generate_non_factory_sboms.py --arch=$ARCH
# 1. Parse the local docker store (the one where the built images are stored).
# 2. Extract layers metadata (size, usage) of all Apps' images
# 3. Store the gathered layers metadata as a CI artifact
PYTHONPATH="${HERE}"/.. python3 "${HERE}"/get_layers_meta.py --apps-root "${REPO_ROOT}" --tag "${TAG}-${ARCH}" --out-file "/archive/layers_meta.json"
142 changes: 142 additions & 0 deletions apps/docker_store.py
@@ -0,0 +1,142 @@
import hashlib
import json
import os.path
import subprocess


class DockerStore:
_REPO_PATH = "image/overlay2/repositories.json"

class Layer:
_DISTRIBUTION_DIGEST_PATH = "image/overlay2/distribution/v2metadata-by-diffid/sha256"
_LAYER_DB_PATH = "image/overlay2/layerdb/sha256"
_LAYER_DATA_BASE_PATH = "overlay2"
_SUPPORTED_HASH_TYPE = "sha256:"

def __init__(self, data_root, layer_diff_id, parent_chain_id):
self._data_root = data_root
self.diff_id = layer_diff_id
self.digest = self._get_layer_archive_digest()
self.chain_id = self._get_chain_id(parent_chain_id)
# Wire/transfer size (in bytes) of unarchived layer
self.size = self._get_layer_size_from_meta()
self.cache_id = self._get_cache_id()
self.data_path = os.path.join(self._data_root, self._LAYER_DATA_BASE_PATH,
self.cache_id)

# Disk usage (in bytes) of unarchived layer taking into account the volume block size
self.usage = self._get_disk_usage()
# Disk usage (in bytes) of unarchived layer along with metadata
# taking into account the volume block size
self.usage_with_meta = self._get_disk_usage_with_metadata()
# Size of the file containing metadata about the layer's TAR stream.
# It's stored in
# `<docker-data-root>/image/overlay2/layerdb/sha256/<chainID>/tar-split.json.gz`
# It can be used to partially "fsck" layer data/files on disk based on file/dir names
# and their sizes stored in the "tar-split" file.
# E.g. `tar-split a --input <path to tar-split.json.gz> \
#   --path <docker-data-root>/overlay2/<cacheID>/diff/ \
#   --output /dev/null`
self.tar_split_size = self._get_tar_split_size()
self.overall_usage = self.usage_with_meta + self.tar_split_size

def _get_layer_archive_digest(self):
digest_file_path = os.path.join(self._data_root, self._DISTRIBUTION_DIGEST_PATH,
self.diff_id[len(self._SUPPORTED_HASH_TYPE):])
if not os.path.exists(digest_file_path):
raise Exception(
f"Image layer diff ID to digest mapping is not found in: {digest_file_path}")

with open(digest_file_path) as f:
digests = json.load(f)
return digests[0]["Digest"]

def _get_chain_id(self, parent_chain_id):
if not parent_chain_id:
return self.diff_id
bytes = parent_chain_id + " " + self.diff_id
return self._SUPPORTED_HASH_TYPE + hashlib.sha256(bytes.encode('utf-8')).hexdigest()

def _get_layer_size_from_meta(self):
size_file = os.path.join(self._data_root, self._LAYER_DB_PATH,
self.chain_id[len(self._SUPPORTED_HASH_TYPE):], "size")
if not os.path.exists(size_file):
raise Exception(f"Layer size file is missing: {size_file}")
with open(size_file) as f:
size_str = f.readline()
return int(size_str)

def _get_cache_id(self):
cache_id_file = os.path.join(self._data_root, self._LAYER_DB_PATH,
self.chain_id[len(self._SUPPORTED_HASH_TYPE):], "cache-id")
if not os.path.exists(cache_id_file):
raise Exception(f"Layer cache-id file is missing: {cache_id_file}")
with open(cache_id_file) as f:
cache_id = f.readline()
return cache_id

def _get_disk_usage_with_metadata(self):
# get disk usage of the layer diff/rootfs along with its metadata
# taking into account a block size
du_output = subprocess.check_output("du -sk " + self.data_path, shell=True)
return int(du_output.decode().split()[0])*1024

def _get_disk_usage(self):
# get disk usage of the layer diff/rootfs taking into account a block size
du_output = subprocess.check_output("du -sk " + self.data_path + "/diff", shell=True)
return int(du_output.decode().split()[0])*1024

def _get_tar_split_size(self):
tar_split_file = os.path.join(self._data_root, self._LAYER_DB_PATH,
self.chain_id[len(self._SUPPORTED_HASH_TYPE):],
"tar-split.json.gz")
if not os.path.exists(tar_split_file):
raise Exception(f"Layer tar_split file is missing: {tar_split_file}")

return os.path.getsize(tar_split_file)

class Image:
_IMAGE_DB_ROOT_PATH = "image/overlay2/imagedb"
_IMAGE_DB_CONTENT_PATH = "content/sha256"
_SUPPORTED_HASH_TYPE = "sha256:"

def __init__(self, data_root, image_conf_hash):
if not image_conf_hash.startswith(self._SUPPORTED_HASH_TYPE):
raise Exception(f"Unsupported image config hash type: {image_conf_hash}")

image_conf_path = os.path.join(data_root,
self._IMAGE_DB_ROOT_PATH, self._IMAGE_DB_CONTENT_PATH,
image_conf_hash[len(self._SUPPORTED_HASH_TYPE):])
if not os.path.exists(image_conf_path):
raise Exception(f"Image config has not been found in: {image_conf_path}")

self.layers = []
self.conf_hash = image_conf_hash
with open(image_conf_path) as f:
image_conf = json.load(f)
cur_chain_id = None
for layer_diff_id in image_conf["rootfs"]["diff_ids"]:
layer = DockerStore.Layer(data_root, layer_diff_id, cur_chain_id)
self.layers.append(layer)
cur_chain_id = layer.chain_id

def __init__(self, data_root="/var/lib/docker"):
self.data_root = data_root
self._cfg_to_image = {}
self.images_by_ref = {}
self._parse_repositories()

def _parse_repositories(self):
repos_file = os.path.join(self.data_root, self._REPO_PATH)
if not os.path.exists(repos_file):
raise Exception(f"No `repositories.json` is found in the docker store: {repos_file}")
with open(repos_file) as f:
repos = json.load(f)
fs_stats = os.fstatvfs(f.fileno())
self.fs_block_size = fs_stats.f_bsize

for image_base_ref, image_refs in repos["Repositories"].items():
for ref, image_conf_hash in image_refs.items():
if image_conf_hash not in self._cfg_to_image:
self._cfg_to_image[image_conf_hash] = self.Image(self.data_root, image_conf_hash)
self.images_by_ref[ref] = self._cfg_to_image[image_conf_hash]
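
A minimal usage sketch of the DockerStore class above, assuming it runs on the build host whose docker data root holds the freshly built images; the image reference is a placeholder:

from apps.docker_store import DockerStore

# Parsing happens in the constructor: repositories.json is read and every
# referenced image config and its layers are resolved.
store = DockerStore("/var/lib/docker")
image = store.images_by_ref.get("hub.foundries.io/factory/app:app-main-amd64")  # placeholder ref
if image:
    for layer in image.layers:
        # size: uncompressed layer size recorded in the layer DB
        # overall_usage: disk usage including layer metadata and the tar-split file
        print(layer.digest, layer.size, layer.overall_usage)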
82 changes: 82 additions & 0 deletions apps/get_layers_meta.py
@@ -0,0 +1,82 @@
import argparse
import json
import os.path

from helpers import status
from apps.docker_store import DockerStore
from apps.compose_apps import ComposeApps


def get_args():
parser = argparse.ArgumentParser("Parse docker store and obtain layers metadata of all Apps' images")
parser.add_argument("-d", "--docker-data-root",
help="Path to the docker data root", default="/var/lib/docker")
parser.add_argument("-a", "--apps-root", help="Path to the compose apps root dir", default="./")
parser.add_argument("-t", "--tag", help="Expected tag of images")
parser.add_argument("-o", "--out-file", help="Json file to ouput the gathered layers metadata to")
a = parser.parse_args()
return a


def print_layer_details(layer: DockerStore.Layer):
title2value = {
"DiffID": layer.diff_id,
"ChainID": layer.chain_id,
"CacheID": layer.cache_id,
"DataPath": layer.data_path,
"Size": layer.size,
"Usage": layer.usage,
"UsageWithMeta": layer.usage_with_meta,
"TarMetaSize": layer.tar_split_size,
"Overall Usage": layer.overall_usage
}
for t, v in title2value.items():
status(f"{' ':<16}{t:<16}{v}", prefix="")


if __name__ == '__main__':
args = get_args()
status(f"Parsing the docker store {args.docker_data_root}...")
docker_store = DockerStore(args.docker_data_root)
status(f"The docker store has been parsed; fs block size is {docker_store.fs_block_size}")

status("Processing metadata about each App layer...")
apps = ComposeApps(args.apps_root, quiet=True)
apps_layers_meta = {
"fs_block_size": docker_store.fs_block_size,
"layers": {}
}
for app in apps:
status(f"Processing App metadata: {app.name}", prefix="=== ")
for img in app.images(expand_env=True):
img_uri = img
if img_uri.startswith("hub.foundries.io"):
img_uri += ":" + args.tag

image = docker_store.images_by_ref.get(img_uri)
if not image:
status("Image metadata are not found in local store; "
f"`SKIP_ARCHS` must be set for the image: {img_uri}", prefix="==== ")
continue

status(f"Image: {img_uri}", prefix="==== ")
for layer in image.layers:
if layer.digest in apps_layers_meta["layers"]:
status(f"Layer has been already processed: {layer.digest}", prefix="\t=====")
continue
status(f"Layer: {layer.digest}", prefix="\t=====")
apps_layers_meta["layers"][layer.digest] = {
"size": layer.size,
"usage": layer.overall_usage
}
print_layer_details(layer)
status("\n", prefix="")
status("Image processing done", prefix="==== ")

status(f"App metadata has been successfully processed: {app.name}\n", prefix="=== ")

status(f"Storing App layers metadata; file: {args.out_file}", prefix="=== ")
with open(args.out_file, "+w") as f:
json.dump(apps_layers_meta, f)

status("Processing metadata about each App layer has been successfully completed")
19 changes: 17 additions & 2 deletions apps/publish.py
@@ -3,11 +3,12 @@
# Copyright (c) 2020 Foundries.io
# SPDX-License-Identifier: Apache-2.0

import json
import logging
import argparse


from helpers import fio_dnsbase, status
from helpers import fio_dnsbase, status, jobserv_get
from apps.target_manager import create_target
from apps.compose_apps import ComposeApps
from apps.apps_publisher import AppsPublisher
@@ -18,6 +19,16 @@
logger = logging.getLogger(__name__)


def get_layers_metadata(factory: str, ver: str, archs: []) -> dict:
layers_meta = {}
project = f"{factory}/lmp" if factory != "lmp" else "lmp"
for a in archs:
run_name = {"amd64": "build-amd64", "arm64": "build-aarch64", "arm": "build-armhf"}[a]
status(f"Downloading layers metadata built by `{run_name}` run", prefix="=== ")
layers_meta[a] = jobserv_get(f"/projects/{project}/builds/{ver}/runs/{run_name}/layers_meta.json")
return layers_meta


def main(factory: str, sha: str, targets_json: str, machines: [], platforms: [], app_root_dir: str,
publish_tool: str, apps_version: str, target_tag: str, target_version: str, new_targets_file: str):
publish_manifest_lists()
@@ -28,9 +39,13 @@ def main(factory: str, sha: str, targets_json: str, machines: [], platforms: [],

status('Compose Apps has been validated: {}'.format(apps.str))

status('Downloading Apps\' layers metadata...')
layers_meta = get_layers_metadata(factory, target_version, platforms)
status('Apps\' layers metadata have been downloaded')

reg_host = "hub." + fio_dnsbase()
archs = ','.join(platforms) if platforms else ''
apps_to_add_to_target = AppsPublisher(factory, publish_tool, archs, reg_host).publish(apps, apps_version)
apps_to_add_to_target = AppsPublisher(factory, publish_tool, archs, reg_host, layers_meta).publish(apps, apps_version)

status('Creating Targets that refer to the published Apps; tag: {}, version: {}, machines: {}, platforms: {} '
.format(target_tag, target_version, ','.join(machines) if machines else '[]',
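
The dict returned by get_layers_metadata() and passed to AppsPublisher wraps one layers_meta.json document per architecture, keyed by platform name. A hypothetical sketch of its shape, assuming jobserv_get returns the parsed JSON document; digests and sizes are placeholders:

layers_meta = {
    "amd64": {                        # fetched from the build-amd64 run
        "fs_block_size": 4096,
        "layers": {
            "sha256:<layer-digest>": {"size": 31502592, "usage": 31891456},
        },
    },
    "arm64": {                        # fetched from the build-aarch64 run
        "fs_block_size": 4096,
        "layers": {},
    },
}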