Skip to content

Commit fcc1941

Browse files
committed
survey: add report of "largest" paths
Since we are already walking our reachable objects using the path-walk API, let's now collect lists of the paths that contribute most to different metrics. Specifically, we care about * Number of versions. * Total size on disk. * Total inflated size (no delta or zlib compression). This information can be critical to discovering which parts of the repository are causing the most growth, especially on-disk size. Different packing strategies might help compress data more efficiently, but the total inflated size is a representation of the raw size of all snapshots of those paths. Even when stored efficiently on disk, that size represents how much information must be processed to complete a command such as 'git blame'. Since the on-disk size is likely to be fragile, stop testing the exact output of 'git survey' and check that the correct set of headers is output. Signed-off-by: Derrick Stolee <[email protected]>
1 parent e2ed70f commit fcc1941

File tree

2 files changed

+82
-9
lines changed

2 files changed

+82
-9
lines changed

builtin/survey.c

+71-8
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,6 @@ struct survey_report_object_size_summary {
7979

8080
typedef int (*survey_top_cmp)(void *v1, void *v2);
8181

82-
MAYBE_UNUSED
8382
static int cmp_by_nr(void *v1, void *v2)
8483
{
8584
struct survey_report_object_size_summary *s1 = v1;
@@ -92,7 +91,6 @@ static int cmp_by_nr(void *v1, void *v2)
9291
return 0;
9392
}
9493

95-
MAYBE_UNUSED
9694
static int cmp_by_disk_size(void *v1, void *v2)
9795
{
9896
struct survey_report_object_size_summary *s1 = v1;
@@ -105,7 +103,6 @@ static int cmp_by_disk_size(void *v1, void *v2)
105103
return 0;
106104
}
107105

108-
MAYBE_UNUSED
109106
static int cmp_by_inflated_size(void *v1, void *v2)
110107
{
111108
struct survey_report_object_size_summary *s1 = v1;
@@ -136,7 +133,6 @@ struct survey_report_top_table {
136133
void *data;
137134
};
138135

139-
MAYBE_UNUSED
140136
static void init_top_sizes(struct survey_report_top_table *top,
141137
size_t limit, const char *name,
142138
survey_top_cmp cmp)
@@ -162,7 +158,6 @@ static void clear_top_sizes(struct survey_report_top_table *top)
162158
free(sz_array);
163159
}
164160

165-
MAYBE_UNUSED
166161
static void maybe_insert_into_top_size(struct survey_report_top_table *top,
167162
struct survey_report_object_size_summary *summary)
168163
{
@@ -199,6 +194,10 @@ struct survey_report {
199194
struct survey_report_object_summary reachable_objects;
200195

201196
struct survey_report_object_size_summary *by_type;
197+
198+
struct survey_report_top_table *top_paths_by_count;
199+
struct survey_report_top_table *top_paths_by_disk;
200+
struct survey_report_top_table *top_paths_by_inflate;
202201
};
203202

204203
#define REPORT_TYPE_COMMIT 0
@@ -450,6 +449,13 @@ static void survey_report_object_sizes(const char *title,
450449
clear_table(&table);
451450
}
452451

452+
static void survey_report_plaintext_sorted_size(
453+
struct survey_report_top_table *top)
454+
{
455+
survey_report_object_sizes(top->name, _("Path"),
456+
top->data, top->nr);
457+
}
458+
453459
static void survey_report_plaintext(struct survey_context *ctx)
454460
{
455461
printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
@@ -460,6 +466,21 @@ static void survey_report_plaintext(struct survey_context *ctx)
460466
_("Object Type"),
461467
ctx->report.by_type,
462468
REPORT_TYPE_COUNT);
469+
470+
survey_report_plaintext_sorted_size(
471+
&ctx->report.top_paths_by_count[REPORT_TYPE_TREE]);
472+
survey_report_plaintext_sorted_size(
473+
&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]);
474+
475+
survey_report_plaintext_sorted_size(
476+
&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]);
477+
survey_report_plaintext_sorted_size(
478+
&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]);
479+
480+
survey_report_plaintext_sorted_size(
481+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
482+
survey_report_plaintext_sorted_size(
483+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
463484
}
464485

465486
/*
@@ -700,7 +721,8 @@ static void increment_totals(struct survey_context *ctx,
700721

701722
static void increment_object_totals(struct survey_context *ctx,
702723
struct oid_array *oids,
703-
enum object_type type)
724+
enum object_type type,
725+
const char *path)
704726
{
705727
struct survey_report_object_size_summary *total;
706728
struct survey_report_object_size_summary summary = { 0 };
@@ -732,9 +754,30 @@ static void increment_object_totals(struct survey_context *ctx,
732754
total->disk_size += summary.disk_size;
733755
total->inflated_size += summary.inflated_size;
734756
total->num_missing += summary.num_missing;
757+
758+
if (type == OBJ_TREE || type == OBJ_BLOB) {
759+
int index = type == OBJ_TREE ?
760+
REPORT_TYPE_TREE : REPORT_TYPE_BLOB;
761+
struct survey_report_top_table *top;
762+
763+
/*
764+
* Temporarily store (const char *) here, but it will
765+
* be duped if inserted and will not be freed.
766+
*/
767+
summary.label = (char *)path;
768+
769+
top = ctx->report.top_paths_by_count;
770+
maybe_insert_into_top_size(&top[index], &summary);
771+
772+
top = ctx->report.top_paths_by_disk;
773+
maybe_insert_into_top_size(&top[index], &summary);
774+
775+
top = ctx->report.top_paths_by_inflate;
776+
maybe_insert_into_top_size(&top[index], &summary);
777+
}
735778
}
736779

737-
static int survey_objects_path_walk_fn(const char *path UNUSED,
780+
static int survey_objects_path_walk_fn(const char *path,
738781
struct oid_array *oids,
739782
enum object_type type,
740783
void *data)
@@ -743,7 +786,7 @@ static int survey_objects_path_walk_fn(const char *path UNUSED,
743786

744787
increment_object_counts(&ctx->report.reachable_objects,
745788
type, oids->nr);
746-
increment_object_totals(ctx, oids, type);
789+
increment_object_totals(ctx, oids, type, path);
747790

748791
ctx->progress_nr += oids->nr;
749792
display_progress(ctx->progress, ctx->progress_nr);
@@ -753,11 +796,31 @@ static int survey_objects_path_walk_fn(const char *path UNUSED,
753796

754797
static void initialize_report(struct survey_context *ctx)
755798
{
799+
const int top_limit = 100;
800+
756801
CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
757802
ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
758803
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
759804
ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
760805
ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags"));
806+
807+
CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT);
808+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE],
809+
top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr);
810+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB],
811+
top_limit, _("TOP FILES BY COUNT"), cmp_by_nr);
812+
813+
CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT);
814+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE],
815+
top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size);
816+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB],
817+
top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size);
818+
819+
CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT);
820+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE],
821+
top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size);
822+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB],
823+
top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size);
761824
}
762825

763826
static void survey_phase_objects(struct survey_context *ctx)

t/t8100-git-survey.sh

+11-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,17 @@ test_expect_success 'git survey (default)' '
8686
Tags | 4 | $(test_oid tags_size_on_disk) | $(test_oid tags_size)
8787
EOF
8888
89-
test_cmp expect out
89+
lines=$(wc -l <expect) &&
90+
head -n $lines out >out-trimmed &&
91+
test_cmp expect out-trimmed &&
92+
93+
for type in "DIRECTORIES" "FILES"
94+
do
95+
for metric in "COUNT" "DISK SIZE" "INFLATED SIZE"
96+
do
97+
grep "TOP $type BY $metric" out || return 1
98+
done || return 1
99+
done
90100
'
91101

92102
test_done

0 commit comments

Comments
 (0)