-
Notifications
You must be signed in to change notification settings - Fork 0
/
plot.py
208 lines (183 loc) · 11.9 KB
/
plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import matplotlib.pyplot as plt
import numpy as np
import logging
import seaborn as sns
import colorcet as cc
import pandas as pd
import helper as hlp
def plot_stack_area_chart(values, execution_id, path, ylabel, legend_labels, tick_labels, legend, title=''):
    """
    Draws a stacked area chart (one layer per entity) and saves it as a png file.
    :param values: the data to be plotted. numpy array of shape (number of total entities, number of time steps)
    :param execution_id: base name (without extension) of the output file. '_with_legend' is appended to it when
    legend is truthy
    :param path: the directory to save the figure to (it is joined with the file name using the / operator)
    :param ylabel: the label of the y axis
    :param legend_labels: iterable with one label per entity. Labels that start with '_' are not counted as visible
    legend entries
    :param tick_labels: one x tick label per time step
    :param legend: if truthy, a legend is added to the figure (as long as there is at least one visible label)
    :param title: optional title for the plot
    """
    fig = plt.figure(figsize=(6, 4))
    num_entities = values.shape[0]
    num_time_steps = values.shape[1]
    col = sns.color_palette(cc.glasbey, n_colors=num_entities)
    plt.stackplot(range(num_time_steps), values, colors=col, edgecolor='face', linewidth=0.0001, labels=legend_labels)
    plt.title(title)
    plt.margins(0)
    plt.xlabel("Date")
    plt.ylabel(ylabel)
    plt.xticks(ticks=range(num_time_steps), labels=tick_labels, rotation=45)
    _, x_labels = plt.xticks()
    for i, label in enumerate(x_labels):
        if i % 10 != 0:  # only keep every 10th xtick label
            label.set_visible(False)

    if legend:
        execution_id += '_with_legend'
        visible_legend_labels = [label for label in legend_labels if not label.startswith('_')]
        if visible_legend_labels:
            max_labels_per_column = 25
            # ceiling division, so that no column ever holds more than max_labels_per_column labels
            ncols = (len(visible_legend_labels) + max_labels_per_column - 1) // max_labels_per_column
            fig.legend(loc='upper right', bbox_to_anchor=(0.9, -0.1), ncol=ncols, fancybox=True, borderpad=0.2,
                       labelspacing=0.3, handlelength=1)

    filename = execution_id + ".png"
    plt.savefig(path / filename, bbox_inches='tight')
    plt.close("all")
def plot_contribution_distribution(ledger_repos, data_dir, figures_dir, contribution_type, top_k=-1, unit='relative',
                                   legend=False):
    """
    Plots the dynamics for each repository in terms of contributions per entity. A stacked area chart is produced
    when a repository has more than one sample window with contributions and a doughnut chart when it has exactly
    one. Repositories without any contribution are skipped (a warning is logged).
    :param ledger_repos: dictionary that contains the repositories for each ledger
    :param data_dir: directory that holds one "<repo>_contributions_per_entity.csv" file per repository
    :param figures_dir: directory where the generated figures are saved
    :param contribution_type: name of the contribution type; only used in axis labels and titles
    :param top_k: if > 0, then only the evolution of the top k contributors will be shown in the graph. Otherwise,
    all contributors will be plotted. Values larger than the number of contributors are treated as "all".
    :param unit: string that specifies whether the plots to be generated will be in absolute or relative values (i.e.
    number of contributions or share of contributions). It can be one of: absolute, relative
    :param legend: whether to add a legend to the stacked area charts
    """
    for repos in ledger_repos.values():
        for repo in repos:
            filename = f"{repo}_contributions_per_entity.csv"
            sample_windows, contributions_per_entity = hlp.get_contributions_per_entity_from_file(
                filepath=data_dir / filename)

            total_contributions_per_sample_window = [0] * len(sample_windows)
            for contribution_values in contributions_per_entity.values():
                for sample_window_idx, ncontributions in contribution_values.items():
                    total_contributions_per_sample_window[sample_window_idx] += ncontributions
            total_contributions_per_sample_window = np.array(total_contributions_per_sample_window)

            # only keep time chunks with at least one contribution
            nonzero_idx = total_contributions_per_sample_window.nonzero()[0]
            if nonzero_idx.size == 0:
                # nothing to plot; the array operations below are not defined for empty data
                logging.warning(f'No {contribution_type} found for {repo}, skipping its plots')
                continue
            total_contributions_per_sample_window = total_contributions_per_sample_window[nonzero_idx]
            sample_windows = [sample_windows[i] for i in nonzero_idx]

            # one row per entity, one column per retained sample window (0 where the entity has no entry)
            contributions_array = np.array([
                [contribution_values.get(sample_window_idx, 0) for sample_window_idx in nonzero_idx]
                for contribution_values in contributions_per_entity.values()
            ])

            if unit == 'relative':
                values = contributions_array / total_contributions_per_sample_window * 100
                ylabel = f'Share of {contribution_type} (%)'
            else:
                values = contributions_array
                ylabel = f'Number of {contribution_type}'

            # a list (rather than a dict view) so that the labels can be indexed below
            labels = list(contributions_per_entity.keys())

            if top_k > 0:  # only keep the top k contributors (i.e. the ones with the largest total over all windows)
                k = min(top_k, len(labels))  # argpartition fails if asked for more entries than exist
                total_value_per_contributor = values.sum(axis=1)
                top_k_idx = total_value_per_contributor.argpartition(-k)[-k:]
                values = values[top_k_idx]
                labels = [labels[i] for i in top_k_idx]

            if values.shape[1] > 1:  # only plot stack area chart if there is more than one time step
                plot_stack_area_chart(
                    values=values,
                    execution_id=f'{repo}_{unit}_values_top_{top_k}' if top_k > 0 else f'{repo}_{unit}_values_all',
                    path=figures_dir, ylabel=ylabel, legend_labels=labels, tick_labels=sample_windows,
                    legend=legend, title=f'{repo.title()} - {contribution_type} distribution over time')
            else:
                # if there is only one time step, plot a doughnut chart
                data_dict = {label: value[0] for label, value in zip(labels, values)}
                plot_doughnut_chart(data_dict, filepath=figures_dir / f'{repo}_doughnut_chart.png',
                                    title=f'{repo.title()} - All time {contribution_type} distribution')
def plot_comparative_metrics(ledger_repos, metrics, file, figures_dir):
    """
    Plots, for every metric, one line per repository over time and saves each plot as "<metric>.png".
    :param ledger_repos: dictionary that contains the repositories for each ledger
    :param metrics: iterable with the names of the metrics to plot; each name must be a column of the csv file
    :param file: csv file with a 'date' column (used as the index), a 'ledger' column that is matched against the
    repository names, and one column per metric
    :param figures_dir: directory where the figures are saved
    """
    all_repos = []
    for ledger_repo_list in ledger_repos.values():
        all_repos.extend(ledger_repo_list)

    df = pd.read_csv(file, index_col='date')
    df.index = pd.to_datetime(df.index)
    palette = sns.color_palette(cc.glasbey, n_colors=len(all_repos))

    for metric in metrics:
        plt.figure(figsize=(10, 6))
        # one line per repository, each with its own color from the palette
        for repo, color in zip(all_repos, palette):
            rows_of_repo = df['ledger'] == repo
            plt.plot(df.loc[rows_of_repo, [metric]], label=repo, marker='o', markersize=3, color=color)
        plt.xlabel('Date')
        y_label = metric.replace('_', ' ').title()
        plt.ylabel(y_label)
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=3, fancybox=True, shadow=True)
        output_file = figures_dir / f"{metric}.png"
        plt.savefig(output_file, bbox_inches='tight')
        plt.close("all")
def plot_doughnut_chart(data_dict, title='', filepath='figures/doughnut_chart.png'):
    """
    Plots a doughnut chart with the data provided in the data_dict and saves it in a png file. The wedges are drawn
    in descending order of value and carry no text labels.
    :param data_dict: dictionary with the data to be plotted. The keys are the labels and the values are the values to be
    plotted.
    :param title: optional title for the plot
    :param filepath: the path where the plot will be saved
    """
    fig, ax = plt.subplots()
    ax.set_title(title)
    # sort the data_dict by values in descending order
    data_dict = dict(sorted(data_dict.items(), key=lambda x: x[1], reverse=True))
    # pass a concrete list: a dict view is not a sequence and cannot be converted to a numeric array
    ax.pie(list(data_dict.values()), wedgeprops=dict(width=0.5), startangle=0)
    plt.savefig(filepath, bbox_inches='tight')
    # release the figure; this function may be called once per repository
    plt.close(fig)
def plot(ledger_repos, metrics, commits_per_sample_window, contributor_type, contribution_type):
    """
    Generates all figures for one combination of contributor type and contribution type: the contribution
    distribution plots of every repository and, if the 'all_metrics.csv' file exists, the comparative metric plots.
    :param ledger_repos: dictionary that contains the repositories for each ledger
    :param metrics: the names of the metrics to plot
    :param commits_per_sample_window: forwarded as is to hlp.get_output_dir to locate the data and figure directories
    :param contributor_type: forwarded as is to hlp.get_output_dir
    :param contribution_type: forwarded to hlp.get_output_dir and used in the labels of the distribution plots
    """
    # arguments shared by every output directory lookup
    shared_args = dict(contribution_type=contribution_type, contributor_type=contributor_type,
                       commits_per_sample_window=commits_per_sample_window)

    entity_data_dir = hlp.get_output_dir(output_type='data', data_type='contributions_per_entity', **shared_args)
    all_metrics_csv = hlp.get_output_dir(output_type='data', data_type='metrics', **shared_args) / 'all_metrics.csv'
    # the figure directories are requested with mkdir=True (presumably so that they get created - confirm in helper)
    dynamics_dir = hlp.get_output_dir(output_type='figures', data_type='dynamics', mkdir=True, **shared_args)
    metrics_dir = hlp.get_output_dir(output_type='figures', data_type='metrics', mkdir=True, **shared_args)

    logging.info("Plotting commit distributions for each repo..")
    plot_contribution_distribution(ledger_repos=ledger_repos, data_dir=entity_data_dir, figures_dir=dynamics_dir,
                                   legend=False, contribution_type=contribution_type)

    logging.info("Plotting metrics..")
    if not all_metrics_csv.exists():
        return  # no metrics file, so there is nothing to compare
    plot_comparative_metrics(ledger_repos=ledger_repos, metrics=metrics, file=all_metrics_csv,
                             figures_dir=metrics_dir)
if __name__ == '__main__':
    # Script entry point: generate the plots for every (contribution type, contributor type) combination.
    logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO)

    # run configuration, as provided by the helper module
    ledger_repos = hlp.get_ledger_repos()
    metrics = hlp.get_metrics()
    commits_per_sample_window = hlp.get_commits_per_sample_window_list()
    contributor_types = hlp.get_contributor_types()

    for contribution_type in hlp.get_contribution_types():
        logging.info(f'Plotting by contribution type: {contribution_type}')
        for contributor_type in contributor_types:
            logging.info(f'Plotting per entity type: {contributor_type}')
            plot(ledger_repos=ledger_repos, metrics=metrics, commits_per_sample_window=commits_per_sample_window,
                 contributor_type=contributor_type, contribution_type=contribution_type)