-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsetup.Rmd
220 lines (184 loc) · 7.5 KB
/
setup.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
---
title: Setup
---
```{r, include=FALSE}
# Let knitr accept chunk labels that occur more than once.
options(knitr.duplicate.label = "allow")
```
This page outlines the steps required to prepare the R environment.
## Required packages
```{r setup-load-packages, message=FALSE}
# Bayesian data analysis tools
library(brms)
# Miscellaneous plotting
library(ggplot2)
library(ggpattern)
library(ggdag)
# Causal analysis
library(dagitty)
```
## Constants
Here we define some useful constants.
```{r setup-define-constants}
# Positions (within ordered_category_names) of the categories that are treated
# as the major ones: N, A, AE, AEP and AEP+.
major_categories <- c(1, 2, 3, 5, 7)

# Category labels; the position of a label is its category number.
ordered_category_names <- c(
  "N",    # category 1
  "A",    # category 2
  "AE",   # category 3
  "AP",   # category 4
  "AEP",  # category 5
  "AE+",  # category 6
  "AEP+", # category 7
  "AP+",  # category 8
  "E+",   # category 9
  "A+",   # category 10
  "E"     # category 11
)
```
## Functions
Here we define some common convenience functions.
```{r setup-load-utility-functions}
# Returns the whole number of days between a date and the analysis date.
#
# yymmdd:         date(s) to measure from, formatted as "YYYY-MM-DD".
# reference_date: the analysis date, formatted as "YYYY-MM-DD". It defaults to
#                 2023-02-28 so that the ages do not change from day to day.
#
# Both sides are converted to Date before the difference is taken. Passing the
# reference to difftime() as a string makes it local midnight while a Date is
# treated as UTC midnight; after truncation to an integer the result is then
# one day short in time zones ahead of UTC.
get_age <- function(yymmdd, reference_date = "2023-02-28") {
  analysis_date <- as.Date(reference_date, format = "%Y-%m-%d")
  start_date <- as.Date(yymmdd, format = "%Y-%m-%d")
  as.integer(difftime(analysis_date, start_date, units = "days"))
}
# Appends a "<metric>_kloc" column for every metric, holding the metric value
# per 1,000 lines of code (metric / loc * 1000). The input columns are kept.
add_scaled_metrics <- function(data) {
  metric_names <- c(
    "bugs",
    "code_smells",
    "critical_violations",
    "major_violations",
    "minor_violations",
    "security_hotspots",
    "vulnerabilities",
    "duplicated_lines",
    "cyclomatic_complexity",
    "cognitive_complexity"
  )
  for (metric in metric_names) {
    scaled_name <- paste0(metric, "_kloc")
    data[[scaled_name]] <- (data[[metric]] / data$loc) * 1000
  }
  data
}
# Maps category numbers to a warning indicator: 2 for categories 2 through 11
# (projects that use warnings), 1 for everything else.
uses_warnings <- function(categories) {
  warning_categories <- 2:11
  has_warnings <- categories %in% warning_categories
  ifelse(has_warnings, yes = 2, no = 1)
}
# Fits a brms model with a Gaussian likelihood and caches it under "fits/".
#
# name:    file name (without extension) of the cached fit inside "fits/".
# formula: model formula passed on to brm().
# priors:  prior specification passed on to brm().
# data:    data frame the model is fitted to.
# seed:    random seed passed on to brm().
#
# Sampling uses 4 chains on 4 cores with 10,000 iterations each, of which
# 5,000 are warmup. An existing cached fit is reused according to
# file_refit="on_change".
fit_model <- function(name, formula, priors, data, seed) {
  fit_path <- file.path("fits", name)
  # Make sure the cache directory exists before sampling starts; otherwise the
  # fit cannot be written to disk once the (long) sampling run has finished.
  dir.create(dirname(fit_path), showWarnings = FALSE, recursive = TRUE)
  brm(
    formula,
    family = gaussian,
    prior = priors,
    data = data,
    iter = 10000,
    warmup = 5000,
    chains = 4,
    cores = 4,
    file = fit_path,
    file_refit = "on_change",
    save_pars = save_pars(all = TRUE),
    seed = seed
  )
}
# Maps standardized value(s) back to the natural scale of the source data by
# undoing the (x - mean) / sd transformation.
restore_from_std <- function(value, data) {
  center <- mean(data)
  spread <- sd(data)
  center + spread * value
}
# Builds a data frame with one row per category (row names taken from
# ordered_category_names) and two columns: "std", the standardized means as
# given, and "natural", the same means converted to the natural scale of
# `data` and rounded to `precision` decimal places.
determine_natural_scale_means <- function(std_means, data, precision = 2) {
  natural_means <- round(restore_from_std(std_means, data), digits = precision)
  data.frame(
    row.names = ordered_category_names,
    std = std_means,
    natural = natural_means
  )
}
# Creates a data frame with summary data in the natural scale for the major
# categories, similar to what you get from the brms "summary" function.
#
# fit:       fitted model whose draws contain the parameters b_category1,
#            b_category2, b_category3, b_category5 and b_category7.
# raw_data:  source data on the natural scale; its mean and standard deviation
#            are used to undo the standardization.
# precision: number of decimal places in the returned values.
#
# Returns a data frame with the rows N, A, AE, AEP and AEP+ and the columns
# "Mean", "SD", "0.05 CI" and "0.95 CI". Stops with an error if one of the
# required parameters is missing from the fit.
create_natural_scale_summary_df <- function(fit, raw_data, precision = 2) {
  category_labels <- c("N", "A", "AE", "AEP", "AEP+")
  parameter_names <- paste0("b_category", c(1, 2, 3, 5, 7))

  draws <- as.data.frame(fit)
  missing_parameters <- setdiff(parameter_names, names(draws))
  if (length(missing_parameters) > 0) {
    stop(
      "The fit has no draws for: ",
      paste(missing_parameters, collapse = ", "),
      call. = FALSE
    )
  }

  # Select the interval rows by parameter name rather than by position, so the
  # result does not depend on the order in which the parameters are listed.
  intervals <- posterior_interval(fit, prob = 0.90) # 0.05-0.95
  intervals <- intervals[parameter_names, , drop = FALSE]

  std_means <- vapply(
    parameter_names,
    function(parameter) mean(draws[[parameter]]),
    numeric(1),
    USE.NAMES = FALSE
  )
  std_sds <- vapply(
    parameter_names,
    function(parameter) sd(draws[[parameter]]),
    numeric(1),
    USE.NAMES = FALSE
  )

  # Rounding happens once, after the conversion; rounding the standardized
  # values first would magnify the rounding error by sd(raw_data).
  data.frame(
    row.names = category_labels,
    check.names = FALSE,
    `Mean` = round(restore_from_std(std_means, raw_data), digits = precision),
    # A standard deviation is a spread, not a location: it is only rescaled,
    # the mean of the source data must not be added to it.
    `SD` = round(std_sds * sd(raw_data), digits = precision),
    `0.05 CI` = round(
      restore_from_std(unname(intervals[, 1]), raw_data),
      digits = precision
    ),
    `0.95 CI` = round(
      restore_from_std(unname(intervals[, 2]), raw_data),
      digits = precision
    )
  )
}
# Draws an interval plot for every parameter of a fitted model whose name
# matches "b_category", i.e. the coefficients of all categories.
plot_intervals_of_categories <- function(model) {
  interval_plot <- mcmc_plot(
    model,
    type = "intervals",
    variable = "b_category",
    regex = TRUE,
    point_size = 2,
    prob = 0.5,       # probability mass of the inner interval
    prob_outer = 0.90 # probability mass of the outer interval
  )
  interval_plot + theme_default(base_family = "sans", base_size = 14)
}
# Draws an interval plot restricted to the major categories of a fitted model
# and labels the y axis with the category names instead of parameter names.
plot_intervals_of_major_categories <- function(fit) {
  # Parameter name -> category label; drives both the selection and the labels
  major_labels <- c(
    b_category1 = "N",
    b_category2 = "A",
    b_category3 = "AE",
    b_category5 = "AEP",
    b_category7 = "AEP+"
  )
  interval_plot <- mcmc_plot(
    fit,
    type = "intervals",
    variable = names(major_labels),
    regex = FALSE,
    point_size = 2,
    prob = 0.5,       # probability mass of the inner interval
    prob_outer = 0.90 # probability mass of the outer interval
  )
  interval_plot +
    theme_default(base_family = "sans", base_size = 14) +
    scale_y_discrete(labels = major_labels)
}
# Draws a predictive check of `fit` based on 25 draws, titled "Prior predictive
# check"; `xlab` is used as the label of the x axis.
# NOTE(review): presumably `fit` was sampled from the priors only - confirm.
plot_priors <- function(fit, xlab) {
  check_plot <- pp_check(fit, ndraws = 25)
  check_plot +
    theme_default(base_family = "sans", base_size = 14) +
    labs(
      title = "Prior predictive check",
      x = xlab,
      y = "Probability density"
    )
}
# Draws the trace ("caterpillar") plot of the variable(s) `var` of a fitted
# model; `ylabel` is used as the label of the y axis.
plot_caterpillar <- function(fit, var, ylabel) {
  trace_plot <- mcmc_plot(fit, type = "trace", variable = var)
  trace_plot + ylab(ylabel)
}
```
## Data
The data we need has already been prepared in a separate CSV file, so all we need to do is load it. We also add columns with metric values scaled per 1,000 lines of code in each sample, so that we can compare metrics across projects of different sizes.
```{r setup-load-data-frame}
# Load the prepared samples and append the columns with metrics scaled per KLOC
df <- add_scaled_metrics(read.csv("data/data_frame.csv"))
```
## Environment
The following provides information about the R environment used to generate the results.
```{r setup-print-env-info}
# Print the R version, platform and loaded packages used for this run
sessionInfo()
```