Skip to content

Commit 7555634

Browse files
authored
Clean up and refactor input generation to avoid several issues with artifact (#564)
* Refactor input downloading to clean steps Signed-off-by: Konstantinos Kallas <[email protected]> * Some notes from the generation Signed-off-by: Konstantinos Kallas <[email protected]> * Comments on lib functions Signed-off-by: Konstantinos Kallas <[email protected]> * Change wget to curl Signed-off-by: Konstantinos Kallas <[email protected]> * checkpoint Signed-off-by: Konstantinos Kallas <[email protected]> * JPG downloading Signed-off-by: Konstantinos Kallas <[email protected]> * nginx logs Signed-off-by: Konstantinos Kallas <[email protected]> * Add a build library Signed-off-by: Konstantinos Kallas <[email protected]>
1 parent f3a47e8 commit 7555634

File tree

2 files changed

+228
-55
lines changed

2 files changed

+228
-55
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
##
2+
## A library of shell functions that can be used to
3+
## easily create a building/dependency installing/input
4+
## downloading scripts.
5+
##
6+
7+
8+
9+
##
## Check that every path given as an argument is an existing regular file.
##
## Arguments: $@ - paths to check (may be empty)
## Returns:   0 if every path passes `[ -f ]` (also when no path is given),
##            1 as soon as one path does not
##
files_exist_done_check()
{
    # Keep the loop variable out of the caller's scope.
    local file
    for file in "$@"; do
        if [ ! -f "$file" ]; then
            return 1
        fi
    done
    return 0
}
22+
23+
##
## Check that the given paths contain, in total, the expected number of
## entries.
##
## Arguments:
##   $1     - expected total number of entries
##   $2...  - directories to count in; a path that is not a directory counts
##            as one entry if it exists. With no path, the current directory
##            is inspected (what a bare `ls` would list).
##
## Hidden (dot) entries are not counted, as the `*` glob skips them by default.
##
## Returns 0 if the total equals $1, or 1 otherwise (including when $1 is
## not an integer).
##
number_of_files_in_dir()
{
    local expected_number=$1
    shift
    [ "$#" -gt 0 ] || set -- .

    local actual_number=0
    local path entry
    for path in "$@"; do
        if [ -d "$path" ]; then
            # Count via the glob rather than `ls | wc -l`: with several
            # directories `ls` also prints "dir:" headers and blank lines,
            # which would inflate the count.
            for entry in "$path"/*; do
                # An unmatched glob stays literal; only count real entries
                # (-L also catches dangling symlinks).
                if [ -e "$entry" ] || [ -L "$entry" ]; then
                    actual_number=$((actual_number + 1))
                fi
            done
        elif [ -e "$path" ] || [ -L "$path" ]; then
            actual_number=$((actual_number + 1))
        fi
    done

    if [ "$expected_number" -eq "$actual_number" ]; then
        return 0
    else
        return 1
    fi
}
38+
39+
##
## Run a step unless its done-check already succeeds, then verify it again.
##
## Arguments:
##   $1 - command that performs the step (run through eval)
##   $2 - command that succeeds once the step's results are in place
##        (run through eval)
##   $3 - step name used in the messages (default: "Execution step")
##
## Requirements:
##   - The step needs to be idempotent
##   - The check needs to also check file sizes if there is concern of
##     non-idempotence or failed download
##
## If the check still fails after the step ran, an error is printed and the
## shell exits with status 1.
##
execute_step()
{
    # These names stay as they are: code run through eval can see them.
    local step_fun=$1
    local step_done_check_fun=$2
    local step_desc=${3:-"Execution step"}

    # Nothing to do when the check already passes.
    # shellcheck disable=SC2086
    if eval $step_done_check_fun; then
        echo "$step_desc completed."
        return
    fi

    echo "$step_desc is not done, executing..."
    # shellcheck disable=SC2086
    eval $step_fun

    # The step ran; its check must hold now.
    # shellcheck disable=SC2086
    if ! eval $step_done_check_fun; then
        echo "ERROR: $step_desc failed!"
        exit 1
    fi
    echo "$step_desc completed."
}
62+
63+
## Issues:
64+
##
65+
## - An overarching problem is that these take time in general,
66+
## and therefore testing them out is not really feasible.
67+
## - Another problem is that by doing that manually,
68+
## we cannot get completely fine-grained. For example, we could
69+
## only copy the missing file _a la_ Rattle, instead of running
70+
## the whole step.
71+
## - Another problem is that idempotence checking is hard to do manually.
72+
## - Another issue is that generating the checks is cumbersome and error-prone.
73+
## Users need to think whether they need file_exists/number_of_files/size checks,
74+
## and if they are downloading, they need to first download and then determine the check.
75+
##

evaluation/benchmarks/dependency_untangling/input/setup.sh

+153-55
Original file line numberDiff line numberDiff line change
@@ -7,77 +7,175 @@ IN=$PASH_TOP/evaluation/benchmarks/dependency_untangling/input/
77
OUT=$PASH_TOP/evaluation/benchmarks/dependency_untangling/output/
88
IN_NAME=$PASH_TOP/evaluation/benchmarks/dependency_untangling/input/100G.txt
99

10+
## Import the build library
11+
. "$IN/build_lib.sh"
12+
1013
if [ "$1" == "-c" ]; then
11-
rm -rf ${IN}/jpg
12-
rm -rf ${IN}/log_data
13-
rm -rf ${IN}/wav
14-
rm -rf ${IN}/nginx-logs
15-
rm -rf ${IN}/node_modules
16-
rm -rf ${IN}/pcap_data
17-
rm -rf ${IN}/pcaps
18-
rm -rf ${IN}/packages
19-
rm -rf ${IN}/mir-sa
20-
rm -rf ${IN}/deps
21-
rm -rf ${IN}/bio
22-
rm -rf ${IN}/output
23-
rm -rf ${OUT}
14+
rm -rf "${IN}/jpg"
15+
rm -rf "${IN}/log_data"
16+
rm -rf "${IN}/wav"
17+
rm -rf "${IN}/nginx-logs"
18+
rm -rf "${IN}/node_modules"
19+
rm -rf "${IN}/pcap_data"
20+
rm -rf "${IN}/pcaps"
21+
rm -rf "${IN}/packages"
22+
rm -rf "${IN}/mir-sa"
23+
rm -rf "${IN}/deps"
24+
rm -rf "${IN}/bio"
25+
rm -rf "${IN}/output"
26+
rm -rf "${OUT}"
2427
exit
2528
fi
2629

30+
31+
32+
## Q: Can these checks be generated automatically? This would be great if
33+
## the user just ran the command, and then if it succeeded, the test is generated.
34+
##
## Done-check for wav_step_1: succeeds when the four renamed example files
## (wav/file_example_WAV_{1,2,5,10}MG.wav.kernel) pass files_exist_done_check.
## Returns whatever files_exist_done_check returns.
##
wav_step_1_done_check()
{
    local stem="wav/file_example_WAV"
    local size
    local -a kernels=()

    for size in 1MG 2MG 5MG 10MG; do
        kernels+=("${stem}_${size}.wav.kernel")
    done
    files_exist_done_check "${kernels[@]}"
}
40+
41+
## Q: Can we automatically check that this is idempotent?
42+
## For example, in the step below there were 2 non-idempotence issues:
43+
## - wget saves to wav.zip.1, wav.zip.2, ... if wav.zip already exists, so the download needs a fixed output name (wget -O; the step below uses curl -o)
44+
## - wav files need to be saved with .kernel suffix to make step 2 idempotent
45+
##
## Download wav.zip into the current directory, unpack it, and rename the
## four example files wav/file_example_WAV_{1,2,5,10}MG.wav to *.wav.kernel.
## The archive is expected to contain those four files under wav/.
##
wav_step_1()
{
    local prefix="wav/file_example_WAV"
    local f

    curl -C - -o wav.zip http://pac-n4.csail.mit.edu:81/pash_data/wav.zip
    # -o: overwrite without prompting, so a re-run after a partial
    # extraction does not block on unzip's interactive "replace?" question.
    unzip -o wav.zip
    ## Necessary so that the iteration in step 2 is idempotent
    for f in "${prefix}_1MG.wav" "${prefix}_2MG.wav" "${prefix}_5MG.wav" "${prefix}_10MG.wav"; do
        mv -- "$f" "$f.kernel"
    done
}
55+
export -f wav_step_1_done_check
56+
export -f wav_step_1
57+
58+
##
## Done-check for wav_step_2: for every index 0..WAV_DATA_FILES the four
## generated copies wav/file_example_WAV_{1,2,5,10}MG.wav<i>.wav must pass
## files_exist_done_check.
##
## Globals: WAV_DATA_FILES (read) - highest copy index
## Outputs: "Done" on stdout when everything is present
## Returns: 0 when all copies exist, 1 otherwise or when WAV_DATA_FILES is
##          not set
##
wav_step_2_done_check()
{
    local prefix="wav/file_example_WAV"
    local i

    # Without a count there is nothing to iterate over; report "not done"
    # instead of succeeding without having checked a single file.
    if [ -z "${WAV_DATA_FILES:-}" ]; then
        echo "wav_step_2_done_check: WAV_DATA_FILES is not set" >&2
        return 1
    fi

    for (( i = 0; i <= WAV_DATA_FILES; i++ )); do
        if ! files_exist_done_check "${prefix}_1MG.wav$i.wav" "${prefix}_2MG.wav$i.wav" "${prefix}_5MG.wav$i.wav" "${prefix}_10MG.wav$i.wav"; then
            return 1
        fi
    done
    echo "Done"
    return 0
}
69+
70+
##
## Generate the WAV data set: copy every wav/*.kernel file to
## wav/<name><i>.wav for i = 0..WAV_DATA_FILES, where <name> is the file
## name without the .kernel suffix.
##
## Globals: WAV_DATA_FILES (read) - highest copy index
## Outputs: one "copying to <target>" line per copy on stdout
##
wav_step_2()
{
    local f base_f i

    for f in wav/*.kernel; do
        # An unmatched glob is left as the literal pattern; skip it.
        [ -e "$f" ] || continue
        # Compute the target stem once per source file, before it is
        # printed, so the message always names the real target.
        base_f=wav/$(basename "$f" .kernel)
        for (( i = 0; i <= WAV_DATA_FILES; i++ )); do
            echo copying to "$base_f$i.wav"
            cp "$f" "$base_f$i.wav"
        done
    done
}
80+
export -f wav_step_2_done_check
81+
export -f wav_step_2
82+
83+
84+
##
## Download the JPG archive into the current directory as jpg.zip, unpack
## it, and delete the archive.
##
## Globals: JPG_DATA_LINK (read) - URL of the archive
##
jpg_step()
{
    curl -C - -o jpg.zip "$JPG_DATA_LINK"
    # -o: overwrite without prompting, so a re-run after a partial
    # extraction does not block on unzip's interactive "replace?" question.
    unzip -o jpg.zip
    # Remove the file that was downloaded above (same relative path).
    rm -f -- jpg.zip
}
90+
91+
##
## Done-check for jpg_step: delegates to number_of_files_in_dir with the
## expected count and the jpg directory, and returns its status.
##
## Globals: JPG_NUMBER (read) - expected number of entries in jpg/
##
jpg_step_done_check()
{
    # Quoted so an empty JPG_NUMBER stays the first argument and does not
    # let "jpg" slide into the expected-count position.
    number_of_files_in_dir "${JPG_NUMBER:-}" jpg
}
95+
export -f jpg_step
96+
export -f jpg_step_done_check
97+
98+
##
## Download nginx.zip into the current directory, unpack it, and delete the
## archive. The archive is expected to provide nginx-logs/log0..log7, which
## is what the matching done-check looks for.
##
nginx_logs_step_1()
{
    curl -C - -o nginx.zip http://pac-n4.csail.mit.edu:81/pash_data/nginx.zip
    # -o: overwrite without prompting, so a re-run after a partial
    # extraction does not block on unzip's interactive "replace?" question.
    unzip -o nginx.zip
    rm nginx.zip
}
104+
105+
##
## Done-check for nginx_logs_step_1: nginx-logs/log0 .. nginx-logs/log7 must
## each pass files_exist_done_check.
##
## Returns: 0 when all eight files are present, 1 otherwise
##
nginx_logs_step_1_done_check()
{
    local prefix="nginx-logs/log"
    # Keep the loop variable out of the caller's scope.
    local i

    for (( i = 0; i <= 7; i++ )); do
        if ! files_exist_done_check "${prefix}$i"; then
            return 1
        fi
    done
    # Reaching this point means every file was found.
    return 0
}
115+
116+
export -f nginx_logs_step_1
117+
export -f nginx_logs_step_1_done_check
118+
119+
##
## Generate the analysis logs: copy every file in nginx-logs/ to
## log_data/log<i>_<name>.log for i = 1..LOG_DATA_FILES.
##
## Globals: LOG_DATA_FILES (read) - number of copies per source log
##
nginx_logs_step_2()
{
    local i j n

    # generating analysis logs
    # Create the directory relative to the working directory: that is where
    # cp writes below and where the done-check looks.
    mkdir -p log_data
    for j in nginx-logs/*; do
        # An unmatched glob is left as the literal pattern; skip it.
        [ -e "$j" ] || continue
        # The source name is the same for every copy; compute it once.
        n=$(basename "$j")
        for (( i = 1; i <= LOG_DATA_FILES; i++ )); do
            cp "$j" "log_data/log${i}_${n}.log"
        done
    done
}
130+
131+
132+
##
## Done-check for nginx_logs_step_2: for every source log 0..7 and every
## copy index 1..LOG_DATA_FILES, log_data/log<i>_log<j>.log must pass
## files_exist_done_check.
##
## Globals: LOG_DATA_FILES (read) - number of copies per source log
## Outputs: "Done" on stdout when everything is present
## Returns: 0 when all copies exist, 1 otherwise or when LOG_DATA_FILES is
##          not set
##
nginx_logs_step_2_done_check()
{
    local prefix="log_data/log"
    local i j

    # Without a count there is nothing to iterate over; report "not done"
    # instead of succeeding without having checked a single file.
    if [ -z "${LOG_DATA_FILES:-}" ]; then
        echo "nginx_logs_step_2_done_check: LOG_DATA_FILES is not set" >&2
        return 1
    fi

    for (( j = 0; j <= 7; j++ )); do
        for (( i = 1; i <= LOG_DATA_FILES; i++ )); do
            if ! files_exist_done_check "${prefix}${i}_log${j}.log"; then
                return 1
            fi
        done
    done
    echo "Done"
    return 0
}
145+
146+
export -f nginx_logs_step_2_done_check
147+
export -f nginx_logs_step_2
148+
149+
27150
setup_dataset() {
28151
if [ "$1" == "--small" ]; then
29-
LOG_DATA_FILES=6
30-
WAV_DATA_FILES=20
152+
export LOG_DATA_FILES=6
153+
export WAV_DATA_FILES=20
31154
NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/node_modules.zip
32155
BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/bio.zip
33-
JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/jpg.zip
156+
export JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/jpg.zip
157+
export JPG_NUMBER=508
34158
PCAP_DATA_FILES=1
35159
else
36-
LOG_DATA_FILES=84
37-
WAV_DATA_FILES=120
160+
export LOG_DATA_FILES=84
161+
export WAV_DATA_FILES=120
38162
NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/node_modules.zip
39163
BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/bio.zip
40-
JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/jpg.zip
164+
export JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/jpg.zip
165+
export JPG_NUMBER=1624
41166
PCAP_DATA_FILES=15
42167
fi
43168

44-
if [ ! -d ${IN}/wav ]; then
45-
wget http://pac-n4.csail.mit.edu:81/pash_data/wav.zip
46-
unzip wav.zip && cd wav/
47-
for f in *.wav; do
48-
FILE=$(basename "$f")
49-
for (( i = 0; i <= $WAV_DATA_FILES; i++)) do
50-
echo copying to $f$i.wav
51-
cp $f $f$i.wav
52-
done
53-
done
54-
echo "WAV Generated"
55-
fi
56-
57-
if [ ! -d ${IN}/jpg ]; then
58-
cd ${IN}
59-
wget $JPG_DATA_LINK
60-
unzip jpg.zip
61-
echo "JPG Generated"
62-
rm -rf ${IN}/jpg.zip
63-
fi
169+
## WAV
170+
execute_step wav_step_1 wav_step_1_done_check "WAV zip download"
171+
execute_step wav_step_2 wav_step_2_done_check "WAV file generation"
172+
173+
## JPG
174+
execute_step jpg_step jpg_step_done_check "JPG Downloading"
64175

65-
# download the input for the nginx logs and populate the dataset
66-
if [ ! -d ${IN}/log_data ]; then
67-
cd $IN
68-
wget http://pac-n4.csail.mit.edu:81/pash_data/nginx.zip
69-
unzip nginx.zip
70-
rm nginx.zip
71-
# generating analysis logs
72-
mkdir -p ${IN}/log_data
73-
for (( i = 1; i <=$LOG_DATA_FILES; i++)) do
74-
for j in nginx-logs/*;do
75-
n=$(basename $j)
76-
cat $j > log_data/log${i}_${n}.log;
77-
done
78-
done
79-
echo "Logs Generated"
80-
fi
176+
## nginx logs
177+
execute_step nginx_logs_step_1 nginx_logs_step_1_done_check "NGINX logs Downloading"
178+
execute_step nginx_logs_step_2 nginx_logs_step_2_done_check "NGINX logs generated"
81179

82180
if [ ! -d ${IN}/bio ]; then
83181
if [ "$1" = "--small" ]; then

0 commit comments

Comments
 (0)