Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

System transforms #52

Open
wants to merge 47 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
8de402e
Complete vps-audit and makeself benchmarks for CI integration + minor…
Geoka1 Dec 23, 2024
4f64a85
set-up-to-date
Geoka1 Dec 27, 2024
78a0ede
deps fix
Geoka1 Dec 27, 2024
1e864e6
fix deps
Geoka1 Dec 27, 2024
ef73a25
Merge branch 'complete-vps-makeself'
Geoka1 Dec 27, 2024
e5d652c
fix deps v2
Geoka1 Dec 27, 2024
d7be5ce
Merge branch 'binpash:main' into main
Geoka1 Dec 27, 2024
952a8e2
fixes
Geoka1 Dec 27, 2024
0fa7a0f
fixes
Geoka1 Dec 27, 2024
1ae7d6d
remove extra files
Geoka1 Dec 27, 2024
d685eec
vps-audit verification changes
Geoka1 Dec 30, 2024
e20dcc1
fetched changes
Geoka1 Dec 30, 2024
235baf4
Added makeself, vps-audit and vps-audit with negation
Geoka1 Jan 2, 2025
7308b9d
change verification for vps, added vps-negate to ci
Geoka1 Jan 2, 2025
51765ff
Make vps-audit work with existing Docker image
Geoka1 Jan 3, 2025
f9461b0
Changed README.md for iptables to work
Geoka1 Jan 3, 2025
70a04e2
vps verification changes
Geoka1 Jan 3, 2025
8926a2a
fix infotest because of varying bin sizes
Geoka1 Jan 4, 2025
9243c16
Make verify.sh fit common format
vagos Jan 4, 2025
3d60532
Bring up-to-date with upstream
Geoka1 Jan 4, 2025
45bc648
vps-audit-negate verification fixes
Geoka1 Jan 5, 2025
0ccc54d
Minor fixes on makeself
Geoka1 Jan 5, 2025
cd30d0e
Added new benchmarks on tests.yml
Geoka1 Jan 5, 2025
855f633
Merge branch 'binpash:main' into main
Geoka1 Jan 6, 2025
1860099
Merge remote-tracking branch 'upstream/main'
Geoka1 Jan 6, 2025
4d737da
Parallelizing benchmarks using GNU-Parallel
Geoka1 Jan 6, 2025
fa2f37d
Introduced GNU Parallel to more benchmarks
Geoka1 Jan 8, 2025
70778dc
Added parallelized VPS audit
Geoka1 Jan 9, 2025
5eb8c40
Changes in GNU parallel benchmarks and added Shark
Geoka1 Jan 10, 2025
b5dacee
Merge branch 'binpash:main' into adding-systems
Geoka1 Jan 11, 2025
e7dfaee
multiple changes in systems
Geoka1 Jan 11, 2025
edd30b2
Changed transformations
Geoka1 Jan 11, 2025
6b9f8e2
Removed time command
Geoka1 Jan 11, 2025
d734976
fixes
Geoka1 Jan 11, 2025
10dbda5
prepare systems for dynamic analysis
Geoka1 Jan 12, 2025
ffd7aa6
changing run scripts
Geoka1 Jan 12, 2025
d9be448
add scripts to sync
Geoka1 Jan 12, 2025
0e07851
added time
Geoka1 Jan 12, 2025
1f6e76a
changed back riker/sql in shark
Geoka1 Jan 12, 2025
faf1f30
fixed web index
Geoka1 Jan 13, 2025
9332e69
changed shark web index
Geoka1 Jan 13, 2025
e1db127
changed nlp shark
Geoka1 Jan 13, 2025
5c10873
changed aurpkg
Geoka1 Jan 13, 2025
23c0488
gnu parallel changes
Geoka1 Jan 13, 2025
22ac5e3
fixed sklearn
Geoka1 Jan 13, 2025
05e7088
fixed bigrams
Geoka1 Jan 13, 2025
e7aadac
fixed 4 last scripts
Geoka1 Jan 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
Binary file added .DS_Store
Binary file not shown.
7 changes: 6 additions & 1 deletion aurpkg/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,13 @@ mkdir -p ${OUT}

script="./scripts/pacaur.sh"

BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
export BENCHMARK_CATEGORY="aurpkg"
export BENCHMARK_SCRIPT="$(realpath "$script")"
export BENCHMARK_INPUT_FILE="$(realpath "$IN")"

# Switch to user "user" to avoid permission issues

echo "$script"
$BENCHMARK_SHELL "$script" "$IN" "$OUT"
time $BENCHMARK_SHELL "$script" "$IN" "$OUT"
echo "$?"
1 change: 1 addition & 0 deletions bio/deps.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/bin/bash
# install dependencies
required_version="1.7"

Expand Down
2 changes: 2 additions & 0 deletions bio/input.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

IN=inputs
IN_NAME=input.txt

Expand Down
9 changes: 8 additions & 1 deletion bio/run.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

# create bam files with regions
################### 1KG SAMPLES
IN=inputs
Expand All @@ -8,6 +10,11 @@ if [[ "$@" == *"--small"* ]]; then
IN_NAME=input_small.txt
fi

export BENCHMARK_CATEGORY="bio"
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}

"$BENCHMARK_SHELL" ./scripts/bio.sh "$IN" "$IN_NAME" "$OUT"
script_file=./scripts/bio.sh
export BENCHMARK_SCRIPT="$(realpath "$script_file")"
export BENCHMARK_INPUT_FILE="$(realpath "$IN_NAME")"

time $BENCHMARK_SHELL "$script_file" "$IN" "$IN_NAME" "$OUT"
14 changes: 10 additions & 4 deletions covid-mts/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,15 @@ output_scoped="$outputs_dir/outputs$suffix"
mkdir -p "$output_scoped"

BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
export BENCHMARK_CATEGORY="covid-mts"
export BENCHMARK_INPUT_FILE="$(realpath "$input_file")"

$BENCHMARK_SHELL "$scripts_dir/1.sh" "$input_file" > "$output_scoped/1.out"
$BENCHMARK_SHELL "$scripts_dir/2.sh" "$input_file" > "$output_scoped/2.out"
$BENCHMARK_SHELL "$scripts_dir/3.sh" "$input_file" > "$output_scoped/3.out"
$BENCHMARK_SHELL "$scripts_dir/4.sh" "$input_file" > "$output_scoped/4.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/1.sh")"
time $BENCHMARK_SHELL "$scripts_dir/1.sh" "$input_file" > "$output_scoped/1.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/2.sh")"
time $BENCHMARK_SHELL "$scripts_dir/2.sh" "$input_file" > "$output_scoped/2.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/3.sh")"
time $BENCHMARK_SHELL "$scripts_dir/3.sh" "$input_file" > "$output_scoped/3.out"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/4.sh")"
time $BENCHMARK_SHELL "$scripts_dir/4.sh" "$input_file" > "$output_scoped/4.out"

2 changes: 2 additions & 0 deletions file-enc/deps.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

sudo apt-get update

pkgs='ffmpeg unrtf imagemagick libarchive-tools libncurses5-dev libncursesw5-dev zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump'
Expand Down
9 changes: 7 additions & 2 deletions file-enc/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ if [[ "$1" == "--small" ]]; then
suffix=".small"
fi

export BENCHMARK_CATEGORY="file-enc"
export BENCHMARK_INPUT_FILE="$(realpath "$input_pcaps")"
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
$BENCHMARK_SHELL $scripts_dir/compress_files.sh $input_pcaps $results_dir/compress_files$suffix
$BENCHMARK_SHELL $scripts_dir/encrypt_files.sh $input_pcaps $results_dir/encrypt_files$suffix

export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/compress_files.sh")"
time $BENCHMARK_SHELL "$scripts_dir/compress_files.sh" "$input_pcaps" "$results_dir/compress_files$suffix"
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/encrypt_files.sh")"
time $BENCHMARK_SHELL "$scripts_dir/encrypt_files.sh" "$input_pcaps" "$results_dir/encrypt_files$suffix"
Binary file added infrastructure/.DS_Store
Binary file not shown.
10 changes: 4 additions & 6 deletions infrastructure/Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv target/cyclomatic.csv target/shellmetrics.sh
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/cyclomatic.csv target/shellmetrics.sh target/dynamic_analysis.jsonl

static: $(STATIC_OUTPUTS)

target/scripts_to_benchmark.csv: scripts_to_benchmark.py
target/dynamic_analysis.jsonl: dynamic_analysis.py
python3 $< | sort > $@

target/lines_of_code.csv: count_lines_of_code.py
Expand All @@ -22,8 +22,6 @@ target/shellmetrics.sh:
chmod +x $@

target/cyclomatic.csv: get_cyclomatic.py target/shellmetrics.sh
python3 get_cyclomatic.py > $@
python3 get_cyclomatic.py | sort > $@

dynamic:

.PHONY: static dynamic clean-static static-test
.PHONY: static clean-static static-test
4 changes: 4 additions & 0 deletions infrastructure/all_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ def get_all_scripts(
]
for benchmark_name, benchmark_data in benchmark_data.items()
}

if __name__ == "__main__":
    # Print every benchmark name, one per line (iterating a dict yields its keys).
    for benchmark_name in get_all_scripts():
        print(benchmark_name)
199 changes: 199 additions & 0 deletions infrastructure/colossal_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
#!/usr/bin/env python3

import pandas as pd
import fnmatch
import viz_syntax as stx
import viz_dynamic as dyn

from all_scripts import get_all_scripts
from project_root import get_project_root

# All analysis artifacts are located relative to the repository root.
root = get_project_root()
# Per-script dynamic measurements, one JSON object per line
# (presumably consumed via viz_dynamic.read_data -- confirm).
data_path = root / 'infrastructure/target/dynamic_analysis.jsonl'
# Recorded input-corpus sizes per benchmark -- not referenced directly in
# this file; presumably used by an imported viz module. TODO confirm.
input_size_path = root / 'infrastructure/data/size_inputs.jsonl'
# CSV of (script path, lines of code) rows; read by read_loc_data() below.
loc_data_path = root / 'infrastructure/target/lines_of_code.csv'

# Maps each benchmark to a (domain, style, LaTeX citation) triple used for
# the "Surface" columns of the generated table (see short_category below).
# Entries still set to 'XXX' are unclassified placeholders -- TODO fill in.
benchmark_category_style = {
    'bio': ('XXX', 'XXX', 'XXX'),
    'vps-audit': ('XXX', 'XXX', 'XXX'),
    'vps-audit-negate': ('XXX', 'XXX', 'XXX'),
    'aurpkg': ('XXX', 'XXX', 'XXX'),
    'makeself': ('XXX', 'XXX', 'XXX'),
    'infrastructure/standards/100-files': ('XXX', 'XXX', 'XXX'),
    'infrastructure/standards/read-write': ('XXX', 'XXX', 'XXX'),
    'infrastructure/standards/shell-memory': ('XXX', 'XXX', 'XXX'),
    'infrastructure/standards/sleep': ('XXX', 'XXX', 'XXX'),
    'infrastructure/standards/time-in-shell-subprocess': ('XXX', 'XXX', 'XXX'),
    'infrastructure/standards/user-time': ('XXX', 'XXX', 'XXX'),
    'infrastructure/standards/user-time-in-shell': ('XXX', 'XXX', 'XXX'),
    'infrastructure/standards/write-only': ('XXX', 'XXX', 'XXX'),
    'covid-mts': ('Data analysis', 'Data extraction', '\\cite{covid-mts-source}'),
    'file-enc': ('Cryptography', 'Automation', '\\cite{cito2020empirical}'),
    'log-analysis': ('System admin.', 'Data extraction', '\\cite{spinellis2017extending, raghavan2020posh}'),
    'max-temp': ('Data analysis', 'Data extraction', '\\cite{hadoop-guide-2009}'),
    'media-conv': ('Misc.', 'Automation', '\\cite{spinellis2017extending, raghavan2020posh}'),
    'nlp': ('Machine learning', 'Text processing', '\\cite{unix-for-poets-church}'),
    'oneliners': ('Misc.', 'Text processing', '\\cite{bentley-pearl-cacm-1985, bentley-pearl-cacm-1986, unix-cacm-1974, wicked-cool-shell-scripts}'),
    'riker': ('Development', 'Build scripts', ''),
    'sklearn': ('Machine learning', 'Automation', ''),
    'uniq-ips': ('System admin.', 'Automation', ''),
    'unix50': ('Misc.', 'Text processing', '\\cite{bhandari2020solutions}'),
    'web-index': ('Development', 'Text processing', '\\cite{pash2021}')
}

def short_category(benchmark):
    """Abbreviate a benchmark's domain/style pair to initials, e.g.
    ('Data analysis', 'Data extraction') -> 'DA/DE'."""
    domain, style, _citation = benchmark_category_style[benchmark]

    def initials(text):
        # First letter of each space-separated word, upper-cased.
        return ''.join(word[0].upper() for word in text.split(' '))

    return initials(domain) + '/' + initials(style)

# Human-readable description of each benchmark's input data, shown in the
# "Inputs" column of the table.
#
# BUG FIX: the original literal listed 'bio' and 'aurpkg' twice; Python
# silently keeps only the LAST occurrence of a duplicated dict key, so the
# 'XXX' placeholders clobbered the real descriptions. Deduplicated here,
# keeping the descriptive values.
benchmark_input_description = {
    'aurpkg': 'package files',
    'bio': 'biological data files',
    'covid-mts': 'transit data',
    'file-enc': 'pcap files',
    'log-analysis': 'log files',
    'max-temp': 'temperature data',
    'media-conv': 'media files',
    'nlp': 'text files',
    'oneliners': 'text files',
    'riker': 'source code files',
    'sklearn': 'CSV files',
    'uniq-ips': 'text files',
    'unix50': 'text files',
    'web-index': 'HTML files',
    'vps-audit': 'system status',
    'vps-audit-negate': 'system status',
    'makeself': 'XXX',  # TODO: describe makeself inputs
    # TODO: describe the micro-benchmark inputs below.
    'infrastructure/standards/100-files': 'XXX',
    'infrastructure/standards/read-write': 'XXX',
    'infrastructure/standards/shell-memory': 'XXX',
    'infrastructure/standards/sleep': 'XXX',
    'infrastructure/standards/time-in-shell-subprocess': 'XXX',
    'infrastructure/standards/user-time': 'XXX',
    'infrastructure/standards/user-time-in-shell': 'XXX',
    'infrastructure/standards/write-only': 'XXX',
}

# Glob patterns (fnmatch syntax) selecting which individual scripts get their
# own detail row under each benchmark in the table; benchmarks with a single
# script are shown only as the aggregate row (see comments below).
scripts_to_include = [
    'covid-mts/scripts/1.sh',
    'file-enc/scripts/encrypt_files.sh',
    'log-analysis/scripts/nginx.sh',
    'media-conv/scripts/img_convert.sh',
    'nlp/scripts/bigrams.sh',
    'oneliners/*',
    'unix50/scripts/1.sh',
    'riker/scripts/redis/build.sh',
    # max-temp is just 1
    # sklearn is just 1
    # aurpkg is just 1
    # bio is just 1
    # makeself??
    'web-index/scripts/ngrams.sh',
    # vps-audit is just 1
]


def count_unique_cmds(series):
    """Count the distinct nodes in `series` that represent commands
    (identified by the substring 'command(')."""
    unique_commands = set()
    for node in series:
        if 'command(' in node:
            unique_commands.add(node)
    return len(unique_commands)

def count_constructs(series):
    """Count the distinct elements (syntactic constructs) in `series`."""
    seen = set()
    for construct in series:
        seen.add(construct)
    return len(seen)

def read_loc_data():
    """Load per-script LOC counts from `loc_data_path` and aggregate per benchmark.

    Returns (per_script, per_benchmark) DataFrames; per_benchmark has columns
    benchmark, loc (summed), number_of_scripts.
    """
    per_script = pd.read_csv(loc_data_path, header=None)
    per_script.columns = ['script', 'loc']
    # The benchmark name is the first path component of the script path.
    per_script['benchmark'] = per_script['script'].apply(lambda p: p.split('/')[0])
    per_benchmark = (
        per_script.groupby('benchmark')
        .agg({'loc': 'sum', 'script': 'count'})
        .reset_index()
        .rename(columns={'script': 'number_of_scripts'})
    )
    return per_script, per_benchmark

def prettify_bytes_number(n):
    """Format a byte count as a LaTeX snippet 'value \\textcolor{color}{unit}'.

    Units scale by powers of 1024 (B/KB/MB/GB); precision shrinks as the
    value grows (2, 1, then 0 decimals); only GB is rendered in black.
    """
    # Pick the largest unit whose factor fits into n.
    for factor, unit in ((1024 ** 3, 'GB'), (1024 ** 2, 'MB'), (1024, 'KB')):
        if n >= factor:
            value = n / factor
            break
    else:
        value, unit = n, 'B'

    if value < 10:
        decimals = 2
    elif value < 100:
        decimals = 1
    else:
        decimals = 0

    color = 'black' if unit == 'GB' else 'gray'
    return f"{value:.{decimals}f} \\textcolor{{{color}}}{{{unit}}}"

def main():
    """Emit a LaTeX table summarizing every benchmark (and selected scripts):
    surface stats (scripts, LOC), syntax (constructs, unique commands), input
    description/size, and dynamic measurements (times, memory, I/O)."""
    # Syntax data restricted by the True flag -- presumably shell constructs
    # only; confirm against viz_syntax.read_data.
    syntax_script, syntax_bench = stx.read_data(True)

    # Unrestricted node data (False) is used only for unique-command counts.
    syntax_script_all_cmds, syntax_bench_all_cmds = stx.read_data(False)
    dyn_script, dyn_bench = dyn.read_data()
    loc_data_script, loc_data_bench = read_loc_data()

    # Derive per-row counts from the raw node lists.
    syntax_script_all_cmds['unique_cmds'] = syntax_script_all_cmds['nodes'].apply(count_unique_cmds)
    syntax_bench_all_cmds['unique_cmds'] = syntax_bench_all_cmds['nodes'].apply(count_unique_cmds)
    syntax_script['constructs'] = syntax_script['nodes'].apply(count_constructs)
    syntax_bench['constructs'] = syntax_bench['nodes'].apply(count_constructs)

    # Debugging aid: uncomment to list scripts missing from any data source.
    # all_scripts = set(syntax_script['script'].unique())

    # missing_in_dyn = all_scripts - set(dyn_script['script'].unique())
    # missing_in_loc_data = all_scripts - set(loc_data_script['script'].unique())
    # missing_in_cmds = all_scripts - set(syntax_script_all_cmds['script'].unique())

    # print("Missing in dyn_script:", missing_in_dyn)
    # print("Missing in loc_data_script:", missing_in_loc_data)
    # print("Missing in syntax_script_all_cmds:", missing_in_cmds)

    # KeyError here means a benchmark is missing from benchmark_input_description.
    dyn_bench['input_description'] = dyn_bench['benchmark'].apply(lambda x: benchmark_input_description[x])

    # Inner joins: a benchmark/script absent from any source drops out silently.
    big_bench = syntax_bench.merge(dyn_bench, on='benchmark')\
        .merge(loc_data_bench, on='benchmark')\
        .merge(syntax_bench_all_cmds[['benchmark', 'unique_cmds']], on='benchmark')

    big_script = syntax_script.merge(dyn_script, on='script')\
        .merge(loc_data_script, on='script')\
        .merge(syntax_script_all_cmds[['script', 'unique_cmds']], on='script')

    # Table preamble (column layout and two-row header).
    print("""
\\def\\idw{5em}
\\begin{tabular}{l|lrr|rr|l|rrrr|lr}
\\toprule
\\multirow{2}{*}{Benchmark/Script} & \\multicolumn{3}{c|}{Surface} & \\multicolumn{2}{c|}{Syntax} & \\multicolumn{1}{c|}{Inputs} & \\multicolumn{4}{c|}{Dynamic} & \\multicolumn{2}{c}{System} \\\\
& Dom & \\#.sh & LOC & \\# Cons & \\# Cmd & & T.sh & T.cmd & Mem & I/O & \\# s/c & \\# fd \\\\
\\midrule
""")
    # generate a big latex table with the following columns:
    # benchmark, short_category, number of scripts, LOC, number of constructs, number of unique commands, input description, time in shell, time in commands, max memory, IO
    for _, row in big_bench.iterrows():
        numscripts_shown = 0
        numscripts = row['number_of_scripts']
        print("\\rule{0pt}{5ex}")
        print(f"\\textbf{{\\tt {row['benchmark']}}} & {short_category(row['benchmark'])} & {row['number_of_scripts']} & {row['loc']} & {row['constructs']} & {row['unique_cmds']} & \\multirow{{2}}{{*}}{{\\parbox{{\\idw}}{{{prettify_bytes_number(row['input_size']) + ' of ' + row['input_description']}}}}} & {row['time_in_shell']:.2f} & {row['time_in_commands']:.2f} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} \\\\")
        # now print the details of all scripts in the benchmark
        for _, row_script in big_script.iterrows():
            if row_script['benchmark'] == row['benchmark'] and any([fnmatch.fnmatch(row_script['script'], pattern) for pattern in scripts_to_include]):
                # all columns except leave blank benchmark, category, number of scripts, input description
                print(f"\\hspace{{0.5em}} {row_script['script'].split('/')[-1]} & & & {row_script['loc']} & {row_script['constructs']} & {row_script['unique_cmds']} & & {row_script['time_in_shell']:.2f} & {row_script['time_in_commands']:.2f} & {prettify_bytes_number(row_script['max_unique_set_size'])} & {prettify_bytes_number(row_script['io_chars'])} \\\\")
                numscripts_shown += 1
        # Emit one ellipsis row when some of a multi-script benchmark's
        # scripts were elided. NOTE(review): indentation reconstructed --
        # placed at outer-loop level (once per benchmark); confirm upstream.
        if numscripts_shown < numscripts and numscripts > 1:
            print(f"\\hspace{{0.5em}} \\ldots & & & & & & & & & & \\\\")
    print("""
\\bottomrule
\\end{tabular}
""")


# Entry point: print the LaTeX table to stdout.
if __name__ == '__main__':
    main()
37 changes: 32 additions & 5 deletions infrastructure/data/script-globs.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
{
"aurpkg": {
"scripts": ["aurpkg/scripts/*.sh"]
},
"bio": {
"scripts": ["bio/scripts/*.sh"]
},
"covid-mts": {
"scripts": ["covid-mts/scripts/*.sh"]
},
Expand All @@ -24,10 +30,7 @@
"scripts": ["sklearn/scripts/run.sh"]
},
"riker": {
"scripts": ["riker/scripts/*/build.sh"]
},
"uniq-ips": {
"scripts": ["uniq-ips/scripts/run.sh"]
"scripts": ["riker/scripts/*/run.sh"]
},
"unix50": {
"scripts": ["unix50/scripts/*.sh"]
Expand All @@ -36,12 +39,36 @@
"scripts": ["web-index/scripts/*.sh"]
},
"makeself": {
"scripts": ["makeself/makeself/*.sh"]
"scripts": ["makeself/makeself/test/*/*.sh"]
},
"vps-audit": {
"scripts": ["vps-audit/scripts/*.sh"]
},
"vps-audit-negate": {
"scripts": ["vps-audit-negate/scripts/*.sh"]
},
"infrastructure/standards/100-files": {
"scripts": ["infrastructure/standards/100-files/scripts/*.sh"]
},
"infrastructure/standards/read-write": {
"scripts": ["infrastructure/standards/read-write/scripts/*.sh"]
},
"infrastructure/standards/shell-memory": {
"scripts": ["infrastructure/standards/shell-memory/scripts/*.sh"]
},
"infrastructure/standards/sleep": {
"scripts": ["infrastructure/standards/sleep/scripts/*.sh"]
},
"infrastructure/standards/time-in-shell-subprocess": {
"scripts": ["infrastructure/standards/time-in-shell-subprocess/scripts/*.sh"]
},
"infrastructure/standards/user-time": {
"scripts": ["infrastructure/standards/user-time/scripts/*.sh"]
},
"infrastructure/standards/user-time-in-shell": {
"scripts": ["infrastructure/standards/user-time-in-shell/scripts/*.sh"]
},
"infrastructure/standards/write-only": {
"scripts": ["infrastructure/standards/write-only/scripts/*.sh"]
}
}
Loading