Synthesizer #4

Open
wants to merge 6 commits into base: main
2 changes: 2 additions & 0 deletions py-2/benchmarks/nginx/.gitignore
@@ -0,0 +1,2 @@
inputs/
outputs/
1 change: 1 addition & 0 deletions py-2/benchmarks/nginx/checksum.md5
@@ -0,0 +1 @@
3fe6814c6d6f2edd73a83c35f45aa024 results/nginx.sh.out
7 changes: 7 additions & 0 deletions py-2/benchmarks/nginx/cleanup.sh
@@ -0,0 +1,7 @@
#!/bin/bash

REPO_TOP=$(git rev-parse --show-toplevel)
results_dir="${REPO_TOP}/covid-mts/results"

echo "Cleaning up outputs..."
rm -rf $results_dir
1 change: 1 addition & 0 deletions py-2/benchmarks/nginx/execution.sh
@@ -0,0 +1 @@
## Run this file to execute the whole benchmark, including the aggregator runs.
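# A rough sketch of what this wrapper is presumably meant to run, based on the
# other scripts in this directory (the exact flags below are assumptions):
#   ./input.sh --kaggle    # fetch the access log
#   ./run.sh --small       # time the plain bash and aggregator variants
#   ./verify.sh            # check outputs against checksum.md5
#   ./cleanup.sh           # remove generated results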
30 changes: 30 additions & 0 deletions py-2/benchmarks/nginx/input.sh
@@ -0,0 +1,30 @@
#!/bin/bash
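# Usage:
#   ./input.sh --kaggle   # requires a Kaggle API token at ~/.kaggle/kaggle.json
#   ./input.sh            # fallback download; not implemented yet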

REPO_TOP=$(git rev-parse --show-toplevel)
DIR=$REPO_TOP/analysis-logs/input
mkdir -p $DIR
cd $DIR

if [[ $1 == "--kaggle" ]]; then
# Set up Kaggle API
if [[ ! -f ~/.kaggle/kaggle.json ]]; then
mkdir -p ~/.kaggle
echo "Place your kaggle.json in the ~/.kaggle directory, then re-run."
exit 1
fi
chmod 600 ~/.kaggle/kaggle.json

if [[ ! -f access.log ]]; then # skip the download if the log is already present
kaggle datasets download -d eliasdabbas/web-server-access-logs
unzip web-server-access-logs
rm -f web-server-access-logs.zip client_hostname.csv
fi
else
if [[ ! -f nginx.zip ]]; then
# TODO: replace with omega URL
# wget -O nginx.zip "https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/3QBYB5/NXKB6J"
# unzip web-server-access-logs
# rm -f web-server-access-logs.zip
echo "Not implemented yet."
exit 1
fi
fi
2 changes: 2 additions & 0 deletions py-2/benchmarks/nginx/log.txt
@@ -0,0 +1,2 @@
Running aggregators for script: ./scripts/1.sh and input file: 1
Running aggregators for script: ./scripts/1.sh and input file: 1
77 changes: 77 additions & 0 deletions py-2/benchmarks/nginx/run.sh
@@ -0,0 +1,77 @@
#!/bin/bash
export SUITE_DIR=$(realpath $(dirname "$0"))
export TIMEFORMAT=%R
cd $SUITE_DIR

if [[ "$1" == "--small" ]]; then
echo "Using small input"
export INPUT="$SUITE_DIR/inputs/access.log"
else
# no larger default input is wired up yet; fall back to the same access log (assumption)
echo "Using default input"
export INPUT="$SUITE_DIR/inputs/access.log"
fi
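# NOTE: the variables below are used by the functions that follow but were never
# defined in this draft; the values here are assumptions chosen so the script can
# run end to end.
input_file="$INPUT"
all_res_file="./outputs/all.res"
mode_res_file="./outputs/mode.res"
mkdir -p ./outputs/bash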

# ////////
# original script
# REPO_TOP=$(git rev-parse --show-toplevel)

# eval_dir="${REPO_TOP}/analysis-logs"
# results_dir="${eval_dir}/results"
# inputs_dir="${eval_dir}/input"

# shell="/bin/bash"

# mkdir -p $results_dir

# export INPUT=${inputs_dir}/access.log
# script="${eval_dir}/nginx.sh"

# echo "Executing $(basename "$script")"
# $shell "$script" > "$results_dir/$(basename "$script").out"

# ////////

covid-mts_bash() {
for number in $(seq 7); do
script="${number}"
script_file="./scripts/$script.sh"
output_dir="./outputs/bash/$script/"
output_file="./outputs/bash/$script.out"
time_file="./outputs/bash/$script.time"
log_file="./outputs/bash/$script.log"

{ time bash $script_file $input_file >$output_file; } 2>$time_file # run the script and capture its timing

cat "${time_file}" >>$all_res_file
echo "$script_file $(cat "$time_file")" | tee -a $mode_res_file
done
}

ID=1 # track agg run

covid-mts_agg() {
AGG_FILE="../agg_run.sh"
chmod +x $AGG_FILE
mkdir -p "outputs/agg"
echo "executing nginx agg $(date)" | tee -a $mode_res_file $all_res_file

for number in $(seq 7); do
script="${number}"
script_file="./scripts/$script.sh"
output_dir="./outputs/agg/$script/"
output_file="./outputs/agg/$script.out"
time_file="./outputs/agg/$script.time"
log_file="./outputs/agg/$script.log"
{ time $AGG_FILE $script_file $input_file $ID covid-mts >$output_file; } 2>$time_file # run the script through the aggregator and capture timing

cat "${time_file}" >>$all_res_file
echo "$script_file $(cat "$time_file")" | tee -a $mode_res_file
((ID++))
done
}

covid-mts_bash
covid-mts_agg
23 changes: 23 additions & 0 deletions py-2/benchmarks/nginx/scripts/1.sh
@@ -0,0 +1,23 @@
#!/bin/bash
# Count of requests per HTTP response code
# (the commented-out pipeline below is the covid-mts template this script was adapted from)

# <in.csv sed 's/T..:..:..//' |
# awk -F, '!seen[$1 $3] {onroad[$1]++; seen[$1 $3] = 1}
# END { OFS = "\t"; for (d in onroad) print d, onroad[d]}' |
# sort > out1

# curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 |
# bzip2 -d | # decompress
# Replace the line below with the two lines above to stream the latest file
# cat "$1" | # assumes saved input
# sed 's/T..:..:..//' | # hide times
# cut -d ',' -f 1,3 | # keep only day and bus no
# sort -u | # remove duplicate records due to time
# cut -d ',' -f 1 | # keep all dates
# sort | # preparing for uniq
# uniq -c | # count unique dates
# awk "{print \$2,\$1}" # print first date, then count

# # diff out{1,}

cut -d "\"" -f3 ${INPUT} | cut -d ' ' -f2 | sort | uniq -c | sort -rn
24 changes: 24 additions & 0 deletions py-2/benchmarks/nginx/scripts/2.sh
@@ -0,0 +1,24 @@
#!/bin/bash
# Count of requests per HTTP response code (awk variant)

# <in.csv sed 's/T..:..:..//' |
# awk -F, '!seen[$1 $3] {onroad[$3]++; seen[$1 $3] = 1}
# END { OFS = "\t"; for (d in onroad) print d, onroad[d]}' |
# sort -k2n >out1

# curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 |
# bzip2 -d | # decompress
# Replace the line below with the two lines above to stream the latest file
# cat "$1" | # assumes saved input
# sed 's/T..:..:..//' | # hide times
# cut -d ',' -f 3,1 | # keep only day and bus ID
# sort -u | # removing duplicate day-buses
# cut -d ',' -f 2 | # keep only bus ID
# sort | # preparing for uniq
# uniq -c | # count unique dates
# sort -k 1 -n | # sort in reverse numerical order
# awk "{print \$2,\$1}" # print first date, then count

# diff out{1,}

awk '{print $9}' ${INPUT} | sort | uniq -c | sort -rn
3 changes: 3 additions & 0 deletions py-2/benchmarks/nginx/scripts/3.sh
@@ -0,0 +1,3 @@
#!/bin/bash
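# Most frequently requested URLs that returned 404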

awk '($9 ~ /404/)' ${INPUT} | awk '{print $7}' | sort | uniq -c | sort -rn
23 changes: 23 additions & 0 deletions py-2/benchmarks/nginx/scripts/4.sh
@@ -0,0 +1,23 @@
#!/bin/bash
# Most frequently requested URLs that returned 502 (bad gateway)

# <in.csv sed 's/T\(..\):..:../,\1/' |
# awk -F, '!seen[$1 $2] {hours[$1]++; seen[$1 $2] = 1}
# END { OFS = "\t"; for (d in hours) print d, hours[d]}' |
# sort

# curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 |
# bzip2 -d | # decompress
# Replace the line below with the two lines above to stream the latest file
# cat "$1" | # assumes saved input
# sed 's/T\(..\):..:../,\1/' | # keep times only
# cut -d ',' -f 1,2 | # keep only time and date
# sort -u | # removing duplicate entries
# cut -d ',' -f 1 | # keep only date
# sort | # preparing for uniq
# uniq -c | # count unique dates
# awk "{print \$2,\$1}" # print first date, then count

# # diff out{1,}

awk '($9 ~ /502/)' ${INPUT} | awk '{print $7}' | sort | uniq -c | sort -r
5 changes: 5 additions & 0 deletions py-2/benchmarks/nginx/scripts/5.sh
@@ -0,0 +1,5 @@
#!/bin/bash
# Client IPs that requested the WordPress install script (/wp-admin/install.php)

awk -F\" '($2 ~ "/wp-admin/install.php"){print $1}' ${INPUT} | awk '{print $1}' | sort | uniq -c | sort -r
1 change: 1 addition & 0 deletions py-2/benchmarks/nginx/scripts/6.sh
@@ -0,0 +1 @@
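#!/bin/bash
# Top 20 most requested .php URLs that returned 404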
cat $1 | awk '($9 ~ /404/)' ${INPUT} | awk -F\" '($2 ~ "^GET .*\.php")' | awk '{print $7}' | sort | uniq -c | sort -r | head -n 20
1 change: 1 addition & 0 deletions py-2/benchmarks/nginx/scripts/7.sh
@@ -0,0 +1 @@
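#!/bin/bash
# Most frequently requested URL paths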
cat $1 | awk -F\" '{print $2}' ${INPUT} | awk '{print $2}' | sort | uniq -c | sort -r
1 change: 1 addition & 0 deletions py-2/benchmarks/nginx/scripts/8.sh
@@ -0,0 +1 @@
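#!/bin/bash
# URL paths of requests whose request line contains "ref"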
cat $1 | awk -F\" '($2 ~ "ref"){print $2}' ${INPUT} | awk '{print $2}' | sort | uniq -c | sort -r
15 changes: 15 additions & 0 deletions py-2/benchmarks/nginx/verify.sh
@@ -0,0 +1,15 @@
#!/bin/bash

REPO_TOP=$(git rev-parse --show-toplevel)

eval_dir="${REPO_TOP}/analysis-logs/"
results_dir="${eval_dir}/results"
input_dir="${eval_dir}/input"

if [ "$(md5sum $results_dir/* | awk '{print $1}')" == "$(cat $input_dir/checksum.md5 | awk '{print $1}')" ];
then
echo "Valid"
else
echo "Invalid"
exit 1
fi