Skip to content

Commit

Permalink
chore(brainreports): formatting tweaks (#1118)
Browse files Browse the repository at this point in the history
* chore(brainreports): formatting tweaks

* fix(brain-reports): cache, collect from 2020-01-13

* chore(gen3 api): indexd-download-all helper
  • Loading branch information
frickjack authored Mar 10, 2020
1 parent b37e12b commit 1a3e21d
Show file tree
Hide file tree
Showing 7 changed files with 227 additions and 30 deletions.
8 changes: 8 additions & 0 deletions doc/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@ Helpers for interacting with the gen3 api

## Use

### indexd-download-all

Helper that downloads all of the records from indexd into a destination folder.

```
ex:$ gen3 api indexd-download-all domain.commons.io ./destFolder
```

### indexd-post-folder

Helper uploads a folder of json indexd records.
Expand Down
13 changes: 13 additions & 0 deletions files/scripts/braincommons/beatpd-files.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
s3://bhcprodv2-data-bucket/dg.7519/1090d55e-a04c-49f1-82af-98b1080031eb/real-pd.testing_data_updated.tar.bz2
s3://bhcprodv2-data-bucket/dg.7519/11283b92-583d-4bd8-b25e-de43c897941f/cis-pd.ancillary_data.tar.bz2
s3://bhcprodv2-data-bucket/dg.7519/2798041a-5165-472e-994b-c0ed419f9c49/cis-pd.testing_data.tar.bz2
s3://bhcprodv2-data-bucket/dg.7519/4ebe9c9e-20e0-4374-9837-e43f39f1858a/cis-pd.training_data.tar.bz2
s3://bhcprodv2-data-bucket/dg.7519/54aee383-5bd5-48b1-a4e7-00c404684f3f/BEAT-PD_Challenge_Data_Dictionary.csv
s3://bhcprodv2-data-bucket/dg.7519/69df65dd-6550-4fe0-b037-333280c0c2db/cis-pd.data_labels.tar.bz2
s3://bhcprodv2-data-bucket/dg.7519/73ce9ceb-a774-4dd2-b011-fcbd5943f6dd/cis-pd.clinical_data.tar.bz2
s3://bhcprodv2-data-bucket/dg.7519/88ab7900-b0c1-4ee1-a710-fe909cf9b0fd/real-pd.data_labels.tar.bz2
s3://bhcprodv2-data-bucket/dg.7519/8c1a8185-7098-4f21-be9e-7b53c1d69737/real-pd.REAL-PD_Test_Data_IDs.csv
s3://bhcprodv2-data-bucket/dg.7519/8f67f4b9-21ab-40e0-af4c-914ebdc2df17/real-pd.clinical_data.tar.bz2
s3://bhcprodv2-data-bucket/dg.7519/ce1b8a45-2504-4433-99be-2f373894d71b/real-pd.ancillary_data_updated.tar.bz2
s3://bhcprodv2-data-bucket/dg.7519/d244cea7-5e45-4185-ae2b-a7f6440b1d59/real-pd.training_data_updated.tar.bz2
s3://bhcprodv2-data-bucket/dg.7519/e5ef9c59-b44b-456d-b82d-e4e049c3eb27/cis-pd.CIS-PD_Test_Data_IDs.csv
101 changes: 101 additions & 0 deletions files/scripts/braincommons/brain-custom-reports.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#
# Generate S3 access and Dream Challenge user login reports for
# the brain commons, and publish to dashboard service
#
# Run as cron:
# GEN3_HOME=/home/bhcprodv2/cloud-automation
# PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# 2 2 * * 1 (if [ -f $GEN3_HOME/files/scripts/braincommons/brain-custom-reports.sh ]; then bash $GEN3_HOME/files/scripts/braincommons/brain-custom-reports.sh; else echo "no brain-custom-reports.sh"; fi) > $HOME/brain-custom-reports.log 2>&1


source "${GEN3_HOME}/gen3/gen3setup.sh"


# lib -------------------------

BEATPD="${GEN3_HOME}/files/scripts/braincommons/beatpd-files.txt"
beatpdFilter() {
  #
  # Filter stdin (tab-separated report rows) down to rows whose path
  # (column 2) appears in the beat-pd file list "$BEATPD".
  # Rows that do not match are logged and dropped.
  #
  local line
  local path
  while read -r line; do
    # column 2 of a report row is the object path (did/filename)
    path="$(awk '{ print $2 }' <<<"$line")"
    # -F: match the path as a literal string, not a regex (paths contain '.');
    # skip empty paths - an empty grep pattern would match every line
    if [[ -n "$path" ]] && grep -F -- "$path" "$BEATPD" > /dev/null 2>&1; then
      # printf does not interpret backslash escapes in the data (echo -e did)
      printf '%s\n' "$line"
    else
      gen3_log_info "SKIPPING $line - not in beatpd"
    fi
  done
}

# main ------------------------
#
# Builds S3-access and Dream-access reports for the beat-pd challenge
# window starting 2020-01-13, then publishes them to the dashboard.

# pin start date to January 13
startDate="2020-01-13"
numDays=0

# require an explicit "go" argument so the script is not run by accident
if [[ $# -lt 1 || "$1" != "go" ]]; then
gen3_log_err "Use: brain-custom-reports.sh go"
exit 1
fi
shift

#startDate="$1"
# number of whole days from startDate to midnight today (UTC)
startSecs="$(date -u -d"$startDate" +%s)"
endSecs="$(date -u -d"00:00" +%s)"
numDays="$(( (endSecs - startSecs)/(24*60*60) ))"
gen3_log_info "$numDays days since $startDate"

# hard stop - refuse to run once the processing window has passed
dropDeadSecs="$(date -u -d2020-05-01 +%s)"
if [[ "$endSecs" -gt "$dropDeadSecs" ]]; then
gen3_log_err "This script will not process logs after 2020-05-01"
exit 1
fi

# to simplify testing - optionally take an already existing workfolder
if [[ $# -gt 0 && -f "$1/raw.txt" ]]; then
workFolder="$1"
shift
folderName="$(basename "$workFolder")"
else
# folder name encodes the report window start plus a generation timestamp
folderName="$(date -d"$numDays days ago" -u +%Y%m%d)-$(date -u +%Y%m%d_%H%M%S)"
workFolder="$(mktemp -d -p "$XDG_RUNTIME_DIR" brainCustomReport_XXXXXX)/$folderName"
fi
mkdir -p "$workFolder"
cd "$workFolder"
gen3_log_info "working in $workFolder"

# cache raw data from last run, and add to it incrementally
# NOTE(review): the cache file must be generated by hand once - see the error below
cacheDate="2020-03-05"
cacheFile="${XDG_DATA_HOME}/gen3/cache/brain-custom-report_2020-01-13_to_2020-03-05_raw.txt"
if [[ ! -f "$cacheFile" ]]; then
gen3_log_err "Please generate cache $cacheFile : gen3 logs s3 start=2020-01-13 end=2020-03-05 filter=raw prefix=s3://bhcprodv2-data-bucket-logs/log/bhcprodv2-data-bucket/ > brain-custom-report_2020-01-13_to_2020-03-05_raw.txt"
exit 1
fi

# raw.txt = cached logs (startDate..cacheDate) + freshly fetched logs (cacheDate..today)
if [[ -f raw.txt ]]; then
gen3_log_info "using existing raw.txt - probably testing something"
else
gen3 logs s3 start="$cacheDate 00:00" end="00:00" filter=raw prefix=s3://bhcprodv2-data-bucket-logs/log/bhcprodv2-data-bucket/ > "raw-${cacheDate}.txt"
cat "$cacheFile" "raw-${cacheDate}.txt" > "raw.txt"
fi
# derive the two summary reports from the combined raw log
gen3 logs s3filter filter=accessCount < raw.txt > accessCountRaw.tsv
gen3 logs s3filter filter=whoWhatWhen < raw.txt > whoWhatWhenRaw.tsv

# the cronjob script prints the path of its report as its last line of output
if dreamReport="$(bash "${GEN3_HOME}/files/scripts/braincommons/dream-access-report-cronjob.sh" "$numDays" | tail -1)" && [[ -f "$dreamReport" ]]; then
gen3_log_info "cp $dreamReport to $workFolder/dream_access_report.tsv"
cp "$dreamReport" dream_access_report.tsv
else
gen3_log_err "Failed to generate Dream access report"
fi

# Some customization for the brain-commons beat-pd dream challenge case
# keep only beat-pd dg.7519 rows, and split the path into did + filename columns
echo -e "Access_count\tdid\tfilename" > accessCountBrain.tsv
grep dg.7519/ accessCountRaw.tsv | beatpdFilter | sed -E 's@(dg.7519/.+)/(.+)@\1\t\2@' | tee -a accessCountBrain.tsv

echo -e "Date_time\tdid\tfilename\tUser_id" > whoWhatWhenBrain.tsv
grep dg.7519/ whoWhatWhenRaw.tsv | beatpdFilter | sed -E 's@(dg.7519/.+)/(.+)@\1\t\2@' | sed 's/__Synapse_ID_/ (Synapse ID)/g' >> whoWhatWhenBrain.tsv

# publish the reports to the dashboard service, then clean up the scratch folder
if [[ -d "$workFolder" ]]; then
gen3 dashboard publish secure "$workFolder" "dreamAccess/$(date -u +%Y)/$folderName"
cd "$XDG_RUNTIME_DIR"
gen3_log_info "cleaning up $workFolder"
/bin/rm -rf "$workFolder"
fi
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
# KUBECONFIG=path/to/kubeconfig
# 6 6 * * 1 (if [ -f $HOME/cloud-automation/files/scripts/dream-access-report-cronjob.sh ]; then bash $HOME/cloud-automation/files/scripts/dream-access-report-cronjob.sh; else echo "no dream-access-report-cronjob.sh"; fi) > $HOME/dream-access-report-cronjob.log 2>&1


export GEN3_HOME="${GEN3_HOME:-"$HOME/cloud-automation"}"

if ! [[ -d "$GEN3_HOME" ]]; then
Expand All @@ -18,39 +17,43 @@ PATH="${PATH}:/usr/local/bin"

source "${GEN3_HOME}/gen3/gen3setup.sh"

echo "Setting up..."
gen3_log_info "Setting up..."
dataFolder="$(mktemp -d -p "$XDG_RUNTIME_DIR" 'tempDreamReportDataFolder_XXXXXX')"
dateTime="$(date '+%Y-%m-%d_%H:%M')"
dateTime="$(date -u '+%Y%m%d_%H%M')"
destFolder="$HOME/Dream_access_reports"
if [[ ! -e $destFolder ]]; then
mkdir $destFolder
fi
fileName="Dream_access_report_$dateTime.tsv"
dreamTeamID=$(g3kubectl get secrets/fence-config -o json | jq -r '.data["fence-config.yaml"]' | base64 --decode | yq .DREAM_CHALLENGE_TEAM | tr -d '\\"')

logInterval=7
regexNum='^[0-9]+$'
if [ "$1" != "" ]; then
if ! [[ $1 =~ $regexNum ]] ; then
echo "Input argument is not a number, using default value '$logInterval' days"
gen3_log_err "Input argument is not a number, using default value '$logInterval' days"
else
logInterval=$1
echo "Changing logInterval value to '$logInterval' days"
gen3_log_info "Changing logInterval value to '$logInterval' days"
fi
else
echo "logInterval value is '$logInterval' days"
gen3_log_info "logInterval value is '$logInterval' days"
fi
echo "Done!"

echo "Generating user audit log..."
gen3 psql fence -A -t -o "$dataFolder/user.json" -c "SELECT json_agg(t) FROM (SELECT * FROM user_audit_logs WHERE timestamp > CURRENT_DATE - INTERVAL '$logInterval' DAY ORDER BY id ASC) t;"
echo "Done!"
echo "Generating cert audit log..."
gen3 psql fence -A -t -o "$dataFolder/cert.json" -c "SELECT json_agg(t) FROM (SELECT * FROM cert_audit_logs WHERE timestamp > CURRENT_DATE - INTERVAL '$logInterval' DAY ORDER BY id ASC) t;"
echo "Done!"
echo "Generating report TSV..."
python3 $HOME/cloud-automation/files/scripts/dream-access-report.py -t "$dreamTeamID" -u "$dataFolder/user.json" -c "$dataFolder/cert.json" -o "$destFolder/$fileName"
echo "All done!"
gen3_log_info "Done!"
startTime="$(date -u -d"$logInterval days ago" +%Y%m%d)"
fileName="Dream_access_report_${startTime}_to_$dateTime.tsv"

gen3_log_info "Generating user audit log..."
gen3 psql fence -A -t -o "$dataFolder/user.json" -c "SELECT json_agg(t) FROM (SELECT * FROM user_audit_logs WHERE timestamp > CURRENT_DATE - INTERVAL '$logInterval' DAY ORDER BY id ASC) t;" 1>&2
gen3_log_info "Done!"
gen3_log_info "Generating cert audit log..."
gen3 psql fence -A -t -o "$dataFolder/cert.json" -c "SELECT json_agg(t) FROM (SELECT * FROM cert_audit_logs WHERE timestamp > CURRENT_DATE - INTERVAL '$logInterval' DAY ORDER BY id ASC) t;" 1>&2
gen3_log_info "Done!"
gen3_log_info "Generating report TSV..."
python3 "$GEN3_HOME/files/scripts/braincommons/dream-access-report.py" -t "$dreamTeamID" -u "$dataFolder/user.json" -c "$dataFolder/cert.json" -o "$destFolder/$fileName" 1>&2
gen3_log_info "All done!"

cd /tmp
/bin/rm -rf "${dataFolder}"
/bin/rm -rf "${dataFolder}"

# brain_custom_reports expects this to be the last line of output
echo "$destFolder/$fileName"
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
row["Synapse_email"] = ""
row["Synapse_sub_id"] = ""
row["Authorized_BEAT-PD"] = False
row["ToU/PP"] = False
row["ToU/PP"] = "FALSE"
if new_values["additional_info"]:
if "userid" in new_values["additional_info"]:
row["Synapse_id"] = new_values["additional_info"]["userid"]
Expand All @@ -72,7 +72,7 @@
if cert_data["user_id"]:
for output_content_i, output_content_row in enumerate(output_content):
if output_content_row["User_id"] == cert_data["user_id"]:
output_content[output_content_i]["ToU/PP"] = True
output_content[output_content_i]["ToU/PP"] = "TRUE"
except Exception as e:
print(e)

Expand Down
71 changes: 64 additions & 7 deletions gen3/bin/api.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ gen3_curl_json() {
method="POST"
jsonFile=""
if [[ $# -lt 2 || -z "$1" ]]; then
echo -e "$(red_color "ERROR: USE: gen3_curl_json path username jsonFile")" 2>1
gen3_log_err "USE: gen3_curl_json path username jsonFile"
return 1
fi
path="$1"
Expand All @@ -65,20 +65,20 @@ gen3_curl_json() {
jsonFile="$1"
shift
if [[ ! -f "$jsonFile" ]]; then
echo -e "$(red_color "ERROR: unable to read json file $jsonFile")"
gen3_log_err "unable to read json file $jsonFile"
return 1
fi
else
method="GET"
fi
accessToken="$(gen3_access_token "$userName")"
if [[ -z "$accessToken" ]]; then
echo -e "$(red_color "ERROR: unable to acquire token for $userName")"
gen3_log_err "unable to acquire token for $userName"
return 1
fi
hostname="$(g3kubectl get configmap manifest-global -o json | jq -r '.data["hostname"]')"
if [[ -z "$hostname" ]]; then
echo -e "$(red_color "ERROR: unable to determine hostname for commons API")"
gen3_log_err "unable to determine hostname for commons API"
return 1
fi

Expand All @@ -105,7 +105,7 @@ gen3_new_project() {
local result

if [[ $# -lt 3 || -z "$1" || -z "$2" || -z "$3" ]]; then
echo -e "$(red_color "ERROR: USE: gen3 api new-project prog-name proj-name username")" 2>1
gen3_log_err "USE: gen3 api new-project prog-name proj-name username"
return 1
fi
progName="$1"
Expand Down Expand Up @@ -145,7 +145,7 @@ gen3_new_program() {
local result

if [[ $# -lt 2 || -z "$1" || -z "$2" ]]; then
echo -e "$(red_color "ERROR: USE: gen3 api new-program prog-name username")" 2>1
gen3_log_err "USE: gen3 api new-program prog-name username"
return 1
fi
progName="$1"
Expand Down Expand Up @@ -192,7 +192,7 @@ gen3_indexd_post_folder() {
fi

if [[ ! -d "${DATA_DIR}" ]]; then
echo -e "$(red_color "ERROR: ") DATA_DIR, ${DATA_DIR}, does not exist"
gen3_log_err "DATA_DIR, ${DATA_DIR}, does not exist"
gen3_indexd_post_folder_help
return 1
fi
Expand All @@ -212,6 +212,60 @@ gen3_indexd_post_folder() {
done
}

#
# Download all the indexd records from the given domain -
# manage the paging.
# Ex:
# gen3 api indexd-download domain.commons.io data/
#
#
# Download all the indexd records from the given domain -
# manage the paging (batches of 1000).
# Ex:
#   gen3 api indexd-download-all domain.commons.io data/
#
# @param domain the commons hostname to query
# @param destFolder existing folder that receives indexd_<domain>_<n>.json files
#
gen3_indexd_download_all() {
  local domain
  local dataDir

  if [[ $# -lt 2 ]]; then
    gen3_log_err "gen3_indexd_download_all takes 2 arguments: domain and destination folder"
    return 1
  fi
  domain="$1"
  shift
  # strip a trailing slash so path concatenation below stays clean
  dataDir="${1%%/}"
  shift

  if [[ ! -d "${dataDir}" ]]; then
    gen3_log_err "destination folder, ${dataDir}, does not exist"
    return 1
  fi

  local stats
  local totalFiles=0
  local fetchUrl="https://${domain}/index/_stats"
  # _stats gives the total record count, used to bound the paging loop below
  if ! stats="$(curl -s "$fetchUrl")" || ! totalFiles="$(jq -e -r .fileCount <<<"$stats")"; then
    gen3_log_err "Failed to retrieve https://${domain}/index/_stats"
    return 1
  fi
  gen3_log_info "Preparing to fetch $totalFiles from $domain to $dataDir/ in batches of 1000"
  local count=0
  local start=""
  local dataFile
  while true; do
    fetchUrl="https://${domain}/index/index?limit=1000"
    dataFile="${dataDir}/indexd_${domain//./_}_${count}.json"
    if [[ -n "$start" ]]; then
      # the did of the last record fetched is the paging cursor
      fetchUrl="${fetchUrl}&start=$start"
    fi
    gen3_log_info "Fetching $fetchUrl into $dataFile"
    if ! curl -s "$fetchUrl" > "$dataFile"; then
      gen3_log_err "Failed to fetch $fetchUrl"
      return 1
    fi
    # jq prints "null" when the records list is empty - end of data
    start="$(jq -r '.records[-1].did' < "$dataFile")"
    count=$((count + 1))
    if [[ "$start" == null || $((count * 1000)) -gt "$totalFiles" ]]; then
      break
    fi
    sleep 1  # be gentle with the indexd service
  done
}


#---------- main

Expand All @@ -220,6 +274,9 @@ if [[ -z "$GEN3_SOURCE_ONLY" ]]; then
command="$1"
shift
case "$command" in
"indexd-download-all")
gen3_indexd_download_all "$@"
;;
"indexd-post-folder")
gen3_indexd_post_folder "$@"
;;
Expand Down
19 changes: 17 additions & 2 deletions gen3/lib/logs/s3.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@ source "${GEN3_HOME}/gen3/lib/utils.sh"
gen3_load "gen3/gen3setup"


#
# urldecode the given argument
#
gen3_logs_urldecode() {
  #
  # urldecode the given argument: each %XX escape becomes the
  # corresponding byte (e.g. %20 -> space, %40 -> @)
  #
  # translate %XX to \xXX, then let printf %b expand the escapes;
  # quoting avoids the word-splitting and globbing of an unquoted echo
  printf '%b\n' "${1//\%/\\x}"
}


#
# Run the stdin logstream through a filter to generate a report
#
Expand All @@ -27,10 +35,17 @@ gen3_logs_s3filter() {
fi
case "$filterName" in
"whoWhatWhen")
grep 'username' | grep GET | awk -v bucket="$logFolder" '{ gsub(/\[/, "", $3); gsub(/.+username=/, "", $11); gsub(/&.*/, "", $11); gsub(/%40/, "@", $11); print $3 "\t" $9 "\t" $11 "\t" bucket }' | sort
if [[ "$logFolder" != "unknown" ]]; then
echo -e "Date_time\tPath\tUser_id\tBucket_name"
grep 'username' | grep GET | awk -v bucket="$logFolder" '{ gsub(/\[/, "", $3); gsub(/.+username=/, "", $11); gsub(/&.*/, "", $11); gsub(/%40/, "@", $11); print $3 "\t" $9 "\t" $11 "\t" bucket }' | sort | sed -E 's/\%[0-9]+/_/g'
else
echo -e "Date_time\tPath\tUser_id"
grep 'username' | grep GET | awk -v bucket="$logFolder" '{ gsub(/\[/, "", $3); gsub(/.+username=/, "", $11); gsub(/&.*/, "", $11); gsub(/%40/, "@", $11); print $3 "\t" $9 "\t" $11 }' | sort | sed -E 's/\%[0-9]+/_/g'
fi
;;
"accessCount")
grep 'username' | grep GET | awk '{ print $9 }' | sort | uniq -c
echo -e "Access_count\tPath"
grep 'username' | grep GET | awk '{ print $9 }' | sort | uniq -c | awk '{ print $1 "\t" $2 }'
;;
*)
cat -
Expand Down

0 comments on commit 1a3e21d

Please sign in to comment.