diff --git a/doc/api.md b/doc/api.md index 953651827..b85a492b1 100644 --- a/doc/api.md +++ b/doc/api.md @@ -4,6 +4,14 @@ Helpers for interacting with the gen3 api ## Use +### indexd-download-all + +Helper downloads all the records from indexd to a folder + +``` +ex:$ gen3 api indexd-download-all domain.commons.io ./destFolder +``` + ### indexd-post-folder Helper uploads a folder of json indexd records. diff --git a/files/scripts/braincommons/beatpd-files.txt b/files/scripts/braincommons/beatpd-files.txt new file mode 100644 index 000000000..02ac6b5b8 --- /dev/null +++ b/files/scripts/braincommons/beatpd-files.txt @@ -0,0 +1,13 @@ +s3://bhcprodv2-data-bucket/dg.7519/1090d55e-a04c-49f1-82af-98b1080031eb/real-pd.testing_data_updated.tar.bz2 +s3://bhcprodv2-data-bucket/dg.7519/11283b92-583d-4bd8-b25e-de43c897941f/cis-pd.ancillary_data.tar.bz2 +s3://bhcprodv2-data-bucket/dg.7519/2798041a-5165-472e-994b-c0ed419f9c49/cis-pd.testing_data.tar.bz2 +s3://bhcprodv2-data-bucket/dg.7519/4ebe9c9e-20e0-4374-9837-e43f39f1858a/cis-pd.training_data.tar.bz2 +s3://bhcprodv2-data-bucket/dg.7519/54aee383-5bd5-48b1-a4e7-00c404684f3f/BEAT-PD_Challenge_Data_Dictionary.csv +s3://bhcprodv2-data-bucket/dg.7519/69df65dd-6550-4fe0-b037-333280c0c2db/cis-pd.data_labels.tar.bz2 +s3://bhcprodv2-data-bucket/dg.7519/73ce9ceb-a774-4dd2-b011-fcbd5943f6dd/cis-pd.clinical_data.tar.bz2 +s3://bhcprodv2-data-bucket/dg.7519/88ab7900-b0c1-4ee1-a710-fe909cf9b0fd/real-pd.data_labels.tar.bz2 +s3://bhcprodv2-data-bucket/dg.7519/8c1a8185-7098-4f21-be9e-7b53c1d69737/real-pd.REAL-PD_Test_Data_IDs.csv +s3://bhcprodv2-data-bucket/dg.7519/8f67f4b9-21ab-40e0-af4c-914ebdc2df17/real-pd.clinical_data.tar.bz2 +s3://bhcprodv2-data-bucket/dg.7519/ce1b8a45-2504-4433-99be-2f373894d71b/real-pd.ancillary_data_updated.tar.bz2 +s3://bhcprodv2-data-bucket/dg.7519/d244cea7-5e45-4185-ae2b-a7f6440b1d59/real-pd.training_data_updated.tar.bz2 +s3://bhcprodv2-data-bucket/dg.7519/e5ef9c59-b44b-456d-b82d-e4e049c3eb27/cis-pd.CIS-PD_Test_Data_IDs.csv diff --git a/files/scripts/braincommons/brain-custom-reports.sh b/files/scripts/braincommons/brain-custom-reports.sh new file mode 100644 index 000000000..81a0c60be --- /dev/null +++ b/files/scripts/braincommons/brain-custom-reports.sh @@ -0,0 +1,101 @@ +# +# Generate S3 access and Dream-challenger user login reports for +# the brain commons, and publish to dashboard service +# +# Run as cron: +# GEN3_HOME=/home/bhcprodv2/cloud-automation +# PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +# 2 2 * * 1 (if [ -f $GEN3_HOME/files/scripts/braincommons/brain-custom-reports.sh ]; then bash $GEN3_HOME/files/scripts/braincommons/brain-custom-reports.sh; else echo "no brain-custom-reports.sh"; fi) > $HOME/brain-custom-reports.log 2>&1 + + +source "${GEN3_HOME}/gen3/gen3setup.sh" + + +# lib ------------------------- + +BEATPD="${GEN3_HOME}/files/scripts/braincommons/beatpd-files.txt" +beatpdFilter() { + while read -r LINE; do + local path + if path="$(awk '{ print $2 }' <<<"$LINE")" && grep "$path" "$BEATPD" > /dev/null 2>&1; then + echo -e "$LINE" + else + gen3_log_info "SKIPPING $LINE - not in beatpd" + fi + done +} + +# main ------------------------ + +# pin start date to January 13 +startDate="2020-01-13" +numDays=0 + +if [[ $# -lt 1 || "$1" != "go" ]]; then + gen3_log_err "Use: brain-custom-reports.sh go" + exit 1 +fi +shift + +#startDate="$1" +startSecs="$(date -u -d"$startDate" +%s)" +endSecs="$(date -u -d"00:00" +%s)" +numDays="$(( (endSecs - startSecs)/(24*60*60) ))" +gen3_log_info "$numDays days since 
$startDate" + +dropDeadSecs="$(date -u -d2020-05-01 +%s)" +if [[ "$endSecs" -gt "$dropDeadSecs" ]]; then + gen3_log_err "This script will not process logs after 2020-05-01" + exit 1 +fi + +# to simplify testing - optionally take an already existing workfolder +if [[ $# -gt 0 && -f "$1/raw.txt" ]]; then + workFolder="$1" + shift + folderName="$(basename "$workFolder")" +else + folderName="$(date -d"$numDays days ago" -u +%Y%m%d)-$(date -u +%Y%m%d_%H%M%S)" + workFolder="$(mktemp -d -p "$XDG_RUNTIME_DIR" brainCustomReport_XXXXXX)/$folderName" +fi +mkdir -p "$workFolder" +cd "$workFolder" +gen3_log_info "working in $workFolder" + +# cache raw data from last run, and add to it incrementally +cacheDate="2020-03-05" +cacheFile="${XDG_DATA_HOME}/gen3/cache/brain-custom-report_2020-01-13_to_2020-03-05_raw.txt" +if [[ ! -f "$cacheFile" ]]; then + gen3_log_err "Please generate cache $cacheFile : gen3 logs s3 start=2020-01-13 end=2020-03-05 filter=raw prefix=s3://bhcprodv2-data-bucket-logs/log/bhcprodv2-data-bucket/ > brain-custom-report_2020-01-13_to_2020-03-05_raw.txt" + exit 1 +fi + +if [[ -f raw.txt ]]; then + gen3_log_info "using existing raw.txt - probably testing something" +else + gen3 logs s3 start="$cacheDate 00:00" end="00:00" filter=raw prefix=s3://bhcprodv2-data-bucket-logs/log/bhcprodv2-data-bucket/ > "raw-${cacheDate}.txt" + cat "$cacheFile" "raw-${cacheDate}.txt" > "raw.txt" +fi +gen3 logs s3filter filter=accessCount < raw.txt > accessCountRaw.tsv +gen3 logs s3filter filter=whoWhatWhen < raw.txt > whoWhatWhenRaw.tsv + +if dreamReport="$(bash "${GEN3_HOME}/files/scripts/braincommons/dream-access-report-cronjob.sh" "$numDays" | tail -1)" && [[ -f "$dreamReport" ]]; then + gen3_log_info "cp $dreamReport to $workFolder/dream_access_report.tsv" + cp "$dreamReport" dream_access_report.tsv +else + gen3_log_err "Failed to generate Dream access report" +fi + +# Some customization for the brain-commons beat-pd dream challenge case +echo -e "Access_count\tdid\tfilename" > accessCountBrain.tsv +grep dg.7519/ accessCountRaw.tsv | beatpdFilter | sed -E 's@(dg.7519/.+)/(.+)@\1\t\2@' | tee -a accessCountBrain.tsv + +echo -e "Date_time\tdid\tfilename\tUser_id" > whoWhatWhenBrain.tsv +grep dg.7519/ whoWhatWhenRaw.tsv | beatpdFilter | sed -E 's@(dg.7519/.+)/(.+)@\1\t\2@' | sed 's/__Synapse_ID_/ (Synapse ID)/g' >> whoWhatWhenBrain.tsv + +if [[ -d "$workFolder" ]]; then + gen3 dashboard publish secure "$workFolder" "dreamAccess/$(date -u +%Y)/$folderName" + cd "$XDG_RUNTIME_DIR" + gen3_log_info "cleaning up $workFolder" + /bin/rm -rf "$workFolder" +fi diff --git a/files/scripts/dream-access-report-cronjob.sh b/files/scripts/braincommons/dream-access-report-cronjob.sh similarity index 57% rename from files/scripts/dream-access-report-cronjob.sh rename to files/scripts/braincommons/dream-access-report-cronjob.sh index cb99b8926..982d9249a 100755 --- a/files/scripts/dream-access-report-cronjob.sh +++ b/files/scripts/braincommons/dream-access-report-cronjob.sh @@ -6,7 +6,6 @@ # KUBECONFIG=path/to/kubeconfig # 6 6 * * 1 (if [ -f $HOME/cloud-automation/files/scripts/dream-access-report-cronjob.sh ]; then bash $HOME/cloud-automation/files/scripts/dream-access-report-cronjob.sh; else echo "no dream-access-report-cronjob.sh"; fi) > $HOME/dream-access-report-cronjob.log 2>&1 - export GEN3_HOME="${GEN3_HOME:-"$HOME/cloud-automation"}" if ! [[ -d "$GEN3_HOME" ]]; then @@ -18,39 +17,43 @@ PATH="${PATH}:/usr/local/bin" source "${GEN3_HOME}/gen3/gen3setup.sh" -echo "Setting up..." +gen3_log_info "Setting up..." 
dataFolder="$(mktemp -d -p "$XDG_RUNTIME_DIR" 'tempDreamReportDataFolder_XXXXXX')" -dateTime="$(date '+%Y-%m-%d_%H:%M')" +dateTime="$(date -u '+%Y%m%d_%H%M')" destFolder="$HOME/Dream_access_reports" if [[ ! -e $destFolder ]]; then mkdir $destFolder fi -fileName="Dream_access_report_$dateTime.tsv" dreamTeamID=$(g3kubectl get secrets/fence-config -o json | jq -r '.data["fence-config.yaml"]' | base64 --decode | yq .DREAM_CHALLENGE_TEAM | tr -d '\\"') logInterval=7 regexNum='^[0-9]+$' if [ "$1" != "" ]; then if ! [[ $1 =~ $regexNum ]] ; then - echo "Input argument is not a number, using default value '$logInterval' days" + gen3_log_err "Input argument is not a number, using default value '$logInterval' days" else logInterval=$1 - echo "Changing logInterval value to '$logInterval' days" + gen3_log_info "Changing logInterval value to '$logInterval' days" fi else - echo "logInterval value is '$logInterval' days" + gen3_log_info "logInterval value is '$logInterval' days" fi -echo "Done!" - -echo "Generating user audit log..." -gen3 psql fence -A -t -o "$dataFolder/user.json" -c "SELECT json_agg(t) FROM (SELECT * FROM user_audit_logs WHERE timestamp > CURRENT_DATE - INTERVAL '$logInterval' DAY ORDER BY id ASC) t;" -echo "Done!" -echo "Generating cert audit log..." -gen3 psql fence -A -t -o "$dataFolder/cert.json" -c "SELECT json_agg(t) FROM (SELECT * FROM cert_audit_logs WHERE timestamp > CURRENT_DATE - INTERVAL '$logInterval' DAY ORDER BY id ASC) t;" -echo "Done!" -echo "Generating report TSV..." -python3 $HOME/cloud-automation/files/scripts/dream-access-report.py -t "$dreamTeamID" -u "$dataFolder/user.json" -c "$dataFolder/cert.json" -o "$destFolder/$fileName" -echo "All done!" +gen3_log_info "Done!" +startTime="$(date -u -d"$logInterval days ago" +%Y%m%d)" +fileName="Dream_access_report_${startTime}_to_$dateTime.tsv" + +gen3_log_info "Generating user audit log..." +gen3 psql fence -A -t -o "$dataFolder/user.json" -c "SELECT json_agg(t) FROM (SELECT * FROM user_audit_logs WHERE timestamp > CURRENT_DATE - INTERVAL '$logInterval' DAY ORDER BY id ASC) t;" 1>&2 +gen3_log_info "Done!" +gen3_log_info "Generating cert audit log..." +gen3 psql fence -A -t -o "$dataFolder/cert.json" -c "SELECT json_agg(t) FROM (SELECT * FROM cert_audit_logs WHERE timestamp > CURRENT_DATE - INTERVAL '$logInterval' DAY ORDER BY id ASC) t;" 1>&2 +gen3_log_info "Done!" +gen3_log_info "Generating report TSV..." +python3 "$GEN3_HOME/files/scripts/braincommons/dream-access-report.py" -t "$dreamTeamID" -u "$dataFolder/user.json" -c "$dataFolder/cert.json" -o "$destFolder/$fileName" 1>&2 +gen3_log_info "All done!" 
cd /tmp -/bin/rm -rf "${dataFolder}" \ No newline at end of file +/bin/rm -rf "${dataFolder}" + +# brain_custom_reports expects this to be the last line of output +echo "$destFolder/$fileName" diff --git a/files/scripts/dream-access-report.py b/files/scripts/braincommons/dream-access-report.py similarity index 98% rename from files/scripts/dream-access-report.py rename to files/scripts/braincommons/dream-access-report.py index b7549b13f..7b77cb29f 100644 --- a/files/scripts/dream-access-report.py +++ b/files/scripts/braincommons/dream-access-report.py @@ -45,7 +45,7 @@ row["Synapse_email"] = "" row["Synapse_sub_id"] = "" row["Authorized_BEAT-PD"] = False - row["ToU/PP"] = False + row["ToU/PP"] = "FALSE" if new_values["additional_info"]: if "userid" in new_values["additional_info"]: row["Synapse_id"] = new_values["additional_info"]["userid"] @@ -72,7 +72,7 @@ if cert_data["user_id"]: for output_content_i, output_content_row in enumerate(output_content): if output_content_row["User_id"] == cert_data["user_id"]: - output_content[output_content_i]["ToU/PP"] = True + output_content[output_content_i]["ToU/PP"] = "TRUE" except Exception as e: print(e) diff --git a/gen3/bin/api.sh b/gen3/bin/api.sh index d9bdcda1f..2977174d8 100644 --- a/gen3/bin/api.sh +++ b/gen3/bin/api.sh @@ -54,7 +54,7 @@ gen3_curl_json() { method="POST" jsonFile="" if [[ $# -lt 2 || -z "$1" ]]; then - echo -e "$(red_color "ERROR: USE: gen3_curl_json path username jsonFile")" 2>1 + gen3_log_err "USE: gen3_curl_json path username jsonFile" return 1 fi path="$1" @@ -65,7 +65,7 @@ gen3_curl_json() { jsonFile="$1" shift if [[ ! -f "$jsonFile" ]]; then - echo -e "$(red_color "ERROR: unable to read json file $jsonFile")" + gen3_log_err "unable to read json file $jsonFile" return 1 fi else @@ -73,12 +73,12 @@ gen3_curl_json() { fi accessToken="$(gen3_access_token "$userName")" if [[ -z "$accessToken" ]]; then - echo -e "$(red_color "ERROR: unable to acquire token for $userName")" + gen3_log_err "unable to acquire token for $userName" return 1 fi hostname="$(g3kubectl get configmap manifest-global -o json | jq -r '.data["hostname"]')" if [[ -z "$hostname" ]]; then - echo -e "$(red_color "ERROR: unable to determine hostname for commons API")" + gen3_log_err "unable to determine hostname for commons API" return 1 fi @@ -105,7 +105,7 @@ gen3_new_project() { local result if [[ $# -lt 3 || -z "$1" || -z "$2" || -z "$3" ]]; then - echo -e "$(red_color "ERROR: USE: gen3 api new-project prog-name proj-name username")" 2>1 + gen3_log_err "USE: gen3 api new-project prog-name proj-name username" return 1 fi progName="$1" @@ -145,7 +145,7 @@ gen3_new_program() { local result if [[ $# -lt 2 || -z "$1" || -z "$2" ]]; then - echo -e "$(red_color "ERROR: USE: gen3 api new-program prog-name username")" 2>1 + gen3_log_err "USE: gen3 api new-program prog-name username" return 1 fi progName="$1" @@ -192,7 +192,7 @@ gen3_indexd_post_folder() { fi if [[ ! -d "${DATA_DIR}" ]]; then - echo -e "$(red_color "ERROR: ") DATA_DIR, ${DATA_DIR}, does not exist" + gen3_log_err "DATA_DIR, ${DATA_DIR}, does not exist" gen3_indexd_post_folder_help return 1 fi @@ -212,6 +212,60 @@ gen3_indexd_post_folder() { done } +# +# Download all the indexd records from the given domain - +# manage the paging. 
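+# Each request pulls up to 1000 records from
+#   https://DOMAIN/index/index?limit=1000&start=<did of the last record fetched>
+# and each batch is written to its own numbered json file in the destination
+# folder. If needed, the batches can later be combined with something like the
+# following jq one-liner (just a suggestion, not part of this helper):
+#   jq -s '[.[].records[]]' destFolder/indexd_*.json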
+# Ex:
+# gen3 api indexd-download-all domain.commons.io data/
+#
+gen3_indexd_download_all() {
+  local DOMAIN
+  local DATA_DIR
+  local INDEXD_USER
+  local INDEXD_SECRET
+
+  if [[ $# -lt 2 ]]; then
+    gen3_log_err "gen3_indexd_download_all takes 2 arguments: domain and destination folder"
+    return 1
+  fi
+  DOMAIN="$1"
+  shift
+  DATA_DIR="${1%%/}"
+  shift
+
+  if [[ ! -d "${DATA_DIR}" ]]; then
+    gen3_log_err "destination folder, ${DATA_DIR}, does not exist"
+    return 1
+  fi
+
+  local stats
+  local totalFiles=0
+  local fetchUrl="https://${DOMAIN}/index/_stats"
+  if ! stats="$(curl -s "$fetchUrl")" || ! totalFiles="$(jq -e -r .fileCount <<<"$stats")"; then
+    gen3_log_err "Failed to retrieve https://${DOMAIN}/index/_stats"
+    return 1
+  fi
+  gen3_log_info "Preparing to fetch $totalFiles records from $DOMAIN to $DATA_DIR/ in batches of 1000"
+  local count=0
+  local start=""
+  local dataFile
+  while true; do
+    fetchUrl="https://${DOMAIN}/index/index?limit=1000"
+    dataFile="${DATA_DIR}/indexd_${DOMAIN//./_}_${count}.json"
+    if [[ -n "$start" ]]; then
+      fetchUrl="${fetchUrl}&start=$start"
+    fi
+    gen3_log_info "Fetching $fetchUrl into $dataFile"
+    curl -s "$fetchUrl" > "$dataFile"
+    start="$(jq -r '.records[-1].did' < "$dataFile")"
+    count=$((count + 1))
+    if [[ "$start" == null || $((count * 1000)) -gt "$totalFiles" ]]; then
+      break
+    fi
+    sleep 1
+  done
+}
+
 #----------  main
 
 
@@ -220,6 +274,9 @@ if [[ -z "$GEN3_SOURCE_ONLY" ]]; then
   command="$1"
   shift
   case "$command" in
+    "indexd-download-all")
+      gen3_indexd_download_all "$@"
+      ;;
     "indexd-post-folder")
       gen3_indexd_post_folder "$@"
       ;;
diff --git a/gen3/lib/logs/s3.sh b/gen3/lib/logs/s3.sh
index 708cf7261..d1b9c2e36 100644
--- a/gen3/lib/logs/s3.sh
+++ b/gen3/lib/logs/s3.sh
@@ -8,6 +8,14 @@ source "${GEN3_HOME}/gen3/lib/utils.sh"
 
 gen3_load "gen3/gen3setup"
 
+#
+# urldecode the given argument
+#
+gen3_logs_urldecode() {
+  echo -e "${1//\%/\\x}"
+}
+
+
 #
 # Run the stdin logstream through a filter to generate a report
 #
@@ -27,10 +35,17 @@ gen3_logs_s3filter() {
   fi
   case "$filterName" in
     "whoWhatWhen")
-      grep 'username' | grep GET | awk -v bucket="$logFolder" '{ gsub(/\[/, "", $3); gsub(/.+username=/, "", $11); gsub(/&.*/, "", $11); gsub(/%40/, "@", $11); print $3 "\t" $9 "\t" $11 "\t" bucket }' | sort
+      if [[ "$logFolder" != "unknown" ]]; then
+        echo -e "Date_time\tPath\tUser_id\tBucket_name"
+        grep 'username' | grep GET | awk -v bucket="$logFolder" '{ gsub(/\[/, "", $3); gsub(/.+username=/, "", $11); gsub(/&.*/, "", $11); gsub(/%40/, "@", $11); print $3 "\t" $9 "\t" $11 "\t" bucket }' | sort | sed -E 's/\%[0-9]+/_/g'
+      else
+        echo -e "Date_time\tPath\tUser_id"
+        grep 'username' | grep GET | awk -v bucket="$logFolder" '{ gsub(/\[/, "", $3); gsub(/.+username=/, "", $11); gsub(/&.*/, "", $11); gsub(/%40/, "@", $11); print $3 "\t" $9 "\t" $11 }' | sort | sed -E 's/\%[0-9]+/_/g'
+      fi
       ;;
     "accessCount")
-      grep 'username' | grep GET | awk '{ print $9 }' | sort | uniq -c
+      echo -e "Access_count\tPath"
+      grep 'username' | grep GET | awk '{ print $9 }' | sort | uniq -c | awk '{ print $1 "\t" $2 }'
       ;;
     *)
       cat -