Skip to content

Commit

Permalink
Add files via upload (#312)
Browse files Browse the repository at this point in the history
* Add files via upload

* Update requirements.txt

---------

Co-authored-by: dsp-fieldeng-bot <[email protected]>
  • Loading branch information
ncalvanese1 and dsp-fieldeng-bot authored Sep 20, 2023
1 parent da2d053 commit 86c5823
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 10 deletions.
102 changes: 102 additions & 0 deletions orchestration/hca_manage/deduplicate_staging_areas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Outputs contents of up to two GCS paths, comparing if multiple paths are specified.
Usage:
> python3 deduplicate_staging_areas.py -s STAGING_AREA_PATH [--print_files] [--skip_deletion]"""

# Imports
import os
import re
import argparse
from google.cloud import storage
import pandas as pd

# Function to return the objects in a staging area bucket
def get_staging_area_objects(bucket_name, prefix, delimiter=None):
    """List the non-data objects under a staging area prefix.

    Every blob under ``prefix`` is listed. Blobs whose name contains
    ``prefix + "data/"`` are ignored, as are blobs whose file name does not
    have the ``<entity>_<version>`` form.

    Args:
        bucket_name: Name of the GCS bucket to list.
        prefix: Object-name prefix of the staging area.
        delimiter: Optional delimiter forwarded to ``list_blobs``.

    Returns:
        A list of ``[blob_name, path, entity, version]`` records. ``path`` is
        the blob name without its final component; ``entity`` and ``version``
        are the first two underscore-separated parts of that final component.
        If listing fails, the error is printed and the records collected up
        to that point are returned.
    """
    record_list = []
    data_marker = prefix + "data/"
    try:
        # List blobs in specified bucket/prefix
        storage_client = storage.Client()
        blobs = storage_client.list_blobs(bucket_name, prefix=prefix, delimiter=delimiter)

        # Parse blobs into records
        for blob in blobs:
            if data_marker in blob.name:
                continue
            path, file_name = os.path.split(blob.name)
            name_parts = file_name.split("_")
            if len(name_parts) < 2:
                # No "<entity>_<version>" in the file name (e.g. the name ends
                # in "/" so the file name is empty). Such an object cannot be
                # versioned; skip it instead of letting an IndexError abort
                # the whole listing and return a partial result.
                continue
            record_list.append([blob.name, path, name_parts[0], name_parts[1]])
    except Exception as e:
        print(f"Error retrieving objects from staging area: {str(e)}")
    return record_list

# Function to identify outdated entity files
def identify_outdated_files(record_list):
    """Return the blobs superseded by a newer version of the same entity.

    Records are grouped by ``(path, entity)`` and ranked by ``version`` in
    descending order. Versions are compared as stored, so string versions
    compare lexicographically. Every record that is not the top-ranked one
    in its group is reported.

    Args:
        record_list: List of ``[blob, path, entity, version]`` records.

    Returns:
        A list of blob names, in input order, that have a higher-ranked
        sibling in the same ``(path, entity)`` group. Empty when
        ``record_list`` is empty.
    """
    if not record_list:
        return []

    # Load records into dataframe, group by path and entity, and rank by version descending
    df = pd.DataFrame(record_list, columns=["blob", "path", "entity", "version"])
    df["rn"] = df.groupby(["path", "entity"])["version"].rank(method="first", ascending=False)

    # Select only rows with a definite rank below the top one. Rows whose rank
    # is NaN (missing path, entity or version) are deliberately NOT selected:
    # an object that cannot be ranked must never be offered up for deletion.
    return df.loc[df["rn"] > 1, "blob"].tolist()

# Function to batch delete files
def batch_delete_files(delete_list, bucket_name, prefix, delimiter=None):
    """Delete the listed blobs from a bucket in batches.

    The bucket is listed under ``prefix`` and every listed blob whose name is
    on ``delete_list`` is deleted inside a client batch, at most 1000 per
    batch. The listing is repeated until nothing is left to delete or a pass
    finds nothing more to delete.

    Args:
        delete_list: Names of the blobs to delete. Nothing is done when empty.
        bucket_name: Name of the GCS bucket holding the blobs.
        prefix: Object-name prefix under which the blobs are listed.
        delimiter: Optional delimiter forwarded to ``list_blobs``.

    Returns:
        None. The outcome is printed; errors are printed, not raised.
    """
    if not delete_list:
        return

    max_batch_size = 1000
    # A set gives O(1) membership tests and collapses duplicate names, so the
    # loop's exit condition cannot be defeated by a repeated entry.
    pending = set(delete_list)
    try:
        storage_client = storage.Client()
        while pending:
            # List blobs in specified bucket/prefix. The iterator is created
            # before the batch is opened, as in the original ordering.
            blobs = storage_client.list_blobs(bucket_name, prefix=prefix, delimiter=delimiter)

            # Submit one batch of deletions for blobs found on the pending list
            submitted = 0
            with storage_client.batch():
                for blob in blobs:
                    if submitted >= max_batch_size:
                        break
                    if blob.name in pending:
                        pending.discard(blob.name)
                        submitted += 1
                        blob.delete()

            # A pass that deleted nothing means the remaining names are not in
            # the listing; stop instead of re-listing forever.
            if submitted == 0:
                break

        if pending:
            print(f"Warning: {len(pending)} object(s) on the delete list were not found and were not deleted.")
        else:
            print("Objects deleted successfully.")
    except Exception as e:
        print(f"Error deleting objects: {str(e)}")

# Main function
if __name__ == "__main__":

    # Set up argument parser
    parser = argparse.ArgumentParser(description="Remove outdated entity files from HCA staging area.")
    parser.add_argument("-s", "--staging_area", required=True, type=str, help="Full GCS path to the staging area of interest.")
    parser.add_argument("-p", "--print_files", required=False, action="store_true", help="Add argument to print files to be removed.", default=False)
    parser.add_argument("-n", "--skip_deletion", required=False, action="store_true", help="Add argument to skip file deletion.", default=False)
    args = parser.parse_args()

    # Split the staging area path into bucket name and object prefix.
    # The whole path must match: a partial match would silently truncate the
    # prefix at the first unexpected character and point the deletion at a
    # different (shorter) prefix than the one the user asked for.
    staging_area_match = re.fullmatch(r"gs://([a-z0-9\-_.]+)/([A-Za-z0-9\-_/.]+)", args.staging_area)
    if staging_area_match is None:
        parser.error(f"Invalid staging area path: {args.staging_area} (expected gs://BUCKET/PREFIX)")
    bucket_name, prefix = staging_area_match.groups()
    if not prefix.endswith("/"):
        prefix += "/"

    # Call functions to identify and remove outdated entity files
    print(f"Evaluating outdated files in staging area: {args.staging_area}")
    objects_list = get_staging_area_objects(bucket_name, prefix)
    print(f"\t- Total objects found: {len(objects_list)}")
    delete_list = identify_outdated_files(objects_list)
    print(f"\t- Outdated objects found: {len(delete_list)}")
    if args.print_files:
        print("\t- Outdated object list: \n\t\t- " + "\n\t\t- ".join(delete_list))
    if not args.skip_deletion:
        batch_delete_files(delete_list, bucket_name, prefix)
62 changes: 62 additions & 0 deletions orchestration/hca_manage/pull_dcp_snapshots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Outputs the snapshots for a particular DCP release
Usage:
> python3 pull_dcp_snapshots.py -r dcp_release"""

# Imports
import argparse
import data_repo_client
import pandas as pd
import google.auth
import google.auth.transport.requests
import requests

# Function to return the snapshots for a DCP release
def get_snapshots(release):
    """Return the TDR snapshots for a release as a sorted DataFrame.

    Snapshots are enumerated from the TDR API at ``https://data.terra.bio``
    with the filter ``"_" + release`` and ``limit=1000``. For each snapshot,
    the SAM ``policies/reader/public`` endpoint is queried; the snapshot is
    flagged public ("Y") only when the response body is exactly ``"true"``.

    Args:
        release: DCP release code used to build the snapshot filter.

    Returns:
        A DataFrame sorted by "TDR Snapshot Name" with the columns
        "TDR Snapshot ID", "TDR Snapshot Name", "TDR Snapshot Google Project",
        "Created Date", "Created Datetime" and "Public Flag". If any step
        fails, the error is printed and an empty, column-less DataFrame is
        returned instead.
    """
    # Seconds to wait on each SAM request. Without a timeout a single stalled
    # connection would hang the whole run indefinitely.
    request_timeout = 60
    columns = [
        "TDR Snapshot ID",
        "TDR Snapshot Name",
        "TDR Snapshot Google Project",
        "Created Date",
        "Created Datetime",
        "Public Flag",
    ]
    try:
        # Establish TDR API client
        creds, _ = google.auth.default()
        auth_req = google.auth.transport.requests.Request()
        creds.refresh(auth_req)
        config = data_repo_client.Configuration()
        config.host = "https://data.terra.bio"
        config.access_token = creds.token
        api_client = data_repo_client.ApiClient(configuration=config)
        api_client.client_side_validation = False

        # Enumerate snapshots
        snapshot_filter = "_" + release
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        snapshots_list = snapshots_api.enumerate_snapshots(filter=snapshot_filter, limit=1000)

        # Build one record per snapshot, looking up its public flag in SAM
        records_list = []
        for snapshot_entry in snapshots_list.items:
            public_response = requests.get(
                url=f"https://sam.dsde-prod.broadinstitute.org/api/resources/v2/datasnapshot/{snapshot_entry.id}/policies/reader/public",
                headers={"Authorization": f"Bearer {creds.token}"},
                timeout=request_timeout,
            )
            public_flag = "Y" if public_response.text == "true" else "N"
            records_list.append([
                snapshot_entry.id,
                snapshot_entry.name,
                snapshot_entry.data_project,
                snapshot_entry.created_date[0:10],  # leading 10 characters of the timestamp
                snapshot_entry.created_date,
                public_flag,
            ])
        df = pd.DataFrame(records_list, columns=columns)
        df_sorted = df.sort_values(by=["TDR Snapshot Name"], ignore_index=True)
    except Exception as e:
        print(f"Error retrieving snapshots: {str(e)}")
        df_sorted = pd.DataFrame()
    return df_sorted

# Main function
if __name__ == "__main__":

    # Set up argument parser
    parser = argparse.ArgumentParser(description="Pull snapshots for a particular DCP release.")
    parser.add_argument("-r", "--release", required=True, type=str, help="DCP release code (e.g., dcp25).")
    args = parser.parse_args()

    # Pull the snapshots for the release
    print(f"Pulling snapshots for release: {args.release}")
    df = get_snapshots(args.release)

    # get_snapshots returns a DataFrame with no columns only on its error path
    # (a successful call with zero matches still carries the column headers).
    # The error itself has already been printed there, so stop here rather
    # than write an empty file and report success.
    if len(df.columns) == 0:
        parser.exit(status=1, message="No snapshot list was written.\n")

    # Write the snapshot list to a tab-separated file
    file_name = f"dcp_snapshot_list_{args.release}.tsv"
    df.to_csv(file_name, sep="\t")
    print(f"Results output to {file_name}")
20 changes: 10 additions & 10 deletions orchestration/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ dagster-pandas==0.12.14
dagster-postgres==0.12.14
dagster-slack==0.12.14
dagster==0.12.14
data-repo-client==1.521.0
data-repo-client==1.527.0
docstring-parser==0.15; python_version >= "3.9" and python_version < "3.10"
frozenlist==1.4.0; python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.6.0"
google-api-core==2.11.1; python_version >= "3.9" and python_version < "3.10" and (python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_full_version >= "3.6.0" and python_version >= "3.9" and python_version < "3.10") and (python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.7")
google-api-python-client==1.12.11; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
google-auth-httplib2==0.1.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
google-auth==2.17.3; python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_full_version >= "3.6.0" and python_version >= "3.9" and python_version < "3.10"
google-auth-httplib2==0.1.1; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
google-auth==2.23.0; python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_full_version >= "3.6.0" and python_version >= "3.9" and python_version < "3.10"
google-cloud-bigquery==2.34.3; python_version >= "3.6" and python_version < "3.11"
google-cloud-core==2.3.3; python_version >= "3.9" and python_version < "3.10" and (python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_full_version >= "3.6.0" and python_version >= "3.9" and python_version < "3.10")
google-cloud-storage==1.44.0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.6.0")
Expand All @@ -39,20 +39,20 @@ graphql-ws==0.3.1
greenlet==2.0.2; python_version >= "3" and python_full_version < "3.0.0" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and (python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.6.0") or python_version >= "3" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and (python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.6.0") and python_full_version >= "3.5.0"
grpcio-health-checking==1.48.2; python_version >= "3.9" and python_version < "3.10"
grpcio-status==1.48.2; python_version >= "3.9" and python_version < "3.10"
grpcio==1.57.0; python_version >= "3.9" and python_version < "3.10"
grpcio==1.58.0; python_version >= "3.9" and python_version < "3.10"
hca-import-validation==0.0.17; python_version >= "3.6"
httplib2==0.22.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
humanfriendly==10.0; python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.5.0"
idna==3.4; python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.6.0"
jinja2==2.11.3; python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.5.0"
jsonschema-specifications==2023.7.1; python_version >= "3.8"
jsonschema==4.19.0; python_version >= "3.8"
kubernetes==27.2.0; python_version >= "3.6"
kubernetes==28.1.0; python_version >= "3.6"
mako==1.2.2; python_version >= "3.7"
markupsafe==2.0.1; python_version >= "3.6"
more-itertools==10.1.0; python_version >= "3.8"
multidict==6.0.4; python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.6.0"
numpy==1.25.2; python_version < "3.11" and python_version >= "3.9"
numpy==1.26.0; python_version >= "3.9" and python_version < "3.11"
oauth2client==4.1.3
oauthlib==3.2.2; python_version >= "3.6"
packaging==23.1; python_version >= "3.9" and python_version < "3.10"
Expand All @@ -77,11 +77,11 @@ referencing==0.30.2; python_version >= "3.8"
requests-oauthlib==1.3.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
requests==2.31.0; python_version >= "3.9" and python_version < "3.10" and (python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.6.0") and (python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.7")
rfc3339-validator==0.1.4; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
rpds-py==0.10.2; python_version >= "3.8"
rpds-py==0.10.3; python_version >= "3.8"
rsa==4.9; python_version >= "3.6" and python_version < "4" and (python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_full_version >= "3.6.0" and python_version >= "3.9" and python_version < "3.10")
rx==1.6.3; python_version >= "3.9" and python_version < "3.10"
six==1.16.0; python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.6.0"
slack-sdk==3.21.3; python_full_version >= "3.6.0"
slack-sdk==3.22.0; python_full_version >= "3.6.0"
slackclient==2.9.4; python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.6.0"
soupsieve==2.5; python_full_version >= "3.6.0" and python_version >= "3.8"
sqlalchemy==1.4.49; python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.6.0"
Expand All @@ -93,7 +93,7 @@ typing-compat==0.1.0; python_version >= "3.9" and python_full_version < "3.0.0"
typing-extensions==3.10.0.2
tzdata==2023.3; python_version >= "3.9"
uritemplate==3.0.1; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
urllib3==2.0.4; python_version >= "3.9" and python_version < "3.10"
urllib3==1.26.16; python_version >= "3.9" and python_full_version < "3.0.0" and python_version < "3.10" or python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.6.0"
watchdog==3.0.0; python_version >= "3.9" and python_version < "3.10"
websocket-client==1.6.2; python_version >= "3.8"
websocket-client==1.6.3; python_version >= "3.8"
yarl==1.9.2; python_version >= "3.9" and python_version < "3.10" and python_full_version >= "3.6.0"

0 comments on commit 86c5823

Please sign in to comment.