diff --git a/.gitignore b/.gitignore index 83b1350..1ead1d3 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ __pycache__ paper/paper.pdf paper/jats/ venv + +.qodo diff --git a/README.md b/README.md index a8362de..af1a3fb 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,8 @@ First, clone the repository on your local machine and then install the package u ```bash $ git clone https://github.com/PRIDE-Archive/pridepy $ cd pridepy -$ pip install . +$ poetry build +$ pip install dist/*.whl ``` Install with setup.py: @@ -34,73 +35,126 @@ $ cd pridepy $ poetry build $ pip install dist/pridepy-{version}.tar.gz ``` +# Usage and Documentation -# Examples +This Python CLI tool, built using the Click module, +already provides detailed usage instructions for each command. To avoid redundancy and potential clutter in this README, you can access the usage instructions directly from the CLI +Use the below command to view a list of commands available: -Download all the raw files from a dataset(eg: PXD012353). -Warning: Raw files are generally large in size, so it may take some time to download depending on the number of files and file sizes. +```bash +$ pridepy --help +Usage: pridepy [OPTIONS] COMMAND [ARGS]... -`-p`: in download specifies protocol (ftp default): - - **ftp**: FTP protocol - - **aspera**: using the aspera protocol - - **globus**: PRIDE globus endpoint (_the data is downloaded through https_) +Options: + --help Show this message and exit. + +Commands: + download-all-public-raw-files Download all public raw files... + download-file-by-name Download a single file from a... + get-files-by-filter get paged files :return: + get-files-by-project-accession get files by project accession... + get-private-files Get private files by project... + get-projects get paged projects :return: + get-projects-by-accession get projects by accession... + stream-files-metadata Stream all files metadata in... + stream-projects-metadata Stream all projects metadata... 
+ +``` +> [!NOTE] +> Please make sure you are using Python3, not Python 2.7 version. + +## Downloading a project from PRIDE Archive + +The main purpose of this tool is to download data from the PRIDE Archive. Here, how to download all the raw files from a dataset(eg: PXD012353). ```bash $ pridepy download-all-public-raw-files -a PXD012353 -o /Users/yourname/Downloads/foldername/ -p aspera ``` +- `-a` flag is used to specify the project accession number. +- `-o` flag is used to specify the output directory. +- `-p` flag is used to specify the protocol (**aspera, ftp, globus**) + +> [!IMPORTANT] +> Currently, pridepy supports multiple protocols for downloading including ftp, aspera, globus, s3. ftp, aspera uses those protocols to download the files; the pridepy includes the aspera client. For globus and s3, the tool uses https of both services endpoints. Read the whitepaper to know more about the performance of each protocol. + +Additional options: + +- `-skip` flag is used to skip the download of files that already exist in the output directory. +- `--aspera_maximum_bandwidth` flag is used to specify the maximum bandwidth for the Aspera download. The default value is 100M. +- `--checksum_check` flag is used to check the checksum of the downloaded files. The default value is False. + +## Download single file by name + +Users instead of downloading an entire project files may be interested in downloading a single file if they know it by name. Here is how to download a single file by name. -Download single file by name: ```bash $ pridepy download-file-by-name -a PXD022105 -o /Users/yourname/Downloads/foldername/ -f checksum.txt -p globus ``` ->**NOTE**: Currently we use Globus URLs (when `-p globus` is used) via HTTPS, not the Globus protocol. For more information about Globus, see [Globus documentation](https://www.globus.org/data-transfer). 
+Please be aware that the additional parameters are the same as the previous command [Downloading a project from PRIDE Archive](#downloading-a-project-from-pride-archive). -Search projects with keywords and filters -```bash -$ pridepy search-projects-by-keywords-and-filters --filter accession==PXD012353 +## Download project files by category + +Users may be interested in downloading files by category. Here is how to download files by category. The different categories are available in the PRIDE Archive: -$ pridepy search-projects-by-keywords-and-filters --keyword PXD012353 +- RAW: Raw data files +- PEAK: Peak list files +- SEARCH: Search engine output files +- OTHER: Other files +- RESULT: Result files +- SPECTRUM LIBRARIES: Spectrum libraries +- FASTA: FASTA files + +```bash +$ pridepy download-files-by-category -a PXD022105 -o /Users/yourname/Downloads/foldername/ -c RAW -p ftp ``` -Stream metadata of all projects as json and write it to a file +Please be aware that the additional parameters are the same as the previous command [Downloading a project from PRIDE Archive](#downloading-a-project-from-pride-archive). + +>[!IMPORTANT] +> We also implemented a direct command to download RAW files from a project which is the most common use case. + +## Download private files + +Users and especially reviewers may be interested in downloading private files. Here is how to download private files. + +First, the user can list the private files of a project: + ```bash -$ pridepy stream-projects-metadata -o all_pride_projects.json +$ pridepy list-private-files -a PXD022105 -u yourusername -p yourpassword ``` -Stream metadata of all files as json and write it to a file. Project accession can be specified as an optional parameter +This command will list the private files of the project PXD022105. Including the file name, file size, and download link. 
+ +Then the user can download the private files: + ```bash -$ pridepy stream-files-metadata -o all_pride_files.json -OR -$ pridepy stream-files-metadata -o PXD005011_files.json -a PXD005011 +$ pridepy download-file-by-name -a PXD022105 -o /Users/yourname/Downloads/foldername/ --username yourusername --password yourpassword -f checksum.txt ``` -This Python CLI tool, built using the Click module, -already provides detailed usage instructions for each command. To avoid redundancy and potential clutter in this README, you can access the usage instructions directly from the CLI -Use the below command to view a list of commands available: +>[!WARNING] +> To download private files, the user should use the same command as downloading a single file by name. The only difference is that the user should provide the username and password. However, protocol in this case is unnecessary as the tool will use the https protocol to download the files. At the moment we only allow this protocol because of the infrastructure of PRIDE private files (read the whitepaper for more information). + +## Streaming metadata + +One of the great features of PRIDE and pridepy is the ability to stream metadata of all projects and files. This is useful for users who want to analyze the metadata of all projects and files locally. + +Stream metadata of all projects as JSON and write it to a file: ```bash -$ pridepy --help -Usage: pridepy [OPTIONS] COMMAND [ARGS]... +$ pridepy stream-projects-metadata -o all_pride_projects.json +``` -Options: - --help Show this message and exit. +Stream all files metadata in a specific project as JSON and write it to a file: -Commands: - download-all-public-raw-files Download all public raw files... - download-file-by-name Download a single file from a... - get-files-by-filter get paged files :return: - get-files-by-project-accession get files by project accession... - get-private-files Get private files by project... 
- get-projects get paged projects :return: - get-projects-by-accession get projects by accession... - stream-files-metadata Stream all files metadata in... - stream-projects-metadata Stream all projects metadata... - +```bash +$ pridepy stream-files-metadata -o all_pride_files_metadata.json ``` -# NOTE +Stream the files metadata of a specific project as JSON and write it to a file: -Please make sure you are using Python3, not Python 2.7 version. +```bash +$ pridepy stream-files-metadata -o PXD005011_files.json -a PXD005011 +``` # White paper diff --git a/pridepy/authentication/authentication.py b/pridepy/authentication/authentication.py index 8dc6d7a..f042fef 100644 --- a/pridepy/authentication/authentication.py +++ b/pridepy/authentication/authentication.py @@ -27,11 +27,7 @@ def get_token(self, username, password): url = self.base_url + "/login" headers = {"Content-type": "application/json", "Accept": "text/plain"} credentials = ( - '{"Credentials":{"username":"' - + username - + '", "password":"' - + password - + '"}}' + '{"Credentials":{"username":"' + username + '", "password":"' + password + '"}}' ) response = requests.post(url, data=credentials, headers=headers) @@ -55,8 +51,4 @@ def validate_token(self, token): response = requests.post(url, headers=headers) - return ( - response.ok - and response.status_code == 200 - and response.text == "Token Valid" - ) + return response.ok and response.status_code == 200 and response.text == "Token Valid" diff --git a/pridepy/files/files.py b/pridepy/files/files.py index e4f17e5..a5425e8 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -56,9 +56,7 @@ class Files: PRIDE_ARCHIVE_FTP = "ftp.pride.ebi.ac.uk" S3_URL = "https://hh.fire.sdo.ebi.ac.uk" S3_BUCKET = "pride-public" - logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" - ) + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") def __init__(self): pass @@ -72,15 +70,15 
@@ async def stream_all_files_metadata(self, output_file, accession=None): count_request_url = f"{self.V3_API_BASE_URL}/files/count" else: request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/all" - count_request_url = ( - f"{self.V3_API_BASE_URL}/projects/{accession}/files/count" - ) + count_request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/count" headers = {"Accept": "application/JSON"} response = Util.get_api_call(count_request_url, headers) total_records = response.json() regex_search_pattern = '"fileName"' - await Util.stream_response_to_file(output_file, total_records, regex_search_pattern, request_url, headers) + await Util.stream_response_to_file( + output_file, total_records, regex_search_pattern, request_url, headers + ) def stream_all_files_by_project(self, accession) -> List[Dict]: """ @@ -88,7 +86,7 @@ def stream_all_files_by_project(self, accession) -> List[Dict]: """ request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/all" headers = {"Accept": "application/JSON"} - record_files = Util.read_json_stream(api_url=request_url, headers = headers) + record_files = Util.read_json_stream(api_url=request_url, headers=headers) return record_files def get_all_raw_file_list(self, project_accession): @@ -219,9 +217,7 @@ def callback(data): pbar.update(len(data)) # Retrieve the file with progress callback - ftp.retrbinary( - f"RETR {ftp_file_path}", callback - ) + ftp.retrbinary(f"RETR {ftp_file_path}", callback) logging.info(f"Successfully downloaded {new_file_path}") break # Exit download retry loop if successful @@ -240,13 +236,9 @@ def callback(data): ) break # Give up on this file after max retries except (KeyError, IndexError) as e: - logging.error( - f"Failed to process file due to missing data: {str(e)}" - ) + logging.error(f"Failed to process file due to missing data: {str(e)}") except Exception as e: - logging.error( - f"Unexpected error while processing file: {str(e)}" - ) + logging.error(f"Unexpected error 
while processing file: {str(e)}") ftp.quit() # Close FTP connection after all files are downloaded logging.info(f"Disconnected from FTP host: {Files.PRIDE_ARCHIVE_FTP}") break # Exit connection retry loop if everything was successful @@ -257,9 +249,7 @@ def callback(data): socket.error, ) as e: connection_attempt += 1 - logging.error( - f"FTP connection failed (attempt {connection_attempt}): {str(e)}" - ) + logging.error(f"FTP connection failed (attempt {connection_attempt}): {str(e)}") if connection_attempt < max_connection_retries: logging.info("Retrying connection...") time.sleep(5) # Optional delay before retrying @@ -303,9 +293,7 @@ def download_files_from_aspera( # Create a clean filename to save the downloaded file logging.debug(f"Downloading via Aspera: {download_url}") - new_file_path = Files.get_output_file_name( - download_url, file, output_folder - ) + new_file_path = Files.get_output_file_name(download_url, file, output_folder) if skip_if_downloaded_already == True and os.path.exists(new_file_path): logging.info("Skipping download as file already exists") @@ -359,9 +347,7 @@ def download_files_from_globus( download_url = download_url.replace(ftp_base_url, globus_base_url) # Create a clean filename to save the downloaded file - new_file_path = Files.get_output_file_name( - download_url, file, output_folder - ) + new_file_path = Files.get_output_file_name(download_url, file, output_folder) if skip_if_downloaded_already == True and os.path.exists(new_file_path): logging.info("Skipping download as file already exists") @@ -378,18 +364,14 @@ def download_files_from_globus( urllib.request.urlretrieve( download_url, new_file_path, - reporthook=lambda blocks, block_size, total_size: progress( - block_size - ), + reporthook=lambda blocks, block_size, total_size: progress(block_size), ) progress.close() logging.info(f"Successfully downloaded {new_file_path}") except Exception as e: - logging.error( - f"Download from Globus failed for {new_file_path}: {str(e)}" - 
) + logging.error(f"Download from Globus failed for {new_file_path}: {str(e)}") @staticmethod def download_files_from_s3( @@ -431,9 +413,7 @@ def download_files_from_s3( ftp_base_url = "ftp://ftp.pride.ebi.ac.uk/pride/data/archive/" s3_path = download_url.replace(ftp_base_url, "") - new_file_path = Files.get_output_file_name( - download_url, file, output_folder - ) + new_file_path = Files.get_output_file_name(download_url, file, output_folder) if skip_if_downloaded_already == True and os.path.exists(new_file_path): logging.info("Skipping download as file already exists") @@ -514,9 +494,7 @@ def download_file_by_name( ## Check type of project public_project = False - project_status = Util.get_api_call( - self.API_BASE_URL + "/status/{}".format(accession) - ) + project_status = Util.get_api_call(self.API_BASE_URL + "/status/{}".format(accession)) if project_status.status_code == 200: if project_status.text == "PRIVATE": @@ -524,9 +502,7 @@ def download_file_by_name( elif project_status.text == "PUBLIC": public_project = True else: - raise Exception( - "Dataset {} is not present in PRIDE Archive".format(accession) - ) + raise Exception("Dataset {} is not present in PRIDE Archive".format(accession)) if public_project: logging.info("Downloading file from public dataset {}".format(accession)) @@ -576,9 +552,7 @@ def get_file_from_api(self, accession, file_name) -> List[Dict]: except Exception as e: raise Exception("File not found " + str(e)) - def download_private_file_name( - self, accession, file_name, output_folder, username, password - ): + def download_private_file_name(self, accession, file_name, output_folder, username, password): """ Get the information for a given private file to be downloaded from the api. 
:param accession: Project accession @@ -592,12 +566,8 @@ def download_private_file_name( validate_token = auth.validate_token(auth_token) logging.info("Valid token after login: {}".format(validate_token)) - url = self.API_PRIVATE_URL + "/projects/{}/files?search={}".format( - accession, file_name - ) - content = requests.get( - url, headers={"Authorization": "Bearer {}".format(auth_token)} - ) + url = self.API_PRIVATE_URL + "/projects/{}/files?search={}".format(accession, file_name) + content = requests.get(url, headers={"Authorization": "Bearer {}".format(auth_token)}) if content.ok and content.status_code == 200: json_file = content.json() if ( @@ -605,22 +575,16 @@ def download_private_file_name( and "files" in json_file["_embedded"] and len(json_file["_embedded"]["files"]) == 1 ): - download_url = json_file["_embedded"]["files"][0]["_links"]["download"][ - "href" - ] + download_url = json_file["_embedded"]["files"][0]["_links"]["download"]["href"] logging.info(download_url) # Create a clean filename to save the downloaded file new_file_path = os.path.join(output_folder, f"{file_name}") - session = ( - Util.create_session_with_retries() - ) # Create session with retries + session = Util.create_session_with_retries() # Create session with retries # Check if the file already exists if os.path.exists(new_file_path): - resume_header = { - "Range": f"bytes={os.path.getsize(new_file_path)}-" - } + resume_header = {"Range": f"bytes={os.path.getsize(new_file_path)}-"} mode = "ab" # Append to file resume_size = os.path.getsize(new_file_path) else: @@ -738,17 +702,61 @@ def download_files( elif protocol == "aspera": Files.download_files_from_aspera( - file_list_json, - output_folder, - skip_if_downloaded_already, - maximum_bandwidth=aspera_maximum_bandwidth, - ) + file_list_json, + output_folder, + skip_if_downloaded_already, + maximum_bandwidth=aspera_maximum_bandwidth, + ) elif protocol == "globus": Files.download_files_from_globus( - file_list_json, output_folder, 
skip_if_downloaded_already - ) - elif protocol == "s3": - Files.download_files_from_s3( file_list_json, output_folder, skip_if_downloaded_already - ) + ) + elif protocol == "s3": + Files.download_files_from_s3(file_list_json, output_folder, skip_if_downloaded_already) + + def download_all_category_files( + self, + accession: str, + output_folder: str, + skip_if_downloaded_already: bool, + protocol: str, + aspera_maximum_bandwidth: str, + checksum_check: bool, + category: str, + ): + """ + Download all files of a specified category from a PRIDE project. + + :param accession: The PRIDE project accession identifier. + :param output_folder: The directory where the files will be downloaded. + :param skip_if_downloaded_already: If True, skips downloading files that already exist. + :param protocol: The transfer protocol to use (e.g., ftp, aspera, globus, s3). + :param aspera_maximum_bandwidth: Maximum bandwidth for Aspera transfers. + :param checksum_check: If True, downloads the checksum file for the project. + :param category: The category of files to download. + """ + raw_files = self.get_all_category_file_list(accession, category) + self.download_files( + raw_files, + accession, + output_folder, + skip_if_downloaded_already, + protocol, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, + checksum_check=checksum_check, + ) + + def get_all_category_file_list(self, accession: str, category: str): + """ + Retrieve a list of files from a specific project that belong to a given category. + + :param accession: The PRIDE project accession identifier. + :param category: The category of files to filter by. + :return: A list of files in the specified category. 
+ """ + record_files = self.stream_all_files_by_project(accession) + category_files = [ + file for file in record_files if file["fileCategory"]["value"] == category + ] + return category_files diff --git a/pridepy/pridepy.py b/pridepy/pridepy.py index a0454ea..6f2996a 100644 --- a/pridepy/pridepy.py +++ b/pridepy/pridepy.py @@ -10,7 +10,6 @@ def main(): pass - @main.command( "download-all-public-raw-files", help="Download all public raw files from a given PRIDE public project", @@ -57,7 +56,15 @@ def download_all_public_raw_files( checksum_check: bool = False, ): """ - This script download raw files from FTP or copy from the file system + Command to download all public raw files from a specified PRIDE project. + + Parameters: + accession (str): PRIDE project accession. + protocol (str): Protocol for downloading files (ftp, aspera, globus). Default is ftp. + output_folder (str): Directory to save downloaded raw files. + skip_if_downloaded_already (bool): Skip download if files already exist. Default is True. + aspera_maximum_bandwidth (str): Maximum bandwidth for Aspera protocol. Default is 100M. + checksum_check (bool): Flag to download checksum file for the project. Default is False. """ raw_files = Files() @@ -78,8 +85,8 @@ def download_all_public_raw_files( @main.command( - "download-file-by-name", - help="Download a single file from a given PRIDE project (public or private)", + "download-all-public-category-files", + help="Download all public files of specific category from a given PRIDE public project", ) @click.option("-a", "--accession", required=True, help="PRIDE project accession") @click.option( @@ -88,12 +95,11 @@ def download_all_public_raw_files( default="ftp", help="Protocol to be used to download files either by ftp or aspera or from globus. 
Default is ftp", ) -@click.option("-f", "--file_name", required=True, help="fileName to be downloaded") @click.option( "-o", "--output_folder", required=True, - help="output folder to download or copy files", + help="output folder to download or copy raw files", ) @click.option( "-skip", @@ -103,11 +109,92 @@ def download_all_public_raw_files( help="Boolean value to skip the download if the file has already been downloaded.", ) @click.option( - "--username", required=False, help="PRIDE login username for private files" + "--aspera_maximum_bandwidth", + required=False, + help="Aspera maximum bandwidth (e.g 50M, 100M, 200M), depending on the user's network bandwidth, default is 100M", + default="100M", ) @click.option( - "--password", required=False, help="PRIDE login password for private files" + "--checksum_check", + required=False, + help="Download checksum file for project", + is_flag=True, + default=False, ) +@click.option( + "-c", + "--category", + required=True, + help="Category of the files to be downloaded", + type=click.Choice("RAW,PEAK,SEARCH,RESULT,SPECTRUM_LIBRARY,OTHER, FASTA".split(",")), +) +def download_all_public_category_files( + accession: str, + protocol: str, + output_folder: str, + skip_if_downloaded_already: bool, + aspera_maximum_bandwidth: str = "50M", + checksum_check: bool = False, + category: str = "RAW", +): + """ + Command to download all public files of a specified category from a given PRIDE public project. + + Parameters: + accession (str): The PRIDE project accession identifier. + protocol (str): The protocol to use for downloading files (ftp, aspera, globus). + output_folder (str): The directory where the files will be downloaded. + skip_if_downloaded_already (bool): If True, skips downloading files that already exist. + aspera_maximum_bandwidth (str): Maximum bandwidth for Aspera transfers. + checksum_check (bool): If True, downloads the checksum file for the project. + category (str): The category of files to download. 
+ """ + + raw_files = Files() + logging.info("accession: " + accession) + logging.info(f"Data will be downloaded from {protocol}") + + if protocol == "aspera": + logging.info(f"Aspera maximum bandwidth: {aspera_maximum_bandwidth}") + + raw_files.download_all_category_files( + accession, + output_folder, + skip_if_downloaded_already, + protocol, + aspera_maximum_bandwidth=aspera_maximum_bandwidth, + checksum_check=checksum_check, + category=category, + ) + + +@main.command( + "download-file-by-name", + help="Download a single file from a given PRIDE project (public or private)", +) +@click.option("-a", "--accession", required=True, help="PRIDE project accession") +@click.option( + "-p", + "--protocol", + default="ftp", + help="Protocol to be used to download files either by ftp or aspera or from globus. Default is ftp", +) +@click.option("-f", "--file_name", required=True, help="fileName to be downloaded") +@click.option( + "-o", + "--output_folder", + required=True, + help="output folder to download or copy files", +) +@click.option( + "-skip", + "--skip_if_downloaded_already", + required=False, + default=True, + help="Boolean value to skip the download if the file has already been downloaded.", +) +@click.option("--username", required=False, help="PRIDE login username for private files") +@click.option("--password", required=False, help="PRIDE login password for private files") @click.option( "--aspera_maximum_bandwidth", required=False, @@ -165,11 +252,11 @@ def download_file_by_name( ) -@main.command("get-private-files", help="Get private files by project accession") +@main.command("list-private-files", help="List private files by project accession") @click.option("-a", "--accession", required=True, help="accession of the project") @click.option("-u", "--user", required=True, help="PRIDE login username") -@click.option("-p", "--password", required=True, help="PRiDE login password") -def get_private_files(accession, user, password): +@click.option("-p", 
"--password", required=True, help="PRIDE login password") +def list_private_files(accession, user, password): """ get files by project accession :return: @@ -182,72 +269,7 @@ def get_private_files(accession, user, password): # Get file size in MB from bytes file_size = f["fileSizeBytes"] / (1024 * 1024) file_category = f["fileCategory"]["value"] - logging.info( - f["fileName"] + "\t" + str(file_size) + " MB\t" + file_category - ) - - -@main.command() -@click.option( - "-k", - "--keyword", - required=False, - default="", - help="The entered word will be searched among the fields to fetch " - "matching pride. The structure of the keyword is : *:*", -) -@click.option( - "-f", - "--filter", - required=False, - help="Parameters to filter the search results. The structure of the " - "filter is: field1==value1, field2==value2. Example " - "accession==PRD000001", -) -@click.option( - "-ps", - "--page_size", - required=False, - default=100, - help="Number of results to fetch in a page", -) -@click.option( - "-p", - "--page", - required=False, - default=0, - help="Identifies which page of results to fetch", -) -@click.option( - "-sd", - "--sort_direction", - required=False, - default="DESC", - help="Sorting direction: ASC or DESC", -) -@click.option( - "-sf", - "--sort_fields", - required=False, - default="submission_date", - help="Field(s) for sorting the results on. Default for this " - "request is submission_date. More fields can be separated by " - "comma and passed. 
Example: submission_date,project_title", -) -def search_projects_by_keywords_and_filters( - keyword, filter, page_size, page, date_gap, sort_direction, sort_fields -): - """ - search public pride with keywords and filters - :return: - """ - project = Project() - logging.info( - project.search_by_keywords_and_filters( - keyword, filter, page_size, page, sort_direction, sort_fields - ) - ) - + logging.info(f["fileName"] + "\t" + str(file_size) + " MB\t" + file_category) @main.command() @click.option( @@ -286,115 +308,31 @@ def stream_files_metadata(accession, output_file): files = Files() asyncio.run(files.stream_all_files_metadata(output_file, accession)) - @main.command() @click.option( - "-ps", - "--page_size", - required=False, - default=100, - help="Number of results to fetch in a page", -) -@click.option( - "-p", - "--page", - required=False, - default=0, - help="Identifies which page of results to fetch", -) -@click.option( - "-sd", - "--sort_direction", - required=False, - default="DESC", - help="Sorting direction: ASC or DESC", -) -@click.option( - "-sc", - "--sort_conditions", - required=False, - default="projectAccession", - help="Field(s) for sorting the results on. Default for this " - "request is project_accession. More fields can be separated by " - "comma and passed. 
Example: submission_date,project_title", -) -def get_projects(page_size, page, sort_direction, sort_conditions): - """ - get paged projects - :return: - """ - project = Project() - logging.info(project.get_projects(page_size, page, sort_direction, sort_conditions)) - - -@main.command() -@click.option("-a", "--accession", required=False, help="accession of the project") -def get_projects_by_accession(accession): - """ - get projects by accession - :return: - """ - project = Project() - logging.info(project.get_by_accession(accession)) - - -@main.command() -@click.option("-a", "--accession", required=False, help="accession of the project") -def get_similar_projects_by_accession(accession): - """ - get similar projects by accession - :return: - """ - project = Project() - logging.info(project.get_similar_projects_by_accession(accession)) - - -@main.command() -@click.option("-a", "--accession", required=True, help="accession of the project") -@click.option( - "-ps", - "--page_size", - required=False, - default=100, - help="Number of results to fetch in a page", -) -@click.option( - "-p", - "--page", - required=False, - default=0, - help="Identifies which page of results to fetch", -) -@click.option( - "-sd", - "--sort_direction", - required=False, - default="DESC", - help="Sorting direction: ASC or DESC", -) -@click.option( - "-sc", - "--sort_conditions", + "-k", + "--keyword", required=False, - default="projectAccession", - help="Field(s) for sorting the results on. Default for this " - "request is project_accession. More fields can be separated by " - "comma and passed. Example: submission_date,project_title", + default="", + help="The entered word will be searched among the fields to fetch " + "matching pride. 
The structure of the keyword is : *:*", ) -def get_files_by_project_accession( - accession, filter, page_size, page, sort_direction, sort_conditions +def search_projects_by_keywords_and_filters( + keyword, filter, page_size, page, date_gap, sort_direction, sort_fields ): """ - get files by project accession - :return: + TODO: @selva this function and command line should be reimplemented. + TODO: The idea is that the user can type a keyword or keywords and filters and get all the files projects in + TODO: JSON. Please remember to update the README. """ project = Project() logging.info( - project.get_files_by_accession( - accession, filter, page_size, page, sort_direction, sort_conditions + project.search_by_keywords_and_filters( + keyword, filter, page_size, page, sort_direction, sort_fields ) ) + if __name__ == "__main__": main() diff --git a/pridepy/project/project.py b/pridepy/project/project.py index 81f1bc8..8481c86 100644 --- a/pridepy/project/project.py +++ b/pridepy/project/project.py @@ -66,9 +66,7 @@ def get_by_accession(self, accession): response = Util.get_api_call(request_url, headers) return response.json() - def get_files_by_accession( - self, accession - ): + def get_files_by_accession(self, accession): """ search PRIDE project's files by accession :param accession: PRIDE project accession @@ -141,16 +139,10 @@ def search_by_keywords_and_filters( if query_filter: request_url = request_url + "filter=" + query_filter + "&" - request_url = ( - request_url + "pageSize=" + str(page_size) + "&page=" + str(page) + "&" - ) + request_url = request_url + "pageSize=" + str(page_size) + "&page=" + str(page) + "&" request_url = ( - request_url - + "sortDirection=" - + sort_direction - + "&sortFields=" - + sort_fields + request_url + "sortDirection=" + sort_direction + "&sortFields=" + sort_fields ) headers = {"Accept": "application/JSON"} @@ -164,8 +156,6 @@ def get_project_file_names( if user and password: files = self.get_private_files_by_accession(accession, 
user, password) else: - files = self.get_files_by_accession( - accession, "", 100, 0, "ASC", "fileName" - )["list"] + files = self.get_files_by_accession(accession, "", 100, 0, "ASC", "fileName")["list"] return [file["fileName"] for file in files] diff --git a/pridepy/tests/test_raw_files.py b/pridepy/tests/test_raw_files.py index 3fdab73..631b8cf 100644 --- a/pridepy/tests/test_raw_files.py +++ b/pridepy/tests/test_raw_files.py @@ -28,3 +28,12 @@ def test_get_raw_file_path_prefix(self): """ raw = Files() assert raw.get_submitted_file_path_prefix("PXD008644") == "2018/10/PXD008644" + + def test_get_all_category_file_list(self): + + raw = Files() + result = raw.get_all_category_file_list("PXD008644", "RAW") + assert len(result) == 2 + + result = raw.get_all_category_file_list("PXD008644", "SEARCH") + assert len(result) == 2 diff --git a/pridepy/tests/test_search.py b/pridepy/tests/test_search.py index 2cd310a..a61e83c 100644 --- a/pridepy/tests/test_search.py +++ b/pridepy/tests/test_search.py @@ -17,11 +17,18 @@ def test_search_projects(self): """ project = Project() - result = project.search_by_keywords_and_filters(keyword="PXD009476",query_filter="", - page_size=100,page=0 , sort_direction="DESC", - sort_fields="accession") + result = project.search_by_keywords_and_filters( + keyword="PXD009476", + query_filter="", + page_size=100, + page=0, + sort_direction="DESC", + sort_fields="accession", + ) assert len(result) > 0 # Search should return at least one result - assert any(r["accession"] == "PXD009476" for r in result) # Search should return the queried project + assert any( + r["accession"] == "PXD009476" for r in result + ) # Search should return the queried project result = project.get_projects(77, 0, "ASC", "submission_date") assert len(result) == 77 @@ -29,18 +36,21 @@ def test_search_projects(self): result = project.get_by_accession("PXD009476") assert result["accession"] == "PXD009476" - assert len(project.get_files_by_accession( - "PXD009476", - )) == 
100 + assert ( + len( + project.get_files_by_accession( + "PXD009476", + ) + ) + == 100 + ) def test_status_dataset(self): files = Files() accession = "PXD044389" - project_status = Util.get_api_call( - files.API_BASE_URL + "/status/{}".format(accession) - ) + project_status = Util.get_api_call(files.API_BASE_URL + "/status/{}".format(accession)) public_project = False if project_status.status_code == 200: if project_status.text == "PRIVATE": @@ -48,7 +58,5 @@ def test_status_dataset(self): elif project_status.text == "PUBLIC": public_project = False else: - raise Exception( - "Dataset {} is not present in PRIDE Archive".format(accession) - ) + raise Exception("Dataset {} is not present in PRIDE Archive".format(accession)) logging.debug(f"Public project: {public_project}") diff --git a/pridepy/util/api_handling.py b/pridepy/util/api_handling.py index f05a91d..cdf7ffc 100644 --- a/pridepy/util/api_handling.py +++ b/pridepy/util/api_handling.py @@ -33,16 +33,14 @@ def get_api_call(url, headers=None): response = requests.get(url, headers=headers) if (not response.ok) or response.status_code != 200: - raise Exception( - "PRIDE API call {} response: {}".format(url, response.status_code) - ) + raise Exception("PRIDE API call {} response: {}".format(url, response.status_code)) return response @staticmethod @sleep_and_retry @limits(calls=1000, period=50) async def stream_response_to_file( - out_file, total_records, regex_search_pattern, url, headers=None + out_file, total_records, regex_search_pattern, url, headers=None ): # Initialize the progress bar with tqdm(total=total_records, unit_scale=True) as pbar: @@ -56,7 +54,7 @@ async def stream_response_to_file( # Iterate over the streaming content line by line async for line in response.aiter_lines(): if ( - line + line ): # Avoid printing empty lines (common with text/event-stream) cfile.write(line + "\n") # Check if the pattern exists in the string @@ -72,9 +70,9 @@ async def stream_response_to_file( @sleep_and_retry 
@limits(calls=1000, period=50) def read_json_stream( - api_url: str, - headers: Optional[Dict[str, str]] = None, - params: Optional[Dict[str, str]] = None, + api_url: str, + headers: Optional[Dict[str, str]] = None, + params: Optional[Dict[str, str]] = None, ) -> Optional[List[Dict[str, Any]]]: """ Read a JSON stream from the given API URL. @@ -85,9 +83,7 @@ def read_json_stream( """ try: lines = [] # List to store lines for decoding - with get( - api_url, headers=headers, params=params, stream=True, timeout=30 - ) as response: + with get(api_url, headers=headers, params=params, stream=True, timeout=30) as response: response.raise_for_status() # Raise an HTTPError for bad responses print("Connected to the streaming API. Fetching data...")