diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..8dbb36353 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: +- repo: git@github.com:Yelp/detect-secrets + rev: v0.13.1 + hooks: + - id: detect-secrets + args: ['--baseline', '.secrets.baseline'] \ No newline at end of file diff --git a/.secrets.baseline b/.secrets.baseline new file mode 100644 index 000000000..9b680293c --- /dev/null +++ b/.secrets.baseline @@ -0,0 +1,248 @@ +{ + "exclude": { + "files": null, + "lines": null + }, + "generated_at": "2021-04-27T18:14:46Z", + "plugins_used": [ + { + "name": "AWSKeyDetector" + }, + { + "name": "ArtifactoryDetector" + }, + { + "base64_limit": 4.5, + "name": "Base64HighEntropyString" + }, + { + "name": "BasicAuthDetector" + }, + { + "name": "CloudantDetector" + }, + { + "hex_limit": 3, + "name": "HexHighEntropyString" + }, + { + "name": "IbmCloudIamDetector" + }, + { + "name": "IbmCosHmacDetector" + }, + { + "name": "JwtTokenDetector" + }, + { + "keyword_exclude": null, + "name": "KeywordDetector" + }, + { + "name": "MailchimpDetector" + }, + { + "name": "PrivateKeyDetector" + }, + { + "name": "SlackDetector" + }, + { + "name": "SoftlayerDetector" + }, + { + "name": "StripeDetector" + }, + { + "name": "TwilioKeyDetector" + } + ], + "results": { + "docs/API/Users_Guide/Additional_Examples.md": [ + { + "hashed_secret": "d9bbc424159a2c5ef89902e02caa2be6cff1817c", + "is_secret": false, + "is_verified": false, + "line_number": 89, + "type": "Hex High Entropy String" + }, + { + "hashed_secret": "34d2afab6dc6f76855d5f83cc2dbed2efd99ddfb", + "is_secret": false, + "is_verified": false, + "line_number": 110, + "type": "Hex High Entropy String" + } + ], + "docs/API/Users_Guide/Getting_Started.md": [ + { + "hashed_secret": "05b339a29ce9a548e1efbe032131cfcdde6727d8", + "is_secret": false, + "is_verified": false, + "line_number": 91, + "type": "Hex High Entropy String" + } + ], + 
"docs/API/Users_Guide/Search_and_Retrieval.md": [ + { + "hashed_secret": "e9813ab1054cec827e517760617ac33bd97af1f3", + "is_secret": false, + "is_verified": false, + "line_number": 307, + "type": "Hex High Entropy String" + }, + { + "hashed_secret": "22ecc0fefa20bdbbdff52cdd35fad2974db02e56", + "is_secret": false, + "is_verified": false, + "line_number": 353, + "type": "Hex High Entropy String" + }, + { + "hashed_secret": "8b0471397a6dec83405ee2ae28edde87d02271a0", + "is_secret": false, + "is_verified": false, + "line_number": 389, + "type": "Hex High Entropy String" + }, + { + "hashed_secret": "14f469780554b9dd75a3d03a267b3be725582499", + "is_secret": false, + "is_verified": false, + "line_number": 422, + "type": "Hex High Entropy String" + }, + { + "hashed_secret": "6b505580b5bc4fab5bd4b2e0e10d621fea0614ee", + "is_secret": false, + "is_verified": false, + "line_number": 465, + "type": "Hex High Entropy String" + }, + { + "hashed_secret": "98895ba87fa2e568b0f48b1afbddb9b45d8c8ec3", + "is_secret": false, + "is_verified": false, + "line_number": 482, + "type": "Hex High Entropy String" + }, + { + "hashed_secret": "20b3183bf913ad4ef0d5ecd59c1de5437a7e8a04", + "is_secret": false, + "is_verified": false, + "line_number": 499, + "type": "Hex High Entropy String" + }, + { + "hashed_secret": "26b3f1e8d57ea8c36c9b83a355fd90b4c5befcd0", + "is_secret": false, + "is_verified": false, + "line_number": 516, + "type": "Hex High Entropy String" + }, + { + "hashed_secret": "d425d30f9f7abd3b38f825f91f83473e00275899", + "is_secret": false, + "is_verified": false, + "line_number": 2354, + "type": "Hex High Entropy String" + } + ], + "docs/API/Users_Guide/Submission.md": [ + { + "hashed_secret": "93f5b94e262e685fee4a419438d60e82fafaf491", + "is_secret": false, + "is_verified": false, + "line_number": 2293, + "type": "Hex High Entropy String" + }, + { + "hashed_secret": "313355a8530a54c23567f7bbedd9f804bb269820", + "is_secret": false, + "is_verified": false, + "line_number": 2386, + 
"type": "Hex High Entropy String" + }, + { + "hashed_secret": "b47ceb76f45ab4e8b52da270875d85fdd9b7fc33", + "is_secret": false, + "is_verified": false, + "line_number": 2448, + "type": "Hex High Entropy String" + } + ], + "docs/API/Users_Guide/System_Information.md": [ + { + "hashed_secret": "ecb0642a6305ce066c2675dac1562535b530e5b0", + "is_secret": false, + "is_verified": false, + "line_number": 70, + "type": "Hex High Entropy String" + } + ], + "docs/Data_Submission_Portal/Users_Guide/Best_Practices.md": [ + { + "hashed_secret": "87d7b59b5af7c86ea71b60ed042d1e62175136fb", + "is_secret": false, + "is_verified": false, + "line_number": 85, + "type": "Hex High Entropy String" + } + ], + "docs/Data_Submission_Portal/Users_Guide/Data_Submission_Walkthrough.md": [ + { + "hashed_secret": "87d7b59b5af7c86ea71b60ed042d1e62175136fb", + "is_secret": false, + "is_verified": false, + "line_number": 651, + "type": "Hex High Entropy String" + } + ], + "docs/Data_Submission_Portal/Users_Guide/Data_Upload_UG.md": [ + { + "hashed_secret": "dbaf99f4789432509c1313aba5256a6ea4ddb986", + "is_secret": false, + "is_verified": false, + "line_number": 482, + "type": "Hex High Entropy String" + }, + { + "hashed_secret": "471568dffba5b4873ca000b88049046d5aa687d4", + "is_secret": false, + "is_verified": false, + "line_number": 553, + "type": "Hex High Entropy String" + }, + { + "hashed_secret": "87d7b59b5af7c86ea71b60ed042d1e62175136fb", + "is_secret": false, + "is_verified": false, + "line_number": 765, + "type": "Hex High Entropy String" + } + ], + "theme/css/font-awesome-4.5.css": [ + { + "hashed_secret": "51de2b835bd35a67eb32dbcd3d77d4b96e5aa39d", + "is_secret": false, + "is_verified": false, + "line_number": 1734, + "type": "Secret Keyword" + } + ], + "theme/css/font-awesome.min-4.5.css": [ + { + "hashed_secret": "3e4128ccd5d7d597667230af6326b3387fd18545", + "is_secret": false, + "is_verified": false, + "line_number": 4, + "type": "Secret Keyword" + } + ] + }, + "version": "0.13.1", + 
"word_list": { + "file": null, + "hash": null + } +} diff --git a/API_UG.yml b/API_UG.yml index 83d08700a..91b8c60d0 100644 --- a/API_UG.yml +++ b/API_UG.yml @@ -16,6 +16,7 @@ pages: - BAM Slicing: 'API/Users_Guide/BAM_Slicing.md' - Submission: 'API/Users_Guide/Submission.md' - Python Examples: 'API/Users_Guide/Python_Examples.md' + - GraphQL Examples: 'API/Users_Guide/GraphQL_Examples.md' - System Information: 'API/Users_Guide/System_Information.md' - Additional Examples: 'API/Users_Guide/Additional_Examples.md' - "Appendix A: Available Fields": 'API/Users_Guide/Appendix_A_Available_Fields.md' diff --git a/Adding_write_to_file.md b/Adding_write_to_file.md new file mode 100644 index 000000000..39a5cb741 --- /dev/null +++ b/Adding_write_to_file.md @@ -0,0 +1,14 @@ +# Changes made 2-5-2021 + +An extra script added to the following scripts, to redirect a copy of the output to a file: + +``` +- A Basic Query + +- A Filtered Query + +- Complex Filters + +- Basic Troubleshooting +``` +Also the ```.PY``` files in ```scripts``` folder were updated accordingly. 
diff --git a/Data_Submission_Portal_UG.yml b/Data_Submission_Portal_UG.yml index b00ffc3d2..0f9f5f4c5 100644 --- a/Data_Submission_Portal_UG.yml +++ b/Data_Submission_Portal_UG.yml @@ -9,19 +9,13 @@ copyright: "© 2015-2016" theme_dir: theme pages: - Data Submission Portal: - - Getting Started: 'Data_Submission_Portal/Users_Guide/Getting_Started.md' - - Submission Workflow: 'Data_Submission_Portal/Users_Guide/Submission_Workflow.md' - - Authentication: 'Data_Submission_Portal/Users_Guide/Authentication.md' - - Homepage: 'Data_Submission_Portal/Users_Guide/Homepage.md' - - Dashboard: 'Data_Submission_Portal/Users_Guide/Dashboard.md' - - Upload Data: 'Data_Submission_Portal/Users_Guide/Data_Upload_UG.md' - - Submit Data: 'Data_Submission_Portal/Users_Guide/Submit_Data.md' - - Release Data: 'Data_Submission_Portal/Users_Guide/Release_Data.md' - - Transactions: 'Data_Submission_Portal/Users_Guide/Transactions.md' - - Browse Data: 'Data_Submission_Portal/Users_Guide/Browse_Data.md' - - Pre-Release Data Review: 'Data_Submission_Portal/Users_Guide/Pre_Release_QC.md' - - Best Practices: 'Data_Submission_Portal/Users_Guide/Best_Practices.md' + - Before Submitting Data to the GDC Portal: 'Data_Submission_Portal/Users_Guide/Checklist.md' + - Data Submission Overview: 'Data_Submission_Portal/Users_Guide/Data_Submission_Overview.md' + - Data Submission Portal: 'Data_Submission_Portal/Users_Guide/Data_Submission_Process.md' + - Data Upload Walkthrough: 'Data_Submission_Portal/Users_Guide/Data_Submission_Walkthrough.md' + - Pre-Release Data Portal: 'Data_Submission_Portal/Users_Guide/Pre_Release_QC.md' - Release Notes: 'Data_Submission_Portal/Release_Notes/Data_Submission_Portal_Release_Notes.md' + extra: project_root_dir: '/' project_org: 'GDC' diff --git a/Data_Transfer_Tool_UG.yml b/Data_Transfer_Tool_UG.yml index d8e7a47f4..a474a63af 100644 --- a/Data_Transfer_Tool_UG.yml +++ b/Data_Transfer_Tool_UG.yml @@ -10,12 +10,10 @@ theme_dir: theme pages: - Data Transfer Tool: - 
Getting Started: 'Data_Transfer_Tool/Users_Guide/Getting_Started.md' - - Accessing Built-in Help: 'Data_Transfer_Tool/Users_Guide/Accessing_Built-in_Help.md' - Preparing for Data Download and Upload: 'Data_Transfer_Tool/Users_Guide/Preparing_for_Data_Download_and_Upload.md' - - Data Download and Upload - Command Line: 'Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload.md' - - Data Download - UI: 'Data_Transfer_Tool/Users_Guide/Data_Download_Upload_UI.md' - - Key Terms: 'Data_Transfer_Tool/Users_Guide/Appendix_A_-_Key_Terms.md' + - Data Transfer Tool Command Line Documentation: 'Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload.md' - Release Notes - Command Line: 'Data_Transfer_Tool/Release_Notes/DTT_Release_Notes.md' + - Data Transfer Tool UI Documentation: 'Data_Transfer_Tool/Users_Guide/Data_Download_DTT_UI.md' - Release Notes - UI: 'Data_Transfer_Tool/Release_Notes/DTT_UI_Release_Notes.md' extra: diff --git a/Data_UG.yml b/Data_UG.yml index 449054797..30c08d295 100644 --- a/Data_UG.yml +++ b/Data_UG.yml @@ -19,6 +19,7 @@ pages: - "Bioinformatics Pipeline: miRNA Analysis": 'Data/Bioinformatics_Pipelines/miRNA_Pipeline.md' - "Bioinformatics Pipeline: Copy Number Variation": 'Data/Bioinformatics_Pipelines/CNV_Pipeline.md' - "Bioinformatics Pipeline: Methylation Liftover": 'Data/Bioinformatics_Pipelines/Methylation_LO_Pipeline.md' + - Aligned Reads Summary Metrics: 'Data/Bioinformatics_Pipelines/Aligned_reads_summary_metrics.md' - Release Notes: 'Data/Release_Notes/Data_Release_Notes.md' extra: project_root_dir: '/' diff --git a/README.md b/README.md index 132ed9c3f..e82cab829 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,97 @@ ![](https://gdc.cancer.gov/sites/all/themes/gdc_bootstrap/logo.png) -# THIS PROJECT HAS MOVED +# GDC Open Source code -Please join us at https://github.com/NCI-GDC/gdc-docs +======= +GDC is Open Source, Github Repositories containing source code of GDC Applications can be found on [GDC GitHub Organization 
page](https://github.com/NCI-GDC/). +- GDC Data Portal: https://github.com/NCI-GDC/portal-ui +- GDC Legacy Archive: https://github.com/NCI-GDC/portal-ui-legacy +- GDC Data Transfer Tool: https://github.com/NCI-GDC/gdc-client +- GDC Data Dictionary: https://github.com/NCI-GDC/gdcdictionary +- GDC Data Model: https://github.com/NCI-GDC/gdcdatamodel +- GDC Psqlgraph: https://github.com/NCI-GDC/psqlgraph + +# Support + +Please direct technical questions to [GDC Support](https://gdc.cancer.gov/support). + +# GDC Documentation Site + +### Technology + + - Python 2.6, 2.7, 3.3, 3.4 and 3.5. + - [mkdocs](http://www.mkdocs.org/) + - [BSCodeTabs for mkdocs](https://github.com/mikecules/MarkdownBSCodeTabs#for-use-in-mkdocs) + +### Install & Run + +(Optional) Set up virtualenv: + +- [Install virtualenv](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) +- `python -m virtualenv venv` +- `source venv/bin/activate` +- Run the installation commands below +- To leave the virtual environment: `deactivate` + +Install GDC-docs: + + - `pip install -r requirements.txt` + - `mkdocs serve` (optionally set port `--dev-addr=0.0.0.0:`) + +### Build + + - `mkdocs build --clean` + +### Repository Conventions + +- All Shared content in the "Commons" directory +- One Directory per GDC product (API, Data_Portal, Data_Submission_Portal, Data_Transfer_Tool) +- Each GDC product has a Users_Guide and Release_Notes directory + +### Linking + +To another documentation page +``` +[Authentication and Authorization](../../Commons/Authentication.md) +``` + +Inside another documentation page + +``` +[Authentication and Authorization](../../Commons/Authentication.md#internal-section) +``` + +### Adding icons and PDFs +The convention for this, when updating mkdocs.yml is the following: +- : 'index.md' +example: +- fa-file-pdf-o Download PDF /API/PDF/API_UG.pdf: 'index.md' + +### Documentation Conventions + +A detailed list of all conventions is available on [GDC 
Website](https://gdc.cancer.gov/conventions-page) + + +### Build PDF + +Install mkdocs2pandoc, following instructions available here: +``` +https://github.com/jgrassler/mkdocs-pandoc +``` + +Prepare a yml file dedicated to your Userguide, using Data_Portal_UG.yml as an example. + +Run the following commands to: +* Convert the User Guide to Pandoc: +* Tweak the pandoc file +* Build a PDF + +``` +mkdocs2pandoc -f Data_Portal_UG.yml -o docs/Data_Portal/PDF/Data_portal_UG.pd +sed -i -e 's/# / /g' docs/Data_Portal/PDF/Data_portal_UG.pd +sed -i -e 's/### /## /g' docs/Data_Portal/PDF/Data_portal_UG.pd +sed -i -e 's/\/site\//\/docs\//g' docs/Data_Portal/PDF/Data_portal_UG.pd +pandoc --toc -V documentclass=report -V geometry:"top=2cm, bottom=1.5cm, left=1cm, right=1cm" -f markdown+grid_tables+table_captions -o docs/Data_Portal/PDF/Data_portal_UG.pdf docs/Data_Portal/PDF/Data_portal_UG.pd +``` diff --git a/docs/API/Release_Notes/API_Release_Notes.md b/docs/API/Release_Notes/API_Release_Notes.md index 232ccf13c..3813d9158 100644 --- a/docs/API/Release_Notes/API_Release_Notes.md +++ b/docs/API/Release_Notes/API_Release_Notes.md @@ -3,6 +3,18 @@ | Version | Date | |---|---| +| [v3.3.0](API_Release_Notes.md#v330) | May 17, 2021 | +| [v3.0.0](API_Release_Notes.md#v300) | August 14, 2020 | +| [v2.1.2](API_Release_Notes.md#v212) | May 7, 2020 | +| [v2.1.0](API_Release_Notes.md#v210) | March 10, 2020 | +| [v2.0.0](API_Release_Notes.md#v200) | January 30, 2020 | +| [v1.23.0](API_Release_Notes.md#v1230) | November 6, 2019 | +| [v1.22.0](API_Release_Notes.md#v1220) | July 31, 2019 | +| [v1.21.0](API_Release_Notes.md#v1210) | June 5, 2019 | +| [v1.20.0](API_Release_Notes.md#v1200) | April 17, 2019 | +| [v1.19.0](API_Release_Notes.md#v1190) | February 20, 2019 | +| [v1.18.0](API_Release_Notes.md#v1180) | December 18, 2018 | +| [v1.17.0](API_Release_Notes.md#v1170) | November 7, 2018 | | [v1.16.0](API_Release_Notes.md#v1160) | September 27, 2018 | | [v1.15.0](API_Release_Notes.md#v1150) | 
August 23, 2018 | | [v1.14.1](API_Release_Notes.md#v1141) | May 21, 2018 | @@ -21,6 +33,293 @@ | [v1.1.0](API_Release_Notes.md#v110) | May 25, 2016 | | [v1.0.1](API_Release_Notes.md#v101) | May 16, 2016 | +## v3.3.0 + +* __GDC Product__: Application Programming Interface (API) +* __Release Date__: May 17, 2021 + +### New Features and Changes + +* Features to better support batch tracking for submitted data were added. + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Fields are not counted as missing if parent field is also missing. This may occur with queries of nested fields in the Data Portal Advanced Search or an API query using a filter. This behavior could impact results reported using search parameters of "IS MISSING" or "NOT MISSING". +* Certain very large API requests will time out. It is recommended to break up very large requests into a series of smaller requests. + + +## v3.0.0 + +* __GDC Product__: Application Programming Interface (API) +* __Release Date__: August 14, 2020 + +### New Features and Changes + +* Enhancements were made to increase performance. + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Fields are not counted as missing if parent field is also missing. This may occur with queries of nested fields in the Data Portal Advanced Search or an API query using a filter. This behavior could impact results reported using search parameters of "IS MISSING" or "NOT MISSING". +* Certain very large API requests will time out. It is recommended to break up very large requests into a series of smaller requests. + +## v2.1.2 + +* __GDC Product__: Application Programming Interface (API) +* __Release Date__: May 7, 2020 + +### New Features and Changes + +* An update to improve usability in the homepage quicksearch + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Fields are not counted as missing if parent field is also missing. 
This may occur with queries of nested fields in the Data Portal Advanced Search or an API query using a filter. This behavior could impact results reported using search parameters of "IS MISSING" or "NOT MISSING". +* Certain very large API requests will time out. It is recommended to break up very large requests into a series of smaller requests. + +## v2.1.0 + +* __GDC Product__: Application Programming Interface (API) +* __Release Date__: March 10, 2020 + +### New Features and Changes + +* New data dictionary changes. + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Fields are not counted as missing if parent field is also missing. This may occur with queries of nested fields in the Data Portal Advanced Search or an API query using a filter. This behavior could impact results reported using search parameters of "IS MISSING" or "NOT MISSING". +* Certain very large API requests will time out. It is recommended to break up very large requests into a series of smaller requests. + +## v2.0.0 + +* __GDC Product__: Application Programming Interface (API) +* __Release Date__: January 30, 2020 + +### New Features and Changes + +* API code now uses Python 3. + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Fields are not counted as missing if parent field is also missing. This may occur with queries of nested fields in the Data Portal Advanced Search or an API query using a filter. This behavior could impact results reported using search parameters of "IS MISSING" or "NOT MISSING". +* Certain very large API requests will time out. It is recommended to break up very large requests into a series of smaller requests. 
+ +## v1.23.0 + +* __GDC Product__: Application Programming Interface (API) +* __Release Date__: November 6, 2019 + +### New Features and Changes + +* QC Tests added for Submission +* BAM slicing is now supported for unmapped reads +* API now includes data from molecular_test and follow_up nodes. This impacts what is displayed on the GDC Data Portal +* Better handling of concurrent transactions +* CIViC annotations now included on the ssms endpoint. You can read more about CIViC annotations [here](https://civicdb.org/home) + +### Bugs Fixed Since Last Release + +* Fixed API memory leak +* Fixed data offset issue returned by clinical.tar endpoint + +### Known Issues and Workarounds + +* Fields are not counted as missing if parent field is also missing. This may occur with queries of nested fields in the Data Portal Advanced Search or an API query using a filter. This behavior could impact results reported using search parameters of "IS MISSING" or "NOT MISSING". +* Certain very large API requests will time out. It is recommended to break up very large requests into a series of smaller requests. + + + +## v1.22.0 + +* __GDC Product__: Application Programming Interface (API) +* __Release Date__: July 31, 2019 + +### New Features and Changes + +* Updated BCR Clinical XML parser code to support future indexing of additional clinical data. 
Parser code can be found [here](https://github.com/NCI-GDC/gdcdatamodel/tree/develop/gdcdatamodel/xml_mappings) + + - Added a mapping for: + - `ann_arbor_b_symptoms` + - `ann_arbor_extranodal_involvement` + - `ajcc_pathologic_t` + - `ajcc_pathologic_n` + - `ajcc_pathologic_m` + - `ajcc_clinical_t` + - `ajcc_clinical_n` + - `ajcc_clinical_m` + - `ajcc_staging_system_edition` + - `figo_stage` + - `ajcc_clinical_stage` + - `primary_gleason_grade` + - `secondary_gleason_grade` + - `igcccg_stage` + - `masaoka_stage` + + - Updated the mapping for: + - `primary_diagnosis`, `morphology`, `tissue_or_organ_of_origin`, and `site_of_resection_or_biopsy` + +### Bugs Fixed Since Last Release + +* Fixed a bug preventing multipart uploads + +### Known Issues and Workarounds + +* Fields are not counted as missing if parent field is also missing. This may occur with queries of nested fields in the Data Portal Advanced Search or an API query using a filter. This behavior could impact results reported using search parameters of "IS MISSING" or "NOT MISSING". +* Certain very large API requests will time out. It is recommended to break up very large requests into a series of smaller requests. + + +## v1.21.0 + +* __GDC Product__: Application Programming Interface (API) +* __Release Date__: June 5, 2019 + +### New Features and Changes + +* Mutation indexer update to accommodate data model changes +* Updates to when in the release cycle downloaders and submitters have access to files + + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Fields are not counted as missing if parent field is also missing. This may occur with queries of nested fields in the Data Portal Advanced Search or an API query using a filter. This behavior could impact results reported using search parameters of "IS MISSING" or "NOT MISSING". +* Certain very large API requests will time out. It is recommended to break up very large requests into a series of smaller requests. 
+ + +## v1.20.0 + +* __GDC Product__: Application Programming Interface (API) +* __Release Date__: April 17, 2019 + +### New Features and Changes + +* Disallowed creation of multiple file versions in the same data release +* Improved API concurrency performance and addressed random failures/4XX/5XX responses +* Improved API/Portal performance for querying large sets of data from the Elasticsearch indices +* Updated BCR Clinical XML parser code to support future indexing of additional clinical data. Parser code can be found [here](https://github.com/NCI-GDC/gdcdatamodel/tree/develop/gdcdatamodel/xml_mappings) + - Updated the mapping for: + - `days_to_death` + - `days_to_birth` + - `vital_status` + - `prior_malignancy` and `synchronous_malignancy` + - Added a mapping for: + - `pack_years_smoked` + - `prior_treatment` + - `age_at_index` + - `days_to_diagnosis` + - `icd_10_code` + - `year_of_diagnosis` + - Removed calculation for: + - `cigarettes_per_day` + - `year_of_birth` + - `year_of_death` + - `bmi` + - Updated XML Parser to pull the most up-to-date survival information + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Fields are not counted as missing if parent field is also missing. This may occur with queries of nested fields in the Data Portal Advanced Search or an API query using a filter. This behavior could impact results reported using search parameters of "IS MISSING" or "NOT MISSING". +* Certain very large API requests will time out. It is recommended to break up very large requests into a series of smaller requests. 
+ + + +## v1.19.0 + +* __GDC Product__: Application Programming Interface (API) +* __Release Date__: February 20, 2019 + +### New Features and Changes + +* Added API features to support controlled access DAVE +* Updated API query endpoints to handle filtering of queries based on tokens +* Created login notification Endpoint +* Added hashing and logging for similar ES queries + +### Bugs Fixed Since Last Release + +* Fixed bug where quick search ES query grows with each request +* Fixed bug where new file versions could be created when exactly the same existing metadata is uploaded +* Fixed bug where submitting to specific projects produced error that data already existed + +### Known Issues and Workarounds + +* Fields are not counted as missing if parent field is also missing. This may occur with queries of nested fields in the Data Portal Advanced Search or an API query using a filter. This behavior could impact results reported using search parameters of "IS MISSING" or "NOT MISSING". +* Certain very large API requests will time out. It is recommended to break up very large requests into a series of smaller requests. + + +## v1.18.0 + +* __GDC Product__: Application Programming Interface (API) +* __Release Date__: December 18, 2018 + +### New Features and Changes + +* Update to auth for GDC Pre-Release Data Portal + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Fields are not counted as missing if parent field is also missing. This may occur with queries of nested fields in the Data Portal Advanced Search or an API query using a filter. This behavior could impact results reported using search parameters of "IS MISSING" or "NOT MISSING". +* Certain very large API requests will time out. It is recommended to break up very large requests into a series of smaller requests. 
+ + + +## v1.17.0 + +* __GDC Product__: Application Programming Interface (API) +* __Release Date__: November 7, 2018 + +### New Features and Changes + +* Created new index cnv_centric +* Created new index cnv_occurrence_centric +* Created new REST API endpoints for CNV +* Created mapping from aliquot to case for occurrence on cnv_centric +* Created new graphql endpoints for CNV +* Updated index case_centric to add cnv +* Updated index gene_centric to add cnv + +### Bugs Fixed Since Last Release + +* Fixed bug to prevent users from deleting files in state submitted or released + +### Known Issues and Workarounds + +* Fields are not counted as missing if parent field is also missing. This may occur with queries of nested fields in the Data Portal Advanced Search or an API query using a filter. This behavior could impact results reported using search parameters of "IS MISSING" or "NOT MISSING". +* Certain very large API requests will time out. It is recommended to break up very large requests into a series of smaller requests. + + + ## v1.16.0 * __GDC Product__: Application Programming Interface (API) diff --git a/docs/API/Users_Guide/Additional_Examples.md b/docs/API/Users_Guide/Additional_Examples.md index 86057cbf0..8b852f61b 100644 --- a/docs/API/Users_Guide/Additional_Examples.md +++ b/docs/API/Users_Guide/Additional_Examples.md @@ -668,7 +668,7 @@ curl 'https://api.gdc.cancer.gov/cases?filters=%7b%0d%0a+++%22op%22+%3a+%22%3d%2 #### Example: Filter using a range -This is an example of filtering for age at diagnosis. The request is for cases where the age at diagnosis is between 40 and 70 years. *Note:* `age_at_diagnosis` is expressed in days. +This is an example of filtering for age at diagnosis. The request is for cases where the age at diagnosis is between 40 and 70 years. >**Note:** `age_at_diagnosis` is expressed in days. 
```Filter { diff --git a/docs/API/Users_Guide/Appendix_C_Format_of_Submission_Requests_and_Responses.md b/docs/API/Users_Guide/Appendix_C_Format_of_Submission_Requests_and_Responses.md index e831cc09b..9de5b8060 100644 --- a/docs/API/Users_Guide/Appendix_C_Format_of_Submission_Requests_and_Responses.md +++ b/docs/API/Users_Guide/Appendix_C_Format_of_Submission_Requests_and_Responses.md @@ -123,4 +123,4 @@ API responses will contain a status for each entity specified in the request: **`error`**: The desired transaction was not sucessful, and the transaction was aborted because of this entity. This entity did not pass validation or an internal error occured when attempting to complete the transaction. The error state will be accompanied by a list of errors recorded about the entity (see label-error-messages). -**Note:** GDC API requests are transactional. An error with processing a node specified in the transaction will abort the transaction and will result in no changes being applied for any node involved in the transaction. +>**Note:** GDC API requests are transactional. An error with processing a node specified in the transaction will abort the transaction and will result in no changes being applied for any node involved in the transaction. diff --git a/docs/API/Users_Guide/BAM_Slicing.md b/docs/API/Users_Guide/BAM_Slicing.md index 4462549c3..f072d4c28 100644 --- a/docs/API/Users_Guide/BAM_Slicing.md +++ b/docs/API/Users_Guide/BAM_Slicing.md @@ -21,11 +21,11 @@ Please note the following: The following query parameters and JSON fields are supported: | Description | Query Parameter | JSON Field | Query format | -|---|---|---| +|---|---|---|---| | entire chromosome, or a position or region on the chromosome, specified using chromosomal coordinates | region | regions | region=(:(-)?)? 
| | region specified using a [HGNC](http://www.genenames.org/) / [GENCODE v22](http://www.gencodegenes.org/) gene name | gencode | gencode | gencode= | -**NOTE:** The successfully sliced BAM will contain all reads that overlap (entirely or partially) with the specified region or gene. It is possible to specify an open-ended region, e.g. `chr2:10000`, which would return all reads that (completely or partially) overlap with the region of chromosome 2 from position 10,000 to the end of the chromosome. +>**NOTE:** The successfully sliced BAM will contain all reads that overlap (entirely or partially) with the specified region or gene. It is possible to specify an open-ended region, e.g. `chr2:10000`, which would return all reads that (completely or partially) overlap with the region of chromosome 2 from position 10,000 to the end of the chromosome. ### JSON Schema @@ -110,6 +110,32 @@ HTTP/1.1 206 ``` +## Examples: Specifying unmapped reads + +Unmapped reads are found in GDC BAM files. You may request these reads by using the following commands. + +```GET +token=$( +``` + After downloading, the sliced BAM file can be converted to SAM using the following command if `samtools` is installed on the user's system: diff --git a/docs/API/Users_Guide/Data_Analysis.md b/docs/API/Users_Guide/Data_Analysis.md index d987f538c..69618e7e2 100644 --- a/docs/API/Users_Guide/Data_Analysis.md +++ b/docs/API/Users_Guide/Data_Analysis.md @@ -6,22 +6,31 @@ The GDC DAVE tools use the same API as the rest of the Data Portal and takes adv The following data analysis endpoints are available from the GDC API: -| __Endpoint__ | __Description__ | -|---|---| -| __/genes__ | Allows users to access summary information about each gene using its Ensembl ID. | -| __/ssms__ | Allows users to access information about each somatic mutation. For example, a `ssm` would represent the transition of C to T at position 52000 of chromosome 1. 
| -| __/ssm_occurrences__ | A `ssm` entity as applied to a single instance (case). An example of a `ssm occurrence` would be that the transition of C to T at position 52000 of chromosome 1 occurred in patient TCGA-XX-XXXX. | -|__/analysis/top_cases_counts_by_genes__| Returns the number of cases with a mutation in each gene listed in the gene_ids parameter for each project. Note that this endpoint cannot be used with the `format` or `fields` parameters.| -|__/analysis/top_mutated_genes_by_project__| Returns a list of genes that have the most mutations within a given project. | -|__/analysis/top_mutated_cases_by_gene__| Generates information about the cases that are most affected by mutations in a given number of genes | -|__/analysis/mutated_cases_count_by_project__| Returns counts for the number of cases that have associated `ssm` data in each project. The number of affected cases can be found under "case_with_ssm": {"doc_count": $case_count}.| -|__/analysis/survival__| Survival plots can be generated in the Data Portal for different subsets of data, based upon many query factors such as variants, disease type and projects. This endpoint can be used to programmatically retrieve the raw data to generate these plots and apply different filters to the data. (see Survival Example)| +|__Node__| __Endpoint__ | __Description__ | +|---|---|---| +|__Genes__| __/genes__ | Allows users to access summary information about each gene using its Ensembl ID. | +|__SSMS__| __/ssms__ | Allows users to access information about each somatic mutation. For example, a `ssm` would represent the transition of C to T at position 52000 of chromosome 1. | +||__/ssms/``__|Get information about a specific ssm using a ``, often supplemented with the `expand` option to show fields of interest. | +|| __/ssm_occurrences__ | A `ssm` entity as applied to a single instance (case). 
An example of a `ssm occurrence` would be that the transition of C to T at position 52000 of chromosome 1 occurred in patient TCGA-XX-XXXX. | +||__/ssm_occurrences/`<ssm_occurrence_id>`__|Get information about a specific ssm occurrence using a `<ssm_occurrence_id>`, often supplemented with the `expand` option to show fields of interest. | +|__CNVS__|__/cnvs__|Allows users to access data about copy number variations (cnvs). This data will be specific to cnvs and not a specific case. | +||__/cnvs/`<cnv_id>`__|Get information about a specific copy number variation using a `<cnv_id>`, often supplemented with the `expand` option to show fields of interest. | +||__/cnvs/ids__|This endpoint will retrieve nodes that contain the queried cnv_id. This is accomplished by adding the query parameter: /cnvs/ids?query=`<cnv_id>`.| +||__/cnv_occurrences__|A `cnv` entity as applied to a single case.| +||__/cnv_occurrences/`<cnv_occurrence_id>`__|Get information about a specific copy number variation occurrence using a `<cnv_occurrence_id>`, often supplemented with the `expand` option to show fields of interest. | +||__/cnv_occurrences/ids__|This endpoint will retrieve nodes that contain the queried cnv_occurrence_id. This is accomplished by adding the query parameter: /cnv_occurrences/ids?query=`<cnv_occurrence_id>`| +|__Analysis__|__/analysis/top_cases_counts_by_genes__| Returns the number of cases with a mutation in each gene listed in the gene_ids parameter for each project. Note that this endpoint cannot be used with the `format` or `fields` parameters.| +||__/analysis/top_mutated_genes_by_project__| Returns a list of genes that have the most mutations within a given project. | +||__/analysis/top_mutated_cases_by_gene__| Generates information about the cases that are most affected by mutations in a given number of genes | +||__/analysis/mutated_cases_count_by_project__| Returns counts for the number of cases that have associated `ssm` data in each project.
The number of affected cases can be found under "case_with_ssm": {"doc_count": $case_count}.| +||__/analysis/survival__| Survival plots can be generated in the Data Portal for different subsets of data, based upon many query factors such as variants, disease type and projects. This endpoint can be used to programmatically retrieve the raw data to generate these plots and apply different filters to the data. (see Survival Example)| + The methods for retrieving information from these endpoints are very similar to those used for the `cases` and `files` endpoints. These methods are explored in depth in the [API Search and Retrieval](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/) documentation. The `_mapping` parameter can also be used with each of these endpoints to generate a list of potential fields. For example: `https://api.gdc.cancer.gov/ssms/_mapping` -Note: While it is not an endpoint, the `observation` entity is featured in the visualization section of the API. The `observation` entity provides information from the MAF file, such as read depth and normal genotype, that supports the validity of the associated `ssm`. An example is demonstrated below: +While it is not an endpoint, the `observation` entity is featured in the visualization section of the API. The `observation` entity provides information from the MAF file, such as read depth and normal genotype, that supports the validity of the associated `ssm`. An example is demonstrated below: ```Shell curl "https://api.gdc.cancer.gov/ssms/57bb3f2e-ec05-52c2-ab02-7065b7d24849?expand=occurrence.case.observation.read_depth&pretty=true" @@ -143,9 +152,9 @@ gene_start gene_end symbol id ## Simple Somatic Mutation Endpoint Examples -__Example 1__: Similar to the `/genes` endpoint, a user would like to retrieve information about the mutation based on its COSMIC ID. 
This would be accomplished by creating a JSON filter such as: +__Example 1__: Similar to the `/genes` endpoint, a user would like to retrieve information about the mutation based on its COSMIC ID. This would be accomplished by creating a JSON filter, which will then be encoded to URL for the `curl` command. -```Query +```Filter { "op":"in", "content":{ @@ -207,11 +216,137 @@ curl 'https://api.gdc.cancer.gov/ssms?pretty=true&filters=%7B%0A%22op%22%3A%22in } ``` +__Example 2:__ Based on the previous example's `ssm_id` (`8b3c1a7a-e4e0-5200-9d46-5767c2982145`), a user would like to look at the consequences and the VEP impact due to this ssm. + +```Shell +curl 'https://api.gdc.cancer.gov/ssms/8b3c1a7a-e4e0-5200-9d46-5767c2982145?pretty=true&expand=consequence.transcript&fields=consequence.transcript.annotation.vep_impact' +``` + +```JSON +{ + "data": { + "consequence": [ + { + "transcript": { + "aa_start": 127, + "consequence_type": "synonymous_variant", + "aa_end": 127, + "transcript_id": "ENST00000466621", + "is_canonical": false, + "aa_change": "G127G", + "annotation": { + "vep_impact": "LOW" + }, + "ref_seq_accession": "" + } + }, + { + "transcript": { + "aa_start": 95, + "consequence_type": "synonymous_variant", + "aa_end": 95, + "transcript_id": "ENST00000613879", + "is_canonical": false, + "aa_change": "G95G", + "annotation": { + "vep_impact": "LOW" + }, + "ref_seq_accession": "" + } + }, + { + "transcript": { + "aa_start": 218, + "consequence_type": "synonymous_variant", + "aa_end": 218, + "transcript_id": "ENST00000473635", + "is_canonical": false, + "aa_change": "G218G", + "annotation": { + "vep_impact": "LOW" + }, + "ref_seq_accession": "" + } + }, + { + "transcript": { + "aa_start": null, + "consequence_type": "non_coding_transcript_exon_variant", + "aa_end": null, + "transcript_id": "ENST00000474560", + "is_canonical": false, + "aa_change": null, + "annotation": { + "vep_impact": "MODIFIER" + }, + "ref_seq_accession": "" + } + }, + { + "transcript": { + 
"aa_start": 1226, + "consequence_type": "synonymous_variant", + "aa_end": 1226, + "transcript_id": "ENST00000383710", + "is_canonical": true, + "aa_change": "G1226G", + "annotation": { + "vep_impact": "LOW" + }, + "ref_seq_accession": "NM_003716.3" + } + }, + { + "transcript": { + "aa_start": 1187, + "consequence_type": "synonymous_variant", + "aa_end": 1187, + "transcript_id": "ENST00000283269", + "is_canonical": false, + "aa_change": "G1187G", + "annotation": { + "vep_impact": "LOW" + }, + "ref_seq_accession": "NM_183394.2" + } + }, + { + "transcript": { + "aa_start": 1147, + "consequence_type": "synonymous_variant", + "aa_end": 1147, + "transcript_id": "ENST00000357948", + "is_canonical": false, + "aa_change": "G1147G", + "annotation": { + "vep_impact": "LOW" + }, + "ref_seq_accession": "NM_183393.2" + } + }, + { + "transcript": { + "aa_start": 1217, + "consequence_type": "synonymous_variant", + "aa_end": 1217, + "transcript_id": "ENST00000612439", + "is_canonical": false, + "aa_change": "G1217G", + "annotation": { + "vep_impact": "LOW" + }, + "ref_seq_accession": "" + } + } + ] + } +``` + ## Simple Somatic Mutation Occurrence Endpoint Examples __Example 1:__ A user wants to determine the chromosome in case `TCGA-DU-6407` that contains the greatest number of `ssms`. As this relates to mutations that are observed in a case, the `ssm_occurrences` endpoint is used. 
-``` +```Filter { "op":"in", "content":{ @@ -223,7 +358,7 @@ __Example 1:__ A user wants to determine the chromosome in case `TCGA-DU-6407` t ```Shell curl "https://api.gdc.cancer.gov/ssm_occurrences?format=tsv&fields=ssm.chromosome&size=5000&filters=%7B%0D%0A%22op%22%3A%22in%22%2C%0D%0A%22content%22%3A%7B%0D%0A%22field%22%3A%22case.submitter_id%22%2C%0D%0A%22value%22%3A%5B%0D%0A%22TCGA-DU-6407%22%0D%0A%5D%0D%0A%7D%0D%0A%7D" ``` -```Response +```tsv ssm.chromosome id chr3 552c09d1-69b1-5c04-b543-524a6feae3eb chr10 391011ff-c1fd-5e2a-a128-652bc660f64c @@ -256,6 +391,313 @@ chr10 727c9d57-7b74-556f-aa5b-e1ca1f76d119 chr15 b4a86ffd-e60c-5c9c-aaa1-9e9f02d86116 chr5 3a023e72-da92-54f7-aa18-502c1076b2b0 ``` +__Example 2:__ A user has retrieved a `ssm_occurrence`, and would like to determine if that case also has tissue slides and transcriptome profiling data. + +```Shell +curl 'https://api.gdc.cancer.gov/ssm_occurrences/6fd8527d-5c40-5604-8fa9-0ce798eec231?pretty=true&expand=case,case.summary.experimental_strategies' +``` + +```Json +{ + "data": { + "case": { + "disease_type": "Nevi and Melanomas", + "updated_datetime": "2018-09-06T18:42:50.098635-05:00", + "created_datetime": null, + "summary": { + "experimental_strategies": [ + { + "file_count": 3, + "experimental_strategy": "miRNA-Seq" + }, + { + "file_count": 1, + "experimental_strategy": "Tissue Slide" + }, + { + "file_count": 18, + "experimental_strategy": "WXS" + }, + { + "file_count": 1, + "experimental_strategy": "Diagnostic Slide" + }, + { + "file_count": 4, + "experimental_strategy": "RNA-Seq" + }, + { + "file_count": 4, + "experimental_strategy": "Genotyping Array" + }, + { + "file_count": 1, + "experimental_strategy": "Methylation Array" + } + ] + }, + "state": "released", + "case_id": "590b5e18-d837-4c0e-becf-80520db57c0f", + "primary_site": "Skin", + "submitter_id": "TCGA-Z2-A8RT", + "available_variation_data": [ + "cnv", + "ssm" + ] + }, + "ssm_occurrence_id": "6fd8527d-5c40-5604-8fa9-0ce798eec231" + } 
+``` + +## Copy Number Variation Endpoint Examples + +__Example 1:__ A user is interested in finding the first 30 cnvs found on chromosome 4 that have a cnv loss. + +```Filter +{ + "op": "and", + "content": [ + { + "op": "in", + "content": { + "field": "chromosome", + "value": [ + "4" + ] + } + }, + { + "op": "in", + "content": { + "field": "cnv_change", + "value": [ + "Loss" + ] + } + } + ] +} +``` + +```Shell +curl 'https://api.gdc.cancer.gov/cnvs?filters=%7B%0D%0A+++%22op%22%3A+%22and%22%2C%0D%0A++++%22content%22%3A+%5B%0D%0A++++++++%7B%0D%0A++++++++++++%22op%22%3A+%22in%22%2C%0D%0A++++++++++++%22content%22%3A+%7B%0D%0A++++++++++++++++%22field%22%3A+%22chromosome%22%2C%0D%0A++++++++++++++++%22value%22%3A+%5B%0D%0A++++++++++++++++++++%224%22%0D%0A++++++++++++++++%5D%0D%0A++++++++++++%7D%0D%0A++++++++%7D%2C%0D%0A++++++++%7B%0D%0A++++++++++++%22op%22%3A+%22in%22%2C%0D%0A++++++++++++%22content%22%3A+%7B%0D%0A++++++++++++++++%22field%22%3A+%22cnv_change%22%2C%0D%0A++++++++++++++++%22value%22%3A+%5B%0D%0A++++++++++++++++++++%22Loss%22%0D%0A++++++++++++++++%5D%0D%0A++++++++++++%7D%0D%0A++++++++%7D%0D%0A++++%5D%0D%0A%7D&size=30&sort=start_position&format=tsv' +``` + +```tsv +ncbi_build cnv_id gene_level_cn cnv_change end_position start_position id chromosome +GRCh38 d18e0dc8-7d56-5d9e-84fd-4f2cf3353c66 True Loss 88211 53285 d18e0dc8-7d56-5d9e-84fd-4f2cf3353c66 4 +GRCh38 357a6606-8a64-5827-b776-e71f44b7e05f True Loss 163989 124480 357a6606-8a64-5827-b776-e71f44b7e05f 4 +GRCh38 eda45f5f-6a57-5fae-b8ad-5d67a14423f1 True Loss 305321 270675 eda45f5f-6a57-5fae-b8ad-5d67a14423f1 4 +GRCh38 64d82c29-0f20-5a8f-8599-7afb550ab403 True Loss 384864 337814 64d82c29-0f20-5a8f-8599-7afb550ab403 4 +GRCh38 f9d24781-34cb-51ff-99c2-84c83a8348ac True Loss 499156 425815 f9d24781-34cb-51ff-99c2-84c83a8348ac 4 +GRCh38 56209b45-3b2c-5862-85bb-362722bae857 True Loss 540196 499210 56209b45-3b2c-5862-85bb-362722bae857 4 +GRCh38 04b976d8-90ad-501d-b672-e14816582339 True Loss 670782 625584 
04b976d8-90ad-501d-b672-e14816582339 4 +GRCh38 574939d6-bf4f-57e9-9c86-629b3d8de664 True Loss 674338 672436 574939d6-bf4f-57e9-9c86-629b3d8de664 4 +GRCh38 b2ebf724-0a08-542e-ad1e-392a30208140 True Loss 682033 673580 b2ebf724-0a08-542e-ad1e-392a30208140 4 +GRCh38 4e37e683-6f9f-5e80-8e3b-78d0cdf3c28e True Loss 689441 681829 4e37e683-6f9f-5e80-8e3b-78d0cdf3c28e 4 +GRCh38 06837ab7-8242-518f-a24c-dce8a0140b01 True Loss 770640 705748 06837ab7-8242-518f-a24c-dce8a0140b01 4 +GRCh38 9f877f14-55ea-5e19-afa0-d294d1700b4b True Loss 826198 784957 9f877f14-55ea-5e19-afa0-d294d1700b4b 4 +GRCh38 bde18311-8a8a-52ef-bcc0-3b6660509df0 True Loss 932373 849276 bde18311-8a8a-52ef-bcc0-3b6660509df0 4 +GRCh38 31c65477-0e54-5be3-b1f6-3f249850ef79 True Loss 958656 932387 31c65477-0e54-5be3-b1f6-3f249850ef79 4 +GRCh38 c26f1b4d-d4c3-5685-8789-fb0051f8a188 True Loss 986895 958887 c26f1b4d-d4c3-5685-8789-fb0051f8a188 4 +GRCh38 0aa931e9-7ec1-57e7-9cb9-ec66a8da5689 True Loss 993440 979073 0aa931e9-7ec1-57e7-9cb9-ec66a8da5689 4 +GRCh38 162a9e1d-e1ee-5478-9291-6ba8082d5776 True Loss 1004506 986997 162a9e1d-e1ee-5478-9291-6ba8082d5776 4 +GRCh38 6a4d4aef-2289-54f5-b78b-797db8c3a9f2 True Loss 1026897 1009936 6a4d4aef-2289-54f5-b78b-797db8c3a9f2 4 +GRCh38 3c26920b-fb93-5595-81a0-770df0c88246 True Loss 1113562 1056250 3c26920b-fb93-5595-81a0-770df0c88246 4 +GRCh38 7036724d-1a73-5b2b-ae02-c2dc5b3333d7 True Loss 1208962 1166932 7036724d-1a73-5b2b-ae02-c2dc5b3333d7 4 +GRCh38 30b408be-db7b-579b-bbde-4a265c6291ce True Loss 1249953 1211448 30b408be-db7b-579b-bbde-4a265c6291ce 4 +GRCh38 a7c6f097-bba8-5859-838d-8b3b4610c9e6 True Loss 1340147 1289851 a7c6f097-bba8-5859-838d-8b3b4610c9e6 4 +GRCh38 8fd4f4e8-ddf3-574b-ac19-3112a2778b22 True Loss 1388049 1347266 8fd4f4e8-ddf3-574b-ac19-3112a2778b22 4 +GRCh38 2315f6cc-9d91-58b8-9f3e-f0d36cd6846c True Loss 1395989 1391552 2315f6cc-9d91-58b8-9f3e-f0d36cd6846c 4 +GRCh38 1480d682-fe0e-5ba1-bf4e-ac84945f194a True Loss 1406331 1402932 1480d682-fe0e-5ba1-bf4e-ac84945f194a 4 
+GRCh38 280e825e-1c51-506b-a4b5-3dc85fd79cbe True Loss 1684302 1617915 280e825e-1c51-506b-a4b5-3dc85fd79cbe 4 +GRCh38 607e36e3-6b1d-5564-9670-759668053ceb True Loss 1712555 1692800 607e36e3-6b1d-5564-9670-759668053ceb 4 +GRCh38 93b6ccc4-d88d-5040-936f-a23c9006a965 True Loss 1721358 1715952 93b6ccc4-d88d-5040-936f-a23c9006a965 4 +GRCh38 f6f660d2-5a68-5e49-92b1-a816be39e0fe True Loss 1745176 1721490 f6f660d2-5a68-5e49-92b1-a816be39e0fe 4 +GRCh38 a0c069d1-dcb0-5833-8fff-211cd6e3719a True Loss 1808872 1793307 a0c069d1-dcb0-5833-8fff-211cd6e3719a 4 +``` + +__Example 2:__ A user wants to determine the location and identity of the gene affected by the cnv `5052be09-2bbe-5175-a0ae-fc568ea75339`, and determine whether the gene is found within the Cancer Gene Census. + +```Shell +curl 'https://api.gdc.cancer.gov/cnvs/5052be09-2bbe-5175-a0ae-fc568ea75339?pretty=true&expand=consequence.gene' +``` + +```Json +{ + "data": { + "ncbi_build": "GRCh38", + "cnv_id": "5052be09-2bbe-5175-a0ae-fc568ea75339", + "gene_level_cn": true, + "cnv_change": "Gain", + "end_position": 110346681, + "start_position": 110338506, + "consequence": [ + { + "gene": { + "symbol": "RBM15", + "is_cancer_gene_census": "True", + "biotype": "protein_coding", + "gene_id": "ENSG00000162775" + } + } + ], + "chromosome": "1" + } +``` + +## Copy Number Variation Occurrence Endpoint Examples + +__Example 1:__ A user is interested in finding cases that have both cnv and ssm data for females diagnosed with Squamous Cell Neoplasms and have a cnv gain change on chromosome 9. It is important to note that for a case like this, where multiple arguments are needed for one filtered field, it is easier for the API to have multiple filters for the same field, `case.available_variation_data` in this example, than having one filter with multiple arguments.
+ +```Filter +{ + "op": "and", + "content": [ + { + "op": "in", + "content": { + "field": "cnv.cnv_change", + "value": [ + "Gain" + ] + } + }, + { + "op": "in", + "content": { + "field": "case.demographic.gender", + "value": [ + "female" + ] + } + }, + { + "op": "in", + "content": { + "field": "case.available_variation_data", + "value": [ + "cnv" + ] + } + }, + { + "op": "in", + "content": { + "field": "case.available_variation_data", + "value": [ + "ssm" + ] + } + }, + { + "op": "in", + "content": { + "field": "cnv.chromosome", + "value": [ + "9" + ] + } + }, + { + "op": "in", + "content": { + "field": "case.disease_type", + "value": [ + "Squamous Cell Neoplasms" + ] + } + } + ] +} + +``` + +```Shell +curl 'https://api.gdc.cancer.gov/cnv_occurrences?filters=%7B%0D%0A++++%22op%22%3A+%22and%22%2C%0D%0A++++%22content%22%3A+%5B%0D%0A++++++++%7B%0D%0A++++++++++++%22op%22%3A+%22in%22%2C%0D%0A++++++++++++%22content%22%3A+%7B%0D%0A++++++++++++++++%22field%22%3A+%22cnv.cnv_change%22%2C%0D%0A++++++++++++++++%22value%22%3A+%5B%0D%0A++++++++++++++++++++%22Gain%22%0D%0A++++++++++++++++%5D%0D%0A++++++++++++%7D%0D%0A++++++++%7D%2C%0D%0A++++++++%7B%0D%0A++++++++++++%22op%22%3A+%22in%22%2C%0D%0A++++++++++++%22content%22%3A+%7B%0D%0A++++++++++++++++%22field%22%3A+%22case.demographic.gender%22%2C%0D%0A++++++++++++++++%22value%22%3A+%5B%0D%0A++++++++++++++++++++%22female%22%0D%0A++++++++++++++++%5D%0D%0A++++++++++++%7D%0D%0A++++++++%7D%2C%0D%0A++++++++%7B%0D%0A++++++++++++%22op%22%3A+%22in%22%2C%0D%0A++++++++++++%22content%22%3A+%7B%0D%0A++++++++++++++++%22field%22%3A+%22case.available_variation_data%22%2C%0D%0A++++++++++++++++%22value%22%3A+%5B%0D%0A++++++++++++++++++++%22cnv%22%0D%0A++++++++++++++++%5D%0D%0A++++++++++++%7D%0D%0A++++++++%7D%2C%0D%0A++++++++%7B%0D%0A++++++++++++%22op%22%3A+%22in%22%2C%0D%0A++++++++++++%22content%22%3A+%7B%0D%0A++++++++++++++++%22field%22%3A+%22case.available_variation_data%22%2C%0D%0A++++++++++++++++%22value%22%3A+%5B%0D%0A++++++++++++++++++++%22ssm%2
2%0D%0A++++++++++++++++%5D%0D%0A++++++++++++%7D%0D%0A++++++++%7D%2C%0D%0A++++++++%7B%0D%0A++++++++++++%22op%22%3A+%22in%22%2C%0D%0A++++++++++++%22content%22%3A+%7B%0D%0A++++++++++++++++%22field%22%3A+%22cnv.chromosome%22%2C%0D%0A++++++++++++++++%22value%22%3A+%5B%0D%0A++++++++++++++++++++%229%22%0D%0A++++++++++++++++%5D%0D%0A++++++++++++%7D%0D%0A++++++++%7D%2C%0D%0A++++++++%7B%0D%0A++++++++++++%22op%22%3A+%22in%22%2C%0D%0A++++++++++++%22content%22%3A+%7B%0D%0A++++++++++++++++%22field%22%3A+%22case.disease_type%22%2C%0D%0A++++++++++++++++%22value%22%3A+%5B%0D%0A++++++++++++++++++++%22Squamous+Cell+Neoplasms%22%0D%0A++++++++++++++++%5D%0D%0A++++++++++++%7D%0D%0A++++++++%7D%0D%0A++++%5D%0D%0A%7D&fields=case.available_variation_data,case.case_id&format=tsv' +``` + +```tsv +case.case_id case.available_variation_data.1 case.available_variation_data.0 id +638035f6-2909-4a44-980f-468ac5d74e18 ssm cnv e76d2aaf-f951-5a51-a949-a241dba61f73 +ad98977b-e159-410a-b8c2-f4e8a07f9784 ssm cnv ff3506b8-ee80-570f-ad2d-4ab4a7363b82 +c83c52f4-3815-4f49-8218-cf80aaa62e2f ssm cnv e73696c5-386f-5cae-aa10-f8628f32ee0e +dac27c24-cdbf-4527-9214-178fde3d098a ssm cnv 77885824-fae1-5116-9851-694255249cc8 +0e91d7b5-ce35-4671-ab9f-cfd5369b557c ssm cnv 526529ae-8e59-597e-aea1-cc0b06a82e76 +ea34663c-f40e-4a3e-9ac0-65d5e9eef12b ssm cnv e4a0c034-44d4-5dea-912a-ce331d9a9512 +05026179-b1da-411e-a286-89727b1ae380 ssm cnv 30bdc04c-54a5-53ca-bdd0-b808f23da266 +f1a1bbf9-4751-4fb4-8a2b-19f8d4ba57bd ssm cnv 02e3fbb3-da8f-5983-8d10-189e641ddf11 +a6ec75d4-1c90-4527-bfae-aa91d2dae082 ssm cnv 94b0e8be-1130-5b88-9103-6756bdabf67b +107f6b9a-2883-4499-a40a-ec25bc834a06 ssm cnv ad831f27-e6f5-5b78-8a15-0b652621ea4c +``` + +__Example 2:__ A user is interested in the first cnv occurrence (`e76d2aaf-f951-5a51-a949-a241dba61f73`) from the previous example, and would like to know more about the case exposures and demographics. 
+ +```Shell +curl 'https://api.gdc.cancer.gov/cnv_occurrences/e76d2aaf-f951-5a51-a949-a241dba61f73?pretty=true&expand=cnv,case,case.exposures,case.demographic' +``` + +```Json +{ + "data": { + "cnv": { + "ncbi_build": "GRCh38", + "cnv_id": "0d475712-c11e-51fb-b6e6-407d12978057", + "gene_level_cn": true, + "cnv_change": "Gain", + "end_position": 133348131, + "variant_status": "Tumor only", + "start_position": 133338323, + "chromosome": "9" + }, + "case": { + "disease_type": "Squamous Cell Neoplasms", + "updated_datetime": "2018-09-06T11:07:45.510627-05:00", + "created_datetime": null, + "demographic": { + "updated_datetime": "2018-09-06T11:07:45.510627-05:00", + "created_datetime": null, + "gender": "female", + "year_of_birth": 1954, + "submitter_id": "TCGA-EA-A3HR_demographic", + "state": "released", + "race": "white", + "demographic_id": "dd8576a8-bd62-55e7-b0df-7233ceded2fb", + "ethnicity": "not hispanic or latino", + "year_of_death": null + }, + "submitter_id": "TCGA-EA-A3HR", + "state": "released", + "case_id": "638035f6-2909-4a44-980f-468ac5d74e18", + "primary_site": "Cervix uteri", + "available_variation_data": [ + "cnv", + "ssm" + ], + "exposures": [ + { + "cigarettes_per_day": null, + "weight": 86, + "updated_datetime": "2018-09-06T11:07:45.510627-05:00", + "created_datetime": null, + "alcohol_intensity": null, + "bmi": 40, + "years_smoked": null, + "submitter_id": "TCGA-EA-A3HR_exposure", + "alcohol_history": null, + "state": "released", + "tobacco_smoking_status": null, + "tobacco_smoking_onset_year": null, + "tobacco_smoking_quit_year": null, + "exposure_id": "0e7265ab-bf65-50c7-bf33-96a7ac452d7c", + "height": 146, + "pack_years_smoked": null + } + ] + }, + "cnv_occurrence_id": "e76d2aaf-f951-5a51-a949-a241dba61f73" + } +``` ## Analysis Endpoints @@ -937,7 +1379,7 @@ curl "https://api.gdc.cancer.gov/analysis/mutated_cases_count_by_project?size=0& ``` ### Survival Analysis Endpoint -[Survival plots](/Data_Portal/Projects/#Survival-Analysis) are generated 
for different subsets of data, based on variants or projects, in the GDC Data Portal. The `/analysis/survival` endpoint can be used to programmatically retrieve the raw data used to generate these plots and apply different filters. Note that the `fields` and `format` parameters cannot be modified. +[Survival plots](/Data_Portal/Users_Guide/Exploration/#survival-analysis) are generated for different subsets of data, based on variants or projects, in the GDC Data Portal. The `/analysis/survival` endpoint can be used to programmatically retrieve the raw data used to generate these plots and apply different filters. Note that the `fields` and `format` parameters cannot be modified. __Example 1:__ A user wants to download data to generate a survival plot for cases from the project TCGA-DLBC. diff --git a/docs/API/Users_Guide/Downloading_Files.md b/docs/API/Users_Guide/Downloading_Files.md index 975902ed2..4774f50c1 100644 --- a/docs/API/Users_Guide/Downloading_Files.md +++ b/docs/API/Users_Guide/Downloading_Files.md @@ -2,9 +2,9 @@ The GDC API implements file download functionality using `data` and `manifest` endpoints. The `data` endpoint allows users to download files stored in the GDC by specifying file UUID(s). The `manifest` endpoint generates a download manifest file that can be used with the GDC Data Transfer Tool to transfer large volumes of data. -**Note:** Downloading controlled access data requires the use of an authentication token. See [Getting Started: Authentication](Getting_Started.md#authentication) for details. +>**Note:** Downloading controlled access data requires the use of an authentication token. See [Getting Started: Authentication](Getting_Started.md#authentication) for details. -**Note:** Requests to download data from the GDC Legacy Archive may be directed to `legacy/data` or `data`. See [Getting Started: Legacy Archive](Getting_Started.md#gdc-legacy-archive) for details. 
+>**Note:** Requests to download data from the GDC Legacy Archive may be directed to `legacy/data` or `data`. See [Getting Started: Legacy Archive](Getting_Started.md#gdc-legacy-archive) for details. ## Data endpoint @@ -42,7 +42,8 @@ curl --remote-name --remote-header-name 'https://api.gdc.cancer.gov/data/7efc039 % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 65353 0 65353 0 0 65353 0 --:--:-- --:--:-- --:--:-- 102k -curl: Saved to filename 'gdc_download_20180830_131817.826097.tar.gz' +curl: Saved to filename 'BLAIN_p_TCGA_282_304_b2_N_GenomeWideSNP_6_D04_1348436.nocnv_hg19.seg.txt' + ``` @@ -60,7 +61,7 @@ curl --remote-name --remote-header-name 'https://api.gdc.cancer.gov/data/e322802 curl: Saved to filename 'gdc_download_064d1aa8cc8cbab33e93979bebbf7d6af2d6a802.tar.gz' ``` -**Note:** This method supports downloading a limited number of files at one time. To download a large number of files, please use [POST](#downloading-multiple-files-using-post). +>**Note:** This method supports downloading a limited number of files at one time. To download a large number of files, please use [POST](#downloading-multiple-files-using-post). #### Downloading an Uncompressed Group of Files @@ -177,14 +178,15 @@ curl: Saved to filename 'ACOLD_p_TCGA_Batch17_SNP_N_GenomeWideSNP_6_A03_466078.t ## Manifest endpoint -The `manifest` endpoint generates a download manifest file that can be used with the GDC Data Transfer Tool. The Data Transfer Tool is recommended for transferring large volumes of data. The GDC API can also generate a download manifest from a list of results that match a [Search and Retrieval](Search_and_Retrieval.md) query. To do this, append `&return_type=manifest` to the end of the query. +The `manifest` endpoint generates a download manifest file that can be used with the [GDC Data Transfer Tool](../../Data_Transfer_Tool/Users_Guide/Getting_Started.md). 
The Data Transfer Tool is recommended for transferring large volumes of data. The GDC API can also generate a download manifest from a list of results that match a [Search and Retrieval](Search_and_Retrieval.md) query. To do this, append `&return_type=manifest` to the end of the query. + ### Using the manifest endpoint The `manifest` endpoint allows users to create a download manifest, which can be used with the GDC Data Transfer Tool to download a large volume of data. The `manifest` endpoint generates a manifest file from a comma-separated list of UUIDs. ```shell -curl --remote-name --remote-header-name 'https://api.gdc.cancer.gov/v0/manifest/ae9db773-78ab-48d0-972d-debe1bedd37d,3d815e6e-db97-419d-ad7f-dba4e4023b3e' +curl --remote-name --remote-header-name 'https://api.gdc.cancer.gov/v0/manifest/a751cc7e-d2ff-4e9a-8645-09bf12612f1a,9c97e3fe-1610-4a92-9a24-ab3b9e4000e2' ``` ```Output % Total % Received % Xferd Average Speed Time Time Time Current @@ -207,5 +209,5 @@ curl --remote-name --remote-header-name 'https://api.gdc.cancer.gov/files?filter % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 40663 0 40663 0 0 77109 0 --:--:-- --:--:-- --:--:-- 77306 -curl: Saved to filename 'gdc_manifest.2016-06-28T13:26:33.850459.tsv' +curl: Saved to filename 'gdc_manifest.2021-04-05.txt' ``` diff --git a/docs/API/Users_Guide/Getting_Started.md b/docs/API/Users_Guide/Getting_Started.md index f1a578cb9..492b59809 100644 --- a/docs/API/Users_Guide/Getting_Started.md +++ b/docs/API/Users_Guide/Getting_Started.md @@ -26,8 +26,7 @@ Examples of tools that can help build GDC API calls: | ------------- |-------------| | [JSONLint](http://jsonlint.com/)| Validate JSON | | [JSON Formatter](http://jsonformatter.org/) | Format, validate, and convert JSON to other formats | -| [Percent-(URL)-encoding tool](http://text-rescue.com/string-escape/percent-url-encoding-tool.html)| Tool for percent-encoding strings | -| [JSON escape 
tool](http://text-rescue.com/string-escape/json-escape-tool.html)| Tool for escaping strings using JSON string rules | +| [Percent-(URL)-encoding tool](https://codebeautify.org/url-encode-string)| Tool for percent-encoding strings | ## API Endpoints @@ -54,9 +53,11 @@ For example, the address of the latest version of the `status` endpoint is `http ### GDC Legacy Archive -To interact with data in the GDC Legacy Archive, add `legacy` to the endpoint URL: +To interact with data in the GDC Legacy Archive, add `legacy` to the endpoint URL before the `<endpoint>`: - https://api.gdc.cancer.gov/<version>/legacy/<endpoint> + https://api.gdc.cancer.gov/legacy/<endpoint> + +> __NOTE:__ The version can also be applied to a Legacy Archive search by placing the `<version>` before "/legacy/" ## Entity UUIDs @@ -80,39 +81,21 @@ import json file_endpt = 'https://api.gdc.cancer.gov/files/' file_uuid = 'd853e541-f16a-4345-9f00-88e03c2dc0bc' response = requests.get(file_endpt + file_uuid) -print json.dumps(response.json(), indent=2) -``` Response -{ - "data": { - "data_type": "Aligned Reads", - "updated_datetime": "2016-05-26T17:06:40.003624-05:00", - "created_datetime": "2016-05-26T17:06:40.003624-05:00", - "file_name": "0017ba4c33a07ba807b29140b0662cb1_gdc_realn.bam", - "md5sum": "a08304b120c5df76b6532da0e9a35ced", - "data_format": "BAM", - "acl": [ - "phs000178" - ], - "access": "controlled", - "platform": "Illumina", - "state": "submitted", - "file_id": "d853e541-f16a-4345-9f00-88e03c2dc0bc", - "data_category": "Raw Sequencing Data", - "file_size": 23650901931, - "submitter_id": "c30188d7-be1a-4b43-9a17-e19ccd71792e", - "type": "aligned_reads", - "file_state": "processed", - "experimental_strategy": "WXS" - }, - "warnings": {} -} -``` +# OUTPUT METHOD 1: Write to a file. +file = open("sample_request.json", "w") +file.write(response.text) +file.close() + +# OUTPUT METHOD 2: View on screen.
+print(json.dumps(response.json(), indent=2)) +``` +[Download Script](scripts/Sample_Request.py) ## Authentication Authentication is required for downloading controlled-access data, and for all data submission functionality. The GDC API uses tokens for authentication. -Users can obtain authentication tokens from the [GDC Data Portal](https://portal.gdc.cancer.gov) and the [GDC Data Submission Portal](https://portal.gdc.cancer.gov/submission). See the [GDC Data Portal User's Guide](../../Data_Portal/Users_Guide/Authentication.md#gdc-authentication-tokens) and the [GDC Data Submission Portal User's Guide](../../Data_Submission_Portal/Users_Guide/Authentication.md#gdc-authentication-tokens) for instructions. +Users can obtain authentication tokens from the [GDC Data Portal](https://portal.gdc.cancer.gov) and the [GDC Data Submission Portal](https://portal.gdc.cancer.gov/submission). See the [GDC Data Portal User's Guide](../../Data_Portal/Users_Guide/Repository.md#gdc-authentication-tokens) and the [GDC Data Submission Portal User's Guide](../../Data_Submission_Portal/Users_Guide/Data_Submission_Process.md#authentication) for instructions. 
### Using Authentication Tokens @@ -120,18 +103,52 @@ All API requests that require authentication must include a token as an `X-Auth- In the following example, an authentication token is saved as an environment variable and passed to `curl` to download a controlled-access file: -``` shell -token=$() curl -O -J -H "X-Auth-Token: $token" 'https://api.gdc.cancer.gov/data/a1c1b23b-cc41-4e85-b1b7-62a42873c5af' ``` -```Output +```Shell Output % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 31.4M 100 31.4M 0 0 290k 0 0:01:50 0:01:50 --:--:-- 172k curl: Saved to filename 'ACOLD_p_TCGA_Batch17_SNP_N_GenomeWideSNP_6_A03_466078.tangent.copynumber.data.txt' ``` +```Python +import requests +import json +import re + +''' + This script will not work until $TOKEN_FILE_PATH + is replaced with an actual path. +''' + +with open("$TOKEN_FILE_PATH","r") as token: + token_string = str(token.read().strip()) + +headers = { + 'X-Auth-Token': token_string + } + +data_endpt = 'https://api.gdc.cancer.gov/data/' +data_uuid = 'a1c1b23b-cc41-4e85-b1b7-62a42873c5af' +headers = { + 'X-Auth-Token': token_string + } +response = requests.get(data_endpt + data_uuid, headers=headers) + +# The file name can be found in the header within the Content-Disposition key. +response_head_cd = response.headers["Content-Disposition"] + +file_name = re.findall("filename=(.+)", response_head_cd)[0] + +with open(file_name, "wb") as output_file: + output_file.write(response.content) +``` +[Download Python Script](scripts/Authentication_Tokens.py) + For more information about authentication tokens, including token expiration and rotation, see [Data Security](../../Data/Data_Security/Data_Security.md#authentication-tokens). -**NOTE:** The authentication token should be kept in a secure location, as it allows access to all data accessible by the associated user account. 
+>**NOTE:** The authentication token should be kept in a secure location, as it allows access to all data accessible by the associated user account. diff --git a/docs/API/Users_Guide/GraphQL_Examples.md b/docs/API/Users_Guide/GraphQL_Examples.md new file mode 100644 index 000000000..fe9d7888d --- /dev/null +++ b/docs/API/Users_Guide/GraphQL_Examples.md @@ -0,0 +1,174 @@ +# Introduction to GDC GraphQL +[GraphQL](https://graphql.org/) is a query language for APIs. The [GDC REST API](https://docs.gdc.cancer.gov/API/Users_Guide/Getting_Started/) has structured and specifically defined query parameters as well as endpoints that have set requests and responses. The GDC GraphQL provides advanced GDC users greater flexibility to specify the data they would like to be returned. This allows queries to be cleaner and easier to understand, especially when combining multiple queries into one request. + +To produce queries in a visual interface, the GDC recommends using [GraphiQL](https://github.com/graphql/graphiql). See below for the correct endpoint URLs. + +## Using GDC GraphQL vs GDC REST API + +If the query requires only a subset of the data to be returned, GDC GraphQL may speed up requests as GraphQL queries return only the specified data. This may require less work on the GDC server-side to fulfill those requests. Conversely, if an entire data-set is required for each request, the GDC REST API may be a better fit. No matter which method is used, the data returned by the GDC REST API and the GraphQL query will be identical as they query the same source. + +## GDC GraphQL Overview +GraphQL is not a storage model or a database query language. The graph refers to graph structures defined in the schema, where nodes define objects and edges define relationships between objects. The API traverses and returns application data based on the schema definitions, independent of how the data is stored. 
+ +## GDC GraphQL Endpoints + +The GDC GraphQL has only two endpoints: +* __GDC Search and Retrieval Endpoint:__ https://api.gdc.cancer.gov/v0/graphql +* __GDC Submission Endpoint:__ https://api.gdc.cancer.gov/v0/submission/graphql + +This page covers the search and retrieval endpoint; see the [GDC Submission API](Submission.md) for additional details on the submission endpoint. + +## GDC GraphQL Schema +All GDC GraphQL queries are validated and executed against the [GDC GraphQL schema]( https://github.com/NCI-GDC/portal-ui/blob/92f0dfa17838746093c3c011141d08391016da91/data/schema.graphql). Because the GraphQL parameters are discoverable, the GDC GraphQL schema can be queried for details about itself. + +The `__schema` keyword can be queried to list all types defined in the schema and retrieve details about each: + +```GraphQL +{ + __schema { + types { + name + kind + fields { + name + } + } + } +} +``` +The `__type` keyword can also be queried to retrieve details about any type such as "Explore" or "Case": +```GraphQL + +{ + __type(name: "Explore") { + name + kind + description + fields { + name + } + } +} +``` + +```GraphQL +{ + __type(name: "Case") { + name + kind + description + fields { + name + } + } +} +``` + +## Basic GraphQL queries in GDC +The two types of allowed operations in the GDC GraphQL API are queries and mutations. Comparing GraphQL to REST, queries operate like `GET` requests, while mutations operate like `POST`/`PATCH`/`DELETE`. + +__Note:__ This guide does not cover GDC GraphQL mutation operations. + +GraphQL queries return only the data that is specified. Queries are built by specifying fields within fields (also called nested *subfields*) until only scalars are returned. Scalars are primitive values such as: `Int`, `Float`, `String`, `Boolean`, or `ID`. 
+ +## Anatomy of a typical GDC GraphQL Query + + [![GraphQL-Query](images/graphql-query.png)](images/graphql-query.png "Click to see the full image.") + +- __Operation type:__ Describes the type of operation being performed, such as query, mutation, or subscription +- __Operation name:__ Similar to a function name, gives queries meaningful names +- __Field:__ Denotes the specific fields on objects that will be included with the response data +- __Arguments:__ A set of key-value pairs associated with a specific field. The parameters can be literal values or variables. __NOTE:__ Arguments can appear on any field, even fields nested deep in an operation. +- __Variable definitions:__ As GraphQL is strongly typed, it validates the variable being passed dynamically. __NOTE:__ Variables are passed separately from the query document as JSON such as: + +```json + { "filters_1": {"op":"in","content":{"field":"projects.program.name","value":["TARGET"]}}} +``` + +## GDC GraphQL Examples +### Nodes And Edges Example +A very powerful feature of the GDC GraphQL API is that the graph structures defined in the [GDC GraphQL schema]( https://github.com/NCI-GDC/portal-ui/blob/92f0dfa17838746093c3c011141d08391016da91/data/schema.graphql ) can be queried and traversed. In these queries, nodes define objects and edges define relationships between objects. 
+ +```GraphQL + +query PROJECTS_EDGES($filters_1: FiltersArgument) { + projects { + hits(filters: $filters_1) { + total + edges { + node { + primary_site + disease_type + project_id + dbgap_accession_number + } + } + } + } +} + + variable: + { "filters_1": {"op": "in", "content": {"field": "projects.primary_site", "value": ["Kidney"]}}} +``` + +### Query Case File Counts + +```GraphQL +query CaseFileCounts($filters: FiltersArgument) { + viewer { + repository { + cases { + hits(first: 1, filters: $filters) { + edges { + node { + case_id + files { + hits(first: 0) { + total + } + } + summary { + experimental_strategies { + experimental_strategy + file_count + } + data_categories { + data_category + file_count + } + } + } + } + } + } + } + } +} + +variable: +{"filters":{"op":"in","content":{"field":"cases.case_id","value":["dcd5860c-7e3a-44f3-a732-fe92fe3fe300"]}}} +``` + +### Query Simple Somatic Mutations Based on Gene IDs + +```GraphQL + +query PROJECTS_EDGES($filters_2: FiltersArgument) { + explore { + ssms { + hits(filters: $filters_2) { + total + edges { + node { + ssm_id + gene_aa_change + } + } + } + } + } +} + +variable: +{"filters_2": {"op":"in","content":{"field":"consequence.transcript.gene.gene_id","value":["ENSG00000155657"]}}} +``` diff --git a/docs/API/Users_Guide/Python_Examples.md b/docs/API/Users_Guide/Python_Examples.md index 25ab3b6d3..843cbf4c6 100644 --- a/docs/API/Users_Guide/Python_Examples.md +++ b/docs/API/Users_Guide/Python_Examples.md @@ -438,5 +438,13 @@ The following script should produce an unformatted JSON string with information import requests status_endpt = "https://api.gdc.cancer.gov/status" response = requests.get(status_endpt) + +# OUTPUT METHOD 1: Write to a file. +file = open("api_status.json", "w") +file.write(response.text) +file.close() + +# OUTPUT METHOD 2: View on screen. 
print(response.content) ``` +[Download Script](scripts/Basic_Troubleshooting.py) diff --git a/docs/API/Users_Guide/Search_and_Retrieval.md b/docs/API/Users_Guide/Search_and_Retrieval.md index f156ae92e..d43da9de0 100644 --- a/docs/API/Users_Guide/Search_and_Retrieval.md +++ b/docs/API/Users_Guide/Search_and_Retrieval.md @@ -4,7 +4,7 @@ The GDC API provides endpoints that search and retrieve information stored in the GDC according to the [GDC Data Model](../../Data/Data_Model/GDC_Data_Model.md). The general format of requests to search & retrieval endpoints is described below. -**Note:** Queries described in this section work for datasets that have been released to the GDC Data Portal. Unreleased data that is in the process of being submitted to GDC cannot be queried using these methods. See [Submission](Submission.md) to learn how to query unreleased data using GraphQL. +>**Note:** Queries described in this section work for datasets that have been released to the GDC Data Portal. Unreleased data that is in the process of being submitted to GDC cannot be queried using these methods. See [Submission](Submission.md) to learn how to query unreleased data using GraphQL. ### Components of a Request @@ -19,7 +19,7 @@ A typical search and retrieval API request specifies the following parameters: Requests can be executed using HTTP GET or HTTP POST. GET requests are limited by maximum URL length, so the POST method is recommended for large queries. -**Note:** Requests for information stored in the GDC Legacy Archive must be directed to `legacy/` endpoints. See [Getting Started](Getting_Started.md#gdc-legacy-archive) for details. +>**Note:** Requests for information stored in the GDC Legacy Archive must be directed to `legacy/` endpoints. See [Getting Started](Getting_Started.md#gdc-legacy-archive) for details. 
### POST Example @@ -87,56 +87,63 @@ The following search and retrieval endpoints are available in the GDC API: The choice of endpoint determines what is listed in the search results. The `files` endpoint will generate a list of files, whereas the `cases` endpoint will generate a list of cases. Each of the above endpoints, other than `_mapping`, can query and return any of the related fields in the [GDC Data Model](../../Data/Data_Model/GDC_Data_Model.md). So the `cases` endpoint can be queried for file fields (e.g. to look for cases that have certain types of experimental data), and the `files` endpoint can be queried for clinical metadata associated with a case (e.g. to look for files from cases diagnosed with a specific cancer type). -### Project Endpoint +### `Project` Endpoint The `projects` endpoint provides access to project records, the highest level of data organization in the GDC. #### Example This example is a query for projects contained in the GDC. It uses the [from](#from), [size](#size), [sort](#sort), and [pretty](#pretty) parameters, and returns the first two projects sorted by project id. 
```shell -curl 'https://api.gdc.cancer.gov/projects?from=0&size=2&sort=project.project_id:asc&pretty=true' +curl 'https://api.gdc.cancer.gov/projects?from=0&size=2&sort=project_id:asc&pretty=true' ``` ``` Output { "data": { "hits": [ { - "dbgap_accession_number": null, - "disease_type": [ - "Brain Lower Grade Glioma" - ], - "released": true, - "state": "legacy", + "id": "BEATAML1.0-COHORT", "primary_site": [ - "Brain" + "Hematopoietic and reticuloendothelial systems" ], - "project_id": "TCGA-LGG", - "id": "TCGA-LGG", - "name": "Brain Lower Grade Glioma" + "dbgap_accession_number": "phs001657", + "project_id": "BEATAML1.0-COHORT", + "disease_type": [ + "Chronic Myeloproliferative Disorders", + "Unknown", + "Plasma Cell Tumors", + "Myelodysplastic Syndromes", + "Myeloid Leukemias", + "Leukemias, NOS" + ], + "releasable": true, + "name": "Functional Genomic Landscape of Acute Myeloid Leukemia", + "state": "open", + "released": true }, { - "dbgap_accession_number": null, - "disease_type": [ - "Thyroid Carcinoma" - ], - "released": true, - "state": "legacy", + "id": "BEATAML1.0-CRENOLANIB", "primary_site": [ - "Thyroid" + "Hematopoietic and reticuloendothelial systems" ], - "project_id": "TCGA-THCA", - "id": "TCGA-THCA", - "name": "Thyroid Carcinoma" + "dbgap_accession_number": "phs001628", + "project_id": "BEATAML1.0-CRENOLANIB", + "disease_type": [ + "Myeloid Leukemias" + ], + "releasable": false, + "name": "Clinical Resistance to Crenolanib in Acute Myeloid Leukemia Due to Diverse Molecular Mechanisms", + "state": "open", + "released": true } ], "pagination": { "count": 2, - "sort": "project.project_id:asc", + "total": 68, + "size": 2, "from": 0, + "sort": "project_id:asc", "page": 1, - "total": 39, - "pages": 20, - "size": 2 + "pages": 34 } }, "warnings": {} @@ -153,67 +160,99 @@ curl 'https://api.gdc.cancer.gov/projects/TARGET-NBL?expand=summary,summary.expe ```Response { "data": { - "dbgap_accession_number": "phs000467", - "disease_type": [ - "Neuroblastoma" - ], 
"summary": { + "file_count": 5796, "data_categories": [ { - "case_count": 151, - "file_count": 471, - "data_category": "Transcriptome Profiling" + "file_count": 941, + "case_count": 278, + "data_category": "Sequencing Reads" }, { - "case_count": 1127, - "file_count": 3, + "file_count": 2, + "case_count": 1132, "data_category": "Biospecimen" }, { - "case_count": 216, - "file_count": 1732, + "file_count": 322, + "case_count": 155, + "data_category": "Structural Variation" + }, + { + "file_count": 3723, + "case_count": 219, "data_category": "Simple Nucleotide Variation" }, { - "case_count": 7, - "file_count": 1, - "data_category": "Clinical" + "file_count": 805, + "case_count": 155, + "data_category": "Transcriptome Profiling" }, { - "case_count": 270, - "file_count": 599, - "data_category": "Raw Sequencing Data" + "file_count": 3, + "case_count": 1120, + "data_category": "Clinical" } ], - "case_count": 1127, - "file_count": 2806, "experimental_strategies": [ { - "case_count": 221, - "file_count": 2174, - "experimental_strategy": "WXS" + "file_count": 18, + "case_count": 9, + "experimental_strategy": "WGS" }, { - "case_count": 151, - "file_count": 628, + "file_count": 1610, + "case_count": 155, "experimental_strategy": "RNA-Seq" + }, + { + "file_count": 4163, + "case_count": 221, + "experimental_strategy": "WXS" } ], - "file_size": 8157614402888 + "case_count": 1132, + "file_size": 17132578153087 }, - "released": true, - "state": "legacy", "primary_site": [ - "Nervous System" + "Bones, joints and articular cartilage of other and unspecified sites", + "Other endocrine glands and related structures", + "Kidney", + "Lymph nodes", + "Connective, subcutaneous and other soft tissues", + "Renal pelvis", + "Bones, joints and articular cartilage of limbs", + "Meninges", + "Unknown", + "Peripheral nerves and autonomic nervous system", + "Skin", + "Liver and intrahepatic bile ducts", + "Adrenal gland", + "Heart, mediastinum, and pleura", + "Spinal cord, cranial nerves, and other 
parts of central nervous system", + "Uterus, NOS", + "Other and ill-defined sites", + "Hematopoietic and reticuloendothelial systems", + "Stomach", + "Retroperitoneum and peritoneum" ], + "dbgap_accession_number": "phs000467", "project_id": "TARGET-NBL", - "name": "Neuroblastoma" + "disease_type": [ + "Not Applicable", + "Neuroepitheliomatous Neoplasms" + ], + "name": "Neuroblastoma", + "releasable": true, + "state": "open", + "released": true }, "warnings": {} } + ``` -### Files Endpoint +### `Files` Endpoint The GDC Files Endpoint `https://api.gdc.cancer.gov/files` enables search and retrieval of information relating to files stored in the GDC, including file properties such as `file_name`, `md5sum`, `data_format`, and others. @@ -229,58 +268,60 @@ curl 'https://api.gdc.cancer.gov/files?from=0&size=2&sort=file_size:asc&pretty=t "data": { "hits": [ { - "data_release": "13.0", - "data_type": "Raw Simple Somatic Mutation", - "updated_datetime": "2018-07-20T22:27:55.342974+00:00", - "file_name": "333193d5-ca9a-4262-81f5-e9f3b44358fe.vcf.gz", - "submitter_id": "AD19_SimpleSomaticMutation", - "file_id": "333193d5-ca9a-4262-81f5-e9f3b44358fe", - "file_size": 866, - "id": "333193d5-ca9a-4262-81f5-e9f3b44358fe", - "created_datetime": "2017-09-10T19:16:02.549312-05:00", - "md5sum": "e33e95edb778fe67643162ef0ae3297e", - "data_format": "VCF", - "acl": [ - "phs001179" - ], + "id": "b2f71ec9-2047-463f-9381-0ecedf178954", + "data_format": "BEDPE", "access": "controlled", + "file_name": "e62df973-80fb-4fd2-af97-7f991006b34c.star_fusion.rna_fusion.bedpe", + "data_category": "Structural Variation", + "submitter_id": "012c013d-e928-4bb6-9c1d-b4c9fa94763a", + "acl": [ + "phs000235", + "phs000528" + ], + "type": "structural_variation", + "file_size": 229, + "created_datetime": "2020-10-07T12:37:53.507724-05:00", + "md5sum": "6e5690795ff424264402ab9d2661b62b", + "updated_datetime": "2020-10-20T21:09:05.135867-05:00", + "file_id": "b2f71ec9-2047-463f-9381-0ecedf178954", + "data_type": 
"Transcript Fusion", "state": "released", + "experimental_strategy": "RNA-Seq", "version": "1", - "data_category": "Simple Nucleotide Variation", - "type": "simple_somatic_mutation", - "experimental_strategy": "Targeted Sequencing" + "data_release": "27.0" }, { - "data_release": "13.0", - "data_type": "Raw Simple Somatic Mutation", - "updated_datetime": "2018-07-20T22:27:55.342974+00:00", - "file_name": "d9114e23-0f62-4979-aefc-0dd4d5eb891b.vcf.gz", - "submitter_id": "AD116_SimpleSomaticMutation", - "file_id": "d9114e23-0f62-4979-aefc-0dd4d5eb891b", - "file_size": 866, - "id": "d9114e23-0f62-4979-aefc-0dd4d5eb891b", - "created_datetime": "2017-09-10T21:53:02.376246-05:00", - "md5sum": "95bbfd0586d3c284e9f88edf3bf26065", - "data_format": "VCF", - "acl": [ - "phs001179" - ], + "id": "0acf98ae-2acb-4c08-ae10-166905c1c326", + "data_format": "BEDPE", "access": "controlled", + "file_name": "aa39e53f-5a15-4ade-a85e-753933068327.star_fusion.rna_fusion.bedpe", + "data_category": "Structural Variation", + "submitter_id": "bfad2328-8120-45fb-b7af-0bb4d71e6a1c", + "acl": [ + "phs000235", + "phs000528" + ], + "type": "structural_variation", + "file_size": 229, + "created_datetime": "2020-10-07T12:37:40.289456-05:00", + "md5sum": "6e5690795ff424264402ab9d2661b62b", + "updated_datetime": "2020-10-20T21:09:05.135867-05:00", + "file_id": "0acf98ae-2acb-4c08-ae10-166905c1c326", + "data_type": "Transcript Fusion", "state": "released", + "experimental_strategy": "RNA-Seq", "version": "1", - "data_category": "Simple Nucleotide Variation", - "type": "simple_somatic_mutation", - "experimental_strategy": "Targeted Sequencing" + "data_release": "27.0" } ], "pagination": { "count": 2, - "sort": "file_size:asc", + "total": 596758, + "size": 2, "from": 0, + "sort": "file_size:asc", "page": 1, - "total": 356381, - "pages": 178191, - "size": 2 + "pages": 298379 } }, "warnings": {} @@ -289,7 +330,7 @@ curl 'https://api.gdc.cancer.gov/files?from=0&size=2&sort=file_size:asc&pretty=t #### Retrieval 
of file metadata using individual UUIDs: -The `\files` endpoint supports a simple query format that retrieves the metadata of a single file using its UUID. Note that the `\files` endpoint is inactive when querying for earlier file versions. In that case, the `\history` or `/files/versions` endpoints should be used instead. +The `/files` endpoint supports a simple query format that retrieves the metadata of a single file using its UUID. Note that the `/files` endpoint is inactive when querying for earlier file versions. In that case, the `/history` or `/files/versions` endpoints should be used instead. ```Shell curl 'https://api.gdc.cancer.gov/files/874e71e0-83dd-4d3e-8014-10141b49f12c?pretty=true' @@ -297,31 +338,31 @@ curl 'https://api.gdc.cancer.gov/files/874e71e0-83dd-4d3e-8014-10141b49f12c?pret ``` Output { "data": { - "data_release": "13.0", - "data_type": "Raw Simple Somatic Mutation", - "updated_datetime": "2018-07-20T22:27:55.342974+00:00", - "created_datetime": "2016-06-03T17:03:06.608739-05:00", - "file_name": "874e71e0-83dd-4d3e-8014-10141b49f12c.vcf.gz", - "md5sum": "acf2929b1b825bcd1377023e8b8767ec", "data_format": "VCF", + "access": "controlled", + "file_name": "874e71e0-83dd-4d3e-8014-10141b49f12c.vcf.gz", + "submitter_id": "TCGA-V4-A9EZ-01A-11D-A39W-08_TCGA-V4-A9EZ-10A-01D-A39Z-08_mutect", + "data_category": "Simple Nucleotide Variation", "acl": [ "phs000178" ], - "access": "controlled", - "state": "live", - "version": "1", - "file_id": "874e71e0-83dd-4d3e-8014-10141b49f12c", - "data_category": "Simple Nucleotide Variation", - "file_size": 122293, - "submitter_id": "TCGA-V4-A9EZ-01A-11D-A39W-08_TCGA-V4-A9EZ-10A-01D-A39Z-08_mutect", "type": "simple_somatic_mutation", - "experimental_strategy": "WXS" + "created_datetime": "2016-06-03T17:03:06.608739-05:00", + "file_size": 122293, + "updated_datetime": "2018-09-06T20:37:37.991443-05:00", + "md5sum": "acf2929b1b825bcd1377023e8b8767ec", + "file_id": "874e71e0-83dd-4d3e-8014-10141b49f12c", + "data_type": 
"Raw Simple Somatic Mutation", + "state": "released", + "experimental_strategy": "WXS", + "version": "1", + "data_release": "12.0 - 27.0" }, "warnings": {} } ``` -__Note:__ The `file_size` field associated with each file is reported in bytes. +>__Note:__ The `file_size` field associated with each file is reported in bytes. #### Example of retrieving file version information: @@ -335,40 +376,70 @@ curl 'https://api.gdc.cancer.gov/files/versions/1dd28069-5777-4ff9-bd2b-d1ba68e8 ``` Output1 [ { - "latest_size": 332092, - "latest_id": "1dd28069-5777-4ff9-bd2b-d1ba68e88b06", - "latest_version": "1", + "id": "1dd28069-5777-4ff9-bd2b-d1ba68e88b06", "filename": "1dd28069-5777-4ff9-bd2b-d1ba68e88b06.vcf.gz", - "state": "validated", "version": "1", + "md5": "c2f9b196e154906a70c7ec46492a859d", + "size": 332092, + "state": "validated", + "release": "12.0", + "latest_id": "1dd28069-5777-4ff9-bd2b-d1ba68e88b06", "latest_filename": "1dd28069-5777-4ff9-bd2b-d1ba68e88b06.vcf.gz", - "latest_release": [ - "13.0" - ], - "latest_state": "validated", - "release": "13.0", + "latest_version": "1", "latest_md5": "c2f9b196e154906a70c7ec46492a859d", - "size": 332092, - "id": "1dd28069-5777-4ff9-bd2b-d1ba68e88b06", - "md5": "c2f9b196e154906a70c7ec46492a859d" + "latest_size": 332092, + "latest_state": "validated", + "latest_release": [ + "12.0", + "13.0", + "14.0", + "15.0", + "16.0", + "17.0", + "18.0", + "19.0", + "20.0", + "21.0", + "22.0", + "23.0", + "24.0", + "25.0", + "26.0", + "27.0" + ] }, { - "latest_size": 6653119038, - "latest_id": "2a03abac-f1a2-49a9-a57c-7543739dd862", - "latest_version": "1", + "id": "2a03abac-f1a2-49a9-a57c-7543739dd862", "filename": "a5d86cde-32ca-4ed6-b1a5-5a47575f2ac6_gdc_realn_rehead.bam", - "state": "validated", "version": "1", + "md5": "48686fcd84ac713d44261ca9e26b89fb", + "size": 6653119038, + "state": "validated", + "release": "12.0", + "latest_id": "2a03abac-f1a2-49a9-a57c-7543739dd862", "latest_filename": 
"a5d86cde-32ca-4ed6-b1a5-5a47575f2ac6_gdc_realn_rehead.bam", - "latest_release": [ - "13.0" - ], - "latest_state": "validated", - "release": "13.0", + "latest_version": "1", "latest_md5": "48686fcd84ac713d44261ca9e26b89fb", - "size": 6653119038, - "id": "2a03abac-f1a2-49a9-a57c-7543739dd862", - "md5": "48686fcd84ac713d44261ca9e26b89fb" + "latest_size": 6653119038, + "latest_state": "validated", + "latest_release": [ + "12.0", + "13.0", + "14.0", + "15.0", + "16.0", + "17.0", + "18.0", + "19.0", + "20.0", + "21.0", + "22.0", + "23.0", + "24.0", + "25.0", + "26.0", + "27.0" + ] } ] ``` @@ -448,11 +519,11 @@ curl --request POST --header "Content-Type: text/tsv" https://api.gdc.cancer.go }] ``` -### Cases Endpoint +### `Cases` Endpoint The GDC Cases Endpoint `https://api.gdc.cancer.gov/cases` enables search and retrieval of information related to a specific case. -__Note:__ The `cases` endpoint is designed to retrieve the metadata associated with one or more cases, including all nested biospecimen entities. Filters can be applied to retrieve information for entire cases, but not for lower-level biospecimen entities. For example, a sample within a case cannot be used to query for aliquots that are associated only with that sample. All aliquots associated with the case would be retrieved. +The `cases` endpoint is designed to retrieve the metadata associated with one or more cases, including all nested biospecimen entities. Filters can be applied to retrieve information for entire cases, but not for lower-level biospecimen entities. For example, a sample within a case cannot be used to query for aliquots that are associated only with that sample. All aliquots associated with the case would be retrieved. 
#### Example @@ -468,100 +539,109 @@ curl 'https://api.gdc.cancer.gov/cases?filters=%7B%22op%22%3A%22and%22%2C%22cont ``` ``` Output { - { - "data": { - "hits": [ - { - "updated_datetime": "2017-03-04T16:39:19.244769-06:00", - "submitter_analyte_ids": [ - "TCGA-BH-A0EA-01A-11R", - "TCGA-BH-A0EA-10A-01W", - "TCGA-BH-A0EA-01A-11W", - "TCGA-BH-A0EA-01A-11D", - "TCGA-BH-A0EA-10A-01D" - ], - "analyte_ids": [ - "fe678556-acf4-4bde-a95e-860bb0150a95", - "66ed0f86-5ca5-4dec-ba76-7ee4dcf31831", - "f19f408a-815f-43d9-8032-e9482b796371", - "69ddc092-88a0-4839-a2bb-9f1c9e760409", - "30cb470f-66d4-4085-8c30-83a42e8453d4" - ], - "submitter_id": "TCGA-BH-A0EA", - "case_id": "1f601832-eee3-48fb-acf5-80c4a454f26e", - "id": "1f601832-eee3-48fb-acf5-80c4a454f26e", - "disease_type": "Breast Invasive Carcinoma", - "sample_ids": [ - "9a6c71a6-82cd-42b1-a93f-f569370848d6", - "7f791228-dd77-4ab0-8227-d784a4c7fea1" - ], - "portion_ids": [ - "cb6086d1-3416-4310-b109-e8fa6e8b72d4", - "8629bf5a-cdaf-4f6a-90bb-27dd4a7565c5", - "ae4f5816-f97a-4605-9b05-9ab820467dee" - ], - "submitter_portion_ids": [ - "TCGA-BH-A0EA-01A-21-A13C-20", - "TCGA-BH-A0EA-01A-11", - "TCGA-BH-A0EA-10A-01" - ], - "created_datetime": null, - "slide_ids": [ - "90154ea1-6b76-4445-870e-d531d6fa1239", - "a0826f0d-986a-491b-8c6f-b34f8929f3ee" - ], - "state": "live", - "aliquot_ids": [ - "eef9dce1-6ba6-432b-bbe2-53c7dbe64fe7", - "cde982b7-3b0a-49eb-8710-a599cb0e44c1", - "b1a3739d-d554-4202-b96f-f25a444e2042", - "97c64d6a-7dce-4d0f-9cb3-b3e4eb4719c5", - "561b8777-801a-49ed-a306-e7dafeb044b6", - "42d050e4-e8ee-4442-b9c0-0ee14706b138", - "ca71ca96-cbb7-4eab-9487-251dda34e107", - "cfbd5476-e83a-401d-9f9a-639c73a0e35b", - "edad5bd3-efe0-4c5f-b05c-2c0c2951c45a", - "262715e1-835c-4f16-8ee7-6900e26f7cf5", - "2beb34c4-d493-4a73-b21e-de77d43251ff", - "bcb7fc6d-60a0-48b7-aa81-14c0dda72d76" - ], - "primary_site": "Breast", - "submitter_aliquot_ids": [ - "TCGA-BH-A0EA-10A-01D-A113-01", - "TCGA-BH-A0EA-01A-11R-A115-07", - 
"TCGA-BH-A0EA-01A-11D-A10Y-09", - "TCGA-BH-A0EA-01A-11D-A314-09", - "TCGA-BH-A0EA-01A-11R-A114-13", - "TCGA-BH-A0EA-01A-11D-A111-01", - "TCGA-BH-A0EA-01A-11D-A112-05", - "TCGA-BH-A0EA-01A-11D-A10X-02", - "TCGA-BH-A0EA-10A-01D-A110-09", - "TCGA-BH-A0EA-10A-01W-A12U-09", - "TCGA-BH-A0EA-10A-01D-A10Z-02", - "TCGA-BH-A0EA-01A-11W-A12T-09" - ], - "submitter_sample_ids": [ - "TCGA-BH-A0EA-10A", - "TCGA-BH-A0EA-01A" - ], - "submitter_slide_ids": [ - "TCGA-BH-A0EA-01A-01-MSA", - "TCGA-BH-A0EA-01A-01-TSA" - ] - } - ], - "pagination": { - "count": 1, - "sort": "", - "from": 0, - "page": 1, - "total": 1, - "pages": 1, - "size": 10 - } - }, - "warnings": {} - } + "data": { + "hits": [ + { + "id": "1f601832-eee3-48fb-acf5-80c4a454f26e", + "slide_ids": [ + "a0826f0d-986a-491b-8c6f-b34f8929f3ee", + "90154ea1-6b76-4445-870e-d531d6fa1239", + "1dd1cab5-5a81-428a-8153-91e8c4cf9905" + ], + "submitter_slide_ids": [ + "TCGA-BH-A0EA-01Z-00-DX1", + "TCGA-BH-A0EA-01A-01-MSA", + "TCGA-BH-A0EA-01A-01-TSA" + ], + "disease_type": "Ductal and Lobular Neoplasms", + "analyte_ids": [ + "f19f408a-815f-43d9-8032-e9482b796371", + "fe678556-acf4-4bde-a95e-860bb0150a95", + "69ddc092-88a0-4839-a2bb-9f1c9e760409", + "66ed0f86-5ca5-4dec-ba76-7ee4dcf31831", + "30cb470f-66d4-4085-8c30-83a42e8453d4" + ], + "submitter_id": "TCGA-BH-A0EA", + "submitter_analyte_ids": [ + "TCGA-BH-A0EA-10A-01D", + "TCGA-BH-A0EA-01A-11D", + "TCGA-BH-A0EA-01A-11R", + "TCGA-BH-A0EA-10A-01W", + "TCGA-BH-A0EA-01A-11W" + ], + "aliquot_ids": [ + "cde982b7-3b0a-49eb-8710-a599cb0e44c1", + "b1a3739d-d554-4202-b96f-f25a444e2042", + "eef9dce1-6ba6-432b-bbe2-53c7dbe64fe7", + "97c64d6a-7dce-4d0f-9cb3-b3e4eb4719c5", + "262715e1-835c-4f16-8ee7-6900e26f7cf5", + "cfbd5476-e83a-401d-9f9a-639c73a0e35b", + "bcb7fc6d-60a0-48b7-aa81-14c0dda72d76", + "561b8777-801a-49ed-a306-e7dafeb044b6", + "edad5bd3-efe0-4c5f-b05c-2c0c2951c45a", + "42d050e4-e8ee-4442-b9c0-0ee14706b138", + "2beb34c4-d493-4a73-b21e-de77d43251ff", + 
"ca71ca96-cbb7-4eab-9487-251dda34e107" + ], + "submitter_aliquot_ids": [ + "TCGA-BH-A0EA-10A-01W-A12U-09", + "TCGA-BH-A0EA-01A-11D-A111-01", + "TCGA-BH-A0EA-01A-11D-A314-09", + "TCGA-BH-A0EA-01A-11D-A10X-02", + "TCGA-BH-A0EA-10A-01D-A10Z-02", + "TCGA-BH-A0EA-10A-01D-A110-09", + "TCGA-BH-A0EA-01A-11D-A10Y-09", + "TCGA-BH-A0EA-10A-01D-A113-01", + "TCGA-BH-A0EA-01A-11D-A112-05", + "TCGA-BH-A0EA-01A-11R-A115-07", + "TCGA-BH-A0EA-01A-11W-A12T-09", + "TCGA-BH-A0EA-01A-11R-A114-13" + ], + "created_datetime": null, + "diagnosis_ids": [ + "84654ad5-2a2c-5c3b-8340-ecac6a5550fe" + ], + "sample_ids": [ + "55864d86-dab8-47bb-a3e3-8cfb198b06c1", + "9a6c71a6-82cd-42b1-a93f-f569370848d6", + "7f791228-dd77-4ab0-8227-d784a4c7fea1" + ], + "submitter_sample_ids": [ + "TCGA-BH-A0EA-01A", + "TCGA-BH-A0EA-01Z", + "TCGA-BH-A0EA-10A" + ], + "primary_site": "Breast", + "submitter_diagnosis_ids": [ + "TCGA-BH-A0EA_diagnosis" + ], + "updated_datetime": "2019-08-06T14:15:54.128069-05:00", + "case_id": "1f601832-eee3-48fb-acf5-80c4a454f26e", + "state": "released", + "portion_ids": [ + "cb6086d1-3416-4310-b109-e8fa6e8b72d4", + "8629bf5a-cdaf-4f6a-90bb-27dd4a7565c5", + "ae4f5816-f97a-4605-9b05-9ab820467dee" + ], + "submitter_portion_ids": [ + "TCGA-BH-A0EA-10A-01", + "TCGA-BH-A0EA-01A-21-A13C-20", + "TCGA-BH-A0EA-01A-11" + ] + } + ], + "pagination": { + "count": 1, + "total": 1, + "size": 10, + "from": 0, + "sort": "", + "page": 1, + "pages": 1 + } + }, + "warnings": {} +} ``` #### Retrieval of case metadata using individual UUIDs: @@ -574,131 +654,130 @@ curl 'https://api.gdc.cancer.gov/cases/1f601832-eee3-48fb-acf5-80c4a454f26e?pret ```Response { "data": { - "diagnoses": [ - { - "classification_of_tumor": "not reported", - "last_known_disease_status": "not reported", - "updated_datetime": "2016-05-16T10:59:16.740358-05:00", - "primary_diagnosis": "c50.9", - "submitter_id": "TCGA-BH-A0EA_diagnosis", - "tumor_stage": "stage iia", - "age_at_diagnosis": 26548.0, - "vital_status": "dead", - 
"morphology": "8500/3", - "days_to_death": 991.0, - "days_to_last_known_disease_status": null, - "days_to_last_follow_up": null, - "state": null, - "days_to_recurrence": null, - "diagnosis_id": "84654ad5-2a2c-5c3b-8340-ecac6a5550fe", - "tumor_grade": "not reported", - "tissue_or_organ_of_origin": "c50.9", - "days_to_birth": -26548.0, - "progression_or_recurrence": "not reported", - "prior_malignancy": "not reported", - "site_of_resection_or_biopsy": "c50.9", - "created_datetime": null - } - ], - "sample_ids": [ - "7f791228-dd77-4ab0-8227-d784a4c7fea1", - "9a6c71a6-82cd-42b1-a93f-f569370848d6" - ], - "portion_ids": [ - "cb6086d1-3416-4310-b109-e8fa6e8b72d4", - "8629bf5a-cdaf-4f6a-90bb-27dd4a7565c5", - "ae4f5816-f97a-4605-9b05-9ab820467dee" - ], - "submitter_portion_ids": [ - "TCGA-BH-A0EA-01A-11", - "TCGA-BH-A0EA-01A-21-A13C-20", - "TCGA-BH-A0EA-10A-01" - ], - "created_datetime": null, - "submitter_aliquot_ids": [ - "TCGA-BH-A0EA-01A-11R-A114-13", - "TCGA-BH-A0EA-01A-11D-A111-01", - "TCGA-BH-A0EA-01A-11W-A12T-09", - "TCGA-BH-A0EA-01A-11R-A114-13", - "TCGA-BH-A0EA-01A-11R-A115-07", - "TCGA-BH-A0EA-01A-11D-A111-01", - "TCGA-BH-A0EA-01A-11D-A314-09", - "TCGA-BH-A0EA-01A-11D-A112-05", - "TCGA-BH-A0EA-01A-11D-A10Y-09", - "TCGA-BH-A0EA-01A-11D-A10X-02", - "TCGA-BH-A0EA-01A-11W-A12T-09", - "TCGA-BH-A0EA-01A-11D-A10X-02", - "TCGA-BH-A0EA-01A-11D-A10Y-09", - "TCGA-BH-A0EA-01A-11D-A314-09", - "TCGA-BH-A0EA-01A-11R-A115-07", - "TCGA-BH-A0EA-01A-11D-A112-05", - "TCGA-BH-A0EA-10A-01D-A110-09", - "TCGA-BH-A0EA-10A-01D-A113-01", - "TCGA-BH-A0EA-10A-01W-A12U-09", - "TCGA-BH-A0EA-10A-01D-A10Z-02", - "TCGA-BH-A0EA-10A-01D-A113-01", - "TCGA-BH-A0EA-10A-01D-A110-09", - "TCGA-BH-A0EA-10A-01W-A12U-09", - "TCGA-BH-A0EA-10A-01D-A10Z-02" + "slide_ids": [ + "a0826f0d-986a-491b-8c6f-b34f8929f3ee", + "90154ea1-6b76-4445-870e-d531d6fa1239", + "1dd1cab5-5a81-428a-8153-91e8c4cf9905" ], - "updated_datetime": "2016-05-02T14:37:43.619198-05:00", - "submitter_analyte_ids": [ - "TCGA-BH-A0EA-01A-11R", 
- "TCGA-BH-A0EA-01A-11D", - "TCGA-BH-A0EA-01A-11W", - "TCGA-BH-A0EA-10A-01W", - "TCGA-BH-A0EA-10A-01D" + "submitter_slide_ids": [ + "TCGA-BH-A0EA-01Z-00-DX1", + "TCGA-BH-A0EA-01A-01-MSA", + "TCGA-BH-A0EA-01A-01-TSA" ], + "disease_type": "Ductal and Lobular Neoplasms", "analyte_ids": [ - "30cb470f-66d4-4085-8c30-83a42e8453d4", - "66ed0f86-5ca5-4dec-ba76-7ee4dcf31831", "f19f408a-815f-43d9-8032-e9482b796371", + "fe678556-acf4-4bde-a95e-860bb0150a95", "69ddc092-88a0-4839-a2bb-9f1c9e760409", - "fe678556-acf4-4bde-a95e-860bb0150a95" + "66ed0f86-5ca5-4dec-ba76-7ee4dcf31831", + "30cb470f-66d4-4085-8c30-83a42e8453d4" ], "submitter_id": "TCGA-BH-A0EA", - "case_id": "1f601832-eee3-48fb-acf5-80c4a454f26e", - "state": null, + "submitter_analyte_ids": [ + "TCGA-BH-A0EA-10A-01D", + "TCGA-BH-A0EA-01A-11D", + "TCGA-BH-A0EA-01A-11R", + "TCGA-BH-A0EA-10A-01W", + "TCGA-BH-A0EA-01A-11W" + ], "aliquot_ids": [ - "bcb7fc6d-60a0-48b7-aa81-14c0dda72d76", + "cde982b7-3b0a-49eb-8710-a599cb0e44c1", + "b1a3739d-d554-4202-b96f-f25a444e2042", + "eef9dce1-6ba6-432b-bbe2-53c7dbe64fe7", "97c64d6a-7dce-4d0f-9cb3-b3e4eb4719c5", - "edad5bd3-efe0-4c5f-b05c-2c0c2951c45a", + "262715e1-835c-4f16-8ee7-6900e26f7cf5", + "cfbd5476-e83a-401d-9f9a-639c73a0e35b", "bcb7fc6d-60a0-48b7-aa81-14c0dda72d76", - "ca71ca96-cbb7-4eab-9487-251dda34e107", - "97c64d6a-7dce-4d0f-9cb3-b3e4eb4719c5", - "eef9dce1-6ba6-432b-bbe2-53c7dbe64fe7", - "42d050e4-e8ee-4442-b9c0-0ee14706b138", "561b8777-801a-49ed-a306-e7dafeb044b6", - "262715e1-835c-4f16-8ee7-6900e26f7cf5", "edad5bd3-efe0-4c5f-b05c-2c0c2951c45a", - "262715e1-835c-4f16-8ee7-6900e26f7cf5", - "561b8777-801a-49ed-a306-e7dafeb044b6", - "eef9dce1-6ba6-432b-bbe2-53c7dbe64fe7", - "ca71ca96-cbb7-4eab-9487-251dda34e107", "42d050e4-e8ee-4442-b9c0-0ee14706b138", - "cfbd5476-e83a-401d-9f9a-639c73a0e35b", "2beb34c4-d493-4a73-b21e-de77d43251ff", - "b1a3739d-d554-4202-b96f-f25a444e2042", - "cde982b7-3b0a-49eb-8710-a599cb0e44c1", - "2beb34c4-d493-4a73-b21e-de77d43251ff", - 
"cfbd5476-e83a-401d-9f9a-639c73a0e35b", - "b1a3739d-d554-4202-b96f-f25a444e2042", - "cde982b7-3b0a-49eb-8710-a599cb0e44c1" + "ca71ca96-cbb7-4eab-9487-251dda34e107" ], - "slide_ids": [ - "90154ea1-6b76-4445-870e-d531d6fa1239", - "a0826f0d-986a-491b-8c6f-b34f8929f3ee" + "submitter_aliquot_ids": [ + "TCGA-BH-A0EA-10A-01W-A12U-09", + "TCGA-BH-A0EA-01A-11D-A111-01", + "TCGA-BH-A0EA-01A-11D-A314-09", + "TCGA-BH-A0EA-01A-11D-A10X-02", + "TCGA-BH-A0EA-10A-01D-A10Z-02", + "TCGA-BH-A0EA-10A-01D-A110-09", + "TCGA-BH-A0EA-01A-11D-A10Y-09", + "TCGA-BH-A0EA-10A-01D-A113-01", + "TCGA-BH-A0EA-01A-11D-A112-05", + "TCGA-BH-A0EA-01A-11R-A115-07", + "TCGA-BH-A0EA-01A-11W-A12T-09", + "TCGA-BH-A0EA-01A-11R-A114-13" + ], + "diagnoses": [ + { + "synchronous_malignancy": "Not Reported", + "ajcc_pathologic_stage": "Stage IIA", + "tumor_stage": "stage iia", + "days_to_diagnosis": 0, + "created_datetime": null, + "last_known_disease_status": "not reported", + "tissue_or_organ_of_origin": "Breast, NOS", + "days_to_last_follow_up": null, + "primary_diagnosis": "Infiltrating duct carcinoma, NOS", + "age_at_diagnosis": 26548, + "updated_datetime": "2019-08-08T16:25:42.215495-05:00", + "prior_malignancy": "yes", + "year_of_diagnosis": 2008, + "prior_treatment": "No", + "state": "released", + "days_to_last_known_disease_status": null, + "ajcc_staging_system_edition": "6th", + "ajcc_pathologic_t": "T1c", + "days_to_recurrence": null, + "morphology": "8500/3", + "ajcc_pathologic_n": "N1a", + "ajcc_pathologic_m": "M0", + "submitter_id": "TCGA-BH-A0EA_diagnosis", + "classification_of_tumor": "not reported", + "diagnosis_id": "84654ad5-2a2c-5c3b-8340-ecac6a5550fe", + "icd_10_code": "C50.9", + "site_of_resection_or_biopsy": "Breast, NOS", + "tumor_grade": "not reported", + "progression_or_recurrence": "not reported" + } + ], + "created_datetime": null, + "diagnosis_ids": [ + "84654ad5-2a2c-5c3b-8340-ecac6a5550fe" + ], + "sample_ids": [ + "55864d86-dab8-47bb-a3e3-8cfb198b06c1", + 
"9a6c71a6-82cd-42b1-a93f-f569370848d6", + "7f791228-dd77-4ab0-8227-d784a4c7fea1" ], "submitter_sample_ids": [ "TCGA-BH-A0EA-01A", + "TCGA-BH-A0EA-01Z", "TCGA-BH-A0EA-10A" + ], + "primary_site": "Breast", + "submitter_diagnosis_ids": [ + "TCGA-BH-A0EA_diagnosis" + ], + "updated_datetime": "2019-08-06T14:15:54.128069-05:00", + "case_id": "1f601832-eee3-48fb-acf5-80c4a454f26e", + "state": "released", + "portion_ids": [ + "cb6086d1-3416-4310-b109-e8fa6e8b72d4", + "8629bf5a-cdaf-4f6a-90bb-27dd4a7565c5", + "ae4f5816-f97a-4605-9b05-9ab820467dee" + ], + "submitter_portion_ids": [ + "TCGA-BH-A0EA-10A-01", + "TCGA-BH-A0EA-01A-21-A13C-20", + "TCGA-BH-A0EA-01A-11" ] }, "warnings": {} } ``` -### Annotations Endpoint +### `Annotations` Endpoint The GDC Annotation Endpoint `https://api.gdc.cancer.gov/annotations` enables search and retrieval of annotations stored in the GDC. @@ -801,7 +880,7 @@ curl 'https://api.gdc.cancer.gov/annotations?filters=%7B%22op%22%3A%22in%22%2C%2 "warnings": {} } ``` -### History Endpoint +### `History` Endpoint The GDC History Endpoint `https://api.gdc.cancer.gov/history` enables search and retrieval of version and release information about a file. This endpoint will return the entire provenance of all versions of a file. A file may be versioned if a file is updated by the GDC (e.g. using a new alignment algorithm or fixing a file that contained an error). `Version` refers to the instance of a particular file. `Release` refers to which data release a file was part of. A file may be a part of many different data releases with no change in version number or content. 
@@ -814,11 +893,11 @@ This example is a query for versioning information associated with the follow wi curl 'https://api.gdc.cancer.gov/history/1dd28069-5777-4ff9-bd2b-d1ba68e88b06' ``` ``` Output -[{"release_date": "2018-07-23", "version": "1", "uuid": "1dd28069-5777-4ff9-bd2b-d1ba68e88b06", "file_change": "released", "data_release": "13.0"}] +[{"uuid": "1dd28069-5777-4ff9-bd2b-d1ba68e88b06", "version": "1", "file_change": "released", "release_date": "2018-08-23", "data_release": "12.0"}] ``` -### \_mapping Endpoint +### `_mapping` Endpoint Each search and retrieval endpoint is equipped with a ```_mapping``` endpoint that provides information about available fields. For example, `files/_mapping` endpoint provides information about fields and field groups available at the `files` endpoint: `https://api.gdc.cancer.gov/files/_mapping`. @@ -851,23 +930,24 @@ Each part of the response is described below: curl 'https://api.gdc.cancer.gov/projects/_mapping' ``` ```output +This output was put through a JSON formatting application for easier viewing. { - ... + ... - "_mapping": { - "projects.disease_type": { - "doc_type": "projects", - "field": "disease_type", - "type": "id" - }, - "projects.name": { - "doc_type": "projects", - "field": "name", - "type": "id" - } - } + "_mapping": { + "projects.disease_type": { + "doc_type": "projects", + "field": "disease_type", + "type": "id" + }, + "projects.name": { + "doc_type": "projects", + "field": "name", + "type": "id" + } + } - ... + ... } ``` @@ -967,16 +1047,19 @@ This example requests `male` cases using HTTP GET. 
The JSON object to be passed to the GDC API looks like: - {"op": "=", - "content": { - "field": "cases.demographic.gender", - "value": ["male"] - } - } + { + "op": "=", + "content": { + "field": "cases.demographic.gender", + "value": [ + "male" + ] + } + } -URL-encoding the above JSON object using [Percent-(URL)-encoding tool](https://www.beautifyconverter.com/json-escape-unescape.php) results in the following string: +URL-encoding the above JSON object using [Percent-(URL)-encoding tool](https://www.freeformatter.com/url-encoder.html) results in the following string: - %7b%22op%22%3a+%22%3d%22%2c%0d%0a++++++%22content%22%3a+%7b%0d%0a++++++++++%22field%22%3a+%22cases.clinical.gender%22%2c%0d%0a++++++++++%22value%22%3a+%5b%22male%22%5d%0d%0a++++++%7d%0d%0a%7d + %7B%0D%0A++++%22op%22%3A+%22%3D%22%2C%0D%0A++++%22content%22%3A+%7B%0D%0A++++++++%22field%22%3A+%22cases.demographic.gender%22%2C%0D%0A++++++++%22value%22%3A+%5B%0D%0A++++++++++++%22male%22%0D%0A++++++++%5D%0D%0A++++%7D%0D%0A%7D The above string can now be passed to the GDC API using the `filters` parameter: @@ -1003,690 +1086,863 @@ print json.dumps(response.json(), indent=2) "data": { "hits": [ { - "sample_ids": [ - "1d014bf1-95ae-42e3-ae39-97ff4841d8ca", - "6b685bfc-651b-48d1-8e68-32c8096ea205" + "id": "f8970455-bfb2-4b1d-ab71-3c5d619898ad", + "slide_ids": [ + "324684b5-8f18-4aa3-9b32-78382b96760b", + "542a84f2-35e5-4843-9e98-c3d4bf0efe34" ], - "portion_ids": [ - "c061217a-266a-496d-8a96-3489191afa87", - "0d3a6a58-0e00-4889-bc73-5ddb5a387738", - "e858ee92-0438-48e9-a70d-80ef2c0ad539" + "submitter_slide_ids": [ + "TCGA-ZN-A9VQ-01Z-00-DX1", + "TCGA-ZN-A9VQ-01A-01-TS1" ], - "submitter_portion_ids": [ - "TCGA-66-2770-01A-21-2193-20", - "TCGA-66-2770-01A-01", - "TCGA-66-2770-11A-01" + "disease_type": "Mesothelial Neoplasms", + "analyte_ids": [ + "73491451-b44e-41b3-be8c-1d7e44e54d08", + "eda45603-7e65-4160-9963-e8907e7248b2", + "f8f4c5d9-b09d-46d4-9fc1-afbebae1a81d" ], - "created_datetime": null, - 
"submitter_aliquot_ids": [ - "TCGA-66-2770-01A-01D-1522-08", - "TCGA-66-2770-01A-01D-0848-05", - "TCGA-66-2770-01A-01W-0879-09", - "TCGA-66-2770-11A-01W-0878-08", - "TCGA-66-2770-01A-01R-0849-01", - "TCGA-66-2770-01A-01W-0877-08", - "TCGA-66-2770-01A-01D-0846-06", - "TCGA-66-2770-11A-01W-0880-09", - "TCGA-66-2770-01A-01D-0964-09", - "TCGA-66-2770-11A-01D-0846-06", - "TCGA-66-2770-01A-01D-0845-04", - "TCGA-66-2770-01A-01W-0881-10", - "TCGA-66-2770-11A-01D-0963-08", - "TCGA-66-2770-11A-01D-0844-01", - "TCGA-66-2770-01A-01R-0851-07", - "TCGA-66-2770-11A-01W-0882-10", - "TCGA-66-2770-11A-01D-1522-08", - "TCGA-66-2770-01A-01T-1557-13", - "TCGA-66-2770-01A-01D-0847-02", - "TCGA-66-2770-01A-01D-0844-01", - "TCGA-66-2770-11A-01D-0847-02", - "TCGA-66-2770-11A-01D-0964-09", - "TCGA-66-2770-01A-01D-0963-08", - "TCGA-66-2770-01A-01R-0850-03", - "TCGA-66-2770-11A-01D-0845-04", - "TCGA-66-2770-01A-01T-0852-07" - ], - "updated_datetime": "2016-05-02T15:57:03.730994-05:00", + "submitter_id": "TCGA-ZN-A9VQ", "submitter_analyte_ids": [ - "TCGA-66-2770-01A-01D", - "TCGA-66-2770-11A-01W", - "TCGA-66-2770-01A-01T", - "TCGA-66-2770-01A-01W", - "TCGA-66-2770-01A-01R", - "TCGA-66-2770-11A-01D" + "TCGA-ZN-A9VQ-01A-11D", + "TCGA-ZN-A9VQ-01A-11R", + "TCGA-ZN-A9VQ-10A-01D" + ], + "submitter_aliquot_ids": [ + "TCGA-ZN-A9VQ-01A-11R-A40A-07", + "TCGA-ZN-A9VQ-10A-01D-A39T-01", + "TCGA-ZN-A9VQ-01A-11D-A39S-05", + "TCGA-ZN-A9VQ-01A-11D-A40F-26", + "TCGA-ZN-A9VQ-01A-11D-A39Q-01", + "TCGA-ZN-A9VQ-01A-11D-A761-36", + "TCGA-ZN-A9VQ-01A-11R-A404-13", + "TCGA-ZN-A9VQ-01A-11D-A39R-32", + "TCGA-ZN-A9VQ-10A-01D-A39U-32", + "TCGA-ZN-A9VQ-10A-01D-A761-36", + "TCGA-ZN-A9VQ-10A-01D-A40G-26" ], - "analyte_ids": [ - "385807d3-78de-4558-8d93-702d93fc835a", - "247acc7a-b4f5-47e9-86da-5ea9b04ad444", - "151b8cb9-6b0a-4db9-9b0e-62aa501b35d9", - "e549aebd-4dda-4ea8-8ccf-56c03bc8b2be", - "631ad4eb-845a-4e70-96ad-4b40157218a8", - "9a75640e-09d4-42b7-8cb4-75d62b39e98a" - ], - "submitter_id": "TCGA-66-2770", - "case_id": 
"f1b357e4-d67a-42c9-b0b7-12f69fa3da58", - "state": null, "aliquot_ids": [ - "a2d10f8e-6b27-4df0-bd25-ac24992d0bb4", - "8c1c733a-abed-468f-b4d0-d1ac34ba6d8b", - "cad8d384-3b7a-4f70-89c2-5584ae75c5eb", - "42e774cf-3c4a-4efd-9665-378cb6b4afac", - "3755168b-f5da-422d-847a-566cb112a8d7", - "cae4d249-ba67-4316-8761-7e71e3813182", - "aa6e700c-ce01-4cc9-87de-8bf615a8aa1a", - "ad5c4069-e616-4ab4-9b03-b196f9189b20", - "07c26ea4-0584-4cb0-8e5a-d057b8fe6c14", - "f95c2cb5-d20a-4f1f-8f2a-95a2d37fbdc4", - "817bf327-e583-4704-b294-c3645dcc4adf", - "2246cb75-38bd-491f-b6ee-99f4781f2564", - "a81b9090-626d-492d-9baf-7fa3ef70111c", - "5cd6f026-894e-45f6-bc59-d6f056e63846", - "e417903d-ab76-44f0-aae9-3a91fa9a8d3c", - "1d809a56-31ca-49d8-a57b-e773236b24de", - "df60a743-ef4b-43ea-bc5a-4d75e8befb8a", - "871350e2-958f-401c-ae86-6bc880a01942", - "3dc4207d-5671-4c3d-b75a-d39ef69b564c", - "69b77cc0-d00a-4ea3-9b39-3e3019d9e292", - "3d035ee8-9523-4771-8738-c8a5a2f91403", - "775e46bd-e56f-40fa-9891-aaedc1d49395", - "d1c60049-922a-42d4-bd7e-8cf4ace47f05", - "5220a53f-f3fc-476c-aa72-65a038eb2fd8", - "b7e44e6e-ccf9-4b75-a258-159912ab51ca", - "42750622-28d7-4d32-9262-b139fe77bc01" + "26e89986-bfd5-4d4e-a3ff-5ad612c48358", + "3d7835c0-388f-4cdc-9fe3-2dae11b71daa", + "3f076394-cb86-46ea-8cdc-88f385c6b54e", + "0b2cbe7c-8392-4072-95af-9ded20aa3888", + "6762b668-a952-4c66-8a63-af909dbdc3ec", + "dd85d29b-5883-4292-849a-8706698ff32b", + "0e3a5bc1-9fe1-49e6-9ae8-fe7e86198acd", + "4c1a209f-b005-4137-a344-c0befa66047c", + "418d6b09-cffc-4ca4-84eb-7c9b2b5aacaf", + "a09f740a-9529-4688-be12-978f13054e1e", + "c32fb3d0-2894-4859-be14-30f03e0d8997" ], - "slide_ids": [ - "a10196d2-7a81-4e1e-a9a7-62d123c30875", - "72edc1ba-916d-42a2-9f22-6254c6e54c5c", - "ff15eeb9-550e-4c78-90cc-a6cce8ccc3df", - "71ccfb52-169d-4176-94d6-fff5b75f853d" + "created_datetime": null, + "diagnosis_ids": [ + "68b5b7ee-bbbe-502c-b087-f325c4ccde09" + ], + "sample_ids": [ + "95830203-ccab-4a0d-8daf-e2b67ab95b86", + 
"089c6901-5fe6-48b0-97ab-39f00609255c", + "546ba1f1-7e16-4701-875a-8e9dd426fb76" ], "submitter_sample_ids": [ - "TCGA-66-2770-11A", - "TCGA-66-2770-01A" + "TCGA-ZN-A9VQ-10A", + "TCGA-ZN-A9VQ-01Z", + "TCGA-ZN-A9VQ-01A" + ], + "submitter_diagnosis_ids": [ + "TCGA-ZN-A9VQ_diagnosis" + ], + "primary_site": "Heart, mediastinum, and pleura", + "updated_datetime": "2019-08-06T14:39:56.656272-05:00", + "case_id": "f8970455-bfb2-4b1d-ab71-3c5d619898ad", + "portion_ids": [ + "5b4b99dc-a44e-4679-8b19-d1d78020aa9f", + "3e8db2c5-d5b3-4a9b-9ef5-948f16fe5cac", + "ae23fda4-25ac-44ad-bfba-75e393b4bec5" + ], + "state": "released", + "submitter_portion_ids": [ + "TCGA-ZN-A9VQ-01A-21-A45O-20", + "TCGA-ZN-A9VQ-01A-11", + "TCGA-ZN-A9VQ-10A-01" ] }, { - "sample_ids": [ - "06889714-2a40-4248-98ee-f690b301e36a", - "9f43a0c6-ea19-4021-b0ed-026f33ce1c33" + "id": "c739fd61-22b2-412d-bcf3-89bda45a2c0f", + "slide_ids": [ + "6aa9b64a-9624-424e-87be-2ca9902794a2", + "1c5ef953-b382-4b73-8bda-f2cfe8c86874" ], - "portion_ids": [ - "3a001d28-7cf9-4c61-b155-73938aebaa25", - "79554cfd-e853-481e-8e37-1e296034094e" + "submitter_slide_ids": [ + "TCGA-3H-AB3X-01A-01-TS1", + "TCGA-3H-AB3X-01Z-00-DX1" ], - "submitter_portion_ids": [ - "TCGA-02-0075-01A-01", - "TCGA-02-0075-10A-01" + "disease_type": "Mesothelial Neoplasms", + "analyte_ids": [ + "0be4a69c-50e5-4634-b3ad-cf75f5cea8c5", + "e208e95c-f999-41c2-b8df-6ee04af51f83", + "7f0ed3c8-f3b2-47bf-a911-fe6463502315" ], - "created_datetime": null, - "submitter_aliquot_ids": [ - "TCGA-02-0075-01A-01W-0204-02", - "TCGA-02-0075-01A-01R-0194-03", - "TCGA-02-0075-01A-01D-0198-02", - "TCGA-02-0075-01A-01R-0202-01", - "TCGA-02-0075-10A-01W-0207-09", - "TCGA-02-0075-01A-01R-0676-04", - "TCGA-02-0075-10A-01D-0198-02", - "TCGA-02-0075-10A-01D-0197-06", - "TCGA-02-0075-10A-01D-0193-01", - "TCGA-02-0075-01A-01W-0207-09", - "TCGA-02-0075-01A-01W-0206-08", - "TCGA-02-0075-01A-01D-0193-01", - "TCGA-02-0075-10A-01W-0205-10", - "TCGA-02-0075-01A-01R-0201-02", - 
"TCGA-02-0075-10A-01W-0204-02", - "TCGA-02-0075-01A-01D-0199-05", - "TCGA-02-0075-10A-01W-0206-08", - "TCGA-02-0075-01A-01D-0196-04", - "TCGA-02-0075-01A-01T-0195-07", - "TCGA-02-0075-10A-01D-0196-04", - "TCGA-02-0075-01A-01D-0197-06", - "TCGA-02-0075-01A-01D-0888-01", - "TCGA-02-0075-01A-01R-0195-07", - "TCGA-02-0075-01A-01W-0205-10" - ], - "updated_datetime": "2016-05-02T15:00:01.972331-05:00", + "submitter_id": "TCGA-3H-AB3X", "submitter_analyte_ids": [ - "TCGA-02-0075-01A-01R", - "TCGA-02-0075-10A-01D", - "TCGA-02-0075-01A-01W", - "TCGA-02-0075-01A-01T", - "TCGA-02-0075-01A-01D", - "TCGA-02-0075-10A-01W" + "TCGA-3H-AB3X-01A-11R", + "TCGA-3H-AB3X-10A-01D", + "TCGA-3H-AB3X-01A-11D" ], - "analyte_ids": [ - "fec22de0-a2b9-45df-9854-1ebe76cee84e", - "b4d11c50-61f1-4d4a-815f-1c0413018d7f", - "c48673d0-a38d-44e1-8cfd-e91cb23ea2d5", - "24f1852c-999a-4ea8-917c-fcfd683e2aca", - "aa431260-a0fc-4924-80ce-61cab8b5e83e", - "11f21140-d761-44ca-a9b2-b24099df3b15" - ], - "submitter_id": "TCGA-02-0075", - "case_id": "b196f82b-ef3f-4e05-99f7-da5df65e691e", - "state": null, "aliquot_ids": [ - "75531fe0-101e-4220-bd47-98892c90ee70", - "e5ea38d4-f47c-4c8a-8bab-13631e0a9a7b", - "d48b7c2c-daac-4496-af8f-1f45ca43f627", - "bbba08fc-2514-4e15-afb7-41eecc7e876f", - "0685b37f-a47c-4222-a846-bf9f3c000de3", - "683986da-3cee-446d-9b7a-83bef25815c9", - "e6ffdb20-a1be-4664-bcd3-cc7a4de6f40b", - "5d1f25c0-9e1a-41ad-9735-134f39dbf70e", - "528b40b9-246f-4ba3-8209-777136638e62", - "33131479-5d69-4262-a549-ba8864320f3b", - "5c7822fc-cf4f-4f62-8482-7c0ce1b7ab9a", - "b95e7659-e3a4-4e96-b98c-f67d26b85322", - "30c84aca-f9db-4e07-ac34-1a92b1652ca1", - "d5e3b5cc-06e0-4294-9d3c-8f3b63acae3d", - "b14b3d09-3a7f-41a6-81df-2757efa67906", - "513040e2-dc29-4e2c-86fb-57371eede17a", - "21c3be1b-7c1e-4864-99d1-486cfe5d8f1d", - "5e28e5dc-6dfa-44a9-8793-9134cb4cdda5", - "b8c25892-4773-428f-a02c-f930931268e8", - "266d5260-08e4-4cec-87f3-ca415bd98575", - "8859a3ae-f85d-4ef2-830b-80f42f98d53e", - 
"ac018a8c-a6e2-4291-a4bf-a330ae9c441e", - "4b022f7f-7549-4d97-9d41-4e5f2e9ec74c", - "caad3dfa-74a9-4ecc-95c1-86f6fbfd4ab5" + "ba8caacf-7a47-48db-b375-58b2a417d073", + "4026d79f-155e-48e9-91de-dedbf201f55a", + "c895dbf4-5753-489a-b83a-dc7d80456388", + "d0253b2c-99e0-49ff-b7f6-314bf1729cbf", + "4a21ef3b-2d39-4588-8763-d2c26254932a", + "537d8c34-d69a-4982-92d1-4d2d48e8b9c5", + "41b7c449-47f2-470f-9f92-f3a4a02c8549", + "3127573c-6700-4b82-9262-2bcc9c72b56c", + "57dbf875-55a2-4a18-bccb-3a16df3ddbfa" ], - "slide_ids": [ - "39f547cd-5dc3-4bf4-99ea-073bb161c23c", - "5f096267-0cc2-4cc5-a206-7357159633d7" + "submitter_aliquot_ids": [ + "TCGA-3H-AB3X-10A-01D-A39U-32", + "TCGA-3H-AB3X-01A-11D-A39S-05", + "TCGA-3H-AB3X-01A-11R-A40A-07", + "TCGA-3H-AB3X-01A-11D-A39Q-01", + "TCGA-3H-AB3X-01A-11D-A40F-26", + "TCGA-3H-AB3X-01A-11R-A404-13", + "TCGA-3H-AB3X-01A-11D-A39R-32", + "TCGA-3H-AB3X-10A-01D-A39T-01", + "TCGA-3H-AB3X-10A-01D-A40G-26" + ], + "created_datetime": null, + "diagnosis_ids": [ + "9252ba53-0bd5-5f49-b154-aa1dd6473fd5" + ], + "sample_ids": [ + "276c6d7b-712a-465c-913f-320de285cad4", + "b06b0a8a-e992-42aa-a6aa-e85685bbe3f9", + "ae48bc7d-fc5a-4d63-9399-783e1d7f50d6" ], "submitter_sample_ids": [ - "TCGA-02-0075-10A", - "TCGA-02-0075-01A" + "TCGA-3H-AB3X-01Z", + "TCGA-3H-AB3X-10A", + "TCGA-3H-AB3X-01A" + ], + "submitter_diagnosis_ids": [ + "TCGA-3H-AB3X_diagnosis" + ], + "primary_site": "Bronchus and lung", + "updated_datetime": "2019-08-06T14:39:45.057305-05:00", + "case_id": "c739fd61-22b2-412d-bcf3-89bda45a2c0f", + "portion_ids": [ + "7ec81582-1cb3-4f34-a68f-23a9d9804658", + "72bd54de-7e9f-46de-b62e-7994c9d4ad4d" + ], + "state": "released", + "submitter_portion_ids": [ + "TCGA-3H-AB3X-10A-01", + "TCGA-3H-AB3X-01A-11" ] }, { - "sample_ids": [ - "ba08195b-31cf-4bb1-a470-23740225c99d", - "929889c4-e474-4104-b69b-fac7e414a59e" + "id": "ae90972d-5bc2-4b53-b5ff-1b8c31f39342", + "slide_ids": [ + "e94c1249-1f03-4a7f-9161-d1714ef546b2", + "e8bdc1d8-01c0-40a7-b308-cb5681f28b2f" 
], - "portion_ids": [ - "48a36eb4-79fb-45e7-8bb1-0fa1d5fcda2c", - "1de5e67a-ac3f-4c18-92c4-27ba1868c7ac", - "e09fc5e7-e8d2-4bf9-b12b-17b22e0387e4" + "submitter_slide_ids": [ + "TCGA-CQ-A4CH-01A-01-TSA", + "TCGA-CQ-A4CH-01Z-00-DX1" ], - "submitter_portion_ids": [ - "TCGA-EJ-A8FU-10A-01", - "TCGA-EJ-A8FU-01A-21-A43L-20", - "TCGA-EJ-A8FU-01A-11" + "disease_type": "Squamous Cell Neoplasms", + "analyte_ids": [ + "be4729cb-4f7d-4c9b-8df7-2de2328c40d4", + "53e0774f-2a36-4100-a73a-5fd8a45c4169", + "51e4cd09-e408-4077-9ec3-7c805296a016", + "d99212f6-8c23-44c4-8c30-87fc4166c9d0", + "f3adfcba-e014-43e0-8c9b-09039b84613e" ], - "created_datetime": null, - "submitter_aliquot_ids": [ - "TCGA-EJ-A8FU-01A-11R-A36B-13", - "TCGA-EJ-A8FU-01A-11R-A36G-07", - "TCGA-EJ-A8FU-01A-11D-A363-01", - "TCGA-EJ-A8FU-10A-01D-A361-01", - "TCGA-EJ-A8FU-10A-01D-A362-08", - "TCGA-EJ-A8FU-01A-11W-A447-08", - "TCGA-EJ-A8FU-01A-11D-A365-05", - "TCGA-EJ-A8FU-01A-11D-A364-08", - "TCGA-EJ-A8FU-10A-01W-A446-08" - ], - "updated_datetime": "2016-05-02T15:57:04.948573-05:00", + "submitter_id": "TCGA-CQ-A4CH", "submitter_analyte_ids": [ - "TCGA-EJ-A8FU-01A-11W", - "TCGA-EJ-A8FU-01A-11D", - "TCGA-EJ-A8FU-01A-11R", - "TCGA-EJ-A8FU-10A-01W", - "TCGA-EJ-A8FU-10A-01D" + "TCGA-CQ-A4CH-10A-01W", + "TCGA-CQ-A4CH-01A-11D", + "TCGA-CQ-A4CH-01A-11R", + "TCGA-CQ-A4CH-10A-01D", + "TCGA-CQ-A4CH-01A-11W" + ], + "submitter_aliquot_ids": [ + "TCGA-CQ-A4CH-01A-11R-A266-07", + "TCGA-CQ-A4CH-10A-01D-A25X-01", + "TCGA-CQ-A4CH-01A-11R-A25Z-13", + "TCGA-CQ-A4CH-01A-11W-A296-08", + "TCGA-CQ-A4CH-10A-01W-A296-08", + "TCGA-CQ-A4CH-10A-01D-A25Y-08", + "TCGA-CQ-A4CH-01A-11D-A25X-01", + "TCGA-CQ-A4CH-01A-11D-A265-05", + "TCGA-CQ-A4CH-01A-11D-A25Y-08" ], - "analyte_ids": [ - "2d4e4925-6ac8-498f-882b-4bbf319f6b7b", - "8d09b982-1256-4674-b383-d6ca4b4bb3c8", - "c74495d9-63bf-4ac0-b10e-04b3b06103c1", - "b9884d98-af57-4901-8b9d-4fdbf73d2c5a", - "2f16ac02-13bf-44fd-bbd7-658c1c384928" - ], - "submitter_id": "TCGA-EJ-A8FU", - "case_id": 
"23e56e08-e11d-4e83-88a8-1254675b3af8", - "state": null, "aliquot_ids": [ - "e77da017-5dc6-4e32-9568-755e4ee9b533", - "c9b286d1-d500-4bb3-bb3d-5bf40b1b1265", - "b7867d52-7987-46d4-a595-0ff5b5375a58", - "5586ad35-94b7-459e-8982-8e7fb25697a1", - "162a63f7-594f-4669-a06d-b4899c7fe86a", - "b8b1ab44-ee6e-4ac5-9efd-d5bd07e67b9c", - "7adcdf73-3ad3-4da7-ab27-2888f1d4f53a", - "eb498e52-3eae-402f-8cac-ec930f8d938d", - "293f781c-c2c7-479b-b1a6-5f951a2c5e5a" + "9eb34130-b3b9-4c92-8238-4053f8c6d06b", + "0c99ef22-b1ef-42c9-a184-4c98f674c7be", + "b77aa7fa-403b-4ee1-b537-695a799c80f5", + "3aeb1426-58f0-47bd-825e-8d1578dda18b", + "afd797fb-97d2-4482-ae15-753f5f66b828", + "73074dac-057b-456d-b0b8-761b149d10dc", + "48bbbc5e-34c0-46be-a3bb-2d28c2e7d357", + "818db1ea-5ae7-4d50-87cf-a85c62830566", + "92e8b340-322e-4411-95d8-7db81767f660" ], - "slide_ids": [ - "454a95d5-d084-4f36-b1f1-32c6c23ab46e" + "created_datetime": null, + "diagnosis_ids": [ + "7e574325-c82b-5054-bf4a-038f528c4110" + ], + "sample_ids": [ + "273490d1-9862-4480-86aa-12522b35fe24", + "c7dc33be-768c-42a2-a1ab-2eb5b67be87e", + "465d6949-f5a4-4a9d-a082-1f1403876d85" ], "submitter_sample_ids": [ - "TCGA-EJ-A8FU-01A", - "TCGA-EJ-A8FU-10A" + "TCGA-CQ-A4CH-01Z", + "TCGA-CQ-A4CH-01A", + "TCGA-CQ-A4CH-10A" + ], + "submitter_diagnosis_ids": [ + "TCGA-CQ-A4CH_diagnosis" + ], + "primary_site": "Other and unspecified parts of tongue", + "updated_datetime": "2019-08-06T14:25:53.026261-05:00", + "case_id": "ae90972d-5bc2-4b53-b5ff-1b8c31f39342", + "portion_ids": [ + "3bf8dee3-ac07-4ab2-bb1c-c8b564e6b3e1", + "8c802162-a804-437a-9f54-93de7a4c21b3", + "5131b158-ca97-4afe-b236-5d6fc41f70fd" + ], + "state": "released", + "submitter_portion_ids": [ + "TCGA-CQ-A4CH-01A-11", + "TCGA-CQ-A4CH-10A-01", + "TCGA-CQ-A4CH-01A-21-A45L-20" ] }, { + "id": "24c1cf70-bd67-431e-a623-d20f8d3f52b2", + "slide_ids": [ + "0fb2f319-414d-4bd4-bd88-e61303208dfb", + "847e5a3f-c88a-40a3-926e-563de0a26ca0", + "15737384-02ba-47f3-8655-c11ba975edc5", + 
"50044cf9-fee9-4745-8904-8f5240d4d18d" + ], + "submitter_slide_ids": [ + "TCGA-CJ-4881-01Z-00-DX1", + "TCGA-CJ-4881-01A-01-BS1", + "TCGA-CJ-4881-11A-01-TS1", + "TCGA-CJ-4881-01A-01-TS1" + ], + "disease_type": "Adenomas and Adenocarcinomas", + "analyte_ids": [ + "2f46942c-7f9c-4dab-ac3b-fb5abf0ca11c", + "986f008d-d05c-4dda-88fc-858bf4f1f379", + "e90c3295-7c25-4e34-9b89-eeb6838081e4", + "2fe03fe3-1036-40d2-81ea-9933be17e4d9", + "020b312a-7b12-49b0-9135-189eb5e1e42a", + "21fddcbd-2f48-4733-845e-089a4e9076a7", + "4a25c9f2-f1ff-4cb2-ba9a-77e55961c488" + ], + "submitter_id": "TCGA-CJ-4881", + "submitter_analyte_ids": [ + "TCGA-CJ-4881-11A-01X", + "TCGA-CJ-4881-01A-01R", + "TCGA-CJ-4881-11A-01D", + "TCGA-CJ-4881-01A-01X", + "TCGA-CJ-4881-01A-01W", + "TCGA-CJ-4881-01A-01D", + "TCGA-CJ-4881-11A-01W" + ], + "submitter_aliquot_ids": [ + "TCGA-CJ-4881-01A-01D-1303-05", + "TCGA-CJ-4881-01A-01R-1305-07", + "TCGA-CJ-4881-01A-01X-1371-10", + "TCGA-CJ-4881-11A-01X-1371-10", + "TCGA-CJ-4881-01A-01R-1762-13", + "TCGA-CJ-4881-01A-01D-2098-10", + "TCGA-CJ-4881-01A-01W-1369-10", + "TCGA-CJ-4881-01A-01R-1304-13", + "TCGA-CJ-4881-11A-01D-1302-01", + "TCGA-CJ-4881-11A-01D-2098-10", + "TCGA-CJ-4881-11A-01D-1303-05", + "TCGA-CJ-4881-11A-01W-1369-10", + "TCGA-CJ-4881-01A-01D-1373-10", + "TCGA-CJ-4881-01A-01D-1301-02", + "TCGA-CJ-4881-11A-01D-1301-02", + "TCGA-CJ-4881-11A-01D-1373-10", + "TCGA-CJ-4881-01A-01D-1302-01" + ], + "aliquot_ids": [ + "581cae5b-a55e-4bd4-a9da-48b5480615c0", + "00d54c43-aba8-4503-827a-30444e38c704", + "290dd57c-0f01-431d-8b72-5f25f1a00ca7", + "63f47ca8-5a98-4c54-8d83-9f0f9c9f4559", + "ece4609c-ffe3-4330-a92b-c4847b618d77", + "30ec4e95-d07b-4a2d-9869-7b9008eb8d8b", + "e373d58b-dccf-49e3-ab23-c177883ea2bc", + "521d43d9-882f-42dd-ad21-ef5b6df5dae9", + "67d59b58-b34c-4616-8661-b58cbb32e726", + "9efca5db-e210-468a-a38e-9fcc83d3f113", + "26281187-0846-4578-8e8a-ad9886493af7", + "495dbab8-1e78-4cb4-b3e4-0ffda17c823a", + "ae20e2d0-5d39-4a94-a9ff-dee71503cbfe", + 
"d883e20c-5237-420c-a58b-98ca359f6b2a", + "ec4d0eff-cbe4-4dbb-8319-1f4b0b4a5d35", + "5d2fada9-5a0f-41b9-b602-9675623191ca", + "ffbf81f1-b5a5-4739-9f35-716169650023" + ], + "created_datetime": null, + "diagnosis_ids": [ + "f9c1962f-13de-5ffe-b201-9d125adc590f" + ], "sample_ids": [ - "d43f0112-fe59-4842-9fda-1189e5fb7248", - "213cbbe5-c382-47a1-b936-bf40c2c99091" + "f00de7e8-d54d-4f07-85bc-8fa3f1d56b0c", + "1f8ce8ea-74b6-4d91-81e7-b689d11c26cd", + "534a3864-e8f4-462f-81a5-c0a3895cb68f" + ], + "submitter_sample_ids": [ + "TCGA-CJ-4881-01Z", + "TCGA-CJ-4881-01A", + "TCGA-CJ-4881-11A" ], + "submitter_diagnosis_ids": [ + "TCGA-CJ-4881_diagnosis" + ], + "primary_site": "Kidney", + "updated_datetime": "2019-08-06T14:29:28.932622-05:00", + "case_id": "24c1cf70-bd67-431e-a623-d20f8d3f52b2", "portion_ids": [ - "26441aae-22e5-4e69-b3f5-34ccde356c93", - "60d7a93c-0634-438e-a72a-ce63630bb890", - "246a8f01-7ef2-4737-a984-49aa0b41c089" + "4f1a0956-ee24-4ba1-b8a3-b23fb8a26601", + "1a7f6ad8-97f3-416d-8d1e-1752bb6638f7", + "367e98f4-181c-4656-b28e-855fa6f265af" ], + "state": "released", "submitter_portion_ids": [ - "TCGA-F2-6879-10A-01", - "TCGA-F2-6879-01A-21-A39M-20", - "TCGA-F2-6879-01A-11" - ], - "created_datetime": "2016-05-02T16:23:44.347995-05:00", - "submitter_aliquot_ids": [ - "TCGA-F2-6879-01A-11R-2155-13", - "TCGA-F2-6879-10A-01D-2153-01", - "TCGA-F2-6879-10A-01D-2152-26", - "TCGA-F2-6879-01A-11D-2157-05", - "TCGA-F2-6879-10A-01D-2154-08", - "TCGA-F2-6879-01A-11D-A45X-08", - "TCGA-F2-6879-01A-11D-2154-08", - "TCGA-F2-6879-01A-11W-2179-08", - "TCGA-F2-6879-01A-11D-2153-01", - "TCGA-F2-6879-01A-11R-2156-07", - "TCGA-F2-6879-01A-11D-2152-26", - "TCGA-F2-6879-10A-01D-A45X-08", - "TCGA-F2-6879-10A-01W-2179-08", - "TCGA-F2-6879-01A-01D-YYYY-23" - ], - "updated_datetime": "2016-05-02T16:23:44.347995-05:00", + "TCGA-CJ-4881-11A-01", + "TCGA-CJ-4881-01A-01", + "TCGA-CJ-4881-01A-21-1739-20" + ] + }, + { + "id": "12adefc4-c9fd-46d3-904b-8fc52d5f1913", + "slide_ids": [ + 
"802aa0c2-d722-46b1-9a66-29333abba34c", + "6a956f2d-0040-498f-b0e6-68839fe8f668", + "3d0c023e-7023-4e11-931f-61c72105d828", + "cbd62e61-cc59-4533-bca2-518ce48c1ab3", + "d56c89b6-cda6-45f7-965a-1cf6dc037ad1" + ], + "submitter_slide_ids": [ + "TCGA-B0-4696-01A-01-TS1", + "TCGA-B0-4696-11A-01-TS1", + "TCGA-B0-4696-11A-01-BS1", + "TCGA-B0-4696-01A-01-BS1", + "TCGA-B0-4696-01Z-00-DX1" + ], + "disease_type": "Adenomas and Adenocarcinomas", + "analyte_ids": [ + "a5399e3a-54ba-4971-b72a-a7bea9312511", + "41488c12-a3cd-4118-bc9d-abf8d28eac27", + "3da18be3-3bf7-475f-947e-5b2c22e7c046", + "6bb85779-d0a1-47ca-aacc-540c907604f4", + "06fd554d-32bb-4c13-aec8-33619b77fea6", + "474d8e2f-76b8-4cb3-b166-62266be78c42", + "26bf50f5-c51a-4c06-b68a-c23f271d0077" + ], + "submitter_id": "TCGA-B0-4696", "submitter_analyte_ids": [ - "TCGA-F2-6879-10A-01D", - "TCGA-F2-6879-01A-11R", - "TCGA-F2-6879-10A-01W", - "TCGA-F2-6879-01A-11W", - "TCGA-F2-6879-01A-11D" + "TCGA-B0-4696-01A-01W", + "TCGA-B0-4696-01A-01X", + "TCGA-B0-4696-11A-01D", + "TCGA-B0-4696-11A-01W", + "TCGA-B0-4696-01A-01R", + "TCGA-B0-4696-11A-01X", + "TCGA-B0-4696-01A-01D" ], - "analyte_ids": [ - "e87dde8d-3bf5-42d8-9a77-620d5c4943e0", - "30ade77d-996b-4031-93ab-6b341d49eb0a", - "1d94bd70-6621-4a94-8102-d673663e6665", - "ea65d92e-1597-410d-84d8-abb2a6235b3e", - "79697034-1cec-4d92-8195-8a35258ab477" - ], - "submitter_id": "TCGA-F2-6879", - "case_id": "8d9bd437-8b4b-4da5-87ba-6b5790f05022", - "state": null, "aliquot_ids": [ - "e7533585-b062-4d74-b511-05dc806a1357", - "e107952a-cc2b-4410-b0f9-62e7115430a0", - "61f1c8b1-986a-485a-9d96-4e4285b6425a", - "c043e276-fece-4cb9-a848-a0b16e6099b6", - "e5d110e1-63ad-49ce-b9b7-22bbd7ef8a88", - "7accb08d-acdb-46bc-bf7f-b9f678193115", - "a52cd04b-41d6-40db-b050-00ef3a143f7e", - "207fcf5e-c422-4333-9ec2-5dab38d240c7", - "5ddd3f83-28a8-4b7f-9aec-203a3c2efbe5", - "ccd4dd70-c0e4-42cf-870e-33d1013b201a", - "e12314fe-f16a-4d85-95b4-e712ede450f6", - "695461e3-283c-4b5b-9325-6b2588b67fd8", - 
"8481be1e-0993-487d-8d73-b0eb72b304ee", - "d7200791-4f1c-418f-8744-91b793486d9f" + "3c57f0c5-340c-478f-af17-55c0bda50cce", + "1701e9f3-22ba-4fd9-9a88-d310a3c9dc1e", + "c9c0689c-da29-476a-84bb-f52ae5698b50", + "f04d25de-d349-4ed4-94c7-aaf24528e77b", + "f00ce2ee-45c4-4232-8758-d4de6d85ea46", + "5ddfdefe-9b9d-438b-b55b-2953a2541378", + "504da228-bbe6-490f-9f59-94f6f00adb31", + "e9aedb52-652a-48b2-a9ef-e3a1010e5874", + "e3847aa2-047f-46ba-bb5c-e18d725defc2", + "92590459-3a3e-4160-9c57-30e078258103", + "57dac8d5-58bf-4beb-ad24-d34437d1b3e7", + "9e5b6046-1d2c-44af-8430-421ec0036bb6", + "a744ca75-7994-453c-a283-3db5778844f9", + "3128f70f-0a05-41dc-8657-2312f8af4104", + "70544bed-6e2a-45f0-9b7c-e4f44705935f", + "65197a42-c28f-47ee-909f-64555fb8478e" ], - "slide_ids": [ - "bcbcc947-cab1-4400-aebc-1d9e251a3ce8", - "cae8d0b9-3605-40af-bf99-7c23df8110a9" + "submitter_aliquot_ids": [ + "TCGA-B0-4696-01A-01R-1277-07", + "TCGA-B0-4696-01A-01D-1275-05", + "TCGA-B0-4696-11A-01D-1275-05", + "TCGA-B0-4696-11A-01D-1274-01", + "TCGA-B0-4696-01A-01D-2096-10", + "TCGA-B0-4696-11A-01D-1273-02", + "TCGA-B0-4696-01A-01X-1360-10", + "TCGA-B0-4696-01A-01D-1361-10", + "TCGA-B0-4696-01A-01D-1274-01", + "TCGA-B0-4696-11A-01X-1360-10", + "TCGA-B0-4696-11A-01W-1359-10", + "TCGA-B0-4696-01A-01W-1359-10", + "TCGA-B0-4696-01A-01D-1273-02", + "TCGA-B0-4696-11A-01D-1361-10", + "TCGA-B0-4696-11A-01D-2096-10", + "TCGA-B0-4696-01A-01R-1276-13" + ], + "created_datetime": null, + "diagnosis_ids": [ + "34b6cdff-d3b0-55e6-9e92-4ff8e9f7803f" + ], + "sample_ids": [ + "1ee83ac8-c90b-4364-9e4f-2ed76978ccda", + "1784f393-cf44-4bba-a0fd-98d0f6c50b83", + "9e30a07a-f227-4b6a-9c80-619abed6f7bd" ], "submitter_sample_ids": [ - "TCGA-F2-6879-10A", - "TCGA-F2-6879-01A" + "TCGA-B0-4696-01A", + "TCGA-B0-4696-11A", + "TCGA-B0-4696-01Z" + ], + "submitter_diagnosis_ids": [ + "TCGA-B0-4696_diagnosis" + ], + "primary_site": "Kidney", + "updated_datetime": "2019-08-06T14:28:04.080572-05:00", + "case_id": 
"12adefc4-c9fd-46d3-904b-8fc52d5f1913", + "portion_ids": [ + "1f4528b1-44c0-459d-a71e-b3c11575b6f4", + "e8148187-d209-4ee6-a261-c2860402f02f", + "2f3b0ffd-223c-4573-86e4-afdef6116698" + ], + "state": "released", + "submitter_portion_ids": [ + "TCGA-B0-4696-01A-02-1738-20", + "TCGA-B0-4696-01A-01", + "TCGA-B0-4696-11A-01" ] }, { - "sample_ids": [ - "3a66b5bd-7037-463c-9f8d-2ba3de9d5571", - "84f603d6-9f71-48fb-b2e3-190424407452" + "id": "c0edde5e-d229-4061-8820-14afc712c5b6", + "slide_ids": [ + "b57e7804-588d-4e71-909b-257fa877e2ad", + "fc91a512-85a5-4479-ab07-9b8bd96fdf91" ], - "portion_ids": [ - "fe90de9f-8ee3-4d55-834f-a90538958cb7", - "7a0042fd-07f0-4894-adb0-03cebce8aa02" + "submitter_slide_ids": [ + "TCGA-HD-A633-01A-01-TS1", + "TCGA-HD-A633-01Z-00-DX1" ], - "submitter_portion_ids": [ - "TCGA-VQ-A922-01A-11", - "TCGA-VQ-A922-10A-01" + "disease_type": "Squamous Cell Neoplasms", + "analyte_ids": [ + "c8cd9c1f-6688-4c3a-9414-ebede9a06622", + "cac22161-1b5d-4436-8ba3-12fba6aae76f", + "6b410bfe-a420-4268-a2a3-f3aac24e8761", + "22a7918c-6a39-444e-b995-2ba9e60eae90", + "a07ff300-0c9c-4b70-88a8-b4ac22936e0d" ], - "created_datetime": "2016-05-02T16:26:23.121974-05:00", - "submitter_aliquot_ids": [ - "TCGA-VQ-A922-10A-01D-A412-01", - "TCGA-VQ-A922-01A-11D-A40Z-01", - "TCGA-VQ-A922-10A-01D-A413-08", - "TCGA-VQ-A922-01A-01D-YYYY-23", - "TCGA-VQ-A922-01A-11R-A414-31", - "TCGA-VQ-A922-01A-11D-A410-08", - "TCGA-VQ-A922-01A-11R-A415-13", - "TCGA-VQ-A922-01A-11D-A411-05" - ], - "updated_datetime": "2016-05-02T16:26:23.121974-05:00", + "submitter_id": "TCGA-HD-A633", "submitter_analyte_ids": [ - "TCGA-VQ-A922-01A-11R", - "TCGA-VQ-A922-10A-01D", - "TCGA-VQ-A922-01A-11D" + "TCGA-HD-A633-10A-01D", + "TCGA-HD-A633-10A-01W", + "TCGA-HD-A633-01A-11W", + "TCGA-HD-A633-01A-11R", + "TCGA-HD-A633-01A-11D" ], - "analyte_ids": [ - "15bec495-04c7-412b-ad69-26b1f9274ccf", - "26a24673-04a1-4837-b888-702b0578aef2", - "2c0ecd67-b9ff-4e60-8d2f-7744c79a13aa" + "submitter_aliquot_ids": [ + 
"TCGA-HD-A633-01A-11W-A316-08", + "TCGA-HD-A633-01A-11D-A28R-08", + "TCGA-HD-A633-10A-01D-A28U-08", + "TCGA-HD-A633-01A-11R-A28V-07", + "TCGA-HD-A633-01A-11D-A28Q-01", + "TCGA-HD-A633-01A-11R-A28Z-13", + "TCGA-HD-A633-10A-01D-A28T-01", + "TCGA-HD-A633-10A-01W-A317-08", + "TCGA-HD-A633-01A-11D-A28S-05" ], - "submitter_id": "TCGA-VQ-A922", - "case_id": "8bd783a3-d6c9-4c87-a2a1-09f903b9c7ca", - "state": null, "aliquot_ids": [ - "58a121b4-265c-44ae-b6a9-79d087ee8b34", - "76fbba49-0123-4524-89aa-a1818c5507cb", - "0b0805bb-edaa-400f-ae9f-effed3dbb605", - "3370d626-d572-4d13-9cd3-1823a5df3d34", - "60934993-a9df-4389-b64d-da6844ef22df", - "243f24ba-bb0f-44e0-bcb1-69a97b395981", - "6cae9f2a-1c6c-4645-98b6-20719aec1413", - "44d020d1-c516-4a15-94e8-bcf0cb9c2683" + "19657bce-8994-4dc6-8923-6d3959874dec", + "10a6dce3-7446-4dc5-8df0-b36558aa8449", + "0eb29e0f-dff7-47fe-a184-f4d3247298dc", + "b492257f-17a5-48de-aee3-a21a9bedf5ee", + "db856f4b-d854-4bab-b92e-755dfdb8acea", + "d67c2939-353f-4299-88ea-e3d640fc4ac8", + "31641d8f-733a-469b-ae84-907430e8bf0a", + "36aead4f-8d43-426b-ad2c-7dcb249f579e", + "895828e4-db59-4f6a-9df3-c423ec3ec6b7" ], - "slide_ids": [ - "0ff02899-57f8-419e-8872-c6ede53f4d3c" + "created_datetime": null, + "diagnosis_ids": [ + "3c27a0ae-0496-5d58-b9c6-1b6362b5da05" ], - "submitter_sample_ids": [ - "TCGA-VQ-A922-10A", - "TCGA-VQ-A922-01A" - ] - }, - { "sample_ids": [ - "5bb5bd60-cf47-413b-88fa-f14977e24035", - "82fcf670-1646-4a28-9578-f7e5b2f426e5", - "3b87fed0-cfbd-4ee3-b71d-ab595853e836" - ], - "portion_ids": [ - "18bf160e-702a-464a-9920-f115024b5484", - "10a9c093-009d-4bc0-a344-2afd3f0f9b9f", - "8ebd06e1-5eda-47ec-8888-61965ecf005e" - ], - "submitter_portion_ids": [ - "TCGA-HU-8243-11A-01", - "TCGA-HU-8243-01A-11", - "TCGA-HU-8243-10A-01" + "ec6326e4-1a3e-4afd-ae55-6a9fdb872d93", + "6ad70a8a-2d7e-406a-a419-b805b72eba56", + "1b65e4f9-45cb-47ed-bbae-e3f189b1c8f8" ], - "created_datetime": "2016-05-02T16:17:09.754748-05:00", - "submitter_aliquot_ids": [ - 
"TCGA-HU-8243-01A-01D-YYYY-23", - "TCGA-HU-8243-01A-11D-2340-08", - "TCGA-HU-8243-01A-11D-2338-01", - "TCGA-HU-8243-01A-11D-2342-05", - "TCGA-HU-8243-11A-01D-2338-01", - "TCGA-HU-8243-11A-01D-2340-08", - "TCGA-HU-8243-10A-01D-2339-01", - "TCGA-HU-8243-01A-11R-2343-13", - "TCGA-HU-8243-10A-01D-2341-08" - ], - "updated_datetime": "2016-05-02T16:17:09.754748-05:00", - "submitter_analyte_ids": [ - "TCGA-HU-8243-11A-01D", - "TCGA-HU-8243-10A-01D", - "TCGA-HU-8243-01A-11R", - "TCGA-HU-8243-01A-11D" + "submitter_sample_ids": [ + "TCGA-HD-A633-10A", + "TCGA-HD-A633-01A", + "TCGA-HD-A633-01Z" ], - "analyte_ids": [ - "89c9094d-5cf6-4c7d-ad24-41b7ad9427cc", - "2c413e60-0122-426b-afb3-ae94810e2513", - "57d41760-0fed-49d2-8606-48231cb244ea", - "37ed51fd-b540-408e-8bd6-4447ae4aa84a" - ], - "submitter_id": "TCGA-HU-8243", - "case_id": "77a8eab6-f6a1-4739-9031-75ead40d68cb", - "state": null, - "aliquot_ids": [ - "ace3edd6-14a9-42cc-84f3-6127237f2913", - "a711abd1-f1c2-4e42-8b66-79b4514ac1c4", - "6af7ba34-58f7-4472-8c7e-89fc91ad5ac1", - "558ff67a-a584-46f8-9089-8f4a08015294", - "71c0a224-5953-4b59-a49c-b7aa1e959f1e", - "a460c222-bcac-4959-961f-4dbd73e1ce13", - "6e5789d7-4988-457a-86eb-e618c7ab06eb", - "ff31f56b-398c-45ee-b122-f10027774527", - "9635cfd4-3d26-4fc6-846c-fd74d5b60098" + "submitter_diagnosis_ids": [ + "TCGA-HD-A633_diagnosis" ], - "slide_ids": [ - "60b7c6b8-594a-40c3-9341-a0902e4e6938", - "e55e00a0-2048-404a-b83a-f34106468694" + "primary_site": "Other and unspecified parts of mouth", + "updated_datetime": "2019-08-06T14:26:51.527876-05:00", + "case_id": "c0edde5e-d229-4061-8820-14afc712c5b6", + "portion_ids": [ + "dd7bf6a5-9e94-470a-8fb4-9f2a2a56c173", + "d4475b19-78d8-4b71-ac0e-86a4230aa0cd", + "9c1c0a78-2d57-4157-bff1-00d5bb818c26" ], - "submitter_sample_ids": [ - "TCGA-HU-8243-10A", - "TCGA-HU-8243-01A", - "TCGA-HU-8243-11A" + "state": "released", + "submitter_portion_ids": [ + "TCGA-HD-A633-01A-11", + "TCGA-HD-A633-10A-01", + "TCGA-HD-A633-01A-21-A45M-20" ] }, { - 
"sample_ids": [ - "2f5cc9c9-31a9-5eb3-952a-b21e7cef50ca", - "4f3f4fc8-4465-5230-83ec-c0ef6aceb2ea" + "id": "ae2051a3-f851-4cb3-af18-029f4574b179", + "slide_ids": [ + "90f06679-0397-4688-8c75-b1855efa4bd1", + "f8c610bf-7789-4b57-8a74-cf29106be142", + "30e6a59b-fdc9-4976-9040-e1a851232b1b", + "e6b71383-bd0f-4f2f-9483-f0048522c234" + ], + "submitter_slide_ids": [ + "TCGA-CV-7245-01A-01-TS1", + "TCGA-CV-7245-01A-01-BS1", + "TCGA-CV-7245-01Z-00-DX1", + "TCGA-CV-7245-11A-01-TS1" + ], + "disease_type": "Squamous Cell Neoplasms", + "analyte_ids": [ + "4215b417-de72-49da-87f5-efee18f6166f", + "2acff637-98f6-454d-8217-70d42e5612c1", + "a55e555d-4858-4153-9047-4ce50794efe8", + "b30b89ec-e0f2-49b8-b47d-489730000460", + "b8c80ab1-d3f7-43da-8968-6c846f03724d", + "17ae48ec-047f-4cca-8182-a2635ce772be", + "f8115c36-56c0-4e68-bc96-a26331972298", + "d4c8a05a-a194-48fb-8c5d-2f9391613dcb" + ], + "submitter_id": "TCGA-CV-7245", + "submitter_analyte_ids": [ + "TCGA-CV-7245-11A-01R", + "TCGA-CV-7245-01A-11R", + "TCGA-CV-7245-01A-11D", + "TCGA-CV-7245-10A-01D", + "TCGA-CV-7245-11A-01D", + "TCGA-CV-7245-11A-01W", + "TCGA-CV-7245-01A-11W", + "TCGA-CV-7245-10A-01W" ], - "updated_datetime": "2016-05-25T19:12:45.610324-05:00", "submitter_aliquot_ids": [ - "TARGET-30-PAUXFZ-01A-01D", - "TARGET-30-PAUXFZ-10A-01D" + "TCGA-CV-7245-01A-11R-2015-13", + "TCGA-CV-7245-11A-01R-2015-13", + "TCGA-CV-7245-10A-01W-2033-08", + "TCGA-CV-7245-11A-01D-2010-01", + "TCGA-CV-7245-11A-01R-2016-07", + "TCGA-CV-7245-01A-11D-2014-05", + "TCGA-CV-7245-10A-01D-2011-01", + "TCGA-CV-7245-11A-01D-2014-05", + "TCGA-CV-7245-01A-11R-2016-07", + "TCGA-CV-7245-11A-01D-2012-08", + "TCGA-CV-7245-10A-01D-2013-08", + "TCGA-CV-7245-01A-11D-2012-08", + "TCGA-CV-7245-11A-01W-2033-08", + "TCGA-CV-7245-01A-11W-2032-08", + "TCGA-CV-7245-01A-11D-2010-01" ], - "submitter_id": "TARGET-30-PAUXFZ", - "case_id": "a7ccef7c-14c0-5232-b647-58b4a54fb343", "aliquot_ids": [ - "9e1e30a8-7607-5b7e-b33c-9a6c5828d5fb", - 
"c56898f9-c394-516a-bdbb-bf32a5af9d3f" + "c1a21df1-8265-4040-843a-2785b98b3463", + "95b2722d-f9c7-4d7f-8a70-7603080ecac5", + "265cfe54-10da-45c9-b0d1-2d5c65308be1", + "d73e8c20-b392-475e-936b-10d6bb173e6c", + "ce0c0c2e-902a-4ecb-b753-e696750ca407", + "0ead3f08-6ce9-4eea-8ac2-c8021f427f2b", + "0f19fa35-7bf4-4cec-b463-8b1e01dfefa0", + "56291b3c-595c-4388-a264-9037a48401d8", + "f84f8f63-06bf-40c7-8ba8-439aaf360c89", + "bdc58b10-1756-45bf-a34f-fa4095429476", + "081cec95-360f-4127-9fbc-e6bfd212c23f", + "61897747-36dc-4aad-b17a-fe935c605ca3", + "f28ead58-aa1c-490b-8b4c-e7dc2d76a563", + "b87cdd87-3663-4e8a-99f2-7d475f939cb6", + "fe4ca5c3-0767-4264-be28-2bb9891c0c02" + ], + "created_datetime": null, + "diagnosis_ids": [ + "cceca036-354c-59e4-ad7a-886865aada46" ], - "submitter_sample_ids": [ - "TARGET-30-PAUXFZ-01A", - "TARGET-30-PAUXFZ-10A" - ] - }, - { "sample_ids": [ - "c1bcb8d1-e13d-4af4-93f4-02d5f7f616a2", - "52fcf737-cdcc-43ea-b33c-4018039b42dd" + "aaa6607c-e333-4f99-b1dc-390c3cad4f1b", + "c8ac9749-82ff-4348-8d91-5ce677697a15", + "f0684e03-2d91-4e9b-9d11-a3063ccc2f34", + "d455327a-a947-4a67-8fb5-44438abe5976" ], + "submitter_sample_ids": [ + "TCGA-CV-7245-10A", + "TCGA-CV-7245-01A", + "TCGA-CV-7245-01Z", + "TCGA-CV-7245-11A" + ], + "submitter_diagnosis_ids": [ + "TCGA-CV-7245_diagnosis" + ], + "primary_site": "Larynx", + "updated_datetime": "2019-08-06T14:26:16.536997-05:00", + "case_id": "ae2051a3-f851-4cb3-af18-029f4574b179", "portion_ids": [ - "e0e97a05-656a-468e-8418-0d08c38e76ab", - "3e2a0eab-7d89-4f3c-9c0e-8942e53d3c45" + "9d94c9a4-9d71-4843-86f7-be076d939415", + "90abd8dd-a40a-4433-9ae1-856021300eb4", + "3cfc5361-e915-4237-bc4d-a2b06f48cc3e", + "8bb93715-41cd-436a-8ee5-a35ea050b6c0" ], + "state": "released", "submitter_portion_ids": [ - "TCGA-KK-A8I9-01A-11", - "TCGA-KK-A8I9-11A-11" + "TCGA-CV-7245-01A-13-2074-20", + "TCGA-CV-7245-11A-01", + "TCGA-CV-7245-10A-01", + "TCGA-CV-7245-01A-11" + ] + }, + { + "id": "57959b73-534d-450c-b09d-f70ef1ffee25", + 
"slide_ids": [ + "f7e2faec-0fd2-41f2-82dc-8a888a1c01a5", + "c9a0789f-cb9b-48f5-a8ce-489071fd618c", + "bb49cb73-66de-4c20-a6ef-8483da3fc0e2", + "f09ecda2-cab1-4f2b-a616-26f9ce4fd479", + "09b3f4a8-1366-41a0-b2ea-2dc010f431f8" + ], + "submitter_slide_ids": [ + "TCGA-BP-4346-01A-01-TS1", + "TCGA-BP-4346-11A-01-BS1", + "TCGA-BP-4346-01A-01-BS1", + "TCGA-BP-4346-01Z-00-DX1", + "TCGA-BP-4346-11A-01-TS1" + ], + "disease_type": "Adenomas and Adenocarcinomas", + "analyte_ids": [ + "ce37810b-cf9e-4c0c-962f-432047dae3e7", + "ac583369-7b4e-4b96-ae8e-ae5769c075a5", + "5d9e9e98-e84d-4e33-8c73-b2ea951bbdbd", + "4cf5f5d3-436d-4f40-9beb-c3d2299c008b", + "a2debb38-75ec-456c-944a-d90aaa9fa032", + "f75a4f5c-5d8b-423e-b2b8-718b818a025b", + "8ab159b0-57f8-44b5-916e-3d0d709ffc38" + ], + "submitter_id": "TCGA-BP-4346", + "submitter_analyte_ids": [ + "TCGA-BP-4346-01A-01W", + "TCGA-BP-4346-01A-01D", + "TCGA-BP-4346-11A-01D", + "TCGA-BP-4346-11A-01X", + "TCGA-BP-4346-01A-01X", + "TCGA-BP-4346-01A-01R", + "TCGA-BP-4346-11A-01W" ], - "created_datetime": null, "submitter_aliquot_ids": [ - "TCGA-KK-A8I9-11A-11D-A361-01", - "TCGA-KK-A8I9-11A-11D-A362-08", - "TCGA-KK-A8I9-11A-11W-A446-08", - "TCGA-KK-A8I9-01A-11R-A36G-07", - "TCGA-KK-A8I9-11A-11D-A40C-01", - "TCGA-KK-A8I9-01A-11D-A363-01", - "TCGA-KK-A8I9-01A-11W-A447-08", - "TCGA-KK-A8I9-01A-11D-A365-05", - "TCGA-KK-A8I9-01A-11D-A364-08", - "TCGA-KK-A8I9-01A-11R-A36B-13" - ], - "updated_datetime": "2016-05-02T15:57:29.451686-05:00", - "submitter_analyte_ids": [ - "TCGA-KK-A8I9-11A-11W", - "TCGA-KK-A8I9-01A-11R", - "TCGA-KK-A8I9-11A-11D", - "TCGA-KK-A8I9-01A-11W", - "TCGA-KK-A8I9-01A-11D" + "TCGA-BP-4346-11A-01X-1364-10", + "TCGA-BP-4346-11A-01D-1283-01", + "TCGA-BP-4346-01A-01D-1283-01", + "TCGA-BP-4346-01A-01D-1282-02", + "TCGA-BP-4346-01A-01D-1284-05", + "TCGA-BP-4346-01A-01X-1364-10", + "TCGA-BP-4346-11A-01D-1366-10", + "TCGA-BP-4346-01A-01D-1366-10", + "TCGA-BP-4346-01A-01W-1362-10", + "TCGA-BP-4346-11A-01D-1284-05", + 
"TCGA-BP-4346-11A-01D-2097-10", + "TCGA-BP-4346-01A-01R-1289-07", + "TCGA-BP-4346-01A-01R-1288-13", + "TCGA-BP-4346-01A-01D-2097-10", + "TCGA-BP-4346-11A-01D-1282-02", + "TCGA-BP-4346-11A-01W-1362-10" ], - "analyte_ids": [ - "ddec19cb-5e4c-4151-8b6d-741044abff1e", - "96c5b539-8eb7-4156-81d0-7b7fecd68900", - "ced38a45-7610-49d4-8bf9-d53a1fc2d489", - "476f5deb-1b3f-4a35-8a31-f27763ba8d8a", - "c284f2af-1e9b-40cc-8936-b61cfd251d62" - ], - "submitter_id": "TCGA-KK-A8I9", - "case_id": "261c3d74-706e-4751-bd15-8f3c1a402ff0", - "state": null, "aliquot_ids": [ - "4f76de2d-e07a-402b-9818-7f04d3704a43", - "96802a73-b1db-47d7-8f5f-4504f3ece5ad", - "f376fc45-370a-4d96-833b-9a1322e32a42", - "d3e88dd3-66d7-40d4-978a-4ddab868373a", - "06f1d087-75c9-4da8-8339-80aff3bfaa12", - "50b1e243-b45a-42a1-8692-b7ae5d51250f", - "0f1c00d3-f3dc-4d2b-bd8a-ecc31e4f4089", - "986a3ed6-ba56-4025-a2bd-9909648e703a", - "bebc84b6-9179-420b-8207-858b999e8c0c", - "239d5e7e-5fb5-4df3-ae6b-a5a06ee296ae" + "81ad330a-2e60-4a6e-b301-6564875f41c4", + "b3ceb69a-6e16-41cf-9aa1-539b48aacca2", + "956d13ed-7b60-4e15-a6d2-adfc3ecff4f8", + "f1b05fbc-cda4-4015-a496-6d30c592fa3d", + "e680ec7b-d88c-4da8-b339-da043fda3dc6", + "a8b5426b-ff3a-435f-bffc-90bb4a06d77d", + "20735253-1ba8-41c9-b4ab-682c1af79b9d", + "391da188-dd73-40a8-afae-2f523354b95a", + "60bdedba-bdea-4734-b7ff-a171f274e7d4", + "109292f2-4bc0-4eff-8e5f-c829f51b835b", + "753e5d3b-0334-4202-8af4-45bc9aad100e", + "f58cda65-eaf7-4f4b-af5a-5c05618814d3", + "d758addf-a866-45b6-819a-159f814206cb", + "4f350288-6283-4f87-b1b4-6afae2edfa85", + "69e027cf-071a-4b3e-97de-c450555f38c1", + "0ca5fa21-f692-40cc-b501-cb3b670530b8" ], - "slide_ids": [ - "1e174ca5-9298-41b6-a705-728f111a3e7b", - "a3e31324-9e06-4799-85b4-4f6236848009" + "created_datetime": null, + "diagnosis_ids": [ + "c1d7ae9c-c61e-5509-aa35-7555f1d35493" + ], + "sample_ids": [ + "a6180591-a96d-4d38-9937-5963fac6b3df", + "18c6c530-51e7-4e43-a1b9-f152d17c7c47", + "5306e996-869d-4cee-a990-dea934af0cb8" ], 
"submitter_sample_ids": [ - "TCGA-KK-A8I9-11A", - "TCGA-KK-A8I9-01A" + "TCGA-BP-4346-01Z", + "TCGA-BP-4346-11A", + "TCGA-BP-4346-01A" + ], + "submitter_diagnosis_ids": [ + "TCGA-BP-4346_diagnosis" + ], + "primary_site": "Kidney", + "updated_datetime": "2019-08-06T14:28:51.268056-05:00", + "case_id": "57959b73-534d-450c-b09d-f70ef1ffee25", + "portion_ids": [ + "8092a672-1506-4e71-a75d-ee15c8f4cbc4", + "386408e6-2197-4edc-a459-21bb444fbc7f", + "28206fd9-253b-4f2b-94f1-1e558edd32ed" + ], + "state": "released", + "submitter_portion_ids": [ + "TCGA-BP-4346-01A-03-1737-20", + "TCGA-BP-4346-11A-01", + "TCGA-BP-4346-01A-01" ] }, { - "sample_ids": [ - "d43f727a-96d6-40b8-86ae-7a3e0aa46853", - "b8329a6d-a87b-47f4-ad00-9e979e62647b" + "id": "714496c5-d221-4397-9c5a-cd2d22603e6f", + "slide_ids": [ + "db5c34b9-d2c2-4bc7-a4e4-eb47eb806a1b", + "80e8f54c-2976-4dcb-a9c3-0954e841aec3", + "9c045280-8ea3-4ca5-a20b-4483dacc4c5c" ], - "portion_ids": [ - "8960ddcc-0950-4d6e-a557-8727b652c93b", - "e36bfd07-c911-4a98-8424-e58e5e9aaa68" + "submitter_slide_ids": [ + "TCGA-CN-6013-01A-01-TS1", + "TCGA-CN-6013-01Z-00-DX1", + "TCGA-CN-6013-01A-01-BS1" ], - "submitter_portion_ids": [ - "TCGA-QR-A70H-10A-01", - "TCGA-QR-A70H-01A-12" + "disease_type": "Squamous Cell Neoplasms", + "analyte_ids": [ + "8a90c5f9-c866-4109-8058-b7da2ae79d4a", + "3eeca312-02c3-4e20-898d-04e3b8aa0aad", + "1b1ac5e8-63e9-4f9f-9231-00e0f91ae576", + "016a6a0b-6d11-4074-a8b4-98c74599a586", + "17b12617-8570-4d5f-85d6-39dc5a47dca6" ], - "created_datetime": null, - "submitter_aliquot_ids": [ - "TCGA-QR-A70H-01A-12R-A35K-07", - "TCGA-QR-A70H-01A-12R-A35M-13", - "TCGA-QR-A70H-01A-12D-A35E-05", - "TCGA-QR-A70H-10A-01D-A35A-01", - "TCGA-QR-A70H-01A-12D-A35C-01", - "TCGA-QR-A70H-01A-12W-A43Z-08", - "TCGA-QR-A70H-10A-01D-A35B-08", - "TCGA-QR-A70H-10A-01W-A441-08", - "TCGA-QR-A70H-01A-12D-A35D-08" - ], - "updated_datetime": "2016-05-02T15:37:31.996088-05:00", + "submitter_id": "TCGA-CN-6013", "submitter_analyte_ids": [ - 
"TCGA-QR-A70H-10A-01D", - "TCGA-QR-A70H-10A-01W", - "TCGA-QR-A70H-01A-12D", - "TCGA-QR-A70H-01A-12W", - "TCGA-QR-A70H-01A-12R" + "TCGA-CN-6013-01A-11R", + "TCGA-CN-6013-01A-11W", + "TCGA-CN-6013-10A-01W", + "TCGA-CN-6013-10A-01D", + "TCGA-CN-6013-01A-11D" + ], + "submitter_aliquot_ids": [ + "TCGA-CN-6013-10A-01D-1683-08", + "TCGA-CN-6013-01A-11W-1767-08", + "TCGA-CN-6013-01A-11D-1681-02", + "TCGA-CN-6013-01A-11D-1683-08", + "TCGA-CN-6013-01A-11D-1684-05", + "TCGA-CN-6013-10A-01D-1681-02", + "TCGA-CN-6013-01A-11R-1685-13", + "TCGA-CN-6013-10A-01D-1682-01", + "TCGA-CN-6013-01A-11R-1686-07", + "TCGA-CN-6013-01A-11D-1682-01", + "TCGA-CN-6013-10A-01W-1767-08" ], - "analyte_ids": [ - "c4a41555-dd45-4e10-a3be-50d49a1121a3", - "957e01f6-eb3f-446e-9f45-b50c66337e2d", - "1acde950-2e0c-4586-852b-b4ac4e1ea4a4", - "67c033c0-9fe8-4004-967e-c605e1890f4d", - "b0873010-5d60-4691-b700-e172950f1d7c" - ], - "submitter_id": "TCGA-QR-A70H", - "case_id": "13b41b15-a785-4ab7-b864-ffff6d35dd45", - "state": null, "aliquot_ids": [ - "d9120f00-7f10-49d5-ae84-6177e9424c7c", - "31c6fa50-200a-46c1-a546-61b52592fd8f", - "ab50f38c-2e7d-4d75-a216-27aeaa4d9305", - "382d5e31-6c66-4df3-a695-6b8c29cfc681", - "51d1fb14-c918-4439-b816-ef6cd3253c64", - "f586d8d5-d0c6-4979-aaa7-10217a88fa4c", - "2f9a60eb-602e-44bb-bc57-87e20d946f76", - "fbafc85e-deff-46cd-a40f-479b9dc92a60", - "cacbc8a6-0eb0-4277-931f-d0075c9b1de9" + "e4a0cd96-81aa-41c8-b32c-1b6c57a08679", + "992de9b5-c394-48e7-b4e3-4c4aeacb4a23", + "0f8d7f78-2640-444d-a068-a348f50cde8f", + "2de7dbc6-2a3e-4e67-bb96-552e27137618", + "e996c35e-3614-4258-b99e-0eb444398931", + "78c2da9e-3837-4098-b662-3c10d8b5d14e", + "157aba21-4c81-45b9-914c-925813c537f6", + "762ca7da-6d6e-4709-b1aa-13621d962d35", + "1906f4b3-d226-4570-bbce-f2cbbb1d5ec2", + "9496355f-6aa6-4f6a-ac06-b17966b4208c", + "ffff755e-285f-466c-85c6-b372f4a7ae14" ], - "slide_ids": [ - "2310e34c-0ea5-4876-9f87-bad0b7a44513" + "created_datetime": null, + "diagnosis_ids": [ + 
"7747a88f-1a9b-5833-9442-af0b1350bb4f" ], - "submitter_sample_ids": [ - "TCGA-QR-A70H-01A", - "TCGA-QR-A70H-10A" - ] - }, - { "sample_ids": [ - "19dee039-9c98-4d4a-8baf-eea1b6dda8eb", - "fdf1e501-f34f-450c-9a5c-611157079a86" + "c0c84016-117e-498c-9ed9-52e279e89e33", + "d018cd44-6b01-44f2-9181-81153225aab8", + "8b510bc8-7b3c-4861-9a40-c2c30b7099e0" + ], + "submitter_sample_ids": [ + "TCGA-CN-6013-10A", + "TCGA-CN-6013-01A", + "TCGA-CN-6013-01Z" + ], + "submitter_diagnosis_ids": [ + "TCGA-CN-6013_diagnosis" ], + "primary_site": "Gum", + "updated_datetime": "2019-08-06T14:25:25.511101-05:00", + "case_id": "714496c5-d221-4397-9c5a-cd2d22603e6f", "portion_ids": [ - "10b6ccb4-3637-4769-8988-417c0306eaef", - "92f8cd48-451d-4ed6-8e60-b15aa93d2c09", - "d0d55efa-c91d-45de-92bf-cf6f0d263b21" + "4add0374-495b-4d4c-b96b-8494600fea26", + "436822b9-08d5-493e-b213-b9b5dfc161e9", + "acdd7eae-c26c-46c9-9dca-036ed3317718" ], + "state": "released", "submitter_portion_ids": [ - "TCGA-BJ-A18Z-01A-21", - "TCGA-BJ-A18Z-01A-11-A21L-20", - "TCGA-BJ-A18Z-10A-01" - ], - "created_datetime": null, + "TCGA-CN-6013-10A-01", + "TCGA-CN-6013-01A-11", + "TCGA-CN-6013-01A-13-2072-20" + ] + }, + { + "id": "74f89bda-bc2a-4cd3-9aa3-f89aa7282dd5", + "lost_to_followup": null, + "days_to_lost_to_followup": null, + "disease_type": "Myeloid Leukemias", + "submitter_id": "TARGET-20-PAUXKI", "submitter_aliquot_ids": [ - "TCGA-BJ-A18Z-01A-21D-A13U-02", - "TCGA-BJ-A18Z-10A-01D-A13V-01", - "TCGA-BJ-A18Z-01A-21R-A13Y-07", - "TCGA-BJ-A18Z-01A-21W-A14T-08", - "TCGA-BJ-A18Z-01A-21D-A13Z-05", - "TCGA-BJ-A18Z-01A-21D-A37T-08", - "TCGA-BJ-A18Z-10A-01D-A13W-08", - "TCGA-BJ-A18Z-01A-21R-A13X-13", - "TCGA-BJ-A18Z-01A-21D-A13W-08", - "TCGA-BJ-A18Z-10A-01D-A13U-02", - "TCGA-BJ-A18Z-10A-01W-A14T-08", - "TCGA-BJ-A18Z-01A-21D-A13V-01" - ], - "updated_datetime": "2016-05-02T16:18:19.199189-05:00", - "submitter_analyte_ids": [ - "TCGA-BJ-A18Z-01A-21W", - "TCGA-BJ-A18Z-01A-21D", - "TCGA-BJ-A18Z-01A-21R", - "TCGA-BJ-A18Z-10A-01D", 
- "TCGA-BJ-A18Z-10A-01W" + "TARGET-20-PAUXKI-09A-01R" ], - "analyte_ids": [ - "119ebfa1-75b2-4f24-816a-4e9a5061f6b5", - "f86759fd-ecc5-4f42-b5fe-b9f079d23968", - "39691042-bd28-40ed-b66b-26414ecf1ba0", - "76ea5056-d7fa-49fb-94bf-11171ca7c100", - "71a822c9-b510-4a4c-8c30-18b8083acc2d" - ], - "submitter_id": "TCGA-BJ-A18Z", - "case_id": "0d497faf-2c1c-4173-a5fe-770cca73323c", - "state": null, "aliquot_ids": [ - "fa580596-e70f-4ed0-85a2-6fb594ca679a", - "776cb4b1-8efd-4ea2-b53f-9dff7dd94b10", - "85a7922f-0327-437c-bdf5-1bb67a1e932f", - "6d532180-0175-4610-8bfa-cca3a7c3697a", - "b5977e73-49d8-4e99-9e97-993cc44dad17", - "918793fa-b35e-4745-ac75-4d1c868089f8", - "ba9479a1-929f-4e4e-8bf5-e23cb280dfcf", - "e9776ff5-69b9-4669-ab33-e4bb030461ec", - "8ba98907-ab03-4c9e-a900-e31aa16ff810", - "35e18649-183e-4223-b2f6-d812bdd9becd", - "4aa17671-4420-4989-a6dd-379250f4aeda", - "815c53c3-8add-4612-b93c-3ed4bfa530aa" + "192d5f48-ba11-44cf-91d5-5eec5daea281" ], - "slide_ids": [ - "7c5b5c77-9fbc-4b48-81f5-48b5ede7c436" + "sample_ids": [ + "22b9ad16-7791-4c41-a1c5-0392b4d4721c" + ], + "created_datetime": "2019-02-25T10:13:06.478422-06:00", + "diagnosis_ids": [ + "2f7f2b8c-35fa-412f-894a-6c4b09cf8077" ], "submitter_sample_ids": [ - "TCGA-BJ-A18Z-01A", - "TCGA-BJ-A18Z-10A" - ] + "TARGET-20-PAUXKI-09A" + ], + "primary_site": "Hematopoietic and reticuloendothelial systems", + "submitter_diagnosis_ids": [ + "TARGET-20-PAUXKI_diagnosis" + ], + "updated_datetime": "2019-10-24T08:22:10.208559-05:00", + "case_id": "74f89bda-bc2a-4cd3-9aa3-f89aa7282dd5", + "index_date": null, + "state": "released" } ], "pagination": { "count": 10, - "sort": "", + "total": 39092, + "size": 10, "from": 0, + "sort": "", "page": 1, - "total": 6340, - "pages": 634, - "size": 10 + "pages": 3910 } }, "warnings": {} @@ -1773,12 +2029,13 @@ response = requests.get(cases_endpt, params = params) print response.content ``` ```response1 -submitter_id -TCGA-RC-A6M6 -TCGA-B6-A0RV -TCGA-MB-A5Y8 -TCGA-BQ-5876 -TCGA-Z6-A9VB +id 
submitter_id +375436b3-66ac-4d5e-b495-18a96d812a69 TCGA-F5-6810 +74543fa4-ce73-46e4-9c59-224e8242b4a2 TCGA-AG-A01W +f8970455-bfb2-4b1d-ab71-3c5d619898ad TCGA-ZN-A9VQ +c739fd61-22b2-412d-bcf3-89bda45a2c0f TCGA-3H-AB3X +340fef21-55d8-433f-b00a-51276b849356 TCGA-MQ-A4LI + ``` ```shell2 curl 'https://api.gdc.cancer.gov/cases?fields=submitter_id&size=5&format=XML&pretty=true' @@ -1799,29 +2056,34 @@ print response.content - TCGA-MQ-A4LV + 375436b3-66ac-4d5e-b495-18a96d812a69 + TCGA-F5-6810 - TCGA-N9-A4Q1 + 74543fa4-ce73-46e4-9c59-224e8242b4a2 + TCGA-AG-A01W - TCGA-78-7154 + f8970455-bfb2-4b1d-ab71-3c5d619898ad + TCGA-ZN-A9VQ - TCGA-S7-A7WX + c739fd61-22b2-412d-bcf3-89bda45a2c0f + TCGA-3H-AB3X - TCGA-XF-AAML + 340fef21-55d8-433f-b00a-51276b849356 + TCGA-MQ-A4LI 5 - + 84392 + 5 0 - 2811 - 14052 + 1 - 5 + 16879 @@ -1838,7 +2100,7 @@ Returns when the `pretty` parameter is set to `true`, the API response is format curl 'https://api.gdc.cancer.gov/cases?fields=submitter_id&sort=submitter_id:asc&size=5' ``` ```Response1 -{"data": {"hits": [{"id": "f7af65fc-97e3-52ce-aa2c-b707650e747b", "submitter_id": "TARGET-00-NAAEMA"}, {"id": "513d0a2a-3c94-5a36-97a4-24c3656fc66e", "submitter_id": "TARGET-00-NAAEMB"}, {"id": "b5f20676-727b-50b0-9b5a-582cd8572d6d", "submitter_id": "TARGET-00-NAAEMC"}, {"id": "0c0b183f-0d4a-5a9d-9888-0617cebcc462", "submitter_id": "TARGET-20-PABGKN"}, {"id": "0f5ed7a7-226d-57bc-a4ce-8a6b18560c55", "submitter_id": "TARGET-20-PABHET"}], "pagination": {"count": 5, "sort": "submitter_id:asc", "from": 0, "page": 1, "total": 14551, "pages": 2911, "size": 5}}, "warnings": {}} +{"data": {"hits": [{"id": "be37f1f7-2f98-4f74-bc04-6dd2ae2afcad", "submitter_id": "01BR001"}, {"id": "e6915db0-7c89-484d-8f9f-15cca68b82fc", "submitter_id": "01BR008"}, {"id": "16614d46-172b-479c-992b-e80a8e9a2c59", "submitter_id": "01BR009"}, {"id": "567fc9e3-17a6-42b1-a896-5e9a9507d1d8", "submitter_id": "01BR010"}, {"id": "54e89878-a1bc-4f5a-9d68-4842a469586e", "submitter_id": "01BR015"}], 
"pagination": {"count": 5, "total": 84392, "size": 5, "from": 0, "sort": "submitter_id:asc", "page": 1, "pages": 16879}}, "warnings": {}} ``` ```Request2 curl 'https://api.gdc.cancer.gov/cases?fields=submitter_id&sort=submitter_id:asc&size=5&pretty=true' @@ -1848,34 +2110,34 @@ curl 'https://api.gdc.cancer.gov/cases?fields=submitter_id&sort=submitter_id:as "data": { "hits": [ { - "id": "f7af65fc-97e3-52ce-aa2c-b707650e747b", - "submitter_id": "TARGET-00-NAAEMA" + "id": "be37f1f7-2f98-4f74-bc04-6dd2ae2afcad", + "submitter_id": "01BR001" }, { - "id": "513d0a2a-3c94-5a36-97a4-24c3656fc66e", - "submitter_id": "TARGET-00-NAAEMB" + "id": "e6915db0-7c89-484d-8f9f-15cca68b82fc", + "submitter_id": "01BR008" }, { - "id": "b5f20676-727b-50b0-9b5a-582cd8572d6d", - "submitter_id": "TARGET-00-NAAEMC" + "id": "16614d46-172b-479c-992b-e80a8e9a2c59", + "submitter_id": "01BR009" }, { - "id": "0c0b183f-0d4a-5a9d-9888-0617cebcc462", - "submitter_id": "TARGET-20-PABGKN" + "id": "567fc9e3-17a6-42b1-a896-5e9a9507d1d8", + "submitter_id": "01BR010" }, { - "id": "0f5ed7a7-226d-57bc-a4ce-8a6b18560c55", - "submitter_id": "TARGET-20-PABHET" + "id": "54e89878-a1bc-4f5a-9d68-4842a469586e", + "submitter_id": "01BR015" } ], "pagination": { "count": 5, - "sort": "submitter_id:asc", + "total": 84392, + "size": 5, "from": 0, + "sort": "submitter_id:asc", "page": 1, - "total": 14551, - "pages": 2911, - "size": 5 + "pages": 16879 } }, "warnings": {} @@ -1907,114 +2169,124 @@ print json.dumps(response.json(), indent=2) "data": { "hits": [ { - "file_name": "NARKY_p_TCGAb69_SNP_N_GenomeWideSNP_6_H03_697832.grch38.seg.txt", + "id": "c2cefea6-74d4-4859-8fe2-822767d6f68d", "cases": [ { - "submitter_id": "TCGA-BP-4989" + "submitter_id": "HCM-BROD-0003-C71" } ], - "file_id": "3bd4d5dc-563a-481c-87a6-ec0017d0d58a", - "file_size": 54200 + "file_name": "30f53128-5def-4d1c-b203-9717e9cf4401_wxs_gdc_realn.bam", + "file_id": "c2cefea6-74d4-4859-8fe2-822767d6f68d", + "file_size": 35753708766 }, { - "file_name": 
"652ecf99-1af9-41fc-b0a5-d3e5c07a7b5d.FPKM.txt.gz", + "id": "070d2103-4350-477b-8bd7-ee529d9d24fb", "cases": [ { - "submitter_id": "TCGA-60-2709" + "submitter_id": "HCM-CSHL-0142-C18" } ], - "file_id": "b3286166-01f9-4149-81b5-a2ea5f27c50e", - "file_size": 530665 + "file_name": "ca5a3304-2af0-4a5f-9479-881918520921.wxs.varscan2.raw_somatic_mutation.vcf.gz", + "file_id": "070d2103-4350-477b-8bd7-ee529d9d24fb", + "file_size": 53177 }, { - "file_name": "CUSKS_p_TCGAb47_SNP_1N_GenomeWideSNP_6_D05_628212.nocnv_grch38.seg.txt", + "id": "9133f158-4bea-4036-b94a-60c25385ed36", "cases": [ { - "submitter_id": "TCGA-A8-A07Z" + "submitter_id": "HCM-BROD-0028-C71" } ], - "file_id": "282cc9d1-c5e9-49ff-b27b-e00c1e5529c6", - "file_size": 15806 + "file_name": "03203e90-6180-4994-bfc8-1521669a6a49.wxs.MuSE.somatic_annotation.vcf.gz", + "file_id": "9133f158-4bea-4036-b94a-60c25385ed36", + "file_size": 158597 }, { - "file_name": "REEDY_p_TCGAb65_SNP_N_GenomeWideSNP_6_F01_697686.nocnv_grch38.seg.txt", + "id": "22a04866-c605-4b2d-a48e-816058028c6f", "cases": [ { - "submitter_id": "TCGA-CJ-4871" + "submitter_id": "HCM-BROD-0002-C71" } ], - "file_id": "fe44a644-eefc-42c5-aac7-a216bc1e88e1", - "file_size": 6179 + "file_name": "07d8e937-79e3-4fac-83c1-2e67e7a6ae14.wxs.MuSE.aliquot.maf.gz", + "file_id": "22a04866-c605-4b2d-a48e-816058028c6f", + "file_size": 125050 }, { - "file_name": "84df7a8fee9fedb5e8e22849ec66d294_gdc_realn.bam", + "id": "2843fe16-b371-44a9-b9ab-1a93c26d24db", "cases": [ { - "submitter_id": "TCGA-A2-A0CO" + "submitter_id": "HCM-CSHL-0366-C50" } ], - "file_id": "acd0ec73-c1fe-463e-912c-84e8416510e5", - "file_size": 15545555724 + "file_name": "43f4ba37-92ed-4d30-86f5-e1eeb0109d9a.wgs.sanger_raw_pindel.raw_somatic_mutation.vcf.gz", + "file_id": "2843fe16-b371-44a9-b9ab-1a93c26d24db", + "file_size": 62812030 }, { - "file_name": "ed8c4bb6-891a-4cf2-80ba-42c5594760d0.vcf", + "id": "aea93c80-0551-459e-8408-6d16148a7210", "cases": [ { - "submitter_id": "TCGA-BQ-7059" + 
"submitter_id": "HCM-CSHL-0461-D12" } ], - "file_id": "ed8c4bb6-891a-4cf2-80ba-42c5594760d0", - "file_size": 264694 + "file_name": "c45b5b42-acfb-457c-9244-2e70368c29c5.FPKM-UQ.txt.gz", + "file_id": "aea93c80-0551-459e-8408-6d16148a7210", + "file_size": 406727 }, { - "file_name": "nationwidechildrens.org_clinical.TCGA-IG-A6QS.xml", + "id": "d46795b7-2166-44af-97a9-c825585878d3", "cases": [ { - "submitter_id": "TCGA-IG-A6QS" + "submitter_id": "HCM-CSHL-0248-C19" } ], - "file_id": "fe8cf009-f033-4536-95c7-836adcba5bf3", - "file_size": 36996 + "file_name": "e837ac90-f482-454f-a1e0-f16cea1d9f95.wgs.CaVEMan.raw_somatic_mutation.vcf.gz", + "file_id": "d46795b7-2166-44af-97a9-c825585878d3", + "file_size": 2658954 }, { - "file_name": "05f6f9f7-6fb7-4c95-b79c-fdfaba16539d.vep.reheader.vcf.gz", + "id": "7ba38c35-6491-48f7-811f-336b8487021f", "cases": [ { - "submitter_id": "TCGA-DK-A3IV" + "submitter_id": "HCM-CSHL-0057-C18" } ], - "file_id": "05f6f9f7-6fb7-4c95-b79c-fdfaba16539d", - "file_size": 415044 + "file_name": "9720b990-3ff4-4cb9-a900-ad57da72cff4.FPKM.txt.gz", + "file_id": "7ba38c35-6491-48f7-811f-336b8487021f", + "file_size": 358324 }, { - "file_name": "C484.TCGA-12-5301-01A-01D-1486-08.7_gdc_realn.bam", + "id": "3565c301-a03c-4334-9e06-3bb01f92c3f0", "cases": [ { - "submitter_id": "TCGA-12-5301" + "submitter_id": "HCM-BROD-0002-C71" } ], - "file_id": "3b0293c2-4a26-428c-b097-9489f23a2a2d", - "file_size": 23661175335 + "file_name": "52f32e5d-d56d-4e54-b962-9a0ed377afd3.wgs.BRASS.raw_structural_variation.bedpe.gz", + "file_id": "3565c301-a03c-4334-9e06-3bb01f92c3f0", + "file_size": 9715 }, { - "file_name": "75a36e71-400d-46a5-93b0-7813cf0595ea.FPKM.txt.gz", + "id": "2fe7b061-48d7-45af-b435-84919ce68e47", "cases": [ { - "submitter_id": "TCGA-BF-A5EO" + "submitter_id": "HCM-BROD-0012-C71" } ], - "file_id": "28f763c7-8064-4151-ae0e-31e70cd9bfe8", - "file_size": 488422 + "file_name": "53984f05-821c-492a-8a0b-6e2c0b340e92.rna_seq.star_splice_junctions.tsv.gz", + 
"file_id": "2fe7b061-48d7-45af-b435-84919ce68e47", + "file_size": 2865433 } ], "pagination": { "count": 10, - "sort": "", + "total": 596758, + "size": 10, "from": 0, + "sort": "", "page": 1, - "total": 216435, - "pages": 21644, - "size": 10 + "pages": 59676 } }, "warnings": {} @@ -2030,62 +2302,63 @@ The `expand` parameter provides a shortcut to request multiple related fields (f ```Shell curl 'https://api.gdc.cancer.gov/files/ac2ddebd-5e5e-4aea-a430-5a87c6d9c878?expand=cases.samples&pretty=true' ``` -``` +```Response { "data": { - "data_type": "Aligned Reads", - "updated_datetime": "2016-09-18T04:25:13.163601-05:00", - "created_datetime": "2016-05-26T18:55:53.506549-05:00", - "file_name": "000aa811c15656604161e8f0e3a0aae4_gdc_realn.bam", - "md5sum": "200475f5f6e42520204e5f6aadfe954f", "data_format": "BAM", - "acl": [ - "phs000178" - ], - "access": "controlled", - "platform": "Illumina", - "state": "submitted", - "file_id": "ac2ddebd-5e5e-4aea-a430-5a87c6d9c878", - "data_category": "Raw Sequencing Data", - "file_size": 12667634731, "cases": [ { "samples": [ { "sample_type_id": "11", - "updated_datetime": "2016-09-08T11:00:45.021005-05:00", - "time_between_excision_and_freezing": null, - "oct_embedded": "false", - "tumor_code_id": null, - "submitter_id": "TCGA-QQ-A5VA-11A", - "intermediate_dimension": null, - "sample_id": "b4e7558d-898e-4d68-a897-381edde0bbcc", - "is_ffpe": false, - "pathology_report_uuid": null, - "created_datetime": null, "tumor_descriptor": null, + "sample_id": "b4e7558d-898e-4d68-a897-381edde0bbcc", "sample_type": "Solid Tissue Normal", - "state": null, - "current_weight": null, + "created_datetime": null, + "tumor_code": null, + "time_between_excision_and_freezing": null, "composition": null, + "updated_datetime": "2018-11-15T21:38:54.195821-06:00", + "days_to_collection": 5980, + "state": "released", + "initial_weight": 810.0, + "preservation_method": null, + "intermediate_dimension": null, "time_between_clamping_and_freezing": null, + 
"freezing_method": null, + "pathology_report_uuid": null, + "submitter_id": "TCGA-QQ-A5VA-11A", + "tumor_code_id": null, "shortest_dimension": null, - "tumor_code": null, - "tissue_type": null, + "oct_embedded": "false", "days_to_sample_procurement": null, - "freezing_method": null, - "preservation_method": null, - "days_to_collection": 5980, - "initial_weight": 810.0, - "longest_dimension": null + "longest_dimension": null, + "current_weight": null, + "is_ffpe": false, + "tissue_type": "Not Reported" } ] } ], + "access": "controlled", + "file_name": "000aa811c15656604161e8f0e3a0aae4_gdc_realn.bam", "submitter_id": "32872121-d38a-4128-b96a-698a6f18f29d", + "data_category": "Sequencing Reads", + "acl": [ + "phs000178" + ], "type": "aligned_reads", - "file_state": "processed", - "experimental_strategy": "WXS" + "platform": "Illumina", + "created_datetime": "2016-05-26T18:55:53.506549-05:00", + "file_size": 12667634731, + "md5sum": "200475f5f6e42520204e5f6aadfe954f", + "updated_datetime": "2018-11-15T21:38:44.655215-06:00", + "file_id": "ac2ddebd-5e5e-4aea-a430-5a87c6d9c878", + "data_type": "Aligned Reads", + "state": "released", + "experimental_strategy": "WXS", + "version": "1", + "data_release": "12.0 - 27.0" }, "warnings": {} } @@ -2122,20 +2395,22 @@ print json.dumps(response.json(), indent=2) "data": { "hits": [ { - "file_name": "unc.edu.276a1e00-cf3a-4463-a97b-d544381219ea.2363081.rsem.isoforms.normalized_results" + "id": "c2cefea6-74d4-4859-8fe2-822767d6f68d", + "file_name": "30f53128-5def-4d1c-b203-9717e9cf4401_wxs_gdc_realn.bam" }, { - "file_name": "nationwidechildrens.org_clinical.TCGA-EY-A5W2.xml" + "id": "070d2103-4350-477b-8bd7-ee529d9d24fb", + "file_name": "ca5a3304-2af0-4a5f-9479-881918520921.wxs.varscan2.raw_somatic_mutation.vcf.gz" } ], "pagination": { "count": 2, - "sort": "", + "total": 596758, + "size": 2, "from": 0, - "pages": 300936, - "total": 601872, + "sort": "", "page": 1, - "size": 2 + "pages": 298379 } }, "warnings": {} @@ -2159,34 
+2434,34 @@ print json.dumps(response.json(), indent=2) "data": { "hits": [ { - "file_name": "OCULI_p_TCGA_159_160_SNP_N_GenomeWideSNP_6_E09_831242.grch38.seg.txt", - "id": "1d959137-d8e6-4336-b357-8ab9c88eeca8" + "id": "79c5e6ab-7d33-48fb-8ad3-1a353f1aa8a0", + "file_name": "bded93b7-da8b-467a-b301-bc0533780b7b.wxs.VarScan2.aliquot.maf.gz" }, { - "file_name": "jhu-usc.edu_SKCM.HumanMethylation450.3.lvl-3.TCGA-EE-A3JI-06A-11D-A21B-05.gdc_hg38.txt", - "id": "9c02ec95-4aa3-4112-8823-c0fa87f71773" + "id": "b8b730aa-6f8f-4c7c-ad64-d69f49df56a3", + "file_name": "5b930358-2132-4c5d-874c-9b94656dcf3b_gdc_realn.bam" }, { - "file_name": "jhu-usc.edu_LAML.HumanMethylation450.2.lvl-3.TCGA-AB-3002-03A-01D-0742-05.gdc_hg38.txt", - "id": "731c3560-bcef-4ebf-bfbc-7320399a5bcb" + "id": "bd912d6c-7325-4e90-ab1d-c3a4880f1e84", + "file_name": "8a737363-742f-40df-ab4e-9a0bdd3adeed.wxs.varscan2.raw_somatic_mutation.vcf.gz" }, { - "file_name": "CUSKS_p_TCGAb47_SNP_1N_GenomeWideSNP_6_B03_628222.grch38.seg.txt", - "id": "a6f73a3e-faf8-49d9-9b68-77781bd302df" + "id": "a11d6196-7f01-4c62-8808-0a627250c59c", + "file_name": "4a030c3f-79e0-4737-92ad-59ba5af89977.wxs.aliquot_ensemble_raw.maf.gz" }, { - "file_name": "5496e9f1-a383-4874-95bb-f4d1b33f4594.vcf", - "id": "5496e9f1-a383-4874-95bb-f4d1b33f4594" + "id": "6fc778a7-6c7d-4aba-b1e4-36c2bb752216", + "file_name": "2d9f75a7-95fd-418f-9c48-ae981b6853f1.star_fusion.rna_fusion.bedpe" } ], "pagination": { "count": 5, - "sort": "", + "total": 596758, + "size": 5, "from": 101, + "sort": "", "page": 21, - "total": 274724, - "pages": 54945, - "size": 5 + "pages": 119352 } }, "warnings": {} @@ -2220,54 +2495,54 @@ print json.dumps(response.json(), indent=2) "data": { "hits": [ { - "id": "f7af65fc-97e3-52ce-aa2c-b707650e747b", - "submitter_id": "TARGET-00-NAAEMA" + "id": "be37f1f7-2f98-4f74-bc04-6dd2ae2afcad", + "submitter_id": "01BR001" }, { - "id": "513d0a2a-3c94-5a36-97a4-24c3656fc66e", - "submitter_id": "TARGET-00-NAAEMB" + "id": 
"e6915db0-7c89-484d-8f9f-15cca68b82fc", + "submitter_id": "01BR008" }, { - "id": "b5f20676-727b-50b0-9b5a-582cd8572d6d", - "submitter_id": "TARGET-00-NAAEMC" + "id": "16614d46-172b-479c-992b-e80a8e9a2c59", + "submitter_id": "01BR009" }, { - "id": "0c0b183f-0d4a-5a9d-9888-0617cebcc462", - "submitter_id": "TARGET-20-PABGKN" + "id": "567fc9e3-17a6-42b1-a896-5e9a9507d1d8", + "submitter_id": "01BR010" }, { - "id": "0f5ed7a7-226d-57bc-a4ce-8a6b18560c55", - "submitter_id": "TARGET-20-PABHET" + "id": "54e89878-a1bc-4f5a-9d68-4842a469586e", + "submitter_id": "01BR015" }, { - "id": "b2a560a4-5e52-5d78-90ef-d680fbaf44d0", - "submitter_id": "TARGET-20-PABHKY" + "id": "a1c7b7b9-b8c8-48c3-9420-55497f9318fd", + "submitter_id": "01BR017" }, { - "id": "1e5c8323-383d-51a0-9199-1b9504b29c7e", - "submitter_id": "TARGET-20-PABLDZ" + "id": "ce3c8b98-e275-4cfd-a379-940d675a564b", + "submitter_id": "01BR018" }, { - "id": "c550a267-30bd-5bf3-9699-61341559e0d5", - "submitter_id": "TARGET-20-PACDZR" + "id": "e4ce89ef-bcaa-418a-8a6b-3602793b9bbf", + "submitter_id": "01BR020" }, { - "id": "0fe29a81-74fc-5158-ae13-0437bc272805", - "submitter_id": "TARGET-20-PACEGD" + "id": "19d3c861-8a5f-49a2-acc0-b55b25465c35", + "submitter_id": "01BR023" }, { - "id": "dd2b23ec-46f4-56b2-9429-6015c6dc730f", - "submitter_id": "TARGET-20-PADDXZ" + "id": "afae8dce-294a-4108-bb28-376f804ae5c4", + "submitter_id": "01BR025" } ], "pagination": { "count": 10, - "sort": "submitter_id:asc", + "total": 84392, + "size": 10, "from": 0, + "sort": "submitter_id:asc", "page": 1, - "total": 14551, - "pages": 1456, - "size": 10 + "pages": 8440 } }, "warnings": {} @@ -2303,29 +2578,85 @@ print json.dumps(response.json(), indent=2) ```Response { "data": { - "pagination": { - "count": 0, - "sort": "program.name:asc", - "from": 0, - "page": 1, - "total": 39, - "pages": 39, - "size": 0 - }, "hits": [], "aggregations": { "program.name": { "buckets": [ { - "key": "TCGA", - "doc_count": 33 + "doc_count": 33, + "key": "TCGA" + }, + { + 
"doc_count": 9, + "key": "TARGET" + }, + { + "doc_count": 8, + "key": "GENIE" + }, + { + "doc_count": 2, + "key": "BEATAML1.0" + }, + { + "doc_count": 2, + "key": "CGCI" + }, + { + "doc_count": 2, + "key": "CMI" + }, + { + "doc_count": 2, + "key": "CPTAC" + }, + { + "doc_count": 1, + "key": "CTSP" + }, + { + "doc_count": 1, + "key": "FM" + }, + { + "doc_count": 1, + "key": "HCMI" + }, + { + "doc_count": 1, + "key": "MMRF" + }, + { + "doc_count": 1, + "key": "NCICCR" + }, + { + "doc_count": 1, + "key": "OHSU" + }, + { + "doc_count": 1, + "key": "ORGANOID" }, { - "key": "TARGET", - "doc_count": 6 + "doc_count": 1, + "key": "VAREPOP" + }, + { + "doc_count": 1, + "key": "WCDT" } ] } + }, + "pagination": { + "count": 0, + "total": 67, + "size": 0, + "from": 0, + "sort": "program.name:asc", + "page": 1, + "pages": 67 } }, "warnings": {} @@ -2528,69 +2859,7 @@ The GDC Portal has a quicksearch functionality that allows for a project, case, curl "https://api.gdc.cancer.gov/v0/all?query=TCGA&size=5" ``` ```Response -{ - "data": { - "query": { - "hits": [ - { - "disease_type": [ - "Esophageal Carcinoma" - ], - "id": "UHJvamVjdDpUQ0dBLUVTQ0E=", - "name": "Esophageal Carcinoma", - "primary_site": [ - "Esophagus" - ], - "project_id": "TCGA-ESCA" - }, - { - "disease_type": [ - "Head and Neck Squamous Cell Carcinoma" - ], - "id": "UHJvamVjdDpUQ0dBLUhOU0M=", - "name": "Head and Neck Squamous Cell Carcinoma", - "primary_site": [ - "Head and Neck" - ], - "project_id": "TCGA-HNSC" - }, - { - "disease_type": [ - "Liver Hepatocellular Carcinoma" - ], - "id": "UHJvamVjdDpUQ0dBLUxJSEM=", - "name": "Liver Hepatocellular Carcinoma", - "primary_site": [ - "Liver" - ], - "project_id": "TCGA-LIHC" - }, - { - "disease_type": [ - "Colon Adenocarcinoma" - ], - "id": "UHJvamVjdDpUQ0dBLUNPQUQ=", - "name": "Colon Adenocarcinoma", - "primary_site": [ - "Colorectal" - ], - "project_id": "TCGA-COAD" - }, - { - "disease_type": [ - "Adrenocortical Carcinoma" - ], - "id": "UHJvamVjdDpUQ0dBLUFDQw==", - 
"name": "Adrenocortical Carcinoma", - "primary_site": [ - "Adrenal Gland" - ], - "project_id": "TCGA-ACC" - } - ] - } - } -} +{"data":{"query":{"hits":[{"disease_type":["Gliomas"],"id":"UHJvamVjdDpUQ0dBLUxHRw==","name":"Brain Lower Grade Glioma","primary_site":["Brain"],"project_id":"TCGA-LGG","project_quicksearch":"Brain Lower Grade Glioma"},{"disease_type":["Myeloid Leukemias"],"id":"UHJvamVjdDpUQ0dBLUxBTUw=","name":"Acute Myeloid Leukemia","primary_site":["Hematopoietic and reticuloendothelial systems"],"project_id":"TCGA-LAML","project_quicksearch":"Acute Myeloid Leukemia"},{"disease_type":["Adenomas and Adenocarcinomas"],"id":"UHJvamVjdDpUQ0dBLUtJUkM=","name":"Kidney Renal Clear Cell Carcinoma","primary_site":["Kidney"],"project_id":"TCGA-KIRC","project_quicksearch":"Kidney Renal Clear Cell Carcinoma"},{"disease_type":["Complex Mixed and Stromal Neoplasms"],"id":"UHJvamVjdDpUQ0dBLVVDUw==","name":"Uterine Carcinosarcoma","primary_site":["Uterus, NOS"],"project_id":"TCGA-UCS","project_quicksearch":"Uterine Carcinosarcoma"},{"disease_type":["Germ Cell Neoplasms"],"id":"UHJvamVjdDpUQ0dBLVRHQ1Q=","name":"Testicular Germ Cell Tumors","primary_site":["Testis"],"project_id":"TCGA-TGCT","project_quicksearch":"Testicular Germ Cell Tumors"}],"total":195221}}} ``` This endpoint can be used to quickly retrieve information about a file. For example, if a user wanted to know the UUID for `nationwidechildrens.org_biospecimen.TCGA-EL-A4K1.xml`, the following query could be used to quickly retrieve it programmatically: @@ -2599,20 +2868,7 @@ This endpoint can be used to quickly retrieve information about a file. 
For exa curl "https://api.gdc.cancer.gov/v0/all?query=nationwidechildrens.org_biospecimen.TCGA-EL-A4K1.xml&size=5" ``` ```Response -{ - "data": { - "query": { - "hits": [ - { - "file_id": "2a7a354b-e497-4ae6-8a85-a170951596c1", - "file_name": "nationwidechildrens.org_biospecimen.TCGA-EL-A4K1.xml", - "id": "RmlsZToyYTdhMzU0Yi1lNDk3LTRhZTYtOGE4NS1hMTcwOTUxNTk2YzE=", - "submitter_id": null - } - ] - } - } -} +{"data":{"query":{"hits":[{"file_id":"a74abfec-db78-4ed4-9e4b-604b66e30e30","file_name":"nationwidechildrens.org_biospecimen.TCGA-EL-A4K1.xml","id":"RmlsZTphNzRhYmZlYy1kYjc4LTRlZDQtOWU0Yi02MDRiNjZlMzBlMzA=","submitter_id":"nationwidechildrens.org_biospecimen.TCGA-EL-A4K1.xml"}],"total":1}}} ``` ## Additional Examples diff --git a/docs/API/Users_Guide/Submission.md b/docs/API/Users_Guide/Submission.md index d47acf23c..d050b3036 100644 --- a/docs/API/Users_Guide/Submission.md +++ b/docs/API/Users_Guide/Submission.md @@ -48,7 +48,7 @@ Metadata files must be uploaded in raw, unencoded form. Binary mode should be us #### BCR XML -While JSON and TSV are the recommended formats for submitting metadata, the GDC API can also extract metadata elements from BCR XML files. Users wishing to submit metadata as BCR XML must contact GDC User Services and ensure that appropriate element mapping is in place before initiating XML submission. +While JSON and TSV are the recommended formats for submitting metadata, the GDC API can also extract metadata elements from BCR XML files. Users wishing to submit metadata as BCR XML must contact GDC User Services and ensure that appropriate element mapping is in place before initiating XML submission. Current mapping can be found in [GitHub](https://github.com/NCI-GDC/gdcdatamodel/tree/develop/gdcdatamodel/xml_mappings). 
To submit BCR XML, make `PUT` requests with the `Content-Type: application/xml` header to the following URLs, replacing Program.name and Project.code as desribed in [Submission Endpoint](#submission_endpoint) (above): @@ -63,7 +63,7 @@ The following is a sample shell command for submitting an XML file: curl --request PUT --header "X-Auth-Token: $token" --header 'Content-Type: application/xml' --data-binary @biospecimen.xml 'https://api.gdc.cancer.gov/v0/submission/GDC/INTERNAL/xml/biospecimen/bcr/_dry_run' -**NOTE:** A typical BCR XML file contains more information than what is extracted and indexed by the GDC. XML files submitted to the above endpoints are not retained or distributed to GDC data users, so the same files should also be submitted as data files (i.e. as clinical or biospecimen supplements). +>**NOTE:** A typical BCR XML file contains more information than what is extracted and indexed by the GDC. XML files submitted to the above endpoints are not retained or distributed to GDC data users, so the same files should also be submitted as data files (i.e. as clinical or biospecimen supplements). ### Data File Formats @@ -82,7 +82,7 @@ Submitters can assign UUIDs to all submittable entities other than those that co In addition to `id`, many entities also include a `submitter_id` field. This field can contain any string (e.g. a "barcode") that the submitter wishes to use to identify the entity. Typically this string identifies a corresponding entry in submitter's records. The GDC's only requirement with respect to `submitter_id` is that it be a string that is unique for all entities within a project. The GDC Submission API requires a `submitter_id` for most entities. -**Note:** For `case` entities, `submitter_id` must correspond to a `submitted_subject_id` of a study participant registered with the project in dbGaP. 
+>**Note:** For `case` entities, `submitter_id` must correspond to a `submitted_subject_id` of a study participant registered with the project in dbGaP. ### GDC Data Dictionary Endpoints @@ -321,7 +321,7 @@ The following transaction fields can be queried using [GraphQL](#querying-submit |`state`|String|Indicates the state of the transaction: `PENDING`, `SUCCEEDED`, `FAILED` (due to user error), or `ERRORED` (due to system error)| |`committed_by`|ID|The ID of the transaction that committed this transaction| -**Note:** To check whether a dry run transaction was committed successfully, check the `state` of the transaction that executed the commit. The `state` of the dry run transaction itself does not represent the status of a subsequent commit. +>**Note:** To check whether a dry run transaction was committed successfully, check the `state` of the transaction that executed the commit. The `state` of the dry run transaction itself does not represent the status of a subsequent commit. ## Creating and Updating Entities @@ -333,7 +333,7 @@ The GDC Submission API supports HTTP POST and HTTP PUT methods for creating enti The GDC suggests using POST for creating new entities, and using PUT only for updating entities. This helps to avoid inadvertent entity updates that can occur when using PUT for creating entities. -**Note:** Once a relationship has been created between two entities, it cannot be removed by updating an entity. To remove a relationship, the child entity must be [deleted](#deleting-entities). +>**Note:** Once a relationship has been created between two entities, it cannot be removed by updating an entity. To remove a relationship, the child entity must be [deleted](#deleting-entities). ### Example: Creating and Updating Case Entities (JSON) @@ -342,7 +342,7 @@ In this example, a case entity is created using POST. 
Then an attempt is made to The JSON in the request was generated using the `case` JSON template that can be obtained from the [GDC Data Dictionary Viewer](../../Data_Dictionary/index.md) and from `https://api.gdc.cancer.gov/v0/submission/template/case?format=json`. -**Note:** For `case` entities, `submitter_id` must correspond to a `submitted_subject_id` of a study participant registered with the project in dbGaP. +>**Note:** For `case` entities, `submitter_id` must correspond to a `submitted_subject_id` of a study participant registered with the project in dbGaP. ```Request1 @@ -718,6 +718,181 @@ curl --header "X-Auth-Token: $token" --header 'Content-Type: text/tsv' --request "updated_entity_count": 0 } ``` + +### Example: Bulk Transaction + +To wrap multiple TSV or JSON files into a single transaction, the bulk endpoint can be used. In this example a TSV to create Clinical Supplement nodes is included in the same transaction as a JSON to create Demographic nodes. + + +```Request +[ + { + "name":"Demographic", + "doc_format":"Json", + "doc":"[\n {\n \"submitter_id\": \"demographic1234\",\n \"vital_status\": \"Dead\",\n \"cases\": [\n {\n \"submitter_id\": \"GDC-INTERNAL-000021\"\n }\n ],\n \"ethnicity\": \"not reported\",\n \"gender\": \"male\",\n \"race\": \"white\",\n \"project_id\": \"GDC-INTERNAL\",\n \"type\": \"demographic\"\n },\n {\n \"submitter_id\": \"demographicABCD\",\n \"vital_status\": \"Alive\",\n \"cases\": [\n {\n \"submitter_id\": \"GDC-INTERNAL-000010\"\n }\n ],\n \"ethnicity\": \"not reported\",\n \"gender\": \"female\",\n \"race\": \"white\",\n \"project_id\": \"GDC-INTERNAL\",\n \"type\": \"demographic\"\n }\n]" + }, + { + "name":"Clinical Supplement", + "doc_format":"Tsv", + 
"doc":"cases.submitter_id\tdiagnoses.id\tdiagnoses.submitter_id\tparent_samples.id\tparent_samples.submitter_id\ttissue_source_sites.id\ttissue_source_sites.code\ttype\tproject_id\tsubmitter_id\tsample_type\ttissue_type\tbiospecimen_anatomic_site\tbiospecimen_laterality\tcatalog_reference\tcomposition\tcurrent_weight\tdays_to_collection\tdays_to_sample_procurement\tdiagnosis_pathologically_confirmed\tdistance_normal_to_tumor\tdistributor_reference\tfreezing_method\tgrowth_rate\tinitial_weight\tintermediate_dimension\tis_ffpe\tlongest_dimension\tmethod_of_sample_procurement\toct_embedded\tpassage_count\tpathology_report_uuid\tpreservation_method\tsample_type_id\tshortest_dimension\ttime_between_clamping_and_freezing\ttime_between_excision_and_freezing\ttumor_code\ttumor_code_id\ttumor_descriptor\nGDC-INTERNAL-000021\t\t\t\t\t\t\tsample\tGDC-INTERNAL\tGDC-INTERNAL-000021-Sample1\tPrimary Tumor\tTumor\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tPrimary\nGDC-INTERNAL-000021\t\t\t\t\t\t\tsample\tGDC-INTERNAL\tGDC-INTERNAL-000021-Sample2\tPrimary Tumor\tTumor\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tPrimary\nGDC-INTERNAL-000021\t\t\t\t\t\t\tsample\tGDC-INTERNAL\tGDC-INTERNAL-000021-Sample3\tPrimary Tumor\tTumor\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tPrimary\n" + } +] +``` +```Command +curl -XPOST --header "X-Auth-Token: $token" --data-binary @Request 'https://api.gdc.cancer.gov/submission/GDC/INTERNAL/bulk/_dry_run' +``` +```Response +{ + "code": 200, + "created_entity_count": 5, + "document_error_count": 0, + "entity_error_count": 0, + "message": "Bulk Transaction succeeded.", + "subtransactions": [ + { + "name": "Demographic", + "response_json": { + "cases_related_to_created_entities_count": 2, + "cases_related_to_updated_entities_count": 0, + "code": 200, + "created_entity_count": 2, + "entities": [ + { + "action": "create", + "errors": [], + "id": "642ffbd6-f909-40b7-84a5-51458c28fab8", + "related_cases": [ + { + "id": 
"b5622ca2-8f51-453e-b411-b2ac045bb04a", + "submitter_id": "GDC-INTERNAL-000021" + } + ], + "type": "demographic", + "unique_keys": [ + { + "project_id": "GDC-INTERNAL", + "submitter_id": "demographic1234" + } + ], + "valid": true, + "warnings": [] + }, + { + "action": "create", + "errors": [], + "id": "3d3488c9-07d3-46bb-8c13-4671ced43033", + "related_cases": [ + { + "id": "4ca09b58-5765-4034-8ec0-ede5d756ea5d", + "submitter_id": "GDC-INTERNAL-000010" + } + ], + "type": "demographic", + "unique_keys": [ + { + "project_id": "GDC-INTERNAL", + "submitter_id": "demographicABCD" + } + ], + "valid": true, + "warnings": [] + } + ], + "entity_error_count": 0, + "message": "Transaction would have been successful. User selected dry run option, transaction aborted, no data written to database.", + "success": true, + "transaction_id": 1636917, + "transactional_error_count": 0, + "transactional_errors": [], + "updated_entity_count": 0 + } + }, + { + "name": "Clinical Supplement", + "response_json": { + "cases_related_to_created_entities_count": 1, + "cases_related_to_updated_entities_count": 0, + "code": 200, + "created_entity_count": 3, + "entities": [ + { + "action": "create", + "errors": [], + "id": "f0555c6b-8737-4d06-bf33-9641aab14497", + "related_cases": [ + { + "id": "b5622ca2-8f51-453e-b411-b2ac045bb04a", + "submitter_id": "GDC-INTERNAL-000021" + } + ], + "type": "sample", + "unique_keys": [ + { + "project_id": "GDC-INTERNAL", + "submitter_id": "GDC-INTERNAL-000021-Sample1" + } + ], + "valid": true, + "warnings": [] + }, + { + "action": "create", + "errors": [], + "id": "dbb07d81-cda3-47b3-87a4-3a50271b72b6", + "related_cases": [ + { + "id": "b5622ca2-8f51-453e-b411-b2ac045bb04a", + "submitter_id": "GDC-INTERNAL-000021" + } + ], + "type": "sample", + "unique_keys": [ + { + "project_id": "GDC-INTERNAL", + "submitter_id": "GDC-INTERNAL-000021-Sample2" + } + ], + "valid": true, + "warnings": [] + }, + { + "action": "create", + "errors": [], + "id": 
"d8b9fb1f-d94b-4c9c-8bf2-48e69daba6ba", + "related_cases": [ + { + "id": "b5622ca2-8f51-453e-b411-b2ac045bb04a", + "submitter_id": "GDC-INTERNAL-000021" + } + ], + "type": "sample", + "unique_keys": [ + { + "project_id": "GDC-INTERNAL", + "submitter_id": "GDC-INTERNAL-000021-Sample3" + } + ], + "valid": true, + "warnings": [] + } + ], + "entity_error_count": 0, + "message": "Transaction would have been successful. User selected dry run option, transaction aborted, no data written to database.", + "success": true, + "transaction_id": 1636917, + "transactional_error_count": 0, + "transactional_errors": [], + "updated_entity_count": 0 + } + } + ], + "success": true, + "transaction_id": 1636917, + "transactional_errors": [], + "updated_entity_count": 0 +} +``` + ### Example: Updating a Sample Entity (JSON) Entities can be updated using a very similar process to what is shown above. @@ -2505,11 +2680,13 @@ curl --header "X-Auth-Token: $token" --header 'Content-Type: json' --request PUT ### Downloading Files -Files in file state = validated can be downloaded by the submitter using the API or the Data Transfer Tool. This is done in a similar manner as files available in the Data Portal, but will require submission access to the particular project in dbGaP as opposed to downloader access. File UUIDs can be found in the original upload manifest file, the submission portal, or by API calls. See [Downloading Files](Downloading_Files.md) for details. +Files in `file_state = validated` can be downloaded by the submitter using the API or the Data Transfer Tool. This is done in a similar manner as files available in the Data Portal, but will require submission access to the particular project in dbGaP as opposed to downloader access. File UUIDs can be found in the original upload manifest file, the submission portal, or by API calls. See [Downloading Files](Downloading_Files.md) for details. 
### Deleting Files -Uploaded files can be deleted by deleting the entity that corresponds to the file. See [Deleting Entities](#deleting-entities) for details. +Uploaded files must be deleted using a two-step process. First, the file is deleted using the Data Transfer Tool. See [Deleting Previously Uploaded Data](../../Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload/#deleting-previously-uploaded-data) for details. + +Second, the file node can be deleted or modified. See [Deleting Entities](#deleting-entities) for details. ## Querying Submitted Data Using GraphQL @@ -2519,7 +2696,7 @@ Uploaded files can be deleted by deleting the entity that corresponds to the fil Unlike the methods outlined in [Search and Retrieval](Search_and_Retrieval.md), which provide access to public releases (or snapshots) of GDC data, the `/graphql` endpoint of GDC Submission API makes it possible for submitters to access "live" data, which provides a real-time view of the state of entities in a project. -**NOTE:** Access to GDC Submission API GraphQL service is limited to authorized and authenticated submitters. Submitters may only access data in their own project using GraphQL. +>**NOTE:** Access to GDC Submission API GraphQL service is limited to authorized and authenticated submitters. Submitters may only access data in their own project using GraphQL. ### GraphQL IDE @@ -2536,7 +2713,7 @@ GDC data submitters can access the GDC Submission API GraphQL endpoint at: where __[API_version/]__ is the optional API version component (see [Getting Started](Getting_Started.md)). -**NOTE:** An authentication token is required for all requests to the `graphql` endpoint. Queries are restricted to those projects for which the submitter has obtained authorization. +>**NOTE:** An authentication token is required for all requests to the `graphql` endpoint. Queries are restricted to those projects for which the submitter has obtained authorization. 
### Constructing a Query diff --git a/docs/API/Users_Guide/images/graphql-query.png b/docs/API/Users_Guide/images/graphql-query.png new file mode 100644 index 000000000..e09f3f9a5 Binary files /dev/null and b/docs/API/Users_Guide/images/graphql-query.png differ diff --git a/docs/API/Users_Guide/scripts/Authentication_Tokens.py b/docs/API/Users_Guide/scripts/Authentication_Tokens.py new file mode 100644 index 000000000..2ff713591 --- /dev/null +++ b/docs/API/Users_Guide/scripts/Authentication_Tokens.py @@ -0,0 +1,27 @@ +import requests +import json +import re + +# This script will not work until $TOKEN_FILE_PATH is replaced with an actual path. + +with open("$TOKEN_FILE_PATH","r") as token: + token_string = str(token.read().strip()) + +headers = { + 'X-Auth-Token': token_string + } + +data_endpt = 'https://api.gdc.cancer.gov/data/' +data_uuid = 'a1c1b23b-cc41-4e85-b1b7-62a42873c5af' +headers = { + 'X-Auth-Token': token_string + } +response = requests.get(data_endpt + data_uuid, headers=headers) + +# The file name can be found in the header within the Content-Disposition key. +response_head_cd = response.headers["Content-Disposition"] + +file_name = re.findall("filename=(.+)", response_head_cd)[0] + +with open(file_name, "wb") as output_file: + output_file.write(response.content) \ No newline at end of file diff --git a/docs/API/Users_Guide/scripts/Basic_Query.py b/docs/API/Users_Guide/scripts/Basic_Query.py index 5eb08266d..eedeb2432 100644 --- a/docs/API/Users_Guide/scripts/Basic_Query.py +++ b/docs/API/Users_Guide/scripts/Basic_Query.py @@ -3,7 +3,7 @@ cases_endpt = 'https://api.gdc.cancer.gov/cases' -# The fields parameter is passed as a comma-separated string of single names +# The 'fields' parameter is passed as a comma-separated string of single names. fields = [ "submitter_id", "case_id", @@ -22,4 +22,10 @@ response = requests.get(cases_endpt, params = params) +# OUTPUT METHOD 1: Write to a file. 
+file = open("basic_query.tsv", "w") +file.write(response.text) +file.close() + +# OUTPUT METHOD 2: View on screen. print(response.content) diff --git a/docs/API/Users_Guide/scripts/Basic_Troubleshooting.py b/docs/API/Users_Guide/scripts/Basic_Troubleshooting.py new file mode 100644 index 000000000..0374b112c --- /dev/null +++ b/docs/API/Users_Guide/scripts/Basic_Troubleshooting.py @@ -0,0 +1,11 @@ +import requests +status_endpt = "https://api.gdc.cancer.gov/status" +response = requests.get(status_endpt) + +# OUTPUT METHOD 1: Write to a file. +file = open("api_status.json", "w") +file.write(response.text) +file.close() + +# OUTPUT METHOD 2: View on screen. +print(response.content) \ No newline at end of file diff --git a/docs/API/Users_Guide/scripts/Complex_Query.py b/docs/API/Users_Guide/scripts/Complex_Query.py index e3df06f0d..ad9b70682 100644 --- a/docs/API/Users_Guide/scripts/Complex_Query.py +++ b/docs/API/Users_Guide/scripts/Complex_Query.py @@ -1,6 +1,7 @@ import requests import json +# The 'fields' parameter is passed as a comma-separated string of single names. fields = [ "file_name", "cases.submitter_id", @@ -52,4 +53,10 @@ # The parameters are passed to 'json' rather than 'params' in this case response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params) -print(response.content.decode("utf-8")) +# OUTPUT METHOD 1: Write to a file. +file = open("complex_filters.tsv", "w") +file.write(response.text) +file.close() + +# OUTPUT METHOD 2: View on screen. +print(response.content.decode("utf-8")) \ No newline at end of file diff --git a/docs/API/Users_Guide/scripts/Filter_Query.py b/docs/API/Users_Guide/scripts/Filter_Query.py index 42c17761d..11bd6dcc9 100644 --- a/docs/API/Users_Guide/scripts/Filter_Query.py +++ b/docs/API/Users_Guide/scripts/Filter_Query.py @@ -1,6 +1,7 @@ import requests import json +# The 'fields' parameter is passed as a comma-separated string of single names. 
fields = [ "submitter_id", "case_id", @@ -21,8 +22,7 @@ } } -# With a GET request, the filters parameter needs to be converted -# from a dictionary to JSON-formatted string +# With a GET request, the filters parameter needs to be converted from a dictionary to JSON-formatted string params = { "filters": json.dumps(filters), @@ -33,4 +33,10 @@ response = requests.get(cases_endpt, params = params) -print(response.content) +# OUTPUT METHOD 1: Write to a file. +file = open("filtered_query.tsv", "w") +file.write(response.text) +file.close() + +# OUTPUT METHOD 2: View on screen. +print(response.content) \ No newline at end of file diff --git a/docs/API/Users_Guide/scripts/Sample_Request.py b/docs/API/Users_Guide/scripts/Sample_Request.py new file mode 100644 index 000000000..9dc118b67 --- /dev/null +++ b/docs/API/Users_Guide/scripts/Sample_Request.py @@ -0,0 +1,14 @@ +import requests +import json + +file_endpt = 'https://api.gdc.cancer.gov/files/' +file_uuid = 'd853e541-f16a-4345-9f00-88e03c2dc0bc' +response = requests.get(file_endpt + file_uuid) + +# OUTPUT METHOD 1: Write to a file. +file = open("sample_request.json", "w") +file.write(response.text) +file.close() + +# OUTPUT METHOD 2: View on screen. +print(json.dumps(response.json(), indent=2)) \ No newline at end of file diff --git a/docs/Data/Bioinformatics_Pipelines/Aligned_reads_summary_metrics.md b/docs/Data/Bioinformatics_Pipelines/Aligned_reads_summary_metrics.md new file mode 100644 index 000000000..c15f23e86 --- /dev/null +++ b/docs/Data/Bioinformatics_Pipelines/Aligned_reads_summary_metrics.md @@ -0,0 +1,23 @@ +# Aligned Reads Summary Metrics + +Various summary metrics are added to the aligned reads entity for query by the user. These are generated by such tools as SAMtools, Picard, and GATK4. These may be helpful to determine underlying quality or summary information regarding the submitted data. 
Examples are included below: + +* average_base_quality +* average_insert_size +* average_read_length +* contamination +* contamination_error +* mean_coverage +* msi_score +* msi_status +* pairs_on_diff_chr +* proportion_base_mismatch +* proportion_coverage_10X +* proportion_coverage_30X +* proportion_reads_duplicated +* proportion_reads_mapped +* proportion_targets_no_coverage +* total_reads + + +For a complete list of the summary metrics as well as the tools used to generate them please visit the [Data Dictionary Viewer](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=aligned_reads). diff --git a/docs/Data/Bioinformatics_Pipelines/CNV_Pipeline.md b/docs/Data/Bioinformatics_Pipelines/CNV_Pipeline.md index bf0f9218a..b2fb7eb8c 100644 --- a/docs/Data/Bioinformatics_Pipelines/CNV_Pipeline.md +++ b/docs/Data/Bioinformatics_Pipelines/CNV_Pipeline.md @@ -2,29 +2,79 @@ ## Introduction -The copy number variation (CNV) pipeline uses Affymetrix SNP 6.0 array data to identify genomic regions that are repeated and infer the copy number of these repeats. This pipeline is built onto the existing TCGA level 2 data generated by [Birdsuite](https://www.broadinstitute.org/scientific-community/science/programs/medical-and-population-genetics/birdsuite/birdsuite) and uses the [DNAcopy](http://www.bioconductor.org/packages/release/bioc/html/DNAcopy.html) R-package to perform a circular binary segmentation (CBS) analysis [[1]](http://biostatistics.oxfordjournals.org/content/5/4/557.short). CBS translates noisy intensity measurements into chromosomal regions of equal copy number. The final output files are segmented into genomic regions with the estimated copy number for each region. The GDC further transforms these copy number values into segment mean values, which are equal to log2(copy-number/ 2). Diploid regions will have a segment mean of zero, amplified regions will have positive values, and deletions will have negative values. 
The GRCh38 probe-set was produced by mapping probe sequences to the GRCh38 reference genome and can be downloaded at the [GDC Reference File Website](https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files). +The copy number variation (CNV) pipeline uses Affymetrix SNP 6.0 array data to identify genomic regions that are repeated and infer the copy number of these repeats. This pipeline is built onto the existing TCGA level 2 data generated by [Birdsuite](https://www.broadinstitute.org/scientific-community/science/programs/medical-and-population-genetics/birdsuite/birdsuite) and uses the [DNAcopy](http://www.bioconductor.org/packages/release/bioc/html/DNAcopy.html) R-package to perform a circular binary segmentation (CBS) analysis [[1]](http://biostatistics.oxfordjournals.org/content/5/4/557.short). CBS translates noisy intensity measurements into chromosomal regions of equal copy number. The final output files are segmented into genomic regions with the estimated copy number for each region. The GDC further transforms these copy number values into segment mean values, which are equal to log2(copy-number/ 2). Diploid regions will have a segment mean of zero, amplified regions will have positive values, and deletions will have negative values. ## Data Processing Steps -A metadata preprocessing step is used to convert the GRCh37 (hg19) probe set coordinates to the newer GRCh38 (hg38) genome build coordinates. A minimum quality control step to verify that reference bases are consistent across two genome builds is used to filter out low quality liftover probe sets. +The GRCh38 probe-set was produced by mapping probe sequences to the GRCh38 reference genome and can be downloaded at the [GDC Reference File Website](https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files). 
+ +### Copy Number Segmentation The [Copy Number Liftover Workflow](/Data_Dictionary/viewer/#?view=table-definition-view&id=copy_number_liftover_workflow) uses the TCGA level 2 tangent.copynumber files described above. These files were generated by first normalizing array intensity values, estimating raw copy number, and performing tangent normalization, which subtracts variation that is found in a set of normal samples. Original array intensity values (TCGA level 1) are available in the [GDC Legacy Archive](https://portal.gdc.cancer.gov/legacy-archive/) under the "Data Format: CEL" and "Platform: Affymetrix SNP 6.0" filters. The Copy Number Liftover Workflow performs CBS analysis using the DNACopy R-package to process tangent normalized data into [Copy Number Segment](/Data_Dictionary/viewer/#?view=table-definition-view&id=copy_number_segment) files, which associate contiguous chromosome regions with log2 ratio segment means in a tab-delimited format. The number of probes with intensity values associated with each chromosome region is also reported (probes with no intensity values are not included in this count). During copy number segmentation probe sets from Pseudo-Autosomal Regions (PARs) were removed from males and Y chromosome segments were removed from females. -Masked copy number segments are generated with the same method except that a filtering step is performed that removes Y chromosome and probe sets that were previously indicated to have frequent germline copy-number variation. +Masked copy number segments are generated using the same method except that a filtering step is performed that removes the Y chromosome and probe sets that were previously indicated to be associated with frequent germline copy-number variation. 
| I/O | Entity | Format | |---|---|---| | Input | [Submitted Tangent Copy Number](/Data_Dictionary/viewer/#?view=table-definition-view&id=submitted_tangent_copy_number) | TXT | | Output | [Copy Number Segment](/Data_Dictionary/viewer/#?view=table-definition-view&id=copy_number_segment) or Masked Copy Number Segment | TXT | + +### Copy Number Estimation + +Numeric focal-level Copy Number Variation (CNV) values were generated with "Masked Copy Number Segment" files from tumor aliquots using GISTIC2 [[2]](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2011-12-4-r41), [[3]](https://www.nature.com/articles/nature08822) on a project level. Only protein-coding genes were kept, and their numeric CNV values were further thresholded by a noise cutoff of 0.3: + +* Genes with focal CNV values smaller than -0.3 are categorized as a "loss" (-1) +* Genes with focal CNV values larger than 0.3 are categorized as a "gain" (+1) +* Genes with focal CNV values between and including -0.3 and 0.3 are categorized as "neutral" (0). + +Values are reported in a project-level TSV file. Each row represents a gene, which is reported as an Ensembl ID and associated cytoband. The columns represent aliquots, which are associated with CNV value categorizations (0/1/-1) for each gene. 
+| I/O | Entity | Format | +|---|---|---| +| Input | [Masked Copy Number Segment](/Data_Dictionary/viewer/#?view=table-definition-view&id=copy_number_segment) | TXT | +| Output | [Copy Number Estimate](/Data_Dictionary/viewer/#?view=table-definition-view&id=copy_number_estimate) | TXT | + + +### GISTIC2 Command Line Parameters + +```Shell +gistic2 \ +-b \ +-seg \ +-mk \ +-refgene \ +-ta 0.1 \ +-armpeel 1 \ +-brlen 0.7 \ +-cap 1.5 \ +-conf 0.99 \ +-td 0.1 \ +-genegistic 1 \ +-gcm extreme \ +-js 4 \ +-maxseg 2000 \ +-qvt 0.25 \ +-rx 0 \ +-savegene 1 \ +(-broad 1) +``` + + ## File Access and Availability | Type | Description | Format | |---|---|---| | Copy Number Segment| A table that associates contiguous chromosomal segments with genomic coordinates, mean array intensity, and the number of probes that bind to each segment. | TXT | | Masked Copy Number Segment | A table with the same information as the Copy Number Segment except that segments with probes known to contain germline mutations are removed. | TXT | +| Copy Number Estimate | A project-level file that displays gains/losses on a gene level. Generated from the Masked Copy Number Segment files | TXT | + [1] Olshen, Adam B., E. S. Venkatraman, Robert Lucito, and Michael Wigler. "Circular binary segmentation for the analysis of array-based DNA copy number data." Biostatistics 5, no. 4 (2004): 557-572. + +[2] Mermel, Craig H., Steven E. Schumacher, Barbara Hill, Matthew L. Meyerson, Rameen Beroukhim, and Gad Getz. "GISTIC2.0 facilitates sensitive and confident localization of the targets of focal somatic copy-number alteration in human cancers." Genome biology 12, no. 4 (2011): R41. + +[3] Beroukhim, Rameen, Craig H. Mermel, Dale Porter, Guo Wei, Soumya Raychaudhuri, Jerry Donovan, Jordi Barretina et al. "The landscape of somatic copy-number alteration across human cancers." Nature 463, no. 7283 (2010): 899. 
diff --git a/docs/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline.md b/docs/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline.md index 0c7e95651..6398f4e38 100644 --- a/docs/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline.md +++ b/docs/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline.md @@ -26,11 +26,6 @@ Prior to alignment, BAM files that were submitted to the GDC are split by read g DNA-Seq analysis begins with the [Alignment Workflow](/Data_Dictionary/viewer/#?view=table-definition-view&id=alignment_workflow). Read groups are aligned to the reference genome using one of two [BWA](http://bio-bwa.sourceforge.net) algorithms [[1]](http://www.ncbi.nlm.nih.gov/pubmed/19451168). BWA-MEM is used if mean read length is greater than or equal to 70 bp. Otherwise BWA-aln is used. Each read group is aligned to the reference genome separately and all read group alignments that belong to a single aliquot are merged using [Picard Tools](http://broadinstitute.github.io/picard) [SortSam](https://broadinstitute.github.io/picard/command-line-overview.html#SortSam) and [MergeSamFiles](https://broadinstitute.github.io/picard/command-line-overview.html#MergeSamFiles). Duplicate reads, which may persist as PCR artifacts, are then flagged to prevent downstream variant call errors. -#### Quality Control - -Quality control metrics are collected before and after the alignment workflow and reviewed to identify potential low-quality data files. Basic metrics such as GC content and mean read length as well as quality score metrics are collected from unaligned reads using [FASTQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Quality metrics collected by the GDC for aligned reads include samtools idxstat and flagstat. Alignment information is collected using Picard [CollectMultipleMetrics](https://broadinstitute.github.io/picard/command-line-overview.html#CollectMultipleMetrics) for both WGS and WXS. 
Coverage information is collected using picard [CollectWgsMetrics](https://broadinstitute.github.io/picard/command-line-overview.html#CollectWgsMetrics) for WGS and picard [CollectHsMetrics](https://broadinstitute.github.io/picard/command-line-overview.html#CollectHsMetrics) for WXS. - -Quality control metrics for each file endpoint can be accessed through the API using the `expand=analysis.metadata.read_groups,analysis.metadata.read_groups.read_group_qcs` parameter. Click [here](https://api.gdc.cancer.gov/files/40e311a4-67aa-468a-8e09-1c7daa2d10bb?pretty=true&expand=analysis.metadata.read_groups,analysis.metadata.read_groups.read_group_qcs) for an example query. #### Reference Genome @@ -42,10 +37,12 @@ All alignments are performed using the human reference genome GRCh38.d1.vd1. Dec | Input | [Submitted Unaligned Reads](/Data_Dictionary/viewer/#?view=table-definition-view&id=submitted_unaligned_reads) or [Submitted Aligned Reads](/Data_Dictionary/viewer/#?view=table-definition-view&id=submitted_aligned_reads) | FASTQ or BAM | | Output | [Aligned Reads](/Data_Dictionary/viewer/#?view=table-definition-view&id=aligned_reads) | BAM | -![DNA-Seq Alignment Pipeline](images/dna-alignment-pipeline_0.png) +![DNA-Seq Alignment Pipeline](images/dna-alignment-pipeline_1.png) ### DNA-Seq Alignment Command Line Parameters +__Note that version numbers may vary in files downloaded from the GDC Portal due to ongoing pipeline development and improvement.__ + #### Step 1: Converting BAMs to FASTQs with Biobambam - biobambam2 2.0.54 ```Shell bamtofastq \ @@ -53,7 +50,7 @@ collate=1 \ exclude=QCFAIL,SECONDARY,SUPPLEMENTARY \ filename= \ gz=1 \ -inputformat=bam +inputformat=bam \ level=5 \ outputdir= \ outputperreadgroup=1 \ @@ -180,21 +177,22 @@ java -jar GenomeAnalysisTK.jar \ ### Somatic Variant Calling Workflow -Aligned and co-cleaned BAM files are processed through the [Somatic Mutation Calling 
Workflow](/Data_Dictionary/viewer/#?view=table-definition-view&id=somatic_mutation_calling_workflow) as tumor-normal pairs. Variant calling is performed using four separate pipelines: +Aligned and co-cleaned BAM files are processed through the [Somatic Mutation Calling Workflow](/Data_Dictionary/viewer/#?view=table-definition-view&id=somatic_mutation_calling_workflow) as tumor-normal pairs. Variant calling is performed using five separate pipelines: - [MuSE](http://bioinformatics.mdanderson.org/main/MuSE) [[2]](http://www.biorxiv.org/content/early/2016/05/25/055467.abstract) -- [MuTect2](https://www.broadinstitute.org/cancer/cga/mutect) [[3]](http://www.nature.com/nbt/journal/v31/n3/abs/nbt.2514.html) +- [MuTect2](https://gatkforums.broadinstitute.org/gatk/discussion/9183/how-to-call-somatic-snvs-and-indels-using-mutect2) [[3]](http://www.nature.com/nbt/journal/v31/n3/abs/nbt.2514.html) - [VarScan2](http://dkoboldt.github.io/varscan/) [[4]](http://genome.cshlp.org/content/22/3/568.short) - [SomaticSniper](http://gmt.genome.wustl.edu/packages/somatic-sniper/) [[5]](http://bioinformatics.oxfordjournals.org/content/28/3/311.short) +- [Pindel](https://github.com/ucscCancer/pindel-tool) Variant calls are reported by each pipeline in a VCF formatted file. See the GDC [VCF Format](../File_Formats/VCF_Format/) documentation for details on each available field. At this point in the DNA-Seq pipeline, all downstream analyses are branched into four separate paths that correspond to their respective variant calling pipeline. #### Pipeline Descriptions -Four separate variant calling pipelines are implemented for GDC data harmonization. There is currently no scientific consensus on the best variant calling pipeline so the investigator is responsible for choosing the pipeline(s) most appropriate for the data. Some details about the pipelines are indicated below. +Five separate variant calling pipelines are implemented for GDC data harmonization. 
There is currently no scientific consensus on the best variant calling pipeline so the investigator is responsible for choosing the pipeline(s) most appropriate for the data. Some details about the pipelines are indicated below. The [MuTect2 pipeline](https://gdc.cancer.gov/files/public/image/Broad_MuTect_0.png) employs a "Panel of Normals" to identify additional germline mutations. This panel is generated using TCGA blood normal genomes from thousands of individuals that were curated and confidently assessed to be cancer-free. This method allows for a higher level of confidence to be assigned to somatic variants that were called by the MuTect2 pipeline. -Basic outlines for the other three pipelines can be found here: +Basic outlines for three of the other pipelines can be found here: - [VarScan2 pipeline](https://gdc.cancer.gov/files/public/image/varscan-somatic-variant-calling-pipeline.png) - [MuSE pipeline](https://gdc.cancer.gov/files/public/image/muse-somatic-variant-calling-pipeline.png) @@ -202,7 +200,7 @@ Basic outlines for the other three pipelines can be found here: #### Indels -Indel mutations that were generated with the MuTect2 and VarScan pipeline are detected and reported in GDC VCF files. +Indel mutations that were generated with the MuTect2, Pindel, and VarScan pipelines are detected and reported in GDC VCF files. #### Germline Variants At this time, germline variants are deliberately excluded as harmonized data. The GDC does not recommend using germline variants that were previously detected and stored in the Legacy Archive as they do not meet the GDC criteria for high-quality data. 
@@ -327,6 +325,123 @@ java -jar VarScan.jar processSomatic \ --p-value 0.07 ``` +#### Pindel + +__Step 1:__ Filter Reads + +Filter BAM reads that are not unmapped or duplicate or secondary_alignment or failed_quality_control or supplementary for both tumor and normal BAM files + +Tool: sambamba 0.7.0-pre1 + +```Shell +sambamba view $(input.bam) --filter "not (unmapped or duplicate or secondary_alignment or failed_quality_control or supplementary)" --format bam --nthreads 1 --output-filename $(output.bam) +``` + +__Step 2:__ Pindel + +[Pindel version 0.2.5b8, 20151210](https://github.com/genome/pindel/releases/tag/v0.2.5b8) + +__Step 2a.:__ Calculate mean insert size +```Python +cmd = "samtools view -f66 %s | head -n 1000000" % (bam) +output = do_shell_command(cmd) +lines = output.decode('utf-8').split('\n') +b_sum = 0 +b_count = 0 +numlines = 0 +for line in lines: + numlines += 1 + tmp = line.split("\t") + if len(tmp) < 9: + break + if abs(int(tmp[8])) < 10000: + b_sum += abs(int(tmp[8])) + b_count += 1 +try: + mean = b_sum / b_count +``` +__Step 2b.:__ Write it to a config file +```Python +for inputBamFile, meanInsertSize, tag in zip(inputBamFiles, meanInsertSizes, tags): + fil.write("%s\t%s\t%s\n" %(inputBamFile, meanInsertSize, tag)) + fil.close() +``` +__Step 2c.:__ Run pindel +```Shell +pindel \ +-f GRCh38.d1.vd1.fa \ +-i config_file \ +-o $(output_prefix) \ +--exclude GRCh38.d1.vd1.centromeres.telomeres.bed + +``` +__Step 2d.:__ Merge D and SI output +```Python +with open(os.path.join(args.workdir, "pindel_somatic"), "w") as handle: + for p in pindel_files: + if p.endswith("_D"): + with open(p) as ihandle: + for line in ihandle: + if re.search("ChrID", line): + handle.write(line) + for p in pindel_files: + if p.endswith("_SI"): + with open(p) as ihandle: + for line in ihandle: + if re.search("ChrID", line): + handle.write(line) +``` +__Step 2e.:__ Create a config for pindel somatic filter +```Python +indel.filter.input = $(merged.pindel.output) 
+indel.filter.vaf = 0.08 +indel.filter.cov = 20 +indel.filter.hom = 6 +indel.filter.pindel2vcf = "/path/to/pindel/pindel2vcf4tcga" +indel.filter.reference = "GRCh38.d1.vd1.fa" +indel.filter.referencename = "GRCh38" +indel.filter.referencedate = datetime.datetime.now().strftime("%Y%m%d") +indel.filter.output = $(output.file.name.vcf) + +``` +__Step 2f.:__ Apply somatic filter on pindel output +Tool: pindel2vcf4tcga 0.6.3 +```Perl +perl pindel/somatic_filter/somatic_indelfilter.pl $(somatic.indel.filter.config) +``` +__Step 3:__ Pindel +Tool: Picard.jar 2.18.4-SNAPSHOT +```Shell +java \ +-d64 \ +-XX:+UseSerialGC \ +-Xmx16G \ +-jar picard.jar \ +SortVcf \ +CREATE_INDEX=true \ +SEQUENCE_DICTIONARY=GRCh38.d1.vd1.dict \ +I=$(pindel.somatic.vcf) \ +OUTPUT=$(output.vcf.gz) +``` +__Step 5:__ Vt Normalization +Tool: GenomeAnalysisTK.jar nightly-2016-02-25-gf39d340 + +```Shell +java \ +-Xmx4G \ +-jar \ +/bin/GenomeAnalysisTK.jar \ +-T VariantFiltration \ +--disable_auto_index_creation_and_locking_when_reading_rods \ +--variant $(vt.normal.output.vcf.gz) \ +-R GRCh38.d1.vd1.fa \ +--filterExpression "vc.isBiallelic() && vc.getGenotype(\"TUMOR\").getAD().1 < 3" \ +--filterName TALTDP \ +-o $(output.vcf.gz) + + +``` + ### Variant Call Annotation Workflow Raw VCF files are then annotated in the [Somatic Annotation Workflow](/Data_Dictionary/viewer/#?view=table-definition-view&id=somatic_annotation_workflow) with the [Variant Effect Predictor (VEP)](https://www.ensembl.org/info/docs/tools/vep/index.html) v84 [[6]](http://dx.doi.org/10.1093/bioinformatics/btq330) along with VEP GDC plugins. 
@@ -352,10 +467,108 @@ In addition to annotation, [False Positive Filter](https://github.com/ucscCancer | Input | [Simple Somatic Mutation](/Data_Dictionary/viewer/#?view=table-definition-view&id=simple_somatic_mutation) | VCF | | Output | [Annotated Somatic Mutation](/Data_Dictionary/viewer/#?view=table-definition-view&id=annotated_somatic_mutation) | VCF | -### Somatic Aggregation Workflow +### Tumor-Only Variant Calling Workflow -The Somatic Aggregation Workflow generates one MAF file from multiple VCF files; see the [GDC MAF Format](/Data/File_Formats/MAF_Format/) guide for details on file structure. In this step, one MAF file is generated per variant calling pipeline for each project, and contains all available cases within this project. +Tumor-only variant calling is performed on a tumor sample with no paired normal at the request of the research group. This method takes advantage of the normal cell contamination that is present in most tumor samples. These calls are made using the version of MuTect2 included in GATK4. Tumor-only variant call files can be found in the GDC Portal by filtering for "Workflow Type: GATK4 MuTect2". +### Tumor-Only Variant Call Command-Line Parameters +``` +GATK4 v4.0.4.0 + +## 1. Generate OXOG metrics: + +java -d64 -XX:+UseSerialGC -Xmx3G -jar /gatk/gatk.jar \ +CollectSequencingArtifactMetrics \ +-I Tumor_Sample_Alignment.bam \ +-O \ +--FILE_EXTENSION .txt \ +-R GRCh38.d1.vd1.fa ## Only chr1-22 + XYM + +## 2. Generate pileup summaries on tumor sample: + +java -d64 -XX:+UseSerialGC -Xmx3G -jar /gatk/gatk.jar \ +GetPileupSummaries \ +-I Tumor_Sample_Alignment.bam \ +-O .targeted_sequencing.table \ +-V af-only-gnomad-common-biallelic.grch38.main.vcf.gz \ # Germline reference from gnomad +-L intervals.bed \ ## Only chr1-22 + XYM +-R GRCh38.d1.vd1.fa + +## 3. 
Calculate contamination on tumor sample + +java -d64 -XX:+UseSerialGC -Xmx3G -jar /gatk/gatk.jar \ +CalculateContamination \ +-I .targeted_sequencing.table \ # From step 2 +-O .targeted_sequencing.contamination.table + +## 4. Find tumor sample name from BAM + +java -d64 -XX:+UseSerialGC -Xmx3G -jar /gatk/gatk.jar \ +GetSampleName \ +-I Tumor_Sample_Alignment.bam \ +-O .targeted_sequencing.sample_name + +## 5. Run MuTect2 using only tumor sample on chromosome level (25 commands with different intervals) + +java -Djava.io.tmpdir=/tmp/job_tmp_3 -d64 -jar -Xmx3G -XX:+UseSerialGC \ +/bin/gatk-4.0.4.0/gatk-package-4.0.4.0-local.jar \ +Mutect2 \ +-R GRCh38.d1.vd1.fa \ +-L chr4:1-190214555 \ # Specify chromosome +-I Tumor_Sample_Alignment.bam \ +-O 3.mt2.vcf \ +-tumor \ # From step 4 +--af-of-alleles-not-in-resource 2.5e-06 \ +--germline-resource af-only-gnomad.hg38.vcf.gz \ # Germline reference from gnomad +-pon gatk4_mutect2_4136_pon.vcf.gz # New panel of normal created by 4136 TCGA curated normal samples, using GATK4 + +## After this step, all chromosome level VCFs are merged into one. + +## 6. Sort VCF with Picard + +java -d64 -XX:+UseSerialGC -Xmx16G -jar /usr/local/bin/picard.jar \ +SortVcf \ +SEQUENCE_DICTIONARY=GRCh38.d1.vd1.dict \ +OUTPUT=.targeted_sequencing.mutect2.tumor_only.sorted.vcf.gz \ +I=merged_multi_gatk4_mutect2_tumor_only_calling.vcf \ # From step 5 +CREATE_INDEX=true + +## 7. Filter variant calls from MuTect +java -d64 -XX:+UseSerialGC -Xmx3G -jar /gatk/gatk.jar \ +FilterMutectCalls \ +-O .targeted_sequencing.mutect2.tumor_only.contFiltered.vcf.gz \ +-V .targeted_sequencing.mutect2.tumor_only.sorted.vcf.gz \ # From step 6 +--contamination-table .targeted_sequencing.contamination.table \ # From step 3 +-L intervals.bed + +## 8. 
Filter variants by orientation bias +java -d64 -XX:+UseSerialGC -Xmx3G -jar /gatk/gatk.jar \ +FilterByOrientationBias \ +-O .targeted_sequencing.tumor_only.gatk4_mutect2.raw_somatic_mutation.vcf.gz \ # final output +-P .pre_adapter_detail_metrics.txt \ # From step 1 +-V .targeted_sequencing.mutect2.tumor_only.contFiltered.vcf.gz \ # From step 7 +-L intervals.bed \ +-R GRCh38.d1.vd1.fa \ +-AM G/T \ +-AM C/T +``` + +### Tumor-Only Variant Annotation Workflow + +After single-tumor variant calling is performed with MuTect2, a series of filters are applied to minimize the release of germline variants in downloadable VCFs. In all cases, the GDC applies a set of custom filters based on allele frequency, mapping quality, somatic/germline probability, and copy number. In some cases an additional variant classification step is applied before the GDC filters. + +The [PureCN](https://bioconductor.org/packages/devel/bioc/html/PureCN.html) R-package [[7]](https://doi.org/10.1186/s13029-016-0060-z) [[8]](https://doi.org/10.1101/552711) is used to classify the variants by somatic/germline status and clonality based on tumor purity, ploidy, contamination, copy number, and loss of heterozygosity. The following steps are performed with this package: + +* __Interval Capture__ : Generates an interval file using FASTA and BED file coordinates. +* __GC-Normalization__ : Calculates GC-normalized tumor/normal coverage data. +* __Normal DB Creation__ : Generates a normal database using the normalized coverage file and panel-of-normals VCF. +* __Somatic Variant Calling__ : Classifies each of the previously called variants. + +Note that PureCN will not be performed if there is insufficient data to produce a target capture kit specific normal database. On rare occasions, PureCN may not find a numeric solution. If PureCN is not performed or does not find a solution, this is indicated in the VCF header. 
VCF files that were annotated with these pipelines can be found in the GDC Portal by filtering for "Workflow Type: GATK4 MuTect2 Annotation". + +### Somatic Aggregation Workflow + +The Somatic Aggregation Workflow generates one MAF file from multiple VCF files; see the [GDC MAF Format](/Data/File_Formats/MAF_Format/) guide for details on file structure. In this step, one MAF file is generated per variant calling pipeline for each project and contains all available cases within this project. | I/O | Entity | Format | |---|---|---| @@ -373,9 +586,56 @@ While these criteria cause the pipeline to over-filter some of the true positive | Input | [Aggregated Somatic Mutation](/Data_Dictionary/viewer/#?view=table-definition-view&id=aggregated_somatic_mutation) | Protected MAF | | Output | [Masked Somatic Mutation](/Data_Dictionary/viewer/#?view=table-definition-view&id=masked_somatic_mutation) | Somatic MAF | -## File Access and Availability +### Whole Genome Sequencing Variant Calling + +Variant calls are generated from WGS data using a different pipeline than WXS and Targeted Sequencing samples. This pipeline, based on a [workflow generated by the Sanger Institute](https://github.com/cancerit/dockstore-cgpwgs), generates multiple downstream data types using the following software packages: + +* __CaVEMan:__ Single nucleotide variants, which are available in [VCF format](https://docs.gdc.cancer.gov/Data/File_Formats/VCF_Format/). +* __Pindel:__ Small indel variants, which are available in [VCF format](https://docs.gdc.cancer.gov/Data/File_Formats/VCF_Format/). +* __BRASS:__ Structural variants, which are available in *BEDPE format*. 
+* __AscatNGS:__ Copy number variants, which are available as copy number estimates or copy number segment files. Data may be available in *tab separated values (.TSV) or plain text file (.TXT)* format. + +#### BEDPE File Format + +[BEDPE file format](https://bedtools.readthedocs.io/en/latest/content/general-usage.html#bedpe-format) (**b**rowser **e**xtensible **d**ata **p**aired-**e**nd) is designed to concisely describe disjoint genome features, such as structural variations or paired-end sequence alignments. It's an enhanced version of the [BED format](http://genome.ucsc.edu/FAQ/FAQformat#format1), as BED does not allow inter-chromosomal feature definitions. In addition, BED only has one strand field, which is insufficient for paired-end sequence alignments, especially when studying structural variation. The BEDPE format is described below. +* __chr*x* (required):__ The name of the chromosome on which the *x*th end of the feature exists (*x* is 1 or 2). Any string can be used. For example, "chr1", "III", "myChrom", "contig1112.23" (use "." for unknown). +* __start*x* (required):__ The zero-based starting position of the *x*th end of the feature on chr*x*. The first base in a chromosome is numbered 0. The start position in each BEDPE feature is therefore interpreted to be 1 greater than the start position listed in the feature (use -1 for unknown). +* __end*x* (required):__ The one-based ending position of the *x*th end of the feature on chr*x*. The end position in each BEDPE feature is one-based (use -1 for unknown). +* __name (optional):__ Defines the name of the BEDPE feature. Any string can be used. +* __score (optional):__ A score between 0 and 1000. If the track line *useScore* attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). Any string can be used. +* __strand*x* (optional):__ Defines the strand for the *x*th end of the feature. Either "." 
(unknown), "+", or "-". + +In addition to the above fields, bedtools allows for the addition of user-defined fields to the normal, 10-column BEDPE format as necessary. These columns are merely "passed through" pairToBed and pairToPair and are not part of any analysis. One would use these additional columns to add extra information (e.g., edit distance for each end of an alignment, or "deletion", "inversion", etc.) to each BEDPE feature. + +#### CNV from WGS File Format + +AscatNGS, originally developed by [Raine *et al* (2016)](https://doi.org/10.1002/cpbi.17) ([GitHub page](https://github.com/cancerit)), indicates the DNA copy number changes affecting a tumor genome when compared to a matched normal sample. See below for a description of the copy number segment and copy number estimation files produced by AscatNGS: + +* __GDC Aliquot:__ The GDC ID for the aliquot collected from the sample (copy number segment files only). +* __Gene ID:__ The gene Ensembl ID (copy number variant only). +* __Gene Name:__ The gene symbol (copy number variant only). +* __Chromosome:__ The name of the chromosome on which the copy number change exists. +* __Start:__ The starting position of the copy. +* __End:__ The ending position of the copy. +* __Copy Number:__ The weighted median of the strand copy numbers [9]. +* __Major Copy Number:__ The greater strand copy number of the two strands of the DNA (copy number segment files only). +* __Minor Copy number:__ The smaller strand copy number of the two strands of the DNA (copy number segment files only). +* __Max. Copy number:__ The highest copy number for overlapped segment (copy number variant only). +* __Min. Copy number:__ The lowest copy number for overlapped segment (copy number variant only). + + +### Harmonization for GENIE variants + +Variants reported from the AACR Project GENIE are available from the GDC Data Portal in MAF format. 
These variants were produced using an abridged pipeline in which the Genomic Data Commons received the variants directly instead of calling them from aligned reads. For an outline of the harmonization process, see the steps below: + +1. Variants are submitted directly to the GDC as a "Genomic Profile." +1. GENIE variants are lifted over to GRCh38 coordinates. +1. Variants are annotated using VEP and made available via the GDC Data Portal. + +## File Access and Availability + Files from the GDC DNA-Seq analysis pipeline are available in the [GDC Data Portal](https://portal.gdc.cancer.gov) in BAM, VCF, and MAF formats. Descriptions are listed below for all available data types and their respective file formats. | Data Type | Description | File Format | @@ -398,4 +658,10 @@ Files from the GDC DNA-Seq analysis pipeline are available in the [GDC Data Port [5]. Larson, David E., Christopher C. Harris, Ken Chen, Daniel C. Koboldt, Travis E. Abbott, David J. Dooling, Timothy J. Ley, Elaine R. Mardis, Richard K. Wilson, and Li Ding. "SomaticSniper: identification of somatic point mutations in whole genome sequencing data." Bioinformatics 28, no. 3 (2012): 311-317. -[6] McLaren, William, Bethan Pritchard, Daniel Rios, Yuan Chen, Paul Flicek, and Fiona Cunningham. "Deriving the consequences of genomic variants with the Ensembl API and SNP Effect Predictor." Bioinformatics 26, no. 16 (2010): 2069-2070. +[6]. McLaren, William, Bethan Pritchard, Daniel Rios, Yuan Chen, Paul Flicek, and Fiona Cunningham. "Deriving the consequences of genomic variants with the Ensembl API and SNP Effect Predictor." Bioinformatics 26, no. 16 (2010): 2069-2070. + +[7]. Riester, Markus, Angad P. Singh, A. Rose Brannon, Kun Yu, Catarina D. Campbell, Derek Y. Chiang, and Michael P. Morrissey. "PureCN: copy number calling and SNV classification using targeted short read sequencing." Source code for biology and medicine 11, no. 1 (2016): 13. + +[8]. 
Oh, Sehyun, Ludwig Geistlinger, Marcel Ramos, Martin Morgan, Levi Waldron, and Markus Riester. "Reliable analysis of clinical tumor-only whole exome sequencing data" bioRxiv 552711 (2019); + +[9]. Gene-level copy number data is generated by intersection of copy number segment and gene ranges. It is possible for one gene to overlap with multiple segments, and in this case, copy_number, min_copy_number and max_copy_number could take different values. In particular, the copy_number value is calculated as the median, weighted on length of overlapped bases, of segment copy numbers from all overlapped segments. diff --git a/docs/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline.md b/docs/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline.md index 0519ef59d..4684a30c2 100644 --- a/docs/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline.md +++ b/docs/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline.md @@ -6,9 +6,13 @@ The GDC mRNA quantification analysis pipeline measures gene level expression in ## Data Processing Steps ### RNA-Seq Alignment Workflow -The mRNA Analysis pipeline begins with the [Alignment Workflow](/Data_Dictionary/viewer/#?view=table-definition-view&id=alignment_workflow), which is performed using a two-pass method with [STAR](http://labshare.cshl.edu/shares/gingeraslab/www-data/dobin/STAR/STAR.posix/doc/STARmanual.pdf). STAR aligns each [read group](/Data_Dictionary/viewer/#?view=table-definition-view&id=read_group) separately and then merges the resulting alignments into one. Following the methods used by the International Cancer Genome Consortium [ICGC](https://icgc.org/) ([github](https://github.com/akahles/icgc_rnaseq_align)), the two-pass method includes a splice junction detection step, which is used to generate the final alignment. This workflow outputs a BAM file, which contains both aligned and unaligned reads. 
Quality assessment is performed pre-alignment with [FASTQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) and post-alignment with [RNA-SeQC](https://www.broadinstitute.org/cancer/cga/rna-seqc) and [Picard Tools](http://broadinstitute.github.io/picard/). +The mRNA Analysis pipeline begins with the [Alignment Workflow](/Data_Dictionary/viewer/#?view=table-definition-view&id=alignment_workflow), which is performed using a two-pass method with [STAR](https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf). STAR aligns each [read group](/Data_Dictionary/viewer/#?view=table-definition-view&id=read_group) separately and then merges the resulting alignments into one. Following the methods used by the International Cancer Genome Consortium [ICGC](https://icgc.org/) ([github](https://github.com/akahles/icgc_rnaseq_align)), the two-pass method includes a splice junction detection step, which is used to generate the final alignment. This workflow outputs a genomic BAM file, which contains both aligned and unaligned reads. Quality assessment is performed pre-alignment with [FASTQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) and post-alignment with [Picard Tools](http://broadinstitute.github.io/picard/). -[![RNA Alignment Pipeline](images/rna-alignment-pipeline-resized.png)](images/gene-expression-quantification-pipeline.png "Click to see the full image.") +Files that were processed after Data Release 14 have associated transcriptomic and chimeric alignments in addition to the genomic alignment detailed above. This only applies to aliquots with at least one set of paired-end reads. The chimeric BAM file contains reads that were mapped to different chromosomes or strands (fusion alignments). The genomic alignment files contain chimeric and unaligned reads to facilitate the retrieval of all original reads. The transcriptomic alignment reports aligned reads with transcript coordinates rather than genomic coordinates. 
The transcriptomic alignment is also sorted differently to facilitate downstream analyses. BAM index file pairing is not supported by this method of sorting, which does not allow for BAM slicing on these alignments. The splice-junction file for these alignments is also available. + +Files that were processed after Data Release 25 will have associated [gene fusion files](#fusion-pipelines). + +[![RNA Alignment Pipeline](images/gene-expression-quantification-pipeline-v3.png)](images/gene-expression-quantification-pipeline-v3.png "Click to see the full image.") | I/O | Entity | Format | |---|---|---| @@ -17,13 +21,13 @@ The mRNA Analysis pipeline begins with the [Alignment Workflow](/Data_Dictionary ### RNA-Seq Alignment Command Line Parameters -####STAR-2.4.2a +__Note that version numbers may vary in files downloaded from the GDC Portal due to ongoing pipeline development and improvement.__ -####ICGC STAR alignment pipeline +```Original +# STAR-2.4.2a -__For users with access to the ICGC pipeline:__ +### For users with access to the ICGC pipeline: -```Shell python star_align.py \ --genomeDir \ --FastqFileIn \ @@ -46,12 +50,11 @@ python star_align.py \ --sjdbOverhang 100 \ --outSAMstrandField intronMotif \ --outSAMunmapped Within -``` -__For users without access to the ICGC pipeline:__ +### For users without access to the ICGC pipeline: + +### Step 1: Building the STAR index.* -#### Step 1: Building the STAR index.* -```Shell STAR --runMode genomeGenerate --genomeDir @@ -59,11 +62,9 @@ STAR --sjdbOverhang 100 --sjdbGTFfile --runThreadN 8 -``` -\*These indices are available for download at the [GDC Website](https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files) and do not need to be built again. -#### Step 2: Alignment 1st Pass. -```Shell +### Step 2: Alignment 1st Pass. + STAR --genomeDir --readFilesIn ,,... ,,... 
@@ -83,9 +84,9 @@ STAR --outSAMstrandField intronMotif --outSAMtype None --outSAMmode None -``` -#### Step 3: Intermediate Index Generation. -```Shell + +### Step 3: Intermediate Index Generation. + STAR --runMode genomeGenerate --genomeDir @@ -93,9 +94,9 @@ STAR --sjdbOverhang 100 --runThreadN --sjdbFileChrStartEnd -``` -#### Step 4: Alignment 2nd Pass. -```Shell + +### Step 4: Alignment 2nd Pass. + STAR --genomeDir --readFilesIn ,,... ,,... @@ -120,37 +121,90 @@ STAR --outSAMheaderHD @HD VN:1.4 --outSAMattrRGline ``` +```DR15Plus +# STAR-2.6.0c + +STAR \ +--readFilesIn \ +--outSAMattrRGline \ +--alignIntronMax 1000000 \ +--alignIntronMin 20 \ +--alignMatesGapMax 1000000 \ +--alignSJDBoverhangMin 1 \ +--alignSJoverhangMin 8 \ +--alignSoftClipAtReferenceEnds Yes \ +--chimJunctionOverhangMin 15 \ +--chimMainSegmentMultNmax 1 \ +--chimOutType Junctions SeparateSAMold WithinBAM SoftClip \ +--chimSegmentMin 15 \ +--genomeDir \ +--genomeLoad NoSharedMemory \ +--limitSjdbInsertNsj 1200000 \ +--outFileNamePrefix \ +--outFilterIntronMotifs None \ +--outFilterMatchNminOverLread 0.33 \ +--outFilterMismatchNmax 999 \ +--outFilterMismatchNoverLmax 0.1 \ +--outFilterMultimapNmax 20 \ +--outFilterScoreMinOverLread 0.33 \ +--outFilterType BySJout \ +--outSAMattributes NH HI AS nM NM ch \ +--outSAMstrandField intronMotif \ +--outSAMtype BAM Unsorted \ +--outSAMunmapped Within \ +--quantMode TranscriptomeSAM GeneCounts \ +--readFilesCommand \ +--runThreadN \ +--twopassMode Basic +``` + +\*These indices are available for download at the [GDC Website](https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files) and do not need to be built again. + ### mRNA Expression Workflow -Following alignment, BAM files are processed through the [RNA Expression Workflow](/Data_Dictionary/viewer/#?view=table-definition-view&id=rna_expression_workflow). 
+Following alignment, BAM files are processed through the [RNA Expression Workflow](/Data_Dictionary/viewer/#?view=table-definition-view&id=rna_expression_workflow) to determine RNA expression levels. + +The reads mapped to each gene are enumerated using HT-Seq-Count. Expression values are provided in a tab-delimited format. [GENCODE v22](http://www.gencodegenes.org/releases/22.html) was used for gene annotation. -First the BAM files are filtered for aligned reads using the [samtools](http://samtools.sourceforge.net) view function. The reads mapped to each gene are enumerated using HT-Seq count. Expression values are provided in a tab-delimited format. [GENCODE v22](http://www.gencodegenes.org/releases/22.html) was used for gene annotation. +Files that were processed after Data Release 14 have an additional set of read counts that were produced by STAR during the alignment step. -[![Gene Expression Pipeline](images/gene-expression-quantification-pipeline.png)](images/gene-expression-quantification-pipeline.png "Click to see the full image.") +Note that counting algorithms such as HTSeq and STAR will not count reads that are mapped to more than one different gene. Below are two files that list genes that are completely encompassed by other genes and will likely display a value of zero. 
+* [Overlapped Genes (stranded)](/Data/Bioinformatics_Pipelines/overlap.gene.stranded.tsv) +* [Overlapped Genes (unstranded)](/Data/Bioinformatics_Pipelines/overlap.gene.strandless.tsv) | I/O | Entity | Format | |---|---|---| | Input | [Aligned Reads](/Data_Dictionary/viewer/#?view=table-definition-view&id=aligned_reads) | BAM | -| Output | [Gene Expression (HTSeq count/ FPKM/ FPKM-UQ)](/Data_Dictionary/viewer/#?view=table-definition-view&id=gene_expression) | TXT | +| Output | [Gene Expression](/Data_Dictionary/viewer/#?view=table-definition-view&id=gene_expression) | TXT | ### mRNA Quantification Command Line Parameters -Samtools v1.1; HTSeq-0.6.1p1 +HTSeq-0.6.1p1 -```Shell -samtools view -F 4 | +```Original htseq-count \ -m intersection-nonempty \ -i gene_id \ -r pos \ -s no \ - gencode.v22.annotation.gtf - +``` +```DR15Plus +htseq-count \ +-f bam \ +-r name \ +-s no \ +-a 10 \ +-t exon \ +-i gene_id \ +-m intersection-nonempty \ + \ + > ``` -## mRNA Expression Normalization +## mRNA Expression HT-Seq Normalization -RNA-Seq expression level read counts are normalized using two related methods: FPKM and FPKM-UQ. Normalized values should be used only within the context of the entire gene set. Users are encouraged to normalize raw read count values if a subset of genes is investigated. +RNA-Seq expression level read counts produced by HT-Seq are normalized using two similar methods: FPKM and FPKM-UQ. Normalized values should be used only within the context of the entire gene set. Users are encouraged to normalize raw read count values if a subset of genes is investigated. 
### FPKM @@ -162,7 +216,7 @@ The upper quartile FPKM (FPKM-UQ) is a modified FPKM calculation in which the to ### Calculations -[![FPKM Calculations](images/Calc_FPKM_andUQ.png)](images/Calc_FPKM_andUQ.png "Click to see the full image.") +[![FPKM Calculations](images/Calc_FPKM_andUQ.png)](images/fpkm.gif "Click to see the full image.") - __RCg:__ Number of reads mapped to the gene - __RCpc:__ Number of reads mapped to all protein-coding genes @@ -184,6 +238,44 @@ __FPKM for Gene A__ = (1,000)\*(10^9)/[(3,000)\*(1,000,000)] = __333.33__ __FPKM-UQ for Gene A__ = (1,000)\*(10^9)/[(3,000)\*(2,000)] = __166,666.67__ +## Fusion Pipelines + +The GDC uses two pipelines for the detection of gene fusions. + +### Star Fusion Pipeline +The GDC gene fusion pipeline uses the STAR-Fusion v1.6 algorithm to generate gene fusion data. +The STAR-Fusion pipeline processes the output generated by the STAR aligner to map junction reads +and spanning reads to a junction annotation set. It utilizes a chimeric junction file from +running the STAR aligner and produces a tab-delimited gene fusion prediction file. +The prediction file provides fused gene names, junction read count and breakpoint information. + +### Arriba Fusion Pipeline + +The [Arriba gene fusion pipeline](https://github.com/suhrig/arriba) uses Arriba v1.1.0 to detect gene fusions from the RNA-Seq data of tumor samples. + + +## scRNA-Seq Pipeline (single-nuclei) + +The GDC processes single-cell RNA-Seq (scRNA-Seq) data using the [Cell Ranger pipeline](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) to calculate gene expression followed by [Seurat](https://satijalab.org/seurat/) for secondary expression analysis. 
+ +### scRNA Gene Expression Pipeline + +The gene expression pipeline, which uses Cell Ranger, generates three files: + +* Aligned reads file (BAM) +* Raw counts matrix - contains all barcodes in [Market Exchange Format](https://math.nist.gov/MatrixMarket/formats.html) (MEX) +* Filtered counts matrix - contains only detected cellular barcodes (MEX) + +### scRNA Analysis Pipeline + +The analysis pipeline, which uses the Seurat software, generates three files from an input of Filtered counts matrix: + +* Analysis - PCA, UMAP, tSNE values, and graph-based clustering results with associated metadata (TSV) +* Differential gene expression (TSV) - DEG information comparing cells from one cluster to the rest of the cells. +* Full Seurat analysis log as a loom object in [HDF5](https://support.hdfgroup.org/HDF5/) format. + +When the input RNA was extracted from nuclei instead of cytoplasm, a slightly modified quantification method is implemented to include introns. Currently, these single-nuclei RNA-Seq (snRNA-Seq) analyses share the same experimental strategy (scRNA-Seq) in the Data Portal, and can be filtered by querying for aliquot.analyte_type = "Nuclei RNA". + ## File Access and Availability To facilitate the use of harmonized data in user-created pipelines, RNA-Seq gene expression is accessible in the GDC Data Portal at several intermediate steps in the pipeline. Below is a description of each type of file available for download in the GDC Data Portal. @@ -191,6 +283,7 @@ To facilitate the use of harmonized data in user-created pipelines, RNA-Seq gene | Type | Description | Format | |---|---|---| | RNA-Seq Alignment | RNA-Seq reads that have been aligned to the GRCh38 build. 
Reads that were not aligned are included to facilitate the availability of raw read sets | BAM | -| Raw Read Counts | The number of reads aligned to each gene, calculated by HT-Seq | TXT | +| HT-Seq Read Counts | The number of reads aligned to each gene, calculated by HT-Seq | TXT | +| STAR Read Counts | The number of reads aligned to each gene, calculated by STAR | TSV | | FPKM | A normalized expression value that takes into account each gene length and the number of reads mapped to all protein-coding genes | TXT | | FPKM-UQ | A modified version of the FPKM formula in which the 75th percentile read count is used as the denominator in place of the total number of protein-coding reads | TXT | diff --git a/docs/Data/Bioinformatics_Pipelines/Methylation_LO_Pipeline.md b/docs/Data/Bioinformatics_Pipelines/Methylation_LO_Pipeline.md index 637b2506c..35d997718 100644 --- a/docs/Data/Bioinformatics_Pipelines/Methylation_LO_Pipeline.md +++ b/docs/Data/Bioinformatics_Pipelines/Methylation_LO_Pipeline.md @@ -2,7 +2,7 @@ ## Introduction -The [DNA Methylation Liftover Pipeline](https://gdc-docs.nci.nih.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=methylation_liftover_workflow) uses data from the Illumina Infinium Human Methylation 27 (HM27) and HumanMethylation450 (HM450) arrays to measure the level of methylation at known CpG sites as beta values, calculated from array intensities (Level 2 data) as Beta = M/(M+U). +The [DNA Methylation Liftover Pipeline](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=methylation_liftover_workflow) uses data from the Illumina Infinium Human Methylation 27 (HM27) and HumanMethylation450 (HM450) arrays to measure the level of methylation at known CpG sites as beta values, calculated from array intensities (Level 2 data) as Beta = M/(M+U). Using probe sequence information provided in the manufacturer's manifest, HM27 and HM450 probes were remapped to the GRCh38 reference genome [1]. 
Type II probes with a mapping quality of <10, or Type I probes for which the methylated and unmethylated probes map to different locations in the genome, and/or had a mapping quality of <10, had an entry of '\*' for the 'chr' field, and '-1' for coordinates. These coordinates were then used to identify the associated transcripts from GENCODE v22, the associated CpG island (CGI), and the CpG sites' distance from each of these features. Multiple transcripts overlapping the target CpG were separated with semicolons. Beta values were inherited from existing TCGA Level 3 DNA methylation data (hg19-based) based on Probe IDs. diff --git a/docs/Data/Bioinformatics_Pipelines/RPPA_intro.md b/docs/Data/Bioinformatics_Pipelines/RPPA_intro.md new file mode 100644 index 000000000..a070d334d --- /dev/null +++ b/docs/Data/Bioinformatics_Pipelines/RPPA_intro.md @@ -0,0 +1,46 @@ +# Protein Expression - RPPA + +## Introduction + +**R**everse **P**hase **P**rotein **A**rray (RPPA) is a high-throughput antibody-based technique with a procedure similar to that of Western blots. In the procedure carried by [MD Anderson Cancer Center](https://www.mdanderson.org/), hundreds to thousands of different cell lysates are immobilized on a nitrocellulose-coated slide as many individual spots, followed by incubations with one protein-specific antibody, and detection. A group (often several hundreds) of antibodies form a *set*, which are used for each assay. Occasionally, antibodies may be added to or removed from the set depending on feasibility/functionality, which forms a new set. + +To quantify protein expression, a "standard curve" is constructed from spots on each slide (one slide probed for one antibody). These spots include serial dilutions of each sample plus QC spots of standard lysates at different concentrations. 
+ +The technique is capable of the following types of analyses: + +* Patient tumor classification +* DNA, RNA, and Protein correlation +* Prognosis +* Response prediction for targeted therapies +* Pharmacodynamics and biologically relevant dose +* Determination of appropriate handling procedures for clinical samples (based on antigen stability analysis) + +## RPPA Data in the GDC + +The antigens used for RPPA are available at ref [5], with the following information available: + +* ```AGID```: The antigen unique ID +* ```peptide_target```: The unique ID for the target site that the antigen binds to +* ```gene_symbol```: The unique gene name abbreviation that codes the peptide +* ```antibody_origin```: The species that the antibody originated from +* ```source```: The antibody vendor company +* ```catalog_number```: Antibody vendor's catalog number +* ```validation_status```: Indicating how trustworthy those antibodies are, based on QC tests of antibody quality by the MD Anderson. + +The GDC protein expression quantification data set is available in TSV format and contains ```AGID```, ```catalog_number```, and ```peptide_target``` from the reference file, plus the following fields: + +* ```lab_id```: The unique antibody ID +* ```set_id```: The ID for a set, ie list of antibodies (eg refs [3] & [4]). +* ```protein_expression```: Relative levels of protein expression - interpolation of each dilution curve to the "standard curve" (supercurve) of the slide (antibody). + +## References + +[1]. https://bioinformatics.mdanderson.org/public-software/tcpa/ + +[2]. https://www.mdanderson.org/research/research-resources/core-facilities/functional-proteomics-rppa-core/rppa-process.html + +[3]. https://www.mdanderson.org/content/dam/mdanderson/documents/core-facilities/Functional%20Proteomics%20RPPA%20Core%20Facility/RPPA_Expanded_Ab_List_Updated.xlsx + +[4]. 
https://www.mdanderson.org/content/dam/mdanderson/documents/core-facilities/Functional%20Proteomics%20RPPA%20Core%20Facility/RPPA_Standard_Ab_List_Updated.xlsx + +[5]. https://gdc.cancer.gov/about-data/gdc-data-processing/gdc-reference-files diff --git a/docs/Data/Bioinformatics_Pipelines/fpkm.gif b/docs/Data/Bioinformatics_Pipelines/fpkm.gif new file mode 100644 index 000000000..664d69051 Binary files /dev/null and b/docs/Data/Bioinformatics_Pipelines/fpkm.gif differ diff --git a/docs/Data/Bioinformatics_Pipelines/images/dna-alignment-pipeline_1.png b/docs/Data/Bioinformatics_Pipelines/images/dna-alignment-pipeline_1.png new file mode 100644 index 000000000..990903b5f Binary files /dev/null and b/docs/Data/Bioinformatics_Pipelines/images/dna-alignment-pipeline_1.png differ diff --git a/docs/Data/Bioinformatics_Pipelines/images/fpkm.gif b/docs/Data/Bioinformatics_Pipelines/images/fpkm.gif new file mode 100644 index 000000000..664d69051 Binary files /dev/null and b/docs/Data/Bioinformatics_Pipelines/images/fpkm.gif differ diff --git a/docs/Data/Bioinformatics_Pipelines/images/gene-expression-quantification-pipeline-v2.png b/docs/Data/Bioinformatics_Pipelines/images/gene-expression-quantification-pipeline-v2.png new file mode 100644 index 000000000..f628bf30b Binary files /dev/null and b/docs/Data/Bioinformatics_Pipelines/images/gene-expression-quantification-pipeline-v2.png differ diff --git a/docs/Data/Bioinformatics_Pipelines/images/gene-expression-quantification-pipeline-v3.png b/docs/Data/Bioinformatics_Pipelines/images/gene-expression-quantification-pipeline-v3.png new file mode 100644 index 000000000..749d545ba Binary files /dev/null and b/docs/Data/Bioinformatics_Pipelines/images/gene-expression-quantification-pipeline-v3.png differ diff --git a/docs/Data/Bioinformatics_Pipelines/overlap.gene.stranded.tsv b/docs/Data/Bioinformatics_Pipelines/overlap.gene.stranded.tsv new file mode 100644 index 000000000..74ae86ef3 --- /dev/null +++ 
b/docs/Data/Bioinformatics_Pipelines/overlap.gene.stranded.tsv @@ -0,0 +1,963 @@ +seqname start end strand gene_id gene_name gene_type +chr1 30366 30503 + ENSG00000274890.1 MIR1302-9 miRNA +chr1 89551 91105 - ENSG00000239945.1 RP11-34P13.8 lincRNA +chr1 258568 259024 - ENSG00000241670.3 AP006222.1 processed_pseudogene +chr1 965110 965166 + ENSG00000277294.1 AL645608.1 miRNA +chr1 1055033 1056116 + ENSG00000242590.1 RP11-54O7.14 sense_intronic +chr1 1312502 1312566 - ENSG00000274153.1 MIR6727 miRNA +chr1 1405460 1405752 - ENSG00000264293.2 RN7SL657P misc_RNA +chr1 9983141 9984568 + ENSG00000241326.1 RP11-807G9.2 sense_intronic +chr1 10306465 10306757 + ENSG00000264501.2 RN7SL731P misc_RNA +chr1 11843812 11843984 + ENSG00000276470.1 NPPA-AS1_1 misc_RNA +chr1 11845549 11845697 + ENSG00000278852.1 NPPA-AS1_2 misc_RNA +chr1 11847442 11847549 + ENSG00000275915.1 NPPA-AS1_3 misc_RNA +chr1 12824942 12828663 - ENSG00000279195.1 PRAMEF11 protein_coding +chr1 15659869 15661722 + ENSG00000215695.1 RSC1A1 protein_coding +chr1 15684472 15684558 + ENSG00000264048.1 AL121992.1 miRNA +chr1 16873708 16873851 + ENSG00000277234.1 U1 snRNA +chr1 17413631 17413694 - ENSG00000266727.1 AC004824.1 miRNA +chr1 20633679 20633788 + ENSG00000273695.1 MIR6084 miRNA +chr1 23370254 23370346 + ENSG00000201405.1 Y_RNA misc_RNA +chr1 25831913 25832134 - ENSG00000272478.1 RP1-317E23.7 antisense +chr1 27325470 27325553 + ENSG00000281023.1 FO393419.1 miRNA +chr1 28507366 28507571 + ENSG00000274266.1 SNORA73A snoRNA +chr1 28579764 28579893 - ENSG00000278274.1 SNORA61 snoRNA +chr1 28580381 28580512 - ENSG00000273544.1 SNORA44 snoRNA +chr1 28580920 28581054 - ENSG00000280498.1 SNORA16A snoRNA +chr1 28580920 28581056 - ENSG00000274582.1 SNORA16A snoRNA +chr1 28648600 28648730 + ENSG00000270103.3 RNU11 lincRNA +chr1 32170733 32176568 + ENSG00000250135.1 RP4-622L5.2 sense_intronic +chr1 35925832 35929610 + ENSG00000280133.1 RP4-789D17.3 TEC +chr1 37480230 37480289 + ENSG00000278228.1 MIR6732 miRNA +chr1 
43364648 43364715 - ENSG00000277622.1 MIR6734 miRNA +chr1 44103945 44104079 + ENSG00000281534.1 AL139220.1 miRNA +chr1 44775864 44775943 + ENSG00000264294.1 SNORD55 snoRNA +chr1 44778390 44778456 + ENSG00000207421.1 SNORD38B snoRNA +chr1 44778390 44778458 + ENSG00000281859.1 SNORD38B snoRNA +chr1 44819883 44819997 - ENSG00000202444.1 RNU5E-6P snRNA +chr1 47761132 47765547 - ENSG00000223814.1 RP11-543D5.2 lincRNA +chr1 52150329 52150412 + ENSG00000281172.1 AL139156.1 miRNA +chr1 62211557 62211666 + ENSG00000200174.1 Y_RNA misc_RNA +chr1 67320369 67395580 + ENSG00000281152.1 IL12RB2 protein_coding +chr1 75787072 75787150 + ENSG00000206620.1 SNORD45C snoRNA +chr1 75787889 75787972 + ENSG00000207241.1 SNORD45A snoRNA +chr1 75789477 75789548 + ENSG00000201487.1 SNORD45B snoRNA +chr1 81990753 81990806 + ENSG00000274207.1 AC113949.1 miRNA +chr1 92229256 92229339 - ENSG00000265543.1 AL451010.1 miRNA +chr1 92837289 92837383 + ENSG00000206680.1 SNORD21 snoRNA +chr1 98045242 98045351 - ENSG00000276280.1 MIR2682 miRNA +chr1 98046070 98046171 - ENSG00000277990.1 MIR137 miRNA +chr1 103569553 103570674 + ENSG00000236085.1 ACTG1P4 processed_pseudogene +chr1 109100193 109100612 + ENSG00000278249.1 SCARNA2 scaRNA +chr1 109596225 109597781 + ENSG00000225113.1 RP5-1160K1.3 sense_overlapping +chr1 109598893 109598967 + ENSG00000207709.2 MIR197 miRNA +chr1 145281116 145281279 + ENSG00000207501.1 RNVU1-14 snRNA +chr1 151765709 151766389 + ENSG00000232937.1 RP11-98D18.2 sense_intronic +chr1 155925958 155926085 - ENSG00000280466.1 SCARNA15 scaRNA +chr1 158578919 158579899 - ENSG00000186400.3 OR10X1 protein_coding +chr1 158578919 158579899 - ENSG00000279111.1 OR10X1 polymorphic_pseudogene +chr1 160205377 160205464 + ENSG00000265381.1 AL121987.1 miRNA +chr1 165662585 165662687 - ENSG00000206990.1 Y_RNA misc_RNA +chr1 170370213 170370295 - ENSG00000263384.1 AL354732.1 miRNA +chr1 172138798 172138907 - ENSG00000207949.1 MIR214 miRNA +chr1 182944402 182944454 - ENSG00000281615.1 AL450304.2 
miRNA +chr1 182959485 182959575 - ENSG00000264768.1 AL450304.1 miRNA +chr1 186311825 186311928 - ENSG00000202025.1 RNU6-1240P snRNA +chr1 201464383 201465146 - ENSG00000224818.1 RP11-134G8.10 3prime_overlapping_ncrna +chr1 201520056 201520549 + ENSG00000242150.2 RP11-134G8.6 transcribed_processed_pseudogene +chr1 202873294 202874326 + ENSG00000243113.1 RP11-480I12.9 transcribed_processed_pseudogene +chr1 207801845 207801956 - ENSG00000275668.1 AL035209.1 miRNA +chr1 207802440 207802523 - ENSG00000276752.1 MIR29B2 miRNA +chr1 218442626 218443996 + ENSG00000281453.1 TGFB2-OT1 3prime_overlapping_ncrna +chr1 220571834 220571899 - ENSG00000281158.1 AC096640.1 miRNA +chr1 220825776 220825893 + ENSG00000276642.1 U6atac snRNA +chr1 225922080 225922142 - ENSG00000274674.1 MIR6741 miRNA +chr1 227430526 227430976 + ENSG00000242757.1 CTD-2090I13.3 processed_pseudogene +chr1 228097263 228097341 + ENSG00000264944.1 MIR3620 miRNA +chr1 228397048 228397109 - ENSG00000278067.1 MIR6742 miRNA +chr1 228628176 228628234 - ENSG00000281588.1 AL713899.1 miRNA +chr1 229304857 229305504 + ENSG00000213029.3 SPHAR protein_coding +chr1 229440284 229441020 - ENSG00000226920.1 RP5-1068B5.5 3prime_overlapping_ncrna +chr1 231019828 231019924 - ENSG00000221290.1 MIR1182 miRNA +chr1 232223059 232223141 + ENSG00000281475.1 BX323014.1 miRNA +chr1 236483165 236484468 + ENSG00000244457.2 ENO1P1 transcribed_processed_pseudogene +chr1 237926831 237927605 + ENSG00000243781.1 RP11-193H5.2 transcribed_processed_pseudogene +chr1 245749563 246355081 - ENSG00000280657.1 SMYD3 protein_coding +chr1 247201967 247202060 - ENSG00000263568.1 MIR3916 miRNA +chr1 247948858 247949796 + ENSG00000196936.3 OR2L8 protein_coding +chr1 247948858 247949796 + ENSG00000279263.1 OR2L8 polymorphic_pseudogene +chr1 248626178 248627128 - ENSG00000183130.3 OR2T11 protein_coding +chr1 248721993 248722201 - ENSG00000242529.1 AHCYP8 processed_pseudogene +chr2 10691538 10691831 + ENSG00000277446.1 Metazoa_SRP misc_RNA +chr2 20401650 
20401706 - ENSG00000281337.1 AC007041.1 miRNA +chr2 31823018 31823106 - ENSG00000265267.1 AL121652.3 miRNA +chr2 60925909 60931610 + ENSG00000267520.2 RP11-373L24.1 3prime_overlapping_ncrna +chr2 61177554 61177636 + ENSG00000280672.1 AC016747.1 miRNA +chr2 62146413 62147153 + ENSG00000242735.1 AC018462.3 processed_pseudogene +chr2 64817586 64817668 + ENSG00000280945.1 AC007880.2 miRNA +chr2 65667256 65667346 + ENSG00000265899.1 AC007389.4 miRNA +chr2 85567662 85567737 - ENSG00000280656.1 AC016753.1 miRNA +chr2 88860886 88860922 - ENSG00000211594.2 IGKJ4 IG_J_gene +chr2 88861221 88861258 - ENSG00000211595.2 IGKJ3 IG_J_gene +chr2 88861525 88861563 - ENSG00000211596.3 IGKJ2 IG_J_gene +chr2 88861886 88861923 - ENSG00000211597.2 IGKJ1 IG_J_gene +chr2 96809473 96809614 + ENSG00000280510.1 AC092636.2 miRNA +chr2 110126732 110126824 - ENSG00000212091.1 AC013268.1 miRNA +chr2 113596000 113596067 + ENSG00000276624.1 AL078621.1 miRNA +chr2 127846736 127847084 - ENSG00000244563.1 AC006011.4 processed_pseudogene +chr2 130929762 130929883 - ENSG00000281159.1 SCARNA15 scaRNA +chr2 131491160 131491236 - ENSG00000265575.1 MIR4784 miRNA +chr2 132152243 132152349 - ENSG00000239108.1 RNU6-1132P snRNA +chr2 144518447 144518574 + ENSG00000275372.1 ZEB2_AS1_1 misc_RNA +chr2 144520456 144520555 + ENSG00000273537.1 ZEB2_AS1_3 misc_RNA +chr2 144521039 144521116 + ENSG00000277444.1 ZEB2_AS1_4 misc_RNA +chr2 159462417 159463256 - ENSG00000225369.1 AC009506.2 processed_pseudogene +chr2 160407810 160407882 - ENSG00000263948.1 MIR4785 miRNA +chr2 175176493 175176586 - ENSG00000200121.2 Y_RNA misc_RNA +chr2 176188843 176188901 + ENSG00000277284.1 MIR7704 miRNA +chr2 177212726 177212799 + ENSG00000263721.1 MIR4444-1 miRNA +chr2 178831371 178831449 - ENSG00000238542.1 RNU7-104P snRNA +chr2 179934402 179934527 - ENSG00000202216.2 SNORA17 snoRNA +chr2 182314454 182314536 + ENSG00000281257.1 AC012500.1 miRNA +chr2 182788098 182788435 + ENSG00000232430.1 RPL31P15 processed_pseudogene +chr2 188974419 
188974555 + ENSG00000281204.1 AC066694.1 miRNA +chr2 189311671 189311761 - ENSG00000266817.1 AC118063.1 miRNA +chr2 190880854 190880942 + ENSG00000280517.1 AC005540.1 miRNA +chr2 192775943 192776073 + ENSG00000278406.1 PCGEM1 misc_RNA +chr2 201166965 201167546 + ENSG00000234431.2 AC007283.5 3prime_overlapping_ncrna +chr2 201621646 201623430 - ENSG00000241790.2 ENO1P4 processed_pseudogene +chr2 202115167 202115260 - ENSG00000274633.1 AC079354.1 miRNA +chr2 202333642 202333724 + ENSG00000281619.1 AC064836.1 miRNA +chr2 202773720 202774360 - ENSG00000240761.1 AC098831.4 processed_pseudogene +chr2 203190780 203191277 + ENSG00000204196.5 AC011737.2 processed_pseudogene +chr2 203764707 203764798 - ENSG00000211573.2 AC125238.1 miRNA +chr2 206116110 206116500 - ENSG00000237580.1 GCSHP3 processed_pseudogene +chr2 206160843 206161024 + ENSG00000277502.1 uc_338 misc_RNA +chr2 206162228 206162359 + ENSG00000207406.1 SNORA41 snoRNA +chr2 207754807 207754881 + ENSG00000264900.1 MIR4775 miRNA +chr2 218280125 218280188 - ENSG00000274203.1 MIR6513 miRNA +chr2 230176665 230221721 - ENSG00000280755.1 SP110 protein_coding +chr2 231456444 231456523 - ENSG00000207280.1 SNORD20 snoRNA +chr2 231711525 231711647 + ENSG00000277986.1 U4 snRNA +chr2 232550474 232550573 - ENSG00000266620.1 MIR5001 miRNA +chr2 233865496 233867359 - ENSG00000279809.1 AC005538.3 TEC +chr2 239091759 239091836 - ENSG00000281838.1 AC017028.12 miRNA +chr3 8931506 8931631 + ENSG00000199815.2 SNORA17 snoRNA +chr3 12071038 12071121 + ENSG00000265870.1 AC026166.1 miRNA +chr3 12840294 12840396 - ENSG00000281117.1 AC034198.1 miRNA +chr3 12840312 12840450 - ENSG00000207496.1 SNORA7A snoRNA +chr3 12850659 12850860 + ENSG00000250939.2 AC034198.7 transcribed_unprocessed_pseudogene +chr3 15738657 15738718 + ENSG00000281814.1 AC090950.1 miRNA +chr3 16933196 16933260 + ENSG00000264818.1 MIR3714 miRNA +chr3 24096512 24097360 - ENSG00000242109.1 NPM1P23 transcribed_processed_pseudogene +chr3 30304321 30304536 - ENSG00000281710.1 U3 
snoRNA +chr3 32259728 32259819 + ENSG00000207857.2 AC097639.1 miRNA +chr3 38125292 38125394 - ENSG00000201965.1 Y_RNA misc_RNA +chr3 39408389 39408539 + ENSG00000206760.1 SNORA6 snoRNA +chr3 39411054 39411206 + ENSG00000202363.1 SNORA62 snoRNA +chr3 44629284 44629372 + ENSG00000280942.1 AC099669.1 miRNA +chr3 44861888 44861981 + ENSG00000207783.1 MIR564 miRNA +chr3 48094801 48094876 - ENSG00000281068.1 AC124916.1 miRNA +chr3 48094888 48094957 - ENSG00000281238.1 AC124916.2 miRNA +chr3 48465811 48467645 + ENSG00000213689.8 TREX1 protein_coding +chr3 48633636 48633698 - ENSG00000274831.1 MIR6824 miRNA +chr3 49099854 49099914 - ENSG00000274888.1 MIR6890 miRNA +chr3 49806137 49806245 - ENSG00000263506.1 MIR5193 miRNA +chr3 49863381 49863464 - ENSG00000281278.1 AC139451.2 miRNA +chr3 50273236 50273297 + ENSG00000274596.1 MIR6872 miRNA +chr3 52690744 52690827 + ENSG00000238862.1 SNORD19B snoRNA +chr3 64099273 64101122 + ENSG00000241572.1 PRICKLE2-AS1 antisense +chr3 98264285 98265262 + ENSG00000230301.4 OR5H6 protein_coding +chr3 98264285 98265262 + ENSG00000279922.1 OR5H6 polymorphic_pseudogene +chr3 116716460 116716624 + ENSG00000278072.1 AC108713.1 misc_RNA +chr3 123161794 123161879 + ENSG00000275891.1 MIR7110 miRNA +chr3 124723788 124726325 + ENSG00000260391.2 RP11-71H17.7 sense_overlapping +chr3 124792319 124792562 - ENSG00000276626.1 7SK misc_RNA +chr3 126571789 126572636 - ENSG00000206483.5 TXNRD3NB protein_coding +chr3 128673691 128673771 - ENSG00000280957.1 AC079945.1 miRNA +chr3 134437827 134437906 + ENSG00000263554.1 MIR4788 miRNA +chr3 139233414 139233520 + ENSG00000276304.1 PISRT1 misc_RNA +chr3 139494618 139494701 + ENSG00000263538.1 AC097103.1 miRNA +chr3 149779009 149779108 - ENSG00000251854.1 RNU6-507P snRNA +chr3 151079506 151079584 + ENSG00000276055.1 CLRN1-AS1 misc_RNA +chr3 169764610 169765047 - ENSG00000277925.1 Telomerase-vert misc_RNA +chr3 169945987 169946754 - ENSG00000244193.1 RP11-379K17.5 transcribed_processed_pseudogene +chr3 181610498 
181610729 + ENSG00000276074.1 SOX2OT_exon1 misc_RNA +chr3 181699608 181699883 + ENSG00000276690.1 SOX2OT_exon3 misc_RNA +chr3 181699705 181699783 + ENSG00000281596.1 AC117415.1 miRNA +chr3 183453814 183453944 + ENSG00000199363.1 SNORA63 snoRNA +chr3 186784796 186784864 + ENSG00000238942.1 SNORD2 snoRNA +chr3 186786323 186786445 + ENSG00000200418.1 SNORA63 snoRNA +chr3 186786672 186786777 + ENSG00000281017.1 MIR1248 miRNA +chr3 186786675 186786852 + ENSG00000221420.2 SNORA81 snoRNA +chr3 186787300 186787431 + ENSG00000200320.1 SNORA63 snoRNA +chr3 186787612 186787749 + ENSG00000263776.1 SNORA4 snoRNA +chr3 190659216 190659750 + ENSG00000273370.1 RP11-268E23.2 lincRNA +chr3 195658096 195685904 + ENSG00000215837.7 SDHAP2 transcribed_unprocessed_pseudogene +chr3 195688008 195688120 + ENSG00000276635.1 AC233280.3 miRNA +chr3 197674496 197674576 - ENSG00000216042.1 MIR922 miRNA +chr4 1986384 1986477 - ENSG00000216105.1 MIR943 miRNA +chr4 2250077 2250156 - ENSG00000265080.1 MIR4800 miRNA +chr4 56097422 56097481 + ENSG00000280464.1 AC092627.1 miRNA +chr4 56794354 56794438 - ENSG00000281241.1 AC022483.1 miRNA +chr4 88275205 88275308 - ENSG00000200469.1 RNU6-112P snRNA +chr4 88521573 88521789 - ENSG00000255072.1 PIGY protein_coding +chr4 88710147 88710265 + ENSG00000278151.1 FAM13A-AS1_1 misc_RNA +chr4 98929914 98929993 - ENSG00000238449.2 AC019131.1 miRNA +chr4 108789200 108789262 - ENSG00000265522.1 AC097473.1 miRNA +chr4 118279190 118279320 + ENSG00000275994.1 SNORA24 snoRNA +chr4 122827014 122827090 + ENSG00000253069.1 AC021205.1 miRNA +chr4 135371590 135371681 - ENSG00000207849.2 AC108867.1 miRNA +chr4 146639261 146639343 + ENSG00000264323.1 AC093887.1 miRNA +chr4 151103827 151103891 + ENSG00000208797.1 SNORD73A snoRNA +chr4 152536428 152536516 + ENSG00000277685.1 MIR4453 miRNA +chr4 190065233 190065914 + ENSG00000277162.1 DBET processed_pseudogene +chr4 190175141 190175224 + ENSG00000274222.1 AC215524.1 miRNA +chr5 8460925 8460999 + ENSG00000273868.1 MIR4458 miRNA 
+chr5 10195187 10197622 - ENSG00000271998.1 CTD-2199O4.7 lincRNA +chr5 18958153 18958237 - ENSG00000281138.1 AC114981.1 miRNA +chr5 31249879 31250200 + ENSG00000250482.2 RP11-152K4.1 processed_pseudogene +chr5 32379407 32379467 - ENSG00000222961.1 AC008949.1 miRNA +chr5 52903908 52904478 + ENSG00000241809.1 CTD-2207L17.1 processed_pseudogene +chr5 57481820 57481903 - ENSG00000264748.1 AC025470.1 miRNA +chr5 60487713 60487929 + ENSG00000273701.1 PART1_1 misc_RNA +chr5 60488078 60488327 + ENSG00000275634.1 PART1_2 misc_RNA +chr5 60546219 60546349 + ENSG00000276233.1 PART1_3 misc_RNA +chr5 69160808 69160939 + ENSG00000280894.1 SNORA76 snoRNA +chr5 74779309 74779413 + ENSG00000199645.1 RNU6-1330P snRNA +chr5 88666853 88666939 - ENSG00000273878.1 MIR9-2 miRNA +chr5 91313057 91314402 - ENSG00000271762.1 RP11-213H15.4 lincRNA +chr5 91844022 91844116 - ENSG00000276426.1 uc_338 misc_RNA +chr5 93620696 93620788 - ENSG00000251725.1 MIR2277 miRNA +chr5 131427266 131635030 - ENSG00000281164.1 RAPGEF6 protein_coding +chr5 136080497 136080597 - ENSG00000278815.1 VTRNA2-1 misc_RNA +chr5 136133696 136133826 + ENSG00000275646.1 SMAD5-AS1_2 misc_RNA +chr5 140563671 140563751 - ENSG00000274910.1 MIR6831 miRNA +chr5 140861224 140863521 + ENSG00000249504.3 PCDHA14 transcribed_unprocessed_pseudogene +chr5 141479535 141479617 + ENSG00000281646.1 AC008781.1 miRNA +chr5 149430646 149430733 + ENSG00000276365.1 MIR145 miRNA +chr5 155845469 155845527 + ENSG00000280546.1 AC140677.1 miRNA +chr5 160485352 160485450 + ENSG00000277727.1 MIR146A miRNA +chr5 168552277 168553727 - ENSG00000253861.1 SLC2A3P1 processed_pseudogene +chr5 178884715 178884879 + ENSG00000206624.1 RNU1-39P snRNA +chr5 181241814 181241892 - ENSG00000272296.1 SNORD96A snoRNA +chr5 181243312 181243379 - ENSG00000264549.1 SNORD95 snoRNA +chr6 6347081 6347381 - ENSG00000241216.1 SNAPC5P1 transcribed_processed_pseudogene +chr6 8653558 8653797 + ENSG00000276019.1 HULC misc_RNA +chr6 16301693 16301824 - ENSG00000281447.1 AL009031.1 
miRNA +chr6 20421686 20421749 - ENSG00000281428.1 AL136303.2 miRNA +chr6 24839967 24840065 - ENSG00000263391.1 AL512428.1 miRNA +chr6 27837957 27838094 + ENSG00000281435.1 Z98744.1 miRNA +chr6 28977613 28977709 + ENSG00000280628.1 AL662791.1 miRNA +chr6 29396700 29397623 + ENSG00000168787.6 OR12D2 protein_coding +chr6 29440016 29440954 + ENSG00000279941.1 OR10C1 protein_coding +chr6 29726669 29727139 - ENSG00000239257.1 RPL23AP1 transcribed_processed_pseudogene +chr6 30061080 30061183 + ENSG00000278773.1 ZNRD1-AS1_3 misc_RNA +chr6 30890883 30890972 + ENSG00000264594.1 MIR4640 miRNA +chr6 31532757 31532869 - ENSG00000276877.1 AL662801.1 miRNA +chr6 31541101 31541178 - ENSG00000265236.1 SNORD84 snoRNA +chr6 31837076 31837142 + ENSG00000201754.1 SNORD52 snoRNA +chr6 31956839 31956940 - ENSG00000221267.1 MIR1236 miRNA +chr6 32936916 32937010 - ENSG00000212066.1 AL645941.1 miRNA +chr6 33290245 33290325 + ENSG00000275010.1 MIR6834 miRNA +chr6 34873831 34873927 + ENSG00000252106.2 RNY3P15 misc_RNA +chr6 41787662 41789898 + ENSG00000214736.6 TOMM6 protein_coding +chr6 42155426 42163439 + ENSG00000214732.2 RP1-139D8.6 protein_coding +chr6 44254206 44254285 - ENSG00000265700.1 MIR4647 miRNA +chr6 53090961 53091257 + ENSG00000242865.3 RN7SL244P misc_RNA +chr6 56432379 56432442 - ENSG00000266793.1 AL137008.1 miRNA +chr6 71585089 71585201 + ENSG00000211530.1 AL354933.1 miRNA +chr6 85677294 85677368 - ENSG00000281147.1 SNORD50A snoRNA +chr6 85677589 85677658 - ENSG00000275072.1 SNORD50B snoRNA +chr6 88276125 88276208 - ENSG00000281199.1 AL139042.1 miRNA +chr6 110440082 110440165 - ENSG00000281088.1 AC002464.1 miRNA +chr6 112361848 112361939 - ENSG00000266485.1 AL365214.1 miRNA +chr6 116457732 116457822 - ENSG00000265516.1 Z84488.1 miRNA +chr6 136034553 136034886 - ENSG00000213111.5 COX5BP2 transcribed_processed_pseudogene +chr6 136343193 136343249 - ENSG00000276943.1 AL023284.1 miRNA +chr6 159785594 159785733 - ENSG00000206910.1 SNORA29 snoRNA +chr6 166099853 166099924 + 
ENSG00000276643.1 SNORD45 snoRNA +chr7 2257515 2257577 - ENSG00000277102.1 MIR6836 miRNA +chr7 5528103 5528186 - ENSG00000263900.1 AC006483.1 miRNA +chr7 7742091 7742535 - ENSG00000269721.1 RPL23AP51 transcribed_processed_pseudogene +chr7 12654179 12654985 + ENSG00000229233.1 CTD-2320J21.2 sense_overlapping +chr7 27096124 27096248 + ENSG00000276528.1 HOTAIRM1_1 misc_RNA +chr7 27098900 27099114 + ENSG00000276771.1 HOTAIRM1_2 misc_RNA +chr7 27099778 27099836 + ENSG00000274864.1 HOTAIRM1_3 misc_RNA +chr7 27099856 27099957 + ENSG00000274396.1 HOTAIRM1_4 misc_RNA +chr7 27099967 27100111 + ENSG00000277694.1 HOTAIRM1_5 misc_RNA +chr7 27169480 27169564 - ENSG00000207584.1 MIR196B miRNA +chr7 27185433 27185530 + ENSG00000273961.1 HOXA11-AS1_1 misc_RNA +chr7 27185832 27186018 + ENSG00000278334.1 HOXA11-AS1_2 misc_RNA +chr7 27186166 27186263 + ENSG00000276496.1 HOXA11-AS1_3 misc_RNA +chr7 27188816 27188994 + ENSG00000278020.1 HOXA11-AS1_6 misc_RNA +chr7 27200465 27200521 + ENSG00000276609.1 HOTTIP_1 misc_RNA +chr7 27201844 27202219 + ENSG00000278708.1 HOTTIP_2 misc_RNA +chr7 27202302 27202638 + ENSG00000277469.1 HOTTIP_3 misc_RNA +chr7 27206139 27206303 + ENSG00000277553.1 HOTTIP_4 misc_RNA +chr7 30157531 30159534 + ENSG00000251660.1 AC007036.5 sense_overlapping +chr7 39609717 39610280 + ENSG00000106540.4 AC004837.3 processed_pseudogene +chr7 40128121 40128232 - ENSG00000199273.1 Y_RNA misc_RNA +chr7 44051766 44051829 + ENSG00000274083.1 MIR6837 miRNA +chr7 44064908 44066079 + ENSG00000239775.1 AC017116.11 sense_overlapping +chr7 44110849 44110912 + ENSG00000264652.1 MIR4649 miRNA +chr7 44881748 44881800 - ENSG00000264326.1 MIR4657 miRNA +chr7 44985378 44985510 - ENSG00000277184.1 SNORA9 snoRNA +chr7 45105968 45106099 - ENSG00000200656.1 SNORA5B snoRNA +chr7 65038372 65038565 + ENSG00000239985.2 RP11-460N20.3 unprocessed_pseudogene +chr7 66980335 66980409 + ENSG00000280772.1 AC079920.1 miRNA +chr7 75237293 75237405 - ENSG00000275121.1 CH17-232I21.1 processed_pseudogene +chr7 
75474707 75486108 - ENSG00000242073.2 AC006014.7 transcribed_unprocessed_pseudogene +chr7 75915197 75915269 + ENSG00000265020.1 MIR4651 miRNA +chr7 84939349 84940245 - ENSG00000232019.1 AC074183.4 lincRNA +chr7 90403434 90513391 + ENSG00000273299.1 CTB-13L3.1 processed_transcript +chr7 95156606 95156671 - ENSG00000277296.1 AC002429.1 miRNA +chr7 100352360 100353692 + ENSG00000235333.3 PVRIG2P transcribed_unprocessed_pseudogene +chr7 100356651 100356721 + ENSG00000278005.1 MIR6840 miRNA +chr7 100868036 100868107 + ENSG00000273985.1 MIR6875 miRNA +chr7 101058299 101058567 + ENSG00000222636.1 RN7SKP54 misc_RNA +chr7 102465742 102465826 + ENSG00000266715.1 MIR5090 miRNA +chr7 102471469 102471531 + ENSG00000264471.1 MIR4467 miRNA +chr7 105014152 105014199 - ENSG00000280574.1 AC007384.1 miRNA +chr7 112288623 112288952 + ENSG00000202406.1 RN7SKP187 misc_RNA +chr7 114629855 114629956 + ENSG00000266229.1 AC020606.1 miRNA +chr7 116954480 116954679 + ENSG00000273596.1 ST7-OT4_1 misc_RNA +chr7 116956431 116956723 + ENSG00000276100.1 ST7-OT4_3 misc_RNA +chr7 116959593 116959805 + ENSG00000274606.1 ST7-OT4_4 misc_RNA +chr7 117020211 117020319 + ENSG00000275870.1 MIR6132 miRNA +chr7 117184126 117184199 + ENSG00000275054.1 ST7-OT3_1 misc_RNA +chr7 117191647 117191761 + ENSG00000278482.1 ST7-OT3_3 misc_RNA +chr7 121050927 121051203 + ENSG00000234927.1 HMGN1P18 processed_pseudogene +chr7 128466563 128469171 + ENSG00000272601.1 RP11-155G14.5 transcribed_unprocessed_pseudogene +chr7 129756266 129756377 + ENSG00000212238.1 RNA5SP244 rRNA +chr7 130877459 130877539 - ENSG00000274250.1 MIR29B1 miRNA +chr7 135927274 135927450 - ENSG00000267697.1 LUZP6 protein_coding +chr7 138123758 138123821 + ENSG00000266193.1 MIR4468 miRNA +chr7 140645966 140646034 + ENSG00000280499.1 AC006452.2 miRNA +chr7 142797119 142797166 + ENSG00000211769.1 TRBJ2-5 TR_J_gene +chr7 142797239 142797291 + ENSG00000211770.1 TRBJ2-6 TR_J_gene +chr7 143959971 143960924 + ENSG00000279723.1 OR2F1 protein_coding +chr7 
151238421 151238538 + ENSG00000211517.1 MIR671 miRNA +chr8 8582352 8582427 - ENSG00000281480.1 AC114550.1 miRNA +chr8 9903388 9903472 - ENSG00000275677.1 MIR124-1 miRNA +chr8 20290174 20290289 + ENSG00000274467.1 5S_rRNA rRNA +chr8 27610601 27610751 - ENSG00000273705.1 MIR6843 miRNA +chr8 56073835 56073901 - ENSG00000238650.1 SNORD54 snoRNA +chr8 66922467 66922555 - ENSG00000254341.2 SNORD87 snoRNA +chr8 70480192 70480487 - ENSG00000275128.1 Metazoa_SRP misc_RNA +chr8 73982125 73982467 + ENSG00000244295.2 RPS20P21 processed_pseudogene +chr8 80484589 80484683 - ENSG00000277604.1 Y_RNA misc_RNA +chr8 86657870 86657952 - ENSG00000221137.1 AC013751.1 miRNA +chr8 93916022 93916119 - ENSG00000276513.1 MIR378D2 miRNA +chr8 98192205 98192313 - ENSG00000252558.1 RNU6-914P snRNA +chr8 100702968 100703024 - ENSG00000277719.1 MIR7705 miRNA +chr8 116874728 116874800 - ENSG00000264875.1 MIR3610 miRNA +chr8 117164179 117164255 + ENSG00000281002.1 AC084114.1 miRNA +chr8 127794541 127794734 + ENSG00000276443.1 PVT1_1 misc_RNA +chr8 127795962 127796028 + ENSG00000275264.1 MIR1204 miRNA +chr8 127890626 127890720 + ENSG00000278324.1 PVT1_3 misc_RNA +chr8 133229056 133229697 + ENSG00000270132.1 WISP1-OT1 sense_intronic +chr8 134598071 134598149 + ENSG00000276140.1 ZFAT-AS1_1 misc_RNA +chr8 134598318 134598518 + ENSG00000277732.1 ZFAT-AS1_2 misc_RNA +chr8 134600257 134600336 + ENSG00000278454.1 ZFAT-AS1_3 misc_RNA +chr8 143542110 143542398 + ENSG00000275558.1 7SK misc_RNA +chr8 143579636 143580670 + ENSG00000254741.1 RP11-661A12.7 antisense +chr8 143837756 143837816 - ENSG00000274094.1 MIR6845 miRNA +chr8 144079874 144079942 + ENSG00000276472.1 MIR6847 miRNA +chr8 144262673 144262737 - ENSG00000277158.1 MIR7112 miRNA +chr8 144314590 144315138 - ENSG00000254690.1 GS1-393G12.12 antisense +chr8 144394149 144394230 - ENSG00000216133.1 MIR939 miRNA +chr8 144400086 144400165 - ENSG00000266624.1 MIR1234 miRNA +chr8 144400277 144400345 - ENSG00000274683.1 MIR6849 miRNA +chr9 30144 30281 + 
ENSG00000278579.1 MIR1302-2 miRNA +chr9 4834156 4860275 + ENSG00000281007.1 AL158147.2 protein_coding +chr9 30774228 30774319 + ENSG00000211510.2 AL590726.1 miRNA +chr9 33042109 33042216 - ENSG00000222169.1 AL162590.1 miRNA +chr9 35449748 35450352 + ENSG00000244213.1 ZFAND6P1 processed_pseudogene +chr9 35657754 35658017 - ENSG00000277027.1 RNase_MRP ribozyme +chr9 35811476 35811550 - ENSG00000263448.1 AL133410.1 miRNA +chr9 35957139 35958098 - ENSG00000122718.5 OR2S2 protein_coding +chr9 37434177 37434586 + ENSG00000236156.2 CHCHD4P3 processed_pseudogene +chr9 38542314 38542569 - ENSG00000259898.1 CYP4F33P processed_pseudogene +chr9 38542389 38543215 - ENSG00000272934.1 RP11-392E22.10 processed_transcript +chr9 38543104 38543446 - ENSG00000250989.1 RP11-392E22.5 processed_pseudogene +chr9 39173794 39174513 - ENSG00000243695.1 RP11-290L7.3 processed_pseudogene +chr9 41280460 41280787 + ENSG00000278647.1 RP11-4L24.2 transcribed_processed_pseudogene +chr9 69428248 69428721 - ENSG00000243888.1 RP11-548B3.3 3prime_overlapping_ncrna +chr9 76785782 76786190 + ENSG00000277320.1 PCA3_2 misc_RNA +chr9 79726069 79726321 + ENSG00000240979.1 RP11-79D8.2 processed_pseudogene +chr9 83969748 83969857 - ENSG00000207603.1 MIR7-1 miRNA +chr9 97700234 97700325 + ENSG00000266608.1 AL445531.1 miRNA +chr9 100732033 100732116 + ENSG00000281312.1 AL390876.1 miRNA +chr9 122628579 122629535 - ENSG00000171484.4 OR1B1 protein_coding +chr9 123111546 123111643 - ENSG00000274325.1 MIR600 miRNA +chr9 127690687 127690795 - ENSG00000264329.1 MIR3911 miRNA +chr9 127785833 127785923 + ENSG00000266070.1 MIR3960 miRNA +chr9 127785836 127785905 + ENSG00000281546.1 AL162586.1 miRNA +chr9 128260749 128260817 - ENSG00000274982.1 AL590708.3 miRNA +chr9 128391461 128392016 + ENSG00000272593.1 RP11-339B21.11 lincRNA +chr9 128392618 128392714 - ENSG00000273685.1 MIR219B miRNA +chr9 129040566 129040674 + ENSG00000280692.1 AL592211.1 miRNA +chr9 133349396 133349470 + ENSG00000206611.1 SNORD24 snoRNA +chr9 
133350095 133350168 + ENSG00000200831.1 SNORD36B snoRNA +chr9 133350456 133350528 + ENSG00000199744.1 SNORD36A snoRNA +chr9 136726104 136726239 - ENSG00000280496.1 SNORA43 snoRNA +chr9 136726105 136726234 - ENSG00000276161.1 SNORA17 snoRNA +chr9 136726747 136726879 - ENSG00000274998.1 SNORA17 snoRNA +chr9 136726748 136726879 - ENSG00000281808.1 SNORA17 snoRNA +chr9 137450026 137450086 - ENSG00000276682.1 MIR7114 miRNA +chr10 5861805 5862594 - ENSG00000240180.1 RP11-318E3.4 processed_pseudogene +chr10 32346864 32346946 - ENSG00000222309.1 AL391839.1 miRNA +chr10 35641172 35641252 - ENSG00000264780.1 MIR4683 miRNA +chr10 42470082 42470268 - ENSG00000279239.1 RP11-178A10.3 transcribed_unprocessed_pseudogene +chr10 77912280 77912362 - ENSG00000281632.1 AL391421.1 miRNA +chr10 86751611 86751772 - ENSG00000281735.1 Clostridiales-1 sRNA +chr10 86970237 86970826 - ENSG00000273413.1 RP11-96C23.15 antisense +chr10 87001636 87009905 + ENSG00000261011.1 RP11-96C23.11 transcribed_unprocessed_pseudogene +chr10 89644603 89645223 - ENSG00000249962.1 RP11-80H5.5 processed_pseudogene +chr10 102409042 102409102 - ENSG00000275957.1 AL121928.1 miRNA +chr10 110898090 110898155 + ENSG00000265827.1 MIR4680 miRNA +chr10 114174105 114174179 - ENSG00000238742.1 MIR2110 miRNA +chr10 127013501 127026475 - ENSG00000232935.2 RP11-223P11.2 antisense +chr11 209336 209406 + ENSG00000274298.1 MIR6743 miRNA +chr11 811681 811814 + ENSG00000199785.1 SNORA52 snoRNA +chr11 1996730 1996842 - ENSG00000278648.1 AC051649.1 miRNA +chr11 2129121 2129964 - ENSG00000240801.1 AC132217.4 3prime_overlapping_ncrna +chr11 3856062 3856141 + ENSG00000263421.1 MIR4687 miRNA +chr11 4367351 4368295 - ENSG00000280253.1 OR52B4 protein_coding +chr11 4768979 4769917 - ENSG00000188069.4 OR51F1 protein_coding +chr11 4803433 4804380 - ENSG00000176937.9 OR52R1 protein_coding +chr11 4803433 4804380 - ENSG00000279270.1 OR52R1 polymorphic_pseudogene +chr11 4923374 4924339 - ENSG00000176879.3 OR51G1 protein_coding +chr11 5069572 
5070461 + ENSG00000236621.3 OR52E1 pseudogene +chr11 5323359 5324297 - ENSG00000184881.3 OR51B2 protein_coding +chr11 6608667 6610135 - ENSG00000254641.1 RP11-732A19.2 sense_overlapping +chr11 6785038 6785988 + ENSG00000170803.5 OR2AG1 protein_coding +chr11 7927718 7928662 - ENSG00000175393.3 OR10A6 protein_coding +chr11 8684204 8684314 + ENSG00000280884.1 AC091053.1 miRNA +chr11 8684227 8684356 + ENSG00000200983.1 SNORA45A snoRNA +chr11 9578028 9578131 + ENSG00000238387.1 snoU13 snoRNA +chr11 9958744 9959790 - ENSG00000254765.1 RP11-1H15.1 processed_pseudogene +chr11 15969533 15969621 - ENSG00000274140.1 MIR6073 miRNA +chr11 17075779 17075868 - ENSG00000201403.1 SNORD14B snoRNA +chr11 32435738 32435846 + ENSG00000278822.1 WT1-AS_1 misc_RNA +chr11 32436797 32437030 + ENSG00000278045.1 WT1-AS_3 misc_RNA +chr11 32439086 32439206 + ENSG00000276530.1 WT1-AS_6 misc_RNA +chr11 32439716 32440009 + ENSG00000273908.1 WT1-AS_7 misc_RNA +chr11 32440109 32440383 + ENSG00000277119.1 WT1-AS_8 misc_RNA +chr11 33354465 33354545 + ENSG00000223134.1 AL122015.1 miRNA +chr11 35860654 35860744 - ENSG00000266590.1 AC090692.1 miRNA +chr11 48245104 48246015 + ENSG00000279556.1 OR4X2 protein_coding +chr11 48263861 48264778 + ENSG00000176567.1 OR4X1 protein_coding +chr11 48263861 48264778 + ENSG00000279260.1 OR4X1 polymorphic_pseudogene +chr11 55572128 55573060 + ENSG00000181935.3 OR4C16 protein_coding +chr11 55572128 55573060 + ENSG00000279514.1 OR4C16 polymorphic_pseudogene +chr11 55773438 55774382 + ENSG00000198877.1 OR5D13 protein_coding +chr11 55773438 55774382 + ENSG00000279761.1 OR5D13 polymorphic_pseudogene +chr11 55811467 55812402 + ENSG00000186117.3 OR5L1 protein_coding +chr11 56318307 56319245 + ENSG00000181689.1 OR8K3 protein_coding +chr11 56318307 56319245 + ENSG00000280314.1 OR8K3 polymorphic_pseudogene +chr11 56412744 56413668 + ENSG00000181395.6 OR5AL1 pseudogene +chr11 56417258 56418232 - ENSG00000174942.1 OR5R1 protein_coding +chr11 56417258 56418232 - ENSG00000279961.1 
OR5R1 polymorphic_pseudogene +chr11 56663686 56664618 + ENSG00000279911.1 OR5AR1 protein_coding +chr11 58030953 58031906 + ENSG00000172381.4 OR6Q1 protein_coding +chr11 58214745 58215722 + ENSG00000172774.7 OR1S1 protein_coding +chr11 61792495 61792561 - ENSG00000207601.1 MIR611 miRNA +chr11 61815161 61815240 - ENSG00000222326.1 MIR1908 miRNA +chr11 62372656 62372719 + ENSG00000265696.1 AP003064.2 miRNA +chr11 62665422 62665570 + ENSG00000206597.1 SNORA57 snoRNA +chr11 62789815 62789885 + ENSG00000274856.1 MIR6748 miRNA +chr11 62841619 62841809 - ENSG00000222328.1 RNU2-2P snRNA +chr11 62852910 62853035 - ENSG00000277194.1 SNORD22 snoRNA +chr11 62854161 62854285 - ENSG00000278527.1 SNORD22 snoRNA +chr11 62854621 62854695 - ENSG00000274544.1 SNORD28 snoRNA +chr11 62855012 62855083 - ENSG00000275996.1 SNORD27 snoRNA +chr11 62855292 62855366 - ENSG00000276788.1 SNORD26 snoRNA +chr11 62855564 62855632 - ENSG00000275043.1 SNORD25 snoRNA +chr11 64313860 64313950 + ENSG00000278148.1 AP001453.1 miRNA +chr11 64891132 64891243 - ENSG00000207648.2 MIR192 miRNA +chr11 64891355 64891439 - ENSG00000277225.1 MIR194-2 miRNA +chr11 65129916 65129978 - ENSG00000277547.1 MIR6751 miRNA +chr11 65175035 65175502 - ENSG00000249251.1 PGAM1P8 processed_pseudogene +chr11 65423273 65423392 + ENSG00000278144.1 NEAT1_1 misc_RNA +chr11 65423638 65423742 + ENSG00000278050.1 NEAT1_2 misc_RNA +chr11 65423960 65424118 + ENSG00000277599.1 NEAT1_3 misc_RNA +chr11 65444458 65444557 + ENSG00000273834.1 MIR612 miRNA +chr11 65502914 65503008 + ENSG00000278217.1 MALAT1 misc_RNA +chr11 65506117 65506173 + ENSG00000274072.1 mascRNA-menRNA sRNA +chr11 67435510 67438067 - ENSG00000213402.2 PTPRCAP protein_coding +chr11 68032864 68032922 + ENSG00000277703.1 MIR7113 miRNA +chr11 68033897 68033981 + ENSG00000266737.1 MIR4691 miRNA +chr11 75742270 75742331 + ENSG00000281528.1 AP001922.1 miRNA +chr11 78016971 78079865 - ENSG00000259112.1 NDUFC2-KCTD14 protein_coding +chr11 82957423 82958216 + ENSG00000242279.1 
RP11-659G9.1 processed_pseudogene +chr11 83213005 83213404 + ENSG00000241020.1 RP11-727A23.1 processed_pseudogene +chr11 86278333 86278398 + ENSG00000273630.1 MIR6755 miRNA +chr11 93721542 93721621 + ENSG00000275146.1 snoU2_19 snoRNA +chr11 93730513 93730646 - ENSG00000207112.1 SNORA25 snoRNA +chr11 93730979 93731099 - ENSG00000206799.1 SNORA32 snoRNA +chr11 93732004 93732133 - ENSG00000206834.1 SNORA1 snoRNA +chr11 93732361 93732499 - ENSG00000207304.1 SNORA8 snoRNA +chr11 93732435 93732501 - ENSG00000281293.1 AP001273.1 miRNA +chr11 93733228 93733300 - ENSG00000239195.1 SNORD5 snoRNA +chr11 93733466 93733597 - ENSG00000207145.1 SNORA18 snoRNA +chr11 93733674 93733764 - ENSG00000221170.1 MIR1304 miRNA +chr11 93735111 93735236 - ENSG00000210825.1 SNORA40 snoRNA +chr11 112218326 112218720 + ENSG00000243930.1 RPS12P21 processed_pseudogene +chr11 113789242 113790027 + ENSG00000255870.1 RP11-667M19.5 processed_pseudogene +chr11 114312063 114312207 + ENSG00000280653.1 AP002518.1 miRNA +chr11 117820163 117876667 - ENSG00000255245.3 FXYD6-FXYD2 protein_coding +chr11 118644000 118644079 + ENSG00000275466.1 MIR6716 miRNA +chr11 119018944 119019012 + ENSG00000266398.1 MIR3656 miRNA +chr11 119312950 119313012 - ENSG00000277325.1 MIR6756 miRNA +chr11 119338942 119340883 - ENSG00000223953.4 C1QTNF5 protein_coding +chr11 119821829 119821927 - ENSG00000276827.1 AP001994.1 miRNA +chr11 120986258 120986353 + ENSG00000281726.1 GRIK4_3p_UTR misc_RNA +chr11 123058909 123058995 - ENSG00000207118.1 SNORD14D snoRNA +chr11 123059335 123059422 - ENSG00000202252.1 SNORD14C snoRNA +chr11 124319262 124320197 - ENSG00000197263.3 OR8D2 protein_coding +chr11 124423942 124424871 - ENSG00000198657.5 OR8B4 protein_coding +chr11 124636491 124636552 + ENSG00000280773.1 AP001524.1 miRNA +chr11 133898504 133898581 - ENSG00000264919.1 MIR4697 miRNA +chr12 3726222 3726327 - ENSG00000222338.1 RNU6-174P snRNA +chr12 4809176 4813412 + ENSG00000151079.7 KCNA6 protein_coding +chr12 6510275 6510522 + 
ENSG00000276232.1 SCARNA10 sense_intronic +chr12 9127871 9127985 + ENSG00000211542.1 AC007436.1 miRNA +chr12 9695384 9696227 + ENSG00000213443.2 RP11-75L1.2 processed_pseudogene +chr12 12915829 12915918 + ENSG00000207817.1 MIR614 miRNA +chr12 22195469 22195629 - ENSG00000212172.1 RNU1-149P snRNA +chr12 39443048 39443116 - ENSG00000252974.1 AC121334.1 miRNA +chr12 48487946 48488408 + ENSG00000240443.1 RPS10P20 transcribed_processed_pseudogene +chr12 52951708 52951795 + ENSG00000265039.1 AC107016.1 miRNA +chr12 53464720 53464901 + ENSG00000273658.1 uc_338 misc_RNA +chr12 53506690 53507638 - ENSG00000139574.8 NPFF protein_coding +chr12 54231397 54231476 - ENSG00000265371.1 MIR3198-2 miRNA +chr12 55870941 55871219 - ENSG00000265119.2 RN7SL676P misc_RNA +chr12 57512688 57512750 + ENSG00000275657.1 MIR6758 miRNA +chr12 57806154 57806260 - ENSG00000206749.1 RNU6-1083P snRNA +chr12 62260359 62260454 + ENSG00000276811.1 MIR6125 miRNA +chr12 62934507 62934607 - ENSG00000275729.1 AC078814.1 miRNA +chr12 66251082 66251157 + ENSG00000275058.1 MIR6502 miRNA +chr12 68841946 68842384 + ENSG00000256664.1 RP11-611O2.3 processed_pseudogene +chr12 91986626 91993403 - ENSG00000279037.1 C12orf79 protein_coding +chr12 91986682 91993403 - ENSG00000280112.1 C12orf79 protein_coding +chr12 93509487 93509768 + ENSG00000243015.2 RN7SL737P misc_RNA +chr12 94571231 94571352 - ENSG00000277302.1 MIR7844 miRNA +chr12 94834398 94834513 + ENSG00000208038.1 MIR492 miRNA +chr12 97492462 97492597 + ENSG00000273942.1 RMST_2 misc_RNA +chr12 97530656 97530858 + ENSG00000275971.1 RMST_6 misc_RNA +chr12 97532691 97532958 + ENSG00000274819.1 RMST_7 misc_RNA +chr12 97564174 97564327 + ENSG00000277081.1 RMST_10 misc_RNA +chr12 100157256 100157338 - ENSG00000221770.1 AC010203.1 miRNA +chr12 100158502 100158765 - ENSG00000266610.2 RN7SL176P misc_RNA +chr12 103930425 103930555 + ENSG00000265072.1 MIR3652 miRNA +chr12 104125268 104125329 + ENSG00000280646.1 AC089983.1 miRNA +chr12 107650789 107650892 + 
ENSG00000222160.1 Y_RNA misc_RNA +chr12 109567975 109568115 - ENSG00000200274.1 RNU4-32P snRNA +chr12 110360456 110360538 - ENSG00000281030.1 AC006088.1 miRNA +chr12 110496352 110496421 - ENSG00000202335.1 SNORD50 snoRNA +chr12 111598951 111599031 - ENSG00000281280.1 AC137055.1 miRNA +chr12 111899263 111901391 + ENSG00000229186.4 ADAM1A unitary_pseudogene +chr12 119713634 119713724 - ENSG00000221323.1 MIR1178 miRNA +chr12 120723193 120723266 + ENSG00000265455.1 MIR4700 miRNA +chr12 123364764 123364843 - ENSG00000277700.1 MIR8072 miRNA +chr12 130977562 130978768 + ENSG00000256204.1 RP11-243M5.1 lincRNA +chr12 132835374 132835500 - ENSG00000252079.1 RNU6-327P snRNA +chr13 25944606 25944673 + ENSG00000281497.1 AL138815.1 miRNA +chr13 27255064 27255135 + ENSG00000207500.1 SNORD102 snoRNA +chr13 27255401 27255526 + ENSG00000207051.1 SNORA27 snoRNA +chr13 30206616 30206699 - ENSG00000266816.1 AL356750.1 miRNA +chr13 45336314 45336447 - ENSG00000253051.1 SNORA31 snoRNA +chr13 45732089 45732172 - ENSG00000280848.1 AL139320.1 miRNA +chr13 46437394 46437988 - ENSG00000241353.3 PPP1R2P4 processed_pseudogene +chr13 50082295 50082484 + ENSG00000276089.1 DLEU1_1 misc_RNA +chr13 50104833 50105174 + ENSG00000273541.1 DLEU1_2 misc_RNA +chr13 90231182 90231277 + ENSG00000207858.1 MIR622 miRNA +chr13 91350605 91350688 + ENSG00000275042.1 MIR17 miRNA +chr13 91350743 91350834 + ENSG00000274160.1 AL162375.1 miRNA +chr13 91350891 91350972 + ENSG00000277328.1 MIR19A miRNA +chr13 91351065 91351135 + ENSG00000275534.1 MIR20A miRNA +chr13 91351192 91351278 + ENSG00000275802.1 MIR19B1 miRNA +chr13 91351314 91351393 + ENSG00000276018.1 MIR92A1 miRNA +chr13 97363007 97363069 - ENSG00000281389.1 AL442067.1 miRNA +chr14 20343075 20343407 - ENSG00000277209.1 RNaseP_nuc ribozyme +chr14 21023314 21023386 - ENSG00000278629.1 MIR6717 miRNA +chr14 22281105 22281748 + ENSG00000211817.2 TRAV38-2DV8 TR_V_gene +chr14 22450089 22450139 + ENSG00000211825.1 TRDJ1 TR_J_gene +chr14 22509341 22509406 + 
ENSG00000211857.1 TRAJ32 TR_J_gene +chr14 22956950 22957029 - ENSG00000265037.1 MIR4707 miRNA +chr14 23033530 23033638 - ENSG00000207765.1 AL132780.1 miRNA +chr14 24143489 24143565 - ENSG00000276511.1 MIR7703 miRNA +chr14 35809212 35809300 - ENSG00000266264.1 AL162311.1 miRNA +chr14 49577392 49577794 - ENSG00000244270.1 RPL32P29 processed_pseudogene +chr14 49586580 49586878 + ENSG00000276168.1 RN7SL1 misc_RNA +chr14 49586722 49586791 + ENSG00000281893.1 AL139099.3 miRNA +chr14 49853703 49853772 - ENSG00000279868.2 AL627171.1 miRNA +chr14 49862638 49862707 - ENSG00000280602.1 AL627171.3 miRNA +chr14 53153049 53153115 - ENSG00000266552.1 AL356020.1 miRNA +chr14 55413556 55414059 - ENSG00000239199.1 RPL21P6 processed_pseudogene +chr14 60972272 60972466 - ENSG00000258656.1 RP11-193F5.4 processed_pseudogene +chr14 68887785 68888084 - ENSG00000258967.1 HMGN1P3 processed_pseudogene +chr14 74480133 74480204 - ENSG00000265649.1 MIR4709 miRNA +chr14 76778952 76782249 + ENSG00000258610.1 RP11-488C13.7 lincRNA +chr14 95185117 95185854 - ENSG00000259143.1 CTD-2240H23.2 sense_overlapping +chr14 95662480 95662598 + ENSG00000275033.1 TCL6_2 misc_RNA +chr14 95670441 95670577 + ENSG00000277468.1 TCL6_3 misc_RNA +chr14 100829350 100829454 + ENSG00000276919.1 MEG3_2 misc_RNA +chr14 100894817 100894913 + ENSG00000276225.1 MEG8_1 misc_RNA +chr14 100898958 100899079 + ENSG00000275134.1 MEG8_2 misc_RNA +chr14 100987046 100987117 + ENSG00000200413.1 SNORD114-26 snoRNA +chr14 101560287 101560422 - ENSG00000277601.1 MIR1247 miRNA +chr14 102501676 102501779 + ENSG00000212330.1 RNU6-244P snRNA +chr14 105863198 105863258 - ENSG00000211900.2 IGHJ6 IG_J_gene +chr14 105863814 105863862 - ENSG00000242472.1 IGHJ5 IG_J_gene +chr14 105864215 105864260 - ENSG00000240041.1 IGHJ4 IG_J_gene +chr14 106636704 106636763 + ENSG00000280494.1 AC245369.3 miRNA +chr15 20759311 20774794 - ENSG00000259383.1 RP11-403B2.6 lincRNA +chr15 24981994 24982068 + ENSG00000276314.1 SNORD107 snoRNA +chr15 24985100 24985166 + 
ENSG00000276610.1 SNORD64 snoRNA +chr15 24986925 24986995 + ENSG00000239014.1 SNORD108 snoRNA +chr15 25041974 25042040 + ENSG00000274640.1 SNORD109B snoRNA +chr15 25087662 25087753 + ENSG00000278715.1 SNORD116-20 snoRNA +chr15 25088804 25088895 + ENSG00000277785.1 SNORD116-21 snoRNA +chr15 25104642 25104732 + ENSG00000278123.1 SNORD116-28 snoRNA +chr15 25182385 25182466 + ENSG00000278089.1 SNORD115-7 snoRNA +chr15 25193321 25193402 + ENSG00000273835.1 SNORD115-13 snoRNA +chr15 25218617 25218698 + ENSG00000275524.1 SNORD115-26 snoRNA +chr15 30380353 30386987 - ENSG00000263070.1 RP11-382B18.3 transcribed_unprocessed_pseudogene +chr15 34373789 34373871 - ENSG00000266205.1 AC025678.1 miRNA +chr15 40331537 40331648 - ENSG00000252714.1 RNA5SP392 rRNA +chr15 45214906 45215033 - ENSG00000261709.3 SNORA11 snoRNA +chr15 45433050 45433129 + ENSG00000211519.1 MIR147B miRNA +chr15 50360329 50360410 + ENSG00000264109.1 MIR4712 miRNA +chr15 62246284 62246349 - ENSG00000277779.1 AC126323.2 miRNA +chr15 66379337 66379915 - ENSG00000240821.1 RPL9P25 processed_pseudogene +chr15 66501250 66501318 - ENSG00000199574.1 SNORD18C snoRNA +chr15 66502019 66502089 - ENSG00000280554.1 snoU18 snoRNA +chr15 66502020 66502091 - ENSG00000202529.1 SNORD18B snoRNA +chr15 66502812 66502910 - ENSG00000199673.1 SNORD16 snoRNA +chr15 66503243 66503314 - ENSG00000200623.1 SNORD18A snoRNA +chr15 69160651 69160724 + ENSG00000266374.1 AC026512.1 miRNA +chr15 69406441 69406552 + ENSG00000207395.1 Y_RNA misc_RNA +chr15 72587217 72587313 + ENSG00000207690.1 MIR630 miRNA +chr15 72589691 72591845 + ENSG00000260534.1 RP11-1006G14.4 sense_overlapping +chr15 74303005 74304343 + ENSG00000261384.1 RP11-60L3.2 sense_intronic +chr15 75762215 75762315 - ENSG00000274496.1 MIR4313 miRNA +chr15 82726550 82726633 - ENSG00000266697.1 AC105339.2 miRNA +chr15 82755945 82756071 + ENSG00000277864.1 SCARNA15 scaRNA +chr15 82756006 82756069 + ENSG00000280933.1 AC105339.3 miRNA +chr15 83113617 83114566 - ENSG00000260579.1 
RP11-382A20.2 antisense +chr15 84204516 84204599 - ENSG00000221008.1 AC136698.1 miRNA +chr15 84633874 84634635 + ENSG00000254779.4 EGLN1P1 transcribed_processed_pseudogene +chr15 84642487 84642763 - ENSG00000277578.1 Metazoa_SRP misc_RNA +chr15 85380596 85380662 + ENSG00000276648.1 MIR7706 miRNA +chr15 90275129 90275716 + ENSG00000228998.4 RP11-697E2.7 transcribed_processed_pseudogene +chr15 90294059 90294142 + ENSG00000280965.1 AC091167.2 miRNA +chr15 91029888 91029996 + ENSG00000258542.1 AC068831.11 transcribed_processed_pseudogene +chr15 96333261 96333307 + ENSG00000222651.1 MIR1469 miRNA +chr15 99792297 99792379 + ENSG00000221511.1 AC090825.1 miRNA +chr16 1962334 1962466 - ENSG00000206811.1 SNORA10 snoRNA +chr16 1962973 1963106 - ENSG00000207405.1 SNORA64 snoRNA +chr16 1963745 1964095 - ENSG00000255513.1 AC005363.9 transcribed_processed_pseudogene +chr16 1965184 1965310 + ENSG00000273587.1 SNORA78 snoRNA +chr16 2155025 2155104 - ENSG00000281010.1 snoR1 snoRNA +chr16 2271737 2271846 + ENSG00000274753.1 MIR940 miRNA +chr16 2769872 2769949 + ENSG00000265864.1 AC092117.2 miRNA +chr16 3069523 3069651 + ENSG00000252561.1 RNU1-125P snRNA +chr16 3308609 3308816 + ENSG00000262554.1 LA16c-360H6.2 transcribed_processed_pseudogene +chr16 8683461 8683543 + ENSG00000281170.1 AC007224.1 miRNA +chr16 9104848 9113181 + ENSG00000263244.2 RP11-473I1.9 3prime_overlapping_ncrna +chr16 11915661 11915738 - ENSG00000266163.1 AC007216.1 miRNA +chr16 14901508 14901591 + ENSG00000274301.1 MIR3179-3 miRNA +chr16 15611030 15611095 - ENSG00000275184.1 MIR6506 miRNA +chr16 15643281 15643390 + ENSG00000272213.2 AC026401.1 miRNA +chr16 19498610 19498747 + ENSG00000222750.1 RNU4-46P snRNA +chr16 21519830 21520365 - ENSG00000258186.2 SLC7A5P2 transcribed_processed_pseudogene +chr16 23061406 23064173 - ENSG00000260566.2 RP11-20G6.3 3prime_overlapping_ncrna +chr16 23670011 23675499 + ENSG00000260482.3 CTD-2196E14.9 3prime_overlapping_ncrna +chr16 28958583 28958661 + ENSG00000266868.1 MIR4517 miRNA 
+chr16 28982014 28982130 + ENSG00000281146.1 AC109460.1 miRNA +chr16 30670160 30670297 + ENSG00000280843.1 AC093249.1 miRNA +chr16 31109211 31109318 + ENSG00000252809.3 AC135050.1 miRNA +chr16 35023339 35023765 + ENSG00000262885.1 CTD-2144E22.11 processed_pseudogene +chr16 48447904 48448398 - ENSG00000261802.1 RP11-44I10.6 sense_overlapping +chr16 58559796 58559929 - ENSG00000206952.3 SNORA76A snoRNA +chr16 66550457 66550567 + ENSG00000275745.1 Y_RNA misc_RNA +chr16 67019727 67019811 + ENSG00000281589.1 AC009084.1 miRNA +chr16 67390691 67390850 - ENSG00000239194.1 RNU1-123P snRNA +chr16 67877657 67877740 + ENSG00000221526.1 AC040162.1 miRNA +chr16 68233426 68233499 - ENSG00000276151.1 MIR6773 miRNA +chr16 68742495 68742551 + ENSG00000281454.1 AC099314.1 miRNA +chr16 69328644 69334871 - ENSG00000272617.1 RP11-343C2.12 protein_coding +chr16 69463844 69466264 + ENSG00000260108.1 RP11-140H17.1 3prime_overlapping_ncrna +chr16 69703065 69704652 + ENSG00000260772.1 RP11-311C24.1 3prime_overlapping_ncrna +chr16 72787693 72787773 - ENSG00000265573.1 AC004943.1 miRNA +chr16 81170740 81181213 - ENSG00000279362.1 PKD1L2 protein_coding +chr16 81385018 81385093 + ENSG00000273048.1 MIR4720 miRNA +chr16 81385463 81387560 + ENSG00000272923.1 RP11-391L3.1 lincRNA +chr16 81961926 81962243 + ENSG00000260682.3 7SK misc_RNA +chr16 89561434 89561517 + ENSG00000200084.1 SNORD68 snoRNA +chr17 1713895 1714005 - ENSG00000275595.1 AC130343.2 miRNA +chr17 2041936 2043430 + ENSG00000262664.2 OVCA2 protein_coding +chr17 2329119 2329213 - ENSG00000212552.3 SNORD91B snoRNA +chr17 2330276 2330370 - ENSG00000274802.1 SNORD91A snoRNA +chr17 3310312 3311189 + ENSG00000180068.8 OR3A4P transcribed_unprocessed_pseudogene +chr17 5514209 5514270 - ENSG00000280489.1 U7 snRNA +chr17 6651914 6652270 + ENSG00000279257.1 C17orf100 protein_coding +chr17 7017615 7017701 - ENSG00000275739.1 MIR195 miRNA +chr17 7017911 7018022 - ENSG00000273895.1 MIR497 miRNA +chr17 7574713 7574847 + ENSG00000209582.1 SNORA48 
snoRNA +chr17 7576811 7576952 + ENSG00000238917.1 SNORD10 snoRNA +chr17 7577955 7578091 + ENSG00000277985.1 SNORA67 snoRNA +chr17 8144994 8145071 - ENSG00000278027.1 MIR6883 miRNA +chr17 8173454 8173587 - ENSG00000200463.1 SNORD118 snoRNA +chr17 16044621 16044961 + ENSG00000243686.2 RPLP1P11 processed_pseudogene +chr17 16439505 16439576 + ENSG00000277108.1 SNORD49 snoRNA +chr17 16440036 16440106 + ENSG00000277370.1 SNORD49A snoRNA +chr17 16441226 16441298 + ENSG00000277512.1 SNORD65 snoRNA +chr17 16812447 16812651 + ENSG00000264892.1 NOS2P4 processed_pseudogene +chr17 17242564 17242647 + ENSG00000265109.1 AC055811.1 miRNA +chr17 18340814 18340886 - ENSG00000275820.1 MIR6778 miRNA +chr17 18388871 18389459 + ENSG00000267441.1 RP1-37N7.5 processed_pseudogene +chr17 19061912 19062128 + ENSG00000280523.1 SNORD3B-2 snoRNA +chr17 19061912 19062129 + ENSG00000276271.1 SNORD3B-2 snoRNA +chr17 19063919 19064136 - ENSG00000274088.1 SNORD3B-1 snoRNA +chr17 19063920 19064136 - ENSG00000281187.1 SNORD3B-2 snoRNA +chr17 19112419 19112636 - ENSG00000277947.1 SNORD3D snoRNA +chr17 19112420 19112636 - ENSG00000281000.1 SNORD3D snoRNA +chr17 19188016 19188232 + ENSG00000281298.1 SNORD3A snoRNA +chr17 19188016 19188233 + ENSG00000277813.1 SNORD3A snoRNA +chr17 19190028 19190245 - ENSG00000275303.1 SNORD3C snoRNA +chr17 19190029 19190245 - ENSG00000280847.1 SNORD3C snoRNA +chr17 19557560 19557711 + ENSG00000276660.1 SNORA59A snoRNA +chr17 19649494 19649587 - ENSG00000275982.1 Y_RNA misc_RNA +chr17 20981489 20981654 + ENSG00000266009.1 RP11-746M1.2 transcribed_processed_pseudogene +chr17 22435629 22436064 + ENSG00000265705.1 RP11-744K17.8 transcribed_processed_pseudogene +chr17 28276463 28276553 - ENSG00000207844.1 AC061975.1 miRNA +chr17 28360654 28360734 + ENSG00000264302.1 MIR4723 miRNA +chr17 28723429 28723492 + ENSG00000238649.1 SNORD42A snoRNA +chr17 28723682 28723753 + ENSG00000238597.1 SNORD4B snoRNA +chr17 28861369 28861440 - ENSG00000273915.1 MIR451A miRNA +chr17 28861514 
28861624 - ENSG00000277441.1 AC024267.1 miRNA +chr17 28861655 28861730 - ENSG00000278521.1 MIR4732 miRNA +chr17 29390662 29390730 + ENSG00000263719.1 MIR4523 miRNA +chr17 30059054 30059147 - ENSG00000267482.1 RNY4P13 unprocessed_pseudogene +chr17 30117079 30117172 + ENSG00000199071.2 MIR423 miRNA +chr17 30702506 30702586 - ENSG00000281673.1 AC005562.2 miRNA +chr17 31534883 31534971 + ENSG00000266459.1 MIR4724 miRNA +chr17 35097560 35097657 - ENSG00000252328.1 Vault misc_RNA +chr17 38851524 38851659 - ENSG00000252699.1 SNORA21 snoRNA +chr17 38852863 38852994 - ENSG00000199293.1 SNORA21 snoRNA +chr17 39081065 39081187 + ENSG00000222494.1 AC091178.1 miRNA +chr17 40026332 40026409 - ENSG00000275267.1 MIR6884 miRNA +chr17 43387226 43387417 - ENSG00000277084.1 U2 snRNA +chr17 43444806 43444885 + ENSG00000278547.1 MIR2117 miRNA +chr17 44123539 44123649 - ENSG00000274433.1 AC023855.1 miRNA +chr17 44899712 44905390 + ENSG00000214447.4 FAM187A protein_coding +chr17 45637110 45637206 + ENSG00000273917.1 AC126544.1 miRNA +chr17 46537734 46537813 + ENSG00000280583.1 AC138645.1 miRNA +chr17 47042025 47042104 + ENSG00000281864.1 AC068152.1 miRNA +chr17 48048365 48048430 + ENSG00000273862.1 AC004477.1 miRNA +chr17 48382371 48382586 - ENSG00000200538.2 U3 snoRNA +chr17 48579838 48579947 - ENSG00000274592.1 MIR10A miRNA +chr17 49049540 49049624 + ENSG00000264552.1 AC105030.1 miRNA +chr17 51166831 51171507 + ENSG00000280803.1 NME1-NME2 protein_coding +chr17 56891270 56891355 - ENSG00000265238.1 MIR3614 miRNA +chr17 58331215 58331325 - ENSG00000273667.1 AC004687.1 miRNA +chr17 59841266 59841337 + ENSG00000199004.1 MIR21 miRNA +chr17 60810957 60811052 + ENSG00000211515.1 AC079005.1 miRNA +chr17 62005737 62006016 - ENSG00000242398.3 RN7SL800P misc_RNA +chr17 64146337 64146471 + ENSG00000281311.1 SNORA76C snoRNA +chr17 64146339 64146471 + ENSG00000277887.1 SNORA76C snoRNA +chr17 64500773 64500839 - ENSG00000265695.1 MIR3064 miRNA +chr17 64501214 64501313 - ENSG00000266241.1 MIR5047 miRNA 
+chr17 64749918 64758603 - ENSG00000266820.1 RP13-104F24.1 transcribed_unprocessed_pseudogene +chr17 64940606 64940717 + ENSG00000272346.2 RP11-927P21.11 transcribed_unprocessed_pseudogene +chr17 75498548 75498628 + ENSG00000276372.1 MIR6785 miRNA +chr17 75784521 75784607 - ENSG00000263565.1 MIR4738 miRNA +chr17 75910723 75938149 - ENSG00000281844.1 FBF1 protein_coding +chr17 76558791 76558868 + ENSG00000274091.1 SNORD1C snoRNA +chr17 76561634 76561705 + ENSG00000278261.1 SNORD1A snoRNA +chr17 76736450 76736548 - ENSG00000207556.1 MIR636 miRNA +chr17 77089307 77089493 + ENSG00000275143.1 SCARNA16 scaRNA +chr17 77089417 77089497 + ENSG00000281678.1 MIR6516 miRNA +chr17 81511026 81511109 - ENSG00000266077.1 AC139149.1 miRNA +chr17 82236668 82236728 + ENSG00000275505.1 MIR6787 miRNA +chr18 14074912 14075741 - ENSG00000267756.1 RP11-411B10.4 unprocessed_pseudogene +chr18 21712017 21712119 - ENSG00000281568.1 AC106037.1 miRNA +chr18 21825698 21825785 - ENSG00000276792.1 MIR133A1 miRNA +chr18 21828996 21829106 - ENSG00000278753.1 AC103987.1 miRNA +chr18 22933349 22933438 + ENSG00000264817.1 MIR4741 miRNA +chr18 26689316 26689529 - ENSG00000275900.1 U3 snoRNA +chr18 27452623 27452712 - ENSG00000280492.1 AC068408.1 miRNA +chr18 35466938 35466991 - ENSG00000277489.1 AC007998.1 miRNA +chr18 49487373 49487422 + ENSG00000278544.1 MIR1539 miRNA +chr18 49489245 49489308 - ENSG00000202093.1 SNORD58C snoRNA +chr18 49491283 49491347 - ENSG00000206602.1 SNORD58A snoRNA +chr18 49491664 49491729 - ENSG00000271982.1 SNORD58B snoRNA +chr18 49814133 49814276 + ENSG00000251992.1 SCARNA17 scaRNA +chr18 49814361 49814443 + ENSG00000252139.1 SCARNA18 scaRNA +chr18 57435821 57435880 + ENSG00000266636.1 AC090340.1 miRNA +chr18 58451068 58451176 + ENSG00000207778.2 MIR122 miRNA +chr19 804940 805001 + ENSG00000265767.1 MIR4745 miRNA +chr19 1816159 1816238 - ENSG00000223244.1 MIR1909 miRNA +chr19 2235829 2235926 - ENSG00000276587.1 MIR6789 miRNA +chr19 2250639 2250718 + ENSG00000267021.1 MIR4321 
miRNA +chr19 3961414 3961512 - ENSG00000207733.1 MIR637 miRNA +chr19 6176256 6176442 - ENSG00000267415.1 CTC-503J8.2 transcribed_processed_pseudogene +chr19 6736712 6736778 - ENSG00000277714.1 MIR6791 miRNA +chr19 10403458 10403538 - ENSG00000221566.1 MIR1181 miRNA +chr19 12379746 12383687 - ENSG00000248406.1 CTD-3105H18.4 transcribed_unprocessed_pseudogene +chr19 14073361 14073479 + ENSG00000277805.1 MIR1199 miRNA +chr19 14213241 14213321 + ENSG00000280955.1 AC011509.1 miRNA +chr19 15834730 15834804 + ENSG00000273782.1 UCA1 misc_RNA +chr19 16910496 16910579 - ENSG00000280536.1 AC008737.1 miRNA +chr19 17862588 17862720 + ENSG00000207166.1 SNORA68 snoRNA +chr19 18868545 18896096 - ENSG00000130283.8 GDF1 protein_coding +chr19 20125044 20125484 + ENSG00000224864.4 CTC-260E6.2 transcribed_processed_pseudogene +chr19 21188598 21188953 + ENSG00000268995.1 VN1R82P unprocessed_pseudogene +chr19 21753011 21753083 + ENSG00000265084.1 AC092364.1 miRNA +chr19 22601485 22601707 + ENSG00000240713.3 RN7SL860P misc_RNA +chr19 33207129 33207639 + ENSG00000273420.1 CTD-2540B15.13 3prime_overlapping_ncrna +chr19 34645662 34646070 + ENSG00000269811.2 SCGB2B3P processed_pseudogene +chr19 35122700 35122764 + ENSG00000278663.1 MIR6887 miRNA +chr19 44758657 44758721 + ENSG00000277736.1 MIR8085 miRNA +chr19 45149750 45150605 - ENSG00000267037.1 AC005757.7 lincRNA +chr19 45638994 45639087 - ENSG00000199066.1 MIR330 miRNA +chr19 45938139 45938282 - ENSG00000280818.1 AC008623.1 miRNA +chr19 47581231 47581313 + ENSG00000281476.1 AC010331.1 miRNA +chr19 47755853 47755962 + ENSG00000221803.1 SNORD23 snoRNA +chr19 49489965 49490048 + ENSG00000201675.1 SNORD32A snoRNA +chr19 49490615 49490699 + ENSG00000199631.1 SNORD33 snoRNA +chr19 49490904 49490974 + ENSG00000202503.1 SNORD34 snoRNA +chr19 49491175 49491260 + ENSG00000200259.1 SNORD35A snoRNA +chr19 49497720 49497806 + ENSG00000200530.1 SNORD35B snoRNA +chr19 49832018 49832099 + ENSG00000277609.1 MIR6800 miRNA +chr19 49888175 49888230 + 
ENSG00000263462.1 MIR4750 miRNA +chr19 49933064 49933137 + ENSG00000265438.1 MIR4751 miRNA +chr19 49949316 49949527 - ENSG00000221125.2 U3 snoRNA +chr19 50799032 50799122 - ENSG00000221381.1 SNORD88B snoRNA +chr19 50802328 50802418 - ENSG00000220988.1 SNORD88C snoRNA +chr19 52564076 52564231 + ENSG00000242255.1 RPL39P34 transcribed_unprocessed_pseudogene +chr19 53386233 53386388 + ENSG00000241069.1 CTD-3141N22.1 transcribed_processed_pseudogene +chr19 53495887 53496037 + ENSG00000241434.1 CTD-2224J9.4 transcribed_processed_pseudogene +chr19 53762335 53762445 + ENSG00000280835.1 AC011453.2 miRNA +chr19 53982307 53982397 + ENSG00000215998.1 MIR935 miRNA +chr19 54589441 54590287 + ENSG00000269271.1 CTB-83J4.2 lincRNA +chr19 55388181 55388242 + ENSG00000277326.1 MIR6805 miRNA +chr19 55707550 55707855 + ENSG00000268593.3 CTD-2611O12.6 processed_pseudogene +chr19 56840905 56840992 + ENSG00000281440.1 MIMT1_1 sRNA +chr19 56848068 56848204 + ENSG00000274777.1 MIMT1_2 misc_RNA +chr19 58582024 58582609 + ENSG00000268784.1 MGC2752 transcribed_unprocessed_pseudogene +chr19 58586158 58586356 + ENSG00000269032.1 AC016629.7 transcribed_processed_pseudogene +chr20 1392900 1392961 - ENSG00000276741.1 MIR6869 miRNA +chr20 2652777 2652842 + ENSG00000221062.1 MIR1292 miRNA +chr20 2654212 2654286 + ENSG00000221116.1 SNORD110 snoRNA +chr20 2656097 2656182 + ENSG00000212498.1 SNORD86 snoRNA +chr20 2656624 2656694 + ENSG00000229686.1 SNORD56 snoRNA +chr20 2656939 2657010 + ENSG00000226572.1 SNORD57 snoRNA +chr20 22733210 22733301 - ENSG00000265151.1 AL158175.1 miRNA +chr20 31581089 31581192 + ENSG00000201770.1 RNU6-384P snRNA +chr20 32237795 32237847 + ENSG00000221667.1 MIR1825 miRNA +chr20 36255716 36255820 + ENSG00000281193.1 AL121895.1 miRNA +chr20 38421647 38421774 - ENSG00000277034.1 SNORA71 snoRNA +chr20 38425198 38425331 - ENSG00000273718.1 SNORA71 snoRNA +chr20 38448084 38448215 + ENSG00000274309.1 SNORA71E snoRNA +chr20 47651067 47651156 + ENSG00000280970.1 AL034418.1 miRNA 
+chr20 49278427 49278624 + ENSG00000277830.1 ZNFX1-AS1_1 misc_RNA +chr20 49279116 49279208 + ENSG00000274760.1 ZNFX1-AS1_2 misc_RNA +chr20 49280485 49280571 + ENSG00000277967.1 ZNFX1-AS1_3 misc_RNA +chr20 49280683 49280772 + ENSG00000212304.1 SNORD12 snoRNA +chr20 58817615 58817694 - ENSG00000276373.1 MIR296 miRNA +chr20 61953546 61953662 - ENSG00000221417.1 MIR1257 miRNA +chr20 63696668 63698684 + ENSG00000243509.4 TNFRSF6B protein_coding +chr20 63941465 63941544 - ENSG00000272045.1 MIR1914 miRNA +chr21 6994374 6997737 - ENSG00000279313.1 CH507-145C22.4 lincRNA +chr21 14143581 14144158 + ENSG00000240755.1 ERLEC1P1 processed_pseudogene +chr21 25573980 25574044 + ENSG00000275402.1 MIR155 miRNA +chr21 33550662 33550728 + ENSG00000275224.1 MIR6501 miRNA +chr21 39171462 39171560 - ENSG00000276873.1 uc_338 misc_RNA +chr21 44328944 44330221 - ENSG00000241728.4 AP001062.8 sense_overlapping +chr22 15826566 15827187 + ENSG00000271672.1 DUXAP8 transcribed_processed_pseudogene +chr22 16961936 17008222 - ENSG00000215568.6 GAB4 protein_coding +chr22 17021398 17022570 - ENSG00000276400.1 VN1R9P processed_pseudogene +chr22 18101612 18101888 + ENSG00000235617.1 XXbac-B476C20.10 unprocessed_pseudogene +chr22 19722945 19724771 + ENSG00000203618.5 GP1BB protein_coding +chr22 20085746 20085833 + ENSG00000266567.1 MIR3618 miRNA +chr22 20086042 20086150 + ENSG00000221366.2 MIR1306 miRNA +chr22 20126402 20126526 + ENSG00000264346.1 SNORA77 snoRNA +chr22 20483225 20483316 - ENSG00000265300.1 AC007731.1 miRNA +chr22 22895375 22895834 + ENSG00000211675.2 IGLC1 IG_C_gene +chr22 26671402 26671550 + ENSG00000277941.1 MIAT_exon5_1 misc_RNA +chr22 26672164 26672248 + ENSG00000275942.1 MIAT_exon5_2 misc_RNA +chr22 26673254 26673630 + ENSG00000276991.1 MIAT_exon5_3 misc_RNA +chr22 27919384 27919545 + ENSG00000277460.1 TTC28-AS1_1 misc_RNA +chr22 27922022 27922178 + ENSG00000276150.1 TTC28-AS1_2 misc_RNA +chr22 28002484 28002595 + ENSG00000276675.1 TTC28-AS1_4 misc_RNA +chr22 28992721 29018620 + 
ENSG00000235786.1 ZNRF3-IT1 sense_intronic +chr22 29073078 29168333 + ENSG00000183762.11 KREMEN1 protein_coding +chr22 29134014 29134120 + ENSG00000200871.1 RNU6-810P snRNA +chr22 29333163 29333258 - ENSG00000239127.1 SNORD125 snoRNA +chr22 29711820 29719714 - ENSG00000239446.1 RP1-76B20.12 antisense +chr22 30971005 30971149 + ENSG00000276664.1 TUG1_1 misc_RNA +chr22 30971296 30971382 + ENSG00000276057.1 TUG1_2 misc_RNA +chr22 30972857 30973093 + ENSG00000276965.1 TUG1_3 misc_RNA +chr22 30973417 30973597 + ENSG00000275307.1 TUG1_4 misc_RNA +chr22 31059989 31060285 + ENSG00000240186.3 RN7SL633P misc_RNA +chr22 31621467 31621531 - ENSG00000275382.1 MIR7109 miRNA +chr22 31837238 31837298 + ENSG00000252909.1 RNU6-201P snRNA +chr22 33704786 33704920 - ENSG00000253007.2 SNORA76 snoRNA +chr22 35164304 35165347 + ENSG00000238153.1 CTA-714B7.4 processed_pseudogene +chr22 37844272 37844371 - ENSG00000207945.1 MIR658 miRNA +chr22 37891347 37891448 + ENSG00000207227.1 RNU6-900P snRNA +chr22 37950965 37951778 + ENSG00000272582.1 RP5-1039K5.17 antisense +chr22 37967563 37967624 + ENSG00000277321.1 MIR6820 miRNA +chr22 37988794 37988853 + ENSG00000264505.1 MIR4534 miRNA +chr22 39120293 39120566 - ENSG00000226024.1 COX5BP7 processed_pseudogene +chr22 39313819 39313911 - ENSG00000209480.1 SNORD83B snoRNA +chr22 39319050 39319113 - ENSG00000263764.1 SNORD43 snoRNA +chr22 41092513 41092566 + ENSG00000221160.1 MIR1281 miRNA +chr22 41923222 41923297 - ENSG00000263463.1 MIR378I miRNA +chr22 42569147 42569250 - ENSG00000251913.1 RNU6-513P snRNA +chr22 42615244 42615393 + ENSG00000276027.1 RNU12 snRNA +chr22 45976878 45976954 - ENSG00000265610.1 CR536603.1 miRNA +chr22 46112732 46112840 + ENSG00000198986.2 MIRLET7A3 miRNA +chr22 46113566 46113657 + ENSG00000264147.1 MIR4763 miRNA +chr22 46113686 46113768 + ENSG00000207875.1 MIRLET7B miRNA +chrX 3607258 3607365 - ENSG00000207332.1 RNU6-146P snRNA +chrX 21872707 21872808 + ENSG00000206639.1 Y_RNA misc_RNA +chrX 37441523 37442068 + 
ENSG00000241607.1 RP11-357K9.2 unprocessed_pseudogene +chrX 41233633 41234283 + ENSG00000269941.1 RP5-1172N10.4 sense_intronic +chrX 52063347 52063474 - ENSG00000221705.1 SNORA11E snoRNA +chrX 52190621 52190748 + ENSG00000221475.1 SNORA11D snoRNA +chrX 53143034 53143117 - ENSG00000266700.1 AL139396.1 miRNA +chrX 54927305 54927433 + ENSG00000221750.1 SNORA11 snoRNA +chrX 66018870 66018979 + ENSG00000207939.1 MIR223 miRNA +chrX 67545277 67545422 + ENSG00000280956.1 AL049564.1 miRNA +chrX 70846080 70846295 - ENSG00000280704.1 U3 snoRNA +chrX 73821657 73821724 - ENSG00000274655.1 XIST_intron misc_RNA +chrX 73831145 73831270 - ENSG00000277577.1 Xist_exon4 misc_RNA +chrX 73944332 73944466 + ENSG00000274430.1 JPX_1 misc_RNA +chrX 73944595 73944663 + ENSG00000276784.1 JPX_2 misc_RNA +chrX 80024067 80031111 + ENSG00000281700.1 TBX22 protein_coding +chrX 85244095 85244171 + ENSG00000264517.1 AC003001.1 miRNA +chrX 92460452 92460545 + ENSG00000211526.1 AL121869.1 miRNA +chrX 107428989 107429079 + ENSG00000207846.1 AL035088.1 miRNA +chrX 112780718 112780788 - ENSG00000263351.1 MIR4329 miRNA +chrX 115622308 115649561 + ENSG00000281638.1 AL589842.1 protein_coding +chrX 115703812 115703867 + ENSG00000264759.1 AC005000.1 miRNA +chrX 120877496 120878924 - ENSG00000278646.1 RP1-321E8.5 protein_coding +chrX 136879199 136879271 - ENSG00000206979.1 SNORD61 snoRNA +chrX 140086058 140086145 + ENSG00000280833.1 AL589987.1 miRNA +chrX 148501098 148991332 + ENSG00000281817.1 AFF2 protein_coding +chrX 151956683 151956778 - ENSG00000264120.1 AF274855.1 miRNA +chrX 151958578 151958658 - ENSG00000207621.1 MIR224 miRNA +chrX 151959607 151959719 - ENSG00000207753.2 MIR452 miRNA +chrX 154400281 154400415 + ENSG00000207165.1 SNORA70 snoRNA +chrX 154768596 154768656 + ENSG00000281135.1 MIR664B miRNA +chrX 154774998 154775126 + ENSG00000206693.1 SNORA56 snoRNA +chrX 156022631 156022698 + ENSG00000276543.3 AJ271736.1 miRNA +chrY 5573928 5574019 + ENSG00000252059.2 AC012667.1 miRNA +chrY 7378672 
7378779 + ENSG00000252155.1 RNU6-941P snRNA +chrY 57209151 57209218 + ENSGR0000276543.3 AJ271736.1 miRNA diff --git a/docs/Data/Bioinformatics_Pipelines/overlap.gene.strandless.tsv b/docs/Data/Bioinformatics_Pipelines/overlap.gene.strandless.tsv new file mode 100644 index 000000000..702d9b31b --- /dev/null +++ b/docs/Data/Bioinformatics_Pipelines/overlap.gene.strandless.tsv @@ -0,0 +1,1727 @@ +seqname start end strand gene_id gene_name gene_type +chr1 30366 30503 + ENSG00000274890.1 MIR1302-9 miRNA +chr1 89551 91105 - ENSG00000239945.1 RP11-34P13.8 lincRNA +chr1 258568 259024 - ENSG00000241670.3 AP006222.1 processed_pseudogene +chr1 632325 632413 - ENSG00000278791.1 MIR6723 miRNA +chr1 965110 965166 + ENSG00000277294.1 AL645608.1 miRNA +chr1 1055033 1056116 + ENSG00000242590.1 RP11-54O7.14 sense_intronic +chr1 1312502 1312566 - ENSG00000274153.1 MIR6727 miRNA +chr1 1405460 1405752 - ENSG00000264293.2 RN7SL657P misc_RNA +chr1 3736943 3737103 + ENSG00000276189.1 TP73-AS1 misc_RNA +chr1 6393555 6394391 + ENSG00000271746.1 RP1-202O8.3 antisense +chr1 9848318 9850154 + ENSG00000223989.1 RP11-84A14.5 antisense +chr1 9983141 9984568 + ENSG00000241326.1 RP11-807G9.2 sense_intronic +chr1 10306465 10306757 + ENSG00000264501.2 RN7SL731P misc_RNA +chr1 10458555 10459338 + ENSG00000203469.2 RP5-1113E3.3 antisense +chr1 11843812 11843984 + ENSG00000276470.1 NPPA-AS1_1 misc_RNA +chr1 11845549 11845697 + ENSG00000278852.1 NPPA-AS1_2 misc_RNA +chr1 11847442 11847549 + ENSG00000275915.1 NPPA-AS1_3 misc_RNA +chr1 12824942 12828663 - ENSG00000279195.1 PRAMEF11 protein_coding +chr1 15659869 15661722 + ENSG00000215695.1 RSC1A1 protein_coding +chr1 15684472 15684558 + ENSG00000264048.1 AL121992.1 miRNA +chr1 16006160 16006671 - ENSG00000233078.1 RP11-5P18.5 antisense +chr1 16197854 16198357 + ENSG00000234166.1 ARHGEF19-AS1 antisense +chr1 16408508 16408583 + ENSG00000281598.1 AL358794.1 miRNA +chr1 16873708 16873851 + ENSG00000277234.1 U1 snRNA +chr1 17406760 17407382 + ENSG00000227751.1 
RP1-20B21.4 antisense +chr1 17413631 17413694 - ENSG00000266727.1 AC004824.1 miRNA +chr1 20633679 20633788 + ENSG00000273695.1 MIR6084 miRNA +chr1 23370254 23370346 + ENSG00000201405.1 Y_RNA misc_RNA +chr1 25831913 25832134 - ENSG00000272478.1 RP1-317E23.7 antisense +chr1 26876133 26878245 + ENSG00000226698.1 RP1-50O24.6 antisense +chr1 27325470 27325553 + ENSG00000281023.1 FO393419.1 miRNA +chr1 27850574 27850744 - ENSG00000278868.1 AL109927.1 protein_coding +chr1 28507366 28507571 + ENSG00000274266.1 SNORA73A snoRNA +chr1 28579764 28579893 - ENSG00000278274.1 SNORA61 snoRNA +chr1 28580381 28580512 - ENSG00000273544.1 SNORA44 snoRNA +chr1 28580920 28581054 - ENSG00000280498.1 SNORA16A snoRNA +chr1 28580920 28581056 - ENSG00000274582.1 SNORA16A snoRNA +chr1 28648600 28648730 + ENSG00000270103.3 RNU11 lincRNA +chr1 32170733 32176568 + ENSG00000250135.1 RP4-622L5.2 sense_intronic +chr1 35925832 35929610 + ENSG00000280133.1 RP4-789D17.3 TEC +chr1 37480230 37480289 + ENSG00000278228.1 MIR6732 miRNA +chr1 37799720 37800879 - ENSG00000233728.1 RP11-109P14.9 antisense +chr1 37860697 37861580 + ENSG00000230955.1 RP11-109P14.10 antisense +chr1 40984853 40984919 - ENSG00000281712.1 AL391730.2 miRNA +chr1 43364648 43364715 - ENSG00000277622.1 MIR6734 miRNA +chr1 43448539 43448611 + ENSG00000274975.1 MIR6735 miRNA +chr1 44103945 44104079 + ENSG00000281534.1 AL139220.1 miRNA +chr1 44775864 44775943 + ENSG00000264294.1 SNORD55 snoRNA +chr1 44778390 44778456 + ENSG00000207421.1 SNORD38B snoRNA +chr1 44778390 44778458 + ENSG00000281859.1 SNORD38B snoRNA +chr1 44819883 44819997 - ENSG00000202444.1 RNU5E-6P snRNA +chr1 47761132 47765547 - ENSG00000223814.1 RP11-543D5.2 lincRNA +chr1 52150329 52150412 + ENSG00000281172.1 AL139156.1 miRNA +chr1 53069938 53085502 - ENSG00000232993.1 RP11-334A14.5 antisense +chr1 53073110 53073389 - ENSG00000227644.2 HIGD1AP11 processed_pseudogene +chr1 54099968 54100224 - ENSG00000279049.1 AL353898.1 pseudogene +chr1 54980950 54992274 - 
ENSG00000233271.1 RP11-12C17.2 antisense +chr1 62211557 62211666 + ENSG00000200174.1 Y_RNA misc_RNA +chr1 63548977 63549112 - ENSG00000276884.1 DLEU2_6 misc_RNA +chr1 63549133 63549216 - ENSG00000278287.1 DLEU2_5 misc_RNA +chr1 63549227 63549284 - ENSG00000277565.1 DLEU2_4 misc_RNA +chr1 63549294 63549366 - ENSG00000275823.1 DLEU2_3 misc_RNA +chr1 63549374 63549463 - ENSG00000277689.1 DLEU2_2 misc_RNA +chr1 63549489 63549608 - ENSG00000274199.1 DLEU2_1 misc_RNA +chr1 67320369 67395580 + ENSG00000281152.1 IL12RB2 protein_coding +chr1 75787072 75787150 + ENSG00000206620.1 SNORD45C snoRNA +chr1 75787889 75787972 + ENSG00000207241.1 SNORD45A snoRNA +chr1 75789477 75789548 + ENSG00000201487.1 SNORD45B snoRNA +chr1 81990753 81990806 + ENSG00000274207.1 AC113949.1 miRNA +chr1 89260582 89269754 + ENSG00000237568.1 RP4-620F22.2 antisense +chr1 92229256 92229339 - ENSG00000265543.1 AL451010.1 miRNA +chr1 92837289 92837383 + ENSG00000206680.1 SNORD21 snoRNA +chr1 98045242 98045351 - ENSG00000276280.1 MIR2682 miRNA +chr1 98046070 98046171 - ENSG00000277990.1 MIR137 miRNA +chr1 100894928 100895356 + ENSG00000273204.1 RP4-549L20.3 antisense +chr1 103569553 103570674 + ENSG00000236085.1 ACTG1P4 processed_pseudogene +chr1 109100193 109100612 + ENSG00000278249.1 SCARNA2 scaRNA +chr1 109539906 109543837 - ENSG00000254942.1 RP5-1160K1.8 antisense +chr1 109596225 109597781 + ENSG00000225113.1 RP5-1160K1.3 sense_overlapping +chr1 109598893 109598967 + ENSG00000207709.2 MIR197 miRNA +chr1 109628417 109630305 - ENSG00000228703.1 RP5-1160K1.6 antisense +chr1 110456505 110457354 - ENSG00000270380.1 RP11-470L19.5 antisense +chr1 111181374 111181491 - ENSG00000272982.1 RP5-1180E21.4 antisense +chr1 111184415 111185061 - ENSG00000273221.1 RP5-1180E21.5 antisense +chr1 113125321 113126883 - ENSG00000237278.2 RLIMP2 processed_pseudogene +chr1 145281116 145281279 + ENSG00000207501.1 RNVU1-14 snRNA +chr1 145961388 145964422 + ENSG00000278431.1 CH17-270A2.1 antisense +chr1 147757185 147758434 + 
ENSG00000274415.1 RP11-433J22.2 antisense +chr1 148295792 148297271 - ENSG00000244252.1 RP11-495P10.7 lincRNA +chr1 149842355 149842495 + ENSG00000281134.1 AC239868.3 miRNA +chr1 149845816 149846486 + ENSG00000272993.1 RP11-196G18.24 lincRNA +chr1 149851317 149851457 - ENSG00000280874.1 AC239868.2 miRNA +chr1 149887178 149887318 - ENSG00000280566.1 AC239868.1 miRNA +chr1 150965245 150966256 + ENSG00000259357.2 RP11-316M1.12 antisense +chr1 151340648 151341966 + ENSG00000224645.1 RP11-126K1.8 antisense +chr1 151765709 151766389 + ENSG00000232937.1 RP11-98D18.2 sense_intronic +chr1 153772371 153774079 - ENSG00000279767.1 AL513523.2 protein_coding +chr1 154480012 154481501 + ENSG00000273110.1 RP11-350G8.9 antisense +chr1 154579065 154579663 - ENSG00000233875.1 RP11-61L14.6 antisense +chr1 155211151 155213819 - ENSG00000236263.1 RP11-263K19.6 antisense +chr1 155925958 155926085 - ENSG00000280466.1 SCARNA15 scaRNA +chr1 156077373 156077454 + ENSG00000222611.1 AL355388.1 miRNA +chr1 158578919 158579899 - ENSG00000186400.3 OR10X1 protein_coding +chr1 158578919 158579899 - ENSG00000279111.1 OR10X1 polymorphic_pseudogene +chr1 160205377 160205464 + ENSG00000265381.1 AL121987.1 miRNA +chr1 161364731 161367874 - ENSG00000188931.3 CFAP126 protein_coding +chr1 165662585 165662687 - ENSG00000206990.1 Y_RNA misc_RNA +chr1 168245565 168247343 - ENSG00000214262.4 ANKRD36BP1 transcribed_processed_pseudogene +chr1 170370213 170370295 - ENSG00000263384.1 AL354732.1 miRNA +chr1 171345105 171345307 - ENSG00000268062.1 TOP1P1 processed_pseudogene +chr1 172138798 172138907 - ENSG00000207949.1 MIR214 miRNA +chr1 178479247 178482365 - ENSG00000279210.1 RP4-593C16.4 antisense +chr1 182944402 182944454 - ENSG00000281615.1 AL450304.2 miRNA +chr1 182959485 182959575 - ENSG00000264768.1 AL450304.1 miRNA +chr1 186311825 186311928 - ENSG00000202025.1 RNU6-1240P snRNA +chr1 201464383 201465146 - ENSG00000224818.1 RP11-134G8.10 3prime_overlapping_ncrna +chr1 201520056 201520549 + ENSG00000242150.2 
RP11-134G8.6 transcribed_processed_pseudogene +chr1 202873294 202874326 + ENSG00000243113.1 RP11-480I12.9 transcribed_processed_pseudogene +chr1 204131062 204131966 + ENSG00000261065.1 RP11-74C13.4 antisense +chr1 207801845 207801956 - ENSG00000275668.1 AL035209.1 miRNA +chr1 207802440 207802523 - ENSG00000276752.1 MIR29B2 miRNA +chr1 210678315 210681932 + ENSG00000279333.1 RP11-75I2.3 TEC +chr1 211492255 211492917 + ENSG00000223649.1 RP11-359E8.3 lincRNA +chr1 212853280 212853426 + ENSG00000235182.1 RP11-348H3.4 processed_pseudogene +chr1 218442626 218443996 + ENSG00000281453.1 TGFB2-OT1 3prime_overlapping_ncrna +chr1 220571834 220571899 - ENSG00000281158.1 AC096640.1 miRNA +chr1 220825776 220825893 + ENSG00000276642.1 U6atac snRNA +chr1 225922080 225922142 - ENSG00000274674.1 MIR6741 miRNA +chr1 226992140 226993206 + ENSG00000233706.1 RP5-1087E8.3 antisense +chr1 227430526 227430976 + ENSG00000242757.1 CTD-2090I13.3 processed_pseudogene +chr1 228097263 228097341 + ENSG00000264944.1 MIR3620 miRNA +chr1 228394290 228396967 + ENSG00000270094.1 RP11-245P10.8 antisense +chr1 228397048 228397109 - ENSG00000278067.1 MIR6742 miRNA +chr1 228457435 228457572 + ENSG00000281451.1 AL139288.1 miRNA +chr1 228628176 228628234 - ENSG00000281588.1 AL713899.1 miRNA +chr1 229304857 229305504 + ENSG00000213029.3 SPHAR protein_coding +chr1 229440284 229441020 - ENSG00000226920.1 RP5-1068B5.5 3prime_overlapping_ncrna +chr1 230280312 230281893 - ENSG00000224407.1 RP5-956O18.3 antisense +chr1 231019828 231019924 - ENSG00000221290.1 MIR1182 miRNA +chr1 232223059 232223141 + ENSG00000281475.1 BX323014.1 miRNA +chr1 234607008 234609483 + ENSG00000228830.1 RP4-781K5.2 antisense +chr1 236483165 236484468 + ENSG00000244457.2 ENO1P1 transcribed_processed_pseudogene +chr1 237926831 237927605 + ENSG00000243781.1 RP11-193H5.2 transcribed_processed_pseudogene +chr1 245123471 245124450 - ENSG00000223353.2 RP11-290P14.2 processed_pseudogene +chr1 245749563 246355081 - ENSG00000280657.1 SMYD3 
protein_coding +chr1 247201967 247202060 - ENSG00000263568.1 MIR3916 miRNA +chr1 247948858 247949796 + ENSG00000196936.3 OR2L8 protein_coding +chr1 247948858 247949796 + ENSG00000279263.1 OR2L8 polymorphic_pseudogene +chr1 248626178 248627128 - ENSG00000183130.3 OR2T11 protein_coding +chr1 248721993 248722201 - ENSG00000242529.1 AHCYP8 processed_pseudogene +chr1 248826377 248826443 + ENSG00000264500.1 MIR3124 miRNA +chr2 10691538 10691831 + ENSG00000277446.1 Metazoa_SRP misc_RNA +chr2 20401650 20401706 - ENSG00000281337.1 AC007041.1 miRNA +chr2 20678254 20678932 - ENSG00000270100.1 RP11-130L8.1 lincRNA +chr2 27581889 27581946 - ENSG00000221531.1 AC074091.1 miRNA +chr2 28308161 28308570 + ENSG00000225991.1 RP11-731I19.1 processed_pseudogene +chr2 31823018 31823106 - ENSG00000265267.1 AL121652.3 miRNA +chr2 44935294 44935368 + ENSG00000276229.1 Six3os1_3 misc_RNA +chr2 44938833 44939028 + ENSG00000274547.1 Six3os1_6 misc_RNA +chr2 51027953 51028097 + ENSG00000281171.1 AC007682.4 miRNA +chr2 54079974 54080280 - ENSG00000241114.1 AC008280.3 processed_pseudogene +chr2 60925909 60931610 + ENSG00000267520.2 RP11-373L24.1 3prime_overlapping_ncrna +chr2 61177554 61177636 + ENSG00000280672.1 AC016747.1 miRNA +chr2 62146413 62147153 + ENSG00000242735.1 AC018462.3 processed_pseudogene +chr2 64817586 64817668 + ENSG00000280945.1 AC007880.2 miRNA +chr2 65667256 65667346 + ENSG00000265899.1 AC007389.4 miRNA +chr2 68179833 68180532 + ENSG00000273275.1 RP11-474G23.2 antisense +chr2 70301451 70302072 + ENSG00000233849.1 AC022201.5 antisense +chr2 71379755 71379858 - ENSG00000200779.1 RNU6-105P snRNA +chr2 82268612 82268706 + ENSG00000252897.1 RNU6-685P snRNA +chr2 85567662 85567737 - ENSG00000280656.1 AC016753.1 miRNA +chr2 88860886 88860922 - ENSG00000211594.2 IGKJ4 IG_J_gene +chr2 88861221 88861258 - ENSG00000211595.2 IGKJ3 IG_J_gene +chr2 88861525 88861563 - ENSG00000211596.3 IGKJ2 IG_J_gene +chr2 88861886 88861923 - ENSG00000211597.2 IGKJ1 IG_J_gene +chr2 95413456 95414296 - 
ENSG00000248821.1 AC009238.6 unprocessed_pseudogene +chr2 96809473 96809614 + ENSG00000280510.1 AC092636.2 miRNA +chr2 110126732 110126824 - ENSG00000212091.1 AC013268.1 miRNA +chr2 113596000 113596067 + ENSG00000276624.1 AL078621.1 miRNA +chr2 127701508 127705242 + ENSG00000173349.5 SFT2D3 protein_coding +chr2 127845236 127845341 + ENSG00000202532.1 RNU6-395P snRNA +chr2 127846736 127847084 - ENSG00000244563.1 AC006011.4 processed_pseudogene +chr2 130929762 130929883 - ENSG00000281159.1 SCARNA15 scaRNA +chr2 131491160 131491236 - ENSG00000265575.1 MIR4784 miRNA +chr2 132152243 132152349 - ENSG00000239108.1 RNU6-1132P snRNA +chr2 144518447 144518574 + ENSG00000275372.1 ZEB2_AS1_1 misc_RNA +chr2 144520456 144520555 + ENSG00000273537.1 ZEB2_AS1_3 misc_RNA +chr2 144521039 144521116 + ENSG00000277444.1 ZEB2_AS1_4 misc_RNA +chr2 159462417 159463256 - ENSG00000225369.1 AC009506.2 processed_pseudogene +chr2 160407810 160407882 - ENSG00000263948.1 MIR4785 miRNA +chr2 161422659 161423577 - ENSG00000224076.4 AC009487.4 antisense +chr2 175176493 175176586 - ENSG00000200121.2 Y_RNA misc_RNA +chr2 175184215 175184607 + ENSG00000233131.1 AC096649.1 processed_pseudogene +chr2 176188843 176188901 + ENSG00000277284.1 MIR7704 miRNA +chr2 176270241 176270330 - ENSG00000264392.2 AC016739.3 miRNA +chr2 177212726 177212799 + ENSG00000263721.1 MIR4444-1 miRNA +chr2 178831371 178831449 - ENSG00000238542.1 RNU7-104P snRNA +chr2 179934402 179934527 - ENSG00000202216.2 SNORA17 snoRNA +chr2 182314454 182314536 + ENSG00000281257.1 AC012500.1 miRNA +chr2 182788098 182788435 + ENSG00000232430.1 RPL31P15 processed_pseudogene +chr2 188974419 188974555 + ENSG00000281204.1 AC066694.1 miRNA +chr2 189311671 189311761 - ENSG00000266817.1 AC118063.1 miRNA +chr2 190880854 190880942 + ENSG00000280517.1 AC005540.1 miRNA +chr2 192775943 192776073 + ENSG00000278406.1 PCGEM1 misc_RNA +chr2 201166965 201167546 + ENSG00000234431.2 AC007283.5 3prime_overlapping_ncrna +chr2 201621646 201623430 - ENSG00000241790.2 
ENO1P4 processed_pseudogene +chr2 202115167 202115260 - ENSG00000274633.1 AC079354.1 miRNA +chr2 202333642 202333724 + ENSG00000281619.1 AC064836.1 miRNA +chr2 202773720 202774360 - ENSG00000240761.1 AC098831.4 processed_pseudogene +chr2 202840949 202841082 + ENSG00000281423.1 AC010900.1 miRNA +chr2 203190780 203191277 + ENSG00000204196.5 AC011737.2 processed_pseudogene +chr2 203764707 203764798 - ENSG00000211573.2 AC125238.1 miRNA +chr2 206115547 206122323 + ENSG00000231955.1 AC007383.4 antisense +chr2 206116110 206116500 - ENSG00000237580.1 GCSHP3 processed_pseudogene +chr2 206160843 206161024 + ENSG00000277502.1 uc_338 misc_RNA +chr2 206162228 206162359 + ENSG00000207406.1 SNORA41 snoRNA +chr2 206259628 206259918 - ENSG00000224070.1 HMGN1P6 processed_pseudogene +chr2 207753872 207754435 - ENSG00000272851.1 RP11-801F7.1 antisense +chr2 207754807 207754881 + ENSG00000264900.1 MIR4775 miRNA +chr2 218280125 218280188 - ENSG00000274203.1 MIR6513 miRNA +chr2 230176665 230221721 - ENSG00000280755.1 SP110 protein_coding +chr2 230908852 230910102 + ENSG00000230385.1 AC012507.4 antisense +chr2 231456444 231456523 - ENSG00000207280.1 SNORD20 snoRNA +chr2 231711525 231711647 + ENSG00000277986.1 U4 snRNA +chr2 231810348 231810844 - ENSG00000260622.1 RP11-690I21.3 processed_pseudogene +chr2 232550474 232550573 - ENSG00000266620.1 MIR5001 miRNA +chr2 233865496 233867359 - ENSG00000279809.1 AC005538.3 TEC +chr2 239091759 239091836 - ENSG00000281838.1 AC017028.12 miRNA +chr3 8931506 8931631 + ENSG00000199815.2 SNORA17 snoRNA +chr3 10278938 10279024 - ENSG00000273665.1 AC022384.2 miRNA +chr3 12071038 12071121 + ENSG00000265870.1 AC026166.1 miRNA +chr3 12840294 12840396 - ENSG00000281117.1 AC034198.1 miRNA +chr3 12840312 12840450 - ENSG00000207496.1 SNORA7A snoRNA +chr3 12850659 12850860 + ENSG00000250939.2 AC034198.7 transcribed_unprocessed_pseudogene +chr3 15738657 15738718 + ENSG00000281814.1 AC090950.1 miRNA +chr3 16933196 16933260 + ENSG00000264818.1 MIR3714 miRNA +chr3 
24096512 24097360 - ENSG00000242109.1 NPM1P23 transcribed_processed_pseudogene +chr3 30304321 30304536 - ENSG00000281710.1 U3 snoRNA +chr3 32259728 32259819 + ENSG00000207857.2 AC097639.1 miRNA +chr3 33144104 33147721 - ENSG00000272149.1 RP11-627J17.1 antisense +chr3 38125292 38125394 - ENSG00000201965.1 Y_RNA misc_RNA +chr3 39408389 39408539 + ENSG00000206760.1 SNORA6 snoRNA +chr3 39411054 39411206 + ENSG00000202363.1 SNORA62 snoRNA +chr3 44629284 44629372 + ENSG00000280942.1 AC099669.1 miRNA +chr3 44861888 44861981 + ENSG00000207783.1 MIR564 miRNA +chr3 48094801 48094876 - ENSG00000281068.1 AC124916.1 miRNA +chr3 48094888 48094957 - ENSG00000281238.1 AC124916.2 miRNA +chr3 48465811 48467645 + ENSG00000213689.8 TREX1 protein_coding +chr3 48633636 48633698 - ENSG00000274831.1 MIR6824 miRNA +chr3 48985485 48985963 - ENSG00000273211.1 RP13-131K19.7 lincRNA +chr3 49099854 49099914 - ENSG00000274888.1 MIR6890 miRNA +chr3 49270132 49270239 + ENSG00000199546.1 Y_RNA misc_RNA +chr3 49274120 49274186 + ENSG00000264633.1 MIR4271 miRNA +chr3 49806137 49806245 - ENSG00000263506.1 MIR5193 miRNA +chr3 49863381 49863464 - ENSG00000281278.1 AC139451.2 miRNA +chr3 50273236 50273297 + ENSG00000274596.1 MIR6872 miRNA +chr3 52690744 52690827 + ENSG00000238862.1 SNORD19B snoRNA +chr3 54639681 54639748 + ENSG00000276927.1 AC092057.1 miRNA +chr3 60617805 60617870 - ENSG00000281426.1 MIR548BB miRNA +chr3 64011964 64016246 + ENSG00000243410.1 RP11-245J9.4 antisense +chr3 64099273 64101122 + ENSG00000241572.1 PRICKLE2-AS1 antisense +chr3 75738280 75738363 + ENSG00000266396.1 MIR4273 miRNA +chr3 87323969 87324095 - ENSG00000280503.1 AC130885.1 miRNA +chr3 98264285 98265262 + ENSG00000230301.4 OR5H6 protein_coding +chr3 98264285 98265262 + ENSG00000279922.1 OR5H6 polymorphic_pseudogene +chr3 99691480 99691571 - ENSG00000263810.1 AC055723.1 miRNA +chr3 101823793 101824998 - ENSG00000249474.1 RP11-49I4.3 antisense +chr3 116716460 116716624 + ENSG00000278072.1 AC108713.1 misc_RNA +chr3 
123161794 123161879 + ENSG00000275891.1 MIR7110 miRNA +chr3 123283593 123283983 + ENSG00000272678.1 RP11-797D24.4 antisense +chr3 124723788 124726325 + ENSG00000260391.2 RP11-71H17.7 sense_overlapping +chr3 124792319 124792562 - ENSG00000276626.1 7SK misc_RNA +chr3 126571789 126572636 - ENSG00000206483.5 TXNRD3NB protein_coding +chr3 128673691 128673771 - ENSG00000280957.1 AC079945.1 miRNA +chr3 134437827 134437906 + ENSG00000263554.1 MIR4788 miRNA +chr3 138102085 138102222 + ENSG00000281378.1 AC023049.1 miRNA +chr3 139233414 139233520 + ENSG00000276304.1 PISRT1 misc_RNA +chr3 139494618 139494701 + ENSG00000263538.1 AC097103.1 miRNA +chr3 142450102 142452149 + ENSG00000242479.1 RP11-383G6.4 processed_pseudogene +chr3 147386967 147387453 + ENSG00000241202.1 ZIC4-AS1 antisense +chr3 149766814 149766883 + ENSG00000281611.1 AC069216.1 miRNA +chr3 149779009 149779108 - ENSG00000251854.1 RNU6-507P snRNA +chr3 151079506 151079584 + ENSG00000276055.1 CLRN1-AS1 misc_RNA +chr3 157153548 157153640 + ENSG00000201778.1 Y_RNA misc_RNA +chr3 169483709 169483946 + ENSG00000281137.1 AC074033.1 protein_coding +chr3 169764610 169765047 - ENSG00000277925.1 Telomerase-vert misc_RNA +chr3 169793495 169793966 + ENSG00000270135.1 RP11-362K14.7 antisense +chr3 169794962 169796213 + ENSG00000270096.1 RP11-362K14.6 antisense +chr3 169945987 169946754 - ENSG00000244193.1 RP11-379K17.5 transcribed_processed_pseudogene +chr3 170087810 170089590 + ENSG00000268220.1 RP11-379K17.12 antisense +chr3 170496131 170496263 - ENSG00000281375.1 AC008041.1 miRNA +chr3 179396961 179399191 + ENSG00000242539.2 AC007620.3 antisense +chr3 181610498 181610729 + ENSG00000276074.1 SOX2OT_exon1 misc_RNA +chr3 181699608 181699883 + ENSG00000276690.1 SOX2OT_exon3 misc_RNA +chr3 181699705 181699783 + ENSG00000281596.1 AC117415.1 miRNA +chr3 183453814 183453944 + ENSG00000199363.1 SNORA63 snoRNA +chr3 183802475 183802553 - ENSG00000216166.1 AC131160.1 miRNA +chr3 186784796 186784864 + ENSG00000238942.1 SNORD2 snoRNA 
+chr3 186786323 186786445 + ENSG00000200418.1 SNORA63 snoRNA +chr3 186786672 186786777 + ENSG00000281017.1 MIR1248 miRNA +chr3 186786675 186786852 + ENSG00000221420.2 SNORA81 snoRNA +chr3 186787300 186787431 + ENSG00000200320.1 SNORA63 snoRNA +chr3 186787612 186787749 + ENSG00000263776.1 SNORA4 snoRNA +chr3 190659216 190659750 + ENSG00000273370.1 RP11-268E23.2 lincRNA +chr3 195658096 195685904 + ENSG00000215837.7 SDHAP2 transcribed_unprocessed_pseudogene +chr3 195688008 195688120 + ENSG00000276635.1 AC233280.3 miRNA +chr3 197674496 197674576 - ENSG00000216042.1 MIR922 miRNA +chr4 1986384 1986477 - ENSG00000216105.1 MIR943 miRNA +chr4 2250077 2250156 - ENSG00000265080.1 MIR4800 miRNA +chr4 13627442 13627591 + ENSG00000281620.1 AC006445.1 protein_coding +chr4 16256308 16256555 - ENSG00000242358.1 RPS21P4 processed_pseudogene +chr4 38089944 38090036 - ENSG00000280678.1 AC108933.1 miRNA +chr4 44448005 44448110 + ENSG00000266496.1 AC131951.1 miRNA +chr4 44704405 44704965 + ENSG00000272936.1 RP11-700J17.2 antisense +chr4 56097422 56097481 + ENSG00000280464.1 AC092627.1 miRNA +chr4 56794354 56794438 - ENSG00000281241.1 AC022483.1 miRNA +chr4 69215908 69216635 - ENSG00000268803.1 RP11-704M14.2 unprocessed_pseudogene +chr4 88275205 88275308 - ENSG00000200469.1 RNU6-112P snRNA +chr4 88521573 88521789 - ENSG00000255072.1 PIGY protein_coding +chr4 88710147 88710265 + ENSG00000278151.1 FAM13A-AS1_1 misc_RNA +chr4 98929914 98929993 - ENSG00000238449.2 AC019131.1 miRNA +chr4 104490876 104490954 + ENSG00000272082.1 AC093628.1 miRNA +chr4 105679050 105680094 - ENSG00000248778.1 RP11-311D14.1 processed_pseudogene +chr4 108789200 108789262 - ENSG00000265522.1 AC097473.1 miRNA +chr4 118279190 118279320 + ENSG00000275994.1 SNORA24 snoRNA +chr4 122827014 122827090 + ENSG00000253069.1 AC021205.1 miRNA +chr4 135371590 135371681 - ENSG00000207849.2 AC108867.1 miRNA +chr4 137601899 137602433 - ENSG00000280352.1 RP13-884E18.3 TEC +chr4 143555082 143555583 - ENSG00000248924.2 RP11-481K16.2 
transcribed_processed_pseudogene +chr4 144739762 144739941 - ENSG00000273621.1 uc_338 misc_RNA +chr4 146639261 146639343 + ENSG00000264323.1 AC093887.1 miRNA +chr4 151103827 151103891 + ENSG00000208797.1 SNORD73A snoRNA +chr4 152536428 152536516 + ENSG00000277685.1 MIR4453 miRNA +chr4 158199105 158200442 - ENSG00000250604.1 RP11-597D13.8 antisense +chr4 183318194 183320774 - ENSG00000177300.6 CLDN22 protein_coding +chr4 190065233 190065914 + ENSG00000277162.1 DBET processed_pseudogene +chr4 190175141 190175224 + ENSG00000274222.1 AC215524.1 miRNA +chr5 3596211 3600188 - ENSG00000259603.1 CTD-2012M11.3 antisense +chr5 8460925 8460999 + ENSG00000273868.1 MIR4458 miRNA +chr5 10195187 10197622 - ENSG00000271998.1 CTD-2199O4.7 lincRNA +chr5 10652211 10655925 - ENSG00000251196.1 RP11-54F2.1 unprocessed_pseudogene +chr5 18958153 18958237 - ENSG00000281138.1 AC114981.1 miRNA +chr5 31249879 31250200 + ENSG00000250482.2 RP11-152K4.1 processed_pseudogene +chr5 32379407 32379467 - ENSG00000222961.1 AC008949.1 miRNA +chr5 38557502 38557561 - ENSG00000265304.1 MIR3650 miRNA +chr5 42806394 42806997 + ENSG00000272234.1 CTD-2325A15.5 antisense +chr5 52903908 52904478 + ENSG00000241809.1 CTD-2207L17.1 processed_pseudogene +chr5 55936143 55941727 + ENSG00000262211.1 CTD-2031P19.5 antisense +chr5 57481820 57481903 - ENSG00000264748.1 AC025470.1 miRNA +chr5 60487713 60487929 + ENSG00000273701.1 PART1_1 misc_RNA +chr5 60488078 60488327 + ENSG00000275634.1 PART1_2 misc_RNA +chr5 60546219 60546349 + ENSG00000276233.1 PART1_3 misc_RNA +chr5 69160808 69160939 + ENSG00000280894.1 SNORA76 snoRNA +chr5 73446357 73446984 + ENSG00000247993.2 FOXD1-AS1 lincRNA +chr5 74779309 74779413 + ENSG00000199645.1 RNU6-1330P snRNA +chr5 75598482 75599380 - ENSG00000248881.1 CTC-366B18.2 antisense +chr5 77073881 77074520 + ENSG00000250615.1 CTC-564N23.2 antisense +chr5 88666853 88666939 - ENSG00000273878.1 MIR9-2 miRNA +chr5 88692651 88692859 + ENSG00000250555.2 CTC-467M3.2 antisense +chr5 91313057 91314402 
- ENSG00000271762.1 RP11-213H15.4 lincRNA +chr5 91844022 91844116 - ENSG00000276426.1 uc_338 misc_RNA +chr5 92405941 92406044 - ENSG00000264489.2 AC120120.1 miRNA +chr5 93620696 93620788 - ENSG00000251725.1 MIR2277 miRNA +chr5 100050365 100050438 + ENSG00000264839.1 AC113407.1 miRNA +chr5 122478662 122478729 + ENSG00000277169.1 AC022101.1 miRNA +chr5 131427266 131635030 - ENSG00000281164.1 RAPGEF6 protein_coding +chr5 132488382 132488702 + ENSG00000277192.1 RP11-89G4.1 antisense +chr5 134928039 134928112 + ENSG00000263963.1 MIR4461 miRNA +chr5 136080497 136080597 - ENSG00000278815.1 VTRNA2-1 misc_RNA +chr5 136129516 136129781 + ENSG00000277859.1 SMAD5-AS1_1 misc_RNA +chr5 136133696 136133826 + ENSG00000275646.1 SMAD5-AS1_2 misc_RNA +chr5 136133912 136134035 + ENSG00000277524.1 SMAD5-AS1_3 misc_RNA +chr5 136134082 136134248 + ENSG00000274934.1 SMAD5-AS1_4 misc_RNA +chr5 138347027 138349641 - ENSG00000249971.1 RP11-256P1.1 antisense +chr5 139364677 139369717 - ENSG00000272742.1 CTB-43P18.1 antisense +chr5 140563671 140563751 - ENSG00000274910.1 MIR6831 miRNA +chr5 140670794 140673586 - ENSG00000256453.1 DND1 protein_coding +chr5 140849105 140849696 - ENSG00000278907.1 AC005609.19 antisense +chr5 140861224 140863521 + ENSG00000249504.3 PCDHA14 transcribed_unprocessed_pseudogene +chr5 140867513 140867959 - ENSG00000278946.1 AC005609.20 antisense +chr5 140875346 140875922 - ENSG00000278915.1 AC005609.18 antisense +chr5 141350109 141350662 - ENSG00000280026.1 AC005618.8 antisense +chr5 141427295 141427752 - ENSG00000279855.1 AC005618.9 antisense +chr5 141479535 141479617 + ENSG00000281646.1 AC008781.1 miRNA +chr5 142317830 142317920 + ENSG00000277968.1 SPRY4-IT1_1 misc_RNA +chr5 142318047 142318167 + ENSG00000274721.1 SPRY4-IT1_2 misc_RNA +chr5 149425771 149428289 - ENSG00000275871.1 RP11-394O4.6 lincRNA +chr5 149430646 149430733 + ENSG00000276365.1 MIR145 miRNA +chr5 150670658 150672390 - ENSG00000250309.2 CTC-345K18.2 antisense +chr5 155845469 155845527 + 
ENSG00000280546.1 AC140677.1 miRNA +chr5 160485352 160485450 + ENSG00000277727.1 MIR146A miRNA +chr5 164601002 164601452 - ENSG00000253600.1 CTC-340A15.1 processed_pseudogene +chr5 168552277 168553727 - ENSG00000253861.1 SLC2A3P1 processed_pseudogene +chr5 168791498 168791610 + ENSG00000202345.1 Y_RNA misc_RNA +chr5 170767384 170767451 + ENSG00000276554.1 AC008514.1 miRNA +chr5 178884715 178884879 + ENSG00000206624.1 RNU1-39P snRNA +chr5 179082680 179083194 - ENSG00000253144.1 RP11-281O15.7 processed_pseudogene +chr5 181222566 181222633 - ENSG00000264732.1 MIR4638 miRNA +chr5 181224646 181230685 - ENSG00000247049.2 CTC-338M12.7 antisense +chr5 181241814 181241892 - ENSG00000272296.1 SNORD96A snoRNA +chr5 181243312 181243379 - ENSG00000264549.1 SNORD95 snoRNA +chr6 1390314 1390411 - ENSG00000275859.1 MIR6720 miRNA +chr6 3023142 3023772 - ENSG00000271361.1 HTATSF1P2 processed_pseudogene +chr6 3138394 3153062 - ENSG00000228170.1 RP1-40E16.11 antisense +chr6 6347081 6347381 - ENSG00000241216.1 SNAPC5P1 transcribed_processed_pseudogene +chr6 8653558 8653797 + ENSG00000276019.1 HULC misc_RNA +chr6 13614111 13615155 - ENSG00000261071.1 RP1-223E5.4 antisense +chr6 16301693 16301824 - ENSG00000281447.1 AL009031.1 miRNA +chr6 20421686 20421749 - ENSG00000281428.1 AL136303.2 miRNA +chr6 24599052 24599184 - ENSG00000280713.1 AL512385.1 miRNA +chr6 24839967 24840065 - ENSG00000263391.1 AL512428.1 miRNA +chr6 25732635 25732774 - ENSG00000280964.1 AL512384.1 miRNA +chr6 26124436 26124576 - ENSG00000281052.1 U91328.1 miRNA +chr6 26198861 26198997 + ENSG00000281823.1 AL031777.2 miRNA +chr6 26217220 26217357 - ENSG00000280547.1 AL031777.1 miRNA +chr6 27808413 27808553 - ENSG00000281258.1 AL009179.1 miRNA +chr6 27814397 27814537 + ENSG00000280826.1 AL049822.1 miRNA +chr6 27837947 27838339 - ENSG00000275221.1 HIST1H2AK protein_coding +chr6 27837957 27838094 + ENSG00000281435.1 Z98744.1 miRNA +chr6 27865600 27865738 - ENSG00000281512.1 Z98744.2 miRNA +chr6 27892767 27892904 + 
ENSG00000281851.1 Z98744.3 miRNA +chr6 28977613 28977709 + ENSG00000280628.1 AL662791.1 miRNA +chr6 29396700 29397623 + ENSG00000168787.6 OR12D2 protein_coding +chr6 29440016 29440954 + ENSG00000279941.1 OR10C1 protein_coding +chr6 29726669 29727139 - ENSG00000239257.1 RPL23AP1 transcribed_processed_pseudogene +chr6 30035916 30035983 + ENSG00000278104.1 ZNRD1-AS1_1 misc_RNA +chr6 30058115 30058190 + ENSG00000275856.1 ZNRD1-AS1_2 misc_RNA +chr6 30061080 30061183 + ENSG00000278773.1 ZNRD1-AS1_3 misc_RNA +chr6 30684796 30684892 + ENSG00000277346.1 AL662797.1 miRNA +chr6 30890883 30890972 + ENSG00000264594.1 MIR4640 miRNA +chr6 31532757 31532869 - ENSG00000276877.1 AL662801.1 miRNA +chr6 31541101 31541178 - ENSG00000265236.1 SNORD84 snoRNA +chr6 31658329 31660721 + ENSG00000227198.1 C6orf47-AS1 antisense +chr6 31837076 31837142 + ENSG00000201754.1 SNORD52 snoRNA +chr6 31956839 31956940 - ENSG00000221267.1 MIR1236 miRNA +chr6 32936916 32937010 - ENSG00000212066.1 AL645941.1 miRNA +chr6 32972065 32972853 - ENSG00000263756.1 XXbac-BPG181M17.6 antisense +chr6 33290245 33290325 + ENSG00000275010.1 MIR6834 miRNA +chr6 33698128 33698234 + ENSG00000266509.1 MIR3934 miRNA +chr6 34873831 34873927 + ENSG00000252106.2 RNY3P15 misc_RNA +chr6 40392730 40392874 + ENSG00000280824.1 AL591063.1 miRNA +chr6 41787662 41789898 + ENSG00000214736.6 TOMM6 protein_coding +chr6 41791410 41791477 + ENSG00000268745.1 RP11-298J23.9 antisense +chr6 42155426 42163439 + ENSG00000214732.2 RP1-139D8.6 protein_coding +chr6 43213801 43223860 - ENSG00000245261.1 RP3-330M21.5 antisense +chr6 43770429 43770616 - ENSG00000272114.1 RP1-261G23.7 antisense +chr6 44254206 44254285 - ENSG00000265700.1 MIR4647 miRNA +chr6 53090961 53091257 + ENSG00000242865.3 RN7SL244P misc_RNA +chr6 56432379 56432442 - ENSG00000266793.1 AL137008.1 miRNA +chr6 71585089 71585201 + ENSG00000211530.1 AL354933.1 miRNA +chr6 73263008 73263084 + ENSG00000263378.1 AC019205.1 miRNA +chr6 85677294 85677368 - ENSG00000281147.1 SNORD50A 
snoRNA +chr6 85677589 85677658 - ENSG00000275072.1 SNORD50B snoRNA +chr6 88276125 88276208 - ENSG00000281199.1 AL139042.1 miRNA +chr6 110440082 110440165 - ENSG00000281088.1 AC002464.1 miRNA +chr6 112361848 112361939 - ENSG00000266485.1 AL365214.1 miRNA +chr6 113428797 113428886 + ENSG00000280723.1 AL021326.1 miRNA +chr6 116244187 116244728 - ENSG00000236326.1 RP3-486I3.5 antisense +chr6 116457732 116457822 - ENSG00000265516.1 Z84488.1 miRNA +chr6 125979812 125979934 - ENSG00000251920.1 RNA5SP216 rRNA +chr6 136034553 136034886 - ENSG00000213111.5 COX5BP2 transcribed_processed_pseudogene +chr6 136343193 136343249 - ENSG00000276943.1 AL023284.1 miRNA +chr6 144007227 144007464 + ENSG00000276680.1 HYMAI misc_RNA +chr6 145736911 145737173 - ENSG00000270828.1 RP3-466P17.2 lincRNA +chr6 158609707 158609790 + ENSG00000278571.1 MIR7161 miRNA +chr6 158764661 158764753 - ENSG00000265558.1 MIR3918 miRNA +chr6 159785594 159785733 - ENSG00000206910.1 SNORA29 snoRNA +chr6 166099853 166099924 + ENSG00000276643.1 SNORD45 snoRNA +chr7 1055360 1059261 - ENSG00000257607.1 RP11-449P15.1 antisense +chr7 2257515 2257577 - ENSG00000277102.1 MIR6836 miRNA +chr7 4788565 4788639 - ENSG00000264474.1 MIR4656 miRNA +chr7 5528103 5528186 - ENSG00000263900.1 AC006483.1 miRNA +chr7 6578565 6588974 - ENSG00000232581.1 AC079742.4 antisense +chr7 6691977 6692059 + ENSG00000265245.1 AC073343.1 miRNA +chr7 7742091 7742535 - ENSG00000269721.1 RPL23AP51 transcribed_processed_pseudogene +chr7 9084022 9084175 + ENSG00000271526.1 RP4-668E10.2 processed_pseudogene +chr7 12654179 12654985 + ENSG00000229233.1 CTD-2320J21.2 sense_overlapping +chr7 19719388 19719451 + ENSG00000221576.2 AC004543.1 miRNA +chr7 27096124 27096248 + ENSG00000276528.1 HOTAIRM1_1 misc_RNA +chr7 27098900 27099114 + ENSG00000276771.1 HOTAIRM1_2 misc_RNA +chr7 27099778 27099836 + ENSG00000274864.1 HOTAIRM1_3 misc_RNA +chr7 27099856 27099957 + ENSG00000274396.1 HOTAIRM1_4 misc_RNA +chr7 27099967 27100111 + ENSG00000277694.1 HOTAIRM1_5 
misc_RNA +chr7 27169480 27169564 - ENSG00000207584.1 MIR196B miRNA +chr7 27185433 27185530 + ENSG00000273961.1 HOXA11-AS1_1 misc_RNA +chr7 27185832 27186018 + ENSG00000278334.1 HOXA11-AS1_2 misc_RNA +chr7 27186166 27186263 + ENSG00000276496.1 HOXA11-AS1_3 misc_RNA +chr7 27188816 27188994 + ENSG00000278020.1 HOXA11-AS1_6 misc_RNA +chr7 27200465 27200521 + ENSG00000276609.1 HOTTIP_1 misc_RNA +chr7 27201844 27202219 + ENSG00000278708.1 HOTTIP_2 misc_RNA +chr7 27202302 27202638 + ENSG00000277469.1 HOTTIP_3 misc_RNA +chr7 27206139 27206303 + ENSG00000277553.1 HOTTIP_4 misc_RNA +chr7 30157531 30159534 + ENSG00000251660.1 AC007036.5 sense_overlapping +chr7 30424672 30425412 + ENSG00000272638.1 GS1-114I9.1 antisense +chr7 30550217 30551569 + ENSG00000263683.1 RP4-777O23.1 lincRNA +chr7 38655893 38656025 - ENSG00000281438.1 AC011309.1 miRNA +chr7 39609717 39610280 + ENSG00000106540.4 AC004837.3 processed_pseudogene +chr7 40128121 40128232 - ENSG00000199273.1 Y_RNA misc_RNA +chr7 42918741 42920084 + ENSG00000234983.1 AC010132.11 processed_pseudogene +chr7 44051766 44051829 + ENSG00000274083.1 MIR6837 miRNA +chr7 44062727 44065587 - ENSG00000164708.5 PGAM2 protein_coding +chr7 44064908 44066079 + ENSG00000239775.1 AC017116.11 sense_overlapping +chr7 44110849 44110912 + ENSG00000264652.1 MIR4649 miRNA +chr7 44367176 44367270 + ENSG00000280507.1 AC004453.1 miRNA +chr7 44881748 44881800 - ENSG00000264326.1 MIR4657 miRNA +chr7 44985378 44985510 - ENSG00000277184.1 SNORA9 snoRNA +chr7 45105968 45106099 - ENSG00000200656.1 SNORA5B snoRNA +chr7 63349154 63350059 - ENSG00000227545.1 RP5-905H7.10 processed_pseudogene +chr7 63398046 63398957 + ENSG00000234467.1 SLC25A1P2 processed_pseudogene +chr7 65038372 65038565 + ENSG00000239985.2 RP11-460N20.3 unprocessed_pseudogene +chr7 66980335 66980409 + ENSG00000280772.1 AC079920.1 miRNA +chr7 75237293 75237405 - ENSG00000275121.1 CH17-232I21.1 processed_pseudogene +chr7 75474707 75486108 - ENSG00000242073.2 AC006014.7 
transcribed_unprocessed_pseudogene +chr7 75915197 75915269 + ENSG00000265020.1 MIR4651 miRNA +chr7 76968346 76968518 + ENSG00000236280.1 AC114737.3 processed_pseudogene +chr7 84939349 84940245 - ENSG00000232019.1 AC074183.4 lincRNA +chr7 87345082 87345233 + ENSG00000273623.1 TP53TG1_1 misc_RNA +chr7 87345305 87345486 + ENSG00000276185.1 TP53TG1_2 misc_RNA +chr7 90403434 90513391 + ENSG00000273299.1 CTB-13L3.1 processed_transcript +chr7 95156606 95156671 - ENSG00000277296.1 AC002429.1 miRNA +chr7 96965302 96965481 + ENSG00000274409.1 Evf1_1 misc_RNA +chr7 96965489 96965695 + ENSG00000277124.1 Evf1_2 misc_RNA +chr7 100352360 100353692 + ENSG00000235333.3 PVRIG2P transcribed_unprocessed_pseudogene +chr7 100356651 100356721 + ENSG00000278005.1 MIR6840 miRNA +chr7 100868036 100868107 + ENSG00000273985.1 MIR6875 miRNA +chr7 101058299 101058567 + ENSG00000222636.1 RN7SKP54 misc_RNA +chr7 102465742 102465826 + ENSG00000266715.1 MIR5090 miRNA +chr7 102471469 102471531 + ENSG00000264471.1 MIR4467 miRNA +chr7 105014152 105014199 - ENSG00000280574.1 AC007384.1 miRNA +chr7 106781600 106781715 - ENSG00000251978.1 RNA5SP236 rRNA +chr7 112288623 112288952 + ENSG00000202406.1 RN7SKP187 misc_RNA +chr7 114629855 114629956 + ENSG00000266229.1 AC020606.1 miRNA +chr7 116909877 116909964 - ENSG00000252115.1 Y_RNA misc_RNA +chr7 116952698 116952813 + ENSG00000275359.1 ST7-AS1_1 misc_RNA +chr7 116953214 116953324 + ENSG00000277583.1 ST7-AS1_2 misc_RNA +chr7 116954480 116954679 + ENSG00000273596.1 ST7-OT4_1 misc_RNA +chr7 116956431 116956723 + ENSG00000276100.1 ST7-OT4_3 misc_RNA +chr7 116959593 116959805 + ENSG00000274606.1 ST7-OT4_4 misc_RNA +chr7 117020211 117020319 + ENSG00000275870.1 MIR6132 miRNA +chr7 117117912 117117983 + ENSG00000274344.1 ST7-AS2_1 misc_RNA +chr7 117184126 117184199 + ENSG00000275054.1 ST7-OT3_1 misc_RNA +chr7 117191647 117191761 + ENSG00000278482.1 ST7-OT3_3 misc_RNA +chr7 121050927 121051203 + ENSG00000234927.1 HMGN1P18 processed_pseudogene +chr7 128306649 
128307678 + ENSG00000242261.1 RP11-62J1.3 processed_pseudogene +chr7 128466563 128469171 + ENSG00000272601.1 RP11-155G14.5 transcribed_unprocessed_pseudogene +chr7 129756266 129756377 + ENSG00000212238.1 RNA5SP244 rRNA +chr7 130487483 130487622 + ENSG00000277269.1 MESTIT1_1 misc_RNA +chr7 130488304 130488538 + ENSG00000278055.1 MESTIT1_2 misc_RNA +chr7 130489868 130489966 + ENSG00000274500.1 MESTIT1_3 misc_RNA +chr7 130496111 130496204 + ENSG00000199043.1 MIR335 miRNA +chr7 130877459 130877539 - ENSG00000274250.1 MIR29B1 miRNA +chr7 135704537 135704841 + ENSG00000273219.1 RP11-644N4.1 antisense +chr7 135927274 135927450 - ENSG00000267697.1 LUZP6 protein_coding +chr7 136903167 136903294 + ENSG00000207597.1 MIR490 miRNA +chr7 138123758 138123821 + ENSG00000266193.1 MIR4468 miRNA +chr7 140645966 140646034 + ENSG00000280499.1 AC006452.2 miRNA +chr7 142797119 142797166 + ENSG00000211769.1 TRBJ2-5 TR_J_gene +chr7 142797239 142797291 + ENSG00000211770.1 TRBJ2-6 TR_J_gene +chr7 143959971 143960924 + ENSG00000279723.1 OR2F1 protein_coding +chr7 151238421 151238538 + ENSG00000211517.1 MIR671 miRNA +chr7 151240399 151240972 + ENSG00000272661.1 RP4-548D19.3 antisense +chr8 8582352 8582427 - ENSG00000281480.1 AC114550.1 miRNA +chr8 9903388 9903472 - ENSG00000275677.1 MIR124-1 miRNA +chr8 17498737 17499157 + ENSG00000280453.1 RP11-349F21.4 TEC +chr8 20290174 20290289 + ENSG00000274467.1 5S_rRNA rRNA +chr8 23225233 23230915 - ENSG00000250714.3 RP11-1149O23.4 antisense +chr8 24956621 24957110 + ENSG00000272157.1 CTD-2168K21.2 antisense +chr8 27610601 27610751 - ENSG00000273705.1 MIR6843 miRNA +chr8 27701673 27701767 - ENSG00000265075.1 MIR3622B miRNA +chr8 29067279 29068454 + ENSG00000259607.1 CTD-2647L4.5 antisense +chr8 33513475 33513578 + ENSG00000239039.1 SNORD13 snoRNA +chr8 37858949 37861333 + ENSG00000280064.1 RP11-205M5.3 TEC +chr8 38970360 38973011 - ENSG00000253645.1 CTD-2544N14.3 antisense +chr8 41609692 41621502 - ENSG00000264578.1 RP11-360L9.8 antisense +chr8 55745232 
55745342 + ENSG00000222955.1 RNA5SP265 rRNA +chr8 56073835 56073901 - ENSG00000238650.1 SNORD54 snoRNA +chr8 66922467 66922555 - ENSG00000254341.2 SNORD87 snoRNA +chr8 70480192 70480487 - ENSG00000275128.1 Metazoa_SRP misc_RNA +chr8 73982125 73982467 + ENSG00000244295.2 RPS20P21 processed_pseudogene +chr8 80484589 80484683 - ENSG00000277604.1 Y_RNA misc_RNA +chr8 86098965 86154225 - ENSG00000253675.1 CTD-3118D11.2 antisense +chr8 86657870 86657952 - ENSG00000221137.1 AC013751.1 miRNA +chr8 93733216 93734022 + ENSG00000253722.1 RP11-10N23.4 antisense +chr8 93916022 93916119 - ENSG00000276513.1 MIR378D2 miRNA +chr8 94791643 94793106 - ENSG00000254307.2 KB-1608C10.2 antisense +chr8 98042086 98042217 - ENSG00000207067.1 SNORA72 snoRNA +chr8 98192205 98192313 - ENSG00000252558.1 RNU6-914P snRNA +chr8 100702968 100703024 - ENSG00000277719.1 MIR7705 miRNA +chr8 101686547 101689093 + ENSG00000253629.1 KB-1107E3.1 antisense +chr8 103228425 103229314 - ENSG00000253851.1 RP11-318M2.3 antisense +chr8 109973943 109974781 + ENSG00000255402.1 RP11-696P8.2 antisense +chr8 116874728 116874800 - ENSG00000264875.1 MIR3610 miRNA +chr8 117164179 117164255 + ENSG00000281002.1 AC084114.1 miRNA +chr8 126073046 126073532 - ENSG00000253841.1 RP11-622O11.5 processed_pseudogene +chr8 126556896 126557433 + ENSG00000254010.1 RP11-103H7.5 antisense +chr8 127794541 127794734 + ENSG00000276443.1 PVT1_1 misc_RNA +chr8 127795962 127796028 + ENSG00000275264.1 MIR1204 miRNA +chr8 127890626 127890720 + ENSG00000278324.1 PVT1_3 misc_RNA +chr8 133229056 133229697 + ENSG00000270132.1 WISP1-OT1 sense_intronic +chr8 134598071 134598149 + ENSG00000276140.1 ZFAT-AS1_1 misc_RNA +chr8 134598318 134598518 + ENSG00000277732.1 ZFAT-AS1_2 misc_RNA +chr8 134600257 134600336 + ENSG00000278454.1 ZFAT-AS1_3 misc_RNA +chr8 142727283 142727690 - ENSG00000253806.1 CTD-2292P10.2 antisense +chr8 143542110 143542398 + ENSG00000275558.1 7SK misc_RNA +chr8 143579636 143580670 + ENSG00000254741.1 RP11-661A12.7 antisense +chr8 
143837756 143837816 - ENSG00000274094.1 MIR6845 miRNA +chr8 144079874 144079942 + ENSG00000276472.1 MIR6847 miRNA +chr8 144262673 144262737 - ENSG00000277158.1 MIR7112 miRNA +chr8 144314590 144315138 - ENSG00000254690.1 GS1-393G12.12 antisense +chr8 144394149 144394230 - ENSG00000216133.1 MIR939 miRNA +chr8 144400086 144400165 - ENSG00000266624.1 MIR1234 miRNA +chr8 144400277 144400345 - ENSG00000274683.1 MIR6849 miRNA +chr8 144463817 144465101 + ENSG00000254578.1 CTD-2517M22.16 antisense +chr8 144853413 144853522 + ENSG00000263640.1 AF235103.1 miRNA +chr8 144994886 144995540 + ENSG00000254618.1 TMED10P1 processed_pseudogene +chr8 145005561 145005906 - ENSG00000255530.1 RP5-1047A19.6 processed_pseudogene +chr9 30144 30281 + ENSG00000278579.1 MIR1302-2 miRNA +chr9 4834156 4860275 + ENSG00000281007.1 AL158147.2 protein_coding +chr9 4850299 4850373 - ENSG00000228165.1 RP11-125K10.5 antisense +chr9 30774228 30774319 + ENSG00000211510.2 AL590726.1 miRNA +chr9 33042109 33042216 - ENSG00000222169.1 AL162590.1 miRNA +chr9 35449748 35450352 + ENSG00000244213.1 ZFAND6P1 processed_pseudogene +chr9 35657754 35658017 - ENSG00000277027.1 RNase_MRP ribozyme +chr9 35752990 35756613 - ENSG00000215183.4 MSMP protein_coding +chr9 35756712 35757940 - ENSG00000228843.2 RP11-112J3.15 antisense +chr9 35811476 35811550 - ENSG00000263448.1 AL133410.1 miRNA +chr9 35859062 35859983 - ENSG00000236083.1 OR13E1P unitary_pseudogene +chr9 35957139 35958098 - ENSG00000122718.5 OR2S2 protein_coding +chr9 37434177 37434586 + ENSG00000236156.2 CHCHD4P3 processed_pseudogene +chr9 38542314 38542569 - ENSG00000259898.1 CYP4F33P processed_pseudogene +chr9 38542389 38543215 - ENSG00000272934.1 RP11-392E22.10 processed_transcript +chr9 38543104 38543446 - ENSG00000250989.1 RP11-392E22.5 processed_pseudogene +chr9 39173794 39174513 - ENSG00000243695.1 RP11-290L7.3 processed_pseudogene +chr9 41280460 41280787 + ENSG00000278647.1 RP11-4L24.2 transcribed_processed_pseudogene +chr9 69428248 69428721 - 
ENSG00000243888.1 RP11-548B3.3 3prime_overlapping_ncrna +chr9 70311607 70311701 - ENSG00000239180.1 Y_RNA misc_RNA +chr9 76785782 76786190 + ENSG00000277320.1 PCA3_2 misc_RNA +chr9 79726069 79726321 + ENSG00000240979.1 RP11-79D8.2 processed_pseudogene +chr9 83969748 83969857 - ENSG00000207603.1 MIR7-1 miRNA +chr9 89497938 89498021 + ENSG00000264913.1 AL590233.1 miRNA +chr9 92836826 92837668 - ENSG00000226721.2 EEF1DP2 processed_pseudogene +chr9 95086012 95086096 - ENSG00000207617.3 MIR3074 miRNA +chr9 97700234 97700325 + ENSG00000266608.1 AL445531.1 miRNA +chr9 100732033 100732116 + ENSG00000281312.1 AL390876.1 miRNA +chr9 116398157 116400606 - ENSG00000256040.2 PAPPA-AS1 antisense +chr9 121341347 121348132 + ENSG00000280315.1 AL161784.1 protein_coding +chr9 122628579 122629535 - ENSG00000171484.4 OR1B1 protein_coding +chr9 123111546 123111643 - ENSG00000274325.1 MIR600 miRNA +chr9 127690687 127690795 - ENSG00000264329.1 MIR3911 miRNA +chr9 127785833 127785923 + ENSG00000266070.1 MIR3960 miRNA +chr9 127785836 127785905 + ENSG00000281546.1 AL162586.1 miRNA +chr9 128260749 128260817 - ENSG00000274982.1 AL590708.3 miRNA +chr9 128391461 128392016 + ENSG00000272593.1 RP11-339B21.11 lincRNA +chr9 128392618 128392714 - ENSG00000273685.1 MIR219B miRNA +chr9 129040566 129040674 + ENSG00000280692.1 AL592211.1 miRNA +chr9 129175807 129177575 + ENSG00000268050.2 RP11-247A12.8 antisense +chr9 131497479 131500191 - ENSG00000176868.2 RP11-334J6.7 antisense +chr9 133338990 133339465 + ENSG00000230064.1 RP11-244N20.7 processed_pseudogene +chr9 133349396 133349470 + ENSG00000206611.1 SNORD24 snoRNA +chr9 133350095 133350168 + ENSG00000200831.1 SNORD36B snoRNA +chr9 133350456 133350528 + ENSG00000199744.1 SNORD36A snoRNA +chr9 136249978 136251205 + ENSG00000279813.1 CR392000.1 protein_coding +chr9 136726104 136726239 - ENSG00000280496.1 SNORA43 snoRNA +chr9 136726105 136726234 - ENSG00000276161.1 SNORA17 snoRNA +chr9 136726747 136726879 - ENSG00000274998.1 SNORA17 snoRNA +chr9 
136726748 136726879 - ENSG00000281808.1 SNORA17 snoRNA +chr9 137063535 137064581 + ENSG00000231864.2 RP11-229P13.23 antisense +chr9 137450026 137450086 - ENSG00000276682.1 MIR7114 miRNA +chr10 277184 277308 + ENSG00000264248.1 AL603831.1 miRNA +chr10 5524976 5525742 - ENSG00000256462.1 RP11-116G8.5 antisense +chr10 5861805 5862594 - ENSG00000240180.1 RP11-318E3.4 processed_pseudogene +chr10 22332587 22332981 - ENSG00000271981.1 RP11-573G6.8 lincRNA +chr10 29421476 29422012 - ENSG00000274985.1 PTCHD3P1 unprocessed_pseudogene +chr10 32346864 32346946 - ENSG00000222309.1 AL391839.1 miRNA +chr10 35641172 35641252 - ENSG00000264780.1 MIR4683 miRNA +chr10 37831889 37831971 + ENSG00000266800.1 AL135791.1 miRNA +chr10 42470082 42470268 - ENSG00000279239.1 RP11-178A10.3 transcribed_unprocessed_pseudogene +chr10 42832447 42832550 - ENSG00000252416.1 RNU6-885P snRNA +chr10 50393006 50394561 + ENSG00000279291.1 AC069547.1 protein_coding +chr10 50393006 50394561 - ENSG00000279027.1 RP11-521C22.3 TEC +chr10 72274915 72275980 - ENSG00000269926.1 RP11-442H21.2 antisense +chr10 77912280 77912362 - ENSG00000281632.1 AL391421.1 miRNA +chr10 79684494 79684983 - ENSG00000224886.2 RP11-119F19.4 transcribed_processed_pseudogene +chr10 79970963 79973213 - ENSG00000280259.1 RP11-479O17.9 TEC +chr10 80091411 80091867 - ENSG00000279399.1 RP11-369J21.12 TEC +chr10 86751611 86751772 - ENSG00000281735.1 Clostridiales-1 sRNA +chr10 86970237 86970826 - ENSG00000273413.1 RP11-96C23.15 antisense +chr10 87001636 87009905 + ENSG00000261011.1 RP11-96C23.11 transcribed_unprocessed_pseudogene +chr10 89644603 89645223 - ENSG00000249962.1 RP11-80H5.5 processed_pseudogene +chr10 89837612 89839334 - ENSG00000270670.1 RP11-248C1.3 processed_pseudogene +chr10 102409042 102409102 - ENSG00000275957.1 AL121928.1 miRNA +chr10 102483039 102483559 + ENSG00000273262.1 RP11-18I14.11 antisense +chr10 104312141 104313881 + ENSG00000270075.1 RP11-127L20.5 antisense +chr10 110898090 110898155 + ENSG00000265827.1 MIR4680 
miRNA +chr10 114174105 114174179 - ENSG00000238742.1 MIR2110 miRNA +chr10 127013501 127026475 - ENSG00000232935.2 RP11-223P11.2 antisense +chr10 129837505 129837794 + ENSG00000275327.1 RP11-234G16.4 antisense +chr11 203623 205470 + ENSG00000254559.1 RP11-304M2.5 antisense +chr11 209336 209406 + ENSG00000274298.1 MIR6743 miRNA +chr11 811681 811814 + ENSG00000199785.1 SNORA52 snoRNA +chr11 1242261 1249676 - ENSG00000255177.2 RP11-532E4.2 antisense +chr11 1996540 1996610 + ENSG00000275266.1 H19_1 misc_RNA +chr11 1996730 1996842 - ENSG00000278648.1 AC051649.1 miRNA +chr11 1997484 1997552 + ENSG00000274866.1 H19_2 misc_RNA +chr11 1997713 1997828 + ENSG00000280586.1 H19_3 misc_RNA +chr11 2129121 2129964 - ENSG00000240801.1 AC132217.4 3prime_overlapping_ncrna +chr11 2404515 2407908 + ENSG00000230483.1 AC124057.5 antisense +chr11 2652099 2652562 + ENSG00000275666.1 KCNQ1OT1_1 misc_RNA +chr11 2674097 2674292 + ENSG00000276105.1 KCNQ1OT1_2 misc_RNA +chr11 2674625 2674740 + ENSG00000276494.1 KCNQ1OT1_3 misc_RNA +chr11 2697851 2698076 + ENSG00000276015.1 KCNQ1OT1_5 misc_RNA +chr11 3856062 3856141 + ENSG00000263421.1 MIR4687 miRNA +chr11 4367351 4368295 - ENSG00000280253.1 OR52B4 protein_coding +chr11 4768979 4769917 - ENSG00000188069.4 OR51F1 protein_coding +chr11 4803433 4804380 - ENSG00000176937.9 OR52R1 protein_coding +chr11 4803433 4804380 - ENSG00000279270.1 OR52R1 polymorphic_pseudogene +chr11 4923374 4924339 - ENSG00000176879.3 OR51G1 protein_coding +chr11 5069572 5070461 + ENSG00000236621.3 OR52E1 pseudogene +chr11 5323359 5324297 - ENSG00000184881.3 OR51B2 protein_coding +chr11 6608667 6610135 - ENSG00000254641.1 RP11-732A19.2 sense_overlapping +chr11 6621451 6622322 + ENSG00000255390.1 RP11-732A19.5 antisense +chr11 6785038 6785988 + ENSG00000170803.5 OR2AG1 protein_coding +chr11 7927718 7928662 - ENSG00000175393.3 OR10A6 protein_coding +chr11 8684204 8684314 + ENSG00000280884.1 AC091053.1 miRNA +chr11 8684227 8684356 + ENSG00000200983.1 SNORA45A snoRNA +chr11 
8693357 8696607 + ENSG00000254665.1 RP11-152H18.3 antisense +chr11 9578028 9578131 + ENSG00000238387.1 snoU13 snoRNA +chr11 9958744 9959790 - ENSG00000254765.1 RP11-1H15.1 processed_pseudogene +chr11 10801467 10801608 - ENSG00000238622.1 SNORD97 snoRNA +chr11 11352426 11353307 + ENSG00000255351.1 RP11-567I13.1 antisense +chr11 12261426 12263173 - ENSG00000254680.1 RP11-265D17.2 antisense +chr11 15969533 15969621 - ENSG00000274140.1 MIR6073 miRNA +chr11 17075779 17075868 - ENSG00000201403.1 SNORD14B snoRNA +chr11 20119684 20120632 - ENSG00000254894.1 NAV2-AS1 antisense +chr11 32097143 32105091 - ENSG00000255252.3 RP1-65P5.3 antisense +chr11 32435738 32435846 + ENSG00000278822.1 WT1-AS_1 misc_RNA +chr11 32436797 32437030 + ENSG00000278045.1 WT1-AS_3 misc_RNA +chr11 32439086 32439206 + ENSG00000276530.1 WT1-AS_6 misc_RNA +chr11 32439716 32440009 + ENSG00000273908.1 WT1-AS_7 misc_RNA +chr11 32440109 32440383 + ENSG00000277119.1 WT1-AS_8 misc_RNA +chr11 33190062 33190503 - ENSG00000241950.1 RPL29P23 processed_pseudogene +chr11 33354465 33354545 + ENSG00000223134.1 AL122015.1 miRNA +chr11 35860654 35860744 - ENSG00000266590.1 AC090692.1 miRNA +chr11 45905941 45906461 - ENSG00000255498.1 RP11-618K13.2 antisense +chr11 47577725 47578277 + ENSG00000231880.2 RP11-76I23.7 antisense +chr11 48245104 48246015 + ENSG00000279556.1 OR4X2 protein_coding +chr11 48263861 48264778 + ENSG00000176567.1 OR4X1 protein_coding +chr11 48263861 48264778 + ENSG00000279260.1 OR4X1 polymorphic_pseudogene +chr11 55572128 55573060 + ENSG00000181935.3 OR4C16 protein_coding +chr11 55572128 55573060 + ENSG00000279514.1 OR4C16 polymorphic_pseudogene +chr11 55773438 55774382 + ENSG00000198877.1 OR5D13 protein_coding +chr11 55773438 55774382 + ENSG00000279761.1 OR5D13 polymorphic_pseudogene +chr11 55811467 55812402 + ENSG00000186117.3 OR5L1 protein_coding +chr11 56318307 56319245 + ENSG00000181689.1 OR8K3 protein_coding +chr11 56318307 56319245 + ENSG00000280314.1 OR8K3 polymorphic_pseudogene +chr11 
56412744 56413668 + ENSG00000181395.6 OR5AL1 pseudogene +chr11 56417258 56418232 - ENSG00000174942.1 OR5R1 protein_coding +chr11 56417258 56418232 - ENSG00000279961.1 OR5R1 polymorphic_pseudogene +chr11 56663686 56664618 + ENSG00000279911.1 OR5AR1 protein_coding +chr11 57476493 57477534 - ENSG00000255301.1 RP11-624G17.3 antisense +chr11 58030953 58031906 + ENSG00000172381.4 OR6Q1 protein_coding +chr11 58214745 58215722 + ENSG00000172774.7 OR1S1 protein_coding +chr11 61792495 61792561 - ENSG00000207601.1 MIR611 miRNA +chr11 61815161 61815240 - ENSG00000222326.1 MIR1908 miRNA +chr11 62372656 62372719 + ENSG00000265696.1 AP003064.2 miRNA +chr11 62391516 62393372 - ENSG00000255126.1 CTD-2531D15.5 antisense +chr11 62665422 62665570 + ENSG00000206597.1 SNORA57 snoRNA +chr11 62789815 62789885 + ENSG00000274856.1 MIR6748 miRNA +chr11 62841619 62841809 - ENSG00000222328.1 RNU2-2P snRNA +chr11 62852910 62853035 - ENSG00000277194.1 SNORD22 snoRNA +chr11 62854161 62854285 - ENSG00000278527.1 SNORD22 snoRNA +chr11 62854621 62854695 - ENSG00000274544.1 SNORD28 snoRNA +chr11 62855012 62855083 - ENSG00000275996.1 SNORD27 snoRNA +chr11 62855292 62855366 - ENSG00000276788.1 SNORD26 snoRNA +chr11 62855564 62855632 - ENSG00000275043.1 SNORD25 snoRNA +chr11 64313860 64313950 + ENSG00000278148.1 AP001453.1 miRNA +chr11 64891132 64891243 - ENSG00000207648.2 MIR192 miRNA +chr11 64891355 64891439 - ENSG00000277225.1 MIR194-2 miRNA +chr11 65117157 65117458 + ENSG00000255173.1 AP003068.12 antisense +chr11 65129916 65129978 - ENSG00000277547.1 MIR6751 miRNA +chr11 65175035 65175502 - ENSG00000249251.1 PGAM1P8 processed_pseudogene +chr11 65423273 65423392 + ENSG00000278144.1 NEAT1_1 misc_RNA +chr11 65423638 65423742 + ENSG00000278050.1 NEAT1_2 misc_RNA +chr11 65423960 65424118 + ENSG00000277599.1 NEAT1_3 misc_RNA +chr11 65444458 65444557 + ENSG00000273834.1 MIR612 miRNA +chr11 65498010 65498405 - ENSG00000270117.1 AP000769.7 lincRNA +chr11 65502034 65503622 - ENSG00000279576.1 MALAT1 
protein_coding +chr11 65502914 65503008 + ENSG00000278217.1 MALAT1 misc_RNA +chr11 65506117 65506173 + ENSG00000274072.1 mascRNA-menRNA sRNA +chr11 65795946 65797219 - ENSG00000255404.1 RP11-770G2.5 antisense +chr11 66276779 66277492 - ENSG00000254452.1 RP11-867G23.4 antisense +chr11 67431367 67435399 - ENSG00000255949.1 AP003419.16 antisense +chr11 67435510 67438067 - ENSG00000213402.2 PTPRCAP protein_coding +chr11 68032864 68032922 + ENSG00000277703.1 MIR7113 miRNA +chr11 68033897 68033981 + ENSG00000266737.1 MIR4691 miRNA +chr11 69671952 69672075 + ENSG00000281781.1 AP001888.1 protein_coding +chr11 70187788 70188509 - ENSG00000254902.1 ANO1-AS1 antisense +chr11 70206291 70207390 - ENSG00000254721.1 RP11-805J14.5 antisense +chr11 73395559 73396436 - ENSG00000256928.1 RP11-809N8.2 antisense +chr11 74455348 74456825 + ENSG00000254928.1 RP11-702H23.6 antisense +chr11 75264289 75265170 + ENSG00000254963.1 CTD-2562J17.9 antisense +chr11 75742270 75742331 + ENSG00000281528.1 AP001922.1 miRNA +chr11 76404140 76404252 - ENSG00000201756.1 Y_RNA misc_RNA +chr11 78015715 78016495 + ENSG00000254829.1 RP11-7I15.3 antisense +chr11 78016971 78079865 - ENSG00000259112.1 NDUFC2-KCTD14 protein_coding +chr11 82957423 82958216 + ENSG00000242279.1 RP11-659G9.1 processed_pseudogene +chr11 83185521 83187036 - ENSG00000269939.1 RP11-727A23.11 lincRNA +chr11 83213005 83213404 + ENSG00000241020.1 RP11-727A23.1 processed_pseudogene +chr11 86278333 86278398 + ENSG00000273630.1 MIR6755 miRNA +chr11 93721542 93721621 + ENSG00000275146.1 snoU2_19 snoRNA +chr11 93730513 93730646 - ENSG00000207112.1 SNORA25 snoRNA +chr11 93730979 93731099 - ENSG00000206799.1 SNORA32 snoRNA +chr11 93732004 93732133 - ENSG00000206834.1 SNORA1 snoRNA +chr11 93732361 93732499 - ENSG00000207304.1 SNORA8 snoRNA +chr11 93732435 93732501 - ENSG00000281293.1 AP001273.1 miRNA +chr11 93733228 93733300 - ENSG00000239195.1 SNORD5 snoRNA +chr11 93733466 93733597 - ENSG00000207145.1 SNORA18 snoRNA +chr11 93733674 93733764 - 
ENSG00000221170.1 MIR1304 miRNA +chr11 93735111 93735236 - ENSG00000210825.1 SNORA40 snoRNA +chr11 96341427 96341534 + ENSG00000266192.2 MIR1260B miRNA +chr11 102229851 102230922 - ENSG00000254422.1 RP11-864G5.3 antisense +chr11 112218326 112218720 + ENSG00000243930.1 RPS12P21 processed_pseudogene +chr11 113789242 113790027 + ENSG00000255870.1 RP11-667M19.5 processed_pseudogene +chr11 114312063 114312207 + ENSG00000280653.1 AP002518.1 miRNA +chr11 117820163 117876667 - ENSG00000255245.3 FXYD6-FXYD2 protein_coding +chr11 118644000 118644079 + ENSG00000275466.1 MIR6716 miRNA +chr11 118910708 118910787 + ENSG00000264211.1 MIR4492 miRNA +chr11 119018944 119019012 + ENSG00000266398.1 MIR3656 miRNA +chr11 119095051 119095191 + ENSG00000281272.1 AP003391.1 miRNA +chr11 119312950 119313012 - ENSG00000277325.1 MIR6756 miRNA +chr11 119336249 119337309 - ENSG00000245385.2 RP11-334E6.10 antisense +chr11 119338942 119340883 - ENSG00000223953.4 C1QTNF5 protein_coding +chr11 119417951 119419114 + ENSG00000263873.1 RP11-334E6.12 sense_intronic +chr11 119821829 119821927 - ENSG00000276827.1 AP001994.1 miRNA +chr11 120986258 120986353 + ENSG00000281726.1 GRIK4_3p_UTR misc_RNA +chr11 121055207 121055275 - ENSG00000281492.1 AP000646.1 miRNA +chr11 123058909 123058995 - ENSG00000207118.1 SNORD14D snoRNA +chr11 123059335 123059422 - ENSG00000202252.1 SNORD14C snoRNA +chr11 124319262 124320197 - ENSG00000197263.3 OR8D2 protein_coding +chr11 124423942 124424871 - ENSG00000198657.5 OR8B4 protein_coding +chr11 124636459 124636592 + ENSG00000200278.1 RNA5SP352 rRNA +chr11 124636491 124636552 + ENSG00000280773.1 AP001524.1 miRNA +chr11 124883691 124887789 + ENSG00000254568.1 RP11-664I21.5 antisense +chr11 124919244 124920677 + ENSG00000221932.6 HEPN1 protein_coding +chr11 125957900 125958652 + ENSG00000264299.1 RP11-680F20.12 antisense +chr11 130121672 130121780 - ENSG00000277300.1 AP003041.1 miRNA +chr11 133898504 133898581 - ENSG00000264919.1 MIR4697 miRNA +chr12 3726222 3726327 - 
ENSG00000222338.1 RNU6-174P snRNA +chr12 4809176 4813412 + ENSG00000151079.7 KCNA6 protein_coding +chr12 6510275 6510522 + ENSG00000276232.1 SCARNA10 sense_intronic +chr12 6537794 6538370 - ENSG00000269968.1 RP5-940J5.9 antisense +chr12 6943816 6943878 + ENSG00000238923.1 RNU7-1 snRNA +chr12 7115736 7116486 - ENSG00000276144.1 ABC12-49244600F4.4 lincRNA +chr12 9127871 9127985 + ENSG00000211542.1 AC007436.1 miRNA +chr12 9695384 9696227 + ENSG00000213443.2 RP11-75L1.2 processed_pseudogene +chr12 10214161 10214761 - ENSG00000255958.1 RP11-656E20.5 antisense +chr12 12915829 12915918 + ENSG00000207817.1 MIR614 miRNA +chr12 13540231 13544540 + ENSG00000277935.1 RP11-4N23.4 lincRNA +chr12 14774716 14774860 - ENSG00000281457.1 AC010168.1 miRNA +chr12 22195469 22195629 - ENSG00000212172.1 RNU1-149P snRNA +chr12 27798641 27800708 - ENSG00000256512.1 RP11-860B13.3 antisense +chr12 34027268 34027362 - ENSG00000264446.1 AC046130.1 miRNA +chr12 38321566 38321660 - ENSG00000266333.1 AC117372.1 miRNA +chr12 39443048 39443116 - ENSG00000252974.1 AC121334.1 miRNA +chr12 48487946 48488408 + ENSG00000240443.1 RPS10P20 transcribed_processed_pseudogene +chr12 48766194 48767323 + ENSG00000257653.1 RP11-579D7.2 antisense +chr12 48995150 48996334 + ENSG00000258283.1 RP11-386G11.3 antisense +chr12 49954639 49956125 - ENSG00000257378.1 RP11-469H8.8 antisense +chr12 51809705 51810600 - ENSG00000260122.1 RP11-923I11.3 lincRNA +chr12 52241481 52241613 - ENSG00000281415.1 AC021066.1 miRNA +chr12 52449492 52449620 + ENSG00000280966.1 AC055736.2 miRNA +chr12 52471155 52471283 + ENSG00000281474.1 AC055736.4 miRNA +chr12 52490592 52490720 + ENSG00000281194.1 AC055736.3 miRNA +chr12 52517613 52517741 + ENSG00000280827.1 AC055736.1 miRNA +chr12 52824221 52824358 + ENSG00000281110.1 AC107016.2 miRNA +chr12 52951708 52951795 + ENSG00000265039.1 AC107016.1 miRNA +chr12 53464720 53464901 + ENSG00000273658.1 uc_338 misc_RNA +chr12 53506690 53507638 - ENSG00000139574.8 NPFF protein_coding +chr12 53963629 
53963697 + ENSG00000275589.1 HOTAIR_2 misc_RNA +chr12 53965965 53966061 + ENSG00000277994.1 HOTAIR_3 misc_RNA +chr12 54231397 54231476 - ENSG00000265371.1 MIR3198-2 miRNA +chr12 54543111 54544105 - ENSG00000257824.1 RP11-1049A21.2 antisense +chr12 55870941 55871219 - ENSG00000265119.2 RN7SL676P misc_RNA +chr12 55966838 55967474 - ENSG00000258554.1 RP11-973D8.4 antisense +chr12 56104614 56113905 - ENSG00000257553.1 RP11-603J24.17 antisense +chr12 57512688 57512750 + ENSG00000275657.1 MIR6758 miRNA +chr12 57694132 57721510 - ENSG00000257342.1 RP11-571M6.7 antisense +chr12 57806154 57806260 - ENSG00000206749.1 RNU6-1083P snRNA +chr12 62260359 62260454 + ENSG00000276811.1 MIR6125 miRNA +chr12 62934507 62934607 - ENSG00000275729.1 AC078814.1 miRNA +chr12 63004338 63006541 - ENSG00000257235.1 RP11-848D3.5 lincRNA +chr12 64146388 64147857 - ENSG00000255629.1 RP11-196H14.3 antisense +chr12 64266413 64266766 + ENSG00000256192.1 RP11-290I21.2 processed_pseudogene +chr12 64696191 64696550 - ENSG00000280320.1 RP11-629N8.5 TEC +chr12 65858013 65858132 + ENSG00000211577.1 AC090673.1 miRNA +chr12 66251082 66251157 + ENSG00000275058.1 MIR6502 miRNA +chr12 68841288 68843237 - ENSG00000257181.1 RP11-611O2.5 antisense +chr12 68841946 68842384 + ENSG00000256664.1 RP11-611O2.3 processed_pseudogene +chr12 69353493 69354225 - ENSG00000257764.2 RP11-1143G9.4 antisense +chr12 71793855 71799627 - ENSG00000257410.1 RP11-2H8.2 lincRNA +chr12 74538145 74538633 - ENSG00000257386.1 RP11-56G10.2 antisense +chr12 76032658 76033897 + ENSG00000257839.1 RP11-290L1.2 antisense +chr12 85958686 85960946 + ENSG00000258178.1 RP11-18J9.3 lincRNA +chr12 89544203 89544488 - ENSG00000279939.1 RP11-734K2.3 TEC +chr12 91986626 91993403 - ENSG00000279037.1 C12orf79 protein_coding +chr12 91986682 91993403 - ENSG00000280112.1 C12orf79 protein_coding +chr12 93509487 93509768 + ENSG00000243015.2 RN7SL737P misc_RNA +chr12 94571231 94571352 - ENSG00000277302.1 MIR7844 miRNA +chr12 94834398 94834513 + ENSG00000208038.1 
MIR492 miRNA +chr12 97492462 97492597 + ENSG00000273942.1 RMST_2 misc_RNA +chr12 97530656 97530858 + ENSG00000275971.1 RMST_6 misc_RNA +chr12 97532691 97532958 + ENSG00000274819.1 RMST_7 misc_RNA +chr12 97563812 97563911 + ENSG00000207586.1 MIR135A2 miRNA +chr12 97564174 97564327 + ENSG00000277081.1 RMST_10 misc_RNA +chr12 100157256 100157338 - ENSG00000221770.1 AC010203.1 miRNA +chr12 100158502 100158765 - ENSG00000266610.2 RN7SL176P misc_RNA +chr12 101779814 101779929 + ENSG00000201168.1 RNA5SP368 rRNA +chr12 102217318 102217613 - ENSG00000264554.2 RN7SL793P misc_RNA +chr12 102230027 102230533 - ENSG00000258142.2 RP11-18O15.1 processed_pseudogene +chr12 103930425 103930555 + ENSG00000265072.1 MIR3652 miRNA +chr12 104125268 104125329 + ENSG00000280646.1 AC089983.1 miRNA +chr12 107650789 107650892 + ENSG00000222160.1 Y_RNA misc_RNA +chr12 108778912 108779166 + ENSG00000280110.1 RP11-423G4.8 TEC +chr12 109567975 109568115 - ENSG00000200274.1 RNU4-32P snRNA +chr12 110360456 110360538 - ENSG00000281030.1 AC006088.1 miRNA +chr12 110496352 110496421 - ENSG00000202335.1 SNORD50 snoRNA +chr12 111598951 111599031 - ENSG00000281280.1 AC137055.1 miRNA +chr12 111899263 111901391 + ENSG00000229186.4 ADAM1A unitary_pseudogene +chr12 113159113 113159177 - ENSG00000276908.1 MIR7106 miRNA +chr12 113185624 113192161 - ENSG00000257286.1 RP11-545P7.4 antisense +chr12 117453012 117456986 + ENSG00000255686.1 RP11-227B21.2 lincRNA +chr12 119713634 119713724 - ENSG00000221323.1 MIR1178 miRNA +chr12 120697124 120699541 - ENSG00000256364.1 RP11-173P15.3 antisense +chr12 120721507 120723639 - ENSG00000256569.1 RP11-173P15.5 antisense +chr12 120723193 120723266 + ENSG00000265455.1 MIR4700 miRNA +chr12 122865335 122867021 + ENSG00000256152.2 RP11-463O12.3 lincRNA +chr12 123364764 123364843 - ENSG00000277700.1 MIR8072 miRNA +chr12 125025434 125027410 - ENSG00000280634.1 THRIL antisense +chr12 130144954 130145691 + ENSG00000280405.1 RP11-143E21.4 TEC +chr12 130977562 130978768 + 
ENSG00000256204.1 RP11-243M5.1 lincRNA +chr12 131296110 131297972 - ENSG00000248703.2 RP11-495K9.3 lincRNA +chr12 132077803 132080460 - ENSG00000257000.1 RP13-820C6.2 antisense +chr12 132126461 132126764 - ENSG00000256804.1 RP13-977J11.5 processed_pseudogene +chr12 132279256 132280900 - ENSG00000278872.1 RP13-895J2.4 TEC +chr12 132835374 132835500 - ENSG00000252079.1 RNU6-327P snRNA +chr13 21287634 21287828 - ENSG00000277077.1 RP11-101P17.15 unprocessed_pseudogene +chr13 24321126 24321428 - ENSG00000281899.1 C1QTNF9B-AS1 protein_coding +chr13 25246531 25246615 + ENSG00000221324.1 AL590787.1 miRNA +chr13 25944606 25944673 + ENSG00000281497.1 AL138815.1 miRNA +chr13 27255064 27255135 + ENSG00000207500.1 SNORD102 snoRNA +chr13 27255401 27255526 + ENSG00000207051.1 SNORA27 snoRNA +chr13 30154066 30154586 + ENSG00000232643.1 LINC00385 lincRNA +chr13 30206616 30206699 - ENSG00000266816.1 AL356750.1 miRNA +chr13 44196129 44196200 - ENSG00000276319.1 MIR8079 miRNA +chr13 44432144 44432678 + ENSG00000280357.1 RP11-269C23.4 TEC +chr13 45336314 45336447 - ENSG00000253051.1 SNORA31 snoRNA +chr13 45340039 45341183 + ENSG00000273149.1 RP11-290D2.6 antisense +chr13 45732089 45732172 - ENSG00000280848.1 AL139320.1 miRNA +chr13 46437394 46437988 - ENSG00000241353.3 PPP1R2P4 processed_pseudogene +chr13 50044649 50044768 + ENSG00000277913.1 DLEU2_1 misc_RNA +chr13 50045143 50045232 + ENSG00000277066.1 DLEU2_2 misc_RNA +chr13 50049591 50049663 + ENSG00000274878.1 DLEU2_3 misc_RNA +chr13 50075570 50075653 + ENSG00000277752.1 DLEU2_5 misc_RNA +chr13 50081843 50081978 + ENSG00000275559.1 DLEU2_6 misc_RNA +chr13 50082295 50082484 + ENSG00000276089.1 DLEU1_1 misc_RNA +chr13 50104833 50105174 + ENSG00000273541.1 DLEU1_2 misc_RNA +chr13 51586424 51586532 - ENSG00000206920.1 RNY1P6 misc_RNA +chr13 57632759 57633575 - ENSG00000278722.1 RP11-95F22.1 antisense +chr13 77698012 77698116 - ENSG00000266325.1 MIR3665 miRNA +chr13 90231182 90231277 + ENSG00000207858.1 MIR622 miRNA +chr13 91350605 
91350688 + ENSG00000275042.1 MIR17 miRNA +chr13 91350743 91350834 + ENSG00000274160.1 AL162375.1 miRNA +chr13 91350891 91350972 + ENSG00000277328.1 MIR19A miRNA +chr13 91351065 91351135 + ENSG00000275534.1 MIR20A miRNA +chr13 91351192 91351278 + ENSG00000275802.1 MIR19B1 miRNA +chr13 91351314 91351393 + ENSG00000276018.1 MIR92A1 miRNA +chr13 96949249 96949533 - ENSG00000222472.1 RN7SKP7 misc_RNA +chr13 97363007 97363069 - ENSG00000281389.1 AL442067.1 miRNA +chr13 102742990 102745224 + ENSG00000231633.1 LINC00283 antisense +chr13 105459815 105460015 + ENSG00000275968.1 DAOA-AS1_1 misc_RNA +chr13 105462302 105462505 + ENSG00000275502.1 DAOA-AS1_2 misc_RNA +chr13 106567662 106567799 + ENSG00000278371.2 AL442127.1 miRNA +chr14 20343075 20343407 - ENSG00000277209.1 RNaseP_nuc ribozyme +chr14 20474789 20477089 - ENSG00000258908.1 RP11-203M5.8 lincRNA +chr14 21023314 21023386 - ENSG00000278629.1 MIR6717 miRNA +chr14 21521083 21522660 + ENSG00000257096.1 AE000658.22 antisense +chr14 22281105 22281748 + ENSG00000211817.2 TRAV38-2DV8 TR_V_gene +chr14 22450089 22450139 + ENSG00000211825.1 TRDJ1 TR_J_gene +chr14 22509341 22509406 + ENSG00000211857.1 TRAJ32 TR_J_gene +chr14 22956950 22957029 - ENSG00000265037.1 MIR4707 miRNA +chr14 23033530 23033638 - ENSG00000207765.1 AL132780.1 miRNA +chr14 23356406 23357003 + ENSG00000259018.1 RP11-124D2.3 antisense +chr14 24143489 24143565 - ENSG00000276511.1 MIR7703 miRNA +chr14 35809212 35809300 - ENSG00000266264.1 AL162311.1 miRNA +chr14 49577392 49577794 - ENSG00000244270.1 RPL32P29 processed_pseudogene +chr14 49586580 49586878 + ENSG00000276168.1 RN7SL1 misc_RNA +chr14 49586722 49586791 + ENSG00000281893.1 AL139099.3 miRNA +chr14 49620815 49623480 - ENSG00000258377.1 RP11-649E7.5 antisense +chr14 49635000 49635077 + ENSG00000275425.1 AL139099.1 miRNA +chr14 49853703 49853772 - ENSG00000279868.2 AL627171.1 miRNA +chr14 49862638 49862707 - ENSG00000280602.1 AL627171.3 miRNA +chr14 50326526 50327909 - ENSG00000259071.1 RP11-247L20.4 
lincRNA +chr14 51967003 51969800 - ENSG00000259007.1 RP11-463J10.3 antisense +chr14 52640839 52641566 + ENSG00000258757.1 RP11-841O20.2 antisense +chr14 52775237 52777740 + ENSG00000259049.1 RP11-589M4.1 antisense +chr14 53036755 53038251 + ENSG00000258985.1 RP11-368P15.3 antisense +chr14 53153049 53153115 - ENSG00000266552.1 AL356020.1 miRNA +chr14 55413556 55414059 - ENSG00000239199.1 RPL21P6 processed_pseudogene +chr14 60972272 60972466 - ENSG00000258656.1 RP11-193F5.4 processed_pseudogene +chr14 60974327 60974435 + ENSG00000206870.1 RNU6-398P snRNA +chr14 63598874 63599248 + ENSG00000258800.1 CTD-2302E22.2 lincRNA +chr14 68887785 68888084 - ENSG00000258967.1 HMGN1P3 processed_pseudogene +chr14 70610086 70610379 + ENSG00000240837.3 RN7SL77P misc_RNA +chr14 73698103 73700351 - ENSG00000258660.1 RP4-693M11.3 antisense +chr14 73885392 73885555 - ENSG00000273711.1 RP5-1021I20.8 antisense +chr14 74480133 74480204 - ENSG00000265649.1 MIR4709 miRNA +chr14 76778952 76782249 + ENSG00000258610.1 RP11-488C13.7 lincRNA +chr14 77266218 77266290 + ENSG00000221754.1 MIR1260A miRNA +chr14 88820297 88820432 + ENSG00000200653.1 RNU4-92P snRNA +chr14 89156743 89157574 + ENSG00000277801.1 RP11-681H18.2 lincRNA +chr14 95185117 95185854 - ENSG00000259143.1 CTD-2240H23.2 sense_overlapping +chr14 95662480 95662598 + ENSG00000275033.1 TCL6_2 misc_RNA +chr14 95670441 95670577 + ENSG00000277468.1 TCL6_3 misc_RNA +chr14 99512501 99513576 + ENSG00000258749.1 RP11-688G15.3 antisense +chr14 100291117 100294656 + ENSG00000258504.2 RP11-638I2.6 lincRNA +chr14 100333790 100354061 + ENSG00000258666.1 RP11-638I2.8 antisense +chr14 100829350 100829454 + ENSG00000276919.1 MEG3_2 misc_RNA +chr14 100834432 100861026 - ENSG00000258663.1 RP11-123M6.2 antisense +chr14 100881007 100881120 + ENSG00000208001.1 MIR431 miRNA +chr14 100881883 100882006 + ENSG00000207569.1 MIR433 miRNA +chr14 100882965 100883075 + ENSG00000207608.2 MIR127 miRNA +chr14 100884483 100884576 + ENSG00000272458.1 MIR432 miRNA +chr14 
100884697 100884788 + ENSG00000207942.1 MIR136 miRNA +chr14 100894817 100894913 + ENSG00000276225.1 MEG8_1 misc_RNA +chr14 100898958 100899079 + ENSG00000275134.1 MEG8_2 misc_RNA +chr14 100987046 100987117 + ENSG00000200413.1 SNORD114-26 snoRNA +chr14 101560287 101560422 - ENSG00000277601.1 MIR1247 miRNA +chr14 102501676 102501779 + ENSG00000212330.1 RNU6-244P snRNA +chr14 103725728 103725825 - ENSG00000253096.1 Y_RNA misc_RNA +chr14 104589021 104589847 - ENSG00000259037.1 RP11-614O9.1 antisense +chr14 104769349 104770271 + ENSG00000258430.1 RP11-982M15.2 antisense +chr14 105384360 105384461 - ENSG00000281275.1 AL928654.1 miRNA +chr14 105736956 105737020 + ENSG00000278556.1 AL122127.3 miRNA +chr14 105863198 105863258 - ENSG00000211900.2 IGHJ6 IG_J_gene +chr14 105863814 105863862 - ENSG00000242472.1 IGHJ5 IG_J_gene +chr14 105864215 105864260 - ENSG00000240041.1 IGHJ4 IG_J_gene +chr14 106636704 106636763 + ENSG00000280494.1 AC245369.3 miRNA +chr15 20759311 20774794 - ENSG00000259383.1 RP11-403B2.6 lincRNA +chr15 23565857 23566366 - ENSG00000281873.1 AC126407.1 protein_coding +chr15 24981994 24982068 + ENSG00000276314.1 SNORD107 snoRNA +chr15 24985100 24985166 + ENSG00000276610.1 SNORD64 snoRNA +chr15 24986925 24986995 + ENSG00000239014.1 SNORD108 snoRNA +chr15 25041974 25042040 + ENSG00000274640.1 SNORD109B snoRNA +chr15 25087662 25087753 + ENSG00000278715.1 SNORD116-20 snoRNA +chr15 25088804 25088895 + ENSG00000277785.1 SNORD116-21 snoRNA +chr15 25104642 25104732 + ENSG00000278123.1 SNORD116-28 snoRNA +chr15 25182385 25182466 + ENSG00000278089.1 SNORD115-7 snoRNA +chr15 25193321 25193402 + ENSG00000273835.1 SNORD115-13 snoRNA +chr15 25218617 25218698 + ENSG00000275524.1 SNORD115-26 snoRNA +chr15 28857882 28858398 - ENSG00000270301.1 WI2-2334D6.1 processed_pseudogene +chr15 29233868 29233979 + ENSG00000252868.1 snoZ278 snoRNA +chr15 30380353 30386987 - ENSG00000263070.1 RP11-382B18.3 transcribed_unprocessed_pseudogene +chr15 34373789 34373871 - ENSG00000266205.1 
AC025678.1 miRNA +chr15 34978351 34980084 + ENSG00000279364.1 RP11-463I20.1 TEC +chr15 39593580 39594231 - ENSG00000259279.1 CTD-2033D15.1 antisense +chr15 39922782 39922886 - ENSG00000200305.1 Y_RNA misc_RNA +chr15 40088832 40089386 + ENSG00000259409.1 RP11-521C20.3 antisense +chr15 40331537 40331648 - ENSG00000252714.1 RNA5SP392 rRNA +chr15 40368925 40369640 - ENSG00000259368.1 RP11-64K12.4 antisense +chr15 41770756 41772732 - ENSG00000260814.2 RP11-107F6.3 lincRNA +chr15 42531867 42532840 - ENSG00000261684.1 RP11-265N6.1 antisense +chr15 42567031 42569994 - ENSG00000261822.1 RP11-265N6.2 antisense +chr15 42724102 42724922 + ENSG00000278769.1 CTD-2036P10.5 antisense +chr15 42739118 42743202 + ENSG00000246283.2 CTD-2036P10.3 lincRNA +chr15 43793659 43793759 - ENSG00000221792.1 MIR1282 miRNA +chr15 44826540 44827094 + ENSG00000259187.1 CTD-2008A1.1 lincRNA +chr15 45073492 45074048 - ENSG00000259352.1 RP11-109D20.2 antisense +chr15 45214906 45215033 - ENSG00000261709.3 SNORA11 snoRNA +chr15 45433050 45433129 + ENSG00000211519.1 MIR147B miRNA +chr15 48972595 48972727 - ENSG00000280771.1 AC091073.1 miRNA +chr15 50359450 50360194 - ENSG00000259715.1 CTD-3110H11.1 lincRNA +chr15 50360329 50360410 + ENSG00000264109.1 MIR4712 miRNA +chr15 50557601 50560500 + ENSG00000259684.1 RP11-120K9.2 antisense +chr15 51908902 51909642 - ENSG00000259185.1 RP11-56B16.4 antisense +chr15 52116574 52122131 + ENSG00000259327.1 CTD-2184D3.6 lincRNA +chr15 62246284 62246349 - ENSG00000277779.1 AC126323.2 miRNA +chr15 63070025 63071911 - ENSG00000259627.1 RP11-244F12.2 antisense +chr15 64695041 64695594 + ENSG00000265967.1 AC100830.4 antisense +chr15 65077894 65078008 - ENSG00000281328.1 AC013553.1 miRNA +chr15 65655620 65656085 - ENSG00000275638.1 RP11-16E23.5 antisense +chr15 66379337 66379915 - ENSG00000240821.1 RPL9P25 processed_pseudogene +chr15 66501250 66501318 - ENSG00000199574.1 SNORD18C snoRNA +chr15 66502019 66502089 - ENSG00000280554.1 snoU18 snoRNA +chr15 66502020 66502091 - 
ENSG00000202529.1 SNORD18B snoRNA +chr15 66502812 66502910 - ENSG00000199673.1 SNORD16 snoRNA +chr15 66503243 66503314 - ENSG00000200623.1 SNORD18A snoRNA +chr15 69160651 69160724 + ENSG00000266374.1 AC026512.1 miRNA +chr15 69406441 69406552 + ENSG00000207395.1 Y_RNA misc_RNA +chr15 72587217 72587313 + ENSG00000207690.1 MIR630 miRNA +chr15 72589691 72591845 + ENSG00000260534.1 RP11-1006G14.4 sense_overlapping +chr15 74303005 74304343 + ENSG00000261384.1 RP11-60L3.2 sense_intronic +chr15 74598919 74599397 - ENSG00000275527.1 CTD-3154N5.2 lincRNA +chr15 75211301 75212167 - ENSG00000261779.1 RP11-69H7.3 antisense +chr15 75353611 75353685 - ENSG00000207636.1 MIR631 miRNA +chr15 75355210 75355746 - ENSG00000280564.1 MAN2C1 processed_transcript +chr15 75676227 75677162 + ENSG00000260892.1 CTD-2026K11.1 antisense +chr15 75762215 75762315 - ENSG00000274496.1 MIR4313 miRNA +chr15 77043680 77045160 + ENSG00000259652.1 RP11-797A18.3 lincRNA +chr15 77484275 77485606 - ENSG00000279033.1 RP11-500C12.1 TEC +chr15 78280950 78282190 - ENSG00000272418.1 RP11-762H8.4 sense_intronic +chr15 82044294 82044376 + ENSG00000222521.1 AC026956.1 miRNA +chr15 82477123 82477206 - ENSG00000221095.1 AC245033.1 miRNA +chr15 82726550 82726633 - ENSG00000266697.1 AC105339.2 miRNA +chr15 82755945 82756071 + ENSG00000277864.1 SCARNA15 scaRNA +chr15 82756006 82756069 + ENSG00000280933.1 AC105339.3 miRNA +chr15 83113617 83114566 - ENSG00000260579.1 RP11-382A20.2 antisense +chr15 84204516 84204599 - ENSG00000221008.1 AC136698.1 miRNA +chr15 84622015 84623237 - ENSG00000276278.1 RP11-182J1.3 antisense +chr15 84633874 84634635 + ENSG00000254779.4 EGLN1P1 transcribed_processed_pseudogene +chr15 84642487 84642763 - ENSG00000277578.1 Metazoa_SRP misc_RNA +chr15 85209115 85209198 - ENSG00000221266.1 AC044860.1 miRNA +chr15 85210053 85210299 - ENSG00000277582.1 Metazoa_SRP misc_RNA +chr15 85380596 85380662 + ENSG00000276648.1 MIR7706 miRNA +chr15 85579046 85580178 - ENSG00000259375.1 RP11-815J21.2 antisense 
+chr15 89201091 89201768 - ENSG00000259948.2 RP11-326A19.5 processed_pseudogene +chr15 89897576 89897637 + ENSG00000252645.1 RNU7-111P snRNA +chr15 90076424 90076486 + ENSG00000276376.1 U7 snRNA +chr15 90275129 90275716 + ENSG00000228998.4 RP11-697E2.7 transcribed_processed_pseudogene +chr15 90294059 90294142 + ENSG00000280965.1 AC091167.2 miRNA +chr15 90348844 90349197 - ENSG00000238244.3 GABARAPL3 processed_pseudogene +chr15 90920218 90921186 - ENSG00000259661.1 AC068831.15 antisense +chr15 91029888 91029996 + ENSG00000258542.1 AC068831.11 transcribed_processed_pseudogene +chr15 96333261 96333307 + ENSG00000222651.1 MIR1469 miRNA +chr15 99791127 99792847 + ENSG00000259655.1 CTD-2054N24.1 unprocessed_pseudogene +chr15 99791209 99791410 + ENSG00000241588.3 RN7SL484P misc_RNA +chr15 99792297 99792379 + ENSG00000221511.1 AC090825.1 miRNA +chr15 101772300 101772381 - ENSG00000277814.1 AC107977.2 miRNA +chr16 547185 553847 - ENSG00000261691.1 LA16c-366D1.3 antisense +chr16 678645 679061 + ENSG00000279441.1 LA16c-313D11.13 TEC +chr16 770183 770277 + ENSG00000207579.1 MIR662 miRNA +chr16 1065240 1066502 + ENSG00000261720.1 RP11-161M6.5 lincRNA +chr16 1445343 1446519 + ENSG00000261641.2 LA16c-390E6.5 antisense +chr16 1962334 1962466 - ENSG00000206811.1 SNORA10 snoRNA +chr16 1962973 1963106 - ENSG00000207405.1 SNORA64 snoRNA +chr16 1963745 1964095 - ENSG00000255513.1 AC005363.9 transcribed_processed_pseudogene +chr16 1965184 1965310 + ENSG00000273587.1 SNORA78 snoRNA +chr16 1997654 1998374 + ENSG00000260107.1 AC005606.15 lincRNA +chr16 2155025 2155104 - ENSG00000281010.1 snoR1 snoRNA +chr16 2211997 2212863 + ENSG00000261532.1 RP11-304L19.8 lincRNA +chr16 2253737 2262342 + ENSG00000279473.1 AC009065.4 protein_coding +chr16 2271737 2271846 + ENSG00000274753.1 MIR940 miRNA +chr16 2530035 2531417 - ENSG00000205923.3 CEMP1 protein_coding +chr16 2597881 2599718 - ENSG00000261093.1 CTD-3126B10.1 antisense +chr16 2769872 2769949 + ENSG00000265864.1 AC092117.2 miRNA +chr16 3069523 
3069651 + ENSG00000252561.1 RNU1-125P snRNA +chr16 3307573 3308393 - ENSG00000262899.1 LA16c-360H6.3 antisense +chr16 3308609 3308816 + ENSG00000262554.1 LA16c-360H6.2 transcribed_processed_pseudogene +chr16 3485381 3485469 - ENSG00000273776.1 MIR6126 miRNA +chr16 3533429 3534258 - ENSG00000278942.1 LA16c-390H2.1 TEC +chr16 3581181 3583266 + ENSG00000261938.1 RP11-461A8.1 lincRNA +chr16 8683461 8683543 + ENSG00000281170.1 AC007224.1 miRNA +chr16 8853312 8854347 + ENSG00000259939.1 RP11-77H9.5 antisense +chr16 9104848 9113181 + ENSG00000263244.2 RP11-473I1.9 3prime_overlapping_ncrna +chr16 9105834 9107174 - ENSG00000260349.1 RP11-473I1.5 antisense +chr16 10529440 10532082 + ENSG00000256013.1 RP11-27M24.1 antisense +chr16 10934258 10934887 - ENSG00000275673.1 RP11-876N24.7 sense_intronic +chr16 11345483 11345560 - ENSG00000280717.1 AC009121.2 protein_coding +chr16 11348727 11348796 + ENSG00000238774.1 AC009121.1 miRNA +chr16 11915661 11915738 - ENSG00000266163.1 AC007216.1 miRNA +chr16 14901508 14901591 + ENSG00000274301.1 MIR3179-3 miRNA +chr16 14911220 14911313 + ENSG00000265537.1 MIR3180-3 miRNA +chr16 15611030 15611095 - ENSG00000275184.1 MIR6506 miRNA +chr16 15643281 15643390 + ENSG00000272213.2 AC026401.1 miRNA +chr16 16309875 16309968 + ENSG00000265373.1 MIR3180-1 miRNA +chr16 18402178 18402271 - ENSG00000266291.1 MIR3180-2 miRNA +chr16 19119976 19121629 - ENSG00000261759.1 RP11-626G11.3 antisense +chr16 19498610 19498747 + ENSG00000222750.1 RNU4-46P snRNA +chr16 19501689 19502286 + ENSG00000260934.1 CTA-363E6.7 antisense +chr16 19503861 19504625 + ENSG00000279747.1 CTA-363E6.8 TEC +chr16 21519830 21520365 - ENSG00000258186.2 SLC7A5P2 transcribed_processed_pseudogene +chr16 23061406 23064173 - ENSG00000260566.2 RP11-20G6.3 3prime_overlapping_ncrna +chr16 23061767 23062232 + ENSG00000280400.1 RP11-20G6.1 TEC +chr16 23571854 23572278 - ENSG00000279618.1 CTD-2196E14.7 TEC +chr16 23670011 23675499 + ENSG00000260482.3 CTD-2196E14.9 3prime_overlapping_ncrna +chr16 
25238318 25239287 + ENSG00000259955.1 CTD-2547G23.2 antisense +chr16 27066928 27067858 - ENSG00000261482.1 RP11-673P17.2 antisense +chr16 28958583 28958661 + ENSG00000266868.1 MIR4517 miRNA +chr16 28982014 28982130 + ENSG00000281146.1 AC109460.1 miRNA +chr16 28989140 28990778 - ENSG00000261552.1 RP11-264B17.5 antisense +chr16 29804430 29804990 - ENSG00000275857.1 AC009133.21 antisense +chr16 29862760 29863417 + ENSG00000278713.1 CTD-2574D22.7 antisense +chr16 30064306 30064825 - ENSG00000274904.1 CTD-2515O10.5 antisense +chr16 30670160 30670297 + ENSG00000280843.1 AC093249.1 miRNA +chr16 30875266 30875323 - ENSG00000265991.1 MIR4519 miRNA +chr16 30875766 30895216 - ENSG00000262721.1 AC106782.20 sense_overlapping +chr16 30893903 30893985 + ENSG00000211591.1 MIR762 miRNA +chr16 31109211 31109318 + ENSG00000252809.3 AC135050.1 miRNA +chr16 31131433 31131877 + ENSG00000261385.1 RP11-388M20.2 antisense +chr16 31709113 31711984 - ENSG00000261731.2 CTD-2358C21.4 antisense +chr16 34159185 34159299 + ENSG00000207986.1 AC136932.1 miRNA +chr16 35023339 35023765 + ENSG00000262885.1 CTD-2144E22.11 processed_pseudogene +chr16 35149457 35149790 - ENSG00000261510.1 RP11-244B22.13 processed_pseudogene +chr16 35193335 35193734 + ENSG00000260590.1 RP11-244B22.7 processed_pseudogene +chr16 48447904 48448398 - ENSG00000261802.1 RP11-44I10.6 sense_overlapping +chr16 54114697 54114878 - ENSG00000280454.1 AC007347.1 pseudogene +chr16 58559796 58559929 - ENSG00000206952.3 SNORA76A snoRNA +chr16 66509437 66510048 + ENSG00000260755.1 RP11-403P17.3 lincRNA +chr16 66550457 66550567 + ENSG00000275745.1 Y_RNA misc_RNA +chr16 66720897 66731785 + ENSG00000260465.1 RP11-63M22.2 antisense +chr16 66944660 66945096 - ENSG00000280416.1 RP11-361L15.3 TEC +chr16 67019727 67019811 + ENSG00000281589.1 AC009084.1 miRNA +chr16 67390691 67390850 - ENSG00000239194.1 RNU1-123P snRNA +chr16 67542123 67542963 - ENSG00000259945.1 CTD-2012K14.3 antisense +chr16 67542304 67542572 - ENSG00000260894.1 CTD-2012K14.4 
antisense +chr16 67877657 67877740 + ENSG00000221526.1 AC040162.1 miRNA +chr16 68199841 68200981 - ENSG00000279621.1 RP11-67A1.4 TEC +chr16 68233426 68233499 - ENSG00000276151.1 MIR6773 miRNA +chr16 68742495 68742551 + ENSG00000281454.1 AC099314.1 miRNA +chr16 69328644 69334871 - ENSG00000272617.1 RP11-343C2.12 protein_coding +chr16 69463844 69466264 + ENSG00000260108.1 RP11-140H17.1 3prime_overlapping_ncrna +chr16 69565808 69565868 - ENSG00000223109.1 MIR1538 miRNA +chr16 69703065 69704652 + ENSG00000260772.1 RP11-311C24.1 3prime_overlapping_ncrna +chr16 69709874 69710583 + ENSG00000261602.1 CTD-2033A16.1 antisense +chr16 70379457 70399502 + ENSG00000260111.1 RP11-529K1.4 antisense +chr16 70768967 70770221 - ENSG00000279122.1 RP11-394B2.3 TEC +chr16 72787693 72787773 - ENSG00000265573.1 AC004943.1 miRNA +chr16 75108601 75110712 - ENSG00000247033.1 RP11-252E2.1 antisense +chr16 81170740 81181213 - ENSG00000279362.1 PKD1L2 protein_coding +chr16 81385018 81385093 + ENSG00000273048.1 MIR4720 miRNA +chr16 81385463 81387560 + ENSG00000272923.1 RP11-391L3.1 lincRNA +chr16 81961926 81962243 + ENSG00000260682.3 7SK misc_RNA +chr16 86510780 86510877 - ENSG00000277360.1 AC009108.2 miRNA +chr16 87326987 87327584 + ENSG00000270082.1 RP11-178L8.8 antisense +chr16 88686544 88687053 - ENSG00000280603.1 AC138028.1 protein_coding +chr16 89226807 89228692 - ENSG00000260659.1 RP11-46C24.6 antisense +chr16 89561434 89561517 + ENSG00000200084.1 SNORD68 snoRNA +chr17 1713895 1714005 - ENSG00000275595.1 AC130343.2 miRNA +chr17 1725748 1738585 - ENSG00000262791.1 RP11-961A15.1 antisense +chr17 2041936 2043430 + ENSG00000262664.2 OVCA2 protein_coding +chr17 2042900 2043425 - ENSG00000262533.1 RP11-667K14.4 antisense +chr17 2329119 2329213 - ENSG00000212552.3 SNORD91B snoRNA +chr17 2330276 2330370 - ENSG00000274802.1 SNORD91A snoRNA +chr17 2366589 2366791 - ENSG00000274758.1 RP1-59D14.10 antisense +chr17 2384847 2386664 + ENSG00000262456.1 RP1-59D14.1 antisense +chr17 2683305 2685088 - 
ENSG00000272770.1 RP11-74E22.5 antisense +chr17 2748078 2748182 - ENSG00000221200.1 MIR1253 miRNA +chr17 2748078 2748182 + ENSG00000272920.1 MIR1253 lincRNA +chr17 3310312 3311189 + ENSG00000180068.8 OR3A4P transcribed_unprocessed_pseudogene +chr17 4163910 4164713 + ENSG00000263165.1 RP11-810M2.2 antisense +chr17 5514209 5514270 - ENSG00000280489.1 U7 snRNA +chr17 6651914 6652270 + ENSG00000279257.1 C17orf100 protein_coding +chr17 7017615 7017701 - ENSG00000275739.1 MIR195 miRNA +chr17 7017911 7018022 - ENSG00000273895.1 MIR497 miRNA +chr17 7352687 7354944 - ENSG00000263171.1 RP11-542C16.1 antisense +chr17 7436557 7437523 - ENSG00000262624.1 RP11-104H15.9 antisense +chr17 7439159 7443327 - ENSG00000263301.1 RP11-104H15.8 antisense +chr17 7574713 7574847 + ENSG00000209582.1 SNORA48 snoRNA +chr17 7576811 7576952 + ENSG00000238917.1 SNORD10 snoRNA +chr17 7577955 7578091 + ENSG00000277985.1 SNORA67 snoRNA +chr17 8144994 8145071 - ENSG00000278027.1 MIR6883 miRNA +chr17 8173454 8173587 - ENSG00000200463.1 SNORD118 snoRNA +chr17 8188933 8189688 + ENSG00000279152.1 AC129492.1 protein_coding +chr17 12990149 12990610 - ENSG00000263707.1 RP11-597M12.1 antisense +chr17 15588852 15589084 + ENSG00000223544.1 AC005838.2 unprocessed_pseudogene +chr17 16044621 16044961 + ENSG00000243686.2 RPLP1P11 processed_pseudogene +chr17 16382152 16382669 - ENSG00000265401.1 RP11-138I1.4 antisense +chr17 16439505 16439576 + ENSG00000277108.1 SNORD49 snoRNA +chr17 16440036 16440106 + ENSG00000277370.1 SNORD49A snoRNA +chr17 16441226 16441298 + ENSG00000277512.1 SNORD65 snoRNA +chr17 16812447 16812651 + ENSG00000264892.1 NOS2P4 processed_pseudogene +chr17 17242564 17242647 + ENSG00000265109.1 AC055811.1 miRNA +chr17 17675366 17675522 + ENSG00000273948.1 SMCR2_1 misc_RNA +chr17 17676211 17676371 + ENSG00000274836.1 SMCR2_2 misc_RNA +chr17 17777781 17779094 + ENSG00000264167.1 RP1-253P7.1 sense_intronic +chr17 18340814 18340886 - ENSG00000275820.1 MIR6778 miRNA +chr17 18388871 18389459 + 
ENSG00000267441.1 RP1-37N7.5 processed_pseudogene +chr17 19061912 19062128 + ENSG00000280523.1 SNORD3B-2 snoRNA +chr17 19061912 19062129 + ENSG00000276271.1 SNORD3B-2 snoRNA +chr17 19063919 19064136 - ENSG00000274088.1 SNORD3B-1 snoRNA +chr17 19063920 19064136 - ENSG00000281187.1 SNORD3B-2 snoRNA +chr17 19112419 19112636 - ENSG00000277947.1 SNORD3D snoRNA +chr17 19112420 19112636 - ENSG00000281000.1 SNORD3D snoRNA +chr17 19188016 19188232 + ENSG00000281298.1 SNORD3A snoRNA +chr17 19188016 19188233 + ENSG00000277813.1 SNORD3A snoRNA +chr17 19190028 19190245 - ENSG00000275303.1 SNORD3C snoRNA +chr17 19190029 19190245 - ENSG00000280847.1 SNORD3C snoRNA +chr17 19334308 19336127 - ENSG00000265263.1 RP11-135L13.4 antisense +chr17 19557560 19557711 + ENSG00000276660.1 SNORA59A snoRNA +chr17 19649494 19649587 - ENSG00000275982.1 Y_RNA misc_RNA +chr17 20981489 20981654 + ENSG00000266009.1 RP11-746M1.2 transcribed_processed_pseudogene +chr17 20999747 21000323 + ENSG00000263986.1 RP11-746M1.1 antisense +chr17 21457434 21458989 + ENSG00000264023.1 RP11-728E14.2 lincRNA +chr17 22411893 22412229 - ENSG00000266746.1 RP11-1109M24.7 processed_pseudogene +chr17 22435629 22436064 + ENSG00000265705.1 RP11-744K17.8 transcribed_processed_pseudogene +chr17 22528853 22528921 - ENSG00000241225.2 NMTRS-TGA3-1 unprocessed_pseudogene +chr17 27625484 27626438 - ENSG00000266872.1 RP11-19P22.8 antisense +chr17 28276463 28276553 - ENSG00000207844.1 AC061975.1 miRNA +chr17 28360654 28360734 + ENSG00000264302.1 MIR4723 miRNA +chr17 28361601 28362859 - ENSG00000265618.1 CTB-96E2.7 antisense +chr17 28655557 28655983 + ENSG00000240494.2 RPS12P28 processed_pseudogene +chr17 28722582 28722653 + ENSG00000238578.1 SNORD4A snoRNA +chr17 28723429 28723492 + ENSG00000238649.1 SNORD42A snoRNA +chr17 28723682 28723753 + ENSG00000238597.1 SNORD4B snoRNA +chr17 28749731 28750079 - ENSG00000265840.1 AC010761.10 antisense +chr17 28861369 28861440 - ENSG00000273915.1 MIR451A miRNA +chr17 28861514 28861624 - 
ENSG00000277441.1 AC024267.1 miRNA +chr17 28861655 28861730 - ENSG00000278521.1 MIR4732 miRNA +chr17 29390662 29390730 + ENSG00000263719.1 MIR4523 miRNA +chr17 30059054 30059147 - ENSG00000267482.1 RNY4P13 unprocessed_pseudogene +chr17 30117079 30117172 + ENSG00000199071.2 MIR423 miRNA +chr17 30117079 30117172 - ENSG00000266919.1 MIR3184 sense_intronic +chr17 30557732 30558502 - ENSG00000266775.1 RP11-218M11.6 antisense +chr17 30702506 30702586 - ENSG00000281673.1 AC005562.2 miRNA +chr17 30899110 30899651 + ENSG00000275185.1 RP13-753N3.3 antisense +chr17 31533171 31533938 - ENSG00000278977.1 RP1-41C23.2 TEC +chr17 31534883 31534971 + ENSG00000266459.1 MIR4724 miRNA +chr17 35097560 35097657 - ENSG00000252328.1 Vault misc_RNA +chr17 36491199 36491322 - ENSG00000274164.1 5S_rRNA rRNA +chr17 37609739 37613841 + ENSG00000277688.1 RP11-697E22.1 antisense +chr17 38727833 38728198 - ENSG00000279119.1 AC006449.2 protein_coding +chr17 38851524 38851659 - ENSG00000252699.1 SNORA21 snoRNA +chr17 38852863 38852994 - ENSG00000199293.1 SNORA21 snoRNA +chr17 38918801 38921769 - ENSG00000265784.1 RP1-56K13.3 antisense +chr17 39081065 39081187 + ENSG00000222494.1 AC091178.1 miRNA +chr17 40026332 40026409 - ENSG00000275267.1 MIR6884 miRNA +chr17 41500983 41502409 + ENSG00000229732.1 AC019349.5 antisense +chr17 41867581 41867736 + ENSG00000274630.1 RP11-229E13.4 antisense +chr17 43387226 43387417 - ENSG00000277084.1 U2 snRNA +chr17 43444806 43444885 + ENSG00000278547.1 MIR2117 miRNA +chr17 44123539 44123649 - ENSG00000274433.1 AC023855.1 miRNA +chr17 44303965 44304203 + ENSG00000221496.2 U3 snoRNA +chr17 44899712 44905390 + ENSG00000214447.4 FAM187A protein_coding +chr17 44903161 44904546 - ENSG00000224911.1 AC015936.3 antisense +chr17 45637110 45637206 + ENSG00000273917.1 AC126544.1 miRNA +chr17 46537734 46537813 + ENSG00000280583.1 AC138645.1 miRNA +chr17 46939155 46939860 - ENSG00000280158.1 RP11-63A1.4 TEC +chr17 47042025 47042104 + ENSG00000281864.1 AC068152.1 miRNA +chr17 
47118502 47118591 + ENSG00000221016.1 AC002558.1 miRNA +chr17 47682417 47682683 - ENSG00000264558.1 RP11-138C9.1 antisense +chr17 47946802 47948275 - ENSG00000264019.1 RP11-6N17.6 antisense +chr17 48048365 48048430 + ENSG00000273862.1 AC004477.1 miRNA +chr17 48060383 48060669 - ENSG00000266341.1 RP5-890E16.4 antisense +chr17 48382371 48382586 - ENSG00000200538.2 U3 snoRNA +chr17 48579838 48579947 - ENSG00000274592.1 MIR10A miRNA +chr17 48724408 48724475 - ENSG00000263602.1 MIR3185 miRNA +chr17 48931791 48937100 + ENSG00000230532.1 AC091133.1 antisense +chr17 49049540 49049624 + ENSG00000264552.1 AC105030.1 miRNA +chr17 49404081 49405197 + ENSG00000250186.3 RP11-1079K10.4 antisense +chr17 50693448 50695449 + ENSG00000262967.1 RP11-294J22.6 antisense +chr17 50840057 50841626 - ENSG00000261976.2 RP11-506D12.5 antisense +chr17 51166831 51171507 + ENSG00000280803.1 NME1-NME2 protein_coding +chr17 53106069 53106157 - ENSG00000276078.1 AC091154.1 miRNA +chr17 56888880 56891841 + ENSG00000262112.1 RP11-670E13.5 antisense +chr17 56891270 56891355 - ENSG00000265238.1 MIR3614 miRNA +chr17 58331215 58331325 - ENSG00000273667.1 AC004687.1 miRNA +chr17 59841266 59841337 + ENSG00000199004.1 MIR21 miRNA +chr17 60810957 60811052 + ENSG00000211515.1 AC079005.1 miRNA +chr17 62005737 62006016 - ENSG00000242398.3 RN7SL800P misc_RNA +chr17 64146337 64146471 + ENSG00000281311.1 SNORA76C snoRNA +chr17 64146339 64146471 + ENSG00000277887.1 SNORA76C snoRNA +chr17 64500773 64500839 - ENSG00000265695.1 MIR3064 miRNA +chr17 64501214 64501313 - ENSG00000266241.1 MIR5047 miRNA +chr17 64749918 64758603 - ENSG00000266820.1 RP13-104F24.1 transcribed_unprocessed_pseudogene +chr17 64780759 64780824 + ENSG00000278581.1 MIR6080 miRNA +chr17 64940606 64940717 + ENSG00000272346.2 RP11-927P21.11 transcribed_unprocessed_pseudogene +chr17 67032409 67033290 - ENSG00000265664.1 RP11-74H8.1 antisense +chr17 68134675 68135604 - ENSG00000267352.1 SH3GL1P3 processed_pseudogene +chr17 74748613 74748699 + 
ENSG00000264624.1 MIR3615 miRNA +chr17 75041740 75042015 + ENSG00000239607.3 RN7SL573P misc_RNA +chr17 75145670 75145762 + ENSG00000252042.1 Y_RNA misc_RNA +chr17 75498548 75498628 + ENSG00000276372.1 MIR6785 miRNA +chr17 75784521 75784607 - ENSG00000263565.1 MIR4738 miRNA +chr17 75910723 75938149 - ENSG00000281844.1 FBF1 protein_coding +chr17 75943832 75945142 + ENSG00000267615.1 RP11-552F3.13 antisense +chr17 76558791 76558868 + ENSG00000274091.1 SNORD1C snoRNA +chr17 76561634 76561705 + ENSG00000278261.1 SNORD1A snoRNA +chr17 76736450 76736548 - ENSG00000207556.1 MIR636 miRNA +chr17 77089307 77089493 + ENSG00000275143.1 SCARNA16 scaRNA +chr17 77089417 77089497 + ENSG00000281678.1 MIR6516 miRNA +chr17 78855478 78855844 + ENSG00000267601.1 RP11-323N12.5 antisense +chr17 80999509 81000130 - ENSG00000263218.2 CTD-2561B21.7 antisense +chr17 81251194 81251803 + ENSG00000276101.1 RP11-455O6.8 antisense +chr17 81461013 81461937 - ENSG00000263271.1 RP11-1055B8.8 antisense +chr17 81511026 81511109 - ENSG00000266077.1 AC139149.1 miRNA +chr17 81697025 81697714 - ENSG00000275902.1 RP13-1032I1.11 antisense +chr17 81843165 81843958 + ENSG00000262831.1 RP11-498C9.2 antisense +chr17 81867721 81868552 + ENSG00000262413.1 RP11-498C9.3 antisense +chr17 81878667 81879557 + ENSG00000263859.1 RP11-498C9.16 lincRNA +chr17 81932398 81933058 + ENSG00000263585.1 RP11-498C9.13 antisense +chr17 82236668 82236728 + ENSG00000275505.1 MIR6787 miRNA +chr17 82362349 82363196 - ENSG00000278964.1 RP13-20L14.2 TEC +chr17 82602989 82604178 - ENSG00000261845.2 RP13-638C3.4 antisense +chr18 738058 739444 + ENSG00000264339.1 RP11-769O8.2 antisense +chr18 3246401 3247086 - ENSG00000272688.1 RP13-270P17.3 lincRNA +chr18 5240255 5241167 - ENSG00000265188.1 RP11-835E18.4 processed_pseudogene +chr18 11851414 11852751 - ENSG00000267165.1 RP11-78A19.3 antisense +chr18 11908712 11909223 + ENSG00000273141.1 RP11-820I16.4 antisense +chr18 14074912 14075741 - ENSG00000267756.1 RP11-411B10.4 unprocessed_pseudogene 
+chr18 14104542 14105226 + ENSG00000267356.1 RP11-411B10.3 antisense +chr18 21712017 21712119 - ENSG00000281568.1 AC106037.1 miRNA +chr18 21825698 21825785 - ENSG00000276792.1 MIR133A1 miRNA +chr18 21828996 21829106 - ENSG00000278753.1 AC103987.1 miRNA +chr18 22933349 22933438 + ENSG00000264817.1 MIR4741 miRNA +chr18 23136435 23136512 - ENSG00000222999.1 AC105247.1 miRNA +chr18 26688111 26689668 + ENSG00000263677.1 RP11-17A19.2 lincRNA +chr18 26689316 26689529 - ENSG00000275900.1 U3 snoRNA +chr18 27452623 27452712 - ENSG00000280492.1 AC068408.1 miRNA +chr18 31726043 31726397 - ENSG00000263772.1 RP11-549B18.3 processed_pseudogene +chr18 35466938 35466991 - ENSG00000277489.1 AC007998.1 miRNA +chr18 49487373 49487422 + ENSG00000278544.1 MIR1539 miRNA +chr18 49489245 49489308 - ENSG00000202093.1 SNORD58C snoRNA +chr18 49491283 49491347 - ENSG00000206602.1 SNORD58A snoRNA +chr18 49491664 49491729 - ENSG00000271982.1 SNORD58B snoRNA +chr18 49814133 49814276 + ENSG00000251992.1 SCARNA17 scaRNA +chr18 49814361 49814443 + ENSG00000252139.1 SCARNA18 scaRNA +chr18 57435821 57435880 + ENSG00000266636.1 AC090340.1 miRNA +chr18 58398663 58400082 - ENSG00000267396.1 RP11-845C23.3 antisense +chr18 58451068 58451176 + ENSG00000207778.2 MIR122 miRNA +chr18 58752179 58753898 - ENSG00000267705.1 RP11-108P20.3 lincRNA +chr18 67506589 67514030 + ENSG00000263424.1 CTD-2541J13.2 antisense +chr18 79253577 79254856 - ENSG00000267628.1 RP11-1136J12.1 antisense +chr19 305573 306467 + ENSG00000267124.2 CTD-3113P16.5 antisense +chr19 804940 805001 + ENSG00000265767.1 MIR4745 miRNA +chr19 813584 813653 + ENSG00000263414.1 MIR3187 miRNA +chr19 1010221 1010907 + ENSG00000274177.1 LLNLR-284B4.1 antisense +chr19 1038727 1039064 - ENSG00000279753.1 AC011558.5 TEC +chr19 1376773 1377520 - ENSG00000267755.1 AC004623.2 antisense +chr19 1556605 1556686 + ENSG00000222720.1 AC027307.1 miRNA +chr19 1576939 1577086 + ENSG00000279009.1 AC005943.6 TEC +chr19 1815249 1815873 + ENSG00000267007.1 CTB-31O20.3 
antisense +chr19 1816159 1816238 - ENSG00000223244.1 MIR1909 miRNA +chr19 1875016 1875992 + ENSG00000267232.1 CTB-31O20.9 lincRNA +chr19 2235829 2235926 - ENSG00000276587.1 MIR6789 miRNA +chr19 2250639 2250718 + ENSG00000267021.1 MIR4321 miRNA +chr19 3052910 3053724 + ENSG00000267469.1 AC005944.2 antisense +chr19 3118665 3119304 - ENSG00000267139.1 AC005262.3 antisense +chr19 3121116 3122128 - ENSG00000267688.1 AC005262.2 antisense +chr19 3544199 3557569 + ENSG00000267436.1 AC005786.7 antisense +chr19 3961414 3961512 - ENSG00000207733.1 MIR637 miRNA +chr19 4445978 4446048 + ENSG00000266437.1 MIR4746 miRNA +chr19 4447304 4448217 + ENSG00000267030.1 CTB-50L17.7 antisense +chr19 4652171 4652283 - ENSG00000280849.1 AC005339.1 miRNA +chr19 4654964 4655524 - ENSG00000268565.1 AC005339.2 antisense +chr19 4791745 4795559 - ENSG00000269604.1 AC005523.2 antisense +chr19 6176256 6176442 - ENSG00000267415.1 CTC-503J8.2 transcribed_processed_pseudogene +chr19 6494320 6494805 + ENSG00000268203.1 CTD-2396E7.9 antisense +chr19 6494320 6495025 + ENSG00000268191.1 CTD-2396E7.10 antisense +chr19 6736712 6736778 - ENSG00000277714.1 MIR6791 miRNA +chr19 7519916 7520460 - ENSG00000269371.1 CTD-2207O23.11 antisense +chr19 7870561 7871296 + ENSG00000268120.1 CTD-3193O13.11 lincRNA +chr19 7912648 7913518 - ENSG00000268149.1 CTD-3193O13.13 antisense +chr19 7959123 7960012 + ENSG00000269813.1 CTD-3193O13.14 antisense +chr19 10286967 10288522 + ENSG00000105371.8 ICAM4 protein_coding +chr19 10403458 10403538 - ENSG00000221566.1 MIR1181 miRNA +chr19 11322156 11324195 + ENSG00000267576.1 CTC-510F12.6 sense_intronic +chr19 12379746 12383687 - ENSG00000248406.1 CTD-3105H18.4 transcribed_unprocessed_pseudogene +chr19 12688922 12689238 + ENSG00000267791.1 CTD-2659N19.2 antisense +chr19 12796823 12801849 + ENSG00000267062.1 CTD-2659N19.10 antisense +chr19 12944118 12944487 - ENSG00000267458.1 CTC-425F1.4 antisense +chr19 13153071 13154193 - ENSG00000267598.1 CTC-250I14.6 antisense +chr19 14073361 
14073479 + ENSG00000277805.1 MIR1199 miRNA +chr19 14163039 14163183 + ENSG00000281044.1 AC022098.2 miRNA +chr19 14213241 14213321 + ENSG00000280955.1 AC011509.1 miRNA +chr19 14402717 14408723 - ENSG00000267379.1 CTC-548K16.5 antisense +chr19 14529543 14529640 + ENSG00000207707.1 MIR639 miRNA +chr19 15449548 15449608 + ENSG00000269782.1 MIR1470 miRNA +chr19 15834730 15834804 + ENSG00000273782.1 UCA1 misc_RNA +chr19 16551773 16552328 + ENSG00000268309.1 CTD-3222D19.11 antisense +chr19 16565551 16566330 + ENSG00000269085.1 CTD-3222D19.10 antisense +chr19 16586905 16587985 - ENSG00000269578.1 CTD-3222D19.5 sense_intronic +chr19 16910496 16910579 - ENSG00000280536.1 AC008737.1 miRNA +chr19 17419305 17419774 - ENSG00000269053.1 CTD-2521M24.8 antisense +chr19 17727840 17734513 - ENSG00000268112.1 CTD-3149D2.4 antisense +chr19 17862588 17862720 + ENSG00000207166.1 SNORA68 snoRNA +chr19 18009440 18009881 - ENSG00000279172.1 CTB-52I2.7 TEC +chr19 18144522 18151691 - ENSG00000269145.2 AC007192.6 antisense +chr19 18448275 18448802 + ENSG00000279262.1 CTD-3137H5.5 TEC +chr19 18568506 18569375 - ENSG00000268983.1 AC005253.4 antisense +chr19 18868545 18896096 - ENSG00000130283.8 GDF1 protein_coding +chr19 19255638 19256141 + ENSG00000280282.1 LLNLR-259H9.1 TEC +chr19 20125044 20125484 + ENSG00000224864.4 CTC-260E6.2 transcribed_processed_pseudogene +chr19 21188598 21188953 + ENSG00000268995.1 VN1R82P unprocessed_pseudogene +chr19 21592727 21593041 + ENSG00000274503.1 RP11-678G14.6 unprocessed_pseudogene +chr19 21753011 21753083 + ENSG00000265084.1 AC092364.1 miRNA +chr19 22601485 22601707 + ENSG00000240713.3 RN7SL860P misc_RNA +chr19 33207129 33207639 + ENSG00000273420.1 CTD-2540B15.13 3prime_overlapping_ncrna +chr19 33299934 33301168 + ENSG00000267580.1 CTD-2540B15.11 antisense +chr19 33301279 33301940 + ENSG00000267727.1 CTD-2540B15.7 antisense +chr19 34645662 34646070 + ENSG00000269811.2 SCGB2B3P processed_pseudogene +chr19 35122700 35122764 + ENSG00000278663.1 MIR6887 miRNA 
+chr19 36114362 36115146 + ENSG00000279504.1 AD001527.4 TEC +chr19 38385522 38386759 + ENSG00000267090.1 AC005789.9 antisense +chr19 41425359 41426237 + ENSG00000268475.1 CTC-435M10.6 antisense +chr19 41605341 41605671 - ENSG00000269266.1 DNAJC19P2 processed_pseudogene +chr19 42078588 42080107 - ENSG00000268525.1 CTB-59C6.3 antisense +chr19 42424384 42425071 - ENSG00000268605.1 CTB-50E14.4 sense_intronic +chr19 44758657 44758721 + ENSG00000277736.1 MIR8085 miRNA +chr19 45149750 45150605 - ENSG00000267037.1 AC005757.7 lincRNA +chr19 45638994 45639087 - ENSG00000199066.1 MIR330 miRNA +chr19 45938139 45938282 - ENSG00000280818.1 AC008623.1 miRNA +chr19 46382492 46383169 - ENSG00000268810.1 AC007193.9 antisense +chr19 46390515 46390852 - ENSG00000269151.1 AC007193.8 antisense +chr19 46609277 46610779 - ENSG00000269292.1 CTB-12A17.3 antisense +chr19 47226942 47227021 + ENSG00000265134.1 MIR3190 miRNA +chr19 47581231 47581313 + ENSG00000281476.1 AC010331.1 miRNA +chr19 47755853 47755962 + ENSG00000221803.1 SNORD23 snoRNA +chr19 48966694 48966775 - ENSG00000281206.1 AC026803.1 miRNA +chr19 49489965 49490048 + ENSG00000201675.1 SNORD32A snoRNA +chr19 49490615 49490699 + ENSG00000199631.1 SNORD33 snoRNA +chr19 49490904 49490974 + ENSG00000202503.1 SNORD34 snoRNA +chr19 49491175 49491260 + ENSG00000200259.1 SNORD35A snoRNA +chr19 49497720 49497806 + ENSG00000200530.1 SNORD35B snoRNA +chr19 49625994 49626439 - ENSG00000268636.1 CTB-33G10.11 antisense +chr19 49688853 49690573 - ENSG00000268677.1 CTB-33G10.6 antisense +chr19 49832018 49832099 + ENSG00000277609.1 MIR6800 miRNA +chr19 49888175 49888230 + ENSG00000263462.1 MIR4750 miRNA +chr19 49933064 49933137 + ENSG00000265438.1 MIR4751 miRNA +chr19 49949316 49949527 - ENSG00000221125.2 U3 snoRNA +chr19 50310022 50310539 - ENSG00000267815.1 CTB-191K22.5 antisense +chr19 50480119 50483351 - ENSG00000268854.1 CTD-2545M3.2 antisense +chr19 50486810 50487638 - ENSG00000268518.1 CTD-2545M3.8 lincRNA +chr19 50799032 50799122 - 
ENSG00000221381.1 SNORD88B snoRNA +chr19 50802328 50802418 - ENSG00000220988.1 SNORD88C snoRNA +chr19 51014374 51014734 + ENSG00000268739.1 CTC-518B2.12 antisense +chr19 51692773 51692881 + ENSG00000198972.2 MIRLET7E miRNA +chr19 51693254 51693339 + ENSG00000208008.1 MIR125A miRNA +chr19 52564076 52564231 + ENSG00000242255.1 RPL39P34 transcribed_unprocessed_pseudogene +chr19 53162428 53163563 + ENSG00000268842.1 CTD-2245F17.2 antisense +chr19 53386233 53386388 + ENSG00000241069.1 CTD-3141N22.1 transcribed_processed_pseudogene +chr19 53426019 53426499 - ENSG00000249435.1 CTD-2224J9.7 processed_pseudogene +chr19 53495887 53496037 + ENSG00000241434.1 CTD-2224J9.4 transcribed_processed_pseudogene +chr19 53762335 53762445 + ENSG00000280835.1 AC011453.2 miRNA +chr19 53787675 53787741 + ENSG00000199031.1 MIR371A miRNA +chr19 53787890 53787956 + ENSG00000199095.1 MIR372 miRNA +chr19 53874626 53876049 - ENSG00000232220.2 AC008440.5 antisense +chr19 53982307 53982397 + ENSG00000215998.1 MIR935 miRNA +chr19 54589441 54590287 + ENSG00000269271.1 CTB-83J4.2 lincRNA +chr19 55312029 55312495 - ENSG00000268729.1 CTD-2105E13.14 antisense +chr19 55388181 55388242 + ENSG00000277326.1 MIR6805 miRNA +chr19 55705978 55706172 - ENSG00000267689.1 AC010525.5 processed_pseudogene +chr19 55707550 55707855 + ENSG00000268593.3 CTD-2611O12.6 processed_pseudogene +chr19 56316524 56316636 - ENSG00000200646.1 Y_RNA misc_RNA +chr19 56377037 56377792 - ENSG00000267459.1 AC006116.27 transcribed_processed_pseudogene +chr19 56547105 56547241 + ENSG00000281545.1 AC005498.1 miRNA +chr19 56840905 56840992 + ENSG00000281440.1 MIMT1_1 sRNA +chr19 56848068 56848204 + ENSG00000274777.1 MIMT1_2 misc_RNA +chr19 58310917 58311002 - ENSG00000264333.1 AC020915.1 miRNA +chr19 58475355 58475763 - ENSG00000269106.1 CTD-2619J13.23 antisense +chr19 58582024 58582609 + ENSG00000268784.1 MGC2752 transcribed_unprocessed_pseudogene +chr19 58586158 58586356 + ENSG00000269032.1 AC016629.7 transcribed_processed_pseudogene 
+chr20 1392900 1392961 - ENSG00000276741.1 MIR6869 miRNA +chr20 2652777 2652842 + ENSG00000221062.1 MIR1292 miRNA +chr20 2654212 2654286 + ENSG00000221116.1 SNORD110 snoRNA +chr20 2656097 2656182 + ENSG00000212498.1 SNORD86 snoRNA +chr20 2656624 2656694 + ENSG00000229686.1 SNORD56 snoRNA +chr20 2656939 2657010 + ENSG00000226572.1 SNORD57 snoRNA +chr20 10025917 10026168 + ENSG00000276525.1 Metazoa_SRP misc_RNA +chr20 21501395 21501968 + ENSG00000227693.1 GSTM3P1 processed_pseudogene +chr20 21511447 21512309 + ENSG00000258197.1 NKX2-2-AS1 antisense +chr20 22733210 22733301 - ENSG00000265151.1 AL158175.1 miRNA +chr20 31581089 31581192 + ENSG00000201770.1 RNU6-384P snRNA +chr20 32237795 32237847 + ENSG00000221667.1 MIR1825 miRNA +chr20 35544430 35544747 - ENSG00000224497.1 RPL36P4 processed_pseudogene +chr20 36255716 36255820 + ENSG00000281193.1 AL121895.1 miRNA +chr20 38421647 38421774 - ENSG00000277034.1 SNORA71 snoRNA +chr20 38425198 38425331 - ENSG00000273718.1 SNORA71 snoRNA +chr20 38448084 38448215 + ENSG00000274309.1 SNORA71E snoRNA +chr20 43549389 43550949 - ENSG00000226143.1 RP1-138B7.5 antisense +chr20 47651067 47651156 + ENSG00000280970.1 AL034418.1 miRNA +chr20 49278427 49278624 + ENSG00000277830.1 ZNFX1-AS1_1 misc_RNA +chr20 49279116 49279208 + ENSG00000274760.1 ZNFX1-AS1_2 misc_RNA +chr20 49280485 49280571 + ENSG00000277967.1 ZNFX1-AS1_3 misc_RNA +chr20 49280683 49280772 + ENSG00000212304.1 SNORD12 snoRNA +chr20 50840615 50840749 - ENSG00000230043.1 TMSB4XP6 processed_pseudogene +chr20 56459428 56459583 - ENSG00000228601.1 RPL39P processed_pseudogene +chr20 56460266 56460357 - ENSG00000212084.2 AL121914.1 miRNA +chr20 58817615 58817694 - ENSG00000276373.1 MIR296 miRNA +chr20 58842030 58842132 + ENSG00000276859.1 GNAS-AS1_1 misc_RNA +chr20 58850530 58850641 + ENSG00000274491.1 GNAS-AS1_4 misc_RNA +chr20 58850783 58850903 + ENSG00000275069.1 GNAS-AS1_5 misc_RNA +chr20 61953546 61953662 - ENSG00000221417.1 MIR1257 miRNA +chr20 62064802 62064885 + 
ENSG00000265306.1 MIR3195 miRNA +chr20 62134680 62135089 - ENSG00000280448.1 AL078633.1 pseudogene +chr20 62845664 62845912 + ENSG00000233838.5 DPH3P1 processed_pseudogene +chr20 63102142 63102259 + ENSG00000274915.1 HAR1A misc_RNA +chr20 63696668 63698684 + ENSG00000243509.4 TNFRSF6B protein_coding +chr20 63744689 63745958 + ENSG00000229299.2 RP4-583P15.10 antisense +chr20 63861213 63863306 + ENSG00000183260.6 ABHD16B protein_coding +chr20 63941465 63941544 - ENSG00000272045.1 MIR1914 miRNA +chr21 6994374 6997737 - ENSG00000279313.1 CH507-145C22.4 lincRNA +chr21 14143581 14144158 + ENSG00000240755.1 ERLEC1P1 processed_pseudogene +chr21 14961309 14964233 + ENSG00000229047.1 AF127577.10 antisense +chr21 25573980 25574044 + ENSG00000275402.1 MIR155 miRNA +chr21 33550662 33550728 + ENSG00000275224.1 MIR6501 miRNA +chr21 36132450 36133032 + ENSG00000214889.3 RPS9P1 processed_pseudogene +chr21 36485983 36487411 - ENSG00000223741.1 PSMD4P1 processed_pseudogene +chr21 39171462 39171560 - ENSG00000276873.1 uc_338 misc_RNA +chr21 39184469 39184899 + ENSG00000272991.1 AF129408.17 antisense +chr21 41180097 41180626 + ENSG00000224388.1 BACE2-IT1 antisense +chr21 41739373 41741308 + ENSG00000236883.1 AP001615.9 antisense +chr21 44328944 44330221 - ENSG00000241728.4 AP001062.8 sense_overlapping +chr22 15826566 15827187 + ENSG00000271672.1 DUXAP8 transcribed_processed_pseudogene +chr22 16961936 17008222 - ENSG00000215568.6 GAB4 protein_coding +chr22 17012404 17012932 + ENSG00000270226.1 AC006548.26 processed_pseudogene +chr22 17021398 17022570 - ENSG00000276400.1 VN1R9P processed_pseudogene +chr22 18101612 18101888 + ENSG00000235617.1 XXbac-B476C20.10 unprocessed_pseudogene +chr22 18102457 18103649 - ENSG00000225225.1 ARL2BPP10 processed_pseudogene +chr22 19018043 19018916 - ENSG00000270393.1 AC000095.9 processed_pseudogene +chr22 19130808 19132623 + ENSG00000206203.4 TSSK2 protein_coding +chr22 19454179 19454605 + ENSG00000273300.1 AC000068.9 antisense +chr22 19722945 19724771 + 
ENSG00000203618.5 GP1BB protein_coding +chr22 20064552 20065705 - ENSG00000268292.1 AC006547.15 antisense +chr22 20085746 20085833 + ENSG00000266567.1 MIR3618 miRNA +chr22 20086042 20086150 + ENSG00000221366.2 MIR1306 miRNA +chr22 20110821 20111875 - ENSG00000243762.1 AC006547.8 antisense +chr22 20126402 20126526 + ENSG00000264346.1 SNORA77 snoRNA +chr22 20483225 20483316 - ENSG00000265300.1 AC007731.1 miRNA +chr22 20889206 20891214 - ENSG00000272600.1 AC007308.7 antisense +chr22 22895375 22895834 + ENSG00000211675.2 IGLC1 IG_C_gene +chr22 23782283 23783958 - ENSG00000280178.1 AP000349.2 protein_coding +chr22 23903381 23903455 + ENSG00000277002.1 AP000350.1 miRNA +chr22 26671402 26671550 + ENSG00000277941.1 MIAT_exon5_1 misc_RNA +chr22 26672164 26672248 + ENSG00000275942.1 MIAT_exon5_2 misc_RNA +chr22 26673254 26673630 + ENSG00000276991.1 MIAT_exon5_3 misc_RNA +chr22 27919384 27919545 + ENSG00000277460.1 TTC28-AS1_1 misc_RNA +chr22 27922022 27922178 + ENSG00000276150.1 TTC28-AS1_2 misc_RNA +chr22 28002484 28002595 + ENSG00000276675.1 TTC28-AS1_4 misc_RNA +chr22 28794555 28800597 - ENSG00000100219.15 XBP1 protein_coding +chr22 28798628 28798725 - ENSG00000280977.1 Z93930.1 miRNA +chr22 28992721 29018620 + ENSG00000235786.1 ZNRF3-IT1 sense_intronic +chr22 29024999 29031476 - ENSG00000177993.3 ZNRF3-AS1 antisense +chr22 29058672 29061844 - ENSG00000100249.4 C22orf31 protein_coding +chr22 29073078 29168333 + ENSG00000183762.11 KREMEN1 protein_coding +chr22 29099041 29111683 - ENSG00000226772.1 CTA-747E2.10 antisense +chr22 29126261 29126404 - ENSG00000281592.1 Z95116.1 miRNA +chr22 29134014 29134120 + ENSG00000200871.1 RNU6-810P snRNA +chr22 29191697 29191808 - ENSG00000251952.1 RNU6-1219P snRNA +chr22 29333163 29333258 - ENSG00000239127.1 SNORD125 snoRNA +chr22 29437583 29437733 + ENSG00000274457.1 AC000041.10 unprocessed_pseudogene +chr22 29711820 29719714 - ENSG00000239446.1 RP1-76B20.12 antisense +chr22 30007049 30007113 + ENSG00000275818.1 MIR6818 miRNA +chr22 
30971005 30971149 + ENSG00000276664.1 TUG1_1 misc_RNA +chr22 30971296 30971382 + ENSG00000276057.1 TUG1_2 misc_RNA +chr22 30972857 30973093 + ENSG00000276965.1 TUG1_3 misc_RNA +chr22 30973417 30973597 + ENSG00000275307.1 TUG1_4 misc_RNA +chr22 30976515 30978848 - ENSG00000269987.1 RP3-430N8.11 lincRNA +chr22 30977516 30977858 - ENSG00000269972.1 RP3-430N8.10 lincRNA +chr22 31059989 31060285 + ENSG00000240186.3 RN7SL633P misc_RNA +chr22 31205264 31205616 - ENSG00000254835.1 RNF185-AS1 antisense +chr22 31621467 31621531 - ENSG00000275382.1 MIR7109 miRNA +chr22 31837238 31837298 + ENSG00000252909.1 RNU6-201P snRNA +chr22 32362972 32363059 + ENSG00000274617.1 RFPL3-AS1_1 misc_RNA +chr22 32368497 32368567 + ENSG00000274170.1 RFPL3-AS1_2 misc_RNA +chr22 33164063 33166439 + ENSG00000232073.1 RP1-302D9.3 antisense +chr22 33704786 33704920 - ENSG00000253007.2 SNORA76 snoRNA +chr22 35164304 35165347 + ENSG00000238153.1 CTA-714B7.4 processed_pseudogene +chr22 35194699 35194942 - ENSG00000243453.1 COX7BP1 processed_pseudogene +chr22 37371684 37372858 + ENSG00000272694.1 RP1-63G5.8 antisense +chr22 37844272 37844371 - ENSG00000207945.1 MIR658 miRNA +chr22 37891347 37891448 + ENSG00000207227.1 RNU6-900P snRNA +chr22 37950965 37951778 + ENSG00000272582.1 RP5-1039K5.17 antisense +chr22 37967563 37967624 + ENSG00000277321.1 MIR6820 miRNA +chr22 37970686 37987422 - ENSG00000100146.15 SOX10 protein_coding +chr22 37988794 37988853 + ENSG00000264505.1 MIR4534 miRNA +chr22 39120293 39120566 - ENSG00000226024.1 COX5BP7 processed_pseudogene +chr22 39313819 39313911 - ENSG00000209480.1 SNORD83B snoRNA +chr22 39319050 39319113 - ENSG00000263764.1 SNORD43 snoRNA +chr22 41092513 41092566 + ENSG00000221160.1 MIR1281 miRNA +chr22 41923222 41923297 - ENSG00000263463.1 MIR378I miRNA +chr22 42569147 42569250 - ENSG00000251913.1 RNU6-513P snRNA +chr22 42615244 42615393 + ENSG00000276027.1 RNU12 snRNA +chr22 45976878 45976954 - ENSG00000265610.1 CR536603.1 miRNA +chr22 46112732 46112840 + 
ENSG00000198986.2 MIRLET7A3 miRNA +chr22 46113566 46113657 + ENSG00000264147.1 MIR4763 miRNA +chr22 46113686 46113768 + ENSG00000207875.1 MIRLET7B miRNA +chr22 50275674 50275921 + ENSG00000279216.1 AL022328.1 protein_coding +chrX 2904904 2906081 + ENSG00000229851.1 ARSD-AS1 antisense +chrX 3607258 3607365 - ENSG00000207332.1 RNU6-146P snRNA +chrX 15785716 15787589 - ENSG00000281371.1 INE2 antisense +chrX 21872707 21872808 + ENSG00000206639.1 Y_RNA misc_RNA +chrX 30834623 30835300 + ENSG00000231542.1 TAB3-AS1 antisense +chrX 30854321 30854707 + ENSG00000235512.1 TAB3-AS2 antisense +chrX 37441523 37442068 + ENSG00000241607.1 RP11-357K9.2 unprocessed_pseudogene +chrX 39837561 39837613 + ENSG00000263972.1 MIR1587 miRNA +chrX 41233633 41234283 + ENSG00000269941.1 RP5-1172N10.4 sense_intronic +chrX 52063347 52063474 - ENSG00000221705.1 SNORA11E snoRNA +chrX 52190621 52190748 + ENSG00000221475.1 SNORA11D snoRNA +chrX 53143034 53143117 - ENSG00000266700.1 AL139396.1 miRNA +chrX 54927305 54927433 + ENSG00000221750.1 SNORA11 snoRNA +chrX 66018870 66018979 + ENSG00000207939.1 MIR223 miRNA +chrX 67545277 67545422 + ENSG00000280956.1 AL049564.1 miRNA +chrX 70846080 70846295 - ENSG00000280704.1 U3 snoRNA +chrX 73821657 73821724 - ENSG00000274655.1 XIST_intron misc_RNA +chrX 73831145 73831270 - ENSG00000277577.1 Xist_exon4 misc_RNA +chrX 73850487 73850571 + ENSG00000278039.1 Xist_exon1 misc_RNA +chrX 73944332 73944466 + ENSG00000274430.1 JPX_1 misc_RNA +chrX 73944595 73944663 + ENSG00000276784.1 JPX_2 misc_RNA +chrX 74280936 74281082 + ENSG00000275254.1 FTX_3 misc_RNA +chrX 74293357 74293574 + ENSG00000277922.1 FTX_5 misc_RNA +chrX 80024067 80031111 + ENSG00000281700.1 TBX22 protein_coding +chrX 85244095 85244171 + ENSG00000264517.1 AC003001.1 miRNA +chrX 92460452 92460545 + ENSG00000211526.1 AL121869.1 miRNA +chrX 107428989 107429079 + ENSG00000207846.1 AL035088.1 miRNA +chrX 112780718 112780788 - ENSG00000263351.1 MIR4329 miRNA +chrX 115622308 115649561 + ENSG00000281638.1 
AL589842.1 protein_coding +chrX 115703812 115703867 + ENSG00000264759.1 AC005000.1 miRNA +chrX 120877496 120878924 - ENSG00000278646.1 RP1-321E8.5 protein_coding +chrX 126472809 126472882 - ENSG00000264338.1 AL359973.1 miRNA +chrX 136879199 136879271 - ENSG00000206979.1 SNORD61 snoRNA +chrX 138627077 138627343 + ENSG00000232183.1 RP6-27P15.2 antisense +chrX 140086058 140086145 + ENSG00000280833.1 AL589987.1 miRNA +chrX 140783176 140784660 + ENSG00000281508.1 CDR1-AS antisense +chrX 148501098 148991332 + ENSG00000281817.1 AFF2 protein_coding +chrX 151956683 151956778 - ENSG00000264120.1 AF274855.1 miRNA +chrX 151958578 151958658 - ENSG00000207621.1 MIR224 miRNA +chrX 151959607 151959719 - ENSG00000207753.2 MIR452 miRNA +chrX 154400281 154400415 + ENSG00000207165.1 SNORA70 snoRNA +chrX 154768596 154768656 + ENSG00000281135.1 MIR664B miRNA +chrX 154774998 154775126 + ENSG00000206693.1 SNORA56 snoRNA +chrX 154887360 154887458 - ENSG00000221533.1 MIR1184-1 miRNA +chrX 155383100 155383198 - ENSG00000221190.1 MIR1184-1 miRNA +chrX 155457517 155457615 + ENSG00000221603.1 MIR1184-3 miRNA +chrX 156022631 156022698 + ENSG00000276543.3 AJ271736.1 miRNA +chrY 5573928 5574019 + ENSG00000252059.2 AC012667.1 miRNA +chrY 7378672 7378779 + ENSG00000252155.1 RNU6-941P snRNA +chrY 57209151 57209218 + ENSGR0000276543.3 AJ271736.1 miRNA diff --git a/docs/Data/Data_Security/Data_Security.md b/docs/Data/Data_Security/Data_Security.md index 62cc8b9d1..1ffa095ed 100644 --- a/docs/Data/Data_Security/Data_Security.md +++ b/docs/Data/Data_Security/Data_Security.md @@ -34,7 +34,7 @@ The GDC Data Transfer Tool and the GDC API use tokens for authentication. GDC au #### Obtaining A Token -Users can obtain authentication tokens from the [GDC Data Portal](https://portal.gdc.cancer.gov) and the [GDC Data Submission Portal](https://portal.gdc.cancer.gov/submission). 
See the [GDC Data Portal User's Guide](../../Data_Portal/Users_Guide/Authentication.md#gdc-authentication-tokens) and the [GDC Data Submission Portal User's Guide](../../Data_Submission_Portal/Users_Guide/Authentication.md#gdc-authentication-tokens) for instructions. +Users can obtain authentication tokens from the [GDC Data Portal](https://portal.gdc.cancer.gov) and the [GDC Data Submission Portal](https://portal.gdc.cancer.gov/submission). See the [GDC Data Portal User's Guide](../../Data_Portal/Users_Guide/Repository.md#gdc-authentication-tokens) and the [GDC Data Submission Portal User's Guide](../../Data_Submission_Portal/Users_Guide/Data_Submission_Process.md#authentication) for instructions. #### Token Expiration diff --git a/docs/Data/File_Formats/MAF_Format.md b/docs/Data/File_Formats/MAF_Format.md index 966d7c2de..46e049a41 100644 --- a/docs/Data/File_Formats/MAF_Format.md +++ b/docs/Data/File_Formats/MAF_Format.md @@ -2,11 +2,11 @@ ## Introduction -Mutation Annotation Format (MAF) is a tab-delimited text file with aggregated mutation information from [VCF Files](VCF_Format.md) and are generated on a project-level. MAF files are produced through the [Somatic Aggregation Workflow](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=somatic_aggregation_workflow&_top=1) The GDC produces MAF files at two permission levels: __protected__ and __somatic__ (or open-access). One MAF files is produced per variant calling pipeline per GDC project. MAFs are produced by aggregating the GDC annotated VCF files generated from one pipeline for one project. +Mutation Annotation Format (MAF) is a tab-delimited text file with aggregated mutation information from [VCF Files](VCF_Format.md) and is generated at the project level. MAF files are produced through the [Somatic Aggregation Workflow](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=somatic_aggregation_workflow&_top=1).
The GDC produces MAF files at two permission levels: __protected__ and __somatic__ (or open-access). One MAF file is produced per variant calling pipeline per GDC project. MAFs are produced by aggregating the GDC annotated VCF files generated from one pipeline for one project. Annotated VCF files often have variants reported on multiple transcripts whereas the MAF files generated from the VCFs (\*protected.maf) only report the most critically affected one. Somatic MAFs (\*somatic.maf), which are also known as [Masked Somatic Mutation](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=masked_somatic_mutation) files, are further processed to remove lower quality and potential germline variants. For tumor samples that contain variants from multiple combinations of tumor-normal aliquot pairs, only one pair is selected in the Somatic MAF based on their sample type. Somatic MAFs are publicly available and can be freely distributed within the boundaries of the [GDC Data Access Policies](https://gdc.cancer.gov/access-data/data-access-policies). -The GDC MAF file format is based on the [TCGA Mutation Annotation Format](https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+(MAF)+Specification) specifications, with additional columns included. +The GDC MAF file format is based on the TCGA Mutation Annotation Format specifications, with additional columns included. __Note:__ The criteria for allowing mutations into open-access are purposefully implemented to overcompensate and filter out germline variants. If omission of true-positive somatic mutations is a concern, the GDC recommends using protected MAFs. @@ -16,13 +16,13 @@ The process for modifying a protected MAF into a somatic MAF is as follows: * Aliquot Selection: only one tumor-normal pair are selected for each tumor sample based on the plate number, sample type, analyte type and other features extracted from tumor TCGA aliquot barcode. 
* Low quality variant filtering and germline masking: - 1. Variants with __Mutation_Status != 'Somatic'__ or __GDC_FILTER = 'Gapfiller', 'ContEst', 'multiallelic', 'nonselectedaliquot', 'BCR_Duplicate' or 'BadSeq'__ are __removed__. - 2. Remaining variants with __GDC_Valid_Somatic = True__ are __included__ in the Somatic MAF. - 3. Remaining variants with __FILTER != 'panel_of_normals' or PASS__ are __removed__. Note that the `FILTER != panel_of_normals` value is only relevant for the variants generated from the MuTect2 pipeline. - 4. Remaining variants with __MC3_Overlap = True__ are __included__ in the Somatic MAF. - 5. Remaining variants with __GDC_FILTER = 'ndp', 'NonExonic', 'bitgt', 'gdc_pon'__ are __removed__. - 6. Remaining variants with __SOMATIC != null__ are __included__ in the Somatic MAF. - 7. Remaining variants with __dbSNP_RS = 'novel' or null__ are __included__ in the Somatic MAF. + 1. Variants with ```Mutation_Status != 'Somatic'``` or ```GDC_FILTER = 'Gapfiller'```, ```'ContEst'```, ```'multiallelic'```, ```'nonselectedaliquot'```, ```'BCR_Duplicate'``` or ```'BadSeq'``` are __removed__. + 2. Remaining variants with ```GDC_Valid_Somatic = True``` are __included__ in the Somatic MAF. + 3. Remaining variants with ```FILTER != 'panel_of_normals'``` or ```PASS``` are __removed__. Note that the `FILTER != panel_of_normals` value is only relevant for the variants generated from the MuTect2 pipeline. + 4. Remaining variants with ```MC3_Overlap = True``` are __included__ in the Somatic MAF. + 5. Remaining variants with ```GDC_FILTER = 'ndp'```, ```'NonExonic'```, ```'bitgt'```, ```'gdc_pon'``` are __removed__. + 6. Remaining variants with ```SOMATIC != null``` are __included__ in the Somatic MAF. + 7. Remaining variants with ```dbSNP_RS = 'novel'``` or ```null``` are __included__ in the Somatic MAF. 8. Remaining variants are __removed__. 
* Removal of the following columns: * vcf_region @@ -180,21 +180,21 @@ The table below describes the columns in a protected MAF and their definitions. ### Notes About GDC MAF Implementation -1. Column #4 __NCBI_Build__ is GRCh38 by default -2. Column #32 __Sequencer__ includes the sequencers used. If different sequencers were used to generate normal and tumor data, the normal sequencer is listed first. -3. Column #61 VEP name "STRAND" is changed to __TRANSCRIPT_STRAND__ to avoid confusion with Column#8 "Strand" -4. Column #94 __IMPACT__ categories are defined by the VEP software and do not necessarily reflect the relative biological influence of each mutation. -4. Column #122-125 __vcf_info, vcf_format, vcf_tumor_gt, and vcf_normal_gt__ are the corresponding columns from the VCF files. Including them facilitates parsing specific variant information. -5. Column #120 __GDC_Validation_Status__: GDC also collects TCGA validation sequences. It compares these with variants derived from Next-Generation Sequencing data from the same sample and populates the comparison result in "GDC_Validation_Status". +1. Column #4: __NCBI_Build__ is GRCh38 by default +2. Column #32: __Sequencer__ includes the sequencers used. If different sequencers were used to generate normal and tumor data, the normal sequencer is listed first. +3. Column #61: VEP name "STRAND" is changed to __TRANSCRIPT_STRAND__ to avoid confusion with Column #8 "Strand" +4. Column #94: __IMPACT__ categories are defined by the VEP software and do not necessarily reflect the relative biological influence of each mutation. +5. Column #122-125: __vcf_info, vcf_format, vcf_tumor_gt, and vcf_normal_gt__ are the corresponding columns from the VCF files. Including them facilitates parsing specific variant information. +6. Column #120: __GDC_Validation_Status__: GDC also collects TCGA validation sequences.
It compares these with variants derived from Next-Generation Sequencing data from the same sample and populates the comparison result in "GDC_Validation_Status". * "Valid", if the alternative allele(s) in the tumor validation sequence is(are) the same as GDC variant call * "Invalid", if none of the alternative allele(s) in the tumor validation sequence is the same as GDC variant call * "Inconclusive" if two alternative allele exists, and one matches while the other does not * "Unknown" if no validation sequence exists -6. Column #121 __GDC_Valid_Somatic__ is TRUE if GDC_Validation_Status is "Valid" and the variant is "Somatic" in validation calls. It is FALSE if these criteria are not met +7. Column #121: __GDC_Valid_Somatic__ is TRUE if GDC_Validation_Status is "Valid" and the variant is "Somatic" in validation calls. It is FALSE if these criteria are not met ### FILTER Value Definitions (column 111) -* __oxog :__ Signifies that this variant was determined to be an OxoG artifact. This was calculated with [D-ToxoG](http://archive.broadinstitute.org/cancer/cga/dtoxog) +* __oxog :__ Signifies that this variant was determined to be an OxoG artifact. This was calculated with [D-ToxoG](https://software.broadinstitute.org/cancer/cga/dtoxog) * __bPcr :__ Signifies that this variant was determined to be an artifact of bias on the PCR template strand. This was calculated with the [DKFZ Bias Filter](https://github.com/eilslabs/DKFZBiasFilter). * __bSeq :__ Signifies that this variant was determined to be an artifact of bias on the forward/reverse strand. This was also calculated with the [DKFZ Bias Filter](https://github.com/eilslabs/DKFZBiasFilter).
diff --git a/docs/Data/Release_Notes/CPTAC-3_7Cases-WXS-MAFs_GDC-Manifest.txt b/docs/Data/Release_Notes/CPTAC-3_7Cases-WXS-MAFs_GDC-Manifest.txt new file mode 100644 index 000000000..3fa3fc0a1 --- /dev/null +++ b/docs/Data/Release_Notes/CPTAC-3_7Cases-WXS-MAFs_GDC-Manifest.txt @@ -0,0 +1,15 @@ +id file_name file_size md5sum case.submitter_id +21762c82-a87e-4119-b087-40620a2bbe5c 4191375a-bb45-4939-94fe-ea4e24384d9a.wxs.aliquot_ensemble_raw.maf.gz 157552 9828d9b43165faa665889f8edaf14417 C3N-03839 +3bc87d89-1cc5-4b00-a2ed-bd26eabb1245 fb39440b-fff0-456e-a5f9-77b8808f5b1b.wxs.aliquot_ensemble_masked.maf.gz 13263 77609c30faab4b495453e7e195622bbf C3L-04080 +56d1ff72-dd53-422c-9ea5-7664081f6eb2 19fbb6ef-8a29-4c7a-a625-94ce85d5f08f.wxs.aliquot_ensemble_raw.maf.gz 134555 c719416ab33bb0b0476e44f9481205ea C3N-02971 +650494a4-4028-4a6f-bedd-5fc292d38da2 258fbc6f-d743-402e-88d1-013fd76f252f.wxs.aliquot_ensemble_raw.maf.gz 324199 6eecaa89d64347754f7e3eab07e7532e C3N-03754 +65a9b395-df17-40c5-b2fa-25379a911add 19fbb6ef-8a29-4c7a-a625-94ce85d5f08f.wxs.aliquot_ensemble_masked.maf.gz 21187 210adef89d4ecaaf1345de5552fecd63 C3N-02971 +76fbfacf-7eb1-4407-ae38-db7e4bfc63f7 4191375a-bb45-4939-94fe-ea4e24384d9a.wxs.aliquot_ensemble_masked.maf.gz 26713 e9889b239822214ab80a692cf3f6b440 C3N-03839 +825e095a-5e8d-4893-adef-064f7c15a67c e26bafe4-cb2f-45c5-a386-5c458960ee3c.wxs.aliquot_ensemble_masked.maf.gz 18146 05b3f087a2a489c7c592e84b4172b7fa C3N-02585 +9e62722b-d67d-4ff9-ac27-098cd0e9e938 e26bafe4-cb2f-45c5-a386-5c458960ee3c.wxs.aliquot_ensemble_raw.maf.gz 151404 872dbc067ef652c897a6cca7645fda97 C3N-02585 +b63f873c-a8e8-4159-acae-1c300a328eff da42134c-423e-4989-9881-4b17c600addd.wxs.aliquot_ensemble_raw.maf.gz 159672 fcbb939472407c466871be3bbaf6a627 C3L-04027 +cf3a2c57-3ce6-48fe-bd3f-cbc319c0cdc3 fb39440b-fff0-456e-a5f9-77b8808f5b1b.wxs.aliquot_ensemble_raw.maf.gz 129399 772938c1e6964acc1b812923373d441d C3L-04080 +d58a6cdb-6a56-4fd8-b682-95bd0e38a467 
02173eb6-d1ff-47b9-b500-773bdb5323ed.wxs.aliquot_ensemble_masked.maf.gz 20124 91a1509ef515762b7188efba3b17eaad C3N-02768 +da4c2c8d-c943-4354-9d8b-d61d0e53c13e 02173eb6-d1ff-47b9-b500-773bdb5323ed.wxs.aliquot_ensemble_raw.maf.gz 140724 2f0ad190f2c83d14bd3ac56288ebe442 C3N-02768 +edbe1f1c-11cd-41f0-a872-cbb95b52c9d7 da42134c-423e-4989-9881-4b17c600addd.wxs.aliquot_ensemble_masked.maf.gz 22928 af0aeae46af0feb04cb2ad8febef624b C3L-04027 +f27b38f0-c3ce-4f1f-931d-f6215513802b 258fbc6f-d743-402e-88d1-013fd76f252f.wxs.aliquot_ensemble_masked.maf.gz 13007 ab9496f355d1094a0a95554680e3eb69 C3N-03754 diff --git a/docs/Data/Release_Notes/CPTAC-3_7CasesRNASeq_GDC-Manifest.txt b/docs/Data/Release_Notes/CPTAC-3_7CasesRNASeq_GDC-Manifest.txt new file mode 100644 index 000000000..2d437dee6 --- /dev/null +++ b/docs/Data/Release_Notes/CPTAC-3_7CasesRNASeq_GDC-Manifest.txt @@ -0,0 +1,57 @@ +id file_name file_size md5sum case.submitter_id +578e63fe-1d86-4311-978e-24327e96e156 0c2673ac-0672-4841-b8c4-71fe453149b6.rna_seq.chimeric.gdc_realn.bam 148539489 497148897dc81d05c6e5983b2b838776 C3L-07037 +d3c7b9b1-fca0-4324-a0ee-dcc265da89ad 0c2673ac-0672-4841-b8c4-71fe453149b6.rna_seq.genomic.gdc_realn.bam 9669771486 7e6c8fe75b8ecf672785e64bed6f6ce0 C3L-07037 +25a8cac8-1953-4a17-9344-a2f680cfe5ea 0c2673ac-0672-4841-b8c4-71fe453149b6.rna_seq.star_gene_counts.tsv.gz 420625 7149de73b61e6ad12981a08ce13cab82 C3L-07037 +4293d821-2e6b-4920-9dcc-4225fb2727e4 0c2673ac-0672-4841-b8c4-71fe453149b6.rna_seq.star_splice_junctions.tsv.gz 3085884 b7cfdee5feef8806e973455af0d32e5d C3L-07037 +04fe28c3-7897-446f-9268-abf90e3fe3c0 0c2673ac-0672-4841-b8c4-71fe453149b6.rna_seq.transcriptome.gdc_realn.bam 9423577591 3fb99dc7c8affa0b0710c13772102095 C3L-07037 +0b71dd7c-3616-414a-a378-065a6e3b9a26 210dfc8a-9ab8-4dc8-8002-939823690011.rna_seq.chimeric.gdc_realn.bam 50238801 1e32c733c3c90da5cbe83f7c8214b7b8 C3L-07034 +02c03a17-d4a0-4e7e-a7cc-b036247b3d3b 210dfc8a-9ab8-4dc8-8002-939823690011.rna_seq.genomic.gdc_realn.bam 
7963264496 e66fc7947a6bfa0fea9bc140832de1a2 C3L-07034 +4f5f9f02-6e86-44fa-b202-0e7acf560e80 210dfc8a-9ab8-4dc8-8002-939823690011.rna_seq.star_gene_counts.tsv.gz 400559 a8875cd52668160cf3bcaa9e4a4e0d94 C3L-07034 +30d6150e-224c-45eb-ac12-bd0132726114 210dfc8a-9ab8-4dc8-8002-939823690011.rna_seq.star_splice_junctions.tsv.gz 2630123 3c8fa945fa679a4f311c0c151b301a0c C3L-07034 +f3cefeae-ab58-446f-a50f-f191963bcf38 210dfc8a-9ab8-4dc8-8002-939823690011.rna_seq.transcriptome.gdc_realn.bam 8288989317 82a8dae88b6d1188d6bd88640ec0f0f1 C3L-07034 +2ccacfee-208e-4aca-b17d-ace68b8e2032 36e96206-6f09-4d10-890f-227feedd305e.FPKM-UQ.txt.gz 433691 2d7fce5e0a4252344ffea09fd31ba4dd C3L-07032 +0b145cb0-1ed8-4e04-b260-22ba20bf35ec 36e96206-6f09-4d10-890f-227feedd305e.FPKM.txt.gz 325787 bf4309a9389999372281c082e30cbb09 C3L-07032 +266aee11-d103-4e2a-832e-a53599ee305e 36e96206-6f09-4d10-890f-227feedd305e.htseq_counts.txt.gz 254753 80e51ff984eb152904fc3cad70ad4936 C3L-07032 +21fd222d-06d4-4730-a767-a3c13f99a321 3b8f9519-dd16-4837-a7c2-da55809a9836.FPKM-UQ.txt.gz 437428 36f471c6e8cb54fcc8d7e8f129f98591 C3L-07035 +327c06ed-4a0e-499b-8396-5b868fbbd653 3b8f9519-dd16-4837-a7c2-da55809a9836.FPKM.txt.gz 329629 f3686829b693ceaf1b30a20de09eb70f C3L-07035 +8b11aadb-b118-4ec9-8996-464eb5f9dc0b 3b8f9519-dd16-4837-a7c2-da55809a9836.htseq_counts.txt.gz 256062 f958a22e029a6728f8accaa86e74ea7c C3L-07035 +cbd25d77-ff4d-41e2-b7c1-c7e69ea1a3ca 425fc7d2-4577-4d39-800d-79ef56034a5a.FPKM-UQ.txt.gz 447433 28ef94e8bd47471b69e6600b4e406dc0 C3L-03513 +ce360072-ec4b-40f7-a9b2-84b29c2e4f2b 425fc7d2-4577-4d39-800d-79ef56034a5a.FPKM.txt.gz 339993 ad03fd408e8b59dee471fdc21fb668d3 C3L-03513 +355669ca-2d77-4ae7-b851-58bfa30919f5 425fc7d2-4577-4d39-800d-79ef56034a5a.htseq_counts.txt.gz 260851 c81924a52f7d8ab3d0909b7e40e4309c C3L-03513 +10fd571c-1a46-4fa6-93fd-72e3d145af8f 53ee9ccb-3048-4123-9b1a-6f286e2ee069.rna_seq.chimeric.gdc_realn.bam 63101475 1d3bff362c80a0ab10c05f5a9644eb9f C3L-07032 
+3ebd283a-7cf2-44af-a41f-7f7960818c95 53ee9ccb-3048-4123-9b1a-6f286e2ee069.rna_seq.genomic.gdc_realn.bam 9496427474 ffb5add6a7a217aa1bd4a7beb517bee9 C3L-07032 +f3d72777-02e2-44f2-8ff8-05800bb49285 53ee9ccb-3048-4123-9b1a-6f286e2ee069.rna_seq.star_gene_counts.tsv.gz 410833 8c043a7fa571748cce7029ab03b9a4aa C3L-07032 +2751c0c0-df8f-41d5-bafe-6141793374bb 53ee9ccb-3048-4123-9b1a-6f286e2ee069.rna_seq.star_splice_junctions.tsv.gz 2792064 f93235d02d9cfd4e16cbaf50cc724a3c C3L-07032 +f5add029-957c-4b0b-8fdf-d616e81eb5d8 53ee9ccb-3048-4123-9b1a-6f286e2ee069.rna_seq.transcriptome.gdc_realn.bam 11805312153 b99cb1fb9ca354c76b41ed6ddf21c591 C3L-07032 +a402f516-0d34-4e5d-bf4f-5900285c1fd8 5604e9ee-e520-47b3-b783-e02dde552ea2.FPKM-UQ.txt.gz 417488 fc6dd882bd79f51bd39e336a4223bf4b C3L-07036 +194c7526-4933-47bf-989c-645c4cca69dc 5604e9ee-e520-47b3-b783-e02dde552ea2.FPKM.txt.gz 317345 493d583b5a167ceba0e939024b33bacf C3L-07036 +5ce85bc6-decd-4b05-9d77-8d0566303d9c 5604e9ee-e520-47b3-b783-e02dde552ea2.htseq_counts.txt.gz 248447 da6a8c9196288d274597164d3321e02d C3L-07036 +e7d89641-e35a-4485-9769-ab355ae6598b 5d446ce9-7572-43c4-9be0-defc29dd7d6b.rna_seq.chimeric.gdc_realn.bam 95588020 985c0855184b3f08fdc1c03f2687c73f C3L-07033 +cf8ec0e0-3302-42ab-8dc6-116c10eed835 5d446ce9-7572-43c4-9be0-defc29dd7d6b.rna_seq.genomic.gdc_realn.bam 10282231536 85432c491f60699a8c246b15f920e772 C3L-07033 +f6b024a9-e196-454b-ba09-66e757132afc 5d446ce9-7572-43c4-9be0-defc29dd7d6b.rna_seq.star_gene_counts.tsv.gz 421130 3b662d10030da24e83811538822cbe77 C3L-07033 +0b4b1c6e-9624-4f67-a240-c6fbc6d25cb4 5d446ce9-7572-43c4-9be0-defc29dd7d6b.rna_seq.star_splice_junctions.tsv.gz 2989806 18f904e5c795895b6eb948567e091135 C3L-07033 +1c86d34c-b942-4609-abb6-e1dc5fc813c8 5d446ce9-7572-43c4-9be0-defc29dd7d6b.rna_seq.transcriptome.gdc_realn.bam 10262751426 60c145ad2cba9f8c2039d76f0ed491dc C3L-07033 +47a68f6e-5682-4d02-b85e-950dd2560b1f 842aa67b-623e-4595-8019-bcc44b4cb203.rna_seq.chimeric.gdc_realn.bam 204655296 
aab1cc93c916ed1c60e537b0a3905200 C3L-03513 +f8f72d45-2530-4d0f-98be-4b1790ed139a 842aa67b-623e-4595-8019-bcc44b4cb203.rna_seq.genomic.gdc_realn.bam 8448899163 2c4e41b9fe6f5acbd8ba851c284f4646 C3L-03513 +85e96a99-02e6-452b-895f-0249ea43d7e0 842aa67b-623e-4595-8019-bcc44b4cb203.rna_seq.star_gene_counts.tsv.gz 429757 73a326218d812568eb9fecdbafddb426 C3L-03513 +40554ffe-dab0-4b6d-859f-1e2cf0a0e647 842aa67b-623e-4595-8019-bcc44b4cb203.rna_seq.star_splice_junctions.tsv.gz 3245314 4e4630aadb97a7761c1bd584342cb4ff C3L-03513 +89b55346-3b4e-4c79-9838-960b0b63dfea 842aa67b-623e-4595-8019-bcc44b4cb203.rna_seq.transcriptome.gdc_realn.bam 9971583430 067cbdc7504b92ecefa03fc4573619e9 C3L-03513 +972f155d-998a-4570-9957-a9f353e4ba85 af235ec2-8e46-4903-819d-967a8f000802.rna_seq.chimeric.gdc_realn.bam 65312248 222e880fafef3947612530ed52b6cc88 C3L-07035 +509accf5-0f92-4207-ba6b-62b27798db47 af235ec2-8e46-4903-819d-967a8f000802.rna_seq.genomic.gdc_realn.bam 9866815856 68788b05b4719badc827ceae0f6685d4 C3L-07035 +d52e4262-2c7c-45aa-92e8-66545097d732 af235ec2-8e46-4903-819d-967a8f000802.rna_seq.star_gene_counts.tsv.gz 413930 707f130cb33a50733186e756006dbe10 C3L-07035 +8a0bcdc9-fbcf-42d1-891c-9f257b936b9e af235ec2-8e46-4903-819d-967a8f000802.rna_seq.star_splice_junctions.tsv.gz 2966323 44efa5cc5c7b735eff12644f15041335 C3L-07035 +532a5520-6adc-4927-9263-0c1c4b0b4bad af235ec2-8e46-4903-819d-967a8f000802.rna_seq.transcriptome.gdc_realn.bam 10782705068 4b1f41c5454676bf803e1bb6867cdc4c C3L-07035 +fc53684f-fe59-428b-b5a7-cf28b42f0140 c56914a9-7024-46cd-b99d-26e452759db4.FPKM-UQ.txt.gz 425057 6d1eb09570dc63ac4634da2aae31f877 C3L-07034 +a003336d-aa42-4c2b-8a85-aba8dc2c38f5 c56914a9-7024-46cd-b99d-26e452759db4.FPKM.txt.gz 322190 72430e138d87930213e086c3bb38c934 C3L-07034 +5c498733-5a80-4693-a3d2-0fa02ab9a926 c56914a9-7024-46cd-b99d-26e452759db4.htseq_counts.txt.gz 250884 91abacef22be0ad08ef4a68e6636ecc5 C3L-07034 +3375a850-cd16-4b61-8738-7ea1b843bbf7 
c90a3c07-8b2f-4420-b756-8ee25517da9d.FPKM-UQ.txt.gz 439062 60d976880d4fa0df3e8277987a81c679 C3L-07037 +99410e52-4a60-409d-a1dd-d6edb7538a20 c90a3c07-8b2f-4420-b756-8ee25517da9d.FPKM.txt.gz 334131 8e6ee22da04214446825281c99727c2c C3L-07037 +f6e2c7ba-1884-456f-8b6e-f979b0ccc351 c90a3c07-8b2f-4420-b756-8ee25517da9d.htseq_counts.txt.gz 257860 9edefdb9d3eaf7dc0553133145bf6b04 C3L-07037 +2d59913a-5d89-4069-8539-f311fd9425bd db13341b-c318-48b4-b16b-b3463f0fb93f.rna_seq.chimeric.gdc_realn.bam 38992789 9df7e0aabbb5a4bf17dcc33b9a3ffe0d C3L-07036 +e029a913-df36-4400-b90d-9e325b35d70c db13341b-c318-48b4-b16b-b3463f0fb93f.rna_seq.genomic.gdc_realn.bam 7426531496 c29c1cbbb24ff10ef67a09cdb51cbcb5 C3L-07036 +983a4982-0616-4c13-99cd-e41206c00676 db13341b-c318-48b4-b16b-b3463f0fb93f.rna_seq.star_gene_counts.tsv.gz 395631 efb7c66aa65f4d14caa8c863cf101049 C3L-07036 +5f81f69b-d449-43e3-8487-08e8e9739af7 db13341b-c318-48b4-b16b-b3463f0fb93f.rna_seq.star_splice_junctions.tsv.gz 2574662 58c8a2efea83de6b12f15233cabc9c72 C3L-07036 +e34adf40-8b81-426d-b5fa-ac255001b8e0 db13341b-c318-48b4-b16b-b3463f0fb93f.rna_seq.transcriptome.gdc_realn.bam 8248662090 8a91584cb6a3e4fe5199ace61d80d09e C3L-07036 +cacfb28b-9687-4e7e-91ea-060f3b2c982c e220f05d-0551-4ea9-bd30-e1036ab49dfb.FPKM-UQ.txt.gz 444739 1d4bafb64e9890fa0fe5caee8c0f8fb8 C3L-07033 +dea87199-ff26-45ce-bda7-09c320c3f41b e220f05d-0551-4ea9-bd30-e1036ab49dfb.FPKM.txt.gz 335352 8ae429ede383540c4199c779ec6d75b8 C3L-07033 +495be1d4-5d28-4088-a96d-6626e071369a e220f05d-0551-4ea9-bd30-e1036ab49dfb.htseq_counts.txt.gz 258487 d0e35b6d2c55fc106c714ac868694efb C3L-07033 diff --git a/docs/Data/Release_Notes/Data_Release_Notes.md b/docs/Data/Release_Notes/Data_Release_Notes.md index 4b7eee745..42cf9e37b 100644 --- a/docs/Data/Release_Notes/Data_Release_Notes.md +++ b/docs/Data/Release_Notes/Data_Release_Notes.md @@ -2,6 +2,25 @@ | Version | Date | |---|---| +| [v29.0](Data_Release_Notes.md#data-release-290) | March 31, 2021 | +| 
[v28.0](Data_Release_Notes.md#data-release-280) | February 2, 2021 | +| [v27.0-fix](Data_Release_Notes.md#data-release-270-bug-fix) | November 9, 2020 | +| [v27.0](Data_Release_Notes.md#data-release-270) | October 29, 2020 | +| [v26.0](Data_Release_Notes.md#data-release-260) | September 8, 2020 | +| [v25.0](Data_Release_Notes.md#data-release-250) | July 22, 2020 | +| [v24.0](Data_Release_Notes.md#data-release-240) | May 7, 2020 | +| [v23.0](Data_Release_Notes.md#data-release-230) | April 7, 2020 | +| [v22.0](Data_Release_Notes.md#data-release-220) | January 16, 2020 | +| [v21.0](Data_Release_Notes.md#data-release-210) | December 10, 2019 | +| [v20.0](Data_Release_Notes.md#data-release-200) | November 11, 2019 | +| [v19.1](Data_Release_Notes.md#data-release-191) | November 6, 2019 | +| [v19.0](Data_Release_Notes.md#data-release-190) | September 17, 2019 | +| [v18.0](Data_Release_Notes.md#data-release-180) | July 8, 2019 | +| [v17.1](Data_Release_Notes.md#data-release-171) | June 12, 2019 | +| [v17.0](Data_Release_Notes.md#data-release-170) | June 5, 2019 | +| [v16.0](Data_Release_Notes.md#data-release-160) | March 26, 2019 | +| [v15.0](Data_Release_Notes.md#data-release-150) | February 20, 2019 | +| [v14.0](Data_Release_Notes.md#data-release-140) | December 18, 2018 | | [v13.0](Data_Release_Notes.md#data-release-130) | September 27, 2018 | | [v12.0](Data_Release_Notes.md#data-release-120) | June 13, 2018 | | [v11.0](Data_Release_Notes.md#data-release-110) | May 21, 2018 | @@ -17,6 +36,1452 @@ | [v2.0](Data_Release_Notes.md#data-release-20) | August 9, 2016 | | [v1.0](Data_Release_Notes.md#initial-data-release-10) | June 6, 2016 | +## Data Release 29.0 + +* __GDC Product__: Data +* __Release Date__: March 31, 2021 + +### New updates + +1. Count Me In Program + * Aliquot-level MAFs are now available for projects CMI-ASC, CMI-MBC, and CMI-MPC. + * Somatic mutations are now explorable for projects CMI-ASC, CMI-MBC, and CMI-MPC. +2. 
CPTAC Program + * CPTAC-2 open-access somatic mutations are now browsable through the GDC Exploration Portal. + * MSI data is now browsable through the faceted search for CPTAC-2 and CPTAC-3. +3. HCMI-CMDC - Data files and explorable mutations for 18 new cases are now available. + +A complete list of files for this release for the GDC Data Portal and the GDC Legacy Archive can be found below: + +* [gdc_manifest_20210331_data_release_29.0_active.tsv.gz](gdc_manifest_20210331_data_release_29.0_active.tsv.gz) +* [gdc_manifest_20210331_data_release_29.0_legacy.tsv.gz](gdc_manifest_20210331_data_release_29.0_legacy.tsv.gz) + +### Bugs Fixed Since Last Release + +* The aggregated and masked MAF files that were missing for seven pancreatic cases in CPTAC-3 have been restored to the data portal. +* The missing RNA-Seq data files for the seven normal pancreatic cases in CPTAC-3 have been restored to the data portal. + +### Known Issues and Workarounds + +* The Copy Number Estimate files in GENIE are labeled on the portal as TXT while the files are actually in TSV format. +* Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. 
However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. This reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCF files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). 
+ * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. 
offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + + ## Data Release 28.0 + + * __GDC Product__: Data + * __Release Date__: February 2, 2021 + + ### New updates + + 1. New Project: CMI-MPC - Count Me In - The Metastatic Prostate Cancer Project + * WXS alignments and variant calls (VCFs) are available. + 2. New Data Type: Single nuclei (snRNA-Seq) data is now available for 18 CPTAC-3 cases. See the [RNA-Seq](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#scrna-seq-pipeline) documentation for details. + 3. CPTAC-3 + * Data files for 147 new cases from the pancreatic cohort are now available. + * CPTAC-3 open-access somatic mutations are now browsable through the GDC Exploration Portal. + * RNA-Seq transcript fusion files are now available. + * Targeted Sequencing alignments and raw tumor-only variant calls (VCF) are now available. + 4. HCMI-CMDC + * Data files for 22 new cases are now available. + * The HCMI-CMDC open-access somatic mutations have been refreshed on the GDC Exploration Portal to reflect all newly released cases. 
+ + A complete list of files for this release for the GDC Data Portal and the GDC Legacy Archive can be found below: + + * [gdc_manifest_20210202_data_release_28.0_active.tsv.gz](gdc_manifest_20210202_data_release_28.0_active.tsv.gz) + * [gdc_manifest_20210202_data_release_28.0_legacy.tsv.gz](gdc_manifest_20210202_data_release_28.0_legacy.tsv.gz) + + ### Bugs Fixed Since Last Release + + * None + + ### Known Issues and Workarounds + + * The aggregated and masked MAF files for seven pancreatic cases in CPTAC-3 do not appear in the Data Portal. See below for download instructions. + - [This manifest](CPTAC-3_7Cases-WXS-MAFs_GDC-Manifest.txt) can be used to download the files. + - To download the raw aggregated MAF files, dbGaP access to CPTAC-3 (phs001287) is required. The masked MAF files are open-access. + - The seven cases are as follows: C3L-04027, C3L-04080, C3N-02585, C3N-02768, C3N-02971, C3N-03754, and C3N-03839. The case that each file is associated with is denoted in the manifest. + * The RNA-Seq data files for the seven normal pancreatic cases in CPTAC-3 do not appear in the Data Portal. See below for download instructions. + - [This manifest](CPTAC-3_7CasesRNASeq_GDC-Manifest.txt) can be used to download the files. + - To download the alignments or splice-junction files, dbGaP access to CPTAC-3 (phs001287) is required. The other gene expression files are open-access. + - The seven cases are as follows: C3L-03513, C3L-07032, C3L-07033, C3L-07034, C3L-07035, C3L-07036, C3L-07037. The case that each file is associated with is denoted in the manifest. + * The Copy Number Estimate files in GENIE are labeled on the portal as TXT while the files are actually in TSV format. + * Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. 
Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. + * Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release + * Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. + * Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. + * Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. + * BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. + * Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg + * TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. 
Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. The reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCFs files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated + * TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. 
For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. + * Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + + + +## Data Release 27.0 Bug Fix + +* __GDC Product__: Data +* __Release Date__: November 9, 2020 + +### New updates + +1. None, see bug fix section below. 
+ +A complete list of files for this release are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20201109_data_release_27.0_active.tsv.gz](gdc_manifest_20201109_data_release_27.0_active.tsv.gz) +* [gdc_manifest_20201109_data_release_27.0_legacy.tsv.gz](gdc_manifest_20201109_data_release_27.0_legacy.tsv.gz) + +### Bugs Fixed Since Last Release + +* Some files in projects CGCI-BLGSP, CGCI-HTMCP-CC, and HCMI-CMDC were marked on the portal as controlled-access, when they were supposed to be open-access. These are now downloadable as open-access files. + +### Known Issues and Workarounds + +* The Copy Number Estimate files in GENIE are labeled on the portal as TXT while the files are actually in TSV format. +* Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. 
Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. The reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCFs files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. 
+ * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. 
This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + +## Data Release 27.0 + +* __GDC Product__: Data +* __Release Date__: October 29, 2020 + +### New updates + +1. Initial release for the WGS variant calling pipeline. See the [documentation](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/#whole-genome-sequencing-variant-calling) on WGS variant calling for more details on the available files. This includes data from the following projects: + * CGCI-BLGSP + * CGCI-HTMCP-CC + * HCMI-CMDC +2. RNA-Seq transcript fusion files are available for the following projects: + * CGCI-BLGSP + * CGCI-HTMCP-CC + * HCMI-CMDC +3. Aliquot level MAFs were released for CGCI-HTMCP-CC Targeted Sequencing variants. Open access MAFs are included. +4. 17 new cases were released for the HCMI-CMDC project. This includes WGS, WXS, and RNA-Seq data. +5. WGS alignments were released for 99 TCGA-LUAD cases (196 files). +6. Therapeutic agents (treatment) and tumor stage (diagnosis) properties were migrated to remove deprecated values and better adhere to a standardized set of values. 
+ +A complete list of files for DR27.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20201029_data_release_27.0_active.tsv.gz](gdc_manifest_20201029_data_release_27.0_active.tsv.gz) +* [gdc_manifest_20201029_data_release_27.0_legacy.tsv.gz](gdc_manifest_20201029_data_release_27.0_legacy.tsv.gz) + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Some files in projects CGCI-BLGSP, CGCI-HTMCP-CC, and HCMI-CMDC are marked on the portal as controlled-access. These files are publicly downloadable using the Data Transfer Tool or API. All files from the following data types should be open-access within the previously specified projects: Biospecimen Supplement, Clinical Supplement, Gene Expression Quantification, Masked Somatic Mutation +* The Copy Number Estimate files in GENIE are labeled on the portal as TXT while the files are actually in TSV format. +* Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. 
Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. The reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCFs files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. 
+ * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. 
This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + +## Data Release 26.0 + +* __GDC Product__: Data +* __Release Date__: September 8, 2020 + +### New updates + +1. New program released: + * Count Me In (CMI) + * CMI-ASC - The Angiosarcoma Project + * RNA-Seq + * WXS + * CMI-MBC - The Metastatic Breast Cancer Project + * RNA-Seq + * WXS +2. Somatic mutations are now available on the exploration portal for the following projects: + * MMRF-COMMPASS + * TARGET-ALL-P3 + * TARGET-AML + * TARGET-NBL + * TARGET-WT +3. Primary sites and disease types were updated for multiple projects to correspond to GDC Dictionary updates. + +A complete list of files for DR26.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20200908_data_release_26.0_active.tsv.gz](gdc_manifest_20200908_data_release_26.0_active.tsv.gz) +* [gdc_manifest_20200908_data_release_26.0_legacy.tsv.gz](gdc_manifest_20200908_data_release_26.0_legacy.tsv.gz) + +### Bugs Fixed Since Last Release + +* The CPTAC-3 head and neck cohort can now be queried by choosing the head and neck anatomic site on the GDC home page. + +### Known Issues and Workarounds + +* The Copy Number Estimate files in GENIE are labeled on the portal as TXT while the files are actually in TSV format. +* Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. 
Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. 
Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. The reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCFs files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. 
For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + + ## Data Release 25.0 + + * __GDC Product__: Data + * __Release Date__: July 22, 2020 + + ### New updates + + 1. New data types released: + * RNA-Seq Transcript Fusion files were released for the following projects: + * TARGET-ALL-P1 + * TARGET-ALL-P2 + * TARGET-ALL-P3 + * TARGET-CCSK + * TARGET-NBL + * TARGET-OS + * TARGET-RT + * TARGET-WT + * The msi_status and msi_score properties can be queried on the GDC Portal for the CPTAC-3 project. 
+ * To query for these fields: go to the [GDC Repository](https://portal.gdc.cancer.gov/repository), click on "Add a File Filter" at the top left of the screen, type msi_score or msi_status in the field, and click on "msi_score" or "msi_status". This should bring up the corresponding filters to use on the portal. + 2. 108 cases from the CPTAC-3 LSCC Cohort were released. Includes the following data types: + * WXS + * WGS + * RNA-Seq + * miRNA-Seq + 3. Aliquot level MAFs were released for MMRF-COMMPASS WXS variants. Open access MAFs are included. + 4. HCMI-CMDC open-access somatic mutations were released to the [Exploration Portal](https://portal.gdc.cancer.gov/exploration). + + A complete list of files for DR25.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + + * [gdc_manifest_20200722_data_release_25.0_active.tsv.gz](gdc_manifest_20200722_data_release_25.0_active.tsv.gz) + * [gdc_manifest_20200722_data_release_25.0_legacy.tsv.gz](gdc_manifest_20200722_data_release_25.0_legacy.tsv.gz) + + ### Bugs Fixed Since Last Release + + * A few supplements from CGCI-BLGSP are now associated with their correct versions. + + ### Known Issues and Workarounds + + * Currently the CPTAC-3 HNSCC cohort does not appear when the "Head and Neck" primary site is selected from the GDC home page. This cohort can be queried by clicking [here](https://portal.gdc.cancer.gov/repository?facetTab=cases&filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22cases.primary_site%22%2C%22value%22%3A%5B%22other%20and%20ill-defined%20sites%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22cases.project.project_id%22%2C%22value%22%3A%5B%22CPTAC-3%22%5D%7D%7D%5D%7D) + * The Copy Number Estimate files in GENIE are labeled on the portal as TXT while the files are actually in TSV format. 
+ * Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. + * Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release + * Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. + * Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. + * Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. + * BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. + * Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg + * TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. 
Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. The reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCFs files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated + * TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. 
For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. + * Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + + +## Data Release 24.0 + +* __GDC Product__: Data +* __Release Date__: May 7, 2020 + +### New updates + +1. New project released: CGCI-HTMCP-CC - HIV+ Tumor Molecular Characterization Project - Cervical Cancer + * RNA-Seq: Alignments and gene expression levels + * miRNA-Seq: Alignments and miRNA expression levels + * WGS: Alignments + * Targeted Sequencing: Alignments + +2. 110 new cases were released from the HNSCC cohort of CPTAC-3. This includes WXS, WGS, RNA-Seq and miRNA-Seq data. + +3. 
Aliquot-level WXS MAFs are now available from the following projects: + * CPTAC-2 + * CPTAC-3 + +A complete list of files for DR24.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20200507_data_release_24.0_active.tsv.gz](gdc_manifest_20200507_data_release_24.0_active.tsv.gz) +* [gdc_manifest_20200507_data_release_24.0_legacy.tsv.gz](gdc_manifest_20200507_data_release_24.0_legacy.tsv.gz) + + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Currently the CPTAC-3 HNSCC cohort does not appear when the "Head and Neck" primary site is selected from the GDC home page. This cohort can be queried by clicking [here](https://portal.gdc.cancer.gov/repository?facetTab=cases&filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22cases.primary_site%22%2C%22value%22%3A%5B%22other%20and%20ill-defined%20sites%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22cases.project.project_id%22%2C%22value%22%3A%5B%22CPTAC-3%22%5D%7D%7D%5D%7D) +* The Copy Number Estimate files in GENIE are labeled on the portal as TXT while the files are actually in TSV format. +* Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 
361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. This reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCF files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json).
+ * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. 
offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. + * Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + +## Data Release 23.0 + +* __GDC Product__: Data +* __Release Date__: April 7, 2020 + +### New updates + +1. New data types released: + * Aliquot-level MAFs: MAF Files with mutations derived from one tumor/normal pair + * HCMI-CMDC + * TARGET-ALL-P2 + * TARGET-ALL-P3 + * TARGET-AML + * TARGET-NBL + * TARGET-OS + * TARGET-WT + * Note: Previously released TARGET project level MAFs can be downloaded with the following manifest: [TARGET_Project-Level-MAF_GDC-Manifest.txt](TARGET_Project-Level-MAF_GDC-Manifest.txt) + * Copy number segment and estimate files from SNP6 ASCAT + * All TCGA Projects + * TARGET-ALL-P2 + * TARGET-AML + +2. To accommodate users who prefer to use project-level MAFs, a MAF aggregation tool was developed by the GDC: + * [GitHub Release](https://github.com/NCI-GDC/gdc-maf-tool/releases) + +3. New RNA-Seq data was released from HCMI-CMDC for nine additional cases. + +4.
Clinical updates were performed for the following projects + * CGCI-BLGSP + * HCMI-CMDC + * WCDT-MCRPC + +Complete lists of files for DR23.0 for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20200407_data_release_23.0_active.tsv.gz](gdc_manifest_20200407_data_release_23.0_active.tsv.gz) +* [gdc_manifest_20200407_data_release_23.0_legacy.tsv.gz](gdc_manifest_20200407_data_release_23.0_legacy.tsv.gz) + + +### Bugs Fixed Since Last Release + +* The 6 HCMI-CMDC cases without clinical data now have clinical data. +* Most of the "associated_entities" fields in CGCI-BLGSP were not populated correctly; this has been resolved. + +### Known Issues and Workarounds + +* The Copy Number Estimate files in GENIE are labeled on the portal as TXT while the files are actually in TSV format. +* Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers.
+* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. The reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCFs files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. 
+ * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. 
This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + +## Data Release 22.0 + +* __GDC Product__: Data +* __Release Date__: January 16, 2020 + +### New updates + +1. New projects released: + * WCDT-MCRPC - Genomic Characterization of Metastatic Castration Resistant Prostate Cancer (phs001648) + * RNA-Seq; WGS Data +2. New data from HCMI-CMDC + * 16 New Cases + * Includes WXS, WGS, and RNA-Seq data +3. New data from CPTAC-3 + * 108 New Cases + * Includes WXS, WGS, and RNA-Seq data + * miRNA-Seq data for currently released cases + +A complete list of files for DR22.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20200116_data_release_22.0_active.tsv.gz](gdc_manifest_20200116_data_release_22.0_active.tsv.gz) +* [gdc_manifest_20200116_data_release_22.0_legacy.tsv.gz](gdc_manifest_20200116_data_release_22.0_legacy.tsv.gz) + + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* The Copy Number Estimate files in GENIE are labeled on the portal as TXT while the files are actually in TSV format. +* 6 of the HCMI-CMDC cases are missing clinical nodes + * HCM-CSHL-0060-C18 + * HCM-CSHL-0089-C25 + * HCM-CSHL-0090-C25 + * HCM-CSHL-0092-C25 + * HCM-CSHL-0091-C25 + * HCM-CSHL-0057-C18 +* Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. 
Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. 
Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. The reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCFs files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. 
For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + +## Data Release 21.0 + +* __GDC Product__: Data +* __Release Date__: December 10, 2019 + +### New updates + +1. 
New projects released: + * GENIE - AACR Project Genomics Evidence Neoplasia Information Exchange (phs001337) + * Includes Targeted Sequencing, Transcript Fusion, Copy Number Estimate from GENIE 5.0 + * AACR Project GENIE is divided by sequencing center: + * GENIE-MSK + * GENIE-DFCI + * GENIE-MDA + * GENIE-JHU + * GENIE-UHN + * GENIE-VICC + * GENIE-GRCC + * GENIE-NKI + +A complete list of files for DR21.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20191210_data_release_21.0_active.txt.gz](gdc_manifest_20191210_data_release_21.0_active.txt.gz) +* [gdc_manifest_20191210_data_release_21.0_legacy.txt.gz](gdc_manifest_20191210_data_release_21.0_legacy.txt.gz) + + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* The Copy Number Estimate files in GENIE are labeled on the portal as TXT while the files are actually in TSV format. +* Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. 
+* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. This reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCF files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub.
+ * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. 
This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + +## Data Release 20.0 + +* __GDC Product__: Data +* __Release Date__: November 11, 2019 + +### New updates + +1. New projects released: + * CPTAC-2 - CPTAC Proteogenomic Confirmatory Study (phs000892) + * Includes WXS, RNA-Seq, and miRNA-Seq + * OHSU-CNL - Genomic landscape of Neutrophilic Leukemias of Ambiguous Diagnosis (phs001799) + * Includes WXS and RNA-Seq + * No VCF files will be included at this time. They will follow in a later release. +2. New TARGET data released + * TARGET-OS: WGS, WXS + * TARGET-NBL: WGS + * TARGET-AML: miRNA +3. CGCI-BLGSP miRNA-Seq released + + +A complete list of files for DR20.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + + +* [gdc_manifest_20191111_data_release_20.0_active.txt.gz](gdc_manifest_20191111_data_release_20.0_active.txt.gz) +* [gdc_manifest_20191111_data_release_20.0_legacy.txt.gz](gdc_manifest_20191111_data_release_20.0_legacy.txt.gz) + + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. 
This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. This reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available.
These VCF files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file.
Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + + + + +## Data Release 19.1 + +* __GDC Product__: Data +* __Release Date__: November 6, 2019 + +### New updates + +* The following cases are no longer available in the GDC Data Portal. They had no data files associated with them in DR 19 so there are no changes in file availability in this release. + * TARGET-00-NAAENF + * TARGET-00-NAAENG + * TARGET-00-NAAENH + * TARGET-00-NAAENI + * TARGET-00-NAAENJ + * TARGET-00-NAAENK + * TARGET-00-NAAENL + * TARGET-00-NAAENM + * TARGET-00-NAAENN + * TARGET-00-NAAENP + * TARGET-00-NAAENR + * TARGET-00-NAAEPE + +A complete list of files for DR19.1 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20190917_data_release_19.0_active.txt.gz](gdc_manifest_20190917_data_release_19.0_active.txt.gz) +* [gdc_manifest_20190917_data_release_19.0_legacy.txt.gz](gdc_manifest_20190917_data_release_19.0_legacy.txt.gz) + + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. 
Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. 
Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. This reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCF files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC.
For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + + +## Data Release 19.0 + +* __GDC Product__: Data +* __Release Date__: September 17, 2019 + +### New updates + +1. New projects released: + * BEATAML1.0-COHORT - Functional Genomic Landscape of Acute Myeloid Leukemia (phs001657) + * Includes WXS and RNA-Seq +2. New TARGET data released + * TARGET-ALL-P1 RNA-Seq + * TARGET-ALL-P2 RNA-Seq, WXS, and miRNA-Seq + * TARGET-ALL-P3 miRNA-Seq + * TARGET-AML WXS, WGS, and miRNA-Seq + * TARGET-NBL WXS and RNA-Seq + * TARGET-RT WGS and RNA-Seq + * TARGET-WT WGS, WXS, and RNA-Seq +3. Additional CGCI-BLGSP WGS data released +4. 
Pindel VCFs released for TARGET-ALL-P2, TARGET-ALL-P3, TARGET-AML, TARGET-NBL, TARGET-WT, MMRF-COMMPASS, HCMI-CMDC, and CPTAC-3 +5. Disease-specific staging properties for many projects were released + + +A complete list of files for DR19.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20190917_data_release_19.0_active.txt.gz](gdc_manifest_20190917_data_release_19.0_active.txt.gz) +* [gdc_manifest_20190917_data_release_19.0_legacy.txt.gz](gdc_manifest_20190917_data_release_19.0_legacy.txt.gz) + + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. 
+* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. This reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCF files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub.
+ * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. 
This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + + +## Data Release 18.0 + +* __GDC Product__: Data +* __Release Date__: July 8, 2019 + +### New updates + +1. New Projects released + * MMRF-COMMPASS - Multiple Myeloma CoMMpass Study (phs000748) + * Includes WGS, WXS, and RNA-Seq + * ORGANOID-PANCREATIC - Pancreas Cancer Organoid Profiling (phs001611) + * Includes WGS, WXS, and RNA-Seq + * TARGET-ALL-P1 - Acute Lymphoblastic Leukemia - Phase I (phs000218) + * Includes WGS + * TARGET-ALL-P2 - Acute Lymphoblastic Leukemia - Phase II (phs000218) + * Includes WGS + * CGCI-BLGSP - Burkitt Lymphoma Genome Sequencing Project (phs000235) + * Includes WGS and RNA-Seq +2. New versions of RNA-Seq data for TARGET-ALL-P3 +3. New RNA-Seq data for TARGET-CCSK +4. New RNA-Seq data for TARGET-OS + + +A complete list of files for DR18.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20190708_data_release_18.0_active.txt.gz](gdc_manifest_20190708_data_release_18.0_active.txt.gz) +* [gdc_manifest_20190708_data_release_18.0_legacy.txt.gz](gdc_manifest_20190708_data_release_18.0_legacy.txt.gz) + + +### Bugs Fixed Since Last Release + +* New versions of RNA-Seq data for TARGET-ALL-P3 resolve issue with missing reads from BAM files. + +### Known Issues and Workarounds + +* Some tumor-only annotated VCFs (not raw VCFs) could have a small proportion of variants that appear twice. 
Tumor-only annotated VCFs can be identified by searching for workflow "GATK4 MuTect2 Annotation" +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. 
Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. This reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCF files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC.
For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file.* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + + +## Data Release 17.1 + +* __GDC Product__: Data +* __Release Date__: June 12, 2019 + +### New updates + +1. Rebuilt indices for NCICCR-DLBCL and CTSP-DLBCL1. Fewer files viewable in GDC Data Portal or API. 
+ +A complete list of files for DR17.1 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20190612_data_release_17.1_active.txt.gz](gdc_manifest_20190612_data_release_17.1_active.txt.gz) +* [gdc_manifest_20190612_data_release_17.1_legacy.txt.gz](gdc_manifest_20190612_data_release_17.1_legacy.txt.gz) + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + + +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. 
+* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. This reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCF files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET ALL-P3 RNA-Seq results from DR14 are missing ~18% of reads. Downsampling appears to be completely random and count files have a very high correlation (>99.99%) with complete data. New versions of these files will be created that include the entire set of reads. + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub.
+ * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. 
+* Issues in the Legacy Archive + * Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + + + +## Data Release 17.0 + +* __GDC Product__: Data +* __Release Date__: June 5, 2019 + +### New updates + +1. New Projects released + * HCMI-CMDC - NCI Cancer Model Development for the Human Cancer Model Initiative (HCMI) (phs001486) + * BEATAML1.0-CRENOLANIB - Clinical Resistance to Crenolanib in Acute Myeloid Leukemia Due to Diverse Molecular Mechanisms (phs001628) +2. RNA-Seq data for NCICCR-DLBCL and CTSP-DLBCL1 are released +3. ATAC-Seq data for TCGA projects are released +4. CPTAC-3 RNA-Seq data are released +5. Clinical data updates for TCGA - to see parser code updates review [API v1.20 release notes](https://docs.gdc.cancer.gov/API/Release_Notes/API_Release_Notes/#v1200) +6. Clinical data updates for other projects to accommodate migration of vital_status, days_to_birth, and days_to_death from the Diagnosis to the Demographic node + +A complete list of files for DR17.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20190605_data_release_17.0_active.txt.gz](gdc_manifest_20190605_data_release_17.0_active.txt.gz) +* [gdc_manifest_20190605_data_release_17.0_legacy.txt.gz](gdc_manifest_20190605_data_release_17.0_legacy.txt.gz). + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + + +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. 
This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* TCGA Projects + * Incorrect information about treatment may be included for patients within TCGA-HNSC and TCGA-LGG. Please refer to the clinical XML for accurate information on treatment + * 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. This reflects how these original samples were handled. + * Two tissue slide images are unavailable for download from GDC Data Portal + * The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available.
These VCFs files will be replaced in a later release. + * Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). + * Tumor grade property is not populated + * Progression_or_recurrence property is not populated +* TARGET projects + * TARGET ALL-P3 RNA-Seq results from DR14 are missing ~18% of reads. Downsampling appears to be completely random and count files have a very high correlation (>99.99%) with complete data. New versions of these files will be created that include the entire set of reads. + * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D + * 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal + * There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank + * There are two cases with identical submitter_id `TARGET-10-PARUYU` + * Some TARGET cases are missing `days_to_last_follow_up` + * Some TARGET cases are missing `age_at_diagnosis` + * Some TARGET files are not connected to all related aliquots + * Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) + * The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). 
Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data + * Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. + * No data from TARGET-MDLS is available. +* Issues in the Legacy Archive + * Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available + * SDF Files are not linked to Project or Case in the Legacy Archive + * Two biotab files are not linked to Project or Case in the Legacy Archive + * SDRF files are not linked to Project or Case in the Legacy Archive + * TARGET-MDLS cases do not have disease_type or primary_site populated + + + + + +## Data Release 16.0 + +* __GDC Product__: Data +* __Release Date__: March 26, 2019 + +### New updates + +1. The CPTAC-3 project (phs001287) is released with WXS and WGS data. RNA-Seq will be released at a later date. Additional project details can be found on the [CPTAC Data Source page](https://gdc.cancer.gov/about-gdc/contributed-genomic-data-cancer-research/clinical-proteomic-tumor-analysis-consortium-cptac). +2. TARGET-ALL-P3 (phs000218) WGS BAM files are released. +3. VAREPOP-APOLLO (phs001374) VCF files are released. + +A complete list of files for DR16.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20190326_data_release_16.0_active.txt.gz](gdc_manifest_20190326_data_release_16.0_active.txt.gz) +* [gdc_manifest_20190326_data_release_16.0_legacy.txt.gz](gdc_manifest_20190326_data_release_16.0_legacy.txt.gz). + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* TARGET ALL-P3 RNA-Seq results from DR14 are missing ~18% of reads.
Downsampling appears to be completely random and count files have a very high correlation (>99.99%) with complete data. New versions of these files will be created that include the entire set of reads. +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. This reflects how these original samples were handled. +* 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal +* Two tissue slide images are unavailable for download from GDC Data Portal +* The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCF files will be replaced in a later release.
+ +* There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank +* There are two cases with identical submitter_id `TARGET-10-PARUYU` +* TARGET-MDLS cases do not have disease_type or primary_site populated +* Some TARGET cases are missing `days_to_last_follow_up` +* Some TARGET cases are missing `age_at_diagnosis` +* Some TARGET files are not connected to all related aliquots +* Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data +* Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. +* Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. 
Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* No data from TARGET-MDLS is available. +* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available +* SDF Files are not linked to Project or Case in the Legacy Archive +* Two biotab files are not linked to Project or Case in the Legacy Archive +* SDRF files are not linked to Project or Case in the Legacy Archive +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* Tumor grade property is not populated +* Progression_or_recurrence property is not populated + + + + +## Data Release 15.0 + +* __GDC Product__: Data +* __Release Date__: February 20, 2019 + +### New updates + +1. TARGET-ALL-P3 is now available and includes RNA-Seq and WXS data. +2. New RNA-Seq workflow is now being utilized for new projects. More details can be found in the [RNA-Seq pipeline documentation](../../Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#rna-seq-alignment-workflow). +3. New tumor only variant calling pipeline is now being utilized for new projects. More details can be found in the [Tumor only pipeline documentation](../../Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/#tumor-only-variant-calling-workflow). 
+ + +A complete list of files for DR15.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20190220_data_release_15.0_active.txt.gz](gdc_manifest_20190220_data_release_15.0_active.txt.gz) +* [gdc_manifest_20190220_data_release_15.0_legacy.txt.gz](gdc_manifest_20190220_data_release_15.0_legacy.txt.gz). + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* The read alignment end coordinates in the x.isoform.quantification.txt files produced by the miRNA pipeline are exclusive (i.e. offset by 1) for all TCGA miRNA legacy (GRCh37/hg19) and current harmonized (GRCh38/hg38) miRNA data. This error has no impact on miRNA alignment or quantification - only the coordinates reported in the quantification file. +* TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. This reflects how these original samples were handled. +* 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal +* Two tissue slide images are unavailable for download from GDC Data Portal +* The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCF files will be replaced in a later release.
+ +* There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank +* There are two cases with identical submitter_id `TARGET-10-PARUYU` +* TARGET-MDLS cases do not have disease_type or primary_site populated +* Some TARGET cases are missing `days_to_last_follow_up` +* Some TARGET cases are missing `age_at_diagnosis` +* Some TARGET files are not connected to all related aliquots +* Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data +* Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. +* Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. 
Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* No data from TARGET-MDLS is available. +* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available +* SDF Files are not linked to Project or Case in the Legacy Archive +* Two biotab files are not linked to Project or Case in the Legacy Archive +* SDRF files are not linked to Project or Case in the Legacy Archive +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* Tumor grade property is not populated +* Progression_or_recurrence property is not populated + + +## Data Release 14.0 + +* __GDC Product__: Data +* __Release Date__: December 18, 2018 + +### New updates + +1. Copy Number Variation (CNV) data derived from GISTIC2 results are now available for download for TCGA projects +2. New miRNA data available for 181 aliquots for TARGET and TCGA +3. Released two SNP6 files (6cd4ef5e-324a-4ace-8779-7a33bd559c83, dfa89ee9-6ee5-460b-bd58-b5ca0e9cb7ac) +4. New versions of TCGA biospecimen supplements are available +5. Updated primary site for `TCGA-AG-3881` to `Unknown` +6. 
8 New Harmonized WGS BAM files for TARGET-WT, TARGET-NBL, TARGET-AML added to the portal + +A complete list of files for DR14.0 are listed for the GDC Data Portal and the GDC Legacy Archive are found below: + +* [gdc_manifest_20181218_data_release_14.0_active.txt.gz](gdc_manifest_20181218_data_release_14.0_active.txt.gz) +* [gdc_manifest_20181218_data_release_14.0_legacy.txt.gz](gdc_manifest_20181218_data_release_14.0_legacy.txt.gz). + + +### Bugs Fixed Since Last Release + +* FM-AD clinical and biospecimen supplements are now correctly labeled as TSV rather than XLSX + +### Known Issues and Workarounds + +* TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. + * TARGET-20-PASJGZ-04A-02D + * TARGET-30-PAPTLY-01A-01D + * TARGET-20-PAEIKD-09A-01D + * TARGET-20-PASMYS-14A-02D + * TARGET-20-PAMYAS-14A-02D + * TARGET-10-PAPZST-09A-01D +* Some miRNA files with QC failed reads were not swapped in DR11.0. 361 aliquots remain to be swapped in a later release +* 74 Diagnostic TCGA slides are attached to a portion rather than a sample like the rest of the diagnostic slides. This reflects how these original samples were handled. +* 11 bam files for TARGET-NBL RNA-Seq are not available in the GDC Data portal +* Two tissue slide images are unavailable for download from GDC Data Portal +* The raw and annotated VarScan VCF files for aliquot `TCGA-VR-A8ET-01A-11D-A403-09` are not available. These VCF files will be replaced in a later release.
+ +* There are 5051 TARGET files for which `experimental_strategy`, `data_format`, `platform`, and `data_subtype` are blank +* There are two cases with identical submitter_id `TARGET-10-PARUYU` +* TARGET-MDLS cases do not have disease_type or primary_site populated +* Some TARGET cases are missing `days_to_last_follow_up` +* Some TARGET cases are missing `age_at_diagnosis` +* Some TARGET files are not connected to all related aliquots +* Samples of TARGET sample_type `Recurrent Blood Derived Cancer - Bone Marrow` are mislabeled as `Recurrent Blood Derived Cancer - Peripheral Blood`. A workaround is to look at the sample barcode, which is -04 for `Recurrent Blood Derived Cancer - Bone Marrow`. (e.g. `TARGET-20-PAMYAS-04A-03R`) +* Mutation frequency may be underestimated when using MAF files for genes that overlap other genes. This is because MAF files only record one gene per variant. +* Most intronic mutations are removed for MAF generation. However, validated variants may rescue these in some cases. Therefore intronic mutations in MAF files are not representative of those called by mutation callers. +* The latest TARGET data is not yet available at the GDC. For the complete and latest data, please see the [TARGET Data Matrix](https://ocg.cancer.gov/programs/target/data-matrix). Data that is not present or is not the most up to date includes: + * All microarray data and metadata + * All sequencing analyzed data and metadata + * 1180 of 12063 sequencing runs of raw data +* Demographic information for some TARGET patients is incorrect. The correct information can be found in the associated clinical supplement file. Impacted patients are TARGET-50-PAJNUS. +* Some TCGA annotations are unavailable in the Legacy Archive or Data Portal. These annotations can be found [here](tcga-annotations-unavailable-20170315.json). +* Public MAF files for different variant calling pipelines but the same project may contain different numbers of samples. 
Samples are omitted from the public MAF files if they have no PASS variants, which can lead to this apparent discrepancy. +* BAM files produced by the GDC RNA-Seq Alignment workflow will currently fail validation using the Picard ValidateSamFiles tool. This is caused by STAR2 not recording mate mapping information for unmapped reads, which are retained in our BAM files. Importantly, all affected BAM files are known to behave normally in downstream workflows including expression quantification. +* No data from TARGET-MDLS is available. +* Slide barcodes (`submitter_id` values for Slide entities in the Legacy Archive) are not available +* SDF Files are not linked to Project or Case in the Legacy Archive +* Two biotab files are not linked to Project or Case in the Legacy Archive +* SDRF files are not linked to Project or Case in the Legacy Archive +* Portion "weight" property is incorrectly described in the Data Dictionary as the weight of the patient in kg, should be described as the weight of the portion in mg +* Tumor grade property is not populated +* Progression_or_recurrence property is not populated + + ## Data Release 13.0 @@ -47,7 +1512,7 @@ A complete list of files for DR13.0 are listed for the GDC Data Portal and the G * 506 Copy Number Segment and 36 Slide Image files are designated as controlled-access on the GDC Data Portal. These files are actually open-access and will be downloadable without a token using [this manifest](gdc_manifest_20181003_data_release_13.0_cnv_slides.txt). * 2 Copy Number Segment files from TCGA-TGCT do not appear on the GDC Portal. They can be downloaded using the Data Transfer Tool using the following UUIDs. 
* 6cd4ef5e-324a-4ace-8779-7a33bd559c83 - RAMPS_p_TCGA_Batch_430_NSP_GenomeWideSNP_6_E07_1538238.nocnv_grch38.seg.v2.txt - * dfa89ee9-6ee5-460b-bd58-b5ca0e9cb7ac - RAMPS_p_TCGA_Batch_430_NSP_GenomeWideSNP_6_E07_1538238.grch38.seg.v2.txt + * dfa89ee9-6ee5-460b-bd58-b5ca0e9cb7ac - RAMPS_p_TCGA_Batch_430_NSP_GenomeWideSNP_6_E07_1538238.grch38.seg.v2.txt * TARGET CGI BAMs in the Legacy Archive for the following aliquots should not be used because they were not repaired and concatenated into their original composite BAM files by CGHub. * TARGET-20-PASJGZ-04A-02D * TARGET-30-PAPTLY-01A-01D diff --git a/docs/Data/Release_Notes/TARGET_Project-Level-MAF_GDC-Manifest.txt b/docs/Data/Release_Notes/TARGET_Project-Level-MAF_GDC-Manifest.txt new file mode 100644 index 000000000..161242a7c --- /dev/null +++ b/docs/Data/Release_Notes/TARGET_Project-Level-MAF_GDC-Manifest.txt @@ -0,0 +1,13 @@ +id filename md5 size +1183ffba-db7d-4500-a6d3-ec0d6a2743f8 TARGET.NBL.somaticsniper.adb9b34a-1569-460b-bf4f-445f2cae8aa0.DR-10.0.protected.maf.gz b42bd212667c20ed60d5b397ce3be636 56900458 +1bb203d5-5d1e-4d06-ab5c-1a6268b6a165 TARGET.WT.muse.b6d4ff24-d9c1-4d4b-bb15-1e7906bff012.DR-10.0.protected.maf.gz f8feca05891bb5f8fe31e6f32ad4bb2f 3515260 +2b8a7c04-e4a6-4b1f-b72e-73edc3c6aaad TARGET.NBL.mutect.e340b302-73f0-4bc9-827a-e599f812a764.DR-10.0.protected.maf.gz cfe0c4ee8d2912d45f05d621c52de57f 322809850 +33d2037d-2d4f-452b-98f6-e578a72ebb16 TARGET.WT.varscan.8c8bd4e8-277a-44aa-896b-19402b8224c4.DR-10.0.protected.maf.gz c8d8ec9edf0ede7b4a0a59b20d672554 8339966 +53c3eed7-7f14-4898-af82-0ee85f5f5c5e TARGET.NBL.muse.8ed4474e-b5c2-4dfb-8ee8-fd943e4d5d19.DR-10.0.protected.maf.gz 2954acf704d4fda74b50742224c97427 13277550 +6e4e70b7-7367-4844-84fe-d494557923b8 TARGET.AML.somaticsniper.ad0b04dd-9cae-45c6-8775-5ea631ab4639.DR-10.0.protected.maf.gz e43bcb4385fc5ce198c0c804c3adb1a4 11832501 +7295b8e0-207d-44d2-acb5-41a57b975e89 TARGET.NBL.varscan.e93e8709-c5a7-425f-b5aa-dd2dcc229241.DR-10.0.protected.maf.gz 
9c41666ed86236c603377ff994df1d29 57899036 +c7d14f2b-55fc-4b95-968d-2b7a357111c2 TARGET.WT.mutect.77a8f02b-4602-409b-bbfa-b960d93310f6.DR-10.0.protected.maf.gz e4dfde638423b849992ab5c68c07df23 61232854 +ce31f4c0-bbca-41d2-ae5c-1fe04a2cbc54 TARGET.WT.somaticsniper.b613163f-fdb6-456c-9b11-e6087dddda20.DR-10.0.protected.maf.gz 846fda34f38cef45670b0a8dbf3df99d 22063446 +ce5cb21e-8432-4f41-ac2c-efa01400d574 TARGET.AML.mutect.a058a35a-2382-46bd-9a7b-bcb72bd70473.DR-10.0.protected.maf.gz 1b5082d499d0cd1fcf7aada4be8a718b 34937119 +d29f1c96-602a-4734-becd-f0a3a65fef49 TARGET.AML.muse.5c543196-00d8-4ef2-9b34-f89d7c9be518.DR-10.0.protected.maf.gz 0c621de9097c01fcc5dba0bfa041e423 3079094 +d3d844d9-e533-42ed-b9e0-ab95b69dcc10 TARGET.AML.varscan.6205d922-205d-45fa-84c9-df2529fffde2.DR-10.0.protected.maf.gz f29666ef72b78029a0f2bf2acc1d1245 2788928 diff --git a/docs/Data/Release_Notes/gdc_manifest_20181218_data_release_14.0_active.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20181218_data_release_14.0_active.txt.gz new file mode 100644 index 000000000..93617898d Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20181218_data_release_14.0_active.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20181218_data_release_14.0_legacy.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20181218_data_release_14.0_legacy.txt.gz new file mode 100644 index 000000000..f6968958b Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20181218_data_release_14.0_legacy.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20190220_data_release_15.0_active.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20190220_data_release_15.0_active.txt.gz new file mode 100644 index 000000000..80d7a0699 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20190220_data_release_15.0_active.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20190220_data_release_15.0_legacy.txt.gz 
b/docs/Data/Release_Notes/gdc_manifest_20190220_data_release_15.0_legacy.txt.gz new file mode 100644 index 000000000..34d61a005 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20190220_data_release_15.0_legacy.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20190326_data_release_16.0_active.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20190326_data_release_16.0_active.txt.gz new file mode 100644 index 000000000..fd2d8e61e Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20190326_data_release_16.0_active.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20190326_data_release_16.0_legacy.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20190326_data_release_16.0_legacy.txt.gz new file mode 100644 index 000000000..ba037dcf5 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20190326_data_release_16.0_legacy.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20190605_data_release_17.0_active.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20190605_data_release_17.0_active.txt.gz new file mode 100644 index 000000000..58f702540 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20190605_data_release_17.0_active.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20190605_data_release_17.0_legacy.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20190605_data_release_17.0_legacy.txt.gz new file mode 100644 index 000000000..9defa2437 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20190605_data_release_17.0_legacy.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20190612_data_release_17.1_active.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20190612_data_release_17.1_active.txt.gz new file mode 100644 index 000000000..4ec817dad Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20190612_data_release_17.1_active.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20190612_data_release_17.1_legacy.txt.gz 
b/docs/Data/Release_Notes/gdc_manifest_20190612_data_release_17.1_legacy.txt.gz new file mode 100644 index 000000000..71b47ecd1 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20190612_data_release_17.1_legacy.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20190708_data_release_18.0_active.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20190708_data_release_18.0_active.txt.gz new file mode 100644 index 000000000..f50f8f810 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20190708_data_release_18.0_active.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20190708_data_release_18.0_legacy.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20190708_data_release_18.0_legacy.txt.gz new file mode 100644 index 000000000..4660ff4da Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20190708_data_release_18.0_legacy.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20190917_data_release_19.0_active.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20190917_data_release_19.0_active.txt.gz new file mode 100644 index 000000000..19ea84cb8 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20190917_data_release_19.0_active.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20190917_data_release_19.0_legacy.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20190917_data_release_19.0_legacy.txt.gz new file mode 100644 index 000000000..215b6894b Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20190917_data_release_19.0_legacy.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20191111_data_release_20.0_active.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20191111_data_release_20.0_active.txt.gz new file mode 100644 index 000000000..8ebd33c82 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20191111_data_release_20.0_active.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20191111_data_release_20.0_legacy.txt.gz 
b/docs/Data/Release_Notes/gdc_manifest_20191111_data_release_20.0_legacy.txt.gz new file mode 100644 index 000000000..bfd02fd81 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20191111_data_release_20.0_legacy.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20191210_data_release_21.0_active.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20191210_data_release_21.0_active.txt.gz new file mode 100644 index 000000000..86b51631f Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20191210_data_release_21.0_active.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20191210_data_release_21.0_legacy.txt.gz b/docs/Data/Release_Notes/gdc_manifest_20191210_data_release_21.0_legacy.txt.gz new file mode 100644 index 000000000..d06ae3c2b Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20191210_data_release_21.0_legacy.txt.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20200116_data_release_22.0_active.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20200116_data_release_22.0_active.tsv.gz new file mode 100755 index 000000000..6cf2039b1 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20200116_data_release_22.0_active.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20200116_data_release_22.0_legacy.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20200116_data_release_22.0_legacy.tsv.gz new file mode 100755 index 000000000..f7bcc2d39 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20200116_data_release_22.0_legacy.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20200407_data_release_23.0_active.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20200407_data_release_23.0_active.tsv.gz new file mode 100644 index 000000000..e739a0dd6 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20200407_data_release_23.0_active.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20200407_data_release_23.0_legacy.tsv.gz 
b/docs/Data/Release_Notes/gdc_manifest_20200407_data_release_23.0_legacy.tsv.gz new file mode 100644 index 000000000..3ffdba4b9 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20200407_data_release_23.0_legacy.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20200507_data_release_24.0_active.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20200507_data_release_24.0_active.tsv.gz new file mode 100644 index 000000000..4d7a4c909 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20200507_data_release_24.0_active.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20200507_data_release_24.0_legacy.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20200507_data_release_24.0_legacy.tsv.gz new file mode 100644 index 000000000..2deec8793 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20200507_data_release_24.0_legacy.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20200722_data_release_25.0_active.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20200722_data_release_25.0_active.tsv.gz new file mode 100644 index 000000000..93ad70925 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20200722_data_release_25.0_active.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20200722_data_release_25.0_legacy.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20200722_data_release_25.0_legacy.tsv.gz new file mode 100644 index 000000000..e29d8bec9 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20200722_data_release_25.0_legacy.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20200908_data_release_26.0_active.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20200908_data_release_26.0_active.tsv.gz new file mode 100644 index 000000000..d9ac6f985 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20200908_data_release_26.0_active.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20200908_data_release_26.0_legacy.tsv.gz 
b/docs/Data/Release_Notes/gdc_manifest_20200908_data_release_26.0_legacy.tsv.gz new file mode 100644 index 000000000..b36fced54 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20200908_data_release_26.0_legacy.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20201029_data_release_27.0_active.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20201029_data_release_27.0_active.tsv.gz new file mode 100644 index 000000000..8c567c25c Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20201029_data_release_27.0_active.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20201029_data_release_27.0_legacy.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20201029_data_release_27.0_legacy.tsv.gz new file mode 100644 index 000000000..fc0319f1a Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20201029_data_release_27.0_legacy.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20201109_data_release_27.0_active.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20201109_data_release_27.0_active.tsv.gz new file mode 100644 index 000000000..14da11082 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20201109_data_release_27.0_active.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20201109_data_release_27.0_legacy.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20201109_data_release_27.0_legacy.tsv.gz new file mode 100644 index 000000000..b263ff93b Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20201109_data_release_27.0_legacy.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20210202_data_release_28.0_active.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20210202_data_release_28.0_active.tsv.gz new file mode 100644 index 000000000..6416ee5f9 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20210202_data_release_28.0_active.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20210202_data_release_28.0_legacy.tsv.gz 
b/docs/Data/Release_Notes/gdc_manifest_20210202_data_release_28.0_legacy.tsv.gz new file mode 100644 index 000000000..a1dc1bb00 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20210202_data_release_28.0_legacy.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20210331_data_release_29.0_active.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20210331_data_release_29.0_active.tsv.gz new file mode 100644 index 000000000..43ecc5368 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20210331_data_release_29.0_active.tsv.gz differ diff --git a/docs/Data/Release_Notes/gdc_manifest_20210331_data_release_29.0_legacy.tsv.gz b/docs/Data/Release_Notes/gdc_manifest_20210331_data_release_29.0_legacy.tsv.gz new file mode 100644 index 000000000..0dd962484 Binary files /dev/null and b/docs/Data/Release_Notes/gdc_manifest_20210331_data_release_29.0_legacy.tsv.gz differ diff --git a/docs/Data_Dictionary/Release_Notes/Data_Dictionary_Release_Notes.md b/docs/Data_Dictionary/Release_Notes/Data_Dictionary_Release_Notes.md index b3064a348..08f7faf0b 100644 --- a/docs/Data_Dictionary/Release_Notes/Data_Dictionary_Release_Notes.md +++ b/docs/Data_Dictionary/Release_Notes/Data_Dictionary_Release_Notes.md @@ -1,16 +1,905 @@ # Data Dictionary Release Notes - | Version | Date | |---|---| +| [v.2.4.0](Data_Dictionary_Release_Notes.md#v240) | June 21, 2021 | +| [v.2.3.0](Data_Dictionary_Release_Notes.md#v230) | January 5, 2021 | +| [v.2.2.0](Data_Dictionary_Release_Notes.md#v220) | July 2, 2020 | +| [v.2.1.0](Data_Dictionary_Release_Notes.md#v210) | March 10, 2020 | +| [v.2.0.0](Data_Dictionary_Release_Notes.md#v200) | January 30, 2020 | +| [v.1.18.1](Data_Dictionary_Release_Notes.md#v1181) | November 6, 2019 | +| [v.1.18](Data_Dictionary_Release_Notes.md#v118) | July 31, 2019 | +| [v.1.17](Data_Dictionary_Release_Notes.md#v117) | June 5, 2019 | +| [v.1.16](Data_Dictionary_Release_Notes.md#v116) | April 17, 2019 | +| 
[v.1.15](Data_Dictionary_Release_Notes.md#v115) | December 18, 2018 | | [v.1.14](Data_Dictionary_Release_Notes.md#v114) | September 27, 2018 | | [v1.13](Data_Dictionary_Release_Notes.md#v113) | May 21, 2018 | | [v1.12.1](Data_Dictionary_Release_Notes.md#v1121) | April 26, 2018 | | [v1.12](Data_Dictionary_Release_Notes.md#v112) | April 23, 2018 | | [v1.11](Data_Dictionary_Release_Notes.md#v111) | January 20, 2018 | -| [v10.0](Data_Dictionary_Release_Notes.md#release-with-api-v1100) | August 22, 2017 | -| [v7.1](Data_Dictionary_Release_Notes.md#release-with-api-v171) | March 16, 2017 | -| [v3.1](Data_Dictionary_Release_Notes.md#release-with-api-v131) | September 7, 2016 | +| [v1.10.0](Data_Dictionary_Release_Notes.md#release-with-api-v1100) | August 22, 2017 | +| [v1.7.1](Data_Dictionary_Release_Notes.md#release-with-api-v171) | March 16, 2017 | +| [v1.3.1](Data_Dictionary_Release_Notes.md#release-with-api-v131) | September 7, 2016 | + +## v2.4.0 + +* __GDC Product__: GDC Data Dictionary +* __Release Date__: June 21, 2021 + +### New Features and Changes + +* Added min-max limitations on property values in `demographic`, `portion`, `aliquot`, `family_history`, `slide`, `follow_up`, `read_group`, `sample`, `analyte`, `exposure`, `diagnosis`, `treatment`, `molecular_test` +* Altered `submitted_unaligned_reads`, `submitted_aligned_reads`, `annotated_somatic_mutation`, `simple_somatic_mutation`, `masked_somatic_mutation`, `submitted_genomic_profile`, `aligned_reads`, `aggregated_somatic_mutation`, `simple_germline_variation` Entities + * Changes made to `experimental_strategy` + * Removed permissible value: `Low Pass WGS` +* Altered `follow_up` Entity + * New property: `eye_color` + * New property: `history_of_tumor` + * New property: `history_of_tumor_type` + * New property: `undescended_testis_corrected` + * New property: `undescended_testis_corrected_age` + * New property: `undescended_testis_corrected_laterality` + * New property: `undescended_testis_corrected_method`
+ * New property: `undescended_testis_history` + * New property: `undescended_testis_history_laterality` + * Changes made to `comorbidity` + * New permissible value: `Dermatomyosis` + * New permissible value: `Herpes Zoster` + * New permissible value: `Varicella Zoster Virus` + * Changes made to `risk_factor` + * New permissible value: `Dermatomyosis` + * New permissible value: `Herpes Zoster` + * New permissible value: `Varicella Zoster Virus` +* Altered `read_group` Entity + * Changes made to `instrument_model` + * New permissible value: `Illumina NovaSeq 6000` + * Changes made to `single_cell_library` + * New permissible value: `Chromium scATAC v1 Library` + * Changes made to `target_capture_kit` + * New permissible value: `Twist Human Comprehensive Exome` +* Altered `somatic_aggregation_workflow` Entity + * Added link: `simple_somatic_mutations` + * Changes made to `workflow_type` + * New permissible value: `CaVEMan Variant Aggregation and Masking` +* Altered `sample` Entity + * Changes made to `sample_type` + * New permissible value: `Next Generation Cancer Model Expanded Under Non-conforming Conditions` + * Changes made to `sample_type_id` + * New permissible value: `87` +* Altered `germline_mutation_calling_workflow` Entity + * New link: `submitted_genotyping_arrays` + * Changes made to `workflow_type` + * New permissible value: `Birdseed` +* Altered `slide_image` Entity + * Changes made to `data_format` + * New permissible value: `JPEG 2000` +* Altered `exposure` Entity + * New property: `exposure_duration_years` + * New property: `parent_with_radiation_exposure` + * Changes made to `exposure_type` + * New permissible value: `Radiation` +* Altered `simple_germline_variation` Entity + * New property: `platform` + * Changes made to `data_format` + * New permissible value: `TSV` +* Altered `pathology_detail` Entity + * New property: `consistent_pathology_review` + * New property: `residual_tumor` + * New property: `size_extraocular_nodule` + * New property: 
`tumor_thickness` +* Altered `diagnosis` Entity + * New property: `adrenal_hormone` + * New property: `primary_disease` + * Changes made to `ajcc_pathologic_stage` + * New permissible value: `Stage IIIA1` + * New permissible value: `Stage IIIA2` + * Changes made to `metastasis_at_diagnosis_site` + * New permissible value: `Bladder` + * New permissible value: `Bronchus` + * New permissible value: `Head, Face or Neck, NOS` + * New permissible value: `Lymph Node, Regional` + * New permissible value: `Lymph Node, Subcarinal` + * Changes made to `method_of_diagnosis` + * New permissible value: `Exoresection` + * Changes made to `morphology` + * New permissible value: `8246/6` + * New permissible value: `8380/6` + * New permissible value: `8461/6` + * New permissible value: `8522/6` + * Changes made to `sites_of_involvement` + * New permissible value: `Mesothelium` +* Altered `treatment` Entity + * New property: `route_of_administration` + * Changes made to `treatment_arm` + * New permissible value: `A081801` + * Changes made to `therapeutic_agents` + * New permissible value: `Interferon Alfa-2B` + * New permissible value: `Levoleucovorin Calcium` + * New permissible value: `Mistletoe Extract` +* Altered `somatic_mutation_calling_workflow` Entity + * Changes made to `workflow_type` + * New permissible value: `Strelka2 RNA` +* Altered `molecular_test` Entity + * New property: `days_to_test` + * New link: `diagnoses` + * Changes made to `antigen` + * New permissible value: `FMC-7` + * New permissible value: `Kappa, Surface` + * New permissible value: `Lambda, Surface` + * Changes made to `gene_symbol` + * New permissible value: `AQP1` + * New permissible value: `CALB2` + * New permissible value: `DNTT` + * New permissible value: `EPCAM` + * New permissible value: `GCET1` + * New permissible value: `PDPN` + * New permissible value: `PTGS2` + * Changes made to `laboratory_test` + * New permissible value: `BG8` + * New permissible value: `Circulating Endothelial Cells` + * 
New permissible value: `Cytokeratin 5` + * New permissible value: `Cytokeratin 6` + * New permissible value: `Dopamine-Secreting` + * New permissible value: `Epinephrine-Secreting` + * New permissible value: `Metanephrine-Secreting` + * New permissible value: `Methoxytyramine-Secreting` + * New permissible value: `Microsatellite Instability` + * New permissible value: `Norepinephrine-Secreting` + * New permissible value: `Normetanephrine-Secreting` + * New permissible value: `Serum Mesothelin` + * New permissible value: `TAG-72` + +### Known Issues and Workarounds + +* The `mitotic_count` field in `diagnosis` is erroneously set as "deprecated" and does not appear in the dictionary viewer. This field can be uploaded successfully without issue and will appear in the dictionary viewer at a later release. + + +## v2.3.0 + +* __GDC Product__: GDC Data Dictionary +* __Release Date__: January 5, 2021 + +### New Features and Changes +* Altered `submitted_unaligned_reads` Entity + * Changes made to `experimental_strategy` + * New permissible value: `HiChIP` + * New permissible value: `m6A RNA Methylation` + * New permissible value: `scATAC-Seq` + * Changes made to `read_pair_number` + * New permissible value: `R3` +* Altered `submitted_aligned_reads` Entity + * Changes made to `experimental_strategy` + * New permissible value: `HiChIP` + * New permissible value: `m6A RNA Methylation` + * New permissible value: `scATAC-Seq` +* Altered `follow_up` Entity + * Changes made to `comorbidity` + * New permissible value: `Abnormal Glucose Level` + * New permissible value: `Chronic Fatigue Syndrome` + * New permissible value: `Clonal Hematopoiesis` + * New permissible value: `Fibromyalgia` + * New permissible value: `Gastritis` + * Changes made to `risk_factor` + * New permissible value: `Abnormal Glucose Level` + * New permissible value: `Chronic Kidney Disease` + * New permissible value: `Escherichia coli` + * New permissible value: `Gastritis` + * New permissible value: `Skin 
Rash` +* New Entity: `masked_methylation_array` +* Altered `read_group` Entity + * New property: `chipseq_antibody` + * New property: `fragmentation_enzyme` + * Removed property: `RIN` + * Changes made to `library_strategy` + * New permissible value: `HiChIP` + * New permissible value: `m6A RNA Methylation` + * New permissible value: `scATAC-Seq` +* Altered `sample` Entity + * New property: `sample_ordinal` + * Changes made to `composition` + * New permissible value: `Mixed Adherent Suspension` +* Altered `analyte` Entity + * New property: `experimental_protocol_type` + * New property: `rna_integrity_number` +* Altered `pathology_detail` Entity + * New property: `additional_pathology_findings` + * New property: `necrosis_percent` + * New property: `necrosis_present` + * New property: `rhabdoid_percent` + * New property: `rhabdoid_present` + * New property: `sarcomatoid_percent` + * New property: `sarcomatoid_present` + * Changes made to `dysplasia_degree` + * New permissible value: `Mild` + * New permissible value: `Moderate` + * New permissible value: `Severe` + * Changes made to `dysplasia_type` + * New permissible value: `Epithelial` + * New permissible value: `Keratinizing` + * New permissible value: `Nonkeratinizing` + * Changes made to `lymph_node_involvement` +* Altered `diagnosis` Entity + * New property: `ann_arbor_b_symptoms_described` + * Changes made to `ajcc_clinical_stage` + * New permissible value: `Stage IA3` + * Changes made to `ajcc_pathologic_m` + * New permissible value: `M1d` + * Changes made to `metastasis_at_diagnosis_site` + * New permissible value: `Gastrointestinal Tract` + * New permissible value: `Heart` + * New permissible value: `Neck` + * New permissible value: `Retroperitoneum` + * New permissible value: `Urethra` + * New permissible value: `Uterine Adnexa` + * New permissible value: `Vertebral Canal` + * New permissible value: `Vulva, NOS` + * Changes made to `morphology` + * New permissible value: `8249/6` + * New permissible 
value: `8800/6` + * Changes made to `supratentorial_localization` + * New permissible value: `Frontal lobe` + * New permissible value: `Occipital lobe` + * New permissible value: `Parietal lobe` + * New permissible value: `Temporal lobe` +* Altered `treatment` Entity + * Changes made to `treatment_dose_units` + * New permissible value: `mg` + * Changes made to `treatment_type` + * New permissible value: `Radiation, Hypofractionated` + * New permissible value: `Radiation, Mixed Photon Beam` + * New permissible value: `Radiation, Photon Beam` + * Changes made to `therapeutic_agents` + * Updated enum list. +* Altered `case` Entity + * Updated `disease_type` and `primary_site` enum values. +* Altered `rna_expression_workflow` Entity + * Changes made to `workflow_type` + * New permissible value: `STAR - Smart-Seq2 Raw Counts` + * New permissible value: `STAR - Smart-Seq2 Filtered Counts` +* Altered `molecular_test` Entity + * New link: `slides` + * Changes made to `antigen` + * New permissible value: `Ki67` + * Changes made to `gene_symbol` + * New permissible value: `CHGA` + * New permissible value: `SYP` + * Changes made to `molecular_consequence` + * New permissible value: `Exon Variant` + * Changes made to `second_gene_symbol` + * New permissible value: `CHGA` + * New permissible value: `SYP` +* Altered `aligned_reads` Entity + * Changes made to `experimental_strategy` + * New permissible value: `HiChIP` + * New permissible value: `scATAC-Seq` + +### Bugs Fixed Since Last Release +* None + +## v2.2.0 + +* __GDC Product__: GDC Data Dictionary +* __Release Date__: July 2, 2020 + +### New Features and Changes +* Added NCIt codes associated with enumeration values in `diagnosis` entity type +* Added `pathology_detail` entity +* Modified `treatment` entity + - Added new enumerations for `therapeutic_agents` property + * `Itraconazole` + * `Tipiracil` + * `Tipiracil Hydrochloride` + * `Zirconium Zr 89 Panitumumab` + * `Estradiol mustard` + * `Progestational IUD` + * `PD-1 
Inhibitor` + * `IGF-1R Inhibitor` + * `CDK4/6 Inhibitor` + * `ALK Inhibitor` +* Modified `annotation` entity + - Added `many_to_many` link to `copy_number_estimate` entity +* Modified `diagnosis` entity + - Added new properties: + * `eln_risk_classification` + * `satellite_nodule_present` + * `who_cns_grade` + * `who_nte_grade` + * `sites_of_involvement` + - Added new permissible values to `ajcc_pathologic_stage` property + * `Stage IA3` + - Added new permissible values to `classification_of_tumor` property + * `Progression` + - Added new permissible values to `metastasis_at_diagnosis_site` property + * `Esophagus` + - Added new permissible values to `morphology` property + - Removed properties as `required` + * `days_to_last_follow_up` + * `tumor_grade` + * `progression_or_recurrence` + * `days_to_recurrence` + * `days_to_last_known_disease_status` + * `last_known_disease_status` + - Deprecated properties + * `mitotic_count` + * `papillary_renal_cell_type` + * `micropapillary_features` + * `non_nodal_regional_disease` + * `non_nodal_tumor_deposits` +* Modified `sample` entity + - Changed description of properties + * `days_to_collection` + * `days_to_sample_procurement` + - Deprecated properties + * `is_ffpe` + * `oct_embedded` + - Added new permissible values to `sample_type` property + * `Mixed Adherent Suspension Saliva` +* Modified `follow_up` entity + - Added new properties + * `procedures_performed` + * `hormonal_contraceptive_type` + * `hormone_replacement_therapy_type` + - Added new permissible values to `comorbidity` property + - Added new permissible values to `risk_factor` property + - Added new permissible values to `evidence_of_recurrence_type` property + - Added new permissible values to `aids_risk_factors` property +* Modified `exposure` entity + - Added new properties + * `smokeless_tobacco_quit_age` + * `alcohol_type` + - Added new permissible values to `exposure_type` property + * `Wood Dust` + * `Smoke` + - Added new permissible value to 
`type_of_smoke_exposure` property + * `Tobacco smoke, NOS` +* Modified `submitted_unaligned_reads` entity + - Removed permissible value from `read_pair_number` property + * `I1` +* Modified `aliquot` entity + - Added new permissible value to `analyte_type` property + * `Nuclei RNA` +* Modified `rna_expression_workflow` entity + - Removed permissible value from `workflow_type` property + * `STAR - Smart-Seq2 Counts` + - Added new permissible values to `workflow_type` property + * `STAR - Smart-Seq2 Gene Counts` + * `STAR - Smart-Seq2 GeneFull Counts` +* Modified `molecular_test` entity + - Added new properties + * `mitotic_count` + * `mitotic_total_area` + * `biospecimen_volume` + - Added new permissible values to `gene_symbol` property + - Added new permissible values to `second_gene_symbol` property + - Added new permissible values to `antigen` property + - Added new permissible values to `laboratory_test` property + +### Bugs Fixed Since Last Release +* None + +## v2.1.0 + +* __GDC Product__: GDC Data Dictionary +* __Release Date__: March 10, 2020 + +### New Features and Changes + +* Added NCIt codes associated with enumeration values in `exposure`, `family_history`, and `demographic` entity types +* Restructured dictionary for consistency across types +* __read_group entity__ +* Added 5 new target capture kits + - `Custom Twist Broad PanCancer Panel - 396 Genes` + - `Nextera DNA Exome` + - `Custom Twist Broad Exome v1.0 - 35.0 Mb` + - `Custom SureSelect CGCI-HTMCP-CC KMT2D And Hotspot Panel - 37.0 Kb` + - `TruSeq RNA Exome` +* Add one new enum to `library_strategy` + - `scRNA-Seq` +* __structural_variation entity__ +* Added `VCF` to `data_format` property +* Added links to `structural_variation` from `somatic_mutation_index` +* Added links to `aligned_reads` from `somatic_copy_number` workflow +* __copy_number_segment entity__ +* Added new permissible values to `experimental_strategy` property + - `WGS` + - `WXS` +* Added new enum to `experimental_strategy` +
- `WGS` +* __aligned_reads entity__ +* Added 2 new properties + - `tumor_ploidy` + - `tumor_purity` +* __treatment entity__ +* Added enumeration to `therapeutic_agent` property +* __copy_number_estimate entity__ +* Added new enum to `data_format` + - `TSV` +* __demographic entity__ +* Added new property `country_of_residence_at_enrollment` +* __family_history entity__ +* Added new permissible values to `relationship_primary_diagnosis` property +* __follow_up entity__ +* Added new properties + - `body_surface_area` + - `recist_targeted_regions_number` + - `recist_targeted_regions_sum` + - `adverse_event_grade` + - `cd4_count` + - `imaging_type` + - `scan_tracer_used` + - `nadir_cd4_count` + - `hiv_viral_load` + - `aids_risk_factors` + - `haart_treatment_indicator` + - `immunosuppressive_treatment_type` + - `evidence_of_recurrence_type` + - `imaging_result` + - `hormonal_contraceptive_use` + - `pregnancy_outcome` + - `hysterectomy_type` + - `hysterectomy_margins_involved` + - `days_to_imaging` + - `cdc_hiv_risk_factors` + - `risk_factor` +* Added new enum to `days_to_follow_up` + - `null` +* __molecular_test entity__ +* Added new permissible values to various properties + - `laboratory_test` + - `second_gene_symbol` + - `molecular_consequence` + - `biospecimen_type` + - `molecular_analysis_method` + - `gene_symbol` + - `clonality` +* __sample entity__ +* Added new permissible values to various properties + - `method_of_sample_procurement` + - `biospecimen_anatomic_site` + - `tumor_descriptor` +* Added new property + - `tissue_collection_type` +* __treatment entity__ +* Added new properties + - `treatment_arm` + - `reason_treatment_ended` + - `number_of_cycles` + - `treatment_effect_indicator` + - `treatment_dose` + - `treatment_dose_units` + - `treatment_frequency` + - `chemo_concurent_to_radiation` +* Added new permissible values to various properties + - `therapeutic_agent` + - `treatment_effect` + - `treatment_intent_type` +* __slide entity__ +* Added new 
properties + - `percent_sarcomatoid_features` + - `percent_rhabdoid_features` + - `prostatic_chips_total_count` + - `prostatic_chips_positive_count` + - `prostatic_involvement_percent` + - `bone_marrow_malignant_cells` + - `percent_follicular_component` + - `tissue_microarray_coordinates` +* __diagnosis entity__ +* Added new properties + - `tumor_depth` + - `margin_distance` + - `transglottic_extension` + - `margins_involved_site` + - `gleason_grade_tertiary` + - `papillary_renal_cell_type` + - `gleason_patterns_percent` + - `greatest_tumor_dimension` + - `lymph_node_involved_site` + - `pregnant_at_diagnosis` + - `figo_staging_edition_year` +* Added new permissible values to various properties + - `classification_of_tumor` + - `figo_stage` + - `tumor_grade` + - `metastasis_at_diagnosis_site` + - `gleason_grade_group` + - `tissue_or_organ_of_origin` + - `morphology` +* __exposure entity__ +* Added new properties + - `secondhand_smoke_as_child` + - `exposure_type` + - `type_of_tobacco_used` + - `exposure_duration` + - `tobacco_use_per_day` + - `age_at_onset` + - `marijuana_use_per_week` + - `tobacco_smoking_status` +* __case entity__ +* Added new properties + - `consent_type` + - `days_to_consent` +* Added new permissible values to `index_date` +* Added `scRNA-Seq` as new enum to `experimental_strategy` in 4 entities + - `submitted_unaligned_reads` + - `submitted_aligned_reads` + - `aligned_reads` + - `gene_expression` +* Added 3 new permissible values to `workflow_type` in alignment_workflow + - `CellRanger - 10x Chromium` + - `STAR - Smart-Seq2` + - `zUMIs - Smart-Seq2` +* Added 4 new permissible values to `workflow_type` in rna_expression_workflow + - `CellRanger - 10x Raw Counts` + - `CellRanger - 10x Filtered Counts` + - `STAR - Smart-Seq2 Counts` + - `zUMIs - Smart-Seq2 Counts` +* Add one new integer property to `read_group` + - `number_expect_cells` +* Add one new enum to `read_pair_number` in `submitted_unaligned_reads` + - `I1` +* Add one new enum property 
in `read_group` node + - `Chromium 3' Gene Expression v2 Library` + - `Chromium 3' Gene Expression v3 Library` + - `Smart-Seq2` +* Created new node `expression_analysis_workflow` +* Created new node `secondary_expression_analysis` +* Add one new enum to `data_format` in `gene_expression` + - `MEX` + +### Bugs Fixed Since Last Release +* None + +## v2.0.0 + +* __GDC Product__: GDC Data Dictionary +* __Release Date__: January 30, 2020 + +### New Features and Changes +* The API that includes the GDC data dictionary now uses Python 3. + +### Bugs Fixed Since Last Release + +* None + + +## v.1.18.1 + +* __GDC Product__: GDC Data Dictionary +* __Release Date__: November 6, 2019 + +### New Features and Changes +* Added new permissible value `deleted` to property `file_state` + +### Bugs Fixed Since Last Release + +* None + + +## v.1.18 + +* __GDC Product__: GDC Data Dictionary +* __Release Date__: July 31, 2019 + +### New Features and Changes + +* Added new entities + - `protein_expression_quantification` + - `submitted_genotyping_array` + - `somatic_copy_number_workflow` +* Add links in data model to `somatic_copy_number_workflow` from `copy_number_segment` +* Add links in data model to `somatic_copy_number_workflow` from `copy_number_estimate` +* Add links in data model from `annotated_somatic_mutation` to `genomic_profile_harmonization_workflow` +* Modified `copy_number_segment` entity + - Add new data type + - `Allele-specific Copy Number Segment` +* Update data dictionary to support new annotation classifications +* Fixed typo in `sample` entity schema +* Unrequired project link for aliquot level MAFs +* Added NCIt codes for gender values +* Changed description of `masked_somatic_mutation` and `aggregated_somatic_mutation` nodes to be the same +* Modified `archive` entity + - Set `downloadable` property to `true` +* Modified `publication` entity + - Set `downloadable` property to `true` +* Modified `filtered_copy_number_segment` entity + - Set `downloadable` property 
to `true` +* Modified `aligned_reads` entity + - Added `MSI properties` as new property +* Modified `read_group` entity + - Added `Custom SureSelect Human All Exon v1.1 Plus 3 Boosters` as new permissible value for `target_capture_kit` field + - Added `SeqCap EZ Human Exome v3.0` as new permissible value for `target_capture_kit` field + - Added new permissible values for `instrument_model` + - `Unknown` + - `Not Reported` + - `Ion Torrent S5` +* Modified `biospecimen_supplement` entity + - Added `CDC JSON` as new permissible data format +* Modified `demographic` entity + - Added new property + - `age_is_obfuscated` +* Modified `demographic` entity + - Added new properties + - `cause_of_death_source` + - `occupation_duration_years` +* Modified `diagnosis` entity + - Added new properties + - `non_nodal_regional_disease` + - `non_nodal_tumor_deposits` + - `ovarian_specimen_status` + - `ovarian_surface_involvement` + - `percent_tumor_invasion` + - `peritoneal_fluid_cytological_status` + - `breslow_thickness` + - `international_prognostic_index` + - `largest_extrapelvic_peritoneal_focus` + - `mitotic_count` + - Removed permissible values from `primary_diagnosis` + - Removed permissible values from `site_of_resection_or_biopsy` + - Removed `tumor_stage` as required property + - Added new permissible values for + - `ajcc_pathologic_stage` + - `metastasis_at_diagnosis_site` + - Migrated values not part of permissible values to `Not Reported` for following properties + - `primary_diagnosis` + - `site_of_resection_or_biopsy` + - `tumor_grade` + - `tissue_or_organ_of_origin` + - Removed permissible values from + - `tissue_or_organ_of_origin` + - `morphology` +- Modified `structural_variant_calling_workflow` entity + - Added new `workflow_type` +- Modified `structural_variant` entity + - Added `BEDPE` to `data_format` as permissible value +- Modified `molecular_test` entity + - Added new permissible values for `gene_symbol` + - Added new permissible values for `test_result` + 
- Added new permissible values for `antigen` + - Added new property `pathogenicity` +- Modified `follow_up` entity + - Added new permissible value for `risk_factor` +- Modified `sample` entity + - Added new permissible values for `method_of_sample_procurement` +- Modified `genomic_profile_harmonization_workflow` entity + - Added new `workflow_type` permissible values +- Modified `somatic_mutation_calling_workflow` entity + - Added new `workflow_type` permissible values + - Modified `somatic_annotation_workflow` entity + - Added new `workflow_type` permissible values +- Modified `case` entity + - Added new permissible value to `disease_type` + - `Not Applicable` +- Modified `copy_number_estimate` entity + - Added new permissible value to `experimental_strategy` + - `WXS` +- Modified `family_history` entity + - Added new property + - `relatives_with_cancer_history_count` +- Modified `sample` entity + - Added new permissible values + - `sample_type` + - `sample_type_id` + +### Bugs Fixed Since Last Release + +* None + +## v.1.17 + +* __GDC Product__: GDC Data Dictionary +* __Release Date__: June 5, 2019 + + +### New Features and Changes + +* Deleted vital status, days_to_birth, and days_to_death from Diagnosis node. Data submission and data requests should all be directed to the corresponding properties on the Demographic Node. 
+ +### Bugs Fixed Since Last Release + +* None + + + +## v.1.16 + +* __GDC Product__: GDC Data Dictionary +* __Release Date__: April 17, 2019 + + +### New Features and Changes + +* Updates to the Data Dictionary Search Tool +* Added new bioinformatics workflow for methylation arrays (Sesame) +* Changed `somatic_mutation_calling_workflow` link from `one_to_many` to `many_to_many` +* Modified `read_group` entity + - Added `SeqCap EZ Human Exome v2.0` as new permissible value for `target_capture_kit` field + - Added `Custom SureSelect Human All Exon v1.1 Plus 3 Boosters` as new permissible value `target_capture_kit` field + - Added `Custom SureSelect CGCI-HTMCP-CC Panel - 19.7 Mb` as new permissible value `target_capture_kit` field +* Modified `case` entity + - Updated the description for the `primary_site` field + - Added new permissible value to `lost_to_followup` field +* Modified `molecular_test` entity + - Removed properties with genomic coordinates + - Add new permissible values to `test_result` + - Added `second_exon` as new property +* Modified `aligned_reads_index` entity + - Made these files not submittable +* Modified `somatic_mutation_index` entity + - Made these files not submittable +* Modified `sample` entity + - Added new permissible values for `sample_type` + - `Blood Derived Cancer - Bone Marrow` + - `Blood Derived Cancer - Peripheral Blood` + - Added new permissible values to `sample_type_id` +* Modified `diagnosis` entity + - Added 6 new staging and grading properties for TCGA + - `igcccg_stage` + - `masaoka_stage` + - `gleason_grade_group` + - `primary_gleason_grade` + - `secondary_gleason_grade` + - `weiss_assessment_score` + - Made `vital_status` an optional field + - Removed deprecated properties + - `days_to_death` + - `days_to_birth` + - `cause_of_death` + - `hiv_positive` + - `days_to_hiv_diagnosis` + - `ldh_normal_range_upper` + - `new_event_type` + - `hpv_status` + - `hpv_positive_type` + - `colon_polyps_history` + - 
`progression_free_survival` + - `progression_free_survival_event` + - `overall_survival` + - `days_to_treatment` + - `ldh_level_at_diagnosis` + - Added `vital_status` property to deprecated list +* Modified `somatic_aggregation_workflow` entity + - Added `Aliquot Ensemble Somatic Variant Merging and Masking` as new permissible value to `workflow_type` +* Modified `slide` entity + - Updated the description for the `magnification` field +* Modified `aliquot` entity + - Updated the description for several fields + - `selected_normal_low_pass_wgs` + - `selected_normal_targeted_sequencing` + - `selected_normal_wgs` + - `selected_normal_wxs` +* Modified `follow_up` entity + - Added new field `days_to_progression_free` +* Modified `demographic` entity + - Made `vital_status` a required field +* Modified `exposure` entity + - Added new properties + - `environmental_tobacco_smoke_exposure` + - `respirable_crystalline_silica_exposure` + - `coal_dust_exposure` + - `type_of_smoke_exposure` + - `type_of_tobacco_used` + - `smoking_frequency` + - `time_between_waking_and_first_smoke` + - Removed `cigarettes_per_day` property from deprecated list +* Modified `annotation` entity + - Modified permissible values to `status` + - Approved + - Rescinded + +### Bugs Fixed Since Last Release + +* None + + +## v.1.15 + +* __GDC Product__: GDC Data Dictionary +* __Release Date__: December 18, 2018 + + +### New Features and Changes + +* Removed `Raw Sequencing Data` and `Sequencing Data` as permissible values from `submitted_aligned_reads`, `submitted_unaligned_reads`, and `aligned_reads` +* Deleted `aligned_reads_metrics` entity +* Created new `raw_methylation_array` entity +* Add regex validation to property `md5sum` for following entities: + - `slide_image` + - `analysis_metadata` + - `clinical_supplement` + - `experiment_metadata` + - `pathology_report` + - `run_metadata` + - `biospecimen_supplement` + - `submitted_aligned_reads` + - `submitted_genomic_profile` + - 
`submitted_methylation_beta_value` + - `submitted_tangent_copy_number` + - `submitted_unaligned_reads` +* Modified `molecular_test` entity + - Migrated data from `blood_test` to `laboratory_test` and `biospecimen_type` for all entities + - Added new property `intron` + - Deleted `blood_test` entity + - Added new permissible values for `gene_symbol` + - Added new permissible values for `antigen` + - Added new permissible values for `molecular_analysis_method` + - Added new permissible values for `variant_type` + - Added new permissible values for `test_result` + - Added new permissible values for `molecular_consequence` + - Added regex validation to property `transcript` + - Added regex validation to property `locus` + - Changed data type of `exon` property to be `string` with regex validation +* Modified `diagnosis` entity + - Added new fields + - `tumor_focality` + - `tumor_regression_grade` + - `lymph_nodes_tested` + - Added new permissible value for `primary_diagnosis` field + - Added min and max values to time-based properties + - Added new permissible value for `morphology` field +* Modified `follow_up` entity + - Added new permissible values for `ecog_performance_status` + - Added new permissible values for `comorbidity` + - Added new permissible values for `disease_response` + - Added new permissible values for `risk_factor` + - Added min and max values to time-based properties + - Added new property: + - `hepatitis_sustained_virological_response` + - Updated CDE, CDE version, description and URL for `comorbidity` + - Added a CDE for `days_to_comorbidity` + - Removed `reflux_treatment` property + - Add a new property: + - `risk_factor_treatment` +* Modified `aligned_reads` entity + - Added new contamination properties + - `contamination` + - `contamination_error` +* Modified `read_group` entity + - Added new permissible values for `target_capture_kit` + - Updated description for property `instrument_model` + - Added new permissible values for 
`target_capture_kit` + - Added new permissible values for `library_strategy` + - Added regex validation to property `adapter_sequence` + - Added regex validation to property `multiplex_barcode` + - Allow users to enter null for property `read_length` + - Allow users to enter null for property `is_paired_end` +* Modified `family_history` entity + - Added new permissible values for `relationship_primary_diagnosis` + - Added min and max values to properties +* Modified `case` entity + - Add min and max values to properties + - Delete permissible value from `primary_site` + - `Unknown Primary Site` +* Modified `analyte` entity + - Corrected the description for fields `analyte_volume` to include microliters as unit +* Modified `exposure` entity + - Added new properties + - `asbestos_exposure` + - `radon_exposure` +* Modified `sample` entity + - Added new permissible values to `method_of_sample_procurement` + - Added regex validation to `pathology_report_uuid` + - Change type from string to number for properties: + - `intermediate_dimension` + - `longest_dimension` + - `shortest_dimension` + - `time_between_clamping_and_freezing` + - `time_between_excision_and_freezing` + - Add min and max to properties on the sample node + - Populated sample nodes that have no value for `tissue_type` to "Not Reported" +* Modified `treatment` entity + - Added a new property + - `prior_treatment_effect` + - Add min and max values to properties +* Modified `aliquot` entity + - Corrected the description for fields `analyte_volume` to include microliters as unit +* Modified `demographic` entity + - Added min and max to properties + + +### Bugs Fixed Since Last Release + +* Fixed value of `pathology_report_uuid` on sample entity `7b29b034-86e4-4266-8657-036e96e04430` to satisfy regex requirements +* Migrated a few unsupported values for sample.pathology_report_uuid, read_group.adapter_sequence, read_group.multiplex_barcode ## v.1.14 @@ -32,10 +921,10 @@ - Added new permissible values for 
`gene_symbol` fields - `Not Applicable` - Deleted field `blood_test` - - Add new permissible values for `antigen` field - - Add new permissible values to `molecular_analysis_method` - - Add new permissible values for `variant_type` field - - Add new permissible values to `test_result` + - Added new permissible values for `antigen` field + - Added new permissible values to `molecular_analysis_method` + - Added new permissible values for `variant_type` field + - Added new permissible values to `test_result` * Modified `case` entity - Modified permissible values on `index_date` - Added new value `Initial Genomic Sequencing` @@ -215,7 +1104,7 @@ ### Bugs Fixed Since Last Release -* N/A +* None ## v.1.13 diff --git a/docs/Data_Dictionary/gdcmvs.md b/docs/Data_Dictionary/gdcmvs.md new file mode 100644 index 000000000..b6dadba7e --- /dev/null +++ b/docs/Data_Dictionary/gdcmvs.md @@ -0,0 +1,70 @@ + + + +

GDC Metadata Validation Services

+ + + + +
+ +
+
+
GDC Dictionary Version
+
+ + + + diff --git a/docs/Data_Dictionary/images/CDE_Data_Element_Details.png b/docs/Data_Dictionary/images/CDE_Data_Element_Details.png new file mode 100644 index 000000000..c562a45ad Binary files /dev/null and b/docs/Data_Dictionary/images/CDE_Data_Element_Details.png differ diff --git a/docs/Data_Dictionary/images/CDE_Details.png b/docs/Data_Dictionary/images/CDE_Details.png new file mode 100644 index 000000000..7cdf1a11a Binary files /dev/null and b/docs/Data_Dictionary/images/CDE_Details.png differ diff --git a/docs/Data_Dictionary/images/GDC_DD_Links.png b/docs/Data_Dictionary/images/GDC_DD_Links.png new file mode 100644 index 000000000..042a79b80 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_DD_Links.png differ diff --git a/docs/Data_Dictionary/images/GDC_DD_Properties_Boolean.png b/docs/Data_Dictionary/images/GDC_DD_Properties_Boolean.png new file mode 100644 index 000000000..96d821475 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_DD_Properties_Boolean.png differ diff --git a/docs/Data_Dictionary/images/GDC_DD_Properties_Enumeration.png b/docs/Data_Dictionary/images/GDC_DD_Properties_Enumeration.png new file mode 100644 index 000000000..f222712cb Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_DD_Properties_Enumeration.png differ diff --git a/docs/Data_Dictionary/images/GDC_DD_Properties_Integer.png b/docs/Data_Dictionary/images/GDC_DD_Properties_Integer.png new file mode 100644 index 000000000..dd8305998 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_DD_Properties_Integer.png differ diff --git a/docs/Data_Dictionary/images/GDC_DD_Properties_Number.png b/docs/Data_Dictionary/images/GDC_DD_Properties_Number.png new file mode 100644 index 000000000..2b7980ffd Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_DD_Properties_Number.png differ diff --git a/docs/Data_Dictionary/images/GDC_DD_Properties_String.png b/docs/Data_Dictionary/images/GDC_DD_Properties_String.png new file mode 100644 index 
000000000..614ee99b5 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_DD_Properties_String.png differ diff --git a/docs/Data_Dictionary/images/GDC_DD_Title_and_Summary.png b/docs/Data_Dictionary/images/GDC_DD_Title_and_Summary.png new file mode 100644 index 000000000..3f001b0b7 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_DD_Title_and_Summary.png differ diff --git a/docs/Data_Dictionary/images/GDC_search.png b/docs/Data_Dictionary/images/GDC_search.png new file mode 100644 index 000000000..178f80808 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search.png differ diff --git a/docs/Data_Dictionary/images/GDC_search_exact_match_sample_type.png b/docs/Data_Dictionary/images/GDC_search_exact_match_sample_type.png new file mode 100644 index 000000000..e5b14b97c Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search_exact_match_sample_type.png differ diff --git a/docs/Data_Dictionary/images/GDC_search_general_age.png b/docs/Data_Dictionary/images/GDC_search_general_age.png new file mode 100644 index 000000000..8ae1d56f8 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search_general_age.png differ diff --git a/docs/Data_Dictionary/images/GDC_search_general_living.png b/docs/Data_Dictionary/images/GDC_search_general_living.png new file mode 100644 index 000000000..f542e6d84 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search_general_living.png differ diff --git a/docs/Data_Dictionary/images/GDC_search_general_sample_type.png b/docs/Data_Dictionary/images/GDC_search_general_sample_type.png new file mode 100644 index 000000000..63e0cac7f Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search_general_sample_type.png differ diff --git a/docs/Data_Dictionary/images/GDC_search_general_squamous_dictionary.png b/docs/Data_Dictionary/images/GDC_search_general_squamous_dictionary.png new file mode 100644 index 000000000..318641170 Binary files /dev/null and 
b/docs/Data_Dictionary/images/GDC_search_general_squamous_dictionary.png differ diff --git a/docs/Data_Dictionary/images/GDC_search_general_squamous_matched.png b/docs/Data_Dictionary/images/GDC_search_general_squamous_matched.png new file mode 100644 index 000000000..c9147cc12 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search_general_squamous_matched.png differ diff --git a/docs/Data_Dictionary/images/GDC_search_general_squamous_see_all_values.png b/docs/Data_Dictionary/images/GDC_search_general_squamous_see_all_values.png new file mode 100644 index 000000000..5db48b170 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search_general_squamous_see_all_values.png differ diff --git a/docs/Data_Dictionary/images/GDC_search_general_squamous_values.png b/docs/Data_Dictionary/images/GDC_search_general_squamous_values.png new file mode 100644 index 000000000..d75bacf03 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search_general_squamous_values.png differ diff --git a/docs/Data_Dictionary/images/GDC_search_general_squamous_values_cadsr_values.png b/docs/Data_Dictionary/images/GDC_search_general_squamous_values_cadsr_values.png new file mode 100644 index 000000000..b2a914234 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search_general_squamous_values_cadsr_values.png differ diff --git a/docs/Data_Dictionary/images/GDC_search_general_squamous_values_compare_list.png b/docs/Data_Dictionary/images/GDC_search_general_squamous_values_compare_list.png new file mode 100644 index 000000000..8a53d4215 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search_general_squamous_values_compare_list.png differ diff --git a/docs/Data_Dictionary/images/GDC_search_general_squamous_values_terms.png b/docs/Data_Dictionary/images/GDC_search_general_squamous_values_terms.png new file mode 100644 index 000000000..51644af3c Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search_general_squamous_values_terms.png 
differ diff --git a/docs/Data_Dictionary/images/GDC_search_property_description_age.png b/docs/Data_Dictionary/images/GDC_search_property_description_age.png new file mode 100644 index 000000000..1b86df3ac Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search_property_description_age.png differ diff --git a/docs/Data_Dictionary/images/GDC_search_synonym_living.png b/docs/Data_Dictionary/images/GDC_search_synonym_living.png new file mode 100644 index 000000000..bf7ea75c8 Binary files /dev/null and b/docs/Data_Dictionary/images/GDC_search_synonym_living.png differ diff --git a/docs/Data_Dictionary/index.md b/docs/Data_Dictionary/index.md index 38a60383d..0ce743b46 100644 --- a/docs/Data_Dictionary/index.md +++ b/docs/Data_Dictionary/index.md @@ -2,17 +2,150 @@ ## Introduction -The GDC Data Dictionary defines components of the [GDC Data Model](../Data/Data_Model/GDC_Data_Model.md) and relationships between them. +The GDC Data Dictionary is a resource that describes the clinical, biospecimen, administrative, and genomic metadata that can be used in parallel with the genomic data generated by the GDC. The dictionary defines the structure of a database, the [data model](../Data/Data_Model/GDC_Data_Model.md), and the rules the data need to follow. In addition, the dictionary includes information about the relationships between entities within the data model. + +### Data Dictionary Components: + +The GDC Data Dictionary consists of the following components: + +* Comprehensive list of nodes, which represent entities in the data model and help to group metadata into categories. +* Comprehensive list of properties in the database and their schemas, which describe specific data elements that can be submitted to the GDC. +* Comprehensive list of unique keys and links between properties. +* Constraints and requirements defined on nodes and properties, including acceptable values and data types. 
+ +### Standards and Conventions + +All properties and values in the GDC Data Dictionary include references to external standards defined and maintained by the [NCI Thesaurus](https://ncit.nci.nih.gov/ncitbrowser/) (NCIt) and the [Cancer Data Standards Registry and Repository](https://wiki.nci.nih.gov/display/caDSR/caDSR+Wiki) (caDSR). Both of these standards are operated by groups at [NCI's Center for Bioinformatics and Information Technology](https://cbiit.cancer.gov/) (CBIIT). + +Each property is assigned a [Common Data Element](https://cdebrowser.nci.nih.gov/cdebrowserClient/cdeBrowser.html#/search) (CDE) created by the caDSR. The CDE provides detailed information about the property including links to the NCIt through assigned concept codes. NCIt concepts are also assigned at the permissible value level for enumerated properties. The images below are an example of a caDSR CDE and its related property-level NCIt concepts. + +[![CDE Data Elements Details](images/CDE_Data_Element_Details.png)](images/CDE_Data_Element_Details.png "Click to see the full image.") +[![CDE Details](images/CDE_Details.png)](images/CDE_Details.png "Click to see the full image.") + +In addition to the caDSR and NCIt references, many of the properties are defined by additional standards including, but not limited to the following: [International Classification of Diseases](https://www.who.int/health-topics/international-classification-of-diseases) ([ICD-O-3](http://codes.iarc.fr/) and [ICD-10](https://www.cdc.gov/nchs/icd/icd10cm.htm)), [American Joint Committee on Cancer](https://cancerstaging.org/Pages/default.aspx) staging classifications, [Children's Oncology Group](https://www.childrensoncologygroup.org/) (COG) categorizations, and the [International Federation of Gynecology and Obstetrics](https://www.figo.org/) (FIGO) classifications. 
When these additional standards are used to describe a property, this is referenced in the description and the list of allowable values will reflect the criteria defined by the standard. + +Using external standards benefits both data contributors and data consumers at the GDC. For example, the curated lists of synonyms provided by NCIt allow for easy mapping of other study-specific clinical data standards to the GDC data dictionary. The available synonyms can be leveraged using the [GDC Data Dictionary Search](gdcmvs/). ## Data Dictionary Viewer The [GDC Data Dictionary Viewer](viewer.md) is a user-friendly interface for accessing the dictionary. It includes the following functionality: -* _Dictionary contents:_ Display of entities defined in the dictionary, including their descriptions, properties, and links. -* _Links to semantic resources:_ Links to semantic data resources that define [Common Data Elements (CDEs)](http://cde.nih.gov) used in the dictionary -* _Submission templates:_ Generation JSON and TSV templates for use in GDC data submission. +* __Dictionary contents:__ Display of entities defined in the dictionary, including their descriptions, values or types, and links. +* __Links to semantic resources:__ Links to semantic data resources that define [Common Data Elements (CDEs)](https://cde.nlm.nih.gov/home) used in the dictionary. +* __Submission templates:__ JSON and TSV template generation for use in GDC data submission. + +### Components of the Data Dictionary Viewer + +The sections below provide an example of the information available for each specific node in the GDC Data Dictionary. + +#### Summary + +[![Title and Summary](images/GDC_DD_Title_and_Summary.png)](images/GDC_DD_Title_and_Summary.png "Click to see the full image.") + +* __Type:__ The name of the node. +* __Category:__ The type of metadata; some examples are Clinical, Biospecimen, Analysis and Submittable Data Files. 
+* __Description:__ This section contains a written explanation for the type of data that would be found in this node. +* __Unique Keys:__ The properties or list of properties that can be used to identify this node, and only this node, within the commons. + +This section also contains a "Download Template" link with a drop-down menu containing the two template file types: TSV and JSON. These files will contain all properties that are found in the node, but not all [properties are required](#properties) to upload the node. + +#### Links + +[![Links](images/GDC_DD_Links.png)](images/GDC_DD_Links.png "Click to see the full image.") + +* __Links to Entity:__ Other nodes that can be connected to the focal node. +* __Link Name:__ A simplified stand in for the node link structure (requirement, target type, multiplicity, label). Its declaration categorizes the relationship between nodes. +* __Relationship:__ The written description for the association between the focal node and the other connected node. +* __Required:__ Displays whether the link to the node is required for the existence of the focal node. To link the focal node to a parent node, use the __.submitter_id__ with the value of that field set to the appropriate `submitter_id` in the parent node. For more information on creating links between nodes, please see the [Data Submission Walkthrough](Data_Submission_Portal/Users_Guide/Data_Submission_Walkthrough). 
+ +#### Properties + +[![Properties Enumeration](images/GDC_DD_Properties_Enumeration.png)](images/GDC_DD_Properties_Enumeration.png "Click to see the full image.") +[![Properties Integer](images/GDC_DD_Properties_Integer.png)](images/GDC_DD_Properties_Integer.png "Click to see the full image.") +[![Properties Number](images/GDC_DD_Properties_Number.png)](images/GDC_DD_Properties_Number.png "Click to see the full image.") +[![Properties String](images/GDC_DD_Properties_String.png)](images/GDC_DD_Properties_String.png "Click to see the full image.") +[![Properties Boolean](images/GDC_DD_Properties_Boolean.png)](images/GDC_DD_Properties_Boolean.png "Click to see the full image.") + +* __Property:__ The name of the property. + +* __Description:__ The written explanation for the expected type and characterization of data found in this property. + +* __Acceptable Types or Values:__ The values that can be entered into the field based on the type category. + * Enumeration: A list of predetermined strings. The user must select the exact string from the list to be a valid entry. Case does matter. Many of these properties with enumerations have numerous values. To see all of the values, click the "More Values" link at the bottom of the property row under the __Acceptable Types or Values__ column. + * Integer: A field that only accepts whole numbers. + * Number: A field that can accept any number including numbers with decimal places. + * String: A field in which alphanumeric characters and `_`, `.`, `-`, up to a length of 32,767, can be entered. Do not use other characters as it will create submission errors. Some string fields contain regex restrictions to coerce data to a specific pattern. + * Boolean: A field that only accepts `true` or `false` as acceptable values. If these values are not entered as lowercase, the dictionary will not recognize the value and an error will occur. 
+ +* __Required:__ This informs the user whether this field is necessary for the submission of the node. If information for a required field is unknown or not reported, there is often a value to reflect that missing information. + +* __CDE:__ The caDSR CDE Public ID, with the direct link to its respective Data Element Details page. + +## Search Tool + +The Search Tool enables easier query of the GDC Data Dictionary for data submitters and recommends GDC properties and values based on synonyms. Created by the NCI CBIIT EVS Team, it leverages NCI vocabulary systems caDSR and NCIt. Below are some of the features included in the Search Tool: + +* Users can complete partial or exact match searches. +* Searches can include terms that are synonymous to the GDC allowable values. +* Users can compare their list of values to the GDC allowable values. +* Dictionary paths are described so users can find the specific node where a property is located. + +### Components of the Search Tool + +The sections below provide an example of the information available for each portion of the Search Tool. + +#### Search Tool Modifiers + +The Search Tool is equipped with the following modifiers to customize searches in the GDC Data Dictionary: + +[![GDC search](images/GDC_search.png)](images/GDC_search.png "Click to see the full image.") + +* __Exact match:__ This will return matches for only the exact value entered into the search field. + + [![GDC search general sample type](images/GDC_search_general_sample_type.png)](images/GDC_search_general_sample_type.png "Click to see the full image.") + [![GDC search exact match sample type](images/GDC_search_exact_match_sample_type.png)](images/GDC_search_exact_match_sample_type.png "Click to see the full image.") + +* __Property description:__ This will return matches for the value found not only in the property, but also searches within the description of the property. 
+ + [![GDC search general age](images/GDC_search_general_age.png)](images/GDC_search_general_age.png "Click to see the full image.") + [![GDC search property description age](images/GDC_search_property_description_age.png)](images/GDC_search_property_description_age.png "Click to see the full image.") + +* __Synonyms:__ This will return matches that not only match the value entered, but also other values that NCIt considers to be synonymous with the entered value. + + [![GDC search general living](images/GDC_search_general_living.png)](images/GDC_search_general_living.png "Click to see the full image.") + [![GDC search synonym living](images/GDC_search_synonym_living.png)](images/GDC_search_synonym_living.png "Click to see the full image.") + +#### Result Fields + +The results from searches can be sorted into three different result fields: + +* __Values:__ This result section will return three columns that display matches to values that are found in the GDC Data Dictionary: + [![GDC search general squamous values](images/GDC_search_general_squamous_values.png)](images/GDC_search_general_squamous_values.png "Click to see the full image.") + * __Category / Node / Property:__ This section displays the GDC Data Dictionary hierarchy that precedes the search term. This section can also contain information such as: + * __See All Values:__ This window will display all GDC values for this property. + [![GDC search general squamous see all values](images/GDC_search_general_squamous_see_all_values.png)](images/GDC_search_general_squamous_see_all_values.png "Click to see the full image.") + * __Compare with User List:__ This window allows the user to input a list of values to check against the acceptable values for that property. 
+ [![GDC search general squamous values compare list](images/GDC_search_general_squamous_values_compare_list.png)](images/GDC_search_general_squamous_values_compare_list.png "Click to see the full image.") + * __See All Terms:__ This window will display the NCIt code assigned to the specific term and the synonymous NCIt terms associated. + [![GDC search general squamous values terms](images/GDC_search_general_squamous_values_terms.png)](images/GDC_search_general_squamous_values_terms.png "Click to see the full image.") + * __caDSR: CDE , Values , Compare with GDC:__ This group of links can send the user to the CDE property page (CDE), open a window that displays the caDSR values for that property (Values), or open a window that compares the caDSR values with GDC values (Compare with GDC). + [![GDC search general squamous values cadsr values](images/GDC_search_general_squamous_values_cadsr_values.png)](images/GDC_search_general_squamous_values_cadsr_values.png "Click to see the full image.") + * __Matched GDC Values:__ This column will display all GDC values that match the term with ICD-O-3 and NCIt values if they are available. + [![GDC search general squamous matched](images/GDC_search_general_squamous_matched.png)](images/GDC_search_general_squamous_matched.png "Click to see the full image.") + * __CDE Permissible Values:__ This column displays GDC dictionary properties that have corresponding caDSR Common Data Elements (CDEs). +* __Properties:__ This result section will return five columns that display matches to properties of the GDC Data Dictionary: + [![GDC search general age](images/GDC_search_general_age.png)](images/GDC_search_general_age.png "Click to see the full image.") + * __Category / Node:__ This column displays the Category and Node hierarchy for the search value. + * __Property:__ This column displays the name of the Property for the search value. + * __Description:__ This column displays the description for the returned property. 
+ * __GDC Property Values:__ This column displays the value type for the returned property. For more information see the Acceptable Types or Values section under [Properties](#properties). + * __caDSR CDE Reference:__ This column displays the CDE link for the returned property. +* __Dictionary:__ This result section will return two columns that display matches to values within the structure of the GDC Data Dictionary: + [![GDC search general squamous dictionary](images/GDC_search_general_squamous_dictionary.png)](images/GDC_search_general_squamous_dictionary.png "Click to see the full image.") + * __Name:__ This column displays the name of the Category, Node, or Property with a returned value total for each level. + * __Description:__ This column displays the GDC Data Dictionary description for each level. -## Entity JSON Schemas +## Data Dictionary API In technical terms, the dictionary is a set of YAML files that define JSON schemas for each entity in the dictionary. The files are available [on GitHub](https://github.com/NCI-GDC/gdcdictionary/tree/develop/gdcdictionary/schemas). 
diff --git a/docs/Data_Portal/Release_Notes/Data_Portal_Release_Notes.md b/docs/Data_Portal/Release_Notes/Data_Portal_Release_Notes.md index 6b21a1cdd..bf2b3b2eb 100644 --- a/docs/Data_Portal/Release_Notes/Data_Portal_Release_Notes.md +++ b/docs/Data_Portal/Release_Notes/Data_Portal_Release_Notes.md @@ -2,6 +2,18 @@ | Version | Date | |---|---| +| [v1.28.0](Data_Portal_Release_Notes.md#release-1280) | May 17, 2021 | +| [v1.25.1](Data_Portal_Release_Notes.md#release-1251) | August 14, 2020 | +| [v1.25.0](Data_Portal_Release_Notes.md#release-1250) | July 2, 2020 | +| [v1.24.1](Data_Portal_Release_Notes.md#release-1241) | March 10, 2020 | +| [v1.23.1](Data_Portal_Release_Notes.md#release-1231) | December 10, 2019 | +| [v1.23.0](Data_Portal_Release_Notes.md#release-1230) | November 6, 2019 | +| [v1.22.0](Data_Portal_Release_Notes.md#release-1220) | July 31, 2019 | +| [v1.21.0](Data_Portal_Release_Notes.md#release-1210) | June 5, 2019 | +| [v1.20.0](Data_Portal_Release_Notes.md#release-1200) | April 17, 2019 | +| [v1.19.0](Data_Portal_Release_Notes.md#release-1190) | February 20, 2019 | +| [v1.18.0](Data_Portal_Release_Notes.md#release-1180) | December 18, 2018 | +| [v1.17.0](Data_Portal_Release_Notes.md#release-1170) | November 7, 2018 | | [v1.16.0](Data_Portal_Release_Notes.md#release-1160) | September 27, 2018 | | [v1.15.0](Data_Portal_Release_Notes.md#release-1150) | August 23, 2018 | | [v1.14.0](Data_Portal_Release_Notes.md#release-1140) | June 13, 2018 | @@ -20,6 +32,462 @@ | [v1.0.1](Data_Portal_Release_Notes.md#release-101) | May 18, 2016 | --- +## Release 1.28.0 + +* __GDC Product__: GDC Data Portal +* __Release Date__: May 17, 2021 + +### New Features and Changes + +* New columns were added to the "molecular test" table at the bottom of the case entity page to display additional molecular test fields.
+ +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* When accessing the data portal with Chrome v.91.0.4472, users may experience some display errors. This includes the data summary in the clinical analysis and cart pages being only partially displayed. +* Pre-release Data Portal login is not supported on Internet Explorer or the last version of Edge (42). Edge 41 does login successfully. +* Custom Facet Filters + * Some definitions are missing from the property list when adding custom facet file or case filters. +* Visualizations + * Negative numbers may be displayed for the Missing value category in the Treatment node within a Clinical Analysis. This occurs with projects that have multiple treatment nodes per case. All other values should be accurate. + * SIFT and PolyPhen annotations are missing from the export JSON of the mutation table. They are present in the export TSV. + * Data Portal graphs cannot be exported as PNG images in Internet Explorer. Graphs can be exported in PNG or SVG format from Chrome or Firefox browsers . Internet Explorer does not display chart legend and title when re-opening previously downloaded SVG files, the recommendation is to open downloaded SVG files with another program. +* Repository and Cart + * The annotation count in File table of Repository and Cart does not link to the Annotations page anymore. The user can navigate to the annotations through the annotation count in Repository - Case table. +* Legacy Archive + * The footer says version 1.9, but it is actually 1.13 + * Filtering by vital_status does not function in the Legacy Archive due to updates in how this property has been indexed. A workaround is to perform the case level filtering in the GDC Data Portal and copy the filter string for use in the Legacy Archive or the legacy API. + * Downloading a token in the GDC Legacy Archive does not refresh it. 
If a user downloads a token in the GDC Data Portal and then attempts to download a token in the GDC Legacy Archive, an old token may be provided. Reloading the Legacy Archive view will allow the user to download the updated token. + * Exporting the Cart table in JSON will export the GDC Archive file table instead of exporting the files in the Cart only. +* Web Browsers + * Browsers limit the number of concurrent downloads, it is generally recommended to add files to the cart and download large number of files through the GDC Data Transfer Tool, more details can be found on [GDC Website](https://gdc.cancer.gov/about-gdc/gdc-faqs). + * The GDC Portals are not compatible with Internet Explorer running in compatibility mode. Workaround is to disable compatibility mode. + +## Release 1.25.1 + +* __GDC Product__: GDC Data Portal +* __Release Date__: August 14, 2020 + +### New Features and Changes + +* API improvements were made to increase portal performance. + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Pre-release Data Portal login is not supported on Internet Explorer or the last version of Edge (42). Edge 41 does login successfully. +* Custom Facet Filters + * Some definitions are missing from the property list when adding custom facet file or case filters. +* Visualizations + * Negative numbers may be displayed for the Missing value category in the Treatment node within a Clinical Analysis. This occurs with projects that have multiple treatment nodes per case. All other values should be accurate. + * SIFT and PolyPhen annotations are missing from the export JSON of the mutation table. They are present in the export TSV. + * Data Portal graphs cannot be exported as PNG images in Internet Explorer. Graphs can be exported in PNG or SVG format from Chrome or Firefox browsers . 
Internet Explorer does not display chart legend and title when re-opening previously downloaded SVG files, the recommendation is to open downloaded SVG files with another program. +* Repository and Cart + * The annotation count in File table of Repository and Cart does not link to the Annotations page anymore. The user can navigate to the annotations through the annotation count in Repository - Case table. +* Legacy Archive + * The footer says version 1.9, but it is actually 1.13 + * Filtering by vital_status does not function in the Legacy Archive due to updates in how this property has been indexed. A workaround is to perform the case level filtering in the GDC Data Portal and copy the filter string for use in the Legacy Archive or the legacy API. + * Downloading a token in the GDC Legacy Archive does not refresh it. If a user downloads a token in the GDC Data Portal and then attempts to download a token in the GDC Legacy Archive, an old token may be provided. Reloading the Legacy Archive view will allow the user to download the updated token. + * Exporting the Cart table in JSON will export the GDC Archive file table instead of exporting the files in the Cart only. +* Web Browsers + * Browsers limit the number of concurrent downloads, it is generally recommended to add files to the cart and download large number of files through the GDC Data Transfer Tool, more details can be found on [GDC Website](https://gdc.cancer.gov/about-gdc/gdc-faqs). + * The GDC Portals are not compatible with Internet Explorer running in compatibility mode. Workaround is to disable compatibility mode. + +## Release 1.25.0 + +* __GDC Product__: GDC Data Portal +* __Release Date__: July 2, 2020 + +### New Features and Changes + +* Suppressed Experimental Strategy filter on the Exploration page as this currently filters for files with a particular strategy, not for cases. This may cause confusion amongst users. 
The filter will be re-instated in a future release once the logic is available to filter more appropriately for cases tied to a specific strategy. +* Updated the filter control panel styling across the Portal to have clearer titles (e.g. "Search Cases" instead of "Cases" in the quick search box). +* Made minor updates to the styling of the filter query display at the top of the Exploration page (spacing, borders). +* Added an expand/collapse control to the quick search bar of Clinical tab on the Exploration page, to be consistent with other Exploration tabs. +* Added a clear title above the counts in each filter control panel across the Portal (e.g. "# Cases", "# Genes", etc.). +* Moved various action buttons above the results table on the Repository Page to more accessible locations. +* Improved load time of the initial custom filter list on the Repository Page, when clicking "Add a File Filter" or "Add a Case/Biospecimen Filter". + +### Bugs Fixed Since Last Release + +* Fixed a bug in the Age at Diagnosis table on the Cohort Comparison page, where the # of cases in the table was not consistent with the # of cases shown when clicking the link to the Exploration page. +* Fixed minor positional accuracy issue of the lollipop data points on the Protein Viewer. +* Fixed bug on the Protein Viewer where, if clicking to switch between different lollipop data points, details of the previous lollipop were not closing. +* Fixed bug where the quick search bar on the Exploration Page's Genes filter tab was not expanding/collapsing properly. +* Fixed bug in the pop-up warning message when adding or removing items from the Cart, where long filenames were spilling outside the border of the pop-up. +* Fixed typo in the "View Cases in Exploration" button on the Repository page. +* Fixed typo in the pop-up user consent message when downloading controlled files from the Cart.
+ +### Known Issues and Workarounds + +* Pre-release Data Portal login is not supported on Internet Explorer or the last version of Edge (42). Edge 41 does login successfully. +* Custom Facet Filters + * Some definitions are missing from the property list when adding custom facet file or case filters. +* Visualizations + * Negative numbers may be displayed for the Missing value category in the Treatment node within a Clinical Analysis. This occurs with projects that have multiple treatment nodes per case. All other values should be accurate. + * SIFT and PolyPhen annotations are missing from the export JSON of the mutation table. They are present in the export TSV. + * Data Portal graphs cannot be exported as PNG images in Internet Explorer. Graphs can be exported in PNG or SVG format from Chrome or Firefox browsers . Internet Explorer does not display chart legend and title when re-opening previously downloaded SVG files, the recommendation is to open downloaded SVG files with another program. +* Repository and Cart + * The annotation count in File table of Repository and Cart does not link to the Annotations page anymore. The user can navigate to the annotations through the annotation count in Repository - Case table. +* Legacy Archive + * The footer says version 1.9, but it is actually 1.13 + * Filtering by vital_status does not function in the Legacy Archive due to updates in how this property has been indexed. A workaround is to perform the case level filtering in the GDC Data Portal and copy the filter string for use in the Legacy Archive or the legacy API. + * Downloading a token in the GDC Legacy Archive does not refresh it. If a user downloads a token in the GDC Data Portal and then attempts to download a token in the GDC Legacy Archive, an old token may be provided. Reloading the Legacy Archive view will allow the user to download the updated token. 
+ * Exporting the Cart table in JSON will export the GDC Archive file table instead of exporting the files in the Cart only. +* Web Browsers + * Browsers limit the number of concurrent downloads, it is generally recommended to add files to the cart and download large number of files through the GDC Data Transfer Tool, more details can be found on [GDC Website](https://gdc.cancer.gov/about-gdc/gdc-faqs). + * The GDC Portals are not compatible with Internet Explorer running in compatibility mode. Workaround is to disable compatibility mode. + +## Release 1.24.1 + +* __GDC Product__: GDC Data Portal +* __Release Date__: March 10, 2020 + +### New Features and Changes + +* Removed unnecessary comma and y-axis value from title of the mutation details pop-up in the Protein Viewer. +* Added Tobacco Smoking Status field to the Exposures tab on the Case entity page. +* Added a link to the Cart where users can access instructions for downloading the GDC Genome Build reference files. +* Added logic to prevent duplicate fetching of data for Clinical Analysis survival plots and optimize rendering. +* Added a button to clear searches for certain Portal search controls that were previously missing this ability. +* Reduced whitespace between Oncogrid and its control panel to optimize spacing and layout. +* Made entire Clinical Analysis results page responsive (card columns now scale & stack in response to the size of the browser window). +* Replaced Clinical Analysis function for printing clinical cards to a single PDF file, with more flexible functionality to instead download all the cards in SVG and/or PNG format. +* Added message to notify users when they try to access the Portal using Microsoft Internet Explorer, indicating which browsers are officially supported. +* Added arrow icon to sortable columns across the Portal to indicate the current sort direction. 
+ +### Bugs Fixed Since Last Release + +* Fixed bug where clicking a primary site on the Human Body Image was not re-directing to the Exploration page. +* Fixed layout issue where long Annotation Notes were exceeding the border of the text box. +* Fixed layout issue where the Repository header and action buttons were scaling and wrapping incorrectly if the browser window is shrunk beyond a certain threshold. +* Fixed layout issue where the responsive Clinical Analysis Cards were clipping improperly as the browser window is shrunk beyond a certain threshold. +* Fixed bug where the Clinical Tab on the Exploration page was crashing when entering a custom range of Years for the Age at Diagnosis facet. +* Fixed various minor cosmetic and color issues in PNG, SVG downloads of the Clinical Analysis survival plots. +* Fixed bug where the x-axis in PNG, SVG downloads of histograms across the Portal was being bolded incorrectly. +* Fixed bug where the expand/collapse symbols in the UI were incorrectly being exported in the TSV download of the Projects table. +* Fixed bug where Oncogrid's modal for customizing colors could not be scrolled below the fold if it was shrunk beyond a certain threshold. +* Fixed incorrect DTT hyperlink in the GDC Apps menu. +* Fixed bug where the "dbSNP rs ID" facet could not be minimized in the Exploration page's Mutations facet tab. +* Fixed layout issue where the Portal's header incorrectly overlaps some content when a notification banner is displayed. +* Fixed some minor layout & styling issues in the Exploration page's facets panel. +* Fixed bug where the Case ID on the Exploration page's Cases facet tab was not searchable in certain scenarios. +* Fixed bug where the Expand/Collapse button state was not changing properly when being used in the Biospecimen section of the Case entity page. +* Fixed incorrect capitalization of "dbGaP" in the Summary section of the Project entity page. 
+* Fixed layout issue where the Advanced Search query box on the Repository page could expand beyond the margins of the box's border. + +### Known Issues and Workarounds + +* Pre-release Data Portal login is not supported on Internet Explorer or the last version of Edge (42). Edge 41 does login successfully. +* Custom Facet Filters + * Some definitions are missing from the property list when adding custom facet file or case filters. +* Visualizations + * Negative numbers may be displayed for the Missing value category in the Treatment node within a Clinical Analysis. This occurs with projects that have multiple treatment nodes per case. All other values should be accurate. + * SIFT and PolyPhen annotations are missing from the export JSON of the mutation table. They are present in the export TSV. + * Data Portal graphs cannot be exported as PNG images in Internet Explorer. Graphs can be exported in PNG or SVG format from Chrome or Firefox browsers . Internet Explorer does not display chart legend and title when re-opening previously downloaded SVG files, the recommendation is to open downloaded SVG files with another program. +* Repository and Cart + * The annotation count in File table of Repository and Cart does not link to the Annotations page anymore. The user can navigate to the annotations through the annotation count in Repository - Case table. +* Legacy Archive + * The footer says version 1.9, but it is actually 1.13 + * Filtering by vital_status does not function in the Legacy Archive due to updates in how this property has been indexed. A workaround is to perform the case level filtering in the GDC Data Portal and copy the filter string for use in the Legacy Archive or the legacy API. + * Downloading a token in the GDC Legacy Archive does not refresh it. If a user downloads a token in the GDC Data Portal and then attempts to download a token in the GDC Legacy Archive, an old token may be provided.
Reloading the Legacy Archive view will allow the user to download the updated token. + * Exporting the Cart table in JSON will export the GDC Archive file table instead of exporting the files in the Cart only. +* Web Browsers + * Browsers limit the number of concurrent downloads, it is generally recommended to add files to the cart and download large number of files through the GDC Data Transfer Tool, more details can be found on [GDC Website](https://gdc.cancer.gov/about-gdc/gdc-faqs). + * The GDC Portals are not compatible with Internet Explorer running in compatibility mode. Workaround is to disable compatibility mode. + +## Release 1.23.1 + +* __GDC Product__: GDC Data Portal +* __Release Date__: December 10, 2019 + +### New Features and Changes + +* Updated display of x-axis units on the homepage Human Body chart to more easily display increased case counts for newly-added projects + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Pre-release Data Portal login is not supported on Internet Explorer or the last version of Edge (42). Edge 41 does login successfully. +* Custom Facet Filters + * Some definitions are missing from the property list when adding custom facet file or case filters. +* Visualizations + * Negative numbers may be displayed for the Missing value category in the Treatment node within a Clinical Analysis. This occurs with projects that have multiple treatment nodes per case. All other values should be accurate. + * SIFT and PolyPhen annotations are missing from the export JSON of the mutation table. They are present in the export TSV. + * Data Portal graphs cannot be exported as PNG images in Internet Explorer. Graphs can be exported in PNG or SVG format from Chrome or Firefox browsers . Internet Explorer does not display chart legend and title when re-opening previously downloaded SVG files, the recommendation is to open downloaded SVG files with another program. 
+* Repository and Cart + * The annotation count in File table of Repository and Cart does not link to the Annotations page anymore. The user can navigate to the annotations through the annotation count in Repository - Case table. +* Legacy Archive + * The footer says version 1.9, but it is actually 1.13 + * Filtering by vital_status does not function in the Legacy Archive due to updates in how this property has been indexed. A workaround is to perform the case level filtering in the GDC Data Portal and copy the filter string for use in the Legacy Archive or the legacy API. + * Downloading a token in the GDC Legacy Archive does not refresh it. If a user downloads a token in the GDC Data Portal and then attempts to download a token in the GDC Legacy Archive, an old token may be provided. Reloading the Legacy Archive view will allow the user to download the updated token. + * Exporting the Cart table in JSON will export the GDC Archive file table instead of exporting the files in the Cart only. +* Web Browsers + * Browsers limit the number of concurrent downloads, it is generally recommended to add files to the cart and download large number of files through the GDC Data Transfer Tool, more details can be found on [GDC Website](https://gdc.cancer.gov/about-gdc/gdc-faqs). + * The GDC Portals are not compatible with Internet Explorer running in compatibility mode. Workaround is to disable compatibility mode. + +## Release 1.23.0 + +* __GDC Product__: GDC Data Portal +* __Release Date__: November 6, 2019 + +### New Features and Changes + +* Added Clinical Data Analysis feature that allows Users to: + * Explore clinical data via the new Clinical Tab on the Exploration page. + * Build custom Case sets based on that clinical data for later analysis. + * Create an analysis to examine the clinical variables in a Case set, using various tools including histograms, survival plots, box plots, QQ plots, and custom binning. 
+ * Download the data (as TSV, JSON) and plots (as PNG, SVG) of each clinical variable in an analysis. + * Save an analysis to local storage to resume later (as long as storage is not cleared). +* Added links to CIViC annotations on the Gene and Mutation entity pages. +* Updated the default Top Mutated Genes histogram on the Exploration page to display only COSMIC Genes by default. +* Added Follow-Ups tab and nested Molecular Tests to Case entity page. +* Added text to BAM slicing modal to instruct Users how to access unmapped reads. + +### Bugs Fixed Since Last Release + +* Fixed font in exported PNGs, SVGs to be consistent with the Portal UI. +* Made custom Case and File filters in the Repository page case insensitive. +* Fixed bug where pfam domains in Protein Viewer could not be clicked in Firefox. +* Fixed bug where TSV download button could not be clicked in MS Edge. +* Fixed controlled access alert pop-up in the Cart so that the modal disappears correctly once the User has successfully logged in and initiated the download. + +### Known Issues and Workarounds + +* Pre-release Data Portal login is not supported on Internet Explorer or the last version of Edge (42). Edge 41 does login successfully. +* Custom Facet Filters + * Some definitions are missing from the property list when adding custom facet file or case filters. +* Visualizations + * Negative numbers may be displayed for the Missing value category in the Treatment node within a Clinical Analysis. This occurs with projects that have multiple treatment nodes per case. All other values should be accurate. + * SIFT and PolyPhen annotations are missing from the export JSON of the mutation table. They are present in the export TSV. + * Data Portal graphs cannot be exported as PNG images in Internet Explorer. Graphs can be exported in PNG or SVG format from Chrome or Firefox browsers .
Internet Explorer does not display chart legend and title when re-opening previously downloaded SVG files, the recommendation is to open downloaded SVG files with another program. +* Repository and Cart + * The annotation count in File table of Repository and Cart does not link to the Annotations page anymore. The user can navigate to the annotations through the annotation count in Repository - Case table. +* Legacy Archive + * The footer says version 1.9, but it is actually 1.13 + * Filtering by vital_status does not function in the Legacy Archive due to updates in how this property has been indexed. A workaround is to perform the case level filtering in the GDC Data Portal and copy the filter string for use in the Legacy Archive or the legacy API. + * Downloading a token in the GDC Legacy Archive does not refresh it. If a user downloads a token in the GDC Data Portal and then attempts to download a token in the GDC Legacy Archive, an old token may be provided. Reloading the Legacy Archive view will allow the user to download the updated token. + * Exporting the Cart table in JSON will export the GDC Archive file table instead of exporting the files in the Cart only. +* Web Browsers + * Browsers limit the number of concurrent downloads, it is generally recommended to add files to the cart and download large number of files through the GDC Data Transfer Tool, more details can be found on [GDC Website](https://gdc.cancer.gov/about-gdc/gdc-faqs). + * The GDC Portals are not compatible with Internet Explorer running in compatibility mode. Workaround is to disable compatibility mode. + +## Release 1.22.0 + +* __GDC Product__: GDC Data Portal +* __Release Date__: July 31, 2019 + +### New Features and Changes + +* Replaced existing Clinical, Biospecimen columns on the Projects page with 4 columns: Clinical, Clinical Supplement, Biospecimen, Biospecimen Supplement. 
The Clinical and Biospecimen columns now link directly to the project page, and their counts indicate the total cases in the project. The Clinical Supplement and Biospecimen Supplement columns work the same as the old Clinical and Biospecimen columns - They link to the Repository page with Files filtered based on the Project and Data Category (Clinical or Biospecimen). +* Added a new icon to the GDC Apps menu, which links to the GDC Publications website page. +* Added the Synchronous Malignancy field to the Diagnoses / Treatments tab on the Case entity page. +* Added the Pack Years Smoked field to the Exposures tab on the Case entity page. +* Increased length of x-axis labels on histograms to 10 characters so that projects with names that are typically standard 10 chars will display fully (e.g. most TCGA projects like TCGA-BRCA). + +### Bugs Fixed Since Last Release + +* Fixed bug where the PNG, SVG files for the Overall Survival Plot could not be downloaded. + +### Known Issues and Workarounds + +* Pre-release Data Portal login is not supported on Internet Explorer or the last version of Edge (42). Edge 41 does login successfully. +* Custom Facet Filters + * Some definitions are missing from the property list when adding custom facet file or case filters. +* Visualizations + * SIFT and PolyPhen annotations are missing from the export JSON of the mutation table. They are present in the export TSV. + * Data Portal graphs cannot be exported as PNG images in Internet Explorer. Graphs can be exported in PNG or SVG format from Chrome or Firefox browsers . Internet Explorer does not display chart legend and title when re-opening previously downloaded SVG files, the recommendation is to open downloaded SVG files with another program. +* Repository and Cart + * The annotation count in File table of Repository and Cart does not link to the Annotations page anymore. The user can navigate to the annotations through the annotation count in Repository - Case table. 
+* Legacy Archive + * Filtering by vital_status does not function in the Legacy Archive due to updates in how this property has been indexed. A workaround is to perform the case level filtering in the GDC Data Portal and copy the filter string for use in the Legacy Archive or the legacy API. + * Downloading a token in the GDC Legacy Archive does not refresh it. If a user downloads a token in the GDC Data Portal and then attempts to download a token in the GDC Legacy Archive, an old token may be provided. Reloading the Legacy Archive view will allow the user to download the updated token. + * Exporting the Cart table in JSON will export the GDC Archive file table instead of exporting the files in the Cart only. +* Web Browsers + * Browsers limit the number of concurrent downloads, it is generally recommended to add files to the cart and download large number of files through the GDC Data Transfer Tool, more details can be found on [GDC Website](https://gdc.cancer.gov/about-gdc/gdc-faqs). + * The GDC Portals are not compatible with Internet Explorer running in compatibility mode. Workaround is to disable compatibility mode. + +## Release 1.21.0 + +* __GDC Product__: GDC Data Portal +* __Release Date__: June 5, 2019 + +### New Features and Changes + +* Changed all Survival Plots to display the Duration (x-axis) in years instead of days. +* Updated data references to clinical properties throughout the Portal to match the underlying changes in the GDC data dictionary. + +### Bugs Fixed Since Last Release + +* Fixed bug where X-axis labels in histograms were cut off when displayed. +* Renamed the 'Experimental Strategies' facet on the Projects page to singular form. +* Fixed bug where columns with a % value of infinity (due to division by zero) show as 'NaN%'. Replaced instead with a label of '--'. +* Fixed bug where the download button in the cart access banner was still disabled after a user logged in from the banner. 
Instead, the experience is now improved so that after login, the banner is closed and the user must explicitly click 'Download' again. +* Fixed bug where if a new user logs into the Portal and views their profile, the app crashes if the user has no projects assigned yet. +* Fixed bug where Survival Rate numbers in the Survival Plot y-axis did not scale properly and overlapped into the axis lines. + +### Known Issues and Workarounds + +* Pre-release Data Portal login is not supported on Internet Explorer or the last version of Edge (42). Edge 41 does login successfully. +* Custom Facet Filters + * Some definitions are missing from the property list when adding custom facet file or case filters. +* Visualizations + * SIFT and PolyPhen annotations are missing from the export JSON of the mutation table. They are present in the export TSV. + * Data Portal graphs cannot be exported as PNG images in Internet Explorer. Graphs can be exported in PNG or SVG format from Chrome or Firefox browsers . Internet Explorer does not display chart legend and title when re-opening previously downloaded SVG files, the recommendation is to open downloaded SVG files with another program. +* Repository and Cart + * The annotation count in File table of Repository and Cart does not link to the Annotations page anymore. The user can navigate to the annotations through the annotation count in Repository - Case table. +* Legacy Archive + * Downloading a token in the GDC Legacy Archive does not refresh it. If a user downloads a token in the GDC Data Portal and then attempts to download a token in the GDC Legacy Archive, an old token may be provided. Reloading the Legacy Archive view will allow the user to download the updated token. + * Exporting the Cart table in JSON will export the GDC Archive file table instead of exporting the files in the Cart only.
+* Web Browsers + * Browsers limit the number of concurrent downloads, it is generally recommended to add files to the cart and download large number of files through the GDC Data Transfer Tool, more details can be found on [GDC Website](https://gdc.cancer.gov/about-gdc/gdc-faqs). + * The GDC Portals are not compatible with Internet Explorer running in compatibility mode. Workaround is to disable compatibility mode. + +## Release 1.20.0 + +* __GDC Product__: GDC Data Portal +* __Release Date__: April 17, 2019 + +### New Features and Changes + +* Upgraded the Portal to use the latest React Javascript library (version 16.8) + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Pre-release Data Portal login is not supported on Internet Explorer or the last version of Edge (42). Edge 41 does login successfully. +* Custom Facet Filters + * Some definitions are missing from the property list when adding custom facet file or case filters. +* Visualizations + * SIFT and PolyPhen annotations are missing from the export JSON of the mutation table. They are present in the export TSV. + * Data Portal graphs cannot be exported as PNG images in Internet Explorer. Graphs can be exported in PNG or SVG format from Chrome or Firefox browsers . Internet Explorer does not display chart legend and title when re-opening previously downloaded SVG files, the recommendation is to open downloaded SVG files with another program. +* Repository and Cart + * The annotation count in File table of Repository and Cart does not link to the Annotations page anymore. The user can navigate to the annotations through the annotation count in Repository - Case table. +* Legacy Archive + * Downloading a token in the GDC Legacy Archive does not refresh it. If a user downloads a token in the GDC Data Portal and then attempts to download a token in the GDC Legacy Archive, an old token may be provided. 
Reloading the Legacy Archive view will allow the user to download the updated token. + * Exporting the Cart table in JSON will export the GDC Archive file table instead of exporting the files in the Cart only. +* Web Browsers + * Browsers limit the number of concurrent downloads, it is generally recommended to add files to the cart and download large number of files through the GDC Data Transfer Tool, more details can be found on [GDC Website](https://gdc.cancer.gov/about-gdc/gdc-faqs). + * The GDC Portals are not compatible with Internet Explorer running in compatibility mode. Workaround is to disable compatibility mode. + +## Release 1.19.0 + +* __GDC Product__: GDC Data Portal +* __Release Date__: February 20, 2019 + +### New Features and Changes + +* Added support for viewing of controlled-access mutations in the Data Portal +* Added a new data access notification to remind logged-in users with access to controlled data that they need to follow their data use agreement. The message is fixed at the top of the Portal. +* Added the ability to search for previous versions of files. If the user enters the UUID of a previous version that cannot be found, the Portal returns the UUID of the latest version available. +* Renamed the Data Category for "Raw Sequencing Data" to "Sequencing Reads" throughout the portal where this appears, to be consistent with the Data Dictionary. +* Added a link in the Portal footer to the GDC support page. + +### Bugs Fixed Since Last Release + +* Fixed bug where Survival Plot button never stops loading if plotting mutated vs. non-mutated cases for a single Gene. +* Fixed inconsistent button styling when downloading controlled Downstream Analyses Files from File Entity page. +* Removed unnecessary Survival column from Arrange Columns button on Case Entity, Gene Entity pages. +* Removed unnecessary whitespace from pie charts on Repository page. 
+* Added missing File Size unit to Clinical Supplement File, Biospecimen Supplement File tables on Case Entity page. +* Fixed bug where clicking on Case Counts in Projects Graph tab was going to the Repository Files tab instead of the Cases tab. +* Fixed bug where the counts shown beside customer filters on the Repository Cases tab were not updating when filtering on other facets. +* Fixed bug where clicking the # of Affected Cases denominator on the Gene page's Most Frequent Somatic Mutations table displayed an incorrect number of Cases. + +### Known Issues and Workarounds + +* Pre-release Data Portal login is not supported on Internet Explorer or the last version of Edge (42). Edge 41 does login successfully. +* Custom Facet Filters + * Some definitions are missing from the property list when adding custom facet file or case filters. +* Visualizations + * SIFT and PolyPhen annotations are missing from the export JSON of the mutation table. They are present in the export TSV. + * Data Portal graphs cannot be exported as PNG images in Internet Explorer. Graphs can be exported in PNG or SVG format from Chrome or Firefox browsers . Internet Explorer does not display chart legend and title when re-opening previously downloaded SVG files, the recommendation is to open downloaded SVG files with another program. +* Repository and Cart + * The annotation count in File table of Repository and Cart does not link to the Annotations page anymore. The user can navigate to the annotations through the annotation count in Repository - Case table. +* Legacy Archive + * Downloading a token in the GDC Legacy Archive does not refresh it. If a user downloads a token in the GDC Data Portal and then attempts to download a token in the GDC Legacy Archive, an old token may be provided. Reloading the Legacy Archive view will allow the user to download the updated token. 
+ * Exporting the Cart table in JSON will export the GDC Archive file table instead of exporting the files in the Cart only. +* Web Browsers + * Browsers limit the number of concurrent downloads, it is generally recommended to add files to the cart and download large number of files through the GDC Data Transfer Tool, more details can be found on [GDC Website](https://gdc.cancer.gov/about-gdc/gdc-faqs). + * The GDC Portals are not compatible with Internet Explorer running in compatibility mode. Workaround is to disable compatibility mode. + +## Release 1.18.0 + +* __GDC Product__: GDC Data Portal +* __Release Date__: December 18, 2018 + +### New Features and Changes + +* A new data access message has been added when downloading controlled data. Users must agree to abide by data access control policies when downloading controlled data. +* In the Mutation free-text search in Exploration, mutation display now includes the UUID, genomic location, and matched search term for easier mutation searching. +* The ability to sort on ranked columns has been made available. + +### Bugs Fixed Since Last Release + +* In some cases, text was being cut off on the Project page visualization tab. Text is no longer cut off. +* HGNC link on Gene page broke as the source format url changed; The format was updated and the link is now functional +* In the biospecimen details on the Case page, the cart icon would disappear once clicked. It now is always visible. + +### Known Issues and Workarounds + +* Pre-release Data Portal login is not supported on Internet Explorer or the last version of Edge (42). Edge 41 does login successfully. +* Custom Facet Filters + * Some definitions are missing from the property list when adding custom facet file or case filters. +* Visualizations + * SIFT and PolyPhen annotations are missing from the export JSON of the mutation table. They are present in the export TSV. + * Data Portal graphs cannot be exported as PNG images in Internet Explorer. 
Graphs can be exported in PNG or SVG format from Chrome or Firefox browsers . Internet Explorer does not display chart legend and title when re-opening previously downloaded SVG files, the recommendation is to open downloaded SVG files with another program. +* Repository and Cart + * The annotation count in File table of Repository and Cart does not link to the Annotations page anymore. The user can navigate to the annotations through the annotation count in Repository - Case table. +* Legacy Archive + * Downloading a token in the GDC Legacy Archive does not refresh it. If a user downloads a token in the GDC Data Portal and then attempts to download a token in the GDC Legacy Archive, an old token may be provided. Reloading the Legacy Archive view will allow the user to download the updated token. + * Exporting the Cart table in JSON will export the GDC Archive file table instead of exporting the files in the Cart only. +* Web Browsers + * Browsers limit the number of concurrent downloads, it is generally recommended to add files to the cart and download large number of files through the GDC Data Transfer Tool, more details can be found on [GDC Website](https://gdc.cancer.gov/about-gdc/gdc-faqs). + * The GDC Portals are not compatible with Internet Explorer running in compatibility mode. Workaround is to disable compatibility mode. 
+ +## Release 1.17.0 + +* __GDC Product__: GDC Data Portal +* __Release Date__: November 7, 2018 + +### New Features and Changes + +* Copy Number Variation (CNV) data derived from GISTIC results are now available in the portal: + * View number of CNV events on a gene in a cohort in the Explore Gene table tab + * Explore CNVs associated with a gene on the Gene Entity Page + * Explore CNVs concurrently with mutations on the Oncogrid with new visualization + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* Custom Facet Filters + * Some definitions are missing from the property list when adding custom facet file or case filters. +* Visualizations + * SIFT and PolyPhen annotations are missing from the export JSON of the mutation table. They are present in the export TSV. + * Data Portal graphs cannot be exported as PNG images in Internet Explorer. Graphs can be exported in PNG or SVG format from Chrome or Firefox browsers . Internet Explorer does not display chart legend and title when re-opening previously downloaded SVG files, the recommendation is to open downloaded SVG files with another program. +* Repository and Cart + * The annotation count in File table of Repository and Cart does not link to the Annotations page anymore. The user can navigate to the annotations through the annotation count in Repository - Case table. +* Legacy Archive + * Downloading a token in the GDC Legacy Archive does not refresh it. If a user downloads a token in the GDC Data Portal and then attempts to download a token in the GDC Legacy Archive, an old token may be provided. Reloading the Legacy Archive view will allow the user to download the updated token. + * Exporting the Cart table in JSON will export the GDC Archive file table instead of exporting the files in the Cart only. 
+* Web Browsers + * Browsers limit the number of concurrent downloads, it is generally recommended to add files to the cart and download large number of files through the GDC Data Transfer Tool, more details can be found on [GDC Website](https://gdc.cancer.gov/about-gdc/gdc-faqs). + * The GDC Portals are not compatible with Internet Explorer running in compatibility mode. Workaround is to disable compatibility mode. ## Release 1.16.0 diff --git a/docs/Data_Portal/Users_Guide/Advanced_Search.md b/docs/Data_Portal/Users_Guide/Advanced_Search.md index 7a9c676f2..5f3b23844 100644 --- a/docs/Data_Portal/Users_Guide/Advanced_Search.md +++ b/docs/Data_Portal/Users_Guide/Advanced_Search.md @@ -7,7 +7,7 @@ Only available in the Repository view, the Advanced Search page offers complex q ## Overview: GQL -Advanced search allows, via Genomic Query Language (GQL), to use structured queries to search for files and cases. +Advanced search allows for structured queries to search for files and cases. This is done via Genomic Query Language (GQL), a query language created by the [GDC](https://gdc.cancer.gov/) and [OICR](https://oicr.on.ca/). [![Advanced Search View](images/gdc-data-portal-advanced-search.png)](images/gdc-data-portal-advanced-search.png "Click to see the full image.") @@ -17,7 +17,7 @@ A simple query in GQL (also known as a 'clause') consists of a __field__, follow Note that it is not possible to compare two fields (e.g. disease_type = project.name). -__Note__: GQL is not a database query language. For example, GQL does not have a "SELECT" statement. +> __Note:__ GQL is not a database query language. For example, GQL does not have a "SELECT" statement. ### Switching between Advanced Search and Facet Filters @@ -27,7 +27,7 @@ A query created in Advanced Search is not translated back to facet filters. 
Clic ## Using the Advanced Search -When opening the advanced search page (via the Repository view), the search field will be automatically populated with facets filters already applied (if any). +When opening the Advanced Search Page (via the Repository View), the search field will be automatically populated with facets filters already applied (if any). This default query can be removed by pressing "Reset". @@ -37,7 +37,7 @@ Once the query has been entered and is identified as a "Valid Query", click on " As a query is being written, the GDC Data Portal will analyze the context and offer a list of auto-complete suggestions. Auto-complete suggests both fields and values as described below. -#### Field Auto-complete +### Field Auto-complete The list of auto-complete suggestions includes __all__ available fields matching the user text input. The user has to scroll down to see more fields in the dropdown: @@ -51,17 +51,17 @@ The value auto-complete is not aware of the general context of the query, the sy [![Value Auto-complete](images/gdc-data-portal-advanced-search-value.png)](images/gdc-data-portal-advanced-search-value.png "Click to see the full image.") -__Note__: Quotes are automatically added to the value if it contains spaces. +> __Note:__ Quotes are automatically added to the value if it contains spaces. ## Setting Precedence of Operators You can use parentheses in complex GQL statements to enforce the precedence of operators. 
-For example, if you want to find all the open files in TCGA program as well as the files in TARGET program, you can use parentheses to enforce the precedence of the boolean operators in your query, i.e.: +For example, if you want to find all the open files in TCGA program as well as the files in TARGET program, you can use parentheses to enforce the precedence of the Boolean operators in your query, i.e.: (files.access = open and cases.project.program.name = TCGA) or cases.project.program.name = TARGET -__Note__: Without parentheses, the statement will be evaluated left-to-right. +> __Note:__ Without parentheses, the statement will be evaluated left-to-right. ## Keywords @@ -69,42 +69,42 @@ A GQL keyword is a word that joins two or more clauses together to form a comple **List of Keywords:** -* AND -* OR +* __AND__ +* __OR__ -__Note__: parentheses can be used to control the order in which clauses are executed. +> __Note:__ Parentheses can be used to control the order in which clauses are executed. -### AND Keyword +### "__AND__" Keyword Used to combine multiple clauses, allowing you to refine your search. Examples: -* Find all open files in breast cancer +* Find all open files in breast cancer: - cases.project.primary_site = Breast and files.access = open + cases.primary_site = Breast and files.access = open -* Find all open files in breast cancer and data type is copy number variation +* Find all open files in breast cancer and data type is gene expression quantification: - cases.project.primary_site = Breast and files.access = open and files.data_type = "Copy number variation" + cases.primary_site = Breast and files.access = open and files.data_type = "Gene Expression Quantification" -### OR Keyword +### "__OR__" Keyword Used to combine multiple clauses, allowing you to expand your search. -__Note__: __IN__ keyword can be an alternative to OR and result in simplified queries. 
+> __Note:__ The __IN__ keyword can be an alternative to __OR__ and result in simplified queries. Examples: -* Find all files that are raw sequencing data or raw microarray data: +* Find all files that are raw sequencing data or aligned reads: - files.data_type = "Raw microarray data" or files.data_type = "Raw sequencing data" + files.data_type = "Aligned Reads" or files.data_type = "Raw sequencing data" -* Find all files where donors are male or vital status is alive: +* Find all files where cases are male or vital status is alive: - cases.demographic.gender = male or cases.diagnoses.vital_status = alive + cases.demographic.gender = male or cases.diagnoses.vital_status = alive ## Operators @@ -127,136 +127,154 @@ An operator in GQL is one or more symbols or words comparing the value of a fiel | NOT MISSING | Field NOT MISSING | -### "=" operator - EQUAL +### "__=__" Operator - __EQUAL__ -The "=" operator is used to search for files where the value of the specified field exactly matches the specified value. +The "__=__" operator is used to search for files where the value of the specified field exactly matches the specified value. Examples: -* Find all files that are gene expression: +* Find all files that are gene expression quantification: - files.data_type = "Gene expression" + files.data_type = "Gene Expression Quantification" * Find all cases whose gender is female: - cases.demographic.gender = female + cases.demographic.gender = female -### "!=" operator - NOT EQUAL +### "__!=__" Operator - __NOT EQUAL__ -The "!=" operator is used to search for files where the value of the specified field does not match the specified value. +The "__!=__" operator is used to search for files where the value of the specified field does not match the specified value. -The "!=" operator will not match a field that has no value (i.e. a field that is empty). For example, 'gender != male' will only match cases who have a gender and the gender is not male. 
To find cases other than male or with no gender populated, you would need to type gender != male or gender is missing. +The "__!=__" operator will not match a field that has no value (i.e. a field that is empty). For example: + + cases.demographic.gender != male + +This search will only match cases who have a gender and the gender is not male. To find cases other than male or with no gender populated, you would need to search: + + cases.demographic.gender != male or cases.demographic.gender is missing Example: -* Find all files with an experimental different from genotyping array: +* Find all files with an experimental strategy that is not genotyping array: - files.experimental_strategy != "Genotyping array" + files.experimental_strategy != "Genotyping array" -### ">" operator - GREATER THAN +### "__>__" Operator - __GREATER THAN__ -The ">" operator is used to search for files where the value of the specified field is greater than the specified value. +The "__>__" operator is used to search for files where the value of the specified field is greater than the specified value. Example: * Find all cases whose number of days to death is greater than 60: - cases.diagnoses.days_to_death > 60 + cases.diagnoses.days_to_death > 60 -### ">=" operator - GREATER THAN OR EQUALS +### "__>=__" Operator - __GREATER THAN OR EQUALS__ -The ">=" operator is used to search for files where the value of the specified field is greater than or equal to the specified value. +The "__>=__" operator is used to search for files where the value of the specified field is greater than or equal to the specified value. Example: * Find all cases whose number of days to death is equal or greater than 60: - cases.diagnoses.days_to_death >= 60 + cases.diagnoses.days_to_death >= 60 -### "<" operator - LESS THAN +### "__<__" Operator - __LESS THAN__ -The "<" operator is used to search for files where the value of the specified field is less than the specified value.
+The "__<__" operator is used to search for files where the value of the specified field is less than the specified value. Example: * Find all cases whose age at diagnosis is less than 400 days: - cases.diagnoses.age_at_diagnosis < 400 + cases.diagnoses.age_at_diagnosis < 400 -### "<=" operator - LESS THAN OR EQUALS +### "__<=__" Operator - __LESS THAN OR EQUALS__ -The "<=" operator is used to search for files where the value of the specified field is less than or equal to the specified value. +The "__<=__" operator is used to search for files where the value of the specified field is less than or equal to the specified value. Example: * Find all cases with a number of days to death less than or equal to 20: - cases.diagnoses.days_to_death <= 20 + cases.diagnoses.days_to_death <= 20 -### "IN" Operator +### "__IN__" Operator -The "IN" operator is used to search for files where the value of the specified field is one of multiple specified values. The values are specified as a comma-delimited list, surrounded by brackets [ ]. +The "__IN__" operator is used to search for files where the value of the specified field is one of multiple specified values. The values are specified as a comma-delimited list, surrounded by brackets [ ]. -Using "IN" is equivalent to using multiple 'EQUALS (=)' statements, but is shorter and more convenient. That is, typing 'project IN [ProjectA, ProjectB, ProjectC]' is the same as typing 'project = "ProjectA" OR project = "ProjectB" OR project = "ProjectC"'. +Using "__IN__" is equivalent to using multiple "__=__" (__EQUALS__) statements, but is shorter and more convenient. 
That is, the following two statements will retrieve the same output: + + cases.project.name IN [ProjectA, ProjectB, ProjectC] + cases.project.name = "ProjectA" OR cases.project.name = "ProjectB" OR cases.project.name = "ProjectC" Examples: -* Find all files in breast, breast and lung and cancer: +* Find all files in breast, brain, and lung cancer: + + cases.primary_site IN [Breast, Brain, Lung] + +* Find all files that are annotated somatic mutations or raw simple somatic mutations: + + files.data_type IN ["Annotated Somatic Mutation", "Raw Simple Somatic Mutation"] + - cases.project.primary_site IN [Brain, Breast,Lung] +### "__EXCLUDE__" Operator -* Find all files tagged with exon or junction or hg19: +The "__EXCLUDE__" operator is used to search for files where the value of the specified field is not one of multiple specified values. - files.data_type IN ["Aligned reads", "Unaligned reads"] +Using "__EXCLUDE__" is equivalent to using multiple "__!=__" (__NOT_EQUALS__) statements, but is shorter and more convenient. That is, the following two statements will retrieve the same output: + cases.project.name EXCLUDE [ProjectA, ProjectB, ProjectC] + cases.project.name != "ProjectA" AND cases.project.name != "ProjectB" AND cases.project.name != "ProjectC" -### "EXCLUDE" Operator +The "__EXCLUDE__" operator will not match a field that has no value (i.e. a field that is empty). For example: -The "EXCLUDE" operator is used to search for files where the value of the specified field is not one of multiple specified values. + files.experimental_strategy EXCLUDE ["WGS","WXS"] -Using "EXCLUDE" is equivalent to using multiple 'NOT_EQUALS (!=)' statements, but is shorter and more convenient. That is, typing 'project EXCLUDE [ProjectA, ProjectB, ProjectC]' is the same as typing 'project != "ProjectA" OR project != "ProjectB" OR project != "ProjectC"' +This search will only match files that have an experimental strategy **and** the experimental strategy is not "WGS" or "WXS".
To find files with an experimental strategy different than "WGS" or "WXS" **or is not assigned**, you would need to type: -The "EXCLUDE" operator will not match a field that has no value (i.e. a field that is empty). For example, 'experimental strategy EXCLUDE ["WGS","WXS"]' will only match files that have an experimental strategy **and** the experimental strategy is not "WGS" or "WXS". To find files with an experimental strategy different from than "WGS" or "WXS" **or is not assigned**, you would need to type: files.experimental_strategy in ["WXS","WGS"] or files.experimental_strategy is missing. + files.experimental_strategy EXCLUDE ["WXS","WGS"] or files.experimental_strategy is missing Examples: * Find all files where experimental strategy is not WXS, WGS, Genotyping array: - files.experimental_strategy EXCLUDE [WXS, WGS, "Genotyping array"] + files.experimental_strategy EXCLUDE [WXS, WGS, "Genotyping array"] -### "IS MISSING" Operator +### "__IS MISSING__" Operator -The "IS" operator can only be used with "MISSING". That is, it is used to search for files where the specified field has no value. +The "__IS__" operator can only be used with "__MISSING__". That is, it is used to search for files where the specified field has no value. Examples: * Find all cases where gender is missing: - cases.demographic.gender is MISSING + cases.demographic.gender is MISSING -### "NOT MISSING" Operator +### "__NOT MISSING__" Operator -The "NOT" operator can only be used with "MISSING". That is, it is used to search for files where the specified field has a value. +The "__NOT__" operator can only be used with "__MISSING__". That is, it is used to search for files where the specified field has a value. Examples: * Find all cases where race is not missing: - cases.demographic.race NOT MISSING + cases.demographic.race NOT MISSING ## Special Cases -### Date format +### Date Format The date format should be the following: **YYYY-MM-DD** (without quotes).
Example: - files.updated_datetime > 2015-12-31 + files.updated_datetime > 2015-12-31 ### Using Quotes @@ -265,9 +283,9 @@ A value must be quoted if it contains a space. Otherwise the advanced search wil Quotes are not necessary if the value consists of one single word. -* Example: Find all cases with primary site is brain and data type is copy number variation: +* Example: Find all cases with primary site is brain and data type is copy number segment: - cases.project.primary_site = Brain and files.data_type = "Copy number variation" + cases.primary_site = Brain and files.data_type = "Copy Number Segment" ### Age at Diagnosis - Unit in Days @@ -277,7 +295,7 @@ The __conversion factor__ is 1 year = 365.25 days * Example: Find all cases whose age at diagnosis > 40 years old (40 * 365.25) - cases.diagnoses.age_at_diagnosis > 14610 + cases.diagnoses.age_at_diagnosis > 14610 @@ -285,119 +303,4 @@ The __conversion factor__ is 1 year = 365.25 days The full list of fields available on the GDC Data Portal can be found through the GDC API using the following endpoint: -[https://api.gdc.cancer.gov/gql/_mapping](https://api.gdc.cancer.gov/gql/_mapping) - -Alternatively, a static list of fields is available below (not exhaustive). 
- -### Files - -+ files.access -+ files.acl -+ files.archive.archive_id -+ files.archive.revision -+ files.archive.submitter_id -+ files.center.center_id -+ files.center.center_type -+ files.center.code -+ files.center.name -+ files.center.namespace -+ files.center.short_name -+ files.data_format -+ files.data_subtype -+ files.data_type -+ files.experimental_strategy -+ files.file_id -+ files.file_name -+ files.file_size -+ files.md5sum -+ files.origin -+ files.platform -+ files.related_files.file_id -+ files.related_files.file_name -+ files.related_files.md5sum -+ files.related_files.type -+ files.state -+ files.state_comment -+ files.submitter_id -+ files.tags - -### Cases - -+ cases.case_id -+ cases.submitter_id -+ cases.diagnoses.age_at_diagnosis -+ cases.diagnoses.days_to_death -+ cases.demographic.ethnicity -+ cases.demographic.gender -+ cases.demographic.race -+ cases.diagnoses.vital_status -+ cases.project.disease_type -+ cases.project.name -+ cases.project.program.name -+ cases.project.program.program_id -+ cases.project.project_id -+ cases.project.state -+ cases.samples.sample_id -+ cases.samples.submitter_id -+ cases.samples.sample_type -+ cases.samples.sample_type_id -+ cases.samples.shortest_dimension -+ cases.samples.time_between_clamping_and_freezing -+ cases.samples.time_between_excision_and_freezing -+ cases.samples.tumor_code -+ cases.samples.tumor_code_id -+ cases.samples.current_weight -+ cases.samples.days_to_collection -+ cases.samples.days_to_sample_procurement -+ cases.samples.freezing_method -+ cases.samples.initial_weight -+ cases.samples.intermediate_dimension -+ cases.samples.is_ffpe -+ cases.samples.longest_dimension -+ cases.samples.oct_embedded -+ cases.samples.pathology_report_uuid -+ cases.samples.portions.analytes.a260_a280_ratio -+ cases.samples.portions.analytes.aliquots.aliquot_id -+ cases.samples.portions.analytes.aliquots.amount -+ cases.samples.portions.analytes.aliquots.center.center_id -+ 
cases.samples.portions.analytes.aliquots.center.center_type -+ cases.samples.portions.analytes.aliquots.center.code -+ cases.samples.portions.analytes.aliquots.center.name -+ cases.samples.portions.analytes.aliquots.center.namespace -+ cases.samples.portions.analytes.aliquots.center.short_name -+ cases.samples.portions.analytes.aliquots.concentration -+ cases.samples.portions.analytes.aliquots.source_center -+ cases.samples.portions.analytes.aliquots.submitter_id -+ cases.samples.portions.analytes.amount -+ cases.samples.portions.analytes.analyte_id -+ cases.samples.portions.analytes.analyte_type -+ cases.samples.portions.analytes.concentration -+ cases.samples.portions.analytes.spectrophotometer_method -+ cases.samples.portions.analytes.submitter_id -+ cases.samples.portions.analytes.well_number -+ cases.samples.portions.center.center_id -+ cases.samples.portions.center.center_type -+ cases.samples.portions.center.code -+ cases.samples.portions.center.name -+ cases.samples.portions.center.namespace -+ cases.samples.portions.center.short_name -+ cases.samples.portions.is_ffpe -+ cases.samples.portions.portion_id -+ cases.samples.portions.portion_number -+ cases.samples.portions.slides.number_proliferating_cells -+ cases.samples.portions.slides.percent_eosinophil_infiltration -+ cases.samples.portions.slides.percent_granulocyte_infiltration -+ cases.samples.portions.slides.percent_inflam_infiltration -+ cases.samples.portions.slides.percent_lymphocyte_infiltration -+ cases.samples.portions.slides.percent_monocyte_infiltration -+ cases.samples.portions.slides.percent_necrosis -+ cases.samples.portions.slides.percent_neutrophil_infiltration -+ cases.samples.portions.slides.percent_normal_cells -+ cases.samples.portions.slides.percent_stromal_cells -+ cases.samples.portions.slides.percent_tumor_cells -+ cases.samples.portions.slides.percent_tumor_nuclei -+ cases.samples.portions.slides.section_location -+ cases.samples.portions.slides.slide_id -+ 
cases.samples.portions.slides.submitter_id -+ cases.samples.portions.submitter_id -+ cases.samples.portions.weight +[https://api.gdc.cancer.gov/gql/_mapping](https://api.gdc.cancer.gov/gql/_mapping) \ No newline at end of file diff --git a/docs/Data_Portal/Users_Guide/Cart.md b/docs/Data_Portal/Users_Guide/Cart.md index f24070cb7..c7b1c840d 100644 --- a/docs/Data_Portal/Users_Guide/Cart.md +++ b/docs/Data_Portal/Users_Guide/Cart.md @@ -1,62 +1,62 @@ # Cart and File Download -## Overview - -While browsing the GDC Data Portal, files can either be downloaded individually from [file detail pages](Repository.md#file-summary-page) or collected in the file cart to be downloaded as a bundle. Clicking on the shopping cart icon that is next to any item in the GDC will add the item to your cart. +While browsing the GDC Data Portal, files can either be downloaded individually from [File Summary Pages](Repository.md#file-summary-page) or collected in the file cart to be downloaded as a bundle. Clicking on the shopping cart icon that is next to any item in the GDC will add the item to your cart. ## GDC Cart [![Cart](images/cart-overview_v2.png)](images/cart-overview_v2.png "Click to see the full image.") -### Cart Summary +## Cart Summary -The cart page shows a summary of all files currently in the cart: +The Cart Summary Page shows a summary of all files currently in the cart: -* Number of files -* Number of cases associated with the files -* Total file size +* Number of files. +* Number of cases associated with the files. +* Total file size. The Cart page also displays two tables: -* __File count by project__: Breaks down the files and cases by each project -* __File count by authorization level__: Breaks down the files in the cart by authorization level. A user must be logged into the GDC in order to download 'Controlled-Access files' +* __File count by project__: Breaks down the files and cases by each project. 
+* __File count by authorization level__: Breaks down the files in the cart by authorization level. A user must be logged into the GDC in order to download 'Controlled-Access files'. -The cart also directs users how to download files in the cart. For large data files, it is recommended that the GDC Data Transfer Tool be used. +The cart also directs users how to download files in the cart. For large data files, it is recommended that the [GDC Data Transfer Tool](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool) be used. -### Cart Items +## Cart Items [![Cart](images/gdc-cart-items_v2.png)](images/gdc-cart-items_v2.png "Click to see the full image.") The Cart Items table shows the list of all the files that were added to the Cart. The table gives the folowing information for each file in the cart: * __Access__: Displays whether the file is open or controlled access. Users must login to the GDC Portal and have the appropriate credentials to access these files. -* __File Name__: Name of the file. Clicking the link will bring the user to the file summary page. -* __Cases__: How many cases does the file contain. Clicking the link will bring the user to the case summary page. -* __Project__: The Project that the file belongs to. Clicking the link will bring the user to the Project summary page. -* __Category__: Type of data -* __Format__: The file format -* __Size__: The size of the file -* __Annotations__: Whether there are any annotations +* __File Name__: Name of the file. Clicking the link will bring the user to the [File Summary Page](#file-summary-page). +* __Cases__: How many cases does the file contain. Clicking the link will bring the user to the [Case Summary Page](Exploration.md#case-summary-page). +* __Project__: The Project that the file belongs to. Clicking the link will bring the user to the [Project Summary Page](Projects.md#project-summary-page). +* __Category__: Type of data. +* __Format__: The file format. +* __Size__: The size of the file. 
+* __Annotations__: Whether there are any annotations. -## Download Options +# Download Options [![Cart](images/gdc-download-options_v2.png)](images/gdc-download-options_v2.png "Click to see the full image.") -There are a few buttons on the Cart page that allow users to download files. The following download options are available: +The following buttons on the Cart page allow users to download files that are related to the ones in the cart. The following download options are available: -* __Biospecimen__: Downloads bioscpecimen data related to files in the cart in either TSV or JSON format. -* __Clinical__: Downloads clinical data related to files in the cart in either TSV or JSON format. -* __Sample Sheet__: Downloads a tab-separated file which contains the associated case/sample IDs and sample type for each file in the cart. -* __Metadata__: GDC harmonized clinical, biospecimen, and file metadata associated with the files in the cart. -* __Download Manifest__: Download a manifest file for use with the GDC Data Transfer Tool to download files. A manifest file contains a list of the UUIDs that correspond to the files in the cart. -* __Download Cart__: Download the files in the Cart directly through the browser. Users have to be cautious of the amount of data in the cart since this option will not optimize bandwidth and will not provide resume capabilities. -* __SRA XML, MAGE-TAB__: This option is available in the GDC Legacy Archive only. It is used to download metadata files associated with the files in the cart. +* __Biospecimen:__ Downloads biospecimen data related to files in the cart in either TSV or JSON format. +* __Clinical:__ Downloads clinical data related to files in the cart in either TSV or JSON format. +* __Sample Sheet:__ Downloads a tab-separated file which contains the associated case/sample IDs and the sample type (Tumor/Normal) for each file in the cart.
+* __Metadata:__ GDC harmonized clinical, biospecimen, and file metadata associated with the files in the cart. +* __Download:__ + * __Manifest:__ Download a manifest file for use with the GDC Data Transfer Tool to download files. A manifest file contains a list of the UUIDs that correspond to the files in the cart. + * __Cart:__ Download the files in the Cart directly through the browser. Users have to be cautious of the amount of data in the cart since this option will not optimize bandwidth and will not provide resume capabilities. +* __Remove from Cart:__ Remove all files or unauthorized files from the cart. +* __SRA XML, MAGE-TAB:__ This option is available in the GDC Legacy Archive only. It is used to download metadata files associated with the files in the cart. -The cart allows users to download up to 5 GB of data directly through the web browser. This is not recommended for downloading large volumes of data, in particular due to the absence of a retry/resume mechanism. For downloads over 5 GB we recommend using the GDC Data Transfer Tool. +The cart allows users to download up to 5 GB of data directly through the web browser. This is not recommended for downloading large volumes of data, in particular due to the absence of a retry/resume mechanism. For downloads over 5 GB we recommend using the `Download Manifest` button to download a manifest file that can be imported into the [GDC Data Transfer Tool](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Getting_Started/). -__Note__: when downloading multiple files from the cart, they are automatically bundled into one single Gzipped (.tar.gz) file. +>__Note__: when downloading multiple files from the cart, they are automatically bundled into one single Gzipped (.tar.gz) file. -### GDC Data Transfer Tool +## [GDC Data Transfer Tool](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool) The `Download Manifest` button will download a manifest file that can be imported into the GDC Data Transfer Tool.
Below is an example of the contents of a manifest file used for download: @@ -73,14 +73,69 @@ c57673ac-998a-4a50-a12b-4cac5dc3b72e mdanderson.org_KIRP.MDA_RPPA_Core.mage-tab. The Manifest contains a list of the file UUIDs in the cart and can be used together with the GDC Data Transfer Tool to download all files. -Information on the GDC Data Transfer Tool is available in the [GDC Data Transfer Tool User's Guide](/node/8196/). +Information on the GDC Data Transfer Tool is available in the [GDC Data Transfer Tool User's Guide](../../Data_Transfer_Tool/Users_Guide/Getting_Started.md). + +# Controlled Files + +If a user tries to download a cart containing controlled files without being authenticated, a pop-up will be displayed offering the user the choice to either download only open access files or log in to the GDC Data Portal through eRA Commons. See [Authentication](#authentication) for details. + +Once a user is logged in, controlled files that they have access to can be downloaded. To download files from the portal, users must agree to the GDC and individual project Data Use Agreements by selecting the agreement checkbox on the Access Alert message. + +[![Cart Page](images/gdc-data-portal-download-cart_v2.png)](images/gdc-data-portal-download-cart_v2.png "Click to see the full image.") + +# Authentication + +The GDC Data Portal provides granular metadata for all datasets available in the GDC. Any user can see a listing of all available data files, including controlled-access files. The GDC Data Portal also allows users to download open-access files without logging in. However, downloading of controlled-access files is restricted to authorized users and requires authentication. + +## Logging into the GDC + +To login to the GDC, users must click on the `Login` button on the top right of the GDC Website. + +![Login](images/gdc-login.png) + +After clicking Login, users authenticate themselves using their eRA Commons login and password.
If authentication is successful, the eRA Commons username will be displayed in the upper right corner of the screen, in place of the "Login" button. + +Upon successful authentication, GDC Data Portal users can: + +- See which controlled-access files they can access. +- Download controlled-access files directly from the GDC Data Portal. +- Download an authentication token for use with the GDC Data Transfer Tool or the GDC API. +- See controlled-access mutation data they can access. + +Controlled-access files are identified using a "lock" icon: + +[![GDC Data Portal Main Page](images/gdc-data-portal-controlled-files.png)](images/gdc-data-portal-controlled-files.png "Click to see the full image.") + +The rest of this section describes controlled data access features of the GDC Data Portal available to authorized users. For more information about open and controlled-access data, and about obtaining access to controlled data, see [Data Access Processes and Tools](https://gdc.cancer.gov/access-data/data-access-processes-and-tools). + +## User Profile + +After logging into the GDC Portal, users can view which projects they have access to by clicking the `User Profile` section in the dropdown menu in the top corner of the screen. + +[![User Profile Drop Down](images/gdc-user-profile-dropdown.png)](images/gdc-user-profile-dropdown.png "Click to see the full image.") + +Clicking this button shows the list of projects. + +[![User Profile](images/gdc-user-profile.png)](images/gdc-user-profile.png "Click to see the full image.") + +## GDC Authentication Tokens + +The GDC Data Portal provides authentication tokens for use with the GDC Data Transfer Tool or the GDC API. To download a token: + +1. Log into the GDC using your eRA Commons credentials. +2. Click the username in the top right corner of the screen. +3. Select the "Download token" option. 
+ +![Token Download Button](images/gdc-data-portal-token-download.png) + +A new token is generated each time the `Download Token` button is clicked. -### Individual Files Download +For more information about authentication tokens, see [Data Security](../../Data/Data_Security/Data_Security.md#authentication-tokens). -Similar to the files page, each row contains a download button to download a particular file individually. +>__Note:__ The authentication token should be kept in a secure location, as it allows access to all data accessible by the associated user account. -## Controlled Files +## Logging Out -If a user tries to download a cart containing controlled files and without being authenticated, a pop-up will be displayed to offer the user either to download only open access files or to login into the GDC Data Portal through eRA Commons. See [Authentication](Authentication.md) for details. +To log out of the GDC, click the username in the top right corner of the screen, and select the Logout option. -[![Cart Page](images/gdc-data-portal-download-cart.png)](images/gdc-data-portal-download-cart.png "Click to see the full image.") +![Logout link](images/gdc-data-portal-token-download.png) diff --git a/docs/Data_Portal/Users_Guide/Custom_Set_Analysis.md b/docs/Data_Portal/Users_Guide/Custom_Set_Analysis.md index 6cfdc8d18..f14799d03 100644 --- a/docs/Data_Portal/Users_Guide/Custom_Set_Analysis.md +++ b/docs/Data_Portal/Users_Guide/Custom_Set_Analysis.md @@ -1,6 +1,6 @@ -# Custom Set Analysis +# Analysis -In addition to the [Exploration page](Exploration.md), the GDC Data Portal also has features used to save and compare sets of cases, genes, and mutations. These sets can either be generated with existing filters (e.g. males with lung cancer) or through custom selection (e.g. a user-generated list of case IDs). +In addition to the [Exploration Page](Exploration.md), the GDC Data Portal also has features used to save and compare sets of cases, genes, and mutations. 
These sets can either be generated with existing filters (e.g. males with lung cancer) or through custom selection (e.g. a user-generated list of case IDs). Note that saving a set only saves the type of entity included in the set. For example, a saved case set will not include filters that were applied to genes or mutations. Please be aware that your custom sets are deleted during each new GDC data release. You can export them and re-upload them in the "Manage Sets" link at the top right of the Portal. @@ -8,27 +8,63 @@ Note that saving a set only saves the type of entity included in the set. For ex Cohort sets are completely customizable and can be generated for cases, genes, or mutations using the following methods: -__Upload ID Set:__ This feature is available in the "Manage Sets" link at the top right of the Portal. Choose "Upload Set" and then select whether the set comprises cases, genes, or mutations. A set of IDs (IDs* or UUIDs) can then be uploaded in a text file or copied and pasted into the list of identifiers field along with a name identifying the set. Once the list of identifiers is uploaded, they are validated and grouped according to whether the identifier matched an existing GDC ID or did not match ("Unmatched"). +__Apply Filters in Exploration:__ Sets can be assembled using the existing filters in the Exploration page. They can be saved by choosing the "Save/Edit Case Set" button under the pie charts for case sets. This will prompt a decision to save as new case set. The same can be done for both gene and mutation filters, and can be applied and saved in the Genes and Mutations tab, respectively. + +[![Exploration Set](images/GDC-ExplorationSet-Cohort_v2.png)](images/GDC-ExplorationSet-Cohort_v2.png "Click to see the full image.") + +__Upload ID Set:__ This feature is available in the "Manage Sets" link at the top right of the Portal. Choose "Upload Set" and then select whether the set comprises cases, genes, or mutations. 
A set of IDs or UUIDs can then be uploaded in a text file or copied and pasted into the list of identifiers field along with a name identifying the set. Once the list of identifiers is uploaded, the IDs are validated and grouped according to whether or not the identifier matched an existing GDC ID. [![Upload Set](images/GDC-UploadSet-Cohort_v2.png)](images/GDC-UploadSet-Cohort_v2.png "Click to see the full image.") -\* This is referred to as a `submitter_id` in the GDC API, which is a non-UUID identifier such as a TCGA barcode. +### Upload Case Set -__Apply Filters in Exploration:__ Sets can be assembled using the existing filters in the Exploration page. They can be saved by choosing the "Save/Edit Case Set" button under the pie charts for case sets. This will prompt a decision to save as new case set. +In the `Cases` filters panel, instead of supplying cases one-by-one, users can supply a list of cases. Clicking on the `Upload Case Set` button will launch a dialog as shown below, where users can supply a list of cases or upload a comma-separated text file of cases. -Similarly, gene and mutation filters can be applied and saved in the Exploration page in the Genes and Mutations tab, respectively. +[![Upload Case Set](images/gdc-exploration-case-set.png)](images/gdc-exploration-case-set.png "Click to see the full image.") -[![Exploration Set](images/GDC-ExplorationSet-Cohort_v2.png)](images/GDC-ExplorationSet-Cohort_v2.png "Click to see the full image.") +After supplying a list of cases, a table below will appear which indicates whether the case was found. + +[![Upload Case Set Validation](images/gdc-exploration-case-set-validation.png)](images/gdc-exploration-case-set-validation.png "Click to see the full image.") + +Clicking on `Submit` will filter the results in the Exploration Page by those cases. 
+ +[![Upload Case Set Results](images/case-set-filter_v3.png)](images/case-set-filter_v2.png "Click to see the full image.") + +### Upload Gene Set + +In the `Genes` filters panel, instead of supplying genes one-by-one, users can supply a list of genes. Clicking on the `Upload Gene Set` button will launch a dialog as shown below, where users can supply a list of genes or upload a comma-separated text file of genes. + +[![Upload Gene Set](images/Exploration-Upload-Gene-Set.png)](images/Exploration-Upload-Gene-Set.png "Click to see the full image.") + +After supplying a list of genes, a table below will appear which indicates whether the gene was found. + +[![Upload Gene Set Validation](images/Exploration-Upload-Gene-Set-Validation.png)](images/Exploration-Upload-Gene-Set-Validation.png "Click to see the full image.") + +Clicking on `Submit` will filter the results in the Exploration Page by those genes. + +### Upload Mutation Set + +In the `Mutations` filters panel, instead of supplying mutation id's one-by-one, users can supply a list of mutations. Clicking on the `Upload Mutation Set` button will launch a dialog as shown below, where users can supply a list of mutations or upload a comma-separated text file of mutations. + +[![Upload Case Set](images/gdc-exploration-mutation-set.png)](images/gdc-exploration-mutation-set.png "Click to see the full image.") + +After supplying a list of mutations, a table below will appear which indicates whether the mutation was found. + +[![Upload Case Set Validation](images/gdc-exploration-mutation-set-validation.png)](images/gdc-exploration-mutation-set-validation.png "Click to see the full image.") + +Clicking on `Submit` will filter the results in the Exploration Page by those mutations. 
+ +[![Upload Case Set Results](images/mutation-set-filter_v2.png)](images/mutation-set-filter_v2.png "Click to see the full image.") ## Analysis Page Clicking on the `Analysis` button in the top toolbar will launch the Analysis Page which displays the various options available for comparing saved sets. -[![Analysis Tab](images/GDC-Analysis-Tab.png)](images/GDC-Analysis-Tab.png "Click to see the full image.") +[![Analysis Tab](images/GDC-Analysis-Tab_v2.png)](images/GDC-Analysis-Tab_v2.png "Click to see the full image.") -There are two tabs on this page: +There are three tabs on this page: -* __Launch Analysis__: Where users can select either to do `Set Operations` or `Cohort Comparison` -* __Results__: Where users can view the results of current or previous set analyses +* __Launch Analysis__: Where users can select either to do `Set Operations`, `Cohort Comparison` or `Clinical Data Analysis`. +* __Results__: Where users can view the results of current or previous set analyses. ## Analysis Page: Set Operations @@ -38,31 +74,198 @@ Up to three sets of the same set type can be compared and exported based on comp * __Venn Diagram:__ Visually displays the overlapping items included within the three sets. Subsets based on overlap can be selected by clicking one or many sections of the Venn diagram. As sections of the Venn Diagram become highlighted in blue, their corresponding row in the overlap table becomes highlighted. -* __Summary Table:__ Displays the alias, item type, and name for each set included in this analysis - -* __Overlap Table:__ Displays the number of overlapping items with set operations rather than a visual diagram. Subsets can be selected by checking boxes in the "Select" column, which will highlight the corresponding section of the Venn Diagram. As rows are selected, the "Union of selected sets" row is populated. Each row has an option to save the subset as a new set, export the set as a TSV, or view files in the repository. 
The links that correspond to the number of items in each row will open the cohort in the Exploration page. +* __Summary Table:__ Displays the alias, item type, and name for each set included in this analysis. +* __Overlap Table:__ Displays the number of overlapping items with set operations rather than a visual diagram. Subsets can be selected by checking boxes in the "Select" column, which will highlight the corresponding section of the Venn Diagram. As rows are selected, the "Union of selected sets" row is populated. Each row has an option to save the subset as a new set, export the set as a TSV, or view files in the repository. The links that correspond to the number of items in each row will open the cohort in the Exploration Page. ## Analysis Tab: Cohort Comparison The "Cohort Comparison" analysis displays a series of graphs and tables that demonstrate the similarities and differences between two case sets. The following features are displayed for each two sets: -* A key detailing the number of cases in each cohort and the color that represents each (blue/gold) +* A key detailing the number of cases in each cohort and the color that represents each (blue/gold). -* A Venn diagram, which shows the overlap between the two cohorts. The Venn diagram can be opened in a 'Set Operations' tab by choosing "Open venn diagram in new tab" +* A Venn diagram, which shows the overlap between the two cohorts. The Venn diagram can be opened in a 'Set Operations' tab by choosing "Open Venn diagram in new tab". -* A selectable [survival plot](Projects/#survival-analysis) that compares both sets with information about the percentage of represented cases +* A selectable [survival plot](Exploration.md#survival-analysis) that compares both sets with information about the percentage of represented cases. 
-[![Top Cohort](images/GDC-Cohort-Comparison-Top.png)](images/GDC-Cohort-Comparison-Top.png "Click to see the full image.") +[![Top Cohort](images/GDC-Cohort-Comparison-Top_v2.png)](images/GDC-Cohort-Comparison-Top_v2.png "Click to see the full image.") * A breakdown of each cohort by selectable clinical facets with a bar graph and table. Facets include `vital_status`, `gender`, `race`, `ethnicity`, and `age_at_diagnosis`. A p-value (if it can be calculated from the data) that demonstrates whether the statuses are proportionally represented is displayed for the `vital_status`, `gender`, and `ethnicity` facets. -[![Clinical Cohort](images/GDC-Clinical-Cohort.png)](images/GDC-Clinical-Cohort.png "Click to see the full image.") +[![Clinical Cohort](images/GDC-Clinical-Cohort_v2.png)](images/GDC-Clinical-Cohort_v2.png "Click to see the full image.") + +## Analysis Tab: Clinical Data Analysis + +The "Clinical Data Analysis" feature allows users to specifically examine the clinical data of a single case set in more detail. Users can select which clinical fields they want to display and visualize the data using various supported plot types. 
The clinical analysis features include: + +* Ability to select which clinical fields to display +* Examine the clinical data of each field using these visualizations: + * Histogram + * Survival Plot + * Box Plot + * QQ Plot +* Create custom bins for each field and re-visualize the data with those bins +* Select specific cases from a clinical field and use them to create a new set, or modify/remove from an existing set +* Download the visualizations of each plot type for each variable in SVG, PNG, JSON formats +* Download the data table of each field in TSV format +* Print all clinical variable cards in the analysis with their active plot to a single PDF + +### Selecting a Case Set + +First, a case set must be selected to run the clinical analysis on: + +[![Select Clinical Cohort](images/GDC-Select-Clinical-Cohort.png)](images/GDC-Select-Clinical-Cohort.png "Click to see the full image.") + +Click __'Run'__ - The results page loads with a new tab for the new clinical analysis for the selected set: + +[![Select Clinical Cohort](images/GDC-Load-Clinical-Analysis.png)](images/GDC-Load-Clinical-Analysis.png "Click to see the full image.") + +### Enabling Clinical Variable Cards + +Users can use the control panel on the left side of the analysis to display which clinical variables they want. To enable or disable specific variables for display, click the on/off toggle controls: + +[![Enable Clinical Cards](images/GDC-Enable-Clinical-Cards.png)](images/GDC-Enable-Clinical-Cards.png "Click to see the full image.") + +The clinical fields are grouped into these categories: + +* __Demographic:__ Data for the characterization of the patient by means of segmenting the population (e.g. characterization by age, sex, race, etc.). 
+* __Diagnoses:__ Data from the investigation, analysis, and recognition of the presence and nature of disease, condition, or injury from expressed signs and symptoms; also, the scientific determination of any kind; the concise results of such an investigation. +* __Treatments:__ Records of the administration and intention of therapeutic agents provided to a patient to alter the course of a pathologic process. +* __Exposures:__ Clinically-relevant patient information not immediately resulting from genetic predispositions. + +Since the list of fields can be long, users can collapse and expand the field list for each clinical category for easier browsing, or use the search box: + +[![Search Clinical Fields](images/GDC-Search-Clinical-Fields.png)](images/GDC-Search-Clinical-Fields.png "Click to see the full image.") + +### Exploring Clinical Card Visualizations + +Users can explore different visualizations for each clinical field they have enabled for display. Each card supports these plot types: + +* Histogram +* Survival Plot +* Box Plot & QQ Plot (these plots are visualized side-by-side) + +To switch between plot types, click the different plot type icons in the top-right of each card. + +#### Histogram + +The histogram plot type supports these features: + +* View the distribution of cases (# and % of cases) in the cohort for the clinical field's data categories as a histogram +* View the distribution of cases in tabular format +* Select the cases for specific data categories to create new sets, append to existing sets, or remove from existing sets +* Download the histogram visualization in SVG or PNG format +* Download the raw data used to generate the histogram in JSON format + +[![Clinical Analysis Histogram](images/GDC-Clinical-Analysis-Histogram.png)](images/GDC-Clinical-Analysis-Histogram.png "Click to see the full image.") + +Note that the histogram plot applies to, and can be displayed for, both categorical and continuous variables.
+ +#### Survival Plot + +The survival plot type supports these features: + +* View the distribution of cases (# and % of cases) in the cohort for the clinical field's data categories as a table +* Select and plot the survival analysis for the cases of specific data categories in the table: + * By default the top 2 categories (highest # of cases) are displayed + * Users can manually select and plot up to 5 categories at a time +* Download the survival plot visualization in SVG or PNG format +* Download the raw data used to generate the survival plot in JSON or TSV format + +[![Clinical Analysis Survival Plot](images/GDC-Clinical-Analysis-Survival-Plot.png)](images/GDC-Clinical-Analysis-Survival-Plot.png "Click to see the full image.") + +Note that the survival plot applies to, and can be displayed for, both categorical and continuous variables. + +#### Box Plot & QQ Plot + +The box plot and QQ plot are displayed side-by-side in the same visualization. This visualization supports these features: + +* View standard summary statistics for the clinical field's data across the cohort as both a box plot visualization and a data table: + * Minimum + * Maximum + * Mean + * Median + * Standard Deviation + * Interquartile Range (IQR) +* View a QQ plot visualization to explore whether the clinical field's data across the cohort is normally distributed, where: + * The clinical data values are plotted as the sample quantiles on the vertical axis + * The quantiles of the normal distribution are plotted on the horizontal axis +* Download the box plot and QQ plot visualizations in SVG or PNG format +* Download the raw data used to generate the QQ plot in JSON or TSV format + +[![Clinical Analysis Box & QQ Plots](images/GDC-Clinical-Analysis-Box-And-QQ-Plots.png)](images/GDC-Clinical-Analysis-Box-And-QQ-Plots.png "Click to see the full image.") + +Note that the box plot and QQ plot only apply to continuous variables. They cannot be displayed for categorical variables.
+ +### Creating Custom Bins + +For each clinical variable, whether categorical or continuous, users can create custom bins to group the data in ways they find scientifically interesting or significant. Once saved, the bins are applied to these visualizations and they are then re-rendered: + +* Histogram and associated data table +* Survival plot and associated data table + +Custom bins can be reset to their defaults at any time for each card. Note that custom bins are __saved per analysis__. + +#### Categorical Binning + +To create custom bins for a categorical variable, click *__Customize Bins__*, then *__Edit Bins__*. A configuration window appears where the user can create their bins: + +[![Clinical Analysis Categorical Bins](images/GDC-Clinical-Analysis-Categorical-Bins.png)](images/GDC-Clinical-Analysis-Categorical-Bins.png "Click to see the full image.") + +The user can: + +* Group existing individual values into a single group +* Give a custom name to each group +* Ungroup previously grouped values +* Completely hide values from being shown in the visualization +* Re-show previously hidden values + +#### Continuous Binning + +To create custom bins for a continuous variable, click *__Customize Bins__*, then *__Edit Bins__*. A configuration window appears where the user can create their bins: + +[![Clinical Analysis Continuous Bins](images/GDC-Clinical-Analysis-Continuous-Bins.png)](images/GDC-Clinical-Analysis-Continuous-Bins.png "Click to see the full image.") + +The user can choose one of these continuous binning methods: + +* (1) Create equi-distant bins based on a set interval: + * User must choose the interval (e.g. 
equi-distant bins of 1,825 days for the Age of Diagnosis field) + * User can optionally define the starting and ending value between which the equi-distant bins will be created +* (2) Create completely custom ranges: + * User manually enters 1 or more bins with custom ranges + * User must enter a name for each range and the start and end values + * The ranges can be of different interval lengths + +Before saving the bins, if there are errors in the configuration, the user will be notified to correct them and try saving again. For example: + +[![Clinical Analysis Continuous Bins Error Example 1](images/GDC-Clinical-Analysis-Continuous-Bins-Error-Example1.png)](images/GDC-Clinical-Analysis-Continuous-Bins-Error-Example1.png "Click to see the full image.") + +[![Clinical Analysis Continuous Bins Error Example 2](images/GDC-Clinical-Analysis-Continuous-Bins-Error-Example2.png)](images/GDC-Clinical-Analysis-Continuous-Bins-Error-Example2.png "Click to see the full image.") + +### Other Useful Functions + +Clinical Analysis also provides these additional useful functions: + +* Like other analysis types, all Clinical Analysis tabs are saved to the browser's local storage: + * Each Analysis tab and its associated configurations (active cards, active plots, custom bins) is saved and is not deleted until local storage is cleared + * The currently-enabled clinical cards and their currently-selected plot types are __saved per analysis__ + * Custom bins are __saved per analysis__ +* Switch the current set that the analysis applies to - This does the following: + * Applies all currently-enabled clinical cards and their currently-selected plot types to the data in the set being switched to + * Re-renders all active visualizations to reflect the data in the set being switched to +* Rename your analysis with a custom name +* Copy your current analysis to a new analysis: + * User is prompted to name the new copy + * All currently-enabled clinical cards and their currently-selected 
plot types are copied to the new analysis + * A new vertical tab appears on the left for the new copy +* Print the current analysis to PDF format: + * All currently-enabled clinical cards and their currently-selected plot types are printed + * 3 cards per page, with the fixed Overall Survival Plot displayed as the first card in the entire file ## Analysis Page: Results The results of the previous analyses are displayed on this page. -[![Results](images/gdc-analysis-resultstab.png)](images/gdc-analysis-resultstab.png "Click to see the full image.") +[![Results](images/gdc-analysis-resultstab_v2.png)](images/gdc-analysis-resultstab_v2.png "Click to see the full image.") Each tab at the left side of the page is labeled according to the analysis type and the date that the analysis was performed and can be reviewed as long as it is present. The "Delete All" button will remove all of the previous analyses. diff --git a/docs/Data_Portal/Users_Guide/Exploration.md b/docs/Data_Portal/Users_Guide/Exploration.md index 4d571c458..d5b7e6cb6 100644 --- a/docs/Data_Portal/Users_Guide/Exploration.md +++ b/docs/Data_Portal/Users_Guide/Exploration.md @@ -1,8 +1,8 @@ # Exploration -The Exploration page allows users to explore data in the GDC using advanced filters/facets, which includes those on a gene and mutation level. Users choose filters on specific `Cases`, `Genes`, and/or `Mutations` on the left of this page and then can visualize these results on the right. The Gene/Mutation data for these visualizations comes from the Open-Access MAF files on the GDC Portal. +The Exploration Page allows users to explore data in the GDC using advanced filters/facets, which includes those on a gene and mutation level. Users choose filters on specific `Cases`, `Genes`, and/or `Mutations` on the left of this page and then can visualize these results on the right. The Gene/Mutation data for these visualizations comes from the Open-Access MAF files on the GDC Data Portal. 
There is also a `Clinical` tab with filters that apply specifically to clinical data. -[![Exploration Page](images/GDC-Exploration-Page_v5.png)](images/GDC-Exploration-Page_v4.png "Click to see the full image.") +[![Exploration Page](images/GDC-Exploration-Page_v6.png)](images/GDC-Exploration-Page_v6.png "Click to see the full image.") ## Filters / Facets On the left of this page, users can create advanced filters to narrow down results to create synthetic cohorts. @@ -11,238 +11,391 @@ On the left of this page, users can create advanced filters to narrow down resul The first tab of filters is for cases in the GDC. -[![Exploration Case Filters](images/Exploration-Cases-Filter.png)](images/Exploration-Cases-Filter.png "Click to see the full image.") +[![Exploration Case Filters](images/Exploration-Cases-Filter_v2.png)](images/Exploration-Cases-Filter_v2.png "Click to see the full image.") These criteria limit the results only to specific cases within the GDC. The default filters available are: -* __Case__: Specify individual cases using submitter ID (barcode), UUID, or list of Cases ('Case Set') -* __Case Submitter ID__: Search for cases using a part (prefix) of the submitter ID (barcode). -* __Primary Site__: Anatomical site of the cancer under investigation or review. -* __Program__: A cancer research program, typically consisting of multiple focused projects. -* __Project__: A cancer research project, typically part of a larger cancer research program. -* __Disease Type__: Type of cancer studied. -* __Gender__: Gender of the patient. -* __Age at Diagnosis__: Patient age at the time of diagnosis. -* __Vital Status__: Indicator of whether the patient was living or deceased at the date of last contact. -* __Days to Death__: Number of days from date of diagnosis to death of the patient. -* __Race__: Race of the patient. -* __Ethnicity__: Ethnicity of the patient. +* __Case:__ Specify individual cases using submitter ID (barcode), UUID, or list of Cases ('Case Set'). 
+* __Primary Site:__ Anatomical site of the cancer under investigation or review. +* __Program:__ A cancer research program, typically consisting of multiple focused projects. +* __Project:__ A cancer research project, typically part of a larger cancer research program. +* __Disease Type:__ Type of cancer studied. +* __Experimental Strategy:__ Experimental strategy used for molecular characterization of the cancer. +* __Sample Type:__ Describes the source of a biospecimen used for a laboratory test. +* __Available Variation Data:__ Indicates the types of genomic variation data that a case has been tested for. -In addition to the defaults, users can add additional case filters by clicking on the link titled 'Add a Case Filter' +### Clinical Filters -#### Upload Case Set +The second tab of filters is used to specifically explore clinical data for cases in the GDC. -In the `Cases` filters panel, instead of supplying cases one-by-one, users can supply a list of cases. Clicking on the `Upload Case Set` button will launch a dialog as shown below, where users can supply a list of cases or upload a comma-separated text file of cases. +[![Exploration Clinical Filters](images/Exploration-Clinical-Filter.png)](images/Exploration-Clinical-Filter.png "Click to see the full image.") -[![Upload Case Set](images/gdc-exploration-case-set.png)](images/gdc-exploration-case-set.png "Click to see the full image.") +Users can filter by specific clinical variables, grouped into these categories: -After supplying a list of cases, a table below will appear which indicates whether the case was found. +* __Demographic:__ Data for the characterization of the patient by means of segmenting the population (e.g. characterization by age, sex, race, etc.). 
+* __Diagnoses:__ Data from the investigation, analysis, and recognition of the presence and nature of disease, condition, or injury from expressed signs and symptoms; also, the scientific determination of any kind; the concise results of such an investigation. +* __Treatments:__ Records of the administration and intention of therapeutic agents provided to a patient to alter the course of a pathologic process. +* __Exposures:__ Clinically-relevant patient information not immediately resulting from genetic predispositions. -[![Upload Case Set Validation](images/gdc-exploration-case-set-validation.png)](images/gdc-exploration-case-set-validation.png "Click to see the full image.") +### Gene Filters -Clicking on `Submit` will filter the results in the Exploration Page by those cases. +The third tab of filters is for genes affected by mutations in the GDC. -[![Upload Case Set Results](images/case-set-filter_v3.png)](images/case-set-filter_v2.png "Click to see the full image.") +[![Exploration Gene Filters](images/Exploration-Gene-Filter_v2.png)](images/Exploration-Gene-Filter_v2.png "Click to see the full image.") -### Gene Filters +Users can filter by: -The second tab of filters is for genes affected by mutations in the GDC. +* __Gene:__ Specify a Gene Symbol, ID, or list of Genes ('Gene Set'). +* __Biotype:__ Classification of the type of gene according to Ensembl. The biotypes can be grouped into protein coding, pseudogene, long noncoding and short noncoding. Examples of biotypes in each group are as follows: + * __Protein coding:__ IGC gene, IGD gene, IG gene, IGJ gene, IGLV gene, IGM gene, IGV gene, IGZ gene, nonsense mediated decay, nontranslating CDS, non stop decay, polymorphic pseudogene, TRC gene, TRD gene, TRJ gene, TRV gene. 
+ * __Pseudogene:__ disrupted domain, IGC pseudogene, IGJ pseudogene, IG pseudogene, IGV pseudogene, processed pseudogene, transcribed processed pseudogene, transcribed unitary pseudogene, transcribed unprocessed pseudogene, translated processed pseudogene, translated unprocessed pseudogene, TRJ pseudogene, TRV pseudogene, unprocessed pseudogene. + * __Long noncoding:__ 3 prime overlapping ncrna, ambiguous orf, antisense, antisense RNA, lincRNA, macro lincRNA, ncrna host, processed transcript, sense intronic, sense overlapping. + * __Short noncoding:__ miRNA, miRNA pseudogene, miscRNA, miscRNA pseudogene, Mt rRNA, Mt tRNA, rRNA, scRNA, snlRNA, snoRNA, snRNA, tRNA, tRNA pseudogene, vaultRNA. +* __Is Cancer Gene Census:__ Whether or not a gene is part of [The Cancer Gene Census](http://cancer.sanger.ac.uk/census/). -[![Exploration Gene Filters](images/Exploration-Gene-Filter.png)](images/Exploration-Gene-Filter.png "Click to see the full image.") +### Mutation Filters -The second tab of filters are for specific genes. Users can filter by: +The final tab of filters is for specific mutations. -* __Gene__ - Entering in a specific Gene Symbol, ID, or list of Genes ('Gene Set') -* __Biotype__ - Classification of the type of gene according to Ensembl. The biotypes can be grouped into protein coding, pseudogene, long noncoding and short noncoding. Examples of biotypes in each group are as follows: - * __Protein coding__: IGC gene, IGD gene, IG gene, IGJ gene, IGLV gene, IGM gene, IGV gene, IGZ gene, nonsense mediated decay, nontranslating CDS, non stop decay, polymorphic pseudogene, TRC gene, TRD gene, TRJ gene. 
- * __Pseudogene__: disrupted domain, IGC pseudogene, IGJ pseudogene, IG pseudogene, IGV pseudogene, processed pseudogene, transcribed processed pseudogene, transcribed unitary pseudogene, transcribed unprocessed pseudogene, translated processed pseudogene, TRJ pseudogene, unprocessed pseudogene - * __Long noncoding__: 3prime overlapping ncrna, ambiguous orf, antisense, antisense RNA, lincRNA, ncrna host, processed transcript, sense intronic, sense overlapping - * __Short noncoding__: miRNA, miRNA_pseudogene, miscRNA, miscRNA pseudogene, Mt rRNA, Mt tRNA, rRNA, scRNA, snlRNA, snoRNA, snRNA, tRNA, tRNA_pseudogene -* __Is Cancer Gene Census__ - Whether or not a gene is part of [The Cancer Gene Census](http://cancer.sanger.ac.uk/census/) +[![Exploration Mutation Filters](images/Exploration-Mutations-Filter_v2.png)](images/Exploration-Mutations-Filter_v2.png "Click to see the full image.") -#### Upload Gene Set +Users can filter by: -In the `Genes` filters panel, instead of supplying genes one-by-one, users can supply a list of genes. Clicking on the `Upload Gene Set` button will launch a dialog as shown below, where users can supply a list of genes or upload a comma-separated text file of genes. +* __Mutation:__ Unique ID for that mutation. Users can use the following: + * UUID - c7c0aeaa-29ed-5a30-a9b6-395ba4133c63 + * DNA Change - chr12:g.121804752delC + * COSMIC ID - COSM202522 + * List of any mutation UUIDs or DNA Change id's ('Mutation Set'). +* __Impact:__ A subjective classification of the severity of the variant consequence. These scores are determined using the three following tools: + * __[Ensembl VEP](http://useast.ensembl.org/info/genome/variation/prediction/index.html):__ + * __HIGH (H):__ The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay. + * __MODERATE (M):__ A non-disruptive variant that might change protein effectiveness. 
+ * __LOW (L):__ Assumed to be mostly harmless or unlikely to change protein behavior. + * __MODIFIER (MO):__ Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact. + * __[PolyPhen](http://genetics.bwh.harvard.edu/pph/):__ + * __probably damaging (PR):__ It is with high confidence supposed to affect protein function or structure. + * __possibly damaging (PO):__ It is supposed to affect protein function or structure. + * __benign (BE):__ Most likely lacking any phenotypic effect. + * __unknown (UN):__ In some rare cases, the lack of data does not allow PolyPhen to make a prediction. + * __[SIFT](http://sift.jcvi.org/):__ + * __tolerated:__ Not likely to have a phenotypic effect. + * __tolerated_low_confidence:__ More likely to have a phenotypic effect than 'tolerated'. + * __deleterious:__ Likely to have a phenotypic effect. + * __deleterious_low_confidence:__ Less likely to have a phenotypic effect than 'deleterious'. +* __Consequence Type:__ Consequence type of this variation; [sequence ontology](http://www.sequenceontology.org/) terms. +* __Type:__ A general classification of the mutation. +* __Variant Caller:__ The variant caller used to identify the mutation. +* __COSMIC ID:__ This option will filter out only mutations with a COSMIC ID. +* __dbSNP rs ID:__ This option will filter out only mutations with a SNP identifier maintained in dbSNP. -[![Upload Gene Set](images/Exploration-Upload-Gene-Set.png)](images/Exploration-Upload-Gene-Set.png "Click to see the full image.") +## Results -After supplying a list of genes, a table below will appear which indicates whether the gene was found. +As users add filters to the data on the Exploration Page, the Results section will automatically be updated. Results are divided into different tabs: `Cases`, `Genes`, `Mutations`, and `OncoGrid`.
-[![Upload Gene Set Validation](images/Exploration-Upload-Gene-Set-Validation.png)](images/Exploration-Upload-Gene-Set-Validation.png "Click to see the full image.") +To illustrate these tabs, Case, Gene, and Mutation filters have been chosen (Genes in the Cancer Gene Census, that have a missense variant for the TCGA-BRCA project) and a description of what each tab displays follows. -Clicking on `Submit` will filter the results in the Exploration Page by those genes. +### Cases -### Mutation Filters +The `Cases` tab gives an overview of all the cases/patients who correspond to the filters chosen (Cohort). -The final tab of filters is for specific mutations. +[![Exploration Case Example](images/Exploration-Case-Example_v3.png)](images/Exploration-Case-Example_v3.png "Click to see the full image.") -[![Exploration Mutation Filters](images/Exploration-Mutations-Filter.png)](images/Exploration-Mutations-Filter.png "Click to see the full image.") +The top of this section contains a few pie graphs with categorical information regarding the Primary Site, Project, Disease Type, Gender, and Vital Status. -Users can filter by: +Below these pie charts is a tabular view of cases, which can be exported, sorted and saved using the buttons on the right and includes the following information: -* __Mutation__ - Unique ID for that mutation. Users can use the following: - * UUID - c7c0aeaa-29ed-5a30-a9b6-395ba4133c63 - * DNA Change - chr12:g.121804752delC - * COSMIC ID - COSM202522 - * List of any mutation UUIDs or DNA Change id's ('Mutation Set') -* __Consequence Type__ - Consequence type of this variation; [sequence ontology](http://www.sequenceontology.org/) terms -* __Impact__ - A subjective classification of the severity of the variant consequence. This information comes from the [Ensembl VEP](http://www.ensembl.org/info/genome/variation/predicted_data.html).
-* __Type__ - A general classification of the mutation -* __Variant Caller__ - The variant caller used to identify the mutation -* __COSMIC ID__ - The identifier of the gene or mutation maintained in COSMIC, the Catalogue Of Somatic Mutations In Cancer -* __dbSNP rs ID__ - The reference SNP identifier maintained in dbSNP +* __Case ID (Submitter ID):__ The Case ID / submitter ID of that case/patient (i.e. TCGA Barcode). +* __Project:__ The study name for the project to which the case belongs. +* __Primary Site:__ The primary site of the cancer/project. +* __Gender:__ The gender of the case. +* __Files:__ The total number of files available for that case. +* __Available Files per Data Category:__ Seven columns displaying the number of files available in each of the seven data categories. These link to the files for the specific case. +* __# Mutations:__ The number of SSMs (simple somatic mutations) detected in that case. +* __# Genes:__ The number of genes affected by mutations in that case. +* __Slides:__ The total number of slides available for that case. For more information, see [slide images](Repository.md#image-viewer-features). -#### Upload Mutation Set +>__Note__: By default, the UUID is not displayed on summary page tables. You can display the UUID by clicking on the icon with 3 parallel lines and checking the UUID option. -In the `Mutations` filters panel, instead of supplying mutation id's one-by-one, users can supply a list of mutations. Clicking on the `Upload Mutation Set` button will launch a dialog as shown below, where users can supply a list of mutations or upload a comma-separated text file of mutations. +### Case Summary Page -[![Upload Case Set](images/gdc-exploration-mutation-set.png)](images/gdc-exploration-case-set.png "Click to see the full image.") +The Case Summary Page displays case details including the project and disease information, data files that are available for that case, and the experimental strategies employed.
A button in the top-right corner of the page allows the user to add all files associated with the case to the file [cart](Cart.md). -After supplying a list of mutations, a table below will appear which indicates whether the mutation was found. +[![Case Page](images/gdc-case-entity-page_v2.png)](images/gdc-case-entity-page_v2.png "Click to see the full image.") -[![Upload Case Set Validation](images/gdc-exploration-mutation-set-validation.png)](images/gdc-exploration-case-set-validation.png "Click to see the full image.") +#### Clinical and Biospecimen Information -Clicking on `Submit` will filter the results in the Exploration Page by those mutations. +The page also provides clinical and biospecimen information about that case. Links to export clinical and biospecimen information in JSON format are provided. -[![Upload Case Set Results](images/mutation-set-filter.png)](images/case-set-filter.png "Click to see the full image.") +[![Case Page, Clinical and Biospecimen](images/image_clinical_and_biospecimen_information.png)](images/image_clinical_and_biospecimen_information.png "Click to see the full image.") -## Results -As users add filters to the data on the Exploration Page, the Results section will automatically be updated. Results are divided into different tabs: `Cases`, `Genes`, `Mutations`, and `OncoGrid`. +Some clinical records can support multiple records of the same type (Diagnoses, Family Histories, Exposures, Follow-Ups, Molecular Tests). If only one record exists, the UUID of the record is provided at the top of the corresponding tab. -To illustrate these tabs, Case, Gene, and Mutation filters have been chosen ( Genes in the Cancer Gene Census, that have HIGH VEP Impact for the TCGA-BRCA project) and a description of what each tab displays follows. 
+[![Case Page, Single Clinical Record](images/gdc-case-clinical-single-record.png)](images/gdc-case-clinical-single-record.png "Click to see the full image.") +If there are multiple records, they are listed as horizontal tabs. -#### Cases +[![Case Page, Multiple Clinical Records](images/gdc-case-clinical-multiple-records.png)](images/gdc-case-clinical-multiple-records.png "Click to see the full image.") -The `Cases` tab gives an overview of all the cases/patients who correspond to the filters chosen (Cohort). +Some record types are further nested under another. For example, a Diagnosis record may have multiple associated Treatment records. Or a Follow-Up record may have multiple associated Molecular Test Records. The associated sub-records are listed in a table on the tab. -[![Exploration Case Example](images/Exploration-Case-Example_v3.png)](images/Exploration-Case-Example_v2.png "Click to see the full image.") +[![Case Page, Nested Clinical Records](images/gdc-case-clinical-nested-records.png)](images/gdc-case-clinical-nested-records.png "Click to see the full image.") -The top of this section contains a few pie graphs with categorical information regarding the Primary Site, Project, Disease Type, Gender, and Vital Status. +#### Biospecimen Search + +A search filter just below the biospecimen section can be used to find and filter biospecimen data. The wildcard search will highlight entities in the tree that match the characters typed. This will search both the case submitter ID, as well as the additional metadata for each entity. For example, searching 'Primary Tumor' will highlight samples that match that type. 
+ +[![Biospecimen Search](images/gdc_case_biospecimen_search_v3.png)](images/gdc_case_biospecimen_search_v3.png "Click to see the full image.") -Below these pie charts is a tabular view of cases (which can be exported, sorted and saved using the buttons on the right), that includes the following information: +#### Most Frequent Somatic Mutations for a Case -* __Case ID (Submitter ID):__ The Case ID / submitter ID of that case/patient (i.e. TCGA Barcode) -* __Project:__ The study name for the project for which the case belongs -* __Primary Site:__ The primary site of the cancer/project -* __Gender:__ The gender of the case -* __Files:__ The total number of files available for that case -* __Available Files per Data Category:__ Five columns displaying the number of files available in each of the five data categories. These link to the files for the specific case. -* __# Mutations:__ The number of SSMs (simple somatic mutations) detected in that case -* __# Genes:__ The number of genes affected by mutations in that case -* __Slides:__ The total number of slides available for that case. +The Case Entity Page also lists the mutations found in that particular case. -*Note: By default, the Case UUID is not displayed. You can display the UUID of the case, but clicking on the icon with 3 parallel lines, and choose to display the Case UUID* +[![Case Page](images/gdc-case-entity-mfm.png)](images/gdc-case-entity-mfm.png "Click to see the full image.") -#### Genes +For more information, please go to the [Most Frequent Somatic Mutation](#most-frequent-somatic-mutations) section. + +### Genes The `Genes` tab will give an overview of all the genes that match the criteria of the filters (Cohort). 
-[![Exploration Gene Example](images/Exploration-Gene-Example.png)](images/Exploration-Gene-Example.png "Click to see the full image.") +[![Exploration Gene Example](images/Exploration-Gene-Example_v3.png)](images/Exploration-Gene-Example_v3.png "Click to see the full image.") -The top of this section contains a survival plot of all the cases within the specified Exploration page search, in addition to a bar graph of the most frequently mutated genes. Hovering over each bar in the plot will display information about the percentage of cases affected. Users may choose to download the underlying data in JSON or TSV format or an image of the graph in SVG or PNG format by clicking the `download` icon at the top of each graph. +The top of this tab contains a bar graph of the most frequently mutated genes. Hovering over each bar in the plot will display information about the percentage of cases affected. In addition, this section contains a survival curve. The survival curve is calculated using the Kaplan-Meier estimator based on all cases with survival data within the specified Exploration Page search. For more information on how these values are determined, please go to the [Survival Analysis](#survival-analysis) section. Users may choose to download the underlying data in JSON or TSV format or an image of the graph in SVG or PNG format by clicking the `download` icon at the top of each graph. Below these graphs is a tabular view of the genes affected, which includes the following information: -* __Symbol:__ The gene symbol, which links to the Gene Summary Page -* __Name:__ Full name of the gene -* __Cytoband:__ The location of the mutation on the chromosome in terms of Giemsa-stained samples. -* __Type:__ The type of gene -* __# Affected Cases in Cohort:__ The number of cases affected in the Cohort -* __# Affected Cases Across all Projects:__ The number of cases within all the projects in the GDC that contain a mutation on this gene. 
Clicking the red arrow will display the cases broken down by project -* __# Mutations:__ The number of SSMs (simple somatic mutations) detected in that gene -* __Annotations:__ Includes a COSMIC symbol if the gene belongs to [The Cancer Gene Census](http://cancer.sanger.ac.uk/census/) -* __Survival Analysis:__ An icon that, when clicked, will plot the survival rate between cases in the project with mutated and non-mutated forms of the gene - -#### Survival Analysis - -Survival analysis is used to analyze the occurrence of event data over time. In the GDC, survival analysis is performed on the mortality of the cases. Survival analysis requires: - -* Data on the time to a particular event (days to death or last follow up) - * Fields: __diagnoses.days_to_death__ and __diagnoses.days_to_last_follow_up__ -* Information on whether the event has occurred (alive/deceased) - * Fields: __diagnoses.vital_status__ -* Data split into different categories or groups (i.e. gender, etc.) - * Fields: __demographic.gender__ +* __Symbol:__ The gene symbol, which links to the Gene Summary Page. +* __Name:__ Full name of the gene. +* __# SSM Affected Cases in Cohort:__ The number of cases affected by SSMs (simple somatic mutations) in the Cohort. +* __# SSM Affected Cases Across the GDC:__ The number of cases within all the projects in the GDC that contain a mutation on this gene. Clicking the red arrow will display the cases broken down by project. +* __# CNV Gain:__ The number of CNV (copy number variation) events detected in that gene which resulted in an increase (gain) in the gene's copy number. +* __# CNV Loss:__ The number of CNV events detected in that gene which resulted in a decrease (loss) in the gene's copy number. +* __# Mutations:__ The number of SSMs (simple somatic mutations) detected in that gene. +* __Annotations:__ Includes a COSMIC symbol if the gene belongs to [The Cancer Gene Census](http://cancer.sanger.ac.uk/census/). 
+* __Survival:__ An icon that, when clicked, will plot the survival rate between cases in the project with mutated and non-mutated forms of the gene. -The survival analysis in the GDC uses a Kaplan-Meier estimator: +### Gene Summary Page -[![Kaplan-Meier Estimator](images/gdc-kaplan-meier-estimator.png)](images/gdc-kaplan-meier-estimator "Click to see the full image.") +Gene Summary Pages describe each gene with mutation data and provide results related to the analyses that are performed on these genes. -Where: +The summary section of the Gene Page contains the following information: - * S(ti) is the estimated survival probability for any particular one of the t time periods - * ni is the number of subjects at risk at the beginning of time period ti - * and di is the number of subjects who die during time period ti +[![Gene Summary](images/GDC-Gene-Summary_v2.png)](images/GDC-Gene-Summary_v2.png "Click to see the full image.") -The table below is an example data set to calculate survival for a set of seven cases: +* __Symbol:__ The gene symbol. +* __Name:__ Full name of the gene. +* __Synonyms:__ Synonyms of the gene name or symbol, if available. +* __Type:__ A broad classification of the gene. +* __Location:__ The chromosome on which the gene is located and its coordinates. +* __Strand:__ If the gene is located on the forward (+) or reverse (-) strand. +* __Description:__ A description of gene function and downstream consequences of gene alteration. +* __Annotation:__ A notation/link that states whether the gene is part of [The Cancer Gene Census](http://cancer.sanger.ac.uk/census/). -[![Sample Survival Analysis Table](images/gdc-sample-survival-table.png)](images/gdc-sample-survival-table.png "Click to see the full image.") +#### External References -The calculated cumulated survival probability can be plotted against the interval to obtain a survival plot like the one shown below.
+A list with links that lead to external databases with additional information about each gene is displayed here. These external databases include: -[![Sample Survival Analysis Plot](images/gdc-survival-plot.png)](images/gdc-survival-plot.png "Click to see the full image.") +* [Entrez](https://www.ncbi.nlm.nih.gov/gquery/) +* [Uniprot](http://www.uniprot.org/) +* [Hugo Gene Nomenclature Committee](http://www.genenames.org/) +* [Online Mendelian Inheritance in Man](https://www.omim.org/) +* [Ensembl](http://may2015.archive.ensembl.org/index.html) +* [CIViC](https://civicdb.org/home) + +#### Cancer Distribution + +A table and two bar graphs show how many cases are affected by mutations and copy number variation within the gene as a ratio and percentage. Each row/bar represents the number of cases for each project. The final column in the table lists the number of unique mutations observed on the gene for each project. + +[![Cancer Distribution](images/GDC-Gene-CancerDist.png)](images/GDC-Gene-CancerDist.png "Click to see the full image.") + +#### Protein Viewer + +Mutations and their frequency across cases are mapped to a graphical visualization of protein-coding regions with a lollipop plot. Pfam domains are highlighted along the x-axis to assign functionality to specific protein-coding regions. The bottom track represents a view of the full gene length. Different transcripts can be selected by using the drop-down menu above the plot. + +[![Protein Plot](images/GDC-Gene-ProteinGraph.png)](images/GDC-Gene-ProteinGraph.png "Click to see the full image.") + +The panel to the right of the plot allows the plot to be filtered by mutation consequences or impact. The plot will dynamically change as filters are applied. Mutation consequence and impact are denoted in the plot by color. + +>__Note__: The impact filter on this panel will not display the annotations for alternate transcripts.
+ +The plot can be viewed at different zoom levels by clicking and dragging across the x-axis, clicking and dragging across the bottom track, or double clicking the pfam domain IDs. The `Reset` button can be used to bring the zoom level back to its original position. The plot can also be exported as a PNG image, SVG image or as JSON formatted text by choosing the `Download` button above the plot. + +#### Most Frequent Somatic Mutations + +The 20 most frequent mutations in the gene are displayed as a bar graph that indicates the number of cases that share each mutation. + +[![Gene MFM](images/GDC-Gene-MFM.png)](images/GDC-Gene-MFM.png "Click to see the full image.") -#### Mutations +A table is displayed below that lists information about each mutation including: -The `Mutations` tab will give an overview of all the mutations who match the criteria of the filters (Cohort). +* __DNA Change:__ The chromosome and starting coordinates of the mutation are displayed along with the nucleotide differences between the reference and tumor allele. +* __Type:__ A general classification of the mutation. +* __Consequences:__ The effects the mutation has on the gene coding for a protein (i.e. synonymous, missense, non-coding transcript). +* __# Affected Cases in Gene:__ The number of affected cases, expressed as number across all mutations within the selected Gene. +* __# Affected Cases Across GDC:__ The number of affected cases, expressed as number across all projects. Choosing the arrow next to the percentage will expand the selection with a breakdown of each affected project. +* __Impact:__ A [subjective classification](#mutation-filters) of the severity of the variant consequence. 
This is determined by three different tools: + * __[Ensembl VEP](http://useast.ensembl.org/info/genome/variation/prediction/index.html)__ + * __[PolyPhen](http://genetics.bwh.harvard.edu/pph/)__ + * __[SIFT](http://sift.jcvi.org/)__ -[![Exploration Mutation Example](images/Exploration-Mutation-Example.png)](images/Exploration-Mutation-Example.png "Click to see the full image.") -At the top of this tab is a survival plot of all the cases within the specified exploration page filters. +Clicking the `Open in Exploration` button will navigate the user to the Exploration Page, showing the same results in the table (mutations filtered by the gene). + +### Mutations + +The `Mutations` tab will give an overview of all the mutations that match the criteria of the filters (Cohort). + +Open-access mutation data is displayed by default. To access controlled access mutations, users must apply to the correct data access authority, be granted access, and log in to the portal. If a user is logged in and has been granted access to controlled-access mutations, they will be integrated with open-access mutations throughout the portal visualizations and counts. + +[![Exploration Mutation Example](images/Exploration-Mutation-Example_v2.png)](images/Exploration-Mutation-Example_v2.png "Click to see the full image.") + +The top of this tab contains a survival curve. The survival curve is calculated using the Kaplan-Meier estimator based on all cases with survival data within the specified Exploration Page search. For more information on how these values are determined, please go to the [Survival Analysis](#survival-analysis) section. Users may choose to download the underlying data in JSON or TSV format or an image of the graph in SVG or PNG format by clicking the `download` icon at the top of the graph.
A table is displayed below that lists information about each mutation: -* __DNA Change:__ The chromosome and starting coordinates of the mutation are displayed along with the nucleotide differences between the reference and tumor allele -* __Type:__ A general classification of the mutation -* __Consequences:__ The effects the mutation has on the gene coding for a protein (i.e. synonymous, missense, non-coding transcript). A link to the Gene Summary Page for the gene affected by the mutation is included -* __# Affected Cases in Cohort:__ The number of affected cases in the Cohort as a fraction and as a percentage -* __# Affected Cases in Across all Projects:__ The number of affected cases, expressed as number across all projects. This information comes from the [Ensembl VEP](http://www.ensembl.org/info/genome/variation/predicted_data.html). Choosing the arrow next to the percentage will display a breakdown of each affected project -* __Impact (VEP):__ A subjective classification of the severity of the variant consequence. The categories are: - * __HIGH (H)__: The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function, or triggering nonsense mediated decay - * __MODERATE (M)__: A non-disruptive variant that might change protein effectiveness - * __LOW (L)__: Assumed to be mostly harmless or unlikely to change protein behavior - * __MODIFIER (MO)__: Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact -* __Survival Analysis:__ An icon that when clicked, will plot the survival rate between the gene's mutated and non-mutated cases +* __DNA Change:__ The chromosome and starting coordinates of the mutation are displayed along with the nucleotide differences between the reference and tumor allele. +* __Type:__ A general classification of the mutation. 
+* __Consequences:__ The effects the mutation has on the gene coding for a protein (i.e. synonymous, missense, non-coding transcript). A link to the [Gene Summary Page](Exploration.md#gene-summary-page) for the gene affected by the mutation is included. +* __# Affected Cases in Cohort:__ The number of affected cases in the Cohort as a fraction and as a percentage. +* __# Affected Cases in Across all Projects:__ The number of affected cases, expressed as number across all projects. Clicking the arrow next to the percentage will display a breakdown of each affected project. +* __Impact:__ A [subjective classification](#mutation-filters) of the severity of the variant consequence. This is determined by three different tools: + * __[Ensembl VEP](http://useast.ensembl.org/info/genome/variation/prediction/index.html)__ + * __[PolyPhen](http://genetics.bwh.harvard.edu/pph/)__ + * __[SIFT](http://sift.jcvi.org/)__ +* __Survival:__ An icon that, when clicked, will plot the survival rate between the gene's mutated and non-mutated cases. + +### Mutation Summary Page + + The Mutation Summary Page contains information about one somatic mutation and how it affects the associated gene. Each mutation is identified by its chromosomal position and nucleotide-level change. + + [![Mutation Summary](images/GDC-Mutation-Summary_v2.png)](images/GDC-Mutation-Summary_v2.png "Click to see the full image.") + + - __UUID:__ A unique identifier (UUID) for this mutation. + - __DNA Change:__ Denotes the chromosome number, position, and nucleotide change of the mutation. + - __Type:__ A broad categorization of the mutation. + - __Reference Genome Assembly:__ The reference genome to which the chromosomal position refers. + - __Allele in the Reference Assembly:__ The nucleotide(s) that compose the site in the reference assembly. + - __Functional Impact:__ A subjective classification of the severity of the variant consequence.
+ +#### External References + + A separate panel contains links to databases that contain information about the specific mutation. These include [dbSNP](https://www.ncbi.nlm.nih.gov/projects/SNP/), [COSMIC](http://cancer.sanger.ac.uk/cosmic), and [CIViC](https://civicdb.org/home). + +#### Consequences + +The consequences of the mutation are displayed in a table. The set of consequence terms is defined by the [Sequence Ontology](http://www.sequenceontology.org). + + [![Mutation Consequences](images/GDC-Mutation-Consequences.png)](images/GDC-Mutation-Consequences.png "Click to see the full image.") + +The fields that describe each consequence are listed below: + + * __Gene:__ The symbol for the affected gene. + * __AA Change:__ Details on the amino acid change, including compounds and position, if applicable. + * __Consequence:__ The biological consequence of each mutation. + * __Coding DNA Change:__ The specific nucleotide change and position of the mutation within the gene. + * __Impact:__ A [subjective classification](#mutation-filters) of the severity of the variant consequence. This is determined by three different tools: + * __[Ensembl VEP](http://useast.ensembl.org/info/genome/variation/prediction/index.html)__ + * __[PolyPhen](http://genetics.bwh.harvard.edu/pph/)__ + * __[SIFT](http://sift.jcvi.org/)__ + * __Strand:__ If the gene is located on the forward (+) or reverse (-) strand. + * __Transcript(s):__ The transcript(s) affected by the mutation. Each contains a link to the [Ensembl](https://www.ensembl.org) entry for the transcript. + +#### Cancer Distribution + +A table and bar graph show how many cases are affected by the particular mutation. Each row/bar represents the number of cases for each project. + + [![Mutation Distribution](images/GDC-Mutation-CancerDist.png)](images/GDC-Mutation-CancerDist.png "Click to see the full image.") + +The table contains the following fields: + + * __Project ID__: The ID for a specific project.
+ * __Disease Type__: The disease associated with the project. + * __Site__: The anatomical site affected by the disease. + * __# SSM Affected Cases__: The number of affected cases and total number of cases displayed as a fraction and percentage. + +#### Protein Viewer + +The protein viewer displays a plot representing the position of mutations along the polypeptide chain. The y-axis represents the number of cases that exhibit each mutation, whereas the x-axis represents the polypeptide chain sequence. [Pfam domains](http://pfam.xfam.org/) that were identified along the polypeptide chain are identified with colored rectangles labeled with pfam IDs. See the [Gene Summary Page](#gene-summary-page) for additional details about the [protein viewer](#protein-viewer). -*Note: By default, the Mutation UUID is not displayed. You can display the UUID of the case, but clicking on the icon with 3 parallel lines, and choose to display the Mutation UUID* + [![Mutation Protein Graph](images/GDC-Mutation-ProteinGraph.png)](images/GDC-Mutation-ProteinGraph.png "Click to see the full image.") -#### OncoGrid +## OncoGrid -The Exploration page includes an OncoGrid plot of the cases with the most mutations, for the top 50 mutated genes affected by high impact mutations. Genes displayed on the left of the grid (Y-axis) correspond to individual cases on the bottom of the grid (X-axis). +The Exploration Page includes an OncoGrid plot of the cases with the most mutations, for the top 50 mutated genes affected by high impact mutations. Genes displayed on the left of the grid (Y-axis) correspond to individual cases on the bottom of the grid (X-axis). Additionally, the plot also indicates in each cell any CNV events detected for these top mutated cases and genes. 
-[![Exploration Oncogrid Example](images/Exploration-Oncogrid-Example.png)](images/Exploration-Oncogrid-Example.png "Click to see the full image.") +[![Exploration Oncogrid Example](images/Exploration-Oncogrid-Example_v2.png)](images/Exploration-Oncogrid-Example_v2.png "Click to see the full image.") -The grid is color-coded with a legend at the top left which describes what type of mutation consequence is observed for each gene/case combination. Clinical information and the available data for each case are available at the bottom of the grid. +The grid is color-coded with a legend at the top which describes what type of mutation consequence and CNV event is observed for each gene/case combination. Clinical information and the available data for each case are available at the bottom of the grid. The right side of the grid displays additional information about the genes: * __Gene Sets:__ Describes whether a gene is part of [The Cancer Gene Census](http://cancer.sanger.ac.uk/census/). (The Cancer Gene Census is an ongoing effort to catalogue those genes for which mutations have been causally implicated in cancer) -* __GDC:__ Identifies all cases in the GDC affected with a mutation in this gene +* __# Cases Affected:__ Identifies all cases in the GDC affected with a mutation in this gene -#### OncoGrid Options +### OncoGrid Options -To facilitate readability and comparisons, drag-and-drop can be used to reorder the gene rows. Double clicking a row in the "# Cases Affected" bar at the right side of the graphic launches the respective Gene Summary Page page. Hovering over a cell will display information about the mutation such as its ID, affected case, and biological consequence. Clicking on the cell will bring the user to the respective Mutation Summary page. +To facilitate readability and comparisons, drag-and-drop can be used to reorder the gene rows. 
Double clicking a row in the "# Cases Affected" bar at the right side of the graphic launches the respective Gene Summary Page. Hovering over a cell will display information about the mutation such as its ID, affected case, and biological consequence. Clicking on the cell will bring the user to the respective Mutation Summary Page. A tool bar at the top right of the graphic allows the user to export the data as a JSON object, PNG image, or SVG image. Seven buttons are available in this toolbar: -* __Download:__ Users can choose to export the contents either to a static image file (PNG or SVG format) or the underlying data in JSON format -* __Reload Grid:__ Sets all OncoGrid rows, columns, and zoom levels back to their initial positions -* __Cluster Data:__ Clusters the rows and columns to place mutated genes with the same cases and cases with the same mutated genes together -* __Toggle Heatmap:__ The view can be toggled between cells representing mutation consequences or number of mutations in each gene -* __Toggle Gridlines:__ Turn the gridlines on and off -* __Toggle Crosshairs:__ Turns crosshairs on, so that users can zoom into specific sections of the OncoGrid -* __Fullscreen:__ Turns Fullscreen mode on/off +* __Customize Colors:__ Users can customize the colors that represent mutation consequence types and CNV gains/losses. +* __Download:__ Users can choose to export the contents either to a static image file (PNG or SVG format) or the underlying data in JSON format. +* __Reload Grid:__ Sets all OncoGrid rows, columns, and zoom levels back to their initial positions. +* __Cluster Data:__ Clusters the rows and columns to place mutated genes with the same cases and cases with the same mutated genes together. +* __Toggle Heatmap:__ The view can be toggled between cells representing mutation consequences or number of mutations in each gene. +* __Toggle Gridlines:__ Turn the gridlines on and off. 
+* __Toggle Crosshairs:__ Turns crosshairs on, so that users can zoom into specific sections of the OncoGrid. +* __Fullscreen:__ Turns Fullscreen mode on/off. -### File Navigation +### OncoGrid Color Picker + +To customize the colors for mutation consequence types and CNV gains/losses, a user can click the color picker icon in the OncoGrid toolbar. + +* __Customize Colors:__ Opens a control where the user can pick their own colors or apply a suggested theme and save their changes. +* __Reset to Default:__ Resets all colors to the defaults initially used by OncoGrid. + +[![Exploration Oncogrid Color Picker](images/Exploration-Oncogrid-Color-Picker.png)](images/Exploration-Oncogrid-Color-Picker.png "Click to see the full image.") + +## File Navigation After utilizing the Exploration Page to narrow down a specific cohort, users can find the specific files that relate to this group by clicking on the `View Files in Repository` button as shown in the image below. -[![Exploration File Navigation](images/Exploration-View-Files_v3.png)](images/Exploration-View-Files_v2.png "Click to see the full image.") +[![Exploration File Navigation](images/Exploration-View-Files_v4.png)](images/Exploration-View-Files_v4.png "Click to see the full image.") Clicking this button will navigate the users to the Repository Page, filtered by the cases within the cohort. -[![Input Set Explanation](images/gdc-input-set_v2.png)](images/gdc-input-set.png "Click to see the full image.") +[![Input Set Explanation](images/gdc-input-set_v2.png)](images/gdc-input-set_v2.png "Click to see the full image.") + +The filters chosen on the Exploration Page are displayed as an `input set` on the Repository Page. Additional filters may be added on top of this `input set`, but the original set cannot be modified and instead a new `input set` must be created from original data. 
+ +--- + +## Survival Analysis + +The survival analysis, which is seen in both the `Gene` and `Mutation` tabs, is used to analyze the occurrence of event data over time. In the GDC, survival analysis is performed on the mortality of the cases. Thus, the values are retrieved from [GDC Data Dictionary](../../../Data_Dictionary) properties and a survival analysis requires the following fields: + +* Data on the time to a particular event (days to death or last follow up). + * Fields: __demographic.days_to_death__ or __demographic.days_to_last_follow_up__ +* Information on whether the event has occurred (alive/deceased). + * Fields: __demographic.vital_status__ +* Data split into different categories or groups (i.e. gender, etc.). + * Fields: __demographic.gender__ + +The survival analysis in the GDC uses a Kaplan-Meier estimator: -The filters chosen on the Exploration page are displayed as an `input set` on the Repository page. Additional filters may be added on top of this `input set`, but the original set cannot be modified and instead must be created from scratch again. +[![Kaplan-Meier Estimator](images/gdc-kaplan-meier-estimator2.png)](images/gdc-kaplan-meier-estimator2.png "Click to see the full image.") + +Where: + + * S(t) is the estimated survival probability for any particular one of the t time periods. + * ni is the number of subjects at risk at the beginning of time period ti. + * and di is the number of subjects who die during time period ti. + +The table below is an example data set to calculate survival for a set of seven cases: + +[![Sample Survival Analysis Table](images/gdc-sample-survival-table.png)](images/gdc-sample-survival-table.png "Click to see the full image.") + +The calculated cumulated survival probability can be plotted against the interval to obtain a survival plot like the one shown below. 
+ +[![Sample Survival Analysis Plot](images/gdc-survival-plot.png)](images/gdc-survival-plot.png "Click to see the full image.") diff --git a/docs/Data_Portal/Users_Guide/GeneEntity.md b/docs/Data_Portal/Users_Guide/GeneEntity.md index f909ac0b3..4ea9bbb6d 100644 --- a/docs/Data_Portal/Users_Guide/GeneEntity.md +++ b/docs/Data_Portal/Users_Guide/GeneEntity.md @@ -6,7 +6,7 @@ The Gene Summary Page describes each gene with mutation data featured at the GDC The summary section of the gene page contains the following information: -[![Gene Summary](images/GDC-Gene-Summary.png)](images/GDC-Gene-Summary.png "Click to see the full image.") +[![Gene Summary](images/GDC-Gene-Summary_v2.png)](images/GDC-Gene-Summary_v2.png "Click to see the full image.") * __Symbol:__ The gene symbol * __Name:__ Full name of the gene @@ -19,7 +19,7 @@ The summary section of the gene page contains the following information: ## External References -A list with links that lead to external databases with additional information about each gene is displayed here. These external databases include: [Entrez](https://www.ncbi.nlm.nih.gov/gquery/), [Uniprot](http://www.uniprot.org/), [Hugo Gene Nomenclature Committee](http://www.genenames.org/), [Online Mendelian Inheritance in Man](https://www.omim.org/), and [Ensembl](http://may2015.archive.ensembl.org/index.html). +A list with links that lead to external databases with additional information about each gene is displayed here. These external databases include: [Entrez](https://www.ncbi.nlm.nih.gov/gquery/), [Uniprot](http://www.uniprot.org/), [Hugo Gene Nomenclature Committee](http://www.genenames.org/), [Online Mendelian Inheritance in Man](https://www.omim.org/), [Ensembl](http://may2015.archive.ensembl.org/index.html), and [CIViC](https://civicdb.org/home). 
## Cancer Distribution diff --git a/docs/Data_Portal/Users_Guide/Genes_and_Mutations.md b/docs/Data_Portal/Users_Guide/Genes_and_Mutations.md index d4cd7d5ff..b76d24cf2 100644 --- a/docs/Data_Portal/Users_Guide/Genes_and_Mutations.md +++ b/docs/Data_Portal/Users_Guide/Genes_and_Mutations.md @@ -10,7 +10,7 @@ Gene Summary Pages describe each gene with mutation data and provides results re The summary section of the gene page contains the following information: -[![Gene Summary](images/GDC-Gene-Summary.png)](images/GDC-Gene-Summary.png "Click to see the full image.") +[![Gene Summary](images/GDC-Gene-Summary_v2.png)](images/GDC-Gene-Summary_v2.png "Click to see the full image.") * __Symbol:__ The gene symbol * __Name:__ Full name of the gene @@ -23,13 +23,13 @@ The summary section of the gene page contains the following information: ### External References -A list with links that lead to external databases with additional information about each gene is displayed here. These external databases include: [Entrez](https://www.ncbi.nlm.nih.gov/gquery/), [Uniprot](http://www.uniprot.org/), [Hugo Gene Nomenclature Committee](http://www.genenames.org/), [Online Mendelian Inheritance in Man](https://www.omim.org/), and [Ensembl](http://may2015.archive.ensembl.org/index.html). +A list with links that lead to external databases with additional information about each gene is displayed here. These external databases include: [Entrez](https://www.ncbi.nlm.nih.gov/gquery/), [Uniprot](http://www.uniprot.org/), [Hugo Gene Nomenclature Committee](http://www.genenames.org/), [Online Mendelian Inheritance in Man](https://www.omim.org/), [Ensembl](http://may2015.archive.ensembl.org/index.html), and [CIViC](https://civicdb.org/home). ### Cancer Distribution -A table and bar graph show how many cases are affected by mutations within the gene as a ratio and percentage. Each row/bar represents the number of cases for each project. 
The final column in the table lists the number of unique mutations observed on the gene for each project. +A table and two bar graphs (one for mutations, one for CNV events) show how many cases are affected by mutations and CNV events within the gene as a ratio and percentage. Each row/bar represents the number of cases for each project. The final column in the table lists the number of unique mutations observed on the gene for each project. -[![Cancer Distribution](images/GDC-Gene-CancerDist.png)](images/GDC-Gene-CancerDist.png "Click to see the full image.") +[![Cancer Distribution](images/GDC-Gene-CancerDist_v2.png)](images/GDC-Gene-CancerDist_v2.png "Click to see the full image.") ### Protein Viewer @@ -68,7 +68,7 @@ Clicking the `Open in Exploration` button will navigate the user to the Explorat ### Summary - [![Mutation Summary](images/GDC-Mutation-Summary.png)](images/GDC-Mutation-Summary.png "Click to see the full image.") + [![Mutation Summary](images/GDC-Mutation-Summary_v2.png)](images/GDC-Mutation-Summary_v2.png "Click to see the full image.") - __ID:__ A unique identifier (UUID) for this mutation - __DNA Change:__ Denotes the chromosome number, position, and nucleotide change of the mutation @@ -79,7 +79,7 @@ Clicking the `Open in Exploration` button will navigate the user to the Explorat #### External References - A separate panel contains links to databases that contain information about the specific mutation. These include [dbSNP](https://www.ncbi.nlm.nih.gov/projects/SNP/) and [COSMIC](http://cancer.sanger.ac.uk/cosmic). + A separate panel contains links to databases that contain information about the specific mutation. These include [dbSNP](https://www.ncbi.nlm.nih.gov/projects/SNP/), [COSMIC](http://cancer.sanger.ac.uk/cosmic), and [CIViC](https://civicdb.org/home). 
### Consequences diff --git a/docs/Data_Portal/Users_Guide/Getting_Started.md b/docs/Data_Portal/Users_Guide/Getting_Started.md index f427d97a4..26d9fbbff 100644 --- a/docs/Data_Portal/Users_Guide/Getting_Started.md +++ b/docs/Data_Portal/Users_Guide/Getting_Started.md @@ -1,16 +1,15 @@ # Getting Started - ## The GDC Data Portal: An Overview The Genomic Data Commons (GDC) Data Portal provides users with web-based access to data from cancer genomics studies. Key GDC Data Portal features include: -* Open, granular access to information about all datasets available in the GDC -* Advanced search and visualization-assisted filtering of data files -* Data visualization tools to support the analysis and exploration of data (including on a gene and mutation level from Open-Access MAF files) -* Cart for collecting data files of interest -* Authentication using eRA Commons credentials for access to controlled data files -* Secure data download directly from the cart or using the [GDC Data Transfer Tool](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool) +* Open, granular access to information about all datasets available in the GDC. +* Advanced search and visualization-assisted filtering of data files. +* Data visualization tools to support the analysis and exploration of data (including on a gene and mutation level from Open-Access MAF files). +* Cart for collecting data files of interest. +* Authentication using eRA Commons credentials and authorization using dbGaP for access to controlled data files. +* Secure data download directly from the cart or using the [GDC Data Transfer Tool](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool). For more information about available datasets, see the [GDC Website](https://gdc.cancer.gov/about-data).
@@ -18,7 +17,7 @@ For more information about available datasets, see the [GDC Website](https://gdc ## Accessing the GDC Data Portal -The GDC Data Portal is accessible using a web browser such as Chrome, Internet Explorer, and Firefox at the following URL: +The GDC Data Portal is accessible using a web browser such as Chrome, Firefox, and Microsoft Edge at the following URL: [https://portal.gdc.cancer.gov](https://portal.gdc.cancer.gov) @@ -27,10 +26,7 @@ The front page displays a summary of all available datasets: [![GDC Home Page](images/GDC-Home-Page.png)](images/GDC-Home-Page.png "Click to see the full image.") - -## Navigation - -### Views +## Views The GDC Data Portal provides five navigation options (*Views*) for browsing available harmonized datasets: @@ -40,68 +36,67 @@ The GDC Data Portal provides five navigation options (*Views*) for browsing avai * __Exploration__: The Exploration link takes users to the [Exploration Page](Exploration.md), which allows users to explore data by utilizing various case, genes and mutation filters. -* __Analysis__: The Analysis link directs users to the [Analysis Page](Custom_Set_Analysis.md). This page has features available for users to compare different cohorts. These cohorts can either be generated with existing filters (e.g. males with lung cancer) or through custom selection. +* __Analysis__: The Analysis link directs users to the [Analysis Page](Custom_Set_Analysis.md). This page has features available for users to compare different cohorts or analyze the clinical variables of a specific cohort. These cohorts can either be generated with existing filters (e.g. males with lung cancer) or through custom selection. * __Repository__: The Repository link directs users to the [Repository Page](Repository.md). Here users can see the data files available for download at the GDC and apply file/case filters to narrow down their search. 
-* __Image Viewer__: The [Image viewer](Image_viewer.md) allows users to visualize tissue slide images. - * __Human Outline__: The home page displays a human anatomical outline that can be used to refine their search. Choosing an associated organ will direct the user to a listing of all projects associated with that primary site. For example, clicking on the human brain will show only cases and projects associated with brain cancer (TCGA-GBM and TCGA-LGG). The number of cases associated with each primary site is also displayed here and separated by project. Each view provides a distinct representation of the same underlying set of GDC data and metadata. The GDC also provides access to certain unharmonized data files generated by GDC-hosted projects. These files and their associated metadata are not represented in the views above; instead they can be found in the [GDC Legacy Archive](Legacy_Archive.md). The Projects, Exploration, Analysis and Repository pages can be accessed from the GDC Data Portal front page and from the toolbar (see below). The annotations view is accessible from Repository view. A link to the GDC Legacy Archive is available on the GDC Data Portal front page and in the GDC Apps menu (see below). -### Toolbar +## Toolbar The toolbar available at the top of all pages in the GDC Data Portal provides convenient navigation links and access to authentication and quick search. 
-The left portion of this toolbar provides access to the Home Page, __Projects Page__, __Exploration Page__, __Analysis Page__, and a link to __Repository Page__: +The left portion of this toolbar provides access to the __Home Page__, __Projects Page__, __Exploration Page__, __Analysis Page__, and a link to __Repository Page__: [![GDC Data Portal Toolbar (Left)](images/gdc-data-portal-top-menu-bar-left.png)](images/gdc-data-portal-top-menu-bar-left.png "Click to see the full image.") -The right portion of this toolbar provides access to [quick search](#quick-search), [manage sets](#manage-sets), [authentication functions](Authentication.md), the [cart](Cart.md), and the GDC Apps menu: +The right portion of this toolbar provides access to [quick search](#quick-search), [manage sets](#manage-sets), [authentication functions](Repository.md#authentication), the [cart](Cart.md), and the GDC Apps menu: [![GDC Data Portal Toolbar (Left)](images/gdc-data-portal-top-menu-bar-right.png)](images/gdc-data-portal-top-menu-bar-right.png "Click to see the full image.") The GDC Apps menu provides links to all resources provided by the GDC, including the [GDC Legacy Archive](Legacy_Archive.md). -[![GDC Apps](images/gdc-data-portal-gdc-apps.png)](images/gdc-data-portal-gdc-apps.png "Click to see the full image.") +[![GDC Apps](images/gdc-data-portal-gdc-apps_v2.png)](images/gdc-data-portal-gdc-apps_v2.png "Click to see the full image.") -### Tables +## Tables Tabular listings are the primary method of representing available data in the GDC Data Portal. Tables are available in all views and in the file cart. Users can customize each table by specifying columns, size, and sorting. -#### Table Sort -The *sort table* button is available in the top right corner of each table. To sort by a column, place a checkmark next to it and select the preferred sort direction. 
If multiple columns columns are selected for sorting, data is sorted column-by-column in the order that columns appear in the sort menu: the topmost selected column becomes the primary sorting parameter; the selected column below it is used for secondary sort, etc. +### Table Sort + +The sort button is available in the top right corner of each table. To sort by a column, place a checkmark next to it and select the preferred sort direction. If multiple columns are selected for sorting, data is sorted column-by-column in the order that the columns appear in the sort menu: the topmost selected column becomes the primary sorting parameter; the selected column below it is used for secondary sort, etc. [![Sorting a table](images/gdc-data-portal-table-sort.png)](images/gdc-data-portal-table-sort.png "Click to see the full image.") -#### Table Arrangement +### Table Arrangement -The *arrange columns* button allows users to adjust the order of columns in the table and select which columns are displayed. +The arrange button allows users to adjust the order of columns in the table and select which columns are displayed. ![Selecting table columns](images/gdc-data-portal-table-column-selection.png) -#### Table Size +### Table Size Table size can be adjusted using the menu in the bottom left corner of the table. The menu sets the maximum number of rows to display. If the number of entries to be displayed exceeds the maximum number of rows, then the table will be paginated, and navigation buttons will be provided in the bottom right corner of the table to navigate between pages. ![Specifying table size](images/gdc-data-portal-table-size-and-pagination.png) -#### Table Export +### Table Export In the Repository, Projects, and Annotations views, tables can be exported in either a JSON or TSV format. The `JSON` button will export the entire table's contents into a JSON file. The `TSV` button will export the current view of the table into a TSV file. 
[![Table Columns Filtering](images/gdc-data-portal-table-export.png)](images/gdc-data-portal-table-export.png "Click to see the full image.") -### Filtering and Searching +## Filtering and Searching The GDC Data Portal offers three different means of searching and filtering the available data: facet filters, quick search, and advanced search. -#### Facet Filters +### Facet Filters Facets on the left of each view (Projects, Exploration, and Repository) represent properties of the data that can be used for filtering. Some of the available facets are project name, disease type, patient gender and age at diagnosis, and various data formats and categories. Each facet displays the name of the data property, the available values, and numbers of matching entities for each value (files, cases, mutations, genes, annotations, or projects, depending on the context). @@ -113,9 +108,9 @@ Multiple selections within a facet are treated as an "OR" query: e.g. "Aligned R The information displayed in each facet reflects this: in the example above, marking the "Aligned Reads" checkbox does not change the numbers or the available values in the _Data Type_ facet where the checkbox is found, but it does change the values available in the _Experimental Strategy_ facet. The _Experimental Strategy_ facet now displays only values from files of _Data Type_ "Aligned Reads". -Custom facet filters can be added in [Repository View](Repository.md) to expand the GDC Data Portal's filtering capabilities. +Custom facet filters can be added in the [Repository View](Repository.md) to expand the GDC Data Portal's filtering capabilities. -#### Quick Search +### Quick Search The quick search feature allows users to find cases, files, mutations, or genes using a search query (i.e. UUID, filename, gene name, DNA Change, project name, id, disease type or primary site). 
Quick search is available by clicking on the magnifier in the right section of the toolbar (which appears on every page) or by using the search bar on the Home Page. @@ -131,16 +126,16 @@ __Toolbar Quick Search:__ [![Quick Search, Searching for an Entity](images/gdc-quick-search2.png)](images/gdc-quick-search2.png "Click to see the full image.") -#### Advanced Search +### Advanced Search Advanced Search is available in Repository View. It allows users to construct complex queries with a custom query language and auto-complete suggestions. See [Advanced Search](Advanced_Search.md) for details. -#### Manage Sets +## Manage Sets The `Manage Sets` button at the top of the GDC Portal stores sets of cases, genes, or mutations of interest. On this page, users can review the sets that have been saved as well as upload new sets and delete existing sets. [![Manage Sets](images/gdc-manage-sets.png)](images/gdc-manage-sets.png "Click to see the full image.") -##### Upload Sets +### Upload Sets Clicking the `Upload Set` button shows options for creating Case, Gene, or Mutation sets. @@ -154,7 +149,7 @@ Clicking the `Submit` button will add the set of items to the list of sets on th [![New Sets Gene](images/gdc-new-set.png)](images/gdc-manage-sets.png "Click to see the full image.") -##### Export Sets +### Export Sets Users can export selected sets on this page by first clicking the checkboxes next to each set, then clicking the `Export selected` button at the top of the table. @@ -162,15 +157,15 @@ Users can export selected sets on this page by first clicking the checkboxes nex A text file containing the UUID of each case, gene or mutation is downloaded after clicking this button. -##### Review Sets +### Review Sets There are a few buttons in the list of sets that allows a user to get further information about each one. * __# Items__: Clicking the link under the # Items column navigates the user to the Exploration page using the set as a filter. 
-* __Download/View__: To the right of the # Items column are buttons that will download the list as a tsv or open the cases in the Repository page. +* __Download/View__: To the right of the # Items column are buttons that will download the list as a TSV or open the cases in the Repository Page. -##### Creating Sets from GDC Portal Filters -Many pages on the GDC Portal have an option called `Save Sets` that allows users to save a group of cases, mutations, or genes for further analysis. After using the filtering options on the `Exploration` page as an example, users can click the `Save Case/Gene/Mutation Set` button to save this set. +### Creating Sets from GDC Portal Filters +Many pages on the GDC Portal have an option called `Save Sets` that allows users to save a group of cases, mutations, or genes for further analysis. After using the filtering options on the `Exploration` Page as an example, users can click the `Save Case/Gene/Mutation Set` button to save this set. [![Save Sets](images/gdc-exploration-save-sets.png)](images/gdc-quick-search2.png "Click to see the full image.") diff --git a/docs/Data_Portal/Users_Guide/Legacy_Archive.md b/docs/Data_Portal/Users_Guide/Legacy_Archive.md index ba84c08ae..616a1eac8 100644 --- a/docs/Data_Portal/Users_Guide/Legacy_Archive.md +++ b/docs/Data_Portal/Users_Guide/Legacy_Archive.md @@ -20,7 +20,7 @@ The GDC Legacy Archive contains a limited set of features of the GDC Data Portal ### File Page -The file page of the GDC Legacy Archive is similar to the [file page of the GDC Data Portal](Repository.md#file-summary-page). It does not include the Workflow, Reference Genome, and Read Groups sections as these are only applicable to harmonized data available in the GDC Data Portal. The Legacy Archive includes additional archive information as described below. +The file page of the GDC Legacy Archive is similar to the [File Summary Page of the GDC Data Portal](Repository.md#file-summary-page). 
It does not include the Workflow, Reference Genome, and Read Groups sections as these are only applicable to harmonized data available in the GDC Data Portal. The Legacy Archive includes additional archive information as described below. [![Files Entity Page](images/gdc-data-portal-files-entity-page-Archive-MagTab.png)](images/gdc-data-portal-files-entity-page-Archive-MagTab.png "Click to see the full image.") @@ -28,9 +28,9 @@ The file page of the GDC Legacy Archive is similar to the [file page of the GDC If a file was originally produced as part of an archive containing other files, the archive information (Archive ID and number of files in the archive) is displayed in the file properties and, if selected, the user will see a list of files containing all other files in that archive. -#### Metadata files +#### Metadata Files -If a file has any associated MAGE-TAB or SRA XML metadata files, these files will be listed at the bottom of the page. These files will can be downloaded directly from here. Alternatively, metadata files can be downloaded from the file cart. +If a file has any associated MAGE-TAB or SRA XML metadata files, these files will be listed at the bottom of the page. These files can be downloaded directly from this page. Alternatively, metadata files can be downloaded from the file cart.
### File Cart diff --git a/docs/Data_Portal/Users_Guide/MutationEntity.md b/docs/Data_Portal/Users_Guide/MutationEntity.md index 18f75e7bf..84f4f167e 100644 --- a/docs/Data_Portal/Users_Guide/MutationEntity.md +++ b/docs/Data_Portal/Users_Guide/MutationEntity.md @@ -4,7 +4,7 @@ The Mutation Summary Page contains information about one somatic mutation and ho ## Summary -[![Mutation Summary](images/GDC-Mutation-Summary.png)](images/GDC-Mutation-Summary.png "Click to see the full image.") +[![Mutation Summary](images/GDC-Mutation-Summary_v2.png)](images/GDC-Mutation-Summary_v2.png "Click to see the full image.") - __ID:__ A unique identifier (UUID) for this mutation - __DNA Change:__ Denotes the chromosome number, position, and nucleotide change of the mutation @@ -18,7 +18,7 @@ The Mutation Summary Page contains information about one somatic mutation and ho ### External References -A separate panel contains links to databases that contain information about the specific mutation. These include [dbSNP](https://www.ncbi.nlm.nih.gov/projects/SNP/) and [COSMIC](http://cancer.sanger.ac.uk/cosmic). +A separate panel contains links to databases that contain information about the specific mutation. These include [dbSNP](https://www.ncbi.nlm.nih.gov/projects/SNP/), [COSMIC](http://cancer.sanger.ac.uk/cosmic), and [CIViC](https://civicdb.org/home). ## Consequences diff --git a/docs/Data_Portal/Users_Guide/Projects.md b/docs/Data_Portal/Users_Guide/Projects.md index 9e8a9d28d..283ff57ad 100644 --- a/docs/Data_Portal/Users_Guide/Projects.md +++ b/docs/Data_Portal/Users_Guide/Projects.md @@ -1,46 +1,46 @@ # Projects -## Summary -At a high level, data in the Genomic Data Commons is organized by project. Typically, a project is a specific effort to look at particular type(s) of cancer undertaken as part of a larger cancer research program. The GDC Data Portal allows users to access aggregate project-level information via the Projects Page and Project Summary pages. 
+At a high level, data in the Genomic Data Commons is organized by project. Typically, a project is a specific effort to look at particular type(s) of cancer undertaken as part of a larger cancer research program. The GDC Data Portal allows users to access aggregate project-level information via the Projects Page and Project Summary Pages. ## Projects Page -The Projects Page provides an overview of all harmonized data available in the Genomic Data Commons, organized by project. It also provides filtering, navigation, and advanced visualization features that allow users to identify and browse projects of interest. Users can access Projects Page from the GDC Data Portal Home page, from the Data Portal toolbar, or directly at [https://portal.gdc.cancer.gov/projects](https://portal.gdc.cancer.gov/projects). +The Projects Page provides an overview of all harmonized data available in the Genomic Data Commons, organized by project. It also provides filtering, navigation, and advanced visualization features that allow users to identify and browse projects of interest. Users can access the [Projects Page](https://portal.gdc.cancer.gov/projects) from the GDC Data Portal Home page or from the Data Portal toolbar. On the left, a panel of facets allow users to apply filters to find projects of interest. When facet filters are applied, the table and visualizations on the right are updated to display only the matching projects. When no filters are applied, all projects are displayed. -The right side of this page displays a few visualizations of the data (Top Mutated Genes in Selected Projects and Case Distribution per Project). Below these graphs is a table that contains a list of projects and select details about each project, such as the number of cases and data files. The Graph tab provides a visual representation of this information. 
+The right side of the Projects Page displays a few visualizations of the data (Top Mutated Genes in Selected Projects and Case Distribution per Project). Below these graphs is a table that contains a list of projects and select details about each project, such as the number of cases and data files. The Graph tab provides a visual representation of this information. -[![Projects Page, Main Window (Table View)](images/gdc-data-portal-project-page.png)](images/gdc-data-portal-project-page.png "Click to see the full image.") +[![Projects Page, Main Window (Table View)](images/gdc-data-portal-project-page_v3.png)](images/gdc-data-portal-project-page_v3.png "Click to see the full image.") ### Visualizations -[![Projects Visualizations)](images/gdc-projects-visualizations.png)](images/gdc-projects-visualizations.png "Click to see the full image.") +[![Projects Visualizations)](images/gdc_project_visualizations3.png)](images/gdc_project_visualizations3.png "Click to see the full image.") #### Top Mutated Cancer Genes in Selected Projects -This dynamically generated bar graph shows the 20 genes with the most mutations across all projects. The genes are filtered by those that are part of the Cancer Gene Census and that have the following types of mutations: `missense_variant, frameshift_variant, start_lost, stop_lost, initiator_codon_variant, and stop_gained`. The bars represent the frequency of each mutation and is broken down into different colored segments by project and disease type. The graphic is updated as filters are applied for projects, programs, disease types, and data categories available in the project. Note, that due the these filters the number of cases displayed here will be less that the total number of cases per project. +This dynamically generated bar graph shows the 20 genes with the most mutations across all projects. 
The genes are filtered by those that are part of the Cancer Gene Census and that have the following types of mutations: `missense_variant`, `frameshift_variant`, `start_lost`, `stop_lost`, `initiator_codon_variant`, and `stop_gained`. The bars represent the frequency of mutations per gene and are broken down into different colored segments by project. The graphic is updated as filters are applied for projects, programs, disease types, and data categories available in the project. -Hovering the cursor over each bar will display information about the number of cases affected by the disease type and clicking on each bar will launch the Gene Summary Page page for the gene associated with the mutation. +> __Note:__ Due to these filters, the number of cases displayed here will be less than the total number of cases per project. + +Hovering the cursor over each bar will display information about the number of cases affected by the disease type and clicking on each bar will launch the [Gene Summary Page](Exploration.md#gene-summary-page) for the gene associated with the mutation. Users can toggle the Y-Axis of this bar graph between a percentage or raw number of cases affected. #### Case Distribution per Project -A pie chart displays the relative number of cases for each primary site (inner circle), which is further divided by project (outer circle). Hovering the cursor over each portion of the graph will display the primary site or project with the number of associated cases. Filtering projects at the left panel will update the pie chart. - +A pie chart displays the relative number of cases for each project. Hovering the cursor over each portion of the graph will display the project with the number of associated cases. Filtering projects at the left panel will update the pie chart. ### Projects Table The `Table` tab lists projects by Project ID and provides additional information about each project.
If no facet filters have been applied, the table will display all available projects; otherwise it will display only those projects that match the selected criteria. -[![Projects Table)](images/gdc-projects-table-view.png)](images/gdc-data-portal-project-page.png "Click to see the full image.") +[![Projects Table)](images/gdc-projects-table-view_v2.png)](images/gdc-projects-table-view_v2.png "Click to see the full image.") -The table provides links to Project Summary pages in the Project ID column. Columns with file and case counts include links to open the corresponding files or cases in [Repository Page](Repository.md). +The table provides links to [Project Summary Pages](Projects.md#project-summary-page) in the Project ID column. Columns with file and case counts include links to open the corresponding files or cases in [Repository Page](Repository.md). ### Projects Graph -The `Graph` tab contains an interactive view of information in the Table tab. The numerical values in Case Count, File Count, and File Size columns are represented by bars of varying length according to size. These columns are sorted independently in descending order. Mousing over an element of the graph connects it to associated elements in other columns, including Project ID and Primary Site +The `Graph` tab contains an interactive view of information in the Table tab. The numerical values in Case Count, File Count, and File Size columns are represented by bars of varying length according to size. These columns are sorted independently in descending order. Mousing over an element of the graph connects it to associated elements in other columns, including Project ID and major Primary Sites. [![Graph Mouseover](images/gdc-table-graph-mouse-over.png)](images/gdc-table-graph-mouse-over.png "Click to see the full image.") @@ -52,18 +52,18 @@ Like the projects table, the graph will reflect any applied facet filters. Facets represent properties of the data that can be used for filtering. 
The facets panel on the left allows users to filter the projects presented in the Table and Graph tabs as well as visualizations. -[![Panel with Applied Filters](images/gdc-data-portal-project-page-facets.png)](images/gdc-data-portal-project-page-facets.png "Click to see the full image.") +[![Panel with Applied Filters](images/gdc-data-portal-project-page-facets4.png)](images/gdc-data-portal-project-page-facets4.png "Click to see the full image.") Users can filter by the following facets: -* __Project__: Individual project ID -* __Primary Site__: Anatomical site of the cancer under investigation or review -* __Program__: Research program that the project is part of -* __Disease Type__: Type of cancer studied -* __Data Category__: Type of data available in the project -* __Experimental Strategy__: Experimental strategies used for molecular characterization of the cancer +* __Project__: Individual project ID. +* __Primary Site__: Anatomical site of the cancer under investigation or review. +* __Program__: Research program that the project is part of. +* __Disease Type__: Type of cancer studied. +* __Data Category__: Type of data available in the project. +* __Experimental Strategy__: Experimental strategies used for molecular characterization of the cancer. -Filters can be applied by selecting values of interest in the available facets, for example "WXS" and "RNA-Seq" in the "Experimental Strategy" facet and "Brain" in the "Primary Site" facet. When facet filters are applied, the Table and Graph tabs are updated to display matching projects, and the banner above the tabs summarizes the applied filters. The banner allows the user to click on filter elements to remove the associated filters, and includes a link to view the matching cases and files. +Filters can be applied by selecting values of interest in the available facets, for example "WXS" and "RNA-Seq" in the "Experimental Strategy" facet and "Brain" in the "Primary Site" facet. 
When facet filters are applied, the Table and Graph tabs are updated to display matching projects, and the banner above the tabs summarizes the applied filters. The banner allows the user to click on filter elements to remove the associated filters and includes a link to view the matching cases and files. [![Panel with Applied Filters](images/panel-with-applied-filters.png)](images/panel-with-applied-filters.png "Click to see the full image.") @@ -71,13 +71,13 @@ For information on how to use facet filters, see [Getting Started](Getting_Start ## Project Summary Page -Each project has a summary page that provides an overview of all available cases, files, and annotations available. Clicking on the numbers in the summary table will display the corresponding data. +Each project has a Summary Page that provides an overview of all available cases, files, and annotations available. Clicking on the numbers in the summary table will display the corresponding data. -[![Project Summary Page](images/gdc-project-entity-page_v3.png)](images/gdc-project-entity-page_v2.png "Click to see the full image.") +[![Project Summary Page](images/gdc-project-entity-page_v4.png)](images/gdc-project-entity-page_v4.png "Click to see the full image.") -Three download buttons in the top right corner of the screen allow the user to download the entire project dataset, along with the associated project metadata: +Four buttons in the top right corner of the screen allow the user to explore or download the entire project dataset, along with the associated project metadata: -* __Explore Project Data__: Opens Exploration page with summary project information. -* __Download Biospecimen__: Downloads biospecimen metadata associated with all cases in the project in either TSV or JSON format. -* __Download Clinical__: Downloads clinical metadata about all cases in the project in either TSV or JSON format. -* __Download Manifest__: Downloads a manifest for all data files available in the project. 
The manifest can be used with the GDC Data Transfer Tool to download the files. +* __Explore Project Data__: Opens Exploration Page with summary project information. +* __Biospecimen__: Downloads biospecimen metadata associated with all cases in the project in either TSV or JSON format. +* __Clinical__: Downloads clinical metadata about all cases in the project in either TSV or JSON format. +* __Manifest__: Downloads a manifest for all data files available in the project. The manifest can be used with the GDC Data Transfer Tool to download the files. diff --git a/docs/Data_Portal/Users_Guide/Repository.md b/docs/Data_Portal/Users_Guide/Repository.md index efc4d3e30..7bba60e70 100644 --- a/docs/Data_Portal/Users_Guide/Repository.md +++ b/docs/Data_Portal/Users_Guide/Repository.md @@ -1,20 +1,18 @@ # Repository -## Summary - -The Repository Page is the primary method of accessing data in the GDC Data Portal. It provides an overview of all cases and files available in the GDC and offers users a variety of filters for identifying and browsing cases and files of interest. Users can access the Repository Page from the GDC Data Portal front page, from the Data Portal toolbar, or directly at [https://portal.gdc.cancer.gov/repository](https://portal.gdc.cancer.gov/repository). +The Repository Page is the primary method of accessing data in the GDC Data Portal. It provides an overview of all cases and files available in the GDC and offers users a variety of filters for identifying and browsing cases and files of interest. Users can access the [Repository Page](https://portal.gdc.cancer.gov/repository) from the GDC Data Portal Home Page or from the Data Portal toolbar. ## Filters / Facets On the left, a panel of data facets allows users to filter cases and files using a variety of criteria. If facet filters are applied, the tabs on the right will display information about matching cases and files. 
If no filters are applied, the tabs on the right will display information about all available data. On the right, two tabs contain information about available data: -* *`Files` tab* provides a list of files, select information about each file, and links to individual file detail pages. -* *`Cases` tab* provides a list of cases, select information about each case, and links to individual case summary pages +* `Files` tab provides a list of files, select information about each file, and links to [individual file detail pages](#file-summary-page). +* `Cases` tab provides a list of cases, select information about each case, and links to [individual case summary pages](Exploration.md#case-summary-page). The banner above the tabs on the right displays any active facet filters and provides access to advanced search. -The top of the Repository Page contains a few summary pie charts for Primary Sites, Projects, Disease Type, Gender, and Vital Status. These reflect all available data or, if facet filters are applied, only the data that matches the filters. Clicking on a specific slice in a pie chart, or on a number in a table, applies corresponding facet filters. +The top of the Repository Page, in the "Files" tab, contains a few summary pie charts for Primary Sites, Projects, Data Category, Data Type, and Data Format. These reflect all available data or, if facet filters are applied, only the data that matches the filters. Clicking on a specific slice in a pie chart, or on a number in a table, applies corresponding facet filters. The scope of these pie charts will change depending on whether you have the "Files" tab or the "Cases" tab selected. [![Data View](images/gdc-data-portal-repository-view_v2.png)](images/gdc-data-portal-repository-view_v2.png "Click to see the full image.") @@ -22,9 +20,7 @@ The top of the Repository Page contains a few summary pie charts for Primary Sit Facets represent properties of the data that can be used for filtering.
The facets panel on the left allows users to filter the cases and files presented in the tabs on the right. -The facets panel is divided into two tabs, with the Files tab containing facets pertaining to data files and experimental strategies, while the Cases tab containing facets pertaining to the cases and biospecimen information. Users can apply filters in both tabs simultaneously. The applied filters will be displayed in the banner above the tabs on the right, with the option to open the filter in [Advanced Search](Advanced_Search.md) to further refine the query. - -The [Getting Started](Getting_Started.md#facet-filters) section provides instructions on using facet filters. In the following example, a filter from the Cases tab ("primary site") and filters from the Files tab ("data category", "experimental strategy") are both applied: +The facets panel is divided into two tabs, with the `Files` tab containing facets pertaining to data files and experimental strategies, and the `Cases` tab containing facets pertaining to the cases and biospecimen information. Users can apply filters in both tabs simultaneously. The applied filters will be displayed in the banner above the tabs on the right, with the option to open the filter in [Advanced Search](Advanced_Search.md) to further refine the query. [![Facet Filters Applied in Data View](images/data-view-with-facet-filters-applied_v2.png)](images/data-view-with-facet-filters-applied_v2.png "Click to see the full image.") @@ -44,9 +40,9 @@ The default set of facets is listed below. *Cases* facets tab: * __Case__: Specify individual cases using submitter ID (barcode) or UUID. -* __Case Submitter ID Prefix__: Search for cases using a part (prefix) of the submitter ID (barcode). +* __Case ID__: Search for cases using a part (prefix) of the submitter ID (barcode). * __Primary Site__: Anatomical site of the cancer under investigation or review.
-* __Cancer Program__: A cancer research program, typically consisting of multiple focused projects. +* __Program__: A cancer research program, typically consisting of multiple focused projects. * __Project__: A cancer research project, typically part of a larger cancer research program. * __Disease Type__: Type of cancer studied. * __Gender__: Gender of the patient. @@ -58,119 +54,126 @@ The default set of facets is listed below. ### Adding Custom Facets -The Repository Page provides access to additional data facets beyond those listed above. Facets corresponding to additional properties listed in the [GDC Data Dictionary](../../Data_Dictionary/index.md) can be added using the "add a filter" links available at the top of the Cases and Files facet tabs: +The Repository Page provides access to additional data facets beyond the automatically listed group filters. Facets corresponding to additional properties listed in the [GDC Data Dictionary](../../Data_Dictionary/index.md) can be added using the "Add a Filter" link available at the top of the `Cases` and `Files` facet tabs: [![Add a Facet](images/gdc-data-portal-data-add-facet.png)](images/gdc-data-portal-data-add-facet.png "Click to see the full image.") -The links open a search window that allows the user to find an additional facet by name or description. Not all facets have values available for filtering; checking the "Only show fields with values" checkbox will limit the search results to only those that do. Selecting a facet from the list of search results below the search box will add it to the facets panel. +The link opens a search window that allows the user to find an additional facet by name or description. Not all facets have values available for filtering; checking the "Only show fields with values" checkbox will limit the search results to only those that do. Selecting a facet from the list of search results below the search box will add it to the facets panel.
[![Search for a Facet](images/gdc-data-portal-data-facet-search.png)](images/gdc-data-portal-data-facet-search.png "Click to see the full image.") -Newly added facets will show up at the top of the facets panel and can be removed individually by clicking on the red cross to the right of the facet name. The default set of facets can be restored by clicking "Reset". +Newly added facets will show up at the top of the facets panel and can be removed individually by clicking on the "__x__" to the right of the facet name. The default set of facets can be restored by clicking "Reset". [![Customize Facet](images/gdc-data-portal-data-facet-tumor_stage.png)](images/gdc-data-portal-data-facet-tumor_stage.png "Click to see the full image.") -## Results -### Files List +## Annotations View -The Files tab on the right provides a list of available files and select information about each file. If facet filters are applied, the list includes only matching files. Otherwise, the list includes all data files available in the GDC Data Portal. +The Annotations View provides an overview of the available annotations and allows users to browse and filter the annotations based on a number of annotation properties (facets), such as the type of entity the annotation is attached to or the annotation category. This page can be found by clicking on the [Browse Annotations](https://portal.gdc.cancer.gov/annotations) link, located at the top right of the repository page. -[![Files Tab](images/gdc-data-portal-data-files.png)](images/gdc-data-portal-data-files.png "Click to see the full image.") +[![Annotations View](images/Browse_Annotations.png)](images/Browse_Annotations.png "Click to see the full image.") -The *File Name* column includes links to [file detail pages](#file-detail-page) where the user can learn more about each file. 
+The view presents a list of annotations in tabular format on the right, and a facet panel on the left that allows users to filter the annotations displayed in the table. If facet filters are applied, the tabs on the right will display only the matching annotations. If no filters are applied, the tabs on the right will display information about all available annotations. -Users can add individual file(s) to the file cart using the cart button next to each file. Alternatively, all files that match the current facet filters can be added to the cart using the menu in the top left corner of the table: +[![Annotations View](images/gdc-data-portal-annotations.png)](images/gdc-data-portal-annotations.png "Click to see the full image.") -[![Files Tab](images/gdc-data-portal-data-files-add-cart.png)](images/gdc-data-portal-data-files-add-cart.png "Click to see the full image.") +Clicking on an annotation ID in the annotations list will take the user to the Annotation Summary Page. The Annotation Summary Page provides more details about a specific annotation. -### Cases List +[![Annotation Entity Page](images/annotations-entity-page.png)](images/annotations-entity-page.png "Click to see the full image.") -The Cases tab on the right provides a list of available cases and select information about each case. If facet filters are applied, the list includes only matching cases. Otherwise, the list includes all cases available in the GDC Data Portal. +## Results -[![Cases Tab](images/gdc-data-portal-data-cases_v3.png)](images/gdc-data-portal-data-cases_v3.png "Click to see the full image.") +### Navigation -The list includes links to [case summary pages](#case-summary-page) in the *Case UUID* column, the Submitter ID (i.e. TCGA Barcode), and counts of the available file types for each case. Clicking on a count will apply facet filters to display the corresponding files. 
+After utilizing the Repository Page to narrow down a specific set of cases, users can choose to continue to explore the mutations and genes affected by these cases by clicking the `View Cases in Exploration` button as shown in the image below. -The list also includes a shopping cart button, allowing the user to add all files associated with a case to the file cart for downloading at a later time: +[![Exploration File Navigation](images/gdc-view-in-exploration_v3.png)](images/gdc-view-in-exploration_v3.png "Click to see the full image.") -[![Cases Tab, Add to Cart](images/gdc-data-portal-data-case-add-cart.png)](images/gdc-data-portal-data-case-add-cart.png "Click to see the full image.") +Clicking this button will navigate the users to the [Exploration Page](Exploration.md), filtered by the cases within the cohort. +### Files List -## Navigation +The `Files` tab on the right provides a list of available files and select information about each file. If facet filters are applied, the list includes only matching files. Otherwise, the list includes all data files available in the GDC Data Portal. -After utilizing the Repository Page to narrow down a specific set of cases, users can continue to explore the mutations and genes affected by these cases by clicking the `View Files in Repository` button as shown in the image below. +[![Files Tab](images/gdc-data-portal-data-files.png)](images/gdc-data-portal-data-files.png "Click to see the full image.") -[![Exploration File Navigation](images/gdc-view-in-exploration_v3.png)](images/gdc-view-in-exploration_v3.png "Click to see the full image.") +The "*File Name*" column includes links to [File Summary Pages](#file-summary-page) where the user can learn more about each file. + +Users can add individual file(s) to the [cart](Cart.md) using the cart button next to each file. 
Alternatively, all files that match the current facet filters can be added to the cart using the menu in the top left corner of the table: + +[![Files Tab](images/gdc-data-portal-data-files-add-cart.png)](images/gdc-data-portal-data-files-add-cart.png "Click to see the full image.") -Clicking this button will navigate the users to the Exploration Page, filtered by the cases within the cohort. +## File Summary Page + +The File Summary page provides information about a data file, including file properties like size, MD5 checksum, and data format; information on the type of data included; links to the associated cases and biospecimen; and information about how the data file was generated or processed. + +The page also includes buttons to download the file, add it to the file [cart](Cart.md), or (for BAM files) utilize the BAM slicing function. + +[![Files Detail Page](images/gdc-data-portal-files-entity-page.png)](images/gdc-data-portal-files-entity-page.png "Click to see the full image.") -## Case Summary Page +In the lower section of the screen, the following tables provide more details about the file and its characteristics: -The Case Summary page displays case details including the project and disease information, data files that are available for that case, and the experimental strategies employed. A button in the top-right corner of the page allows the user to add all files associated with the case to the file cart. +* __Associated Cases / Biospecimen__: List of cases or biospecimen the file is directly attached to. +* __Analysis and Reference Genome__: Information on the workflow and reference genome used for file generation. +* __Read Groups__: Information on the read groups associated with the file. +* __Metadata Files__: Experiment metadata, run metadata and analysis metadata associated with the file. +* __Downstream Analysis Files__: List of downstream analysis files generated by the file. +* __File Versions__: List of all versions of the file. 
-[![Case Page](images/gdc-case-entity-page.png)](images/gdc-case-entity-page.png "Click to see the full image.") -### Clinical and Biospecimen Information +[![Files Entity Page](images/gdc-data-portal-files-entity-page-part2_v2.png)](images/gdc-data-portal-files-entity-page-part2_v2.png "Click to see the full image.") -The page also provides clinical and biospecimen information about that case. Links to export clinical and biospecimen information in JSON format are provided. +>**Note**: *The Legacy Archive* will not display the "Workflow, Reference Genome and Read Groups" sections (these sections are applicable to the GDC harmonization pipeline only). However, it may provide information on archives and metadata files like MAGE-TABs and SRA XMLs. For more information, please refer to the section [Legacy Archive](Legacy_Archive.md). -[![Case Page, Clinical and Biospecimen](images/gdc-case-clinical-biospecimen_v3.png)](images/gdc-case-clinical-biospecimen_v3.png "Click to see the full image.") +### BAM Slicing -For clinical records that support multiple records of the same type (Diagnoses, Family Histories, or Exposures), a UUID of the record is provided on the left hand side of the corresponding tab, allowing the user to select the entry of interest. +BAM file Summary Pages have a "BAM Slicing" button. This function allows the user to specify a region of a BAM file for download. Clicking on it will open the BAM Slicing window: -### Biospecimen Search +[![BAM Slicing Window](images/gdc-data-portal-bam-slicing_v2.png)](images/gdc-data-portal-bam-slicing_v2.png "Click to see the full image.") -A search filter just below the biospecimen section can be used to find and filter biospecimen data. The wildcard search will highlight entities in the tree that match the characters typed. This will search both the case submitter ID, as well as the additional metadata for each entity. For example, searching 'Primary Tumor' will highlight samples that match that type. 
+During preparation of the slice, the icon on the BAM Slicing button will be spinning, and the file will be offered for download to the user as soon as it is ready. -[![Biospecimen Search](images/gdc-case-biospecimen-search_v2.png)](images/gdc-case-biospecimen-search_v2.png "Click to see the full image.") +### Cases List -### Most Frequent Somatic Mutations +The `Cases` tab on the right provides a list of available cases and select information about each case. If facet filters are applied, the list includes only matching cases. Otherwise, the list includes all cases available in the GDC Data Portal. -The case entity page also lists the mutations found in that particular case. +[![Cases Tab](images/gdc-data-portal-data-cases_v3.png)](images/gdc-data-portal-data-cases_v3.png "Click to see the full image.") -[![Case Page](images/gdc-case-entity-mfm.png)](images/gdc-case-entity-mfm.png "Click to see the full image.") +From the left side, the list starts with a shopping cart icon, allowing the user to add all files associated with a case to the [file cart](Cart.md) for downloading at a later time. The following columns in the list include links to [Case Summary Pages](Exploration.md#case-summary-page) in the *Case UUID* column, the Submitter ID (i.e. TCGA Barcode), and counts of the available file types for each case. Clicking on a count will apply facet filters to display the corresponding files. On the last column, there are image slide icons and a number that indicate whether there are slide images available and how many. -The table lists the following information for each mutation +## Image Viewer -* __DNA Change:__ The chromosome and starting coordinates of the mutation are displayed along with the nucleotide differences between the reference and tumor allele -* __Type:__ A general classification of the mutation -* __Consequences:__ The effects the mutation has on the gene coding for a protein (i.e.
synonymous, missense, non-coding transcript) -* __# Affected Cases in Project:__ The number of affected cases, expressed as number across all mutations within the Project -* __# Affected Cases Across GDC:__ The number of affected cases, expressed as number across all projects. Choosing the arrow next to the percentage will expand the selection with a breakdown of each affected project -* __Impact (VEP):__ A subjective classification of the severity of the variant consequence. This information comes from the [Ensembl VEP](http://www.ensembl.org/info/genome/variation/predicted_data.html). The categories are: - - __HIGH (H)__: The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay - - __MODERATE (M)__: A non-disruptive variant that might change protein effectiveness - - __LOW (L)__: Assumed to be mostly harmless or unlikely to change protein behavior - - __MODIFIER (MO)__: Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact +The Image Viewer allows users to visualize tissue and diagnostic slide images. -Clicking on the `Open in Exploration` button at the top right of this section will navigate the user to the Exploration page, filtered on this case. +[![Image Viewer](images/Image_viewer_browser.png)](images/Image_viewer_browser.png "Click to see the full image.") -## File Summary Page +### How to Access the Image Viewer -The File Summary page provides information a data file, including file properties like size, md5 checksum, and data format; information on the type of data included; links to the associated case and biospecimen; and information about how the data file was generated or processed. +* __Repository Page__: From the main search on the Repository Page by clicking on the "View images" button. 
It will display the tissue slide images of all the cases resulting from the query. -The page also includes buttons to download the file, add it to the file cart, or (for BAM files) utilize the BAM slicing function. +[![Image Viewer](images/Image_Viewer_from_Repository.png)](images/Image_Viewer_from_Repository.png "Click to see the full image.") -[![Files Detail Page](images/gdc-data-portal-files-entity-page.png)](images/gdc-data-portal-files-entity-page.png "Click to see the full image.") +* __Case Table in Repository Page__: Click on the image viewer icon in the Case table. It will display in the image viewer all the tissue slide images attached to the Case. -In the lower section of the screen, the following tables provide more details about the file and its characteristics: +[![Cases Tab](images/gdc-data-portal-data-cases_v3.png)](images/gdc-data-portal-data-cases_v3.png "Click to see the full image.") -* __Associated Cases / Biospecimen__: List of Cases or biospecimen the file is directly attached to. -* __Analysis and Reference Genome__: Information on the workflow and reference genome used for file generation. -* __Read Groups__: Information on the read groups associated with the file. -* __Metadata Files__: Experiment metadata, run metadata and analysis metadata associated with the file. -* __Downstream Analysis Files__: List of downstream analysis files generated by the file. -* __File Versions__: List of all versions of the file. +* __Case Summary Page:__ Selecting a Case ID in the Repository Cases table will direct the user to the [Case Summary Page](Exploration.md#case-summary-page). For cases with images, the Image Viewer icon will appear in the Case Summary section or in the Biospecimen - Slides details section. Clicking on the Image Viewer icon will display the Image Viewer for the slide images attached to the case. 
+ [![Image Viewer](images/Image_viewer_case_summary.png)](images/Image_viewer_case_summary.png "Click to see the full image.") + [![Image Viewer](images/Image_viewer_case_slide_section.png)](images/Image_viewer_case_slide_section.png "Click to see the full image.") -[![Files Entity Page](images/gdc-data-portal-files-entity-page-part2_v2.png)](images/gdc-data-portal-files-entity-page-part2_v2.png "Click to see the full image.") +* __The Image File Page__: You can visualize the slide image directly in the File Summary Page by selecting an image file in the Repository's files table. -**Note**: *The Legacy Archive* will not display "Workflow, Reference Genome and Read Groups" sections (these sections are applicable to the GDC harmonization pipeline only). However it may provide information on Archives and metadata files like MAGE-TABs and SRA XMLs. For more information, please refer to the section [Legacy Archive](Legacy_Archive.md). +[![Image Viewer](images/Repository_select_image.png)](images/Repository_select_image.png "Click to see the full image.") -### BAM Slicing +[![Image Viewer](images/Image_viewer_File_entity.png)](images/Image_viewer_File_entity.png "Click to see the full image.") -BAM file detail pages have a "BAM Slicing" button. This function allows the user to specify a region of a BAM file for download. Clicking on it will open the BAM slicing window: +### Image Viewer Features +In the image viewer, a user can: -[![BAM Slicing Window](images/gdc-data-portal-bam-slicing.png)](images/gdc-data-portal-bam-slicing.png "Click to see the full image.") +* Zoom in and zoom out by clicking on the + and - icons. +* Reset to default display by clicking on the Home icon. +* Display the image in full screen mode by clicking on the Expand icon. +* View the slide details by clicking on the "Details" button. +* Select the area of interest with the thumbnail at the top-right corner.
-During preparation of the slice, the icon on the BAM Slicing button will be spinning, and the file will be offered for download to the user as soon as ready. +[![Image Viewer](images/Image_viewer_features.png)](images/Image_viewer_features.png "Click to see the full image.") diff --git a/docs/Data_Portal/Users_Guide/images/Browse_Annotations.png b/docs/Data_Portal/Users_Guide/images/Browse_Annotations.png new file mode 100644 index 000000000..46c62f331 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/Browse_Annotations.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-Case-Example_v3.png b/docs/Data_Portal/Users_Guide/images/Exploration-Case-Example_v3.png index e28cf845c..d94b465b3 100644 Binary files a/docs/Data_Portal/Users_Guide/images/Exploration-Case-Example_v3.png and b/docs/Data_Portal/Users_Guide/images/Exploration-Case-Example_v3.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-Cases-Filter_v2.png b/docs/Data_Portal/Users_Guide/images/Exploration-Cases-Filter_v2.png new file mode 100644 index 000000000..02c44a4d4 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/Exploration-Cases-Filter_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-Clinical-Filter.png b/docs/Data_Portal/Users_Guide/images/Exploration-Clinical-Filter.png new file mode 100644 index 000000000..4748027bc Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/Exploration-Clinical-Filter.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-Gene-Example2.png b/docs/Data_Portal/Users_Guide/images/Exploration-Gene-Example2.png new file mode 100644 index 000000000..1b8894f75 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/Exploration-Gene-Example2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-Gene-Example_v2.png b/docs/Data_Portal/Users_Guide/images/Exploration-Gene-Example_v2.png new file mode 100644 index 
000000000..b0171dead Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/Exploration-Gene-Example_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-Gene-Example_v3.png b/docs/Data_Portal/Users_Guide/images/Exploration-Gene-Example_v3.png new file mode 100644 index 000000000..8fe2be99e Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/Exploration-Gene-Example_v3.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-Gene-Filter_v2.png b/docs/Data_Portal/Users_Guide/images/Exploration-Gene-Filter_v2.png new file mode 100644 index 000000000..9e15faf21 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/Exploration-Gene-Filter_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-Mutation-Example.png b/docs/Data_Portal/Users_Guide/images/Exploration-Mutation-Example.png index 51d3da719..ed53656b4 100644 Binary files a/docs/Data_Portal/Users_Guide/images/Exploration-Mutation-Example.png and b/docs/Data_Portal/Users_Guide/images/Exploration-Mutation-Example.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-Mutation-Example_v2.png b/docs/Data_Portal/Users_Guide/images/Exploration-Mutation-Example_v2.png new file mode 100644 index 000000000..39209f24f Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/Exploration-Mutation-Example_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-Mutations-Filter_v2.png b/docs/Data_Portal/Users_Guide/images/Exploration-Mutations-Filter_v2.png new file mode 100644 index 000000000..94f395178 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/Exploration-Mutations-Filter_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-Oncogrid-Color-Picker.png b/docs/Data_Portal/Users_Guide/images/Exploration-Oncogrid-Color-Picker.png new file mode 100644 index 000000000..052c85c57 Binary files /dev/null and 
b/docs/Data_Portal/Users_Guide/images/Exploration-Oncogrid-Color-Picker.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-Oncogrid-Example_v2.png b/docs/Data_Portal/Users_Guide/images/Exploration-Oncogrid-Example_v2.png new file mode 100644 index 000000000..0689672e4 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/Exploration-Oncogrid-Example_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-View-Files_v3.png b/docs/Data_Portal/Users_Guide/images/Exploration-View-Files_v3.png index c9dfb63c3..ac841e590 100644 Binary files a/docs/Data_Portal/Users_Guide/images/Exploration-View-Files_v3.png and b/docs/Data_Portal/Users_Guide/images/Exploration-View-Files_v3.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Exploration-View-Files_v4.png b/docs/Data_Portal/Users_Guide/images/Exploration-View-Files_v4.png new file mode 100644 index 000000000..66777ce4a Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/Exploration-View-Files_v4.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Analysis-Tab_v2.png b/docs/Data_Portal/Users_Guide/images/GDC-Analysis-Tab_v2.png new file mode 100644 index 000000000..7d9ac74ad Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Analysis-Tab_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Box-And-QQ-Plots.png b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Box-And-QQ-Plots.png new file mode 100644 index 000000000..9af664fe0 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Box-And-QQ-Plots.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Categorical-Bins.png b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Categorical-Bins.png new file mode 100644 index 000000000..b90eadd2d Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Categorical-Bins.png differ diff --git 
a/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Continuous-Bins-Error-Example1.png b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Continuous-Bins-Error-Example1.png new file mode 100644 index 000000000..c1719f54e Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Continuous-Bins-Error-Example1.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Continuous-Bins-Error-Example2.png b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Continuous-Bins-Error-Example2.png new file mode 100644 index 000000000..68c2057a1 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Continuous-Bins-Error-Example2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Continuous-Bins.png b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Continuous-Bins.png new file mode 100644 index 000000000..be2bb0c7d Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Continuous-Bins.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Histogram.png b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Histogram.png new file mode 100644 index 000000000..465b4bc44 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Histogram.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Survival-Plot.png b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Survival-Plot.png new file mode 100644 index 000000000..a39ce5d0f Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Analysis-Survival-Plot.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Cohort_v2.png b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Cohort_v2.png new file mode 100644 index 000000000..2cf540ba0 Binary files /dev/null and 
b/docs/Data_Portal/Users_Guide/images/GDC-Clinical-Cohort_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Cohort-Comparison-Top_v2.png b/docs/Data_Portal/Users_Guide/images/GDC-Cohort-Comparison-Top_v2.png new file mode 100644 index 000000000..6b5d0191f Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Cohort-Comparison-Top_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Enable-Clinical-Cards.png b/docs/Data_Portal/Users_Guide/images/GDC-Enable-Clinical-Cards.png new file mode 100644 index 000000000..6baf849c3 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Enable-Clinical-Cards.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Exploration-Page_v6.png b/docs/Data_Portal/Users_Guide/images/GDC-Exploration-Page_v6.png new file mode 100644 index 000000000..1b3c78a4f Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Exploration-Page_v6.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-ExplorationSet-Cohort_v2.png b/docs/Data_Portal/Users_Guide/images/GDC-ExplorationSet-Cohort_v2.png index a86e40e53..1b7dc32c5 100644 Binary files a/docs/Data_Portal/Users_Guide/images/GDC-ExplorationSet-Cohort_v2.png and b/docs/Data_Portal/Users_Guide/images/GDC-ExplorationSet-Cohort_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Gene-CancerDist.png b/docs/Data_Portal/Users_Guide/images/GDC-Gene-CancerDist.png index 92a42d89a..a27de3581 100644 Binary files a/docs/Data_Portal/Users_Guide/images/GDC-Gene-CancerDist.png and b/docs/Data_Portal/Users_Guide/images/GDC-Gene-CancerDist.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Gene-CancerDist_v2.png b/docs/Data_Portal/Users_Guide/images/GDC-Gene-CancerDist_v2.png new file mode 100644 index 000000000..0bf68368e Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Gene-CancerDist_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Gene-MFM.png 
b/docs/Data_Portal/Users_Guide/images/GDC-Gene-MFM.png index 51b224381..849785711 100644 Binary files a/docs/Data_Portal/Users_Guide/images/GDC-Gene-MFM.png and b/docs/Data_Portal/Users_Guide/images/GDC-Gene-MFM.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Gene-ProteinGraph.png b/docs/Data_Portal/Users_Guide/images/GDC-Gene-ProteinGraph.png index f58906822..c8801011c 100644 Binary files a/docs/Data_Portal/Users_Guide/images/GDC-Gene-ProteinGraph.png and b/docs/Data_Portal/Users_Guide/images/GDC-Gene-ProteinGraph.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Gene-Summary.png b/docs/Data_Portal/Users_Guide/images/GDC-Gene-Summary.png index 2b330ea8a..9da5f9994 100644 Binary files a/docs/Data_Portal/Users_Guide/images/GDC-Gene-Summary.png and b/docs/Data_Portal/Users_Guide/images/GDC-Gene-Summary.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Gene-Summary_v2.png b/docs/Data_Portal/Users_Guide/images/GDC-Gene-Summary_v2.png new file mode 100644 index 000000000..f65094952 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Gene-Summary_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Load-Clinical-Analysis.png b/docs/Data_Portal/Users_Guide/images/GDC-Load-Clinical-Analysis.png new file mode 100644 index 000000000..78bd8f975 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Load-Clinical-Analysis.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Mutation-CancerDist.png b/docs/Data_Portal/Users_Guide/images/GDC-Mutation-CancerDist.png index b737f2291..8d4b97a9a 100644 Binary files a/docs/Data_Portal/Users_Guide/images/GDC-Mutation-CancerDist.png and b/docs/Data_Portal/Users_Guide/images/GDC-Mutation-CancerDist.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Mutation-Consequences.png b/docs/Data_Portal/Users_Guide/images/GDC-Mutation-Consequences.png index 3fab97e96..6fad980d3 100644 Binary files 
a/docs/Data_Portal/Users_Guide/images/GDC-Mutation-Consequences.png and b/docs/Data_Portal/Users_Guide/images/GDC-Mutation-Consequences.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Mutation-ProteinGraph.png b/docs/Data_Portal/Users_Guide/images/GDC-Mutation-ProteinGraph.png index a49fd4f89..be6df6554 100644 Binary files a/docs/Data_Portal/Users_Guide/images/GDC-Mutation-ProteinGraph.png and b/docs/Data_Portal/Users_Guide/images/GDC-Mutation-ProteinGraph.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Mutation-Summary.png b/docs/Data_Portal/Users_Guide/images/GDC-Mutation-Summary.png index 617a54f6e..f55f5ca89 100644 Binary files a/docs/Data_Portal/Users_Guide/images/GDC-Mutation-Summary.png and b/docs/Data_Portal/Users_Guide/images/GDC-Mutation-Summary.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Mutation-Summary_v2.png b/docs/Data_Portal/Users_Guide/images/GDC-Mutation-Summary_v2.png new file mode 100644 index 000000000..a2c4d9a57 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Mutation-Summary_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Search-Clinical-Fields.png b/docs/Data_Portal/Users_Guide/images/GDC-Search-Clinical-Fields.png new file mode 100644 index 000000000..59134d24b Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Search-Clinical-Fields.png differ diff --git a/docs/Data_Portal/Users_Guide/images/GDC-Select-Clinical-Cohort.png b/docs/Data_Portal/Users_Guide/images/GDC-Select-Clinical-Cohort.png new file mode 100644 index 000000000..2cb69f3e8 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/GDC-Select-Clinical-Cohort.png differ diff --git a/docs/Data_Portal/Users_Guide/images/Repository_select_image.png b/docs/Data_Portal/Users_Guide/images/Repository_select_image.png new file mode 100644 index 000000000..d390db17b Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/Repository_select_image.png differ diff 
--git a/docs/Data_Portal/Users_Guide/images/gdc-analysis-resultstab_v2.png b/docs/Data_Portal/Users_Guide/images/gdc-analysis-resultstab_v2.png new file mode 100644 index 000000000..51ea92c12 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-analysis-resultstab_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-case-clinical-biospecimen_v4.png b/docs/Data_Portal/Users_Guide/images/gdc-case-clinical-biospecimen_v4.png new file mode 100644 index 000000000..0e9418edb Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-case-clinical-biospecimen_v4.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-case-clinical-multiple-records.png b/docs/Data_Portal/Users_Guide/images/gdc-case-clinical-multiple-records.png new file mode 100644 index 000000000..4b248cde3 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-case-clinical-multiple-records.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-case-clinical-nested-records.png b/docs/Data_Portal/Users_Guide/images/gdc-case-clinical-nested-records.png new file mode 100644 index 000000000..30c48adb2 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-case-clinical-nested-records.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-case-clinical-single-record.png b/docs/Data_Portal/Users_Guide/images/gdc-case-clinical-single-record.png new file mode 100644 index 000000000..557bc71d1 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-case-clinical-single-record.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-case-entity-mfm.png b/docs/Data_Portal/Users_Guide/images/gdc-case-entity-mfm.png index 2a6569a71..3b5363eea 100644 Binary files a/docs/Data_Portal/Users_Guide/images/gdc-case-entity-mfm.png and b/docs/Data_Portal/Users_Guide/images/gdc-case-entity-mfm.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-case-entity-page_v2.png 
b/docs/Data_Portal/Users_Guide/images/gdc-case-entity-page_v2.png new file mode 100644 index 000000000..106c4fea5 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-case-entity-page_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-data-portal-bam-slicing_v2.png b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-bam-slicing_v2.png new file mode 100644 index 000000000..77b6a8fe9 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-bam-slicing_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-data-portal-download-cart_v2.png b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-download-cart_v2.png new file mode 100644 index 000000000..81f2f0f1b Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-download-cart_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-data-portal-gdc-apps_v2.png b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-gdc-apps_v2.png new file mode 100644 index 000000000..92a53eb6c Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-gdc-apps_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page-facets2.png b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page-facets2.png new file mode 100644 index 000000000..a2f961f0b Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page-facets2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page-facets3.png b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page-facets3.png new file mode 100644 index 000000000..45a4a8350 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page-facets3.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page-facets4.png b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page-facets4.png new file mode 100644 
index 000000000..1e018b7b7 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page-facets4.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page.png b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page.png index 89b3b8ad4..d0d2bbab9 100644 Binary files a/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page.png and b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page_v2.png b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page_v2.png new file mode 100644 index 000000000..d59277f31 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page_v3.png b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page_v3.png new file mode 100644 index 000000000..155480923 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-data-portal-project-page_v3.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-input-set_v2.png b/docs/Data_Portal/Users_Guide/images/gdc-input-set_v2.png index 8eed614b4..b9e950ede 100644 Binary files a/docs/Data_Portal/Users_Guide/images/gdc-input-set_v2.png and b/docs/Data_Portal/Users_Guide/images/gdc-input-set_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-kaplan-meier-estimator.svg b/docs/Data_Portal/Users_Guide/images/gdc-kaplan-meier-estimator.svg new file mode 100644 index 000000000..8bf37b3a7 --- /dev/null +++ b/docs/Data_Portal/Users_Guide/images/gdc-kaplan-meier-estimator.svg @@ -0,0 +1,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/Data_Portal/Users_Guide/images/gdc-kaplan-meier-estimator2.png b/docs/Data_Portal/Users_Guide/images/gdc-kaplan-meier-estimator2.png new file mode 
100644 index 000000000..1f756fe09 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-kaplan-meier-estimator2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-project-entity-page_v4.png b/docs/Data_Portal/Users_Guide/images/gdc-project-entity-page_v4.png new file mode 100644 index 000000000..98cef52ab Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-project-entity-page_v4.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-projects-table-view_v2.png b/docs/Data_Portal/Users_Guide/images/gdc-projects-table-view_v2.png new file mode 100644 index 000000000..b63648566 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc-projects-table-view_v2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc-table-graph-mouse-over.png b/docs/Data_Portal/Users_Guide/images/gdc-table-graph-mouse-over.png index 06d02ffed..dd40e17ef 100644 Binary files a/docs/Data_Portal/Users_Guide/images/gdc-table-graph-mouse-over.png and b/docs/Data_Portal/Users_Guide/images/gdc-table-graph-mouse-over.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc_case_biospecimen_search_v3.png b/docs/Data_Portal/Users_Guide/images/gdc_case_biospecimen_search_v3.png new file mode 100644 index 000000000..82b5414bb Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc_case_biospecimen_search_v3.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc_project_visualizations2.png b/docs/Data_Portal/Users_Guide/images/gdc_project_visualizations2.png new file mode 100644 index 000000000..e018a9aaa Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc_project_visualizations2.png differ diff --git a/docs/Data_Portal/Users_Guide/images/gdc_project_visualizations3.png b/docs/Data_Portal/Users_Guide/images/gdc_project_visualizations3.png new file mode 100644 index 000000000..d9b93a68b Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/gdc_project_visualizations3.png 
differ diff --git a/docs/Data_Portal/Users_Guide/images/image_clinical_and_biospecimen_information.png b/docs/Data_Portal/Users_Guide/images/image_clinical_and_biospecimen_information.png new file mode 100644 index 000000000..2b8c51322 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/image_clinical_and_biospecimen_information.png differ diff --git a/docs/Data_Portal/Users_Guide/images/mutation-set-filter.png b/docs/Data_Portal/Users_Guide/images/mutation-set-filter.png index 2fc7c861c..6e8d1c95a 100644 Binary files a/docs/Data_Portal/Users_Guide/images/mutation-set-filter.png and b/docs/Data_Portal/Users_Guide/images/mutation-set-filter.png differ diff --git a/docs/Data_Portal/Users_Guide/images/mutation-set-filter_v2.png b/docs/Data_Portal/Users_Guide/images/mutation-set-filter_v2.png new file mode 100644 index 000000000..ef2ebad14 Binary files /dev/null and b/docs/Data_Portal/Users_Guide/images/mutation-set-filter_v2.png differ diff --git a/docs/Data_Submission_Portal/Release_Notes/Data_Submission_Portal_Release_Notes.md b/docs/Data_Submission_Portal/Release_Notes/Data_Submission_Portal_Release_Notes.md index 1dbdb6073..2a3bc386e 100644 --- a/docs/Data_Submission_Portal/Release_Notes/Data_Submission_Portal_Release_Notes.md +++ b/docs/Data_Submission_Portal/Release_Notes/Data_Submission_Portal_Release_Notes.md @@ -2,6 +2,13 @@ | Version | Date | |---|---| +| [v2.5.1](Data_Submission_Portal_Release_Notes.md#release-251) | August 14, 2020 | +| [v2.5.0](Data_Submission_Portal_Release_Notes.md#release-250) | July 2, 2020 | +| [v2.4.1](Data_Submission_Portal_Release_Notes.md#release-241) | March 9, 2020 | +| [v2.4.0](Data_Submission_Portal_Release_Notes.md#release-240) | November 6, 2019 | +| [v2.3.0](Data_Submission_Portal_Release_Notes.md#release-230) | June 5, 2019 | +| [v2.2.0](Data_Submission_Portal_Release_Notes.md#release-220) | February 20, 2019 | +| [v2.1.0](Data_Submission_Portal_Release_Notes.md#release-210) | November 7, 2018 | | 
[v2.0.0](Data_Submission_Portal_Release_Notes.md#release-200) | August 23, 2018 | | [v1.9.0](Data_Submission_Portal_Release_Notes.md#release-190) | May 21, 2018 | | [v1.8.0](Data_Submission_Portal_Release_Notes.md#release-180) | February 15, 2018 | @@ -15,6 +22,143 @@ | [v0.3.21](Data_Submission_Portal_Release_Notes.md#release-0321) | January 27, 2016 | | [v0.2.18.3](Data_Submission_Portal_Release_Notes.md#release-02183) | November 30, 2015 | +--- +## Release 2.5.1 + +* __GDC Product__: GDC Data Submission Portal +* __Release Date__: August 14, 2020 + +### New Features and Changes + +* Enhancements were made to the submission API to increase performance and usability. + +### Bugs Fixed Since Last Release + +* None + +### Known Issues and Workarounds + +* When creating entities in the Submission Portal, occasionally an extra transaction will appear with status error. This does not seem to impact the actual transaction, which is recorded as occurring successfully. + + +## Release 2.5.0 + +* __GDC Product__: GDC Data Submission Portal +* __Release Date__: July 2, 2020 + +### New Features and Changes + +* None. + +### Bugs Fixed Since Last Release + +* Fixed bug where the Details pane in the QC Report was displaying crowded, non-uniform buttons. + +### Known Issues and Workarounds + +* When creating entities in the Submission Portal, occasionally an extra transaction will appear with status error. This does not seem to impact the actual transaction, which is recorded as occurring successfully. + + +## Release 2.4.1 + +* __GDC Product__: GDC Data Submission Portal +* __Release Date__: March 9, 2020 + +### New Features and Changes + +* Removed duplicate queries from various pages in the Submission Portal to optimize data retrieval and rendering. + +### Bugs Fixed Since Last Release + +* Fixed bug where the right-hand detail pane in the Transactions and QC Report tabs was being cut off and not scrollable in the viewport for Windows environments (all browsers).
+* Fixed bug in the PDF file downloaded from the QC Report tab's Project Summary, where text was being cut off when browsing in Firefox or Microsoft Edge. +* Fixed bug where the TSV and JSON download buttons completely disappear and cannot be scrolled to in the Project Data Download modal, if it is shrunk beyond a certain threshold. +* Fixed bug in the Manifest download button that was trying to capture certain incorrect or unnecessary file states. +* Fixed incorrect DTT hyperlink in the GDC Apps menu. +* Fixed bug where the banner warning users that ERA Commons login was currently not working would only appear after the user logged in, thus defeating the purpose of the warning in the first place. + +### Known Issues and Workarounds + +* When creating entities in the Submission Portal, occasionally an extra transaction will appear with status error. This does not seem to impact the actual transaction, which is recorded as occurring successfully. + + +## Release 2.4.0 + +* __GDC Product__: GDC Data Submission Portal +* __Release Date__: November 6, 2019 + +### New Features and Changes + +* Added new QC Report tab that allows users to view and download QC errors detected on the current set of unsubmitted data. Users must examine these errors and fix them appropriately before re-uploading the data and requesting harmonization. New donut added to the Dashboard tab to display a quick breakdown of CRITICAL vs WARNING errors across the project. + +### Bugs Fixed Since Last Release + +* Fixed the Project Data download button in the Project Overview, so that the JSON option is selectable in the modal. +* Fixed the trash can icon for the Delete button in the Submitter Detail pane, so that it is fully visible and no longer cut off. +* Fixed a Section 508 Accessibility violation in the Submitter Detail pane. +* Increased the global transaction polling interval to 15 seconds across the portal to improve performance.
+ +### Known Issues and Workarounds + +* When creating entities in the Submission Portal, occasionally an extra transaction will appear with status error. This does not seem to impact the actual transaction, which is recorded as occurring successfully. + + +## Release 2.3.0 + +* __GDC Product__: GDC Data Submission Portal +* __Release Date__: June 5, 2019 + +### New Features and Changes + +* Added Aligned Reads, Gene Expressions, miRNA Expression to the Harmonized Data Files list in the Browse tab. + +### Bugs Fixed Since Last Release + +* Fixed logic on the Submittable Data Files donut to more accurately display the number of submittable files that have been validated. Specifically, both the file state and corresponding note state must be validated. Also updated the corresponding tooltip text. + +### Known Issues and Workarounds + +* When creating entities in the Submission Portal, occasionally an extra transaction will appear with status error. This does not seem to impact the actual transaction, which is recorded as occurring successfully. + + +## Release 2.2.0 + +* __GDC Product__: GDC Data Submission Portal +* __Release Date__: February 20, 2019 + +### New Features and Changes + +* Renamed the "Request Submission" button to "Request Harmonization" to make the purpose of this action more clear. + +### Bugs Fixed Since Last Release + +* Fixed the right scroll bar in the records list on the Browse page so that it works in Firefox. +* Fixed a dead link to the Submission Portal User Guide on the Dashboard. + +### Known Issues and Workarounds + +* When creating entities in the Submission Portal, occasionally an extra transaction will appear with status error. This does not seem to impact the actual transaction, which is recorded as occurring successfully.
+ + +## Release 2.1.0 + +* __GDC Product__: GDC Data Submission Portal +* __Release Date__: November 7, 2018 + +### New Features and Changes + +* Updated the project columns to include a Release column in addition to the Batch Submit column. + +### Bugs Fixed Since Last Release + +* Fixed quick search so that projects with a dash in the name will no longer break the search. +* PO reports will now return the latest data for each project that has completed running. + +### Known Issues and Workarounds + +* When creating entities in the Submission Portal, occasionally an extra transaction will appear with status error. This does not seem to impact the actual transaction, which is recorded as occurring successfully. + + ## Release 2.0.0 * __GDC Product__: GDC Data Submission Portal @@ -128,8 +272,8 @@ Release details are maintained in the [GDC Data Submission Portal Change Log](ht ### New Features and Changes -* Added ability to delete an entity. Read more about this [here](http://docs.gdc.cancer.gov/Data_Submission_Portal/Users_Guide/Data_Upload_UG/#deleting-submitted-entities) -* Added Project Reports in the projects list page. Read more about this [here](http://docs.gdc.cancer.gov/Data_Submission_Portal/Users_Guide/Homepage/#reports). +* Added ability to delete an entity. Read more about this [here](https://docs.gdc.cancer.gov/Data_Submission_Portal/Users_Guide/Data_Submission_Walkthrough/#deleting-submitted-entities). +* Added Project Reports in the projects list page. Read more about this [here](https://docs.gdc.cancer.gov/Data_Submission_Portal/Users_Guide/Data_Submission_Process/#reports).
* To avoid confusion, renamed "Status" to "State" in the Browse section * Added tooltip over Hierarchy title when reviewing an entity * Restrict the upload window to only supported data formats (JSON and TSV) diff --git a/docs/Data_Submission_Portal/Users_Guide/Best_Practices.md b/docs/Data_Submission_Portal/Users_Guide/Best_Practices.md index 7111f6bdc..5fa3ac7fc 100644 --- a/docs/Data_Submission_Portal/Users_Guide/Best_Practices.md +++ b/docs/Data_Submission_Portal/Users_Guide/Best_Practices.md @@ -1,6 +1,6 @@ # Submission Best Practices -Because of the data types and relationships included in the GDC, data submission can become a complex procedure. The purpose of this section is to present guidelines that will aid in the incorporation and harmonization of submitters' data. Please contact the GDC Help Desk at __support@nci-gdc.datacommons.io__ if you have any questions or concerns regarding a submission project. +Because of the data types and relationships included in the GDC, data submission can become a complex procedure. The purpose of this section is to present guidelines that will aid in the incorporation and harmonization of submitters' data. Please contact the GDC Help Desk at ____ if you have any questions or concerns regarding a submission project. ## Date Obfuscation @@ -8,66 +8,60 @@ The GDC is committed to providing accurate and useful information as well as pro ### General Guidelines -Actual calendar dates are not reported in GDC clinical fields but the lengths of time between events are preserved. Time points are reported based on the number of days since the patient's initial diagnosis. Events that occurred after the initial diagnosis are reported as positive and events that occurred before are reported as negative. Dates are not automatically obfuscated by the GDC validation system and submitters are required to make these changes in their clinical data. 
+Actual calendar dates are not reported in GDC clinical fields but the lengths of time between events are preserved. Time points are reported based on the number of days since the patient's initial diagnosis. Events that occurred after the initial diagnosis are reported as positive and events that occurred before are reported as negative. Dates are not automatically obfuscated by the GDC validation system and submitters are required to make these changes in their clinical data. This affects these fields: `days_to_birth`, `days_to_death`, `days_to_last_follow_up`, `days_to_last_known_disease_status`, `days_to_recurrence`, `days_to_treatment` -| Affected Fields | -| --- | -| `days_to_birth` | -| `days_to_death` | -| `days_to_last_follow_up` | -| `days_to_last_known_disease_status` | -| `days_to_recurrence` | -| `days_to_treatment` | +>__Note:__ The day-based fields take leap years into account. -### Patients Older than 90 Years +### Patients Older than 90 Years and Clinical Events Because of the low population number within the demographic of patients over 90 years old, it becomes more likely that patients can potentially be identified by a combination of their advanced age and publicly available clinical data. Because of this, patients over 90 years old are reported as exactly 90 years or 32,872 days old. -__Note:__ The day-based fields take leap years into account. - -### Clinical Events After a Patient Turns 90 Years Old - -Clinical events that occur over 32,872 days after an event also have the potential to reveal the age and identity of an individual over the age of 90. Following this, all timelines are capped at 32,872 days. When timelines are capped, the priority should be to shorten the post-diagnosis values to preserve the accuracy of the age of the patient (except for patients who were diagnosed at over 90 years old). Values such as `days_to_death` and `days_to_recurrence` should be compressed before `days_to_birth` is compressed. 
+Following this, clinical events that occur over 32,872 days are also capped at 32,872 days. When timelines are capped, the priority should be to shorten the post-diagnosis values to preserve the accuracy of the age of the patient (except for patients who were diagnosed at over 90 years old). Values such as `days_to_death` and `days_to_recurrence` should be compressed before `days_to_birth` is compressed. ### Examples Timelines __Example 1:__ An 88 year old patient is diagnosed with cancer and dies 13 years later. The `days_to_birth` value is less than 32,872 days, so it can be accurately reported. However, between the initial diagnosis and death, the patient turned 90 years old. Since 32,872 is the maximum, `days_to_death` would be calculated as 32872 - 32142 = 730. -__Dates__ +>__Dates__ -* _Date of Birth:_ 01-01-1900 -* _Date of Initial Diagnosis:_ 01-01-1988 -* _Date of Death:_ 01-01-2001 +>* _Date of Birth:_ 01-01-1900 +>* _Date of Initial Diagnosis:_ 01-01-1988 +>* _Date of Death:_ 01-01-2001 -__Actual-Values__ +>__Actual-Values__ -* _days_to_birth:_ -32142 -* _days_to_death:_ 4748 +>* _days_to_birth:_ -32142 +>* _days_to_death:_ 4748 -__Obfuscated-Values__ +>__Obfuscated-Values__ -* _days_to_birth:_ -32142 -* _days_to_death:_ 730 +>* _days_to_birth:_ -32142 +>* _days_to_death:_ 730 __Example 2:__ A 98 year old patient is diagnosed with cancer and dies three years later. Because `days_to_X` values are counted from initial diagnosis, days will be at their maximum value of 32,872 upon initial diagnosis. This will compress the later dates and reduce `days_to_birth` to -32,872 and `days_to_death` to zero. 
-__Dates__ +>__Dates__ + +>* _Date of Birth:_ 01-01-1900 +>* _Date of Initial Diagnosis:_ 01-01-1998 +>* _Date of Death:_ 01-01-2001 + +>__Actual-Values__ -* _Date of Birth:_ 01-01-1900 -* _Date of Initial Diagnosis:_ 01-01-1998 -* _Date of Death:_ 01-01-2001 +>* _days_to_birth:_ -35794 +>* _days_to_death:_ 1095 -__Actual-Values__ +>__Obfuscated-Values__ -* _days_to_birth:_ -35794 -* _days_to_death:_ 1095 +>* _days_to_birth:_ -32872 +>* _days_to_death:_ 0 -__Obfuscated-Values__ +## Array Submission -* _days_to_birth:_ -32872 -* _days_to_death:_ 0 +Certain fields in the GDC, such as diagnosis.sites_of_involvement, are of type "array". This allows multiple values to be submitted on one property. These values need to be uploaded in a `|`-delimited format. See the example below. +__Example:__ `"sites_of_involvement" : "Kidney, Upper Pole|Kidney, Middle"` ## Submitting Complex Data Model Relationships @@ -97,28 +91,32 @@ submitted_aligned_reads Alignment.bam Raw Sequencing Data BAM Aligned Reads W } ``` -## Read groups +### Read groups -### Submitting Read Group Names +#### Submitting Read Group Names The `read_group` entity requires a `read_group_name` field for submission. If the `read_group` entity is associated with a BAM file, the submitter should use the `@RG` ID present in the BAM header as the `read_group_name`. This is important for the harmonization process and will reduce the possibility of errors. -### Minimal Read Group Information +#### Multiple FASTQs from One Read Group -In addition to the required properties on `read_group` we also recommend submitting `flow_cell_barcode`, `lane_number` and `multiplex_barcode`. This information can be used by our bioinformatics team and data downloaders to construct a `Platform Unit` (`PU`), which is a universally unique identifier that can be used to model various sequencing technical artifacts. More information can be found in the SAM specification (https://github.com/samtools/hts-specs/blob/master/SAMv1.pdf). 
+To align reads according to their direction and pair, the GDC requires that unaligned forward and reverse reads are submitted as "submitted_unaligned_reads." When more than one FASTQ exists for a read group direction, the GDC requires that the FASTQ files are concatenated for each direction. In other words, each paired-end read group should be associated with exactly two FASTQ files (submitted_unaligned_reads). -For projects with library strategies of targeted sequencing or WXS we also require information on the target capture protocol included on `target_capture_kit` +#### Minimal and Recommended Read Group Information -If this information is not provided it may cause a delay in the processing of submitted data. +In addition to the required properties on `read_group` we also recommend submitting `flow_cell_barcode`, `lane_number` and `multiplex_barcode`. This information can be used by our bioinformatics team and data downloaders to construct a `Platform Unit` (`PU`), which is a universally unique identifier that can be used to model various sequencing technical artifacts. More information can be found in the [SAM specification PDF](https://github.com/samtools/hts-specs/blob/master/SAMv1.pdf). + +For projects with library strategies of targeted sequencing or WXS we also require information on the target capture protocol included on `target_capture_kit`. -### Recommended Read Group Information +If this information is not provided it may cause a delay in the processing of submitted data. -Additional read group information will benefit data users. Such information can be used by bioinformatics pipelines and will aid understanding and mitigation of batch effects. If available you should also provide as many of the remaining read group properties as possible. +Additional read group information will benefit data users. Such information can be used by bioinformatics pipelines and will aid understanding and mitigation of batch effects. 
If available, you should also provide as many of the remaining read group properties as possible. ## Submission File Quality Control The GDC harmonization pipelines include multiple quality control steps to ensure the integrity of harmonized files and the efficient use of computational resources. For fastest turnaround of data processing we recommend that submitters perform basic QC of their data files prior to upload to identify any underlying data quality issues. This may include such tests as verifying expected genome coverage levels and sequencing quality. +Except for [miRNA data](https://docs.gdc.cancer.gov/Data_Submission_Portal/Users_Guide/Best_Practices/#mirna-submission) submission, sequencing data for all other experimental strategy types (i.e. whole exome sequencing, whole genome sequencing, targeted sequencing and mRNA sequencing) do not need to be trimmed by submitters prior to submission, as tools used in GDC alignment workflows are capable of handling adaptors and low quality bases correctly. + ## Target Capture Kit Q and A 1. What is a Target Capture Kit? @@ -129,19 +127,19 @@ Target region information is important for DNA-Seq variant calling and filtering 3. How do submitters provide this information? There are 3 steps - * Step 1. The submitter should contact GDC User Service about any new Target Capture Kits that do not already exist in the GDC Dictionary. The GDC Bioinformatics and User Services teams will work together with the submitter to create a meaningful name for the kit, and import this name and Target Region Bed File into the GDC. + * Step 1. The submitter should contact GDC User Service about any new Target Capture Kits that do not already exist in the GDC Dictionary. The GDC Bioinformatics and User Services teams will work together with the submitter to create a meaningful name for the kit and import this name and Target Region Bed File into the GDC. * Step 2. 
The submitter can then select one and only one GDC Target Capture Kit for each read group during molecular data submission. - * Step 3. The submitter should also selection the appropriate `library_strategy` and `library_selection` on the read_group entity. + * Step 3. The submitter should also select the appropriate `library_strategy` and `library_selection` on the read_group entity. 4. What is a Target Region Bed File? -A Target Region Bed File is tab-delimited file describing the kit target region in bed format (https://genome.ucsc.edu/FAQ/FAQformat.html#format1). The first 3 columns of such files are chrom, chromStart, and chromEnd. +A Target Region Bed File is a tab-delimited file describing the kit target region in [bed format](https://genome.ucsc.edu/FAQ/FAQformat.html#format1). The first 3 columns of such files are chrom, chromStart, and chromEnd. Note that by definition, bed files are 0-based or "left-open, right-closed", which means bed interval "chr1 10 20" only contains 10 bases on chr1, from the 11th to the 20th. In addition, submitters should also let GDC know the genome build (hg18, hg19 or GRCh38) of their bed files. 5. Is a Target Capture Kit uniquely defined by its Target Region Bed File? -Not necessary. Sometimes, users or manufactures may want to augment an existing kit with additional probes, in order to capture more regions or simply improve the quality of existing regions. In the later case, the bed file stays the same, but it is now a different Target Capture Kit and should be registered separately as described in Step 3 above. +Not necessarily. Sometimes, users or manufacturers may want to augment an existing kit with additional probes, in order to capture more regions or simply improve the quality of existing regions. In the latter case, the bed file stays the same, but it is now a different Target Capture Kit and should be registered separately as described in Step 3 above.
-## Specifying Tumor Normal Pairs for analysis +## Specifying Tumor Normal Pairs for Analysis It is critical for many cancer bioinformatics pipelines to specify which normal sample to use to factor out germline variation. In particular, this is a necessary specification for all tumor normal paired variant calling pipelines. The following details describe how the GDC determines which normal sample to use for variant calling. @@ -150,18 +148,24 @@ It is critical for many cancer bioinformatics pipelines to specify which normal * If there are multiple normals of the same experimental_strategy for a case: * Users can specify which normal to use by specifying on the aliquot. To do so one of the following should be set to `TRUE` for the specified experimental strategy: `selected_normal_low_pass_wgs`, `selected_normal_targeted_sequencing`, `selected_normal_wgs`, or `selected_normal_wxs`. * Or if no normal is specified the GDC will select the best normal for that patient based on the following criteria. This same logic will also be used if multiple normal are selected. - * If a case has blood cancer we will use sample type in the following priority order: Blood Derived Normal > Bone Marrow Normal > Mononuclear Cells from Bone Marrow Normal > Fibroblasts from Bone Marrow Normal > Lymphoid Normal > Buccal Cell Normal > Solid Tissue Normal > EBV Immortalized Normal - * If case does not have blood cancer we will use sample type in the following priority order: - Solid Tissue Normal > Buccal Cell Normal > Lymphoid Normal > Fibroblasts from Bone Marrow Normal > Mononuclear Cells from Bone Marrow Normal > Bone Marrow Normal > Blood Derived Normal > EBV Immortalized Normal - * If there are still ties we will choose the aliquot submitted first -* If there are no normals - * The GDC will not run tumor only variant calling pipeline by default. 
The submitter must specify one of the following properties as TRUE: `no_matched_normal_low_pass_wgs`, `no_matched_normal_targeted_sequencing`, `no_matched_normal_wgs`, `no_matched_normal_wxs`. + * If a case has blood cancer we will use sample type in the following priority order: + + Blood Derived Normal > Bone Marrow Normal > Mononuclear Cells from Bone Marrow Normal > Fibroblasts from Bone Marrow Normal > Lymphoid Normal > Buccal Cell Normal > Solid Tissue Normal > EBV Immortalized Normal -Note that we will only run variant calling for a particular tumor aliquot per experimental strategy once. You must make sure that the appropriate normal control is uploaded to the GDC when Requesting Submission. Uploading a different normal sample later will not result in reanalysis by the GDC. + * If a case does not have blood cancer we will use sample type in the following priority order: + Solid Tissue Normal > Buccal Cell Normal > Lymphoid Normal > Fibroblasts from Bone Marrow Normal > Mononuclear Cells from Bone Marrow Normal > Bone Marrow Normal > Blood Derived Normal > EBV Immortalized Normal + * If there are still ties, we will choose the aliquot submitted first. +* If there are no normals. + * The GDC will not run tumor only variant calling pipeline by default. The submitter must specify one of the following properties as TRUE: `no_matched_normal_low_pass_wgs`, `no_matched_normal_targeted_sequencing`, `no_matched_normal_wgs`, `no_matched_normal_wxs`. +Note that we will only run variant calling for a particular tumor aliquot per experimental strategy once. You must make sure that the appropriate normal control is uploaded to the GDC when Requesting Submission. Uploading a different normal sample later will not result in reanalysis by the GDC. ## Clinical Data Requirements For the GDC to release a project there is a minimum number of clinical properties that are required. Minimal cross-project GDC requirements include age, gender, and diagnosis information. 
Other requirements may be added when the submitter is approved for submission to the GDC. + +## miRNA Submission + +The GDC requires that miRNA reads be adapter-trimmed before being uploaded to the GDC because miRNA datasets can have different trimming schemas. Uploading untrimmed miRNA reads will result in unusably low miRNA quantifications. diff --git a/docs/Data_Submission_Portal/Users_Guide/Checklist.md b/docs/Data_Submission_Portal/Users_Guide/Checklist.md new file mode 100644 index 000000000..d984f32c7 --- /dev/null +++ b/docs/Data_Submission_Portal/Users_Guide/Checklist.md @@ -0,0 +1,39 @@ +# Before Submitting Data to the GDC Portal + +## Overview +The National Cancer Institute (NCI) Genomic Data Commons (GDC) Data Submission Portal User's Guide is the companion documentation for the [GDC Data Submission Portal](https://gdc.cancer.gov/submit-data/gdc-data-submission-portal) and provides detailed information and instructions for its use. + +## Steps to Submit Data to the GDC +The following tasks are required to submit data to the [GDC Data Submission Portal](https://portal.gdc.cancer.gov/). + +1. Complete the GDC Data [Submission Request Form](https://gdc.cancer.gov/data-submission-request-form). After submission, the request will be reviewed by the GDC Data Submission Review Committee. During this time, create an [eRA Commons account](https://era.nih.gov/registration_accounts.cfm) if you do not already have one. + +2. If the study is approved, contact a [Genomic Program Administrator (GPA)](https://osp.od.nih.gov/genomic-program-administrators/) to register the approved study in [dbGaP](https://www.ncbi.nlm.nih.gov/sra/docs/submitdbgap). This includes registering the project as a GDC Trusted Partner study, registering cases, and adding authorized data submitters. For more information, see the [Data Submission Process](https://gdc.cancer.gov/submit-data/data-submission-processes-and-tools). + +3. Contact GDC User Services to create a submission project.
The User Services team will require a project ID, which is a two-part identifier, where the first portion is the __Program__ followed by a hyphen (__-__) and the second portion is the __Project__. This must be alphanumeric and all caps only. An example would be `TCGA-BRCA`. You must also create a project name, which can be longer and has fewer requirements on length or character usage. An example would be `Breast Invasive Carcinoma`. + +## Key Features +The GDC Data Submission Portal is a platform that allows researchers to submit and release data to the GDC. The key features of the GDC Data Submission Portal are: + +* __Upload and Validate Data__: Project data can be uploaded to the GDC project workspace. The GDC will validate the data against the [GDC Data Dictionary](../../Data_Dictionary/viewer.md). +* __Browse Data__: Data that has been uploaded to the project workspace can be browsed to ensure that the project is ready for processing. +* __Download Data__: Data that has been uploaded into the project workspace can be downloaded for review or update by using the [API](https://docs.gdc.cancer.gov/API/Users_Guide/Downloading_Files/) or the [Data Transfer Tool](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool). +* __Review and Submit Data__: Prior to submission, data can be reviewed to check for accuracy and completeness. Once the review is complete, the data can be submitted to the GDC for processing through [Data Harmonization](https://gdc.cancer.gov/submit-data/gdc-data-harmonization). +* __Release Data__: After harmonization, data can be released to the research community for access through the [GDC Data Portal](https://portal.gdc.cancer.gov/) and other [GDC Data Access Tools](https://gdc.cancer.gov/access-data/data-access-processes-and-tools). 
+* __Status and Alerts__: Visual cues are implemented in the GDC Data Submission Portal Dashboard to easily identify incomplete submissions via panel displays summarizing submitted data and associated data elements. +* __Transactions__: A list of all actions performed in a project is provided with detailed information for each action. + +## Sections to the Data Submission Portal Guide + +* [__Data Submission Overview__](Data_Submission_Overview.md): Graphical explanations used to display the life cycle of projects and their data. +* [__Data Submission Process__](Data_Submission_Process.md): An overview of the data submission process using the GDC Data Submission Portal. +* [__Data Submission Walkthrough__](Data_Submission_Walkthrough.md): Step-by-step instructions on GDC data submission and their relationship to the GDC Data Model. +* [__Pre-Release Data Portal__](Pre_Release_QC.md): Instructions on how to use the Pre-Release Data Portal for projects that have been harmonized but not released. + +## HIPAA Compliance + +The GDC will not accept any data for patients age 90 and over including any follow-up events in which the event occurs after a patient turns 90 to ensure that HIPAA compliance is maintained. To comply with these requirements data submitters may omit any data (entire cases or specific nodes) that would violate this rule or obfuscate associated dates. Please see the [Date Obfuscation](/Data_Submission_Portal/Users_Guide/Best_Practices/#date-obfuscation) section for more information. + +## Release Notes + +The [Release Notes](../../Data_Submission_Portal/Release_Notes/Data_Submission_Portal_Release_Notes.md) section of this User's Guide contains details about new features, bug fixes, and known issues. 
diff --git a/docs/Data_Submission_Portal/Users_Guide/Dashboard.md b/docs/Data_Submission_Portal/Users_Guide/Dashboard.md index 9221e34af..558e06c6f 100644 --- a/docs/Data_Submission_Portal/Users_Guide/Dashboard.md +++ b/docs/Data_Submission_Portal/Users_Guide/Dashboard.md @@ -4,7 +4,6 @@ The GDC Data Submission Portal dashboard provides details about a specific project. -[![GDC Submission Dashboard Page](images/GDC_Submission_Dashboard_2.png)](images/GDC_Submission_Dashboard_2.png "Click to see the full image.") [![GDC Submission Dashboard Page-2](images/GDC_Submission_Dashboard_3.png)](images/GDC_Submission_Dashboard_3.png "Click to see the full image.") The dashboard contains various visual elements to guide the user through all stages of submission, from viewing the [Data Dictionary](https://docs.gdc.cancer.gov/Data_Dictionary/) in support of data upload to submitting a project for harmonization. @@ -16,14 +15,17 @@ The Project Overview sections of the dashboard displays the project state (open The search field at the top of the dashboard allows for submitted entities to be searched by partial or whole `submitter_id`. When a search term is entered into the field, a list of entities matching the term is updated in real time. Selecting one of these entities links to its details in the [Browse Tab](Browse_Data.md) -The remaining part of the top section of the dashboard is broken down into four status charts: +The remaining part of the top section of the dashboard is broken down into five status charts: +* __QC Errors__: The number of QC errors identified within the data that has been uploaded. Refer to [QC Reports](/Data_Submission_Portal/Users_Guide/Data_Submission_Process/#qc-reports) for more information. * __Cases with Clinical__: The number of `cases` for which Clinical data has been uploaded. * __Cases with Biospecimen__: The number of `cases` for which Biospecimen data has been uploaded. 
* __Cases with Submittable Data Files__: The number of `cases` for which experimental data has been uploaded. * __Submittable Data Files__: The number of files uploaded through the GDC Data Transfer Tool. For more information on this status chart, please refer to [File Status Lifecycle](Submission_Workflow/#file-status-lifecycle). The _'DOWNLOAD MANIFEST'_ button below this status chart allows the user to download a manifest for registered files in this project that have not yet been uploaded. +\* Note that the QC Errors will not be immediately updated after submission of a new file. + Status charts are constantly updated to reflect the current state of the selected project. ## Action Panels diff --git a/docs/Data_Submission_Portal/Users_Guide/Data_Submission_Overview.md b/docs/Data_Submission_Portal/Users_Guide/Data_Submission_Overview.md new file mode 100644 index 000000000..ed70cf5f9 --- /dev/null +++ b/docs/Data_Submission_Portal/Users_Guide/Data_Submission_Overview.md @@ -0,0 +1,111 @@ +# Data Submission Overview + +## Overview +This section will walk users through two parts of the submission process. The first portion will be the steps taken by the users to go through the submission process from start to finish. The second portion will describe the lifecycle of a project and a file throughout the data submission process. + +## GDC Data Submission Workflow + +The diagram below illustrates the process from uploading through releasing data in the GDC Data Submission Portal. 
To review the steps needed before beginning submission see [Before Submitting Data to the GDC Portal](https://docs.gdc.cancer.gov/Data_Submission_Portal/Users_Guide/Checklist/) + +[![GDC Data Submission Portal Workflow Upload](images/GDC_Data_Submission_Workflow-updated_20190301.jpg)](images/GDC_Data_Submission_Workflow-updated_20190301.jpg "Click to see the full image.") + +### Review GDC Dictionary and GDC Data Model - Submitter Activity + +It is suggested that all submitters review the [GDC Dictionary](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/) and [GDC Data Model](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components). It is beneficial for submitters to know which nodes will need metadata submission, how these nodes relate to each other, and what information is required for each node in the model. + +### Download Templates - Submitter Activity + +After determining the required nodes for the submission, go to each node page in the [GDC Dictionary](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/). There will be a "Download Template" drop down list. Select the file format, either TSV or JSON, and download the template for the node. If [numerous entries](Data_Submission_Walkthrough.md#submitting-numerous-cases) are being submitted all at one time, it is suggested that the user uses a TSV template. At this point, it is suggested to go through the template and remove fields that will not be populated by the metadata submission, but make sure to complete all fields that are required for the node. For more information about the Data Dictionary, please visit [here](../../../Data_Dictionary/). 
+ +[`See GDC Data Dictionary here.`](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/) + +### Upload Case Information Including dbGaP Submitted Subject IDs - Submitter Activity + +After registering the study in [dbGaP](https://gdc.cancer.gov/submit-data/obtaining-access-submit-data), the first node to be created in the data model is the [`case` node](Data_Submission_Walkthrough.md#case-submission). The `case` node is important as it will contain a unique `submitter_id` that is registered in dbGaP under a particular project. This will connect the two databases, dbGaP and GDC, and allows for access to be granted to a controlled data set based on the study and its cases. + +To [submit the `case`](Data_Submission_Walkthrough.md#uploading-the-case-submission-file) nodes, a user must be able to [login](Data_Submission_Process.md#authentication) and access the [GDC Submission Portal](https://portal.gdc.cancer.gov/submission/) for their respective project. Metadata for all nodes are uploaded via the [API](https://docs.gdc.cancer.gov/API/Users_Guide/Submission/#creating-and-updating-entities) or through the [Submission Portal](Data_Submission_Walkthrough.md#upload-using-the-gdc-data-submission-portal). + +[`See case example here.`](Data_Submission_Walkthrough.md#case-submission) + +[`See metadata upload example here.`](Data_Submission_Walkthrough.md#upload-using-the-gdc-data-submission-portal) + +### Upload Clinical and Biospecimen Data - Submitter Activity + +With the creation of `case` nodes, other nodes in the [data model](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components) can be [uploaded](Data_Submission_Walkthrough.md#upload-using-the-gdc-data-submission-portal). This includes the [Clinical](Data_Submission_Walkthrough.md#clinical-data-submission) and [Biospecimen](Data_Submission_Walkthrough.md#biospecimen-submission) nodes, with examples for each that can be found in the [Data Upload Walkthrough](Data_Submission_Walkthrough.md). 
+ +[`See clinical example here.`](Data_Submission_Walkthrough.md#clinical-data-submission) + +[`See biospecimen example here.`](Data_Submission_Walkthrough.md#biospecimen-submission) + +[`See metadata upload example here.`](Data_Submission_Walkthrough.md#upload-using-the-gdc-data-submission-portal) + +### Register Data Files - Submitter Activity + +Registering data files is necessary before they can be uploaded. This allows the GDC to later validate the uploads against the user-supplied md5sum and file size. The [submission](Data_Submission_Walkthrough.md#experiment-data-submission) of these files can range from clinical and biospecimen supplements to `submitted_aligned_reads` and `submitted_unaligned_reads`. + +[`See experiment data example here.`](Data_Submission_Walkthrough.md#experiment-data-submission) + +### Upload Data Using Data Transfer Tool - Submitter Activity + +Before uploading the submittable data files to the GDC, a user will need to determine if the correct nodes have been created and the information within them are correct. This is accomplished using the [Browse](Data_Submission_Process.md#browse) page in the [Data Submission Portal](https://portal.gdc.cancer.gov/submission). Here you can find the metadata and file_state, which must have progressed to `registered` for an associated file to be uploaded. You can find more about the file life cycle [here](#file-lifecycle). + +Once the submitter has verified that the submittable data files have been registered, the user can obtain the submission manifest file that is found on the [Project Overview](Data_Submission_Process.md#project-overview) page. From this point the submission process is described in the ["Uploading the Submittable Data File to the GDC"](Data_Submission_Walkthrough.md#uploading-the-submittable-data-file-to-the-gdc) section. 
+ +For strategies on data upload, further documentation for the GDC Data Submission process is detailed on the [Data Submission Processes and Tools](https://gdc.cancer.gov/submit-data/data-submission-processes-and-tools) section of the GDC Website. + +[`See submittable data file upload example here.`](Data_Submission_Walkthrough.md#uploading-the-submittable-data-file-to-the-gdc) + +### Verify Accuracy and Completeness of Project Data (Project QC) - Submitter Activity + +The submitter is responsible for reviewing the data uploaded to the project workspace and ensuring there are no critical QC errors, see [Data Submission Walkthrough](Data_Submission_Walkthrough.md), and ensuring that it is ready for processing by the GDC [Harmonization Process](https://gdc.cancer.gov/submit-data/gdc-data-harmonization). A user should be able to go through the [Pre-Harmonization Checklist](Data_Submission_Process.md#pre-harmonization-checklist), and verify that their submission meets these criteria. + +[`See pre-harmonization checklist here.`](Data_Submission_Process.md#pre-harmonization-checklist) + +### Request Data Harmonization - Submitter Activity + +When the project is complete and ready for processing, the submitter will [request harmonization](Data_Submission_Process.md#submit-your-workspace-data-to-the-gdc). If the project is not ready for processing, the project can be re-opened and the submitter will be able to upload more data to the project workspace. + +[`See harmonization request example here.`](Data_Submission_Process.md#submit-your-workspace-data-to-the-gdc) + +> __NOTE:__ The GDC requests that users submit their data to the GDC within six months from the first upload of data to the project workspace. + +### GDC Review/QC Submitted Data - GDC Activity + +The Bioinformatics Team at the GDC runs the Quality Control pipeline on the submitted data. 
This pipeline mirrors the [Pre-Harmonization Checklist](Data_Submission_Process.md#pre-harmonization-checklist) and will determine if the submission is complete and is ready for the Harmonization pipeline. If the submission does contain problems, the GDC will contact the user to "Re-Open" the project and fix the errors in their submission. + +Once the review is complete, all validated nodes will be changed to state "submitted". At this point users can submit more files to a project, but they will be considered as a different batch for harmonization. + +### GDC Harmonize Data - GDC Activity + +After the submission passes the GDC Quality Control pipeline, it will be queued for the [GDC Harmonization pipeline](https://gdc.cancer.gov/about-data/gdc-data-harmonization). + +### Submitter Review/QC of Harmonized Data - Submitter Activity + +After the data is processed in the Harmonization pipeline, the GDC asks submitters to [verify the quality](https://portal.gdc.cancer.gov/submission/login?next=%2Fsubmission%2F) of their harmonized data. It is the user's responsibility to notify the GDC of any errors in their harmonized data sets. The GDC will then work with the user to correct the issue and rerun the Harmonization pipeline if needed. + +### Release Data Within Six Months - Submitter Activity + +Project release occurs after the data has been harmonized, and allows users to access this data with the [GDC Data Portal](https://portal.gdc.cancer.gov/) and other [GDC Data Access Tools](https://gdc.cancer.gov/access-data/data-access-processes-and-tools). The GDC will release data according to [GDC Data Sharing Policies](https://gdc.cancer.gov/submit-data/data-submission-policies). Data must be released within six months after GDC data processing has been completed, or the submitter may request earlier release. + +[`See release example here.`](Data_Submission_Process.md#release) + +>__Note__: Released cases and/or files can be redacted from the GDC. 
For more information, visit the [GDC Policies page (under GDC Data Sharing Policies)](https://gdc.cancer.gov/submit-data/data-submission-policies). + +### GDC Releases Data - GDC Activity + +GDC data releases are not continuous, but instead are released in discrete data updates. Once harmonized data is approved and release request is approved, data will be available in an upcoming GDC Data Release. + +## Project and File Lifecycles + +### Project Lifecycle +The diagram of the project lifecycle below demonstrates the transition of a project through the various states. Initially the project is open for data upload and validation. Any changes to the data must be made while the project status is open. When the data is uploaded and ready for review, the submitter changes the project state to review. During the review state, the project is locked and additional data cannot be uploaded. If data changes are needed during the review period, the project has to be re-opened. + +The process of Harmonization does not occur immediately after submitted files are uploaded. After the submission is complete and all the necessary data and files have been uploaded, the user submits the data to the GDC for processing through the [GDC Data Harmonization Pipelines](https://gdc.cancer.gov/submit-data/gdc-data-harmonization) and the project state changes to submitted. When the data has been processed, the project state changes back to open for new data to be submitted to the project and the submitter can review the processed data. After review of the processed data, the submitter can then release the harmonized data to the [GDC Data Portal](https://portal.gdc.cancer.gov/) and other [GDC Data Access Tools](https://gdc.cancer.gov/access-data/data-access-processes-and-tools) according to [GDC Data Sharing Policies](https://gdc.cancer.gov/submit-data/data-submission-policies). 
+ +[![GDC Data Submission Portal Workflow](images/Submission.png)](images/Submission.png "Click to see the full image.") + +### File Lifecycle + +This section describes states pertaining to submittable data files throughout the data submission process. A submittable data file could contain data such as genomic sequences (such as a BAM or FASTQ) or pathology slide images. The file lifecycle starts when a submitter uploads metadata for a file to the [GDC Data Submission Portal](https://portal.gdc.cancer.gov/submission/). This metadata file registers a description of the file as an entity on the GDC; the status for this is known as "state" and is represented by __purple__ circles. The submitter can then use the [GDC Data Transfer Tool](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool) to upload the actual file, which is represented by __red__ circles. Throughout the lifecycle, the file and its associated entity transition through various states from when they are initially registered through file submission and processing. The diagram below details these state transitions. + +[![GDC Data Submission Portal File Status](images/gdc-submission-portal-file-state-vs-state.png)](images/gdc-submission-portal-file-state-vs-state.png "Click to see the full image.") diff --git a/docs/Data_Submission_Portal/Users_Guide/Data_Submission_Process.md b/docs/Data_Submission_Portal/Users_Guide/Data_Submission_Process.md new file mode 100644 index 000000000..2dec9e9be --- /dev/null +++ b/docs/Data_Submission_Portal/Users_Guide/Data_Submission_Process.md @@ -0,0 +1,364 @@ +# Data Submission Portal + +## Overview + +This section will walk users through the submission process using the [GDC Data Submission Portal](https://portal.gdc.cancer.gov/submission/) to upload files to the GDC. + +## Authentication + +### Requirements + +Accessing the GDC Data Submission Portal requires eRA Commons credentials with appropriate dbGaP authorization. 
To learn more about obtaining the required credentials and authorization, see [Obtaining Access to Submit Data]( https://gdc.cancer.gov/submit-data/obtaining-access-submit-data). + +### Authentication via eRA Commons + +Users can log into the GDC Data Submission Portal with eRA Commons credentials by clicking the "Login" button. If authentication is successful, the user will be redirected to the GDC Data Submission Portal front page and the user's eRA Commons username will be displayed in the upper right corner of the screen. + +#### GDC Authentication Tokens + +The GDC Data Portal provides authentication tokens for use with the GDC Data Transfer Tool or the GDC API. To download a token: + +1. Log into the GDC using your eRA Commons credentials. +2. Click the username in the top right corner of the screen. +3. Select the "Download Token" option. + +![Token Download Button](images/gdc-data-portal-token-download.png) + +A new token is generated each time the `Download Token` button is clicked. + +For more information about authentication tokens, see [Data Security](../../Data/Data_Security/Data_Security.md#authentication-tokens). + +>**NOTE:** The authentication token should be kept in a secure location, as it allows access to all data accessible by the associated user account. + +#### Logging Out + +To log out of the GDC, click the username in the top right corner of the screen, and select the Logout option. Users will automatically be logged out after 15 minutes of inactivity. + +![Logout link](images/gdc-data-portal-token-download.png) + +## Homepage + +After authentication, users are redirected to a homepage. The homepage acts as the entry point for GDC data submission and provides submitters with access to a list of authorized projects, reports, and transactions. Content on the homepage varies based on the user profile (e.g. submitter, program office). 
+ +[![GDC Submitter Home Page](images/GDC-HomePage-Submit_v2.png)](images/GDC-HomePage-Submit_v2.png "Click to see the full image.") + +### Reports + +Project summary reports can be downloaded at the Submission Portal homepage at three different levels: `CASE OVERVIEW`, `ALIQUOT OVERVIEW`, and `DATA VALIDATION`. Each report is generated in tab-delimited format in which each row represents an active project. + +* __`CASE OVERVIEW`:__ This report describes the number of cases with associated biospecimen data, clinical data, or submittable data files (broken down by data type) for each project. +* __`ALIQUOT OVERVIEW`:__ This report describes the number of aliquots in a project with associated data files. Aliquot numbers are broken down by sample tissue type. +* __`DATA VALIDATION`:__ This report categorizes all submittable data files associated with a project by their file status. + +### Projects + +The projects section in the homepage lists the projects that the user has access to along with basic information about each project. For users with access to a large number of projects, this table can be filtered using the 'FILTER PROJECTS' field. Selecting a project ID will direct the user to the project's [Dashboard](#dashboard). The button used to release data for each project is also located on this screen, see [Release](#release) for details. + +## Dashboard + +The GDC Data Submission Portal dashboard provides details about a specific project. + +[![GDC Submission Dashboard Page](images/Submission_portal_homepage.png)](images/Submission_portal_homepage.png "Click to see the full image.") + +The dashboard contains various visual elements to guide the user through all stages of submission, from viewing the [Data Dictionary](https://docs.gdc.cancer.gov/Data_Dictionary/), support of data upload, to submitting a project for harmonization. 
+ +To better understand the information displayed on the dashboard and the available actions, please refer to the [Data Submission Walkthrough](Data_Submission_Walkthrough.md). + +### Project Overview +The Project Overview section of the dashboard displays the most current project state (open / review / submitted / processing) and the GDC Release, which is the date on which the project was released to the GDC. + +The search field at the top of the dashboard allows for submitted entities to be searched by partial or whole `submitter_id`. When a search term is entered into the field, a list of entities matching the term is updated in real time. Selecting one of these entities links to its details in the [Browse Tab](#browse). + +The remaining part of the top section of the dashboard is broken down into five status charts: + +* __QC Errors__: The number of errors found in the uploaded data. For more details please refer to the [QC Report Section](/Data_Submission_Portal/Users_Guide/Data_Submission_Process/#qc-reports). +* __Cases with Clinical__: The number of `cases` for which Clinical data has been uploaded. +* __Cases with Biospecimen__: The number of `cases` for which Biospecimen data has been uploaded. +* __Cases with Submittable Data Files__: The number of `cases` for which experimental data has been uploaded. +* __Submittable Data Files__: The number of registered submittable data files that have been successfully uploaded through the GDC Data Transfer Tool. Totals do not include files that have been submitted for harmonization. For more information on this status chart, please refer to [File Lifecycle](Data_Submission_Overview.md#file-lifecycle). + * __`DOWNLOAD MANIFEST`:__ This button below the status chart allows the user to download a manifest for registered files in this project that have not yet been uploaded. + +### Action Panels + +There are two action panels available below the Project Overview. 
+ +* [UPLOAD DATA TO YOUR WORKSPACE](Data_Submission_Walkthrough.md): Allows a submitter to upload project data to the GDC project workspace. The GDC will validate the uploaded data against the [GDC Data Dictionary](https://docs.gdc.cancer.gov/Data_Dictionary/). This panel also contains a table that displays details about the five latest transactions. Clicking the IDs in the first column will bring up a window with details about the transaction, which are documented in the [transactions](#transactions) page. This panel will also allow the user to commit file uploads to the project. +* [REVIEW AND SUBMIT YOUR WORKSPACE DATA TO THE GDC](#submit-your-workspace-data-to-the-gdc): Allows a submitter to review project data which will lock the project to ensure that additional data cannot be uploaded while in review. Once the review is complete, the data can be submitted to the GDC for processing through the [GDC Harmonization Process](https://gdc.cancer.gov/submit-data/gdc-data-harmonization). + +These actions and associated features are further detailed in their respective sections of the documentation. + +## Transactions + +The transactions page lists all of the project's transactions. The transactions page can be accessed by choosing the Transactions tab at the top of the dashboard or by choosing "View All Data Upload Transactions" in the first panel of the dashboard. + +[![GDC Submission Transactions](images/GDC_Submission_Transactions_2.png)](images/GDC_Submission_Transactions_2.png "Click to see the full image.") + +The types of transactions are the following: + +* __Upload:__ The user uploads data to the project workspace. Note that submittable data files uploaded using the GDC Data Transfer tool do not appear as transactions. Uploaded submittable data can be viewed in the Browse tab. +* __Delete:__ The user deletes data from the project workspace. +* __Review:__ The user reviews the project before submitting data to the GDC. 
+* __Open:__ The user re-opens the project if it was under review. This allows the upload of new data to the project workspace. +* __Submit:__ The user submits uploaded data to the GDC. This triggers the data harmonization process. +* __Release:__ The user releases harmonized data to be available through the GDC Data Portal and other GDC data access tools. + +### Transactions List View + +The transactions list view displays the following information: + +|Column|Description| +| --- | --- | +| __ID__ | Identifier of the transaction | +| __Type__ | Type of the transaction (see the list of transaction types in the previous section)| +| __Step__ | The step of the submission process that each file is currently in. This can be Validate or Commit. "Validate" represents files that have not yet been committed but have been uploaded using the submission portal or the API. | +| __DateTime__ | Date and Time that the transaction was initiated | +| __User__ | The username of the submitter that performed the transaction | +| __State__ | Indicates the status of the transaction: `SUCCEEDED`, `PENDING`, or `FAILED` | +| __Commit/Discard__ | Two buttons appear when data has been uploaded using the API or the submission portal. This allows for validated data to be incorporated into the project or discarded. This column will then display the transaction number for committed uploads and "Discarded" for the uploads that are discarded.| + +### Transaction Filters + +Choosing from the drop-down menu at the top of the table allows the transactions to be filtered by those that are in progress, to be committed, succeeded, failed, or discarded. The drop-down menu also allows for the transactions to be filtered by type and step. + +### Transactions Details + +Clicking on a transaction will open the details panel. Data in this panel is organized into multiple sections including actions, details, types, and documents as described below. 
+ +[![GDC Submission Transactions](images/GDC_Submission_Transactions_Details_3.png)](images/GDC_Submission_Transactions_Details_3.png "Click to see the full image.") + +Navigation between the sections can be performed by either scrolling down or by clicking on the section icon displayed on the left side of the details panel. + +#### Actions + +The Actions section allows a user to perform an action for transactions that provide actions. For example, if a user uploads read groups and file metadata, a corresponding manifest file will be available for download from the transaction. This manifest is used to upload the actual files through the [GDC Data Transfer Tool](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool). + +[![GDC Submission Transactions Details Action](images/GDC_Submission_Transactions_Details_Action_2.png)](images/GDC_Submission_Transactions_Details_Action_2.png "Click to see the full image.") + +#### Details + +The Details section provides details about the transaction itself, such as its project, type, and number of affected cases. + +[![GDC Submission Transactions Details](images/GDC_Submission_Transactions_Details_Details_2.png)](images/GDC_Submission_Transactions_Details_Details_2.png "Click to see the full image.") + +#### Types + +The Types section lists the type of files submitted and the number of affected cases and entities. + +[![GDC Submission Transactions Types](images/GDC_Submission_Transactions_Details_Types_2.png)](images/GDC_Submission_Transactions_Details_Types_2.png "Click to see the full image.") + +#### Documents + +The Documents section lists the files submitted during the transaction. +The user can download the original files from the transaction, a report detailing the transaction, or the errors that originated from the transaction that has failed. 
+ +[![GDC Submission Transactions Documents](images/GDC_Submission_Transactions_Details_Documents_2.png)](images/GDC_Submission_Transactions_Details_Documents_2.png "Click to see the full image.") + +## Browse + +The `Browse` menu provides access to all of a project's content. Most content is driven by the GDC Data Dictionary and the interface is dynamically generated to accommodate the content. + +Please refer to the [GDC Data Dictionary Viewer](../../Data_Dictionary/viewer.md) for specific details about dictionary-generated fields, columns, and filters. + +[![GDC Submission Cases Default View](images/GDC_Submission_Cases_Default_2.png)](images/GDC_Submission_Cases_Default_2.png "Click to see the full image.") + +### Main Interface Elements + +#### Filters + +A wide set of filters are available for the user to select the type of entity to be displayed. These filters are dynamically created based on the [GDC Data Dictionary](../../Data_Dictionary/index.md). + +Current filters are: + +|Filter|Description| +| --- | --- | +| __Cases__ | Display all `Cases` associated with the project. | +| __Clinical__ | Display all Clinical data uploaded to the project workspace. This is divided into subgroups including `Demographics`, `Diagnoses`, `Exposures`, `Family Histories`, `Follow_up`, `Molecular_tests`, and `Treatments`. | +| __Biospecimen__ | Display all Biospecimen data uploaded to the project workspace. This is divided into subgroups including `Samples`, `Portions`, `Slides`, `Analytes`, `Aliquots`, and `Read Groups`. | +| __Submittable Data Files__ | Displays all data files that have been registered with the project. This includes files that have been uploaded and those that have been registered but not uploaded yet. This category is divided into groups by file type. | +| __Annotations__ | Lists all annotations associated with the project. An annotation provides an explanatory comment associated with data in the project. 
| +| __Harmonized Data Files__ | Lists all data files that have been harmonized by the GDC. This category is divided into groups by generated data. | + +#### List View + +The list view is a paginated list of all entities corresponding to the selected filter. + +On the top-right section of the screen, the user can download data about all entities associated with the selected filter. + +* For the case filter, it will download all Clinical data or all Metadata. +* For all other filters, it will download the corresponding metadata (e.g., for the `demographic` filter, it will download all `demographic` data). + +[![GDC Submission Case Summary Download](images/GDC_Submission_Cases_Summary_Download_2.png)](images/GDC_Submission_Cases_Summary_Download_2.png "Click to see the full image.") + +#### Details Panel + +Clicking on an entity will open the details panel. Data in this panel is broken down into multiple sections depending on the entity type. The main sections are: + +* __Actions__: Actions that can be performed relating to the entity. This includes downloading the metadata (JSON or TSV) or submittable data file pertaining to the entity and deleting the entity. See the [Deleting Entities](Data_Submission_Walkthrough.md#deleting-submitted-entities) guide for more information. +* __Summary__: A list of IDs and system properties associated with the entity. +* __Details__: Properties of the entity (not associated with cases). +* __Hierarchy__ or __Related Entities__: A list of associated entities. +* __Annotations__: A list of annotations associated with the entity. +* __Transactions__: A list of previous transactions that affect the entity. + +[![GDC Submission Case Details](images/GDC_Submission_Cases_Details_2.png)](images/GDC_Submission_Cases_Details_2.png "Click to see the full image.") + +The sections listed above can be navigated either by scrolling down or by clicking on the section icon on the left side of the details panel. 
+ +#### Related Entities + +The Related Entities table lists all entities, grouped by type, related to the selected `case`. This section is only available at the `case` level. + +[![GDC Submission Cases Related Entities](images/GDC_Submission_Cases_Summary_Related_Entities_2.png)](images/GDC_Submission_Cases_Summary_Related_Entities_2.png "Click to see the full image.") + + +This table contains the following columns: + +* __Category__: category of the entity (Clinical, Biospecimen, submittable data file). +* __Type__: type of entity (based on Data Dictionary). +* __Count:__ number of occurrences of an entity associated with the `case`. Clicking on the count will open a window listing those entities within the Browse page. + +#### Hierarchy + +The hierarchy section is available for entities at any level (e.g., Clinical, Biospecimen, etc.), except for `case`. The user can use the hierarchy section to navigate through entities. + +The hierarchy shows: + +* The `case` associated with the entity. +* The __direct__ parents of the entity. +* The __direct__ children of the entity. + +[![GDC Submission Cases Details Hierarchy](images/GDC_Submission_Cases_Summary_Hierarchy_2.png)](images/GDC_Submission_Cases_Summary_Hierarchy_2.png "Click to see the full image.") + +After uploading data to the workspace on the GDC Data Submission Portal, data will need to be [reviewed by the submitter](#pre-harmonization-checklist) and then [submitted to the GDC](#submit-to-the-gdc-for-harmonization) for processing. + +## QC Reports + +The QC Reports section allows users to see errors identified by the GDC for the current data that has not yet been submitted for harmonization. This includes all nodes in state `validated`. Data with error type `Critical` indicates errors that must be fixed before a submitter can Request Harmonization. Errors with error type `Warning` should be reviewed by the submitter as they may indicate discrepancies or problematic data. 
+ +You can see in the QC Reports Tab highlights of what data are present and the types of errors found in the project. + +[![QC Report](images/QC_Report_tab.png)](images/QC_Report_tab.png "Click to see the full image.") + +To find specific details for any node that contains errors you can click on the facet panel on the left to see those errors and to download a list of errors for that respective node. All potential errors are listed in the [Pre-harmonization Checklist](/Data_Submission_Portal/Users_Guide/Data_Submission_Process/#pre-harmonization-checklist). + +[![QC Errors for Submitted Unaligned Reads](images/SUR_QC_errors.png)](images/SUR_QC_errors.png "Click to see the full image.") + +## Submit Your Workspace Data to the GDC + +The GDC Data Submission process is detailed on the [Data Submission Processes and Tools](https://gdc.cancer.gov/submit-data/data-submission-processes-and-tools) section of the GDC Website. + +### Review + +The submitter is responsible for reviewing the data uploaded to the project workspace (see [Data Submission Walkthrough](Data_Submission_Walkthrough.md)), and ensuring that it is ready for processing by the GDC [Harmonization Process](https://gdc.cancer.gov/submit-data/gdc-data-harmonization). + +The user will be able to view the section below on the dashboard. The `REVIEW` button is available only if the project is in "OPEN" state. + +[![GDC Submission Review Tab](images/GDC_Submission_Submit_Release_Review_tab_2_v2.png)](images/GDC_Submission_Submit_Release_Review_tab_2_v2.png "Click to see the full image.") + +Setting the project to the "REVIEW" state will lock the project and prevent users from uploading additional data. During this period, the submitter can browse the data in the Data Submission Portal or download it. Once the review is complete, the user can request to submit data to the GDC. 
+ +Once the user clicks on `REVIEW`, the project state will change to "REVIEW": + +[![GDC Submission Review State](images/GDC_Submission_Submit_Release_Project_State_Review_3.png)](images/GDC_Submission_Submit_Release_Project_State_Review_3.png "Click to see the full image.") + +### Pre-Harmonization Checklist + +The Harmonization step is __NOT__ an automatic process that occurs when data is uploaded to the GDC. The GDC performs batch processing of submitted data for Harmonization only after verifying that the submission is complete. + +QC checks are automatically run on all supplied metadata and data files. The results are displayed within the [QC Reports](/Data_Submission_Portal/Users_Guide/Data_Submission_Process/#qc-reports). These errors fall into two categories: Critical or Warning. If an error is deemed Critical it must be resolved before a submitter can request harmonization. If an error is categorized as Warning then the submitter should review this to verify the data have been submitted correctly. A list of the errors and their meanings are found in the table below: + +#### __Critical Errors__ + +| Error Message | Description | How to Fix / Error Meaning | +|---|---|---| +|INVALID_CHARACTER | This entity submitter_id includes invalid characters | Upload new entity without invalid characters. The acceptable characters are alphanumeric characters [a-z, A-Z, 0-9] and `_`, `.`, `-`. Any other characters will interfere with the Harmonization workflow. | +| MORE_THAN_ONE_SAMPLE_TYPE | The aliquot is associated with more than one sample type | Ensure there is no `aliquot` attached to multiple `sample` nodes of more than one sample_type. | +| TWO_NODE_TYPES | The aliquot is associated with two or more node types| Ensure aliquot is only connected to a single type of node. | +| PE_FASTQ_FILE_COUNT | The number of FASTQ files for PE readgroup is not 2| Ensure that if a read group is paired end, that it has two FASTQ files. 
For the `read_group` node, make sure that the `is_paired_end` is set to `true` for paired end sequencing and `false` for single end sequencing.| +| SE_FASTQ_FILE_COUNT | The number of FASTQ files for SE readgroup is not 1| Ensure that if a read group is single end, that it has one FASTQ file. For the `read_group` node, make sure that the `is_paired_end` is set to `true` for paired end sequencing and `false` for single end sequencing.| +| CAPTURE_KIT_INADEQUATE | WXS/Targeted Sequencing ReadGroup lacks valid target capture kit| Modify read group entity to have a valid target capture kit from data dictionary. The `target_capture_kit` property is completed when the selected `library_strategy` is `WXS`. Errors will occur if `Not Applicable` or `Unknown` is selected. | +| TARGET_SEQ_LIBRARY_SELECTION | ReadGroup has library strategy Targeted Sequencing but does not have PCR or Hybrid Selection as its library selection| If library strategy is Targeted Sequencing, modify library selection to be either PCR or Hybrid Selection | +| WXS_LIBRARY_SELECTION | ReadGroup has library strategy WXS but does not have Hybrid Selection as its library selection| Modify library selection to be Hybrid Selection for WXS read groups | +| WGS_LIBRARY_SELECTION | ReadGroup has library strategy WGS but does not have Random as its library selection| For WGS read groups, ensure library selection is set to Random | +| NO_READ_PAIR_NUMBER | The FASTQ is paired but has no read_pair_number| Include a read_pair_number for paired end FASTQ files | +| DUPLICATE_MD5S | Two or more files have the same md5sum| This means there are duplicate files in the submission. You must delete one of these files | + +#### __Warning Errors__ +| Error Message | Description | How to Fix / Error Meaning | +|---|---|---| +| FILE_BAD_STATE | The file node is in a bad state | There are some files in a bad file_state. All files that are registered must be uploaded and validated. 
If file_state is `Error`, you will have to delete the file using the data transfer tool and re-upload it, or upload a file if the state is `Registered`| +| INCONSISTENT_READGROUPS | ReadGroups sharing a library_strategy under a given aliquot have properties that do not match| Verify the properties of shared read groups under the same aliquot are consistent.| +| NO_CLINICAL_SUPPLEMENT | The case has no associated clinical supplement| Upload an optional clinical supplement file. This is a file that contains clinical data about one or more cases in a user specified format | +| NO_BIOSPECIMEN_SUPPLEMENT | The case has no associated biospecimen supplement| Upload an optional biospecimen supplement file. This is a file that contains biospecimen data about one or more cases in a user specified format | +| NO_DEMOGRAPHIC | The case has no associated demographic information| Provide demographic information on the case. This will be required before data can be released. | +| NO_DIAGNOSIS | The case has no associated diagnosis information | Provide diagnosis information on the case. This will be required before data can be released. | +| MORE_THAN_ONE_SAMPLE | The aliquot is associated with more than one sample| Review whether multiple samples were actually combined to make a single aliquot. This is uncommon, but potentially correct. | +| MULTIPLE_ALIGNED_BAMS | The read_group has multiple submitted aligned BAMs| Review whether one read group actually appears in multiple BAM files. This is uncommon. | +| NO_MULTIPLEX_BARCODE | The read_group has no multiplex barcode| Provide multiplex barcode for the read_group. 
| +| NO_FLOWCELL_BARCODE | The read_group has no flowcell barcode| Provide flowcell barcode for the read_group | +| NO_LANE_NUMBER | The read_group has no lane number| Provide lane number for the read_group | +| MULTIPLE_SARS_ON_ALIQUOT | Multiple submitted aligned reads of the same experimental strategy are associated with one aliquot.| Each `aliquot` node is only associated with one `submitted_aligned_reads` file of the same `experimental_strategy`. | +| FASTQ_UNKNOWN_EXTENSION | The FASTQ filename has an unknown extension| FASTQ file extension should be `.fq` or `.fq.gz`. Impermissible extensions are `tar.gz` and `tar`. | +| MULTIPLE_FASTQ_READGROUPS | Submitted FASTQ file has links to multiple read groups| Ensure `submitted_unaligned_reads` of data_format `FASTQ` is not linked to multiple `read_group` nodes. | +| INVALID_FASTQ_EXTENSION | Submitted FASTQ file name has an invalid extension| FASTQ file extension should be `.fq` or `.fq.gz`. Impermissible extensions are `tar.gz` and `tar`.| +| FASTQ_TOO_LARGE | FASTQ exceeds 10GB in size| The `submitted_unaligned_reads` file is larger than 10 GB. | +| NO_ASSOCIATED_FILES | ReadGroup has no associated genomic files| Ensure that all read groups have genomic files attached - or delete them if they are no longer needed | + +Once user review is complete and all Critical errors are resolved, clicking the `REQUEST HARMONIZATION` button will indicate to the GDC Team and pipeline automation system that data processing can begin. + +### Submit to the GDC for Harmonization + +When the project is ready for processing, the submitter will request to submit data to the GDC for Harmonization. If the project is not ready for processing, the project can be re-opened. Then the submitter will be able to upload more data to the project workspace. + +The `REQUEST HARMONIZATION` button is available only if the project is in "REVIEW" state. 
At this point, the user can decide whether to re-open the project to upload more data or to request harmonization of the data to the GDC. When the project is in "REVIEW" the following panel appears on the dashboard: + +[![GDC Submission Submit Tab](images/GDC_Submission_Submit_Release_Submit_tab_2_v4.png)](images/GDC_Submission_Submit_Release_Submit_tab_2_v4.png "Click to see the full image.") + +Once the user submits data to the GDC, they cannot modify the submitted nodes and files while harmonization is underway. Additional project data can be added during this period and will be considered a separate batch. To process an additional batch the user must again review the data and select `REQUEST HARMONIZATION`. + +[![GDC Submission Submission Tab](images/GDC_SUBMIT_TO_GDC_v3.png)](images/GDC_SUBMIT_TO_GDC_v3.png "Click to see the full image.") + +When the user clicks on the action `REQUEST HARMONIZATION` on the dashboard, the following popup is displayed: + +[![GDC Submission Submit Popup](images/GDC_Submission_Submit_Release_Submit_Popup_v2.png)](images/GDC_Submission_Submit_Release_Submit_Popup_v2.png "Click to see the full image.") + + +After the user clicks on `SUBMIT VALIDATED DATA TO THE GDC`, the project state becomes "Harmonization Requested": + +[![GDC Submission Project State](images/GDC_Submission_Submit_Release_Project_State_v3.png)](images/GDC_Submission_Submit_Release_Project_State_v3.png "Click to see the full image.") + +The GDC requests that users submit their data to the GDC for harmonization within six months from the first upload of data to the project workspace. + +### Reviewing Harmonized Data +After harmonization and prior to release, the GDC provides data submitters with access to their harmonized data. This allows the submitter to perform a check of the data, and let the GDC know if anything is incorrect before the data are released to the GDC Data Portal. 
How and in what detail the submitter wants to perform such a review is up to them, but here are a few suggestions for what a submitter may want to check. + +Are all expected data present? More specifically, you could review the following questions: + * Are the number of cases correct? + * Are the number of cases associated with a given experimental strategy correct? + * Are there any cases or experimental strategies I want to hold back that are still within the 6 month embargo period? + * Does the clinical data appear as I expect? + * Do the alignment statistics look acceptable? The GDC produces alignment metrics which are available via the API. This will allow users to see whether coverage, alignment, and other statistics are in line with expectations. [The complete list can be found here.](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Aligned_reads_summary_metrics/) + +If users have access to other derived data files, like called variants or expression levels, there is another level of QC that is possible. + +If you have access to this data you could also investigate the following: + * Are expected variants present for a given tumor-normal pair? Note, due to differences between the GDC and user workflows (e.g. reference genome, variant calling pipelines, variant filtering, etc.) the exact list of variants may differ significantly between MAFs generated by users and those generated by the GDC. + * Does gene expression correlate with previously generated expression data from the same aliquot? Note, the GDC performs non-stranded expression quantification for HTSeq workflows. To review strand-specific results please review STAR output. + +Once these user reviews have been completed, the user will need to contact the GDC and inform them that the project is ready for release. 
+ +## Release +Project release occurs after the data has been harmonized, and allows users to access this data with the [GDC Data Portal](https://portal.gdc.cancer.gov/) and other [GDC Data Access Tools](https://gdc.cancer.gov/access-data/data-access-processes-and-tools). The GDC will release data according to [GDC Data Sharing Policies](https://gdc.cancer.gov/submit-data/data-submission-policies). Data must be released within six months after GDC data processing has been completed, or the submitter may request earlier release using the "Request Release" function. A project can only be released once. + +[![GDC Submission Release Tab](images/GDC_Submission_Landing_Submitter_4.png)](images/GDC_Submission_Landing_Submitter_4.png "Click to see the full image.") + +When the user clicks on the action `REQUEST RELEASE`, the following Release popup is displayed: + +[![GDC Submission Release Popup](images/GDC_Submission_Submit_Release_Release_Popup.png)](images/GDC_Submission_Submit_Release_Release_Popup.png "Click to see the full image.") + +After the user clicks on `RELEASE SUBMITTED AND PROCESSED DATA`, the project release state becomes "Release Requested": + +[![GDC Submission Project State](images/GDC_Submission_Submit_Release_Project_State_3.png)](images/GDC_Submission_Submit_Release_Project_State_3.png "Click to see the full image.") + + +>__Note__: Released cases and/or files can be redacted from the GDC. For more information, visit the [GDC Policies page (under GDC Data Sharing Policies)](https://gdc.cancer.gov/about-gdc/gdc-policies). 
diff --git a/docs/Data_Submission_Portal/Users_Guide/Data_Submission_Walkthrough.md b/docs/Data_Submission_Portal/Users_Guide/Data_Submission_Walkthrough.md new file mode 100644 index 000000000..cc0160318 --- /dev/null +++ b/docs/Data_Submission_Portal/Users_Guide/Data_Submission_Walkthrough.md @@ -0,0 +1,698 @@ +# Data Upload Walkthrough + +This guide details step-by-step procedures for different aspects of the GDC Data Submission process and how they relate to the GDC Data Model and structure. The first sections of this guide break down the submission process and associate each step with the Data Model. Additional sections are detailed below for strategies on expediting data submission, using features of the GDC Data Submission Portal, and best practices used by the GDC. + +## GDC Data Model Basics + +Pictured below is the submittable subset of the GDC Data Model: a roadmap for GDC data submission. Each oval node in the graphic represents an entity: a logical unit of data related to a specific clinical, biospecimen, or file facet in the GDC. An entity includes a set of fields, the associated values, and information about its related node associations. All submitted entities require a connection to another entity type, based on the GDC Data Model, and a `submitter_id` as an identifier. This walkthrough will go through the submission of different entities. The completed (submitted) portion of the entity process will be highlighted in __blue__. + +[![GDC Data Model 1](images/GDC-Data-Model-None.png)](images/GDC-Data-Model-None.png "Click to see the full image.") + +# Case Submission + +The `case` is the center of the GDC Data Model and usually describes a specific patient. Each `case` is connected to a `project`. Different types of clinical data, such as `diagnoses` and `exposures`, are connected to the `case` to describe the case's attributes and medical information. 
+ +[![GDC Data Model 2](images/GDC-Data-Model-Case.png)](images/GDC-Data-Model-Case.png "Click to see the full image.") + +The main entity of the GDC Data Model is the `case`, each of which must be registered beforehand with [dbGaP](https://www.ncbi.nlm.nih.gov/sra/docs/submitdbgap) under a unique `submitter_id`. The first step to submitting a `case` is to consult the [Data Dictionary](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#data-dictionary-viewer), which details the fields that are associated with a `case`, the fields that are required to submit a `case`, and the values that can populate each field. Dictionary entries are available for all entities in the GDC Data Model. + +[![Dictionary Case](images/Dictionary_Case.png)](images/Dictionary_Case.png "Click to see the full image.") + +Submitting a [__Case__](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=case) entity requires: + +* __`submitter_id`:__ A unique key to identify the `case` +* __`projects.code`:__ A link to the `project` + +The submitter ID is different from the universally unique identifier (UUID), which is based on the [UUID Version 4 Naming Convention](https://en.wikipedia.org/wiki/Universally_unique_identifier#Version_4_.28random.29). The UUID can be accessed under the `_id` field for each entity. For example, the `case` UUID can be accessed under the `case_id` field. The UUID is either assigned to each entity automatically or can be submitted by the user. Submitter-generated UUIDs cannot be uploaded in `submittable_data_file` entity types. See the [Data Model Users Guide](https://docs.gdc.cancer.gov/Data/Data_Model/GDC_Data_Model/#gdc-identifiers) for more details about GDC identifiers. + +The `projects.code` field connects the `case` entity to the `project` entity. The rest of the entity connections use the `submitter_id` field instead. + +The `case` entity can be added in JSON or TSV format. 
A template for any entity in either of these formats can be found in the Data Dictionary at the top of each page. Templates populated with `case` metadata in both formats are displayed below. + +```JSON +{ + "type": "case", + "submitter_id": "PROJECT-INTERNAL-000055", + "projects": { + "code": "INTERNAL" + } +} +``` +```TSV +type submitter_id projects.code +case PROJECT-INTERNAL-000055 INTERNAL +``` + +>__Note:__ JSON and TSV formats handle links between entities (`case` and `project`) differently. JSON includes the `code` field nested within `projects` while TSV appends `code` to `projects` with a period. + + +## Uploading the Case Submission File + +The file detailed above can be uploaded using the GDC Data Submission Portal and the GDC API as described below: + +### Upload Using the GDC Data Submission Portal + +An example of a `case` upload is detailed below. The [GDC Data Submission Portal](https://gdc.cancer.gov/submit-data/gdc-data-submission-portal) is equipped with a wizard window to facilitate the upload and validation of entities. + +#### 1. Upload Files + +Choosing _'UPLOAD'_ from the project dashboard will open the Upload Data Wizard. + +[![GDC Submission Wizard Upload Files](images/GDC_Submission_Wizard_Upload_2.png)](images/GDC_Submission_Wizard_Upload_2.png "Click to see the full image.") + +Files containing one or more entities can be added either by clicking on `CHOOSE FILE(S)` or using drag and drop. Files can be removed from the Upload Data Wizard by clicking on the garbage can icon that is displayed next to the file after the file is selected for upload. + +#### 2. Validate Entities + +The __Validate Entities__ stage acts as a safeguard against submitting incorrectly formatted data to the GDC Data Submission Portal. During the validation stage, the GDC API will validate the content of uploaded entities against the Data Dictionary to detect potential errors. 
Invalid entities will not be processed and must be corrected by the user and re-uploaded before being accepted. A validation error report provided by the system can be used to isolate and correct errors. + +When the first file is added, the wizard will move to the Validate section and the user can continue to add files. When all files have been added, choosing `VALIDATE` will run a test to check if the entities are valid for submission. + +[![GDC Submission Wizard Validate Files](images/GDC_Submission_Portal_Validate.png)](images/GDC_Submission_Portal_Validate.png "Click to see the full image.") + +#### 3. Commit or Discard Files +If the upload contains valid entities, a new transaction will appear in the latest transactions panel with the option to `COMMIT` or `DISCARD` the data. Entities contained in these files can be committed (applied) to the project or discarded using these two buttons. + +If the upload contains invalid files, a transaction will appear with a FAILED status. Invalid files will need to be either corrected and re-uploaded or removed from the submission. If more than one file is uploaded and at least one is not valid, the validation step will fail for all files. + +[![Commit_Discard](images/GDC_Submission_CommitDiscard.png)](images/GDC_Submission_CommitDiscard.png "Click to see the full image.") + + +### Upload Using the GDC API + +The API has a much broader range of functionality than the Data Wizard. Entities can be created, updated, and deleted through the API. See the [API Submission User Guide](https://docs.gdc.cancer.gov/API/Users_Guide/Submission/#creating-and-updating-entities) for a more detailed explanation and for the rest of the functionalities of the API. 
Generally, uploading an entity through the API can be performed using a command similar to the following: + +```Shell +curl --header "X-Auth-Token: $token" --request POST --data @CASE.json https://api.gdc.cancer.gov/v0/submission/GDC/INTERNAL/_dry_run?async=true +``` +CASE.json is detailed below. +```json +{ + "type": "case", + "submitter_id": "PROJECT-INTERNAL-000055", + "projects": { + "code": "INTERNAL" + } +} +``` + +In this example, the `_dry_run` marker is used to determine if the entities can be validated, but without committing any information. If a command passed through the `_dry_run` works, the command will work when it is changed to `commit`. For more information please go to [Dry Run Transactions](https://docs.gdc.cancer.gov/API/Users_Guide/Submission/#dry-run-transactions). + +>__Note:__ Submission of TSV files is also supported by the GDC API. + +Next, the file can either be committed (applied to the project) through the Data Submission Portal as before, or another API query can be performed that will commit the file to the project. The transaction number in the URL (467) is printed to the console during the first step of API submission and can also be retrieved from the [Transactions](Data_Submission_Process.md#transactions) tab in the Data Submission Portal. + +```Shell +curl --header "X-Auth-Token: $token" --request POST https://api.gdc.cancer.gov/v0/submission/GDC/INTERNAL/transactions/467/commit?async=true +``` + +# Clinical Data Submission + +Typically, a submission project will include additional information about a `case` such as `demographic`, `diagnosis`, or `exposure` data. + +## Clinical Data Requirements + +For the GDC to release a project there is a minimum number of clinical properties that are required. Minimal GDC requirements for each project includes age, gender, and diagnosis information. 
Other [requirements](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-entity-list&anchor=clinical) may be added when the submitter is approved for submission to the GDC. + +[![GDC Data Model Clinical](images/GDC-Data-Model-Clinical.png)](images/GDC-Data-Model-Clinical.png "Click to see the full image.") + +## Submitting a Demographic Entity to a Case + +The `demographic` entity contains information that characterizes the `case` entity. + +Submitting a [__Demographic__](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=demographic) entity requires: + +* __`submitter_id`:__ A unique key to identify the `demographic` entity. +* __`cases.submitter_id`:__ The unique key that was used for the `case` that links the `demographic` entity to the `case`. +* __`ethnicity`:__ An individual's self-described social and cultural grouping, specifically whether an individual describes themselves as Hispanic or Latino. The provided values are based on the categories defined by the U.S. Office of Management and Budget and used by the U.S. Census Bureau. +* __`gender`:__ Text designations that identify gender. Gender is described as the assemblage of properties that distinguish people on the basis of their societal roles. +* __`race`:__ An arbitrary classification of a taxonomic group that is a division of a species. It usually arises as a consequence of geographical isolation within a species and is characterized by shared heredity, physical attributes and behavior, and in the case of humans, by common history, nationality, or geographic distribution. The provided values are based on the categories defined by the U.S. Office of Management and Budget and used by the U.S. Census Bureau. 
+ +```JSON +{ + "type": "demographic", + "submitter_id": "PROJECT-INTERNAL-000055-DEMOGRAPHIC-1", + "cases": { + "submitter_id": "PROJECT-INTERNAL-000055" + }, + "ethnicity": "not hispanic or latino", + "gender": "male", + "race": "asian" +} +``` +```TSV +type cases.submitter_id ethnicity gender race +demographic PROJECT-INTERNAL-000055 not hispanic or latino male asian +``` + +## Submitting a Diagnosis Entity to a Case + +Submitting a [__Diagnosis__](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=diagnosis) entity requires: + +* __`submitter_id`:__ A unique key to identify the `diagnosis` entity. +* __`cases.submitter_id`:__ The unique key that was used for the `case` that links the `diagnosis` entity to the `case`. +* __`age_at_diagnosis`:__ Age at the time of diagnosis expressed in number of days since birth. +* __`days_to_last_follow_up`:__ Time interval from the date of last follow up to the date of initial pathologic diagnosis, represented as a calculated number of days. +* __`days_to_last_known_disease_status`:__ Time interval from the date of last follow up to the date of initial pathologic diagnosis, represented as a calculated number of days. +* __`days_to_recurrence`:__ Time interval from the date of new tumor event including progression, recurrence and new primary malignancies to the date of initial pathologic diagnosis, represented as a calculated number of days. +* __`last_known_disease_status`:__ The state or condition of an individual's neoplasm at a particular point in time. +* __`morphology`:__ The third edition of the International Classification of Diseases for Oncology, published in 2000, used principally in tumor and cancer registries for coding the site (topography) and the histology (morphology) of neoplasms. The study of the structure of the cells and their arrangement to constitute tissues and, finally, the association among these to form organs. 
In pathology, the microscopic process of identifying normal and abnormal morphologic characteristics in tissues, by employing various cytochemical and immunocytochemical stains. A system of numbered categories for representation of data. +* __`primary_diagnosis`:__ Text term for the structural pattern of cancer cells used to define a microscopic diagnosis. +* __`progression_or_recurrence`:__ Yes/No/Unknown indicator to identify whether a patient has had a new tumor event after initial treatment. +* __`site_of_resection_or_biopsy`:__ The third edition of the International Classification of Diseases for Oncology, published in 2000, used principally in tumor and cancer registries for coding the site (topography) and the histology (morphology) of neoplasms. The description of an anatomical region or of a body part. Named locations of, or within, the body. A system of numbered categories for representation of data. +* __`tissue_or_organ_of_origin`:__ Text term that describes the anatomic site of the tumor or disease. +* __`tumor_grade`:__ Numeric value to express the degree of abnormality of cancer cells, a measure of differentiation and aggressiveness. +* __`tumor_stage`:__ The extent of a cancer in the body. Staging is usually based on the size of the tumor, whether lymph nodes contain cancer, and whether the cancer has spread from the original site to other parts of the body. The accepted values for tumor_stage depend on the tumor site, type, and accepted staging system. These items should accompany the tumor_stage value as associated metadata. +* __`vital_status`:__ The survival state of the person registered on the protocol. 
+ +```JSON +{ + "type": "diagnosis", + "submitter_id": "PROJECT-INTERNAL-000055-DIAGNOSIS-1", + "cases": { + "submitter_id": "GDC-INTERNAL-000099" + }, + "age_at_diagnosis": 10256, + "days_to_last_follow_up": 34, + "days_to_last_known_disease_status": 34, + "days_to_recurrence": 45, + "last_known_disease_status": "Tumor free", + "morphology": "8260/3", + "primary_diagnosis": "ACTH-producing tumor", + "progression_or_recurrence": "no", + "site_of_resection_or_biopsy": "Lung, NOS", + "tissue_or_organ_of_origin": "Lung, NOS", + "tumor_grade": "not reported", + "tumor_stage": "stage i", + "vital_status": "alive" +} +``` +```TSV +type submitter_id cases.submitter_id age_at_diagnosis days_to_last_follow_up days_to_last_known_disease_status days_to_recurrence last_known_disease_status morphology primary_diagnosis progression_or_recurrence site_of_resection_or_biopsy tissue_or_organ_of_origin tumor_grade tumor_stage vital_status +diagnosis PROJECT-INTERNAL-000055-DIAGNOSIS-1 GDC-INTERNAL-000099 10256 34 34 45 Tumor free 8260/3 ACTH-producing tumor no Lung, NOS Lung, NOS not reported stage i alive +``` + +### Submitting an Exposure Entity to a Case + +Submitting an [__Exposure__](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=exposure) entity does not require any information besides a link to the `case` and a `submitter_id`. The following fields are optionally included: + +* __`alcohol_history`:__ A response to a question that asks whether the participant has consumed at least 12 drinks of any kind of alcoholic beverage in their lifetime. +* __`alcohol_intensity`:__ Category to describe the patient's current level of alcohol use as self-reported by the patient. +* __`alcohol_days_per_week`:__ Numeric value used to describe the average number of days each week that a person consumes an alcoholic beverage. +* __`years_smoked`:__ Numeric value (or unknown) to represent the number of years a person has been smoking. 
+* __`tobacco_smoking_onset_year`:__ The year in which the participant began smoking. +* __`tobacco_smoking_quit_year`:__ The year in which the participant quit smoking. + +```JSON +{ + "type": "exposure", + "submitter_id": "PROJECT-INTERNAL-000055-EXPOSURE-1", + "cases": { + "submitter_id": "PROJECT-INTERNAL-000055" + }, + "alcohol_history": "yes", + "alcohol_intensity": "Drinker", + "alcohol_days_per_week": 2, + "years_smoked": 5, + "tobacco_smoking_onset_year": 2007, + "tobacco_smoking_quit_year": 2012 +} +``` +```TSV +type submitter_id cases.submitter_id alcohol_history alcohol_intensity alcohol_days_per_week years_smoked tobacco_smoking_onset_year tobacco_smoking_quit_year +exposure PROJECT-INTERNAL-000055-EXPOSURE-1 PROJECT-INTERNAL-000055 yes Drinker 2 5 2007 2012 +``` + +>__Note:__ Submitting a clinical entity uses the same conventions as submitting a `case` entity (detailed above). + + +# Biospecimen Submission + +One of the main features of the GDC is the genomic data harmonization workflow. Genomic data is connected to the case through biospecimen entities. The `sample` entity describes a biological piece of matter that originated from a `case`. Subsets of the `sample` such as `portions` and `analytes` can optionally be described. The `aliquot` originates from a `sample` or `analyte` and describes the nucleic acid extract that was sequenced. The `read_group` entity describes the resulting set of reads from one sequencing lane. + +## Sample Submission + +[![GDC Data Model 3](images/GDC-Data-Model-Sample.png)](images/GDC-Data-Model-Sample.png "Click to see the full image.") + +A `sample` submission has the same general structure as a `case` submission as it will require a unique key and a link to the `case`. However, `sample` entities require one additional value: `sample_type`. This peripheral data is required because it is necessary for the data to be interpreted. 
For example, an investigator using this data would need to know whether the `sample` came from tumor or normal tissue. + +[![Dictionary Sample](images/Dictionary_Sample.png)](images/Dictionary_Sample.png "Click to see the full image.") + +Submitting a [__Sample__](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=sample) entity requires: + +* __`submitter_id`:__ A unique key to identify the `sample`. +* __`cases.submitter_id`:__ The unique key that was used for the `case` that links the `sample` to the `case`. +* __`sample_type`:__ Type of the `sample`. Named for its cellular source, molecular composition, and/or therapeutic treatment. +* __`tissue_type`:__ Text term that represents a description of the kind of tissue collected with respect to disease status or proximity to tumor tissue. + +>__Note:__ The `case` must be "committed" to the project before a `sample` can be linked to it. This also applies to all other links between entities. + +```JSON +{ + "type": "sample", + "cases": { + "submitter_id": "PROJECT-INTERNAL-000055" + }, + "sample_type": "Blood Derived Normal", + "submitter_id": "Blood-00001SAMPLE_55", + "tissue_type": "Normal" +} +``` +```TSV +type cases.submitter_id submitter_id sample_type tissue_type +sample PROJECT-INTERNAL-000055 Blood-00001SAMPLE_55 Blood Derived Normal Normal +``` + +## Portion, Analyte and Aliquot Submission + +[![GDC Data Model 4](images/GDC-Data-Model-Aliquot.png)](images/GDC-Data-Model-Aliquot.png "Click to see the full image.") + +Submitting a [__Portion__](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=portion) entity requires: + +* __`submitter_id`:__ A unique key to identify the `portion`. +* __`samples.submitter_id`:__ The unique key that was used for the `sample` that links the `portion` to the `sample`. 
+ +```JSON +{ + "type": "portion", + "submitter_id": "Blood-portion-000055", + "samples": { + "submitter_id": "Blood-00001SAMPLE_55" + } +} + +``` +```TSV +type submitter_id samples.submitter_id +portion Blood-portion-000055 Blood-00001SAMPLE_55 +``` + +Submitting an [__Analyte__](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=analyte) entity requires: + +* __`submitter_id`:__ A unique key to identify the `analyte`. +* __`portions.submitter_id`:__ The unique key that was used for the `portion` that links the `analyte` to the `portion`. +* __`analyte_type`:__ Text term that represents the kind of molecular specimen analyte. + +```JSON +{ + "type": "analyte", + "portions": { + "submitter_id": "Blood-portion-000055" + }, + "analyte_type": "DNA", + "submitter_id": "Blood-analyte-000055" +} + +``` +```TSV +type portions.submitter_id analyte_type submitter_id +analyte Blood-portion-000055 DNA Blood-analyte-000055 +``` + +Submitting an [__Aliquot__](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=aliquot) entity requires: + +* __`submitter_id`:__ A unique key to identify the `aliquot`. +* __`analytes.submitter_id`:__ The unique key that was used for the `analyte` that links the `aliquot` to the `analyte`. + +```JSON +{ + "type": "aliquot", + "submitter_id": "Blood-00021-aliquot55", + "analytes": { + "submitter_id": "Blood-analyte-000055" + } +} + +``` +```TSV +type submitter_id analytes.submitter_id +aliquot Blood-00021-aliquot55 Blood-analyte-000055 +``` + +>__Note:__ `aliquot` entities can be directly linked to `sample` entities via the `samples.submitter_id`. The `portion` and `analyte` entities are not required for submission. 
+ +## Read Group Submission + +[![GDC Data Model 5](images/GDC-Data-Model-RG.png)](images/GDC-Data-Model-RG.png "Click to see the full image.") + +Information about sequencing reads is necessary for downstream analysis, thus the `read_group` entity requires more fields than the other Biospecimen entities (`sample`, `portion`, `analyte`, `aliquot`). + +Submitting a [__Read Group__](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=read_group) entity requires: + +* __`submitter_id`:__ A unique key to identify the `read_group`. +* __`aliquots.submitter_id`:__ The unique key that was used for the `aliquot` that links the `read_group` to the `aliquot`. +* __`experiment_name`:__ Submitter-defined name for the experiment. +* __`is_paired_end`:__ Are the reads paired end? (Boolean value: `true` or `false`). +* __`library_name`:__ Name of the library. +* __`library_strategy`:__ Library strategy. +* __`platform`:__ Name of the platform used to obtain data. +* __`read_group_name`:__ The name of the `read_group`. +* __`read_length`:__ The length of the reads (integer). +* __`sequencing_center`:__ Name of the center that provided the sequence files. +* __`library_selection`:__ Library Selection Method. +* __`target_capture_kit`:__ Description that can uniquely identify a target capture kit. Suggested value is a combination of vendor, kit name, and kit version. 
+ +```JSON +{ + "type": "read_group", + "submitter_id": "Blood-00001-aliquot_lane1_barcodeACGTAC_55", + "experiment_name": "Resequencing", + "is_paired_end": true, + "library_name": "Solexa-34688", + "library_strategy": "WXS", + "platform": "Illumina", + "read_group_name": "205DD.3-2", + "read_length": 75, + "sequencing_center": "BI", + "library_selection": "Hybrid Selection", + "target_capture_kit": "Custom MSK IMPACT Panel - 468 Genes", + "aliquots": + { + "submitter_id": "Blood-00021-aliquot55" + } +} + +``` +```TSV +type submitter_id experiment_name is_paired_end library_name library_selection library_strategy platform read_group_name read_length sequencing_center target_capture_kit aliquots.submitter_id +read_group Blood-00001-aliquot_lane1_barcodeACGTAC_55 Resequencing true Solexa-34688 Hybrid Selection WXS Illumina 205DD.3-2 75 BI Custom MSK IMPACT Panel - 468 Genes Blood-00021-aliquot55 +``` + +>__Note:__ Submitting a biospecimen entity uses the same conventions as submitting a `case` entity (detailed above). + +# Experiment Data Submission + +Several types of experiment data can be uploaded to the GDC. The `submitted_aligned_reads` and `submitted_unaligned_reads` files are associated with the `read_group` entity, while the array-based files such as the `submitted_tangent_copy_number` are associated with the `aliquot` entity. Each of these file types are described in their respective entity submission and are uploaded separately using the [GDC API](https://docs.gdc.cancer.gov/API/Users_Guide/Getting_Started/) or the [GDC Data Transfer Tool](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool). + +[![GDC Data Model 6](images/GDC-Data-Model-Reads.png)](images/GDC-Data-Model-Reads.png "Click to see the full image.") + +Before the experiment data file can be submitted, the GDC requires that the user provides information about the file as a `submittable_data_file` entity. 
This includes file-specific data needed to validate the file and assess which analyses should be performed. Sequencing data files can be submitted as `submitted_aligned_reads` or `submitted_unaligned_reads`. + +Submitting a [__Submitted Aligned-Reads__](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=submitted_aligned_reads) ([__Submitted Unaligned-Reads__](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=submitted_unaligned_reads)) entity requires: + +* __`submitter_id`:__ A unique key to identify the `submitted_aligned_reads`. +* __`read_groups.submitter_id`:__ The unique key that was used for the `read_group` that links the `submitted_aligned_reads` to the `read_group`. +* __`data_category`:__ Broad categorization of the contents of the data file. +* __`data_format`:__ Format of the data files. +* __`data_type`:__ Specific content type of the data file. (must be "Aligned Reads"). +* __`experimental_strategy`:__ The sequencing strategy used to generate the data file. +* __`file_name`:__ The name (or part of a name) of a file (of any type). +* __`file_size`:__ The size of the data file (object) in bytes. +* __`md5sum`:__ The 128-bit hash value expressed as a 32 digit hexadecimal number used as a file's digital fingerprint. 
+ + +```JSON +{ + "type": "submitted_aligned_reads", + "submitter_id": "Blood-00001-aliquot_lane1_barcodeACGTAC_55.bam", + "data_category": "Raw Sequencing Data", + "data_format": "BAM", + "data_type": "Aligned Reads", + "experimental_strategy": "WGS", + "file_name": "test.bam", + "file_size": 38, + "md5sum": "aa6e82d11ccd8452f813a15a6d84faf1", + "read_groups": [ + { + "submitter_id": "Primary_Tumor_RG_86-1" + } + ] +} +``` +```TSV +type submitter_id data_category data_format data_type experimental_strategy file_name file_size md5sum read_groups.submitter_id#1 +submitted_aligned_reads Blood-00001-aliquot_lane1_barcodeACGTAC_55.bam Raw Sequencing Data BAM Aligned Reads WGS test.bam 38 aa6e82d11ccd8452f813a15a6d84faf1 Primary_Tumor_RG_86-1 +``` + +>__Note:__ For details on submitting experiment data associated with more than one `read_group` entity, see the [Tips for Complex Submissions](#submitting-complex-data-model-relationships) section. + +## Uploading the Submittable Data File to the GDC + +The submittable data file can be uploaded when it is registered with the GDC. A submittable data file is registered when its corresponding entity (e.g. `submitted_unaligned_reads`) is uploaded and committed. It is important to note that the Harmonization process does not occur on these submitted files until the user clicks the [`Request Submission`](Data_Submission_Process.md#release) button. Uploading the file can be performed with either the [GDC Data Transfer Tool](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool) or the [GDC API](https://docs.gdc.cancer.gov/API/Users_Guide/Getting_Started/). Other types of data files such as clinical supplements, biospecimen supplements, and pathology reports are uploaded to the GDC in the same way. Supported data file formats are listed at the GDC [Submitted Data Types and File Formats](https://gdc.cancer.gov/about-data/data-types-and-file-formats/submitted-data-types-and-file-formats) website. 
+ +__GDC Data Transfer Tool:__ A file can be uploaded using its UUID (which can be retrieved from the GDC Submission Portal or API) once it is registered. + +[![UUID Location](images/GDC_Submission_UUID_location.png)](images/GDC_Submission_UUID_location.png "Click to see the full image.") + +The following command can be used to upload the file: + +```Shell +gdc-client upload --project-id PROJECT-INTERNAL --identifier a053fad1-adc9-4f2d-8632-923579128985 -t $token -f $path_to_file +``` + +Additionally a manifest can be downloaded from the Submission Portal and passed to the Data Transfer Tool. This will allow for the upload of more than one `submittable_data_file`: + +```Shell +gdc-client upload -m manifest.yml -t $token +``` +__API Upload:__ A `submittable_data_file` can be uploaded through the API by using the `/submission/$PROGRAM/$PROJECT/files` endpoint. The following command would be typically used to upload a file: + +```Shell +curl --request PUT --header "X-Auth-Token: $token" https://api.gdc.cancer.gov/v0/submission/PROJECT/INTERNAL/files/6d45f2a0-8161-42e3-97e6-e058ac18f3f3 -d $path_to_file + +``` + +For more details on how to upload a `submittable_data_file` to a project see the [API Users Guide](https://docs.gdc.cancer.gov/API/Users_Guide/Submission/) and the [Data Transfer Tool Users Guide](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload/). + +## Annotation Submission + +The GDC Data Portal supports the use of annotations for any submitted entity or file. An annotation entity may include comments about why particular patients or samples are not present or why they may exhibit critical differences from others. Annotations include information that cannot be submitted to the GDC through other existing nodes or properties. + +If a submitter would like to create an annotation, please contact the GDC Support Team (support@nci-gdc.datacommons.io). 
+ +## Deleting Submitted Entities + +The GDC Data Submission Portal allows users to delete submitted entities from the project when the project is in an "OPEN" state. Files cannot be deleted while in the "SUBMITTED" state. This section applies to entities that have been committed to the project. Entities that have not been committed can be removed from the project by choosing the `DISCARD` button. Entities can also be deleted using the API. See the [API Submission Documentation](https://docs.gdc.cancer.gov/API/Users_Guide/Submission/#deleting-entities) for specific instructions. + +>__NOTE:__ Entities associated with files uploaded to the GDC object store cannot be deleted until the associated file has been deleted. Users must utilize the [GDC Data Transfer Tool](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload/#deleting-previously-uploaded-data) to delete these files first. + +### Simple Deletion + +If an entity was uploaded and has no related entities, it can be deleted from the [Browse](Data_Submission_Process.md#browse) tab. Once the entity to be deleted is selected, choose the `DELETE` button in the right panel under "ACTIONS". + + +[![GDC Delete Unassociated Case](images/GDC-Delete-Case-Unassociated.png)](images/GDC-Delete-Case-Unassociated.png "Click to see the full image.") + + +A message will then appear asking if you are sure about deleting the entity. Choosing the `YES, DELETE` button will remove the entity from the project, whereas choosing the `NO, CANCEL` button will return the user to the previous screen. + + +[![GDC Yes or No](images/GDC-Delete-Sure.png)](images/GDC-Delete-Sure.png "Click to see the full image.") + + +### Deletion with Dependents + +If an entity has related entities, such as a `case` with multiple `samples` and `aliquots`, deletion takes one extra step. 
+ + +[![GDC Delete Associated Case](images/GDC-Delete-Case-Associated.png)](images/GDC-Delete-Case-Associated.png "Click to see the full image.") + + +Follow the [Simple Deletion](Data_Submission_Walkthrough.md#simple-deletion) method until the end. This action will appear in the [Transactions](Data_Submission_Process.md#transactions) tab as "Delete" with a "FAILED" state. + + +[![GDC Delete Failed](images/GDC-Failed-Transaction.png)](images/GDC-Failed-Transaction.png "Click to see the full image.") + + +Choose the failed transaction and the right panel will show the list of entities related to the entity that was going to be deleted. + + +[![GDC Error Related](images/GDC-Error-Related.png)](images/GDC-Error-Related.png "Click to see the full image.") + + +Selecting the `DELETE ALL` button at the bottom of the list will delete all of the related entities, their descendants, and the original entity. + + +### Submitted Data File Deletion + +The [`submittable_data_files`](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-entity-list&anchor=submittable_data_file) that were uploaded erroneously are deleted separately from their associated entity using the GDC Data Transfer Tool. See the section on [Deleting Data Files](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload/#deleting-previously-uploaded-data) in the Data Transfer Tool users guide for specific instructions. + +## Updating Uploaded Entities + +Before harmonization occurs, entities can be modified to update, add, or delete information. These methods are outlined below. + +### Updating or Adding Fields + +Updated or additional fields can be applied to entities by re-uploading them through the GDC Data Submission portal or API. See below for an example of a case upload with a `primary_site` field being added and a `disease_type` field being updated. 
+ +```Before +{ +"type":"case", +"submitter_id":"GDC-INTERNAL-000043", +"projects":{ + "code":"INTERNAL" +}, +"disease_type": "Myomatous Neoplasms" +} +``` +```After +{ +"type":"case", +"submitter_id":"GDC-INTERNAL-000043", +"projects":{ + "code":"INTERNAL" +}, +"disease_type": "Myxomatous Neoplasms", +"primary_site": "Pancreas" +} +``` +__Guidelines:__ + +* The newly uploaded entity must contain the `submitter_id` of the existing entity so that the system updates the correct one. +* All newly updated entities will be validated by the GDC Dictionary. All required fields must be present in the newly updated entity. +* Fields that are not required do not need to be re-uploaded and will remain unchanged in the entity unless they are updated. + +### Deleting Optional Fields + +It may be necessary to delete fields from uploaded entities. This can be performed through the API and can only be applied to optional fields. It also requires the UUID of the entity, which can be retrieved from the submission portal or using a GraphQL query. + +In the example below, the `primary_site` and `disease_type` fields are removed from a `case` entity: + +```Shell +curl --header "X-Auth-Token: $token_string" --request DELETE --header "Content-Type: application/json" "https://api.gdc.cancer.gov/v0/submission/EXAMPLE/PROJECT/entities/7aab7578-34ff-5651-89bb-57aefdc4c4f8?fields=primary_site,disease_type" +``` + +```Before +{ +"type":"case", +"submitter_id":"GDC-INTERNAL-000043", +"projects":{ + "code":"INTERNAL" +}, +"disease_type": "Germ Cell Neoplasms", +"primary_site": "Pancreas" +} +``` +```After +{ +"type":"case", +"submitter_id":"GDC-INTERNAL-000043", +"projects":{ + "code":"INTERNAL" +} +} +``` + +### Versioning +Changes to entities will create versions. For more information on this, please go to [Uploading New Versions of Data Files](https://docs.gdc.cancer.gov/API/Users_Guide/Submission/#uploading-new-versions-of-data-files). 
+ +## Strategies for Submitting in Bulk + +Each submission in the previous sections was broken down by component to demonstrate the GDC Data Model structure. However, the submission of multiple entities at once is supported and encouraged. Here two strategies for submitting data in an efficient manner are discussed. + +### Registering a BAM File: One Step + +Registering a BAM file (or any other type) can be performed in one step by including all of the entities, from `case` to `submitted_aligned_reads`, in one file. See the example below: + +```JSON +[{ + "type": "case", + "submitter_id": "PROJECT-INTERNAL-000055", + "projects": { + "code": "INTERNAL" + } +}, +{ + "type": "sample", + "cases": { + "submitter_id": "PROJECT-INTERNAL-000055" + }, + "sample_type": "Blood Derived Normal", + "submitter_id": "Blood-00001_55" +}, +{ + "type": "portion", + "submitter_id": "Blood-portion-000055", + "samples": { + "submitter_id": "Blood-00001_55" + } +}, +{ + "type": "analyte", + "portions": { + "submitter_id": "Blood-portion-000055" + }, + "analyte_type": "DNA", + "submitter_id": "Blood-analyte-000055" +}, +{ + "type": "aliquot", + "submitter_id": "Blood-00021-aliquot55", + "analytes": { + "submitter_id": "Blood-analyte-000055" + } +}, +{ + "type": "read_group", + "submitter_id": "Blood-00001-aliquot_lane1_barcodeACGTAC_55", + "experiment_name": "Resequencing", + "is_paired_end": true, + "library_name": "Solexa-34688", + "library_selection":"Hybrid Selection", + "library_strategy": "WXS", + "platform": "Illumina", + "read_group_name": "205DD.3-2", + "read_length": 75, + "sequencing_center": "BI", + "aliquots": + { + "submitter_id": "Blood-00021-aliquot55" + } +}, +{ + "type": "submitted_aligned_reads", + "submitter_id": "Blood-00001-aliquot_lane1_barcodeACGTAC_55.bam", + "data_category": "Raw Sequencing Data", + "data_format": "BAM", + "data_type": "Aligned Reads", + "experimental_strategy": "WGS", + "file_name": "test.bam", + "file_size": 38, + "md5sum": 
"aa6e82d11ccd8452f813a15a6d84faf1", + "read_groups": [ + { + "submitter_id": "Blood-00001-aliquot_lane1_barcodeACGTAC_55" + } + ] +}] +``` + +All of the entities are placed into a JSON list object: + +`[{"type": "case","submitter_id": "PROJECT-INTERNAL-000055","projects": {"code": "INTERNAL"}}, entity-2, entity-3]` + +The entities need not be in any particular order as they are validated together. + +>__Note:__ Tab-delimited format is not recommended for 'one-step' submissions due to an inability of the format to accommodate multiple 'types' in one row. + +### Submitting Numerous Cases + +The GDC understands that submitters will have projects that comprise more entities than would be reasonable to individually parse into JSON formatted files. Additionally, many investigators store large amounts of data in a tab-delimited format (TSV). For instances like this, we recommend parsing all entities of the same type into separate TSVs and submitting them on a type-basis. + +For example, a user may want to submit 100 Cases associated with 100 `samples`, 100 `portions`, 100 `analytes`, 100 `aliquots`, and 100 `read_groups`. Constructing and submitting 100 JSON files would be tedious and difficult to organize. The solution is submitting one `case` TSV containing the 100 `cases`, one `sample` TSV containing the 100 `samples`, so on and so forth. Doing this would only require six TSVs and these files can be formatted in programs such as Microsoft Excel or Google Spreadsheets. + +See the following example TSV files: + +* [Cases.tsv](Cases.tsv) +* [Samples.tsv](Samples.tsv) +* [Portions.tsv](Portions.tsv) +* [Analytes.tsv](Analytes.tsv) +* [Aliquots.tsv](Aliquots.tsv) +* [Read-Groups.tsv](Readgroups.tsv) + +### Download Previously Uploaded Metadata Files + +The [transaction](Data_Submission_Process.md#transactions) page lists all previous transactions in the project.
The user can download metadata files uploaded to the GDC workspace in the details section of the screen by selecting one transaction and scrolling to the "DOCUMENTS" section. + + +[![Transaction Original Files](images/GDC_Submission_Transactions_Original_Files_2.png)](images/GDC_Submission_Transactions_Original_Files_2.png "Click to see the full image.") + +### Download Previously Uploaded Data Files + +The only supported method to download data files previously uploaded to the GDC Submission Portal that have not been released yet is to use the API or the [Data Transfer Tool](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Getting_Started/). To retrieve data previously uploaded to the submission portal you will need to retrieve the data file's UUID. The UUIDs for submitted data files are located in the submission portal under the file's Summary section as well as the manifest file located on the file's Summary page. + +[![Submission Portal Summary View](images/gdc-submission__image2_submission_UUID.png)](images/gdc-submission__image2_submission_UUID.png "Click to see the full image.") + +Once the UUID(s) have been retrieved, the download process is the same as it is for downloading data files at the [GDC Portal using UUIDs](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload/#downloading-data-using-gdc-file-uuids). + + >__Note:__ When submittable data files are uploaded through the Data Transfer Tool they are not displayed as transactions.
diff --git a/docs/Data_Submission_Portal/Users_Guide/Getting_Started.md b/docs/Data_Submission_Portal/Users_Guide/Getting_Started.md deleted file mode 100644 index 5de77ef20..000000000 --- a/docs/Data_Submission_Portal/Users_Guide/Getting_Started.md +++ /dev/null @@ -1,66 +0,0 @@ -# Getting Started - -## Overview - -The National Cancer Institute (NCI) Genomic Data Commons (GDC) Data Submission Portal User's Guide is the companion documentation for the [GDC Data Submission Portal](https://gdc.cancer.gov/submit-data/gdc-data-submission-portal) and provides detailed information and instructions for its use. - -The GDC Data Submission Portal is a platform that allows researchers to submit and release data to the GDC. The key features of the GDC Data Submission Portal are: - -* __Upload and Validate Data__: Project data can be uploaded to the GDC project workspace. The GDC will validate the data against the [GDC Data Dictionary](https://gdc-docs.nci.nih.gov/Data_Dictionary/). -* __Review and Submit Data__: Prior to submission, data can be reviewed to check for accuracy. Once the review is complete, the data can be submitted to the GDC for processing through [Data Harmonization](https://gdc.cancer.gov/submit-data/gdc-data-harmonization). -* __Release Data__: After harmonization, data can be released to the research community for access through [GDC Data Access Tools](https://gdc.cancer.gov/access-data/data-access-processes-and-tools). -* __Download Data__: Data that has been uploaded into the project workspace can be downloaded for review or update. Data can then be re-uploaded before it is released for access through [GDC Data Access Tools](https://gdc.cancer.gov/access-data/data-access-processes-and-tools). -* __Browse Data__: Data that has been uploaded to the project workspace can be browsed to ensure that the project is ready for processing. -* __Status and Alerts__: Visual cues are implemented to easily identify incomplete submissions. 
- - -## Key Features - -### Upload and Validate Data -To submit data to the GDC, the user will prepare the data and upload it to the project workspace. - -The main categories of data that can be uploaded include: - -* __Clinical Data__: Elements such as `gender`, `age`, `diagnosis`, etc. as defined in the GDC Data Dictionary. -* __Biospecimen Data__: Information about entities such as `samples`, `aliquots`, etc. as defined in the GDC Data Dictionary. -* __Submittable Data Files__: Sequencing data such as BAM and FASTQ files, slide images, and other experimental data collected by the study. - -The [GDC Data Dictionary Viewer](../../Data_Dictionary/viewer.md) outlines the minimum field requirements for each of the three categories listed above. - -### Review and Submit Data - -Once data is uploaded to the project workspace, it can be reviewed to ensure that the data is ready for processing through the [GDC Harmonization Process](https://gdc.cancer.gov/submit-data/gdc-data-harmonization). The review will lock the project to ensure that additional data cannot be uploaded while in review. During this period the data can be browsed or downloaded in the Data Submission Portal. - -If the project is ready for processing, data can be submitted to the GDC. If the project is not ready for processing, the project can be re-opened. This will allow for additional data to be uploaded to the project workspace. - -### Release Data - -The GDC will release data according to [GDC data sharing policies](https://gdc.cancer.gov/submit-data/data-submission-policies). Data may be released after six months from the date of upload, or the submitter may request earlier release using the "Request Release" function. - -Upon release, harmonized data will be available to GDC users through the [GDC Data Portal](https://portal.gdc.cancer.gov/) and other [GDC Data Access Tools](https://gdc.cancer.gov/access-data/data-access-processes-and-tools). 
- - -### Redaction - -Data uploaded to the GDC can be updated before it is submitted for processing and harmonization. After harmonized data is released, it can only be redacted by GDC administrators under certain conditions. To request redaction of released data, please contact [GDC User Services](https://gdc.cancer.gov/support#gdc-help-desk). - -### Browse and Download Data - -Authorized submitters can browse and retrieve data submitted to their project using the Data Submission Portal. Retrieval of data submitted to the submission portal can be accomplished by using the API or the Data Transfer Tool. UUIDs of submitted files can be retrieved from the submission portal or with a [GraphQL](https://docs.gdc.cancer.gov/API/Users_Guide/Submission/#querying-submitted-data-using-graphql) query. Please see the [API](https://docs.gdc.cancer.gov/API/Users_Guide/Downloading_Files/) documentation for more information about downloads. - - -### Status and Alerts - -The GDC Data Submission Portal Dashboard and navigation panel displays a summary of submitted data and associated data elements, such as the number of cases with Clinical data or Biospecimen data. - -### Transactions - -Submitters can access a list of all actions performed in a project by clicking on the Transactions tab on the dashboard. This will display a list of all past transactions for the selected project. Users can access details about each transaction. The most recent transactions are also displayed on the dashboard. - -### Submission Project Examples - -Step-by-step instructions on GDC data submission and their relationship to the GDC Data Model are detailed in the [Upload Data](Data_Upload_UG.md) guide. - -## Release Notes - -The [Release Notes](../../Data_Submission_Portal/Release_Notes/Data_Submission_Portal_Release_Notes.md) section of this User's Guide contains details about new features, bug fixes, and known issues. 
diff --git a/docs/Data_Submission_Portal/Users_Guide/Pre_Release_QC.md b/docs/Data_Submission_Portal/Users_Guide/Pre_Release_QC.md index 291cc668a..0996e4c69 100644 --- a/docs/Data_Submission_Portal/Users_Guide/Pre_Release_QC.md +++ b/docs/Data_Submission_Portal/Users_Guide/Pre_Release_QC.md @@ -1,90 +1,44 @@ # Pre-Release Data Portal - -## Getting Started - - -### The GDC Pre-Release Data Portal: An Overview - -The Genomic Data Commons (GDC) Portal provides users with web-based access to pre-released data from cancer genomics studies that have been harmonized by the GDC, but not yet released in the main GDC Data Portal. Key GDC Pre-Release Data Portal features include: - -* Access to data prior to release on the GDC Data Portal. -* Repository page for browsing data by project / file / case -* File / case faceted searches to filter data -* Cart for collecting data files of interest -* Authentication using eRA Commons credentials for access to controlled-access data files -* Secure data download directly from the cart or using the [GDC Data Transfer Tool](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool) -* Use of API for query and download - - - +The [GDC Pre-Release Data Portal](https://portal.awg.gdc.cancer.gov/) provides users with web-based access to pre-released data from cancer genomics studies that have been harmonized by the GDC, but not yet released in the main GDC Data Portal. ## Navigation +[Pre-Release Data Portal](https://portal.awg.gdc.cancer.gov/) will appear similar to the GDC Active Portal, but the Pre-Release Data Portal features are a subset of what can be found in the GDC Data Portal. -Pre-Release Data Portal features are a subset of what can be found in the GDC Data Portal. For more information on any of these general features please review the [Data Portal User Guide](/Data_Portal/Users_Guide/Getting_Started/#navigation). 
- -[![GDC Views](images/AWG_Portal.png)](images/WG_Portal.png "Click to see the full image.") - - +[![GDC Views](images/AWG_Portal.png)](images/AWG_Portal.png "Click to see the full image.") +For more information on any of these general features please review the [GDC Data Portal User Guide](/Data_Portal/Users_Guide/Getting_Started/#navigation). ## Authentication -### Overview - -The GDC Pre-Release Data Portal provides access to datasets prior to release to a group of users specified by the data submitter. This area is only available to data submitters (or their designees) for reviewing pre-release data. Users must be granted access as specified in the admin portal section and also have downloader access within dbGaP for the specified project. - -### GDC Authentication Tokens - -The GDC Pre-Release Data Portal provides authentication tokens for use with the GDC Data Transfer Tool or the GDC API. To download a token: - -1. Log into the GDC using your eRA Commons credentials -2. Click the username in the top right corner of the screen -3. Select the "Download token" option - -![Token Download Button](images/gdc-data-portal-token-download.png) - -A new token is generated each time the `Download Token` button is clicked. - -For more information about authentication tokens, see [Data Security](../../Data/Data_Security/Data_Security.md#authentication-tokens). - -**NOTE:** The authentication token should be kept in a secure location, as it allows access to all data accessible by the associated user account. - ### Relationship between GDC Data Portal and Pre-Release Data Portal Tokens -The tokens used to download files from the GDC Data Portal and Pre-Release Data Portal are related but distinct. Specifically, the token generated in the Pre-Release data portal contains a longer version of the regular GDC Authentication Token downloaded from the GDC Data Portal. 
Because of this, the GDC Data Portal token will not function for downloading data from the Pre-release Data Portal environment using the Data Transfer Tool or API. However, the Pre-Release Data Portal token will function for downloading data from the GDC Data Portal using the API or Data Transfer Tool. Finally, if a new token is generated in the Pre-release Data Portal this will invalidate the token downloaded from the GDC Data Portal and vice versa. - -### Logging Out - -To log out of the GDC, click the username in the top right corner of the screen, and select the Logout option. - -![Logout link](images/gdc-data-portal-token-download.png) +The GDC Pre-Release Data Portal provides access to datasets prior to release to a group of users specified by the data submitter. This area is only available to data submitters (or their designees) for reviewing pre-release data. Users must be granted access as specified in the GDC Pre-Release Data Admin Portal section and have downloader access within dbGaP for the specified project. To learn more about obtaining the required credentials and authorization, see [Obtaining Access to Submit Data]( https://gdc.cancer.gov/submit-data/obtaining-access-submit-data). +The tokens used to download files from the GDC Data Portal and Pre-Release Data Portal are related but distinct. Specifically, the token generated in the Pre-Release Data Portal contains a longer version of the regular GDC Authentication Token downloaded from the GDC Data Portal. Because of this, the GDC Data Portal token will not function for downloading data from the Pre-release Data Portal environment using the Data Transfer Tool or API. However, the Pre-Release Data Portal token will function for downloading data from the GDC Data Portal using the API or Data Transfer Tool. Finally, if a new token is generated in the Pre-release Data Portal this will invalidate the token downloaded from the GDC Data Portal and vice versa. 
## Data Transfer Tool -As with the GDC Data Portal, downloads of large or numerous files is best performed using the GDC Data Transfer Tool. Information on the GDC Data Transfer Tool is available in the [GDC Data Transfer Tool User's Guide](/node/8196/). An important distinction for use with the Pre-Release Data Portal is that it must always be used with a token and with the option `-s https://api.awg.gdc.cancer.gov`. +As with the GDC Data Portal, downloads of large or numerous files is best performed using the GDC Data Transfer Tool. Information on the GDC Data Transfer Tool is available in the [GDC Data Transfer Tool User's Guide](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Getting_Started/). An important distinction for use with the Pre-Release Data Portal is that it must always be used with a token and with the option `-s https://api.awg.gdc.cancer.gov`. ## GDC Pre-Release Data Admin Portal -### Overview - -The GDC Pre-Release Data Admin Portal allows Pre-Release Data Portal admins to create and maintain Pre-Release Data Groups and associated projects, as well as grant appropriate access to users within these groups. To gain access to the Pre-Release Data Admin Portal please contact the GDC Helpdesk (support@nci-gdc.datacommons.io). +The GDC Pre-Release Data Admin Portal allows admins to create and maintain Pre-Release Data Groups and associated projects, as well as grant appropriate access to users within these groups. To gain access to the Pre-Release Data Admin Portal please contact the GDC Helpdesk (support@nci-gdc.datacommons.io). 
[![GDC Pre-Release Data Portal Main Page](images/AWG_Admin.png)](images/AWG_Admin.png "Click to see the full image.") The Pre-Release Data Admin Portal is broken into two views on the left-most panel: -* __Users__: Allows admin to create, view, edit Pre-Release Data Portal user profiles -* __Groups__: Allows admin to manage groups projects / users +* __Users__: Allows admin to create, view, edit Pre-Release Data Portal user profiles. +* __Groups__: Allows admin to manage groups projects / users. #### Definitions | Entity | Definition | |---|---| | __User__ | An individual with an eRA Commons account. | -| __Project__ | A collection of files and observations that are contained in the GDC database and have been registered in dbGAP as a project. Only certain projects are designated as Pre-Release Data projects.| +| __Project__ | A collection of files and observations that are contained in the GDC database and have been registered in dbGaP as a project. Only certain projects are designated as Pre-Release Data projects.| | __Group__ | A collection of users and projects. When a user is assigned to a group, they will have access to the projects in that group when they login to the Pre-Release Data portal as long as they have downloader access to the project in dbGaP.| ### Users @@ -95,15 +49,15 @@ The __Users__ section of the GDC Pre-Release Data Admin portal allows admins to #### Creating Users -To create a new user in the Pre-Release Data Admin Portal, click on the `Create` button on the far right panel. +To create a new user in the Pre-Release Data Admin Portal, click on the `Create` button on the far-right panel. 
[![GDC Pre-Release Data Portal Main Page](images/AWG_Admin_Create_User.png)](images/AWG_Admin_Create_User.png "Click to see the full image.") Then the following information must be supplied, before clicking the `Save` button: -* __eRA Commons ID__: The eRA Commons ID of the user to be added -* __Role__: Choose between `Admin` or `User` roles -* __Group (Optional)__: Choose existing groups to add the user to +* __eRA Commons ID__: The eRA Commons ID of the user to be added. +* __Role__: Choose between `Admin` or `User` roles. +* __Group (Optional)__: Choose existing groups to add the user to. After clicking `Save`, the user should appear in the list of users in the center panel. Also clicking on the user in the list will display information about that user and gives the options to `Edit` the user profile, or `Delete` the user. @@ -117,24 +71,24 @@ The __Groups__ section of the GDC Pre-Release Data Admin portal allows admins to #### Creating Groups -To create a new group in the Pre-Release Data Admin Portal, click on the `Create` button on the far right panel. +To create a new group in the Pre-Release Data Admin Portal, click on the `Create` button on the far-right panel. [![GDC Pre-Release Data Portal Main Page](images/AWG_Admin_Groups_Add.png)](images/AWG_Admin_Groups_Add.png "Click to see the full image.") Then the following information must be supplied, before clicking the `Save` button: -* __Name__: The name of the group -* __Description__: The description of the group -* __Users (Optional)__: Choose existing users to add to the group -* __Projects(Optional)__: Choose existing projects to add to the group +* __Name__: The name of the group. +* __Description__: The description of the group. +* __Users (Optional)__: Choose existing users to add to the group. +* __Projects(Optional)__: Choose existing projects to add to the group. After clicking `Save`, the group should appear in the list of groups in the center panel. 
Also clicking on the group in the list will display information about that group and gives the options to `Edit` or `Delete` the group. [![GDC Pre-Release Data Portal Main Page](images/AWG_Admin_New_Group.png)](images/AWG_Admin_New_Group.png "Click to see the full image.") -## API +## AWG API -API functionality is similar to what is available for the main GDC Data Portal. You can read more about the GDC API in general in the [API User Guide](/API/Users_Guide/Getting_Started/). Important differences for the AWG API include the following: +API functionality is similar to what is available for the main GDC Data Portal. You can read more about the GDC API in general in the [API User Guide](/API/Users_Guide/Getting_Started/). Important differences for the Analysis Working Group (AWG) API include the following: * The base URL is different. Instead use https://api.awg.gdc.cancer.gov/ * An authorization token must always be passed with every query rather than just for downloading controlled access data. 
diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC-HomePage-Submit_v2.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC-HomePage-Submit_v2.png index 56849cc7e..79810d43c 100644 Binary files a/docs/Data_Submission_Portal/Users_Guide/images/GDC-HomePage-Submit_v2.png and b/docs/Data_Submission_Portal/Users_Guide/images/GDC-HomePage-Submit_v2.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Data_Submission_Workflow-updated_20190301.jpg b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Data_Submission_Workflow-updated_20190301.jpg new file mode 100644 index 000000000..51fce4c7c Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Data_Submission_Workflow-updated_20190301.jpg differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_SUBMIT_TO_GDC_v3.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_SUBMIT_TO_GDC_v3.png new file mode 100644 index 000000000..3cd73a01f Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_SUBMIT_TO_GDC_v3.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Cases_Default_2.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Cases_Default_2.png index bc224c45f..5c7bdc784 100644 Binary files a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Cases_Default_2.png and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Cases_Default_2.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Cases_Summary_Download_2.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Cases_Summary_Download_2.png index e172a2f96..ac1bca023 100644 Binary files a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Cases_Summary_Download_2.png and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Cases_Summary_Download_2.png differ diff --git 
a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Dashboard_2.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Dashboard_2.png index 29f8cebd9..dc6357a73 100644 Binary files a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Dashboard_2.png and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Dashboard_2.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Dashboard_3.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Dashboard_3.png index 1fe3ef70b..59c8fa65c 100644 Binary files a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Dashboard_3.png and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Dashboard_3.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Dashboard_4.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Dashboard_4.png new file mode 100644 index 000000000..69ffdbe76 Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Dashboard_4.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Project_State.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Project_State.png index 18683c176..8375522e7 100644 Binary files a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Project_State.png and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Project_State.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Project_State_Review_3.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Project_State_Review_3.png new file mode 100644 index 000000000..ec84b299a Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Project_State_Review_3.png differ diff --git 
a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Project_State_v3.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Project_State_v3.png new file mode 100644 index 000000000..78e6420e8 Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Project_State_v3.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Submit_tab_2_v4.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Submit_tab_2_v4.png new file mode 100644 index 000000000..e4423432c Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Submit_Release_Submit_tab_2_v4.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_2.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_2.png index 5f30a9bb6..7b2c85630 100644 Binary files a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_2.png and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_2.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_Details_2.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_Details_2.png index dc75edf28..2e007b0ba 100644 Binary files a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_Details_2.png and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_Details_2.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_Details_3.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_Details_3.png new file mode 100644 index 000000000..3ad69d70a Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_Details_3.png differ diff --git 
a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_Original_Files_2.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_Original_Files_2.png index 98f405753..8bfefbf78 100644 Binary files a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_Original_Files_2.png and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_Transactions_Original_Files_2.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_UUID_location.png b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_UUID_location.png new file mode 100644 index 000000000..2155101a7 Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/GDC_Submission_UUID_location.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/QC_Report_tab.png b/docs/Data_Submission_Portal/Users_Guide/images/QC_Report_tab.png new file mode 100644 index 000000000..524ebd284 Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/QC_Report_tab.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/SUR_QC_errors.png b/docs/Data_Submission_Portal/Users_Guide/images/SUR_QC_errors.png new file mode 100644 index 000000000..b93fb5d0b Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/SUR_QC_errors.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/Submission.png b/docs/Data_Submission_Portal/Users_Guide/images/Submission.png new file mode 100644 index 000000000..eee369813 Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/Submission.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/Submission_portal_homepage.png b/docs/Data_Submission_Portal/Users_Guide/images/Submission_portal_homepage.png new file mode 100644 index 000000000..bc7857c13 Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/Submission_portal_homepage.png differ diff 
--git a/docs/Data_Submission_Portal/Users_Guide/images/Untitled.png b/docs/Data_Submission_Portal/Users_Guide/images/Untitled.png deleted file mode 100644 index 7dbe428eb..000000000 Binary files a/docs/Data_Submission_Portal/Users_Guide/images/Untitled.png and /dev/null differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-data-upload-workflow.png b/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-data-upload-workflow.png index 4dd08fd1f..81f788499 100644 Binary files a/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-data-upload-workflow.png and b/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-data-upload-workflow.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-data-upload-workflow_2.jpg b/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-data-upload-workflow_2.jpg new file mode 100644 index 000000000..fd641d17d Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-data-upload-workflow_2.jpg differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-data-upload-workflow_2.png b/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-data-upload-workflow_2.png new file mode 100644 index 000000000..cec6b4b9f Binary files /dev/null and b/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-data-upload-workflow_2.png differ diff --git a/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-file-state-vs-state.png b/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-file-state-vs-state.png index 75c10971a..a7a08c50f 100644 Binary files a/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-file-state-vs-state.png and b/docs/Data_Submission_Portal/Users_Guide/images/gdc-submission-portal-file-state-vs-state.png differ diff --git 
a/docs/Data_Transfer_Tool/Release_Notes/DTT_Release_Notes.md b/docs/Data_Transfer_Tool/Release_Notes/DTT_Release_Notes.md index c5efa4685..0fb0ea7fd 100644 --- a/docs/Data_Transfer_Tool/Release_Notes/DTT_Release_Notes.md +++ b/docs/Data_Transfer_Tool/Release_Notes/DTT_Release_Notes.md @@ -2,12 +2,96 @@ | Version | Date | |---|---| +| [v1.6.1](DTT_Release_Notes.md#v161) | May 17, 2021 | +| [v1.6.0](DTT_Release_Notes.md#v160) | July 8, 2020 | +| [v1.5.0](DTT_Release_Notes.md#v150) | January 30, 2020 | +| [v1.4.0](DTT_Release_Notes.md#v140) | December 18, 2018 | | [v1.3.0](DTT_Release_Notes.md#v130) | August 22, 2017 | | [v1.2.0](DTT_Release_Notes.md#v120) | Oct 31, 2016 | | [v1.1.0](DTT_Release_Notes.md#v110) | September 7, 2016 | | [v1.0.1](DTT_Release_Notes.md#v101) | June 2, 2016 | | [v1.0.0](DTT_Release_Notes.md#v100) | May 26, 2016 | +## V1.6.1 +* __GDC Product__: Data Transfer Tool +* __Release Date__: May 17, 2021 + +### New Features and Changes +* None + +### Bugs Fixed Since Last Release +* Fixed issue with resuming large file downloads. +* Fixed issue with error reporting. +* Improved multi-part file upload. + +### Known Issues and Workarounds +* Use of non-ASCII characters in token passed to Data Transfer Tool will produce incorrect error message "Internal server error: Auth service temporarily unavailable". +* On some terminals, dragging and dropping a file into the interactive client will add single quotes (' ') around the file path. This causes the interactive client to misinterpret the file path and generate an error when attempting to load a manifest file or token. + * *Workaround:* Manually type out the file name or remove the single quotes from around the file path. +* When any files mentioned in the upload manifest are not present in the upload directory the submission will hang at the missing file. 
+ * *Workaround:* Edit the manifest to specify only the files that are present in the upload directory for submission or copy the missing files into the upload directory. + + +## V1.6.0 +* __GDC Product__: Data Transfer Tool +* __Release Date__: July 8, 2020 + +### New Features and Changes +* None + +### Bugs Fixed Since Last Release +* Fixed issue with file upload requiring a manifest +* Restored multi-part upload feature +* Fixed error reporting issue + +### Known Issues and Workarounds +* Use of non-ASCII characters in token passed to Data Transfer Tool will produce incorrect error message "Internal server error: Auth service temporarily unavailable". +* On some terminals, dragging and dropping a file into the interactive client will add single quotes (' ') around the file path. This causes the interactive client to misinterpret the file path and generate an error when attempting to load a manifest file or token. + * *Workaround:* Manually type out the file name or remove the single quotes from around the file path. +* When any files mentioned in the upload manifest are not present in the upload directory the submission will hang at the missing file. + * *Workaround:* Edit the manifest to specify only the files that are present in the upload directory for submission or copy the missing files into the upload directory. + +## V1.5.0 +* __GDC Product__: Data Transfer Tool +* __Release Date__: January 30, 2020 + +### New Features and Changes +* Data transfer tool code now uses Python 3. + +### Bugs Fixed Since Last Release +* Problems with downloading associated annotations is fixed. + +### Known Issues and Workarounds +* Use of non-ASCII characters in token passed to Data Transfer Tool will produce incorrect error message "Internal server error: Auth service temporarily unavailable". +* On some terminals, dragging and dropping a file into the interactive client will add single quotes (' ') around the file path.
This causes the interactive client to misinterpret the file path and generate an error when attempting to load a manifest file or token. + * *Workaround:* Manually type out the file name or remove the single quotes from around the file path. +* When any files mentioned in the upload manifest are not present in the upload directory the submission will hang at the missing file. + * *Workaround:* Edit the manifest to specify only the files that are present in the upload directory for submission or copy the missing files into the upload directory. + + +## V1.4.0 +* __GDC Product__: Data Transfer Tool +* __Release Date__: December 18, 2018 + +### New Features and Changes +* Enabled download latest file version feature +* Removal of Interactive mode +* Enabled display of all default settings +* Standardized upload and download help menus + +### Bugs Fixed Since Last Release +* Download flag --no-related-files bug preventing file downloads fixed +* File name handling with forward slashes bug fixed +* Download flag --no-segment-md5sums bug fixed. + +### Known Issues and Workarounds +* Use of non-ASCII characters in token passed to Data Transfer Tool will produce incorrect error message "Internal server error: Auth service temporarily unavailable". +* On some terminals, dragging and dropping a file into the interactive client will add single quotes (' ') around the file path. This causes the interactive client to misinterpret the file path and generate an error when attempting to load a manifest file or token. + * *Workaround:* Manually type out the file name or remove the single quotes from around the file path. +* When any files mentioned in the upload manifest are not present in the upload directory the submission will hang at the missing file. + * *Workaround:* Edit the manifest to specify only the files that are present in the upload directory for submission or copy the missing files into the upload directory.
+ + ## v1.3.0 * __GDC Product__: Data Transfer Tool * __Release Date__: August 22, 2017 @@ -103,12 +187,6 @@ * On some terminals, dragging and dropping a file into the interactive client will add single quotes (' ') around the file path. This causes the interactive client to misinterpret the file path and generate an error when attempting to load a manifest file or token. * *Workaround:* Manually type out the file name or remove the single quotes from around the file path. - - - - - - ## v1.0.0 * __GDC Product__: Data Transfer Tool diff --git a/docs/Data_Transfer_Tool/Release_Notes/DTT_UI_Release_Notes.md b/docs/Data_Transfer_Tool/Release_Notes/DTT_UI_Release_Notes.md index d164cf775..0fd85c2b0 100644 --- a/docs/Data_Transfer_Tool/Release_Notes/DTT_UI_Release_Notes.md +++ b/docs/Data_Transfer_Tool/Release_Notes/DTT_UI_Release_Notes.md @@ -3,15 +3,32 @@ | Version | Date | |---|---| +| [v0.6.0](DTT_UI_Release_Notes.md#v060) | August 12, 2020 | | [v0.5.4](DTT_UI_Release_Notes.md#v054) | April 5, 2018 | | [v0.5.3](DTT_UI_Release_Notes.md#v053) | December 14, 2017 | +## v0.6.0 +* __GDC Product__: Data Transfer Tool UI +* __Release Date__: August 12, 2020 + +### New Features and Changes +* The server option can now be indicated within the interface. Previously the DTT-UI server defaulted to `api.gdc.cancer.gov` +* The DTT-UI now uses Data Transfer Tool v1.6, which uses Python3. + +### Bugs Fixed Since Last Release +* None + +### Known Issues and Workarounds +* Download speeds for large numbers of small files may be better handled with the Command Line version of the Data Transfer Tool +* Data Submission to the GDC is not supported in the Data Transfer Tool UI.
Instead users must use the Command Line Data Transfer Tool + + + ## v0.5.4 * __GDC Product__: Data Transfer Tool UI * __Release Date__: April 5, 2018 - ### New Features and Changes * None diff --git a/docs/Data_Transfer_Tool/Users_Guide/Accessing_Built-in_Help.md b/docs/Data_Transfer_Tool/Users_Guide/Accessing_Built-in_Help.md index 7eaf8d739..42df8861f 100644 --- a/docs/Data_Transfer_Tool/Users_Guide/Accessing_Built-in_Help.md +++ b/docs/Data_Transfer_Tool/Users_Guide/Accessing_Built-in_Help.md @@ -10,7 +10,7 @@ The GDC Data Transfer Tool comes with built-in help menus. These menus are displ gdc-client --help ``` ``` Output -usage: gdc-client [-h] [--version] {download,upload,interactive} ... +usage: gdc-client [-h] [--version] {download,upload,settings} ... The Genomic Data Commons Command Line Client @@ -19,11 +19,11 @@ optional arguments: --version show program's version number and exit commands: - {download,upload,interactive} + {download,upload,settings} for more information, specify -h after a command download download data from the GDC upload upload data to the GDC - interactive run in interactive mode + settings display default settings ``` The available menus are provided below. @@ -36,7 +36,7 @@ The GDC Data Transfer Tool displays the following output when executed without a gdc-client ``` ```Output -usage: gdc-client [-h] [--version] {download,upload,interactive} ... +usage: gdc-client [-h] [--version] {download,upload,settings} ... 
gdc-client: error: too few arguments ``` @@ -49,53 +49,64 @@ The GDC Data Transfer Tool displays the following help menu for its download fun gdc-client download --help ``` ```Output -usage: gdc-client download [-h] [--debug] [--log-file LOG_FILE] - [-t TOKEN_FILE] [-d DIR] [-s server] - [--no-segment-md5sums] [--no-file-md5sum] - [-n N_PROCESSES] - [--http-chunk-size HTTP_CHUNK_SIZE] - [--save-interval SAVE_INTERVAL] - [--no-verify] [--no-related-files] - [--no-annotations] [--no-auto-retry] - [--retry-amount RETRY_AMOUNT] - [--wait-time WAIT_TIME] [-u] [-m MANIFEST] - [file_id [file_id ...]] +usage: gdc-client download [-h] [--debug] + [--log-file LOG_FILE] + [--color_off] [-t TOKEN_FILE] + [-d DIR] [-s server] + [--no-segment-md5sums] + [--no-file-md5sum] + [-n N_PROCESSES] + [--http-chunk-size HTTP_CHUNK_SIZE] + [--save-interval SAVE_INTERVAL] + [--no-verify] + [--no-related-files] + [--no-annotations] + [--no-auto-retry] + [--retry-amount RETRY_AMOUNT] + [--wait-time WAIT_TIME] + [--latest] [--config FILE] [-u] + [-m MANIFEST] + [file_id [file_id ...]] positional arguments: - file_id The GDC UUID of the file(s) to download +file_id The GDC UUID of the file(s) to download optional arguments: - -h, --help show this help message and exit - --debug Enable debug logging. If a failure occurs, the program - will stop. - --log-file LOG_FILE Save logs to file. Amount logged affected by --debug - -t TOKEN_FILE, --token-file TOKEN_FILE - GDC API auth token file - -d DIR, --dir DIR Directory to download files to. Defaults to current dir - -s server, --server server +-h, --help show this help message and exit +--debug Enable debug logging. If a failure occurs, the program + will stop. +--log-file LOG_FILE Save logs to file. Amount logged affected by --debug +--color_off Disable colored output +-t TOKEN_FILE, --token-file TOKEN_FILE + GDC API auth token file +-d DIR, --dir DIR Directory to download files to. 
Defaults to current + dir +-s server, --server server The TCP server address server[:port] - --no-segment-md5sums Do not calculate inbound segment md5sumsand/or do not +--no-segment-md5sums Do not calculate inbound segment md5sums and/or do not verify md5sums on restart - --no-file-md5sum Do not verify file md5sum after download - -n N_PROCESSES, --n-processes N_PROCESSES - Number of client connections. - --http-chunk-size HTTP_CHUNK_SIZE - Size in bytes of standard HTTP block size. - --save-interval SAVE_INTERVAL - The number of chunks after which to flush state file. - A lower save interval will result in more frequent - printout but lower performance. - --no-verify Perform insecure SSL connection and transfer - --no-related-files Do not download related files. - --no-annotations Do not download annotations. - --no-auto-retry Ask before retrying to download a file - --retry-amount RETRY_AMOUNT - Number of times to retry a download - --wait-time WAIT_TIME - Amount of seconds to wait before retrying - -u, --udt Use the UDT protocol. - -m MANIFEST, --manifest MANIFEST - GDC download manifest file +--no-file-md5sum Do not verify file md5sum after download +-n N_PROCESSES, --n-processes N_PROCESSES + Number of client connections. +--http-chunk-size HTTP_CHUNK_SIZE, -c HTTP_CHUNK_SIZE + Size in bytes of standard HTTP block size. +--save-interval SAVE_INTERVAL + The number of chunks after which to flush state file. + A lower save interval will result in more frequent + printout but lower performance. +--no-verify Perform insecure SSL connection and transfer +--no-related-files Do not download related files. +--no-annotations Do not download annotations. 
+--no-auto-retry Ask before retrying to download a file +--retry-amount RETRY_AMOUNT + Number of times to retry a download +--wait-time WAIT_TIME + Amount of seconds to wait before retrying +--latest Download latest version of a file if it exists +--config FILE Path to INI-type config file +-u, --udt Use the UDT protocol. +-m MANIFEST, --manifest MANIFEST + GDC download manifest file ``` ### Upload help menu @@ -107,38 +118,51 @@ The GDC Data Transfer Tool displays the following help menu for its upload funct gdc-client upload --help ``` ```Output -usage: gdc-client upload [-h] [--debug] [-v] [--log-file LOG_FILE] - [-T TOKEN | -t TOKEN] [-H HOST] [-P PORT] - [--project-id PROJECT_ID] [--identifier IDENTIFIER] - [--path path] [--upload-id UPLOAD_ID] [--insecure] - [--server SERVER] [--part-size PART_SIZE] - [-n N_PROCESSES] [--disable-multipart] [--abort] - [--resume] [--delete] [--manifest MANIFEST] +usage: gdc-client upload [-h] [--debug] + [--log-file LOG_FILE] + [--color_off] [-t TOKEN_FILE] + [--project-id PROJECT_ID] + [--path path] + [--upload-id UPLOAD_ID] + [--insecure] [--server SERVER] + [--part-size PART_SIZE] + [--upload-part-size UPLOAD_PART_SIZE] + [-n N_PROCESSES] + [--disable-multipart] [--abort] + [--resume] [--delete] + [--manifest MANIFEST] + [--config FILE] + [file_id [file_id ...]] +positional arguments: + file_id The GDC UUID of the file(s) to upload optional arguments: - -h, --help show this help message and exit - --debug Enable debug logging. If a failure occurs, the program - will stop. - --log-file LOG_FILE Save logs to file. 
Amount logged affected by --debug - -t TOKEN_FILE, --token-file TOKEN_FILE - GDC API auth token file - --project-id PROJECT_ID, -p PROJECT_ID - The project ID that owns the file - --path path, -f path directory path to find file - --upload-id UPLOAD_ID, -u UPLOAD_ID - Multipart upload id - --insecure, -k Allow connections to server without certs - --server SERVER, -s SERVER - GDC API server address - --part-size PART_SIZE, -ps PART_SIZE - Part size for multipart upload - -n N_PROCESSES, --n-processes N_PROCESSES - Number of client connections - --disable-multipart Disable multipart upload - --abort Abort previous multipart upload - --resume, -r Resume previous multipart upload - --delete Delete an uploaded file - --manifest MANIFEST, -m MANIFEST - Manifest which describes files to be uploaded ---identifier, -i DEPRECATED + -h, --help show this help message and exit + --debug Enable debug logging. If a failure occurs, the program + will stop. + --log-file LOG_FILE Save logs to file. Amount logged affected by --debug + --color_off Disable colored output + -t TOKEN_FILE, --token-file TOKEN_FILE + GDC API auth token file + --project-id PROJECT_ID, -p PROJECT_ID + The project ID that owns the file + --path path, -f path directory path to find file + --upload-id UPLOAD_ID, -u UPLOAD_ID + Multipart upload id + --insecure, -k Allow connections to server without certs + --server SERVER, -s SERVER + GDC API server address + --part-size PART_SIZE + DEPRECATED in favor of [--upload-part-size] + --upload-part-size UPLOAD_PART_SIZE, -c UPLOAD_PART_SIZE + Part size for multipart upload + -n N_PROCESSES, --n-processes N_PROCESSES + Number of client connections + --disable-multipart Disable multipart upload + --abort Abort previous multipart upload + --resume, -r Resume previous multipart upload + --delete Delete an uploaded file + --manifest MANIFEST, -m MANIFEST + Manifest which describes files to be uploaded + --config FILE Path to INI-type config file ``` diff --git 
a/docs/Data_Transfer_Tool/Users_Guide/Appendix_A_-_Config_File.md b/docs/Data_Transfer_Tool/Users_Guide/Appendix_A_-_Config_File.md new file mode 100644 index 000000000..ecf6851a9 --- /dev/null +++ b/docs/Data_Transfer_Tool/Users_Guide/Appendix_A_-_Config_File.md @@ -0,0 +1,40 @@ +### Data Transfer Tool Configuration File +The DTT has the ability to save and reuse configuration parameters in the format of a flat text file via a command line argument. A simple text file needs to be created first with an extension of either txt or dtt. The supported section headers are upload and download which can be used independently of each other or used in the same configuration file. Each section header corresponds to the main functions of the application which are to either download data from the GDC portals or to upload data to the submission system of the GDC. The configurable parameters are those listed in the help menus under either [download](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Accessing_Built-in_Help/#download-help-menu) or [upload](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Accessing_Built-in_Help/#upload-help-menu). + + +Example usage: + + gdc-client download d45ec02b-13c3-4afa-822d-443ccd3795ca --config my-dtt-config.dtt + +Example of configuration file: + + [upload] + path = /some/upload/path + upload_part_size = 1073741824 + + + [download] + dir = /some/download/path + http_chunk_size = 2048 + retry_amount = 6 + + +### Display Config Parameters +This command line flag can be used with either the download or upload application feature to display what settings are active within a custom data transfer tool configuration file.
+ gdc-client settings download --config my-dtt-config.dtt + [download] + no_auto_retry = False + no_file_md5sum = False + save_interval = 1073741824 + http_chunk_size = 2048 + server = http://example-site.com + n_processes = 8 + no_annotations = False + no_related_files = False + retry_amount = 6 + no_segment_md5sum = False + manifest = [] + wait_time = 5.0 + no_verify = True + dir = /some/download/path diff --git a/docs/Data_Transfer_Tool/Users_Guide/Appendix_A_-_Key_Terms.md b/docs/Data_Transfer_Tool/Users_Guide/Appendix_A_-_Key_Terms.md deleted file mode 100644 index 61a7e1e16..000000000 --- a/docs/Data_Transfer_Tool/Users_Guide/Appendix_A_-_Key_Terms.md +++ /dev/null @@ -1,13 +0,0 @@ -The following table provides definitions and explanations for terms and acronyms relevant to the content presented within this document. - -| Term | Definition | -|-------|--------------------------------------------------| -| eRA | Electronic Research Administration | -| GDC | Genomic Data Commons | -| HTTP | Hypertext Transfer Protocol | -| HTTPS | HTTP Secure | -| ID | Identifier | -| NCI | National Cancer Institute | -| TCGA | The Cancer Genome Atlas | -| TCP | Transmission Control Protocol | -| UUID | Universally Unique Identifier | diff --git a/docs/Data_Transfer_Tool/Users_Guide/Appendix_B_TroubleShooting.md b/docs/Data_Transfer_Tool/Users_Guide/Appendix_B_TroubleShooting.md new file mode 100644 index 000000000..fecff367c --- /dev/null +++ b/docs/Data_Transfer_Tool/Users_Guide/Appendix_B_TroubleShooting.md @@ -0,0 +1,61 @@ +# Troubleshooting Guide + +If you encounter issues when using the Data Transfer Tool for downloading files please reference the section below for helpful hints and recommendations. + +## Speed Performance During Download +The Data Transfer Tool has two performance tuning options that are presented during download operations. The two options are: + +* --n-processes - The "--n-processes" (short form "-n") option assists with assigning the number of threads to the download process.
The default is 4 and cannot be lowered below three threads. + +* --http-chunk-size - The "--http-chunk-size" setting can improve performance but we do not provide any hard settings due to the eclectic nature of client networks and their connections to the internet but instead encourage clients to experiment with changing the default setting of 1048576 bytes to larger size ranges. + +## Very Large Manifests +Some clients have needed to create very large manifest files to satisfy the scope of their work. Using very large manifest files can from time to time lead to the end user experiencing network timeouts or dropped connections due to network topologies, internet connections, or load on client-side systems. A network timeout or dropped network connection can manifest as a hung or unresponsive download session. To help mitigate these issues we recommend that clients break up their manifest files into smaller chunks. + +## General Tips +* To avoid running into older software bugs/conflicts we recommend you always use the latest version of the client whenever possible. +* When experiencing download problems while using an access token, try downloading a new token first before reporting it to the GDC Help Desk. + +## Logging +For troubleshooting purposes the GDC User Services Team may request that you run the command line application with the following flags { --debug --log-file }. These flags will run the application in debug mode and create a log file with the debug logs in it. +Example Usage: +```Debug-Logfile +gdc-client download -m lung.manifest.txt -t token.file --debug --log-file logfile.txt +``` + +## OS Compatibility with the Data Transfer Tool +The Data Transfer Tool is offered in three OS-compatible versions: Mac OS, Windows, and Ubuntu Linux. We have successfully tested the Ubuntu binary on CentOS 7.x and 8.x, RHEL 7/8 and Scientific Linux 7/8 with the client but have had problems with CentOS 6.x and RHEL 6 and SL6. 
To work around this problem we have asked users to build their own client from our [github](https://github.com/NCI-GDC/gdc-client) repository with the assistance of an instruction document that we provide on request via the GDC Helpdesk. + +## Network Troubleshooting + +Network problems can appear as dropped network connections or even a stalled application. The GDC Helpdesk might request more network information to assist in diagnosing the problem. The two tests they will request the end user to run are ping and traceroute (tracert on the windows platform) against our api servers. Please capture the output from these tests into a text file and attach it to the reply email. + +Examples: +```Ping +>ping api.gdc.cancer.gov + PING api.gdc.cancer.gov (192.170.230.246): 56 data bytes + 64 bytes from 192.170.230.246: icmp_seq=0 ttl=249 time=4.235 ms + 64 bytes from 192.170.230.246: icmp_seq=1 ttl=249 time=4.783 ms +``` +```Traceroute +>traceroute api.gdc.cancer.gov + traceroute to api.gdc.cancer.gov (192.170.230.246), 64 hops max, 52 byte packets + 1 h01-391-250-v1011.gw.uchicago.net (10.151.0.2) 4.595 ms 3.602 ms 3.322 ms + 2 h01-391-250-to-b65-ll129-300.p2p.uchicago.net (10.5.1.32) 13.285 ms 9.241 ms 5.156 ms + 3 b65-ll129-300-to-borderfw.p2p.uchicago.net (192.170.192.32) 3.218 ms 3.364 ms 3.396 ms + 4 borderfw-to-b65-ll129-500.p2p.uchicago.net (192.170.192.36) 3.605 ms 3.741 ms 3.833 ms + 5 b65-scidmz-01-to-b65-ll129-500.uchicago.net (128.135.247.182) 4.223 ms 5.428 ms 3.999 ms + 6 192.170.224.97 (192.170.224.97) 4.003 ms 3.970 ms 6.260 ms + 7 lnk-g30-scidist-01.scidmz.uchicago.net (192.170.224.66) 4.530 ms 4.649 ms 6.021 ms + 8 192.170.230.246 (192.170.230.246) 4.158 ms 4.273 ms 5.134 ms +``` + +## Common Error Codes +This is a list of the most common error codes the Data Transfer Tool generates and their meaning +
    +
  • Unable to connect to API - you might be running an out-of-date client, so consider upgrading.
  • +
  • Error: Max Retries Exceeded - network connection timeouts.
  • +
  • CryptographyDeprecationWarning - a warning that you should consider upgrading to a higher + version of Python - please upgrade to 2.7.x or higher.
  • +
  • ERROR: An unexpected error has occurred during normal operation of the client - This could be a variety of problems and we ask you to contact our helpdesk.
  • +
  • ECONNRESET - network connection dropped.
  • diff --git a/docs/Data_Transfer_Tool/Users_Guide/Data_Download_DTT_UI.md b/docs/Data_Transfer_Tool/Users_Guide/Data_Download_DTT_UI.md index 0a8b2cf93..05d807265 100644 --- a/docs/Data_Transfer_Tool/Users_Guide/Data_Download_DTT_UI.md +++ b/docs/Data_Transfer_Tool/Users_Guide/Data_Download_DTT_UI.md @@ -68,6 +68,7 @@ While the default download options will work for the majority of use cases, ther | Settings | Details |----------|---------| +| Server URL | Default: https://api.gdc.cancer.gov | | Number of Client Connections: Default (3) | Number of concurrent client threads | | Destination Folder: Default (User's Home Directory) | User selectable download file location | | Calculate Inbound Segment and check Md5sum on Restart: Default (On) | Verify previous partial downloaded files via segment check sum | diff --git a/docs/Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload.md b/docs/Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload.md index a57d21721..d3d7c7bf4 100644 --- a/docs/Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload.md +++ b/docs/Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload.md @@ -1,4 +1,6 @@ -#Data Downloads and Uploads from the command line. +#Data Transfer Tool Command Line Documentation + + ## Downloads @@ -26,6 +28,19 @@ The GDC Data Transfer Tool supports resumption of interrupted downloads. To resu gdc-client download f80ec672-d00f-42d5-b5ae-c7e06bc39da1 +### Download Latest Version of a File +The GDC Data Transfer Tool supports file versioning. Our backend data storage supports multiple file versions so older and current versions can be accessible to our users. For information about accessing file versioning information with our API and finding older UUID information from current UUIDs please check out the [the API User Guide](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#example-of-retrieving-file-version-information) section in our API documentation. 
When working with older manifests or older lists of UUIDs the latest version of a file can always be downloaded with the --latest flag. + +```Shell +gdc-client download 426de656-7e34-4a49-b87e-6e2563fa3cdd --latest -t gdc-user-token.2018.txt +``` +```Output +Downloading LATEST versions of files +Latest version for 426de656-7e34-4a49-b87e-6e2563fa3cdd ==> 6633bfbd-87f1-4d3a-a475-7ad1e8c2017a +100% [#############################################################################################################################] Time: 0:01:16 14.10 MB/s +Successfully downloaded: 1 +``` + ### Downloading Controlled-Access Data A user authentication token is required for downloading Controlled-Access Data from GDC. Tokens can be obtained from the GDC Data Portal (see instructions in [Obtaining an Authentication Token](Preparing_for_Data_Download_and_Upload.md#obtaining-an-authentication-token)). Once downloaded, the token *file* can be passed to the GDC Data Transfer Tool using the **-t** or **--token-file** option: @@ -75,11 +90,6 @@ Previously uploaded data can be replaced with new data by deleting it first usin gdc-client upload -m manifest.yml -t token --delete -## Recurrent Transfers of Very Large Datasets over High-speed Networks - -Institutions that regularly transfer very large volumes of data between GDC facilities (located in Chicago, IL, USA) and a geographically remote location over gigabit+ networks may benefit from using the UDT mode of the GDC Data Transfer Tool. **UDT mode** is an advanced feature that uses [UDT](http://udt.sourceforge.net/), or User Datagram Protocol (UDP)-based Data Transfer, instead of the ubiquitous [Transmission Control Protocol (TCP) protocol](https://tools.ietf.org/html/rfc793). Please contact the GDC Helpdesk if you are interested in learning more about this feature. 
- - ## Troubleshooting ### Invalid Token @@ -137,3 +147,211 @@ To resolve this issue, delete the file using the **--delete** switch before re-u Attempting to run gdc-client.exe by double-clicking it in the Windows Explorer will produce a window that blinks once and disappears. This is normal, the executable must be run using the command prompt. Click 'Start', followed by 'Run' and type 'cmd' into the text bar. Then navigate to the path containing the executable using the 'cd' command. + +## Help Menus + +The GDC Data Transfer Tool comes with built-in help menus. These menus are displayed when the GDC Data Transfer Tool is run with flags -h or --help for any of the main arguments to the tool. Running the GDC Data Transfer Tool without argument or flag will present a list of available command options. + + + +```Shell +gdc-client --help +``` +``` Output +usage: gdc-client [-h] [--version] {download,upload,settings} ... + +The Genomic Data Commons Command Line Client + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit + +commands: + {download,upload,settings} + for more information, specify -h after a command + download download data from the GDC + upload upload data to the GDC + settings display default settings +``` + + The available menus are provided below. + +### Root menu + +The GDC Data Transfer Tool displays the following output when executed without any arguments. + +```Shell +gdc-client +``` +```Output +usage: gdc-client [-h] [--version] {download,upload,settings} ... +gdc-client: error: too few arguments +``` + + +### Download help menu + +The GDC Data Transfer Tool displays the following help menu for its download functionality. 
+ +```Shell +gdc-client download --help +``` +```Output +usage: gdc-client download [-h] [--debug] + [--log-file LOG_FILE] + [--color_off] [-t TOKEN_FILE] + [-d DIR] [-s server] + [--no-segment-md5sums] + [--no-file-md5sum] + [-n N_PROCESSES] + [--http-chunk-size HTTP_CHUNK_SIZE] + [--save-interval SAVE_INTERVAL] + [--no-verify] + [--no-related-files] + [--no-annotations] + [--no-auto-retry] + [--retry-amount RETRY_AMOUNT] + [--wait-time WAIT_TIME] + [--latest] [--config FILE] [-u] + [-m MANIFEST] + [file_id [file_id ...]] + +positional arguments: +file_id The GDC UUID of the file(s) to download + +optional arguments: +-h, --help show this help message and exit +--debug Enable debug logging. If a failure occurs, the program + will stop. +--log-file LOG_FILE Save logs to file. Amount logged affected by --debug +--color_off Disable colored output +-t TOKEN_FILE, --token-file TOKEN_FILE + GDC API auth token file +-d DIR, --dir DIR Directory to download files to. Defaults to current + dir +-s server, --server server + The TCP server address server[:port] +--no-segment-md5sums Do not calculate inbound segment md5sums and/or do not + verify md5sums on restart +--no-file-md5sum Do not verify file md5sum after download +-n N_PROCESSES, --n-processes N_PROCESSES + Number of client connections. +--http-chunk-size HTTP_CHUNK_SIZE, -c HTTP_CHUNK_SIZE + Size in bytes of standard HTTP block size. +--save-interval SAVE_INTERVAL + The number of chunks after which to flush state file. + A lower save interval will result in more frequent + printout but lower performance. +--no-verify Perform insecure SSL connection and transfer +--no-related-files Do not download related files. +--no-annotations Do not download annotations. 
+--no-auto-retry Ask before retrying to download a file +--retry-amount RETRY_AMOUNT + Number of times to retry a download +--wait-time WAIT_TIME + Amount of seconds to wait before retrying +--latest Download latest version of a file if it exists +--config FILE Path to INI-type config file +-u, --udt Use the UDT protocol. +-m MANIFEST, --manifest MANIFEST + GDC download manifest file +``` + +### Upload help menu + +The GDC Data Transfer Tool displays the following help menu for its upload functionality. + + +```Shell +gdc-client upload --help +``` +```Output +usage: gdc-client upload [-h] [--debug] + [--log-file LOG_FILE] + [--color_off] [-t TOKEN_FILE] + [--project-id PROJECT_ID] + [--path path] + [--upload-id UPLOAD_ID] + [--insecure] [--server SERVER] + [--part-size PART_SIZE] + [--upload-part-size UPLOAD_PART_SIZE] + [-n N_PROCESSES] + [--disable-multipart] [--abort] + [--resume] [--delete] + [--manifest MANIFEST] + [--config FILE] + [file_id [file_id ...]] +positional arguments: + file_id The GDC UUID of the file(s) to upload + +optional arguments: + -h, --help show this help message and exit + --debug Enable debug logging. If a failure occurs, the program + will stop. + --log-file LOG_FILE Save logs to file. 
Amount logged affected by --debug + --color_off Disable colored output + -t TOKEN_FILE, --token-file TOKEN_FILE + GDC API auth token file + --project-id PROJECT_ID, -p PROJECT_ID + The project ID that owns the file + --path path, -f path directory path to find file + --upload-id UPLOAD_ID, -u UPLOAD_ID + Multipart upload id + --insecure, -k Allow connections to server without certs + --server SERVER, -s SERVER + GDC API server address + --part-size PART_SIZE + DEPRECATED in favor of [--upload-part-size] + --upload-part-size UPLOAD_PART_SIZE, -c UPLOAD_PART_SIZE + Part size for multipart upload + -n N_PROCESSES, --n-processes N_PROCESSES + Number of client connections + --disable-multipart Disable multipart upload + --abort Abort previous multipart upload + --resume, -r Resume previous multipart upload + --delete Delete an uploaded file + --manifest MANIFEST, -m MANIFEST + Manifest which describes files to be uploaded + --config FILE Path to INI-type config file +``` + +##Data Transfer Tool Configuration File +The DTT has the ability to save and reuse configuration parameters in the format of a flat text file via a command line argument. A simple text file needs to be created first with an extension of either txt or dtt. The supported section headers are upload and download which can be used independently of each other or used in the same configuration file. Each section header corresponds to the main functions of the application which are to either download data from the GDC portals or to upload data to the submission system of the GDC. The configurable parameters are those listed in the help menus under either [download](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload/#download-help-menu) or [upload](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload/#upload-help-menu) displayed under the output tabs. 
+ + +Example usage: + + gdc-client download d45ec02b-13c3-4afa-822d-443ccd3795ca --config my-dtt-config.dtt + +Example of configuration file: + + [upload] + path = /some/upload/path + upload_part_size = 1073741824 + + + [download] + dir = /some/download/path + http_chunk_size = 2048 + retry_amount = 6 + + +###Display Config Parameters +This command line flag can be used with either the download or upload application feature to display what settings are active within a custom data transfer tool configuration file. + + gdc-client settings download --config my-dtt-config.dtt + [download] + no_auto_retry = False + no_file_md5sum = False + save_interval = 1073741824 + http_chunk_size = 2048 + server = http://exmple-site.com + n_processes = 8 + no_annotations = False + no_related_files = False + retry_amount = 6 + no_segment_md5sum = False + manifest = [] + wait_time = 5.0 + no_verify = True + dir = /some/download/path diff --git a/docs/Data_Transfer_Tool/Users_Guide/Getting_Started.md b/docs/Data_Transfer_Tool/Users_Guide/Getting_Started.md index b44882796..e63cab6a7 100644 --- a/docs/Data_Transfer_Tool/Users_Guide/Getting_Started.md +++ b/docs/Data_Transfer_Tool/Users_Guide/Getting_Started.md @@ -10,16 +10,16 @@ The GDC Data Transfer Tool, a command-line driven application, provides an optim ### System Recommendations -The system recommendations for using the GDC Data Transfer Tool are as follows: +The system recommendations for using the GDC Data Transfer Tool are as follows: -* OS: Linux (Ubuntu 14.x or later), OS X (10.9 Mavericks or later), or Windows (7 or later) -* CPU: At least eight 64-bit cores, Intel or AMD +* OS: Linux (Ubuntu 16.x or later), OS X (10.9 Mavericks or later), or Windows (8 or later) +* CPU: At least two 64-bit cores, Intel or AMD * RAM: At least 8 GiB * Storage: Enterprise-class storage system capable of at least 1 Gb/s (gigabit per second) write throughput and sufficient free space for BAM files. 
### Binary Distributions -Binary distributions are available on the [GDC Transfer Tool page](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool). To install the GDC Data Transfer Tool, download the respective binary distribution and unzip the distribution's archive to a location on the target system. It is recommended that the binary be copied to a located that is in the user's path so that is it accessible from any location within the terminal or command prompt. +Binary distributions are available on the [GDC Transfer Tool page](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool). To install the GDC Data Transfer Tool, download the respective binary distribution and unzip the distribution's archive to a location on the target system. It is recommended that the binary be copied to a location that is in the user's path so that it is accessible from any location within the terminal or command prompt. ### Release Notes diff --git a/docs/Data_Transfer_Tool/Users_Guide/Preparing_for_Data_Download_and_Upload.md b/docs/Data_Transfer_Tool/Users_Guide/Preparing_for_Data_Download_and_Upload.md index 8ff3e9b77..b64bd91fa 100644 --- a/docs/Data_Transfer_Tool/Users_Guide/Preparing_for_Data_Download_and_Upload.md +++ b/docs/Data_Transfer_Tool/Users_Guide/Preparing_for_Data_Download_and_Upload.md @@ -47,7 +47,7 @@ Multiple data file uploads are supported by the GDC Data Transfer Tool via a man **NOTE:** To download a project's manifest file click on the _Download Manifest_ button located on the home page of the project, just below the four status charts. A manifest will be generated for the entire project or if previous files have already been upload only the files that remain to be uploaded. -A manifest for individual files can also be downloaded from the transaction tab and browse tab pages of the submission portal's project. 
More information on the process can be found under the Submission Portal's documentation section entitled [Step 4: GDC Data Transfer Tool](https://docs.gdc.cancer.gov/Data_Submission_Portal/Users_Guide/Upload_Data/#step-4-gdc-data-transfer-tool). +A manifest for individual files can also be downloaded from the transaction tab and browse tab pages of the submission portal's project. More information on the process can be found under the Submission Portal's documentation section entitled [Uploading the Submittable Data File to the GDC](https://docs.gdc.cancer.gov/Data_Submission_Portal/Users_Guide/Data_Submission_Walkthrough/#uploading-the-submittable-data-file-to-the-gdc). ### Obtaining UUIDs for Data Uploads A UUID can be used for data submission with the Data Transfer Tool. The UUID for submittable data uploads can be obtained from the Submission Portal or from the API GraphQL endpoint. In the Submission Portal the UUID for a data file can be found in the Manifest YAML file located in the _id:_ row located under the file size entry. diff --git a/docs/Data_Transfer_Tool/Users_Guide/images/DTT_Settings_Page.png b/docs/Data_Transfer_Tool/Users_Guide/images/DTT_Settings_Page.png index 7fd73242a..b0f83fcd8 100644 Binary files a/docs/Data_Transfer_Tool/Users_Guide/images/DTT_Settings_Page.png and b/docs/Data_Transfer_Tool/Users_Guide/images/DTT_Settings_Page.png differ diff --git a/docs/Encyclopedia/ReadyForApproval/DAVE.md b/docs/Encyclopedia/ReadyForApproval/DAVE.md new file mode 100644 index 000000000..33bc88c26 --- /dev/null +++ b/docs/Encyclopedia/ReadyForApproval/DAVE.md @@ -0,0 +1,16 @@ +# DAVE # + +## Description ## +The NCI Genomic Data Commons's DAVE (Data Analysis, Visualization, and Exploration) tools +are an open access interactive visualization application created to interact with the data stored in the GDC. Analysis can be performed in real time and online without downloading any of the data. 
+## Overview ## + + +### Tools ### +## References ## +1. [GDC DAVE](https://gdc.cancer.gov/dave-factsheet) + +## External Links ## +* N/A + +Categories: Workflow Type diff --git a/docs/Encyclopedia/ReadyForApproval/FASTQv3.md b/docs/Encyclopedia/ReadyForApproval/FASTQv3.md new file mode 100644 index 000000000..3b64ea4cd --- /dev/null +++ b/docs/Encyclopedia/ReadyForApproval/FASTQv3.md @@ -0,0 +1,19 @@ +# FASTQ # +## Description ## +FASTQ is a file format standard used to store text-based sequence and quality score information generated from high-throughput sequencing data acquisition systems. + +## Overview ## + +### Structure ### +The format of a FASTQ file consists of: +1) A line starting with "@" and containing the sequence identifier along with an optional description. +2) Lines consisting of raw sequence information. +3) A line starting with the "+" repeating the sequence ID or left blank. +4) Lines containing the quality scores information. + +## References ## +1. [FASTQ Format](https://en.wikipedia.org/wiki/FASTQ_format) + +## External Links ## +* +Categories: Data Format diff --git a/docs/Encyclopedia/ReadyForApproval/Mutation_Annotation_Format_TCGAv2.md b/docs/Encyclopedia/ReadyForApproval/Mutation_Annotation_Format_TCGAv2.md new file mode 100644 index 000000000..7e20fc7f5 --- /dev/null +++ b/docs/Encyclopedia/ReadyForApproval/Mutation_Annotation_Format_TCGAv2.md @@ -0,0 +1,373 @@ +Mutation Annotation Format (MAF) - Legacy TCGA Specification +============================================== + + +*This definition was taken from the previously public wiki hosted by TCGA and reflects the MAF format +that was available during the active period of the TCGA project.* + + + + +**Document Information** + +The spec has been reverted to the June 26th version (version 20). Additional +changes are the removal of the "under construction" banner, changing all text to +black, and fixing a typo in the link to the MAF 2.2 specification. 
+ +**Specification for Mutation Annotation Format** +Version 2.4.1 +June 20, 2014 + +**Contents** + +- 1 Current version changes + +- 2 About MAF specifications + + - 2.1 Definition of open access MAF + data + + - 2.2 Somatic MAF vs. Protected + MAF + +- 3 MAF file fields + + - 3.1 Table 1 - File column + headers + +- 4 MAF file checks + +- 5 MAF naming convention + +- 6 Previous specification + versions + +Current version changes +======================= + +This current revision is **version 2.4.1** of the Mutation Annotation Format +(MAF) specification. + +The following items in the specification were added or modified in version 2.4.1 +from version 2.4: + +- Header for MAF file is "\#version 2.4.1" + +- "Somatic" and "None" are the only acceptable values for "Mutation_Status" + for a somatic.MAF (named .somatic.maf). When Mutation_Status is None, + Validation_Status must be Invalid. + +- Centers need to make sure that Mutations_Status "None" doesn't include + germline mutation. + +- For a somatic MAF, following rules should be satisfied: + SOMATIC = (A AND (B OR C OR D)) OR (E AND F) + A: *Mutation_Status* == "Somatic" + B: *Validation_Status* == "Valid" + C. *Verification_Status* == "Verified" + D. *Variant_Classification* is not {Intron, 5'UTR, 3'UTR, 5'Flank, 3'Flank, + IGR}, which implies that *Variant_Classification* can only be + \\{Frame_Shift_Del, Frame_Shift_Ins, In_Frame_Del, In_Frame_Ins, + Missense_Mutation, Nonsense_Mutation, Silent, Splice_Site, + Translation_Start_Site, Nonstop_Mutation, RNA, Targeted_Region}. + E: *Mutations_status == "None"* + F: *Validation_status == "Invalid"* + +- Extra validation rules: If Validation_Status == Valid or Invalid, then + Validation_Method != none (case insensitive). + +About MAF specifications +======================== + +Mutation annotation files should be transferred to the DCC. Those files should +be formatted using the mutation annotation format (MAF) that is described below. 
+File naming convention is also +[below](#MutationAnnotationFormat(MAF)Specificat). + +Following categories of somatic mutations are reported in MAF files: + +- Missense and nonsense + +- Splice site, defined as SNP within 2 bp of the splice junction + +- Silent mutations + +- Indels that overlap the coding region or splice site of a gene or the + targeted region of a genetic element of interest. + +- Frameshift mutations + +- Mutations in regulatory regions + +### Definition of open access MAF data + +A large proportion of MAFs are submitted as discovery data and sites labeled as +somatic in these files overlap with known germline variants. In order to +minimize germline contamination in putative (unvalidated) somatic calls, certain +filtering criteria have been imposed. Based on current policy, open access MAF +data should: + +- **include** all validated somatic mutation calls + +- **include** all unvalidated somatic mutation calls that overlap with a + coding region or splice site + +- **exclude** all other types of mutation calls (i.e., non-somatic calls + (validated or not), unvalidated somatic calls that are not in coding region + or splice sites, and dbSNP sites that are not annotated as somatic in dbSNP, + COSMIC or OMIM) + + + +### Somatic MAF vs. Protected MAF + +Centers will submit to the DCC MAF archives that contain Somatic MAF +(named**.somatic.maf**) for open access data and an all-inclusive Protected MAF +(named**.protected.maf**) that does not filter any data out and represents the +original super-set of mutation calls. The files will be formatted using the +Mutation Annotation Format (MAF). + +The following table lists some of the critical attributes of somatic and +protected MAF files and provides a comparison. + +| Attribute | Somatic MAF | Protected MAF | +| ----------- | ----------- | ------------- | +| **File naming** | Somatic MAFs should be named as**\*.somatic.maf**and cannot contain 'germ' or 'protected' in file name. 
| Protected MAFs should be named as **\*.protected.maf** and should not contain 'somatic' in the file name. | +| **Mutation category** | Somatic MAFs can only contain entries where *Mutation_Status* is "Somatic". If any other value is assigned to the field, the archive will fail. Experimentally validated or unvalidated (see next row) somatic mutations can be included in the file. | There is no such restriction for protected MAF. The file should contain all mutation calls including those from which .somatic.maf is derived. | +| **Filtering criteria** | In order to minimize germline contamination, somatic MAFs can contain unvalidated somatic mutations only from coding regions and splice sites, which implies: | There are no such constraints for mutations in protected MAF. | +| | If *Validation_Status* **is** "Unknown", *Variant_Classification* **cannot** be 3'UTR, 3'Flank, 5'UTR, 5'Flank, IGR, or Intron. *Variant_Classification* can only be \\{Frame_Shift_Del, Frame_Shift_Ins, In_Frame_Del, In_Frame_Ins, Missense_Mutation, Nonsense_Mutation, Silent, Splice_Site, Translation_Start_Site, Nonstop_Mutation, RNA, Targeted_Region, De_novo_Start_InFrame, De_novo_Start_OutOfFrame\\}. | | +| | There is no such constraint for experimentally validated (*Validation_Status* is "Valid") somatic mutations. | | +| | | | +| | dbSNP sites that are not annotated as somatic in dbSNP, COSMIC or OMIM must be removed from somatic MAFs. | | +| **Access level** | These files are deployed as open access data. | These files are deployed as protected data. | + +MAF file fields +=============== + +The format of a MAF file is tab-delimited columns. Those columns are described +in Table 1 and are required in every MAF file. The order of the columns will be +validated by the DCC. Column headers and values **are** case sensitive where +specified. Columns may allow null values (i.e., blank cells) and/or have +enumerated values. 
**The validator looks for a header stating the version of the +specification to validate against (e.g. \#version 2.4). If the header is missing, validation +fails.** Any columns that come after the columns described in Table 1 are +optional. Optional columns are not validated by the DCC and can be in any order. + + + +Table 1 - File column headers +----------------------------- + + + +| **Index** | **MAF Column Header** | **Description of Values** | **Example** | **Case Sensitive** | **Null** | **Enumerated** | +| --------- | --------------------- | ------------------------- | ----------- | ------------------ | -------- | -------------- | +| 1 | Hugo_Symbol | HUGO symbol for the gene (HUGO symbols are *always* in all caps). If no gene exists within 3kb enter "Unknown". | EGFR | Yes | No | Set or Unknown | | | | | | | | | | +| | | Source: | | | | | | | | | | | | | | +| 2 | Entrez_Gene_Id | Entrez gene ID (an integer). If no gene exists within 3kb enter "0". | 1956 | No | No | Set | | | | | | | | | | +| | | Source: | | | | | | | | | | | | | | +| 3 | Center | Genome sequencing center reporting the variant. If multiple institutions report the same mutation separate list using semicolons. Non-GSC centers will be also supported if center name is an accepted center name. | hgsc.bcm.edu;genome.wustl.edu | Yes | No | Set | | | | | | | | | | +| 4 | NCBI_Build | Any TCGA accepted genome identifier. Can be string, integer or a float. | hg18, hg19, GRCh37, GRCh37-lite, 36, 36.1, 37, | No | No | Set and Enumerated. | | | | | | | | | | +| 5 | Chromosome | Chromosome number without "chr" prefix that contains the gene. | X, Y, M, 1, 2, etc. | Yes | No | Set | | | | | | | | | | +| 6 | Start_Position | Lowest numeric position of the reported variant on the genomic reference sequence. Mutation start coordinate (1-based coordinate system). | 999 | No | No | Set | | | | | | | | | | +| 7 | End_Position | Highest numeric genomic position of the reported variant on the genomic reference sequence. 
Mutation end coordinate (inclusive, 1-based coordinate system). | 1000 | No | No | Set | | | | | | | | | | +| 8 | Strand | Genomic strand of the reported allele. Variants should always be reported on the positive genomic strand. (Currently, only the positive strand is an accepted value). | \+ | No | No | \+ | | | | | | | | | | +| 9 | Variant_Classification | Translational effect of variant allele. | Missense_Mutation | Yes | No | Frame_Shift_Del, Frame_Shift_Ins, In_Frame_Del, In_Frame_Ins, Missense_Mutation, Nonsense_Mutation, Silent, Splice_Site, Translation_Start_Site, Nonstop_Mutation, 3'UTR, 3'Flank, 5'UTR, 5'Flank, IGR *(See Notes Section #1)* , Intron, RNA, Targeted_Region | | | | | | | | | | +| 10 | Variant_Type | Type of mutation. TNP (tri-nucleotide polymorphism) is analogous to DNP but for 3 consecutive nucleotides. ONP (oligo-nucleotide polymorphism) is analogous to TNP but for consecutive runs of 4 or more. | INS | Yes | No | SNP, DNP, TNP, ONP, INS, DEL, or Consolidated *(See Notes Section #2)* ) | | | | | | | | | | +| 11 | Reference_Allele | The plus strand reference allele at this position. Include the sequence deleted for a deletion, or "-" for an insertion. | A | Yes | No | A,C,G,T and/or - | | | | | | | | | | +| 12 | Tumor_Seq_Allele1 | Primary data genotype. Tumor sequencing (discovery) allele 1. " -" for a deletion represent a variant. "-" for an insertion represents wild-type allele. Novel inserted sequence for insertion should not include flanking reference bases. | C | Yes | No | A,C,G,T and/or - | | | | | | | | | | +| 13 | Tumor_Seq_Allele2 | Primary data genotype. Tumor sequencing (discovery) allele 2. " -" for a deletion represents a variant. "-" for an insertion represents wild-type allele. Novel inserted sequence for insertion should not include flanking reference bases. | G | Yes | No | A,C,G,T and/or - | | | | | | | | | | +| 14 | dbSNP_RS | Latest dbSNP rs ID (dbSNP_ID) or "novel" if there is no dbSNP record. 
source: | rs12345 | Yes | Yes | Set or "novel" | | | | | | | | | | +| 15 | dbSNP_Val_Status | dbSNP validation status. Semicolon- separated list of validation statuses. | by2Hit2Allele;byCluster | No | Yes | by1000genomes;by2Hit2Allele; byCluster; byFrequency; byHapMap; byOtherPop; bySubmitter; alternate_allele *(See Notes Section #3)* **Note that "none" will no longer be an acceptable value.** | | | | | | | | | | +| 16 | Tumor_Sample_Barcode | BCR aliquot barcode for the tumor sample including the two additional fields indicating plate and well position. i.e. TCGA-SiteID-PatientID-SampleID-PortionID-PlateID-CenterID. The full TCGA Aliquot ID. | TCGA-02-0021-01A-01D-0002-04 | Yes | No | Set | | | | | | | | | | +| 17 | Matched_Norm_Sample_Barcode | BCR aliquot barcode for the matched normal sample including the two additional fields indicating plate and well position. i.e. TCGA-SiteID-PatientID-SampleID-PortionID-PlateID-CenterID. The full TCGA Aliquot ID; e.g. TCGA-02-0021-10A-01D-0002-04 (compare portion ID '10A' normal sample, to '01A' tumor sample). | TCGA-02-0021-10A-01D-0002-04 | Yes | No | Set | | | | | | | | | | +| 18 | Match_Norm_Seq_Allele1 | Primary data. Matched normal sequencing allele 1. "-" for deletions; novel inserted sequence for INS not including flanking reference bases. | T | Yes | Yes | A,C,G,T and/or - | | | | | | | | | | +| 19 | Match_Norm_Seq_Allele2 | Primary data. Matched normal sequencing allele 2. "-" for deletions; novel inserted sequence for INS not including flanking reference bases. | ACGT | Yes | Yes | A,C,G,T and/or - | | | | | | | | | | +| 20 | Tumor_Validation_Allele1 | Secondary data from orthogonal technology. Tumor genotyping (validation) for allele 1. "-" for deletions; novel inserted sequence for INS not including flanking reference bases. | \- | Yes | Yes | A,C,G,T and/or - | | | | | | | | | | +| 21 | Tumor_Validation_Allele2 | Secondary data from orthogonal technology. Tumor genotyping (validation) for allele 2. 
"-" for deletions; novel inserted sequence for INS not including flanking reference bases. | A | Yes | Yes | A,C,G,T and/or - | | | | | | | | | | +| 22 | Match_Norm_Validation_Allele1 | Secondary data from orthogonal technology. Matched normal genotyping (validation) for allele 1. "-" for deletions; novel inserted sequence for INS not including flanking reference bases. | C | Yes | Yes | A,C,G,T and/or - | | | | | | | | | | +| 23 | Match_Norm_Validation_Allele2 | Secondary data from orthogonal technology. Matched normal genotyping (validation) for allele 2. "-" for deletions; novel inserted sequence for INS not including flanking reference bases. | G | Yes | Yes | A,C,G,T and/or - | | | | | | | | | | +| 24 | Verification_Status *(See Notes Section #4)* | Second pass results from independent attempt using same methods as primary data source. Generally reserved for 3730 Sanger Sequencing. | Verified | Yes | Yes | Verified, Unknown | | | | | | | | | | +| 25 | Validation_Status *(See Notes Section #5)* | Second pass results from orthogonal technology. | Valid | Yes | No | Untested, Inconclusive, Valid, Invaild | | | | | | | | | | +| 26 | Mutation_Status | Updated to reflect validation or verification status and to be in agreement with the [VCF VLS](https://wiki.nci.nih.gov/x/2gcYAw) field. The values allowed in this field are constrained by the value in the Validation_Status field. | Somatic | Yes | No | **Validation_Status values:** Untested, Inconslusive, Valid, Invalid - **Allowed Mutations_Status Values for Untested and Inconclusive:** *(See Notes Seciton #6)* None, Germline, Somatic, LOH, Post-transcriptional modification **Unknown Allowed Mutation_status Values for Valid:** *(See Notes Seciton #6)* Germline, Somatic, LOH, Post-transcriptional modification, Unknown - **Allowed Mutations_Status Values for Invalid:** *(See Notes Seciton #6)* none | | | | | | | | | | + | | | | | | | | | | | | | | | +| 27 | Sequencing_Phase | TCGA sequencing phase. 
Phase should change under any circumstance that the targets under consideration change. | Phase_I | No | Yes | No | | | | | | | | | | +| 28 | Sequence_Source | Molecular assay type used to produce the analytes used for sequencing. Allowed values are a subset of the [SRA 1.5](http://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/sra/doc/SRA_1-5/) library_strategy field values. This subset matches those used at CGHub. | WGS;WXS | Yes | No | **Common TCGA values:** WGS, WGA, WXS, RNA-Seq, miRNA-Seq, Bisulfite-Seq, VALIDATION, Other **Other allowed values (per SRA 1.5)** ncRNA-Seq, WCS, CLONE, POOLCLONE, AMPLICON, CLONEEND, FINISHING, ChIP-Seq, MNase-Seq, DNase-Hypersensitivity, EST, FL-cDNA, CTS, MRE-Seq, MeDIP-Seq, MBD-Seq, Tn-Seq, FAIRE-seq, SELEX, RIP-Seq, ChIA-PET + | | | | | | | | | | +| 29 | Validation_Method | The assay platforms used for the validation call. Examples: Sanger_PCR_WGA, Sanger_PCR_gDNA, 454_PCR_WGA, 454_PCR_gDNA; separate multiple entries using semicolons. | Sanger_PCR_WGA;Sanger_PCR_gDNA | No | **NO**. I**f Validation_Status = Untested then "none"** If Validation_Status = Valid or Invalid, then not "none" (case insensitive) | No | | | | | | | | | | +| 30 | Score | Not in use. | NA | No | Yes | No | | | | | | | | | | +| 31 | BAM_File | Not in use. | NA | No | Yes | No | | | | | | | | | | +| 32 | Sequencer | Instrument used to produce primary data. Separate multiple entries using semicolons. | Illumina GAIIx;SOLID | Yes | No | Illumina GAIIx, Illumina HiSeq, SOLID, 454, ABI 3730xl, Ion Torrent PGM, Ion Torrent Proton, PacBio RS, Illumina MiSeq, Illumina HiSeq 2500, 454 GS FLX Titanium, AB SOLiD 4 System | | | | | | | | | | +| 33 | Tumor_Sample_UUID | BCR aliquot UUID for tumor sample | 550e8400-e29b-41d4-a716-446655440000 | Yes | No | | | | | | | | | | | +| 34 | Matched_Norm_Sample_UUID | BCR aliquot UUID for matched normal | 567e8487-e29b-32d4-a716-446655443246 | Yes | No | + +**Notes**
    +*1 Intergenic Region.*
    +*2 Consolidated is used to indicate a site that was initially reported as a variant but subsequently removed from further analysis because it was consolidated into a new variant. For example, a SNP variant incorporated into a TNP variant.*
    +*3 Used when the discovered variant differs from that of dbSNP.*
    +*4 These MAF headers describe the technology that was used to confirm a mutation, whether the same technology ("verification") or a different technology ("validation") is used to prove that a variant is germline or a somatic mutation.*
    +*5 These MAF headers describe the technology that was used to confirm a mutation, whether the same technology (verification) or a different technology (validation) is used to prove that a variant is germline or a somatic mutation.*
    +*6 Explanation of some Validation Status-Mutation Status combinations.*
    + +| Validation Status | Mutation Status | Explanation | +| ------------------ | --------------- | ----------- | +| Valid | Unknown | a valid variant with unknown somatic status due to lack of data from matched normal tissue. | +| Invalid | None | validation attempted, tumor and normal are homozygous reference (formerly described as Wildtype) | +| Inconclusive | Unknown | validation failed, neither the genotype nor its somatic status is certain due to lack of data from matched normal tissue | +| Inconclusive | None | validation failed, tumor genotype appears to be homozygous reference | + + Important Criteria + + **Index column indicates the order in which the columns are expected**. **All + headers are case sensitive.** The Case Sensitive column specifies which values + are case sensitive. The Null column indicates which MAF columns are allowed to + have null values. The Enumerated column indicates which MAF columns have + specified values: an Enumerated value of "No" indicates that there are no + specified values for that column; other values indicate the specific values + listed allowed; a value of "Set" indicates that the MAF column values come from + a specified set of known values (*e.g.*HUGO gene symbols). + + +MAF file checks +=============== + +The DCC Archive Validator checks the integrity of a MAF file. Validation will +fail if any of the below are not true for a MAF file: + +1. Column header text (including case) and order must match specification + (Table 1) exactly + +2. Values under column headers listed in the specification (Table 1) as not + null must have values + +3. Values that are specified in Table 1 as Case Sensitive must be. + +4. If column headers are listed in the specification as having *enumerated* + values (*i.e.* a "Yes" in the "Enumerated" column), then the values under + those column must come from the enumerated values listed under "Enumerated". + +5. 
If column headers are listed in the specification as having *set* values + (*i.e.* a "Set" in the "Enumerated" column), then the values under those + column must come from the enumerated values of that domain (*e.g.* HUGO gene + symbols). + +6. All Allele-based columns must contain- (deletion), or a string composed of + the following capitalized letters: A, T, G, C. + +7. IfValidation_Status== "Untested" + thenTumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2can + be null (depending onValidation_Status). + + 1. IfValidation_Status== "Inconclusive" + thenTumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2can + be null (depending onValidation_Status)**.** + +8. If Validation_Status == Valid, then Validated_Tumor_Allele1 and + Validated_Tumor_Allele2must be populated (one of A, C, G, T, and -) + + 1. If Validation_Status == "Valid" then Tumor_Validation_Allele1, + Tumor_Validation_Allele2, Match_Norm_Validation_Allele1, + Match_Norm_Validation_Allele2 cannot be null + + 2. IfValidation_Status== "Invalid" + thenTumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2cannot + be null AND Tumor_Validation_Allelle1 == + Match_Norm_Validation_Allele1AND Tumor_Validation_Allelle2 == + Match_Norm_Validation_Allele2 (Added as a replacement for 8a as a + result of breakdown) + +9. Check allele values against Mutation_Status: + Check allele values against Validation_status: + + 1. If Mutation_Status == "Germline" and Validation_Status == "Valid", then + Tumor_Validation_Allele1 == Match_Norm_Validation_Allele1 and + Tumor_Validation_Allele2 == Match_Norm_Validation_Allele2. + + 2. 
If Mutation_Status == "Somatic" and Validation_Status == "Valid", then + Match_Norm_Validation_Allele1 == Match_Norm_Validation_Allele2 == + Reference_Allele and (Tumor_Validation_Allele1 or + Tumor_Validation_Allele2) != Reference_Allele + + 3. If Mutation_Status == "LOH" and Validation_Status=="Valid", then + Tumor_Validation_Allele1 == Tumor_Validation_Allele2 and + Match_Norm_Validation_Allele1 != Match_Norm_Validation_Allele2 and + Tumor_Validation_Allele1 == (Match_Norm_Validation_Allele1 or + Match_Norm_Validation_Allele2). + +10. Check that Start_position \<= End_position + +11. Check for the Start_position and End_position against Variant_Type: + + 1. If Variant_Type == "INS", then (End_position - Start_position + 1 == + length (Reference_Allele) or End_position - Start_position == 1) and + length(Reference_Allele) \<= length(Tumor_Seq_Allele1 and + Tumor_Seq_Allele2) + + 2. If Variant_Type == "DEL", then End_position - Start_position + 1 == + length (Reference_Allele), then length(Reference_Allele) \>= + length(Tumor_Seq_Allele1 and Tumor_Seq_Allele2) + + 3. If Variant_Type == "SNP", then length(Reference_Allele and + Tumor_Seq_Allele1 and Tumor_Seq_Allele2) == 1 and (Reference_Allele and + Tumor_Seq_Allele1 and Tumor_Seq_Allele2) != "-" + + 4. If Variant_Type == "DNP", then length(Reference_Allele and + Tumor_Seq_Allele1 and Tumor_Seq_Allele2) == 2 and (Reference_Allele and + Tumor_Seq_Allele1 and Tumor_Seq_Allele2) !contain "-" + + 5. If Variant_Type == "TNP", then length(Reference_Allele and + Tumor_Seq_Allele1 and Tumor_Seq_Allele2) == 3 and (Reference_Allele and + Tumor_Seq_Allele1 and Tumor_Seq_Allele2) !contain "-" + + 6. If Variant_Type == "ONP", then length(Reference_Allele) == + length(Tumor_Seq_Allele1) == length(Tumor_Seq_Allele2) \> 3 and + (Reference_Allele and Tumor_Seq_Allele1 and Tumor_Seq_Allele2) !contain + "-" + +12. Validation for UUID-based files: + + 1. 
Column \#33 must be Tumor_Sample_UUID containing UUID of the BCR aliquot + for tumor sample + + 2. Column \#34 must be Matched_Norm_Sample_UUID containing UUID of the BCR + aliquot for matched normal sample + + 3. Metadata represented by Tumor_Sample_Barcode and + Matched_Norm_Sample_Barcode should correspond to the UUIDs assigned to + Tumor_Sample_UUID and Matched_Norm_Sample_UUID respectively + +13. If Validation_Status == "Valid" or "Invalid", then Validation_Method != + "none" (case insensitive) . + +MAF naming convention +===================== + +In archives uploaded to the DCC, the MAF file name should relate to the +containing archive name in the following way: + +If the archive has the name + + \_\.\.Level_2.\.\.0.tar.gz + +then a somatic MAF file with the archive should be named according to + + \_\.\.Level_2.\[.\].somatic.maf + +and a protected MAF with the archive should be named according to + + \_\.\.Level_2.\[.\].protected.maf + +The \ may consist of alphanumeric characters, dash, and +underscore; no spaces or periods; or it may be left out altogether. The purpose +of the optional tag is to impart some brief annotation. + +*Example* + +For the archive + + genome.wustl.edu_OV.IlluminaGA_DNASeq.Level_2.7.6.0.tar.gz + +the following are examples of valid maf names + + genome.wustl.edu_OV.IlluminaGA_DNASeq.Level_2.7.somatic.maf + genome.wustl.edu_OV.IlluminaGA_DNASeq.Level_2.7.protected.maf diff --git a/docs/Encyclopedia/ReadyForApproval/Portion.md b/docs/Encyclopedia/ReadyForApproval/Portion.md new file mode 100644 index 000000000..e2193c608 --- /dev/null +++ b/docs/Encyclopedia/ReadyForApproval/Portion.md @@ -0,0 +1,15 @@ +# Portion # +## Description ## +An portion is a physical piece of any sample. +## Overview ## +A portion is typically one of several sequential 100-120 mg sections of a vial. 
The [GDC Data Model](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components) +relates portions to samples and/or analytes but is not a required biospecimen entity. + +## References ## +1. [GDC Data Dictionary - Portion](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=portion) +1. [TCGA Encyclopedia - Portion](https://wiki.nci.nih.gov/display/TCGA/Portion) + +## External Links ## +* N/A + +Categories: General, Biospecimen diff --git a/docs/Encyclopedia/Under_Development/DAVE.md b/docs/Encyclopedia/Under_Development/DAVE.md new file mode 100644 index 000000000..33bc88c26 --- /dev/null +++ b/docs/Encyclopedia/Under_Development/DAVE.md @@ -0,0 +1,16 @@ +# DAVE # + +## Description ## +The NCI Genomic Data Commons's DAVE (Data Analysis, Visualization, and Exploration) tools +are an open access interactive visualization application created to interact with the data stored in the GDC. Analysis can be performed in real time and online without downloading any of the data. +## Overview ## + + +### Tools ### +## References ## +1. [GDC DAVE](https://gdc.cancer.gov/dave-factsheet) + +## External Links ## +* N/A + +Categories: Workflow Type diff --git a/docs/Encyclopedia/Under_Development/FASTQ.md b/docs/Encyclopedia/Under_Development/FASTQ.md deleted file mode 100644 index b6d0f3ee8..000000000 --- a/docs/Encyclopedia/Under_Development/FASTQ.md +++ /dev/null @@ -1,13 +0,0 @@ -# FASTQ # -## Description ## -## Overview ## -### Structure ### -#### Header (Optional) #### -#### Body (Optional) #### -## References ## -1.
- -## External Links ## -* TBD - -Categories: Data Format diff --git a/docs/Encyclopedia/Under_Development/Node.md b/docs/Encyclopedia/Under_Development/Node.md new file mode 100644 index 000000000..2e3c23c54 --- /dev/null +++ b/docs/Encyclopedia/Under_Development/Node.md @@ -0,0 +1,3 @@ +# Node # +## Description ## +Please see [Entity](https://docs.gdc.cancer.gov/Encyclopedia/pages/Entity/) diff --git a/docs/Encyclopedia/Under_Development/Portion.md b/docs/Encyclopedia/Under_Development/Portion.md index 090f7509a..83f0ba7b7 100644 --- a/docs/Encyclopedia/Under_Development/Portion.md +++ b/docs/Encyclopedia/Under_Development/Portion.md @@ -2,7 +2,7 @@ ## Description ## An portion is a physical sub-part of any sample.. ## Overview ## -A portion is typically one of several sequential 100-120 mg sections of a vial. The [GDC Data Model](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components) +A portion is typically one of several sequential 100-120 mg sections of a vial. The [GDC Data Model](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components) relates portions to samples and/or analytes but is not a required biospecimen entity. ## References ## diff --git a/docs/Encyclopedia/ReadyForApproval/Aligned_Reads.md b/docs/Encyclopedia/pages/Aligned_Reads.md similarity index 100% rename from docs/Encyclopedia/ReadyForApproval/Aligned_Reads.md rename to docs/Encyclopedia/pages/Aligned_Reads.md diff --git a/docs/Encyclopedia/pages/Analyte.md b/docs/Encyclopedia/pages/Analyte.md new file mode 100644 index 000000000..832240b01 --- /dev/null +++ b/docs/Encyclopedia/pages/Analyte.md @@ -0,0 +1,15 @@ +# Analyte # +## Description ## +An analyte is any substance or sample being analyzed. + +## Overview ## +An analyte is the specimen extracted for analysis from a portion or sample using a specific extraction protocol.
+The [GDC Data Model](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components) relates analytes to aliquots or portions or samples but is not a required biospecimen entity. + +## References ## +1. [GDC Data Dictionary - Analyte](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=analyte) + +## External Links ## +# [Analyte Wikipedia](https://en.wikipedia.org/wiki/Analyte) + +Categories: General, Biospecimen diff --git a/docs/Encyclopedia/pages/Annotations.md b/docs/Encyclopedia/pages/Annotations.md new file mode 100644 index 000000000..13b956954 --- /dev/null +++ b/docs/Encyclopedia/pages/Annotations.md @@ -0,0 +1,27 @@ +Annotations +=========================== + +Annotations contain important information about files, cases, or metadata nodes that may be of use to data downloaders when analyzing GDC data. They should be reviewed prior to running an analysis. An annotation may include key comments about why particular patients, samples, or files are absent from the GDC or why they may exhibit critical differences from others. Annotations include information that cannot be submitted to the GDC through other existing nodes or properties. + +Annotations are automatically downloaded in TSV format with impacted files when using the Data Transfer Tool. They may also be searched via the [API](/API/Users_Guide/Search_and_Retrieval/#annotations-endpoint) or on the annotations page of the [GDC Data Portal](https://portal.gdc.cancer.gov/annotations). Instructions on accessing annotations in the GDC Data Portal are found in the [GDC Data Portal User Guide](/Data_Portal/Users_Guide/Repository/#annotations-view). + +For information on Annotation structure and content please review the [GDC Data Dictionary](/Data_Dictionary/viewer/#?view=table-definition-view&id=annotation) + +For information about TCGA conventions for annotations please see the [TCGA Introduction to Annotations](Annotations_TCGA/). 
+ +If a submitter would like to create an annotation, please contact the GDC Support Team (support@nci-gdc.datacommons.io). + + + +## References ## +1. [API User Guide](/API/Users_Guide/Search_and_Retrieval/#annotations-endpoint) +2. [GDC Data Portal](https://portal.gdc.cancer.gov/annotations) +3. [GDC Data Portal User Guide](/Data_Portal/Users_Guide/Repository/#annotations-view) +4. [GDC Data Dictionary](/Data_Dictionary/viewer/#?view=table-definition-view&id=annotation) +5. [TCGA Annotations](Annotations_TCGA/) + + +## External Links ## + + +Categories: Data Type diff --git a/docs/Encyclopedia/pages/Introduction+to+Annotations.md b/docs/Encyclopedia/pages/Annotations_TCGA.md similarity index 96% rename from docs/Encyclopedia/pages/Introduction+to+Annotations.md rename to docs/Encyclopedia/pages/Annotations_TCGA.md index 0c8379427..81c5b8a00 100644 --- a/docs/Encyclopedia/pages/Introduction+to+Annotations.md +++ b/docs/Encyclopedia/pages/Annotations_TCGA.md @@ -1,7 +1,8 @@ Introduction to Annotations =========================== -This document is retained here for reference purposes and should not be considered the current standard. -Document was adapted from https://wiki.nci.nih.gov/pages/viewpage.action?spaceKey=TCGA&title=Introduction+to+Annotations +This document is retained for reference purposes for TCGA and should not be considered the current GDC standard. For information on the existing GDC use of annotations please see the [Annotations Encyclopedia entry](/Encyclopedia/pages/Annotations/). + +This document was adapted from https://wiki.nci.nih.gov/pages/viewpage.action?spaceKey=TCGA&title=Introduction+to+Annotations This section includes the following topics. diff --git a/docs/Encyclopedia/pages/Case.md b/docs/Encyclopedia/pages/Case.md index a60cd6594..e84190a00 100644 --- a/docs/Encyclopedia/pages/Case.md +++ b/docs/Encyclopedia/pages/Case.md @@ -14,9 +14,7 @@ Cases are also the basic unit of registration for the GDC Submission Portal. 
All ## References ## 1. [GDC Data Dictionary - Case](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=case) 2. [GDC Data Model](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components) -3. [GDC Portal Case Detail Page](https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Cases_and_Files/#case-detail-page) +3. [GDC Portal Case Detail Page](https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Repository/#cases-list) -## External Links ## -* N/A Categories: General diff --git a/docs/Encyclopedia/pages/Controlled_Access.md b/docs/Encyclopedia/pages/Controlled_Access.md index 4dc6749ca..906de767d 100644 --- a/docs/Encyclopedia/pages/Controlled_Access.md +++ b/docs/Encyclopedia/pages/Controlled_Access.md @@ -10,7 +10,6 @@ Genomic data access is governed by the NIH's Genomic Data Sharing Policy. ## References ## 1. [GDC Granting Access to Controlled Data](https://gdc.cancer.gov/access-data/obtaining-access-controlled-data) -2. [Registering and Working with eRA Commons and dbGaP](https://gdc.cancer.gov/access-data/obtaining-access-controlled-data/registering-and-working-era-commons-and-dbgap) ## External Links ## * [NIH Genomic Data Sharing Policy](https://gds.nih.gov/03policy2.html) diff --git a/docs/Encyclopedia/pages/Data_Submitter.md b/docs/Encyclopedia/pages/Data_Submitter.md index 6c4740851..0552b3bf1 100644 --- a/docs/Encyclopedia/pages/Data_Submitter.md +++ b/docs/Encyclopedia/pages/Data_Submitter.md @@ -23,7 +23,7 @@ After all project data is uploaded and reviewed, the data submitter can release 3. [GDC Data Model](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components) 4. [GDC Data Dictionary](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/) 5. [API Submission](https://docs.gdc.cancer.gov/API/Users_Guide/Submission/) -6. [Data Submission Portal Documentation](https://docs.gdc.cancer.gov/Data_Submission_Portal/Users_Guide/Getting_Started/) +6. 
[Data Submission Portal Documentation](https://docs.gdc.cancer.gov/Data_Submission_Portal/Users_Guide/Data_Submission_Overview/) 7. [GDC Data Transfer Tool](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool) ## External Links ## diff --git a/docs/Encyclopedia/pages/GDC_Data_Portal.md b/docs/Encyclopedia/pages/GDC_Data_Portal.md index 9396537e0..3dcf789f1 100644 --- a/docs/Encyclopedia/pages/GDC_Data_Portal.md +++ b/docs/Encyclopedia/pages/GDC_Data_Portal.md @@ -13,7 +13,7 @@ Key GDC Data Portal features include: * Secure data download directly from the cart or using the GDC Data Transfer Tool ## References ## -1. [GDC Data Portal](https://gdc-portal.nci.nih.gov) +1. [GDC Data Portal](https://portal.gdc.cancer.gov) 2. [GDC Data Portal User's Guide](https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Getting_Started/) 3. [GDC Legacy Archive](https://portal.gdc.cancer.gov/legacy-archive/search/f) diff --git a/docs/Encyclopedia/pages/GDC_Data_Submission_Portal.md b/docs/Encyclopedia/pages/GDC_Data_Submission_Portal.md index c2d7b3d6d..a36e6c6c7 100644 --- a/docs/Encyclopedia/pages/GDC_Data_Submission_Portal.md +++ b/docs/Encyclopedia/pages/GDC_Data_Submission_Portal.md @@ -13,8 +13,8 @@ Key GDC Data Submission Portal features include2: * __Status and Alerts:__ Visual cues are implemented to easily identify incomplete submissions. ## References ## -1. [GDC Data Submission Portal](https://gdc-portal.nci.nih.gov/submission/login?next=%2Fsubmission%2F) -2. [GDC Data Submission Portal User's Guide](https://docs.gdc.cancer.gov/Data_Submission_Portal/Users_Guide/Getting_Started/) +1. [GDC Data Submission Portal](https://portal.gdc.cancer.gov/submission/) +2. 
[GDC Data Submission Portal User's Guide](https://docs.gdc.cancer.gov/Data_Submission_Portal/Users_Guide/Data_Submission_Overview/) ## External Links ## * N/A diff --git a/docs/Encyclopedia/pages/Mutation_Annotation_Format.md b/docs/Encyclopedia/pages/Mutation_Annotation_Format.md index e9ba4106d..d5b44253b 100644 --- a/docs/Encyclopedia/pages/Mutation_Annotation_Format.md +++ b/docs/Encyclopedia/pages/Mutation_Annotation_Format.md @@ -13,7 +13,7 @@ The structure of the MAF is available in the [GDC MAF Specification](https://doc ## References ## 1. [GDC DNA-Seq Analysis](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/) 2. [GDC-Dictionary: Somatic Aggregation Pipeline](https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=somatic_aggregation_workflow) -3. [GDC MAF TCGA (Legacy)Format ](https://docs.gdc.cancer.gov/Encyclopedia/pages/Mutation_Annotation_Format_TCGA/) +3. [GDC MAF TCGA (Legacy) Format ](https://docs.gdc.cancer.gov/Encyclopedia/pages/Mutation_Annotation_Format_TCGAv2/) diff --git a/docs/Encyclopedia/pages/Mutation_Annotation_Format_TCGAv2.md b/docs/Encyclopedia/pages/Mutation_Annotation_Format_TCGAv2.md new file mode 100644 index 000000000..7e20fc7f5 --- /dev/null +++ b/docs/Encyclopedia/pages/Mutation_Annotation_Format_TCGAv2.md @@ -0,0 +1,373 @@ +Mutation Annotation Format (MAF) - Legacy TCGA Specification +============================================== + + +*This definition was taken from the previously public wiki hosted by TCGA and reflects the MAF format +that was available during the active period of the TCGA project.* + + + + +**Document Information** + +The spec has been reverted to the June 26th version (version 20). Additional +changes are the removal of the "under construction" banner, changing all text to +black, and fixing a typo in the link to the MAF 2.2 specification. 
+ +**Specification for Mutation Annotation Format** +Version 2.4.1 +June 20, 2014 + +**Contents** + +- 1 Current version changes + +- 2 About MAF specifications + + - 2.1 Definition of open access MAF + data + + - 2.2 Somatic MAF vs. Protected + MAF + +- 3 MAF file fields + + - 3.1 Table 1 - File column + headers + +- 4 MAF file checks + +- 5 MAF naming convention + +- 6 Previous specification + versions + +Current version changes +======================= + +This current revision is **version 2.4.1** of the Mutation Annotation Format +(MAF) specification. + +The following items in the specification were added or modified in version 2.4.1 +from version 2.4: + +- Header for MAF file is "\#version 2.4.1" + +- "Somatic" and "None" are the only acceptable values for "Mutation_Status" + for a somatic.MAF (named .somatic.maf). When Mutation_Status is None, + Validation_Status must be Invalid. + +- Centers need to make sure that Mutations_Status "None" doesn't include + germline mutation. + +- For a somatic MAF, following rules should be satisfied: + SOMATIC = (A AND (B OR C OR D)) OR (E AND F) + A: *Mutation_Status* == "Somatic" + B: *Validation_Status* == "Valid" + C. *Verification_Status* == "Verified" + D. *Variant_Classification* is not {Intron, 5'UTR, 3'UTR, 5'Flank, 3'Flank, + IGR}, which implies that *Variant_Classification* can only be + \\{Frame_Shift_Del, Frame_Shift_Ins, In_Frame_Del, In_Frame_Ins, + Missense_Mutation, Nonsense_Mutation, Silent, Splice_Site, + Translation_Start_Site, Nonstop_Mutation, RNA, Targeted_Region}. + E: *Mutations_status == "None"* + F: *Validation_status == "Invalid"* + +- Extra validation rules: If Validation_Status == Valid or Invalid, then + Validation_Method != none (case insensitive). + +About MAF specifications +======================== + +Mutation annotation files should be transferred to the DCC. Those files should +be formatted using the mutation annotation format (MAF) that is described below. 
+File naming convention is also +[below](#MutationAnnotationFormat(MAF)Specificat). + +Following categories of somatic mutations are reported in MAF files: + +- Missense and nonsense + +- Splice site, defined as SNP within 2 bp of the splice junction + +- Silent mutations + +- Indels that overlap the coding region or splice site of a gene or the + targeted region of a genetic element of interest. + +- Frameshift mutations + +- Mutations in regulatory regions + +### Definition of open access MAF data + +A large proportion of MAFs are submitted as discovery data and sites labeled as +somatic in these files overlap with known germline variants. In order to +minimize germline contamination in putative (unvalidated) somatic calls, certain +filtering criteria have been imposed. Based on current policy, open access MAF +data should: + +- **include** all validated somatic mutation calls + +- **include** all unvalidated somatic mutation calls that overlap with a + coding region or splice site + +- **exclude** all other types of mutation calls (i.e., non-somatic calls + (validated or not), unvalidated somatic calls that are not in coding region + or splice sites, and dbSNP sites that are not annotated as somatic in dbSNP, + COSMIC or OMIM) + + + +### Somatic MAF vs. Protected MAF + +Centers will submit to the DCC MAF archives that contain Somatic MAF +(named**.somatic.maf**) for open access data and an all-inclusive Protected MAF +(named**.protected.maf**) that does not filter any data out and represents the +original super-set of mutation calls. The files will be formatted using the +Mutation Annotation Format (MAF). + +The following table lists some of the critical attributes of somatic and +protected MAF files and provides a comparison. + +| Attribute | Somatic MAF | Protected MAF | +| ----------- | ----------- | ------------- | +| **File naming** | Somatic MAFs should be named as**\*.somatic.maf**and cannot contain 'germ' or 'protected' in file name. 
| Protected MAFs should be named as**\*.protected.maf**and should not contain 'somatic' in the file name. | +| **Mutation category** | Somatic MAFs can only contain entries where*Mutation_Status*is "Somatic". If any other value is assigned to the field, the archive will fail. Experimentally validated or unvalidated (see next row) somatic mutations can be included in the file. | There is no such restriction for protected MAF. The file should contain all mutation calls including those from which .somatic.maf is derived. | +| **Filtering criteria** | In order to minimize germline contamination, somatic MAFs can contain unvalidated somatic mutations only from coding regions and splice sites, which implies: | There are no such constraints for mutations in protected MAF. | +| | If *Validation_Status* **is**"Unknown",*V a riant_Classification* **cannot** be 3'UTR, 3'Flank, 5'UTR, 5'Flank, IGR, or Intron.*Variant_Classification*can only be \\{Frame_Shift_Del, Frame_Shift_Ins, In_Frame_Del, In_Frame_Ins, Missense_Mutation, Nonsense_Mutation, Silent, Splice_Site, Translation_Start_Site, Nonstop_Mutation, RNA, Targeted_Region, De_novo_Start_InFrame, De_novo_Start_OutOfFrame\\}. | | +| | There is no such constraint for experimentally validated (*Validation_Status*is "Valid") somatic mutations. | | +| | | | +| | dbSNP sites that are not annotated as somatic in dbSNP, COSMIC or OMIM must be removed from somatic MAFs. | | +| **Access level** | These files are deployed as open access data. | These files are deployed as protected data. | + +MAF file fields +=============== + +The format of a MAF file is tab-delimited columns. Those columns are described +in Table 1 and are required in every MAF file. The order of the columns will be +validated by the DCC. Column headers and values **are** case sensitive where +specified. Columns may allow null values (i.e.\_ blank cells) and/or have +enumerated values. 
**The validator looks for a header stating the version of the +specification to validate against (e.g. \#version 2.4). If no such header is found, validation +fails.** Any columns that come after the columns described in Table 1 are +optional. Optional columns are not validated by the DCC and can be in any order. + + + +Table 1 - File column headers +----------------------------- + + + +| **Index** | **MAF Column Header** | **Description of Values** | **Example** | **Case Sensitive** | **Null** | **Enumerated** | +| --------- | --------------------- | ------------------------- | ----------- | ------------------ | -------- | -------------- | +| 1 | Hugo_Symbol | HUGO symbol for the gene (HUGO symbols are *always* in all caps). If no gene exists within 3kb enter "Unknown". | EGFR | Yes | No | Set or Unknown | | | | | | | | | | +| | | Source: | | | | | | | | | | | | | | +| 2 | Entrez_Gene_Id | Entrez gene ID (an integer). If no gene exists within 3kb enter "0". | 1956 | No | No | Set | | | | | | | | | | +| | | Source: | | | | | | | | | | | | | | +| 3 | Center | Genome sequencing center reporting the variant. If multiple institutions report the same mutation separate list using semicolons. Non-GSC centers will be also supported if center name is an accepted center name. | hgsc.bcm.edu;genome.wustl.edu | Yes | No | Set | | | | | | | | | | +| 4 | NCBI_Build | Any TCGA accepted genome identifier. Can be string, integer or a float. | hg18, hg19, GRCh37, GRCh37-lite, 36, 36.1, 37, | No | No | Set and Enumerated. | | | | | | | | | | +| 5 | Chromosome | Chromosome number without "chr" prefix that contains the gene. | X, Y, M, 1, 2, etc. | Yes | No | Set | | | | | | | | | | +| 6 | Start_Position | Lowest numeric position of the reported variant on the genomic reference sequence. Mutation start coordinate (1-based coordinate system). | 999 | No | No | Set | | | | | | | | | | +| 7 | End_Position | Highest numeric genomic position of the reported variant on the genomic reference sequence.
Mutation end coordinate (inclusive, 1-based coordinate system). | 1000 | No | No | Set | | | | | | | | | | +| 8 | Strand | Genomic strand of the reported allele. Variants should always be reported on the positive genomic strand. (Currently, only the positive strand is an accepted value). | \+ | No | No | \+ | | | | | | | | | | +| 9 | Variant_Classification | Translational effect of variant allele. | Missense_Mutation | Yes | No | Frame_Shift_Del, Frame_Shift_Ins, In_Frame_Del, In_Frame_Ins, Missense_Mutation, Nonsense_Mutation, Silent, Splice_Site, Translation_Start_Site, Nonstop_Mutation, 3'UTR, 3'Flank, 5'UTR, 5'Flank, IGR *(See Notes Section #1)* , Intron, RNA, Targeted_Region | | | | | | | | | | +| 10 | Variant_Type | Type of mutation. TNP (tri-nucleotide polymorphism) is analogous to DNP but for 3 consecutive nucleotides. ONP (oligo-nucleotide polymorphism) is analogous to TNP but for consecutive runs of 4 or more. | INS | Yes | No | SNP, DNP, TNP, ONP, INS, DEL, or Consolidated *(See Notes Section #2)* ) | | | | | | | | | | +| 11 | Reference_Allele | The plus strand reference allele at this position. Include the sequence deleted for a deletion, or "-" for an insertion. | A | Yes | No | A,C,G,T and/or - | | | | | | | | | | +| 12 | Tumor_Seq_Allele1 | Primary data genotype. Tumor sequencing (discovery) allele 1. " -" for a deletion represent a variant. "-" for an insertion represents wild-type allele. Novel inserted sequence for insertion should not include flanking reference bases. | C | Yes | No | A,C,G,T and/or - | | | | | | | | | | +| 13 | Tumor_Seq_Allele2 | Primary data genotype. Tumor sequencing (discovery) allele 2. " -" for a deletion represents a variant. "-" for an insertion represents wild-type allele. Novel inserted sequence for insertion should not include flanking reference bases. | G | Yes | No | A,C,G,T and/or - | | | | | | | | | | +| 14 | dbSNP_RS | Latest dbSNP rs ID (dbSNP_ID) or "novel" if there is no dbSNP record. 
source: | rs12345 | Yes | Yes | Set or "novel" | | | | | | | | | | +| 15 | dbSNP_Val_Status | dbSNP validation status. Semicolon- separated list of validation statuses. | by2Hit2Allele;byCluster | No | Yes | by1000genomes;by2Hit2Allele; byCluster; byFrequency; byHapMap; byOtherPop; bySubmitter; alternate_allele *(See Notes Section #3)* **Note that "none" will no longer be an acceptable value.** | | | | | | | | | | +| 16 | Tumor_Sample_Barcode | BCR aliquot barcode for the tumor sample including the two additional fields indicating plate and well position. i.e. TCGA-SiteID-PatientID-SampleID-PortionID-PlateID-CenterID. The full TCGA Aliquot ID. | TCGA-02-0021-01A-01D-0002-04 | Yes | No | Set | | | | | | | | | | +| 17 | Matched_Norm_Sample_Barcode | BCR aliquot barcode for the matched normal sample including the two additional fields indicating plate and well position. i.e. TCGA-SiteID-PatientID-SampleID-PortionID-PlateID-CenterID. The full TCGA Aliquot ID; e.g. TCGA-02-0021-10A-01D-0002-04 (compare portion ID '10A' normal sample, to '01A' tumor sample). | TCGA-02-0021-10A-01D-0002-04 | Yes | No | Set | | | | | | | | | | +| 18 | Match_Norm_Seq_Allele1 | Primary data. Matched normal sequencing allele 1. "-" for deletions; novel inserted sequence for INS not including flanking reference bases. | T | Yes | Yes | A,C,G,T and/or - | | | | | | | | | | +| 19 | Match_Norm_Seq_Allele2 | Primary data. Matched normal sequencing allele 2. "-" for deletions; novel inserted sequence for INS not including flanking reference bases. | ACGT | Yes | Yes | A,C,G,T and/or - | | | | | | | | | | +| 20 | Tumor_Validation_Allele1 | Secondary data from orthogonal technology. Tumor genotyping (validation) for allele 1. "-" for deletions; novel inserted sequence for INS not including flanking reference bases. | \- | Yes | Yes | A,C,G,T and/or - | | | | | | | | | | +| 21 | Tumor_Validation_Allele2 | Secondary data from orthogonal technology. Tumor genotyping (validation) for allele 2. 
"-" for deletions; novel inserted sequence for INS not including flanking reference bases. | A | Yes | Yes | A,C,G,T and/or - | | | | | | | | | | +| 22 | Match_Norm_Validation_Allele1 | Secondary data from orthogonal technology. Matched normal genotyping (validation) for allele 1. "-" for deletions; novel inserted sequence for INS not including flanking reference bases. | C | Yes | Yes | A,C,G,T and/or - | | | | | | | | | | +| 23 | Match_Norm_Validation_Allele2 | Secondary data from orthogonal technology. Matched normal genotyping (validation) for allele 2. "-" for deletions; novel inserted sequence for INS not including flanking reference bases. | G | Yes | Yes | A,C,G,T and/or - | | | | | | | | | | +| 24 | Verification_Status *(See Notes Section #4)* | Second pass results from independent attempt using same methods as primary data source. Generally reserved for 3730 Sanger Sequencing. | Verified | Yes | Yes | Verified, Unknown | | | | | | | | | | +| 25 | Validation_Status *(See Notes Section #5)* | Second pass results from orthogonal technology. | Valid | Yes | No | Untested, Inconclusive, Valid, Invalid | | | | | | | | | | +| 26 | Mutation_Status | Updated to reflect validation or verification status and to be in agreement with the [VCF VLS](https://wiki.nci.nih.gov/x/2gcYAw) field. The values allowed in this field are constrained by the value in the Validation_Status field. | Somatic | Yes | No | **Validation_Status values:** Untested, Inconclusive, Valid, Invalid - **Allowed Mutation_Status Values for Untested and Inconclusive:** *(See Notes Section #6)* None, Germline, Somatic, LOH, Post-transcriptional modification, Unknown **Allowed Mutation_Status Values for Valid:** *(See Notes Section #6)* Germline, Somatic, LOH, Post-transcriptional modification, Unknown - **Allowed Mutation_Status Values for Invalid:** *(See Notes Section #6)* none | | | | | | | | | | + | | | | | | | | | | | | | | | +| 27 | Sequencing_Phase | TCGA sequencing phase.
Phase should change under any circumstance that the targets under consideration change. | Phase_I | No | Yes | No | | | | | | | | | | +| 28 | Sequence_Source | Molecular assay type used to produce the analytes used for sequencing. Allowed values are a subset of the [SRA 1.5](http://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/sra/doc/SRA_1-5/) library_strategy field values. This subset matches those used at CGHub. | WGS;WXS | Yes | No | **Common TCGA values:** WGS, WGA, WXS, RNA-Seq, miRNA-Seq, Bisulfite-Seq, VALIDATION, Other **Other allowed values (per SRA 1.5)** ncRNA-Seq, WCS, CLONE, POOLCLONE, AMPLICON, CLONEEND, FINISHING, ChIP-Seq, MNase-Seq, DNase-Hypersensitivity, EST, FL-cDNA, CTS, MRE-Seq, MeDIP-Seq, MBD-Seq, Tn-Seq, FAIRE-seq, SELEX, RIP-Seq, ChIA-PET + | | | | | | | | | | +| 29 | Validation_Method | The assay platforms used for the validation call. Examples: Sanger_PCR_WGA, Sanger_PCR_gDNA, 454_PCR_WGA, 454_PCR_gDNA; separate multiple entries using semicolons. | Sanger_PCR_WGA;Sanger_PCR_gDNA | No | **NO**. I**f Validation_Status = Untested then "none"** If Validation_Status = Valid or Invalid, then not "none" (case insensitive) | No | | | | | | | | | | +| 30 | Score | Not in use. | NA | No | Yes | No | | | | | | | | | | +| 31 | BAM_File | Not in use. | NA | No | Yes | No | | | | | | | | | | +| 32 | Sequencer | Instrument used to produce primary data. Separate multiple entries using semicolons. | Illumina GAIIx;SOLID | Yes | No | Illumina GAIIx, Illumina HiSeq, SOLID, 454, ABI 3730xl, Ion Torrent PGM, Ion Torrent Proton, PacBio RS, Illumina MiSeq, Illumina HiSeq 2500, 454 GS FLX Titanium, AB SOLiD 4 System | | | | | | | | | | +| 33 | Tumor_Sample_UUID | BCR aliquot UUID for tumor sample | 550e8400-e29b-41d4-a716-446655440000 | Yes | No | | | | | | | | | | | +| 34 | Matched_Norm_Sample_UUID | BCR aliquot UUID for matched normal | 567e8487-e29b-32d4-a716-446655443246 | Yes | No | + +**Notes**
    +*1 Intergenic Region.*
    +*2 Consolidated is used to indicate a site that was initially reported as a variant but subsequently removed from further analysis because it was consolidated into a new variant. For example, a SNP variant incorporated into a TNP variant.*
    +*3 Used when the discovered variant differs from that of dbSNP.*
    +*4 These MAF headers describe the technology that was used to confirm a mutation, whether the same technology ("verification") or a different technology ("validation") is used to prove that a variant is germline or a somatic mutation.*
    +*5 These MAF headers describe the technology that was used to confirm a mutation, whether the same technology (verification) or a different technology (validation) is used to prove that a variant is germline or a somatic mutation.*
    +*6 Explanation of some Validation Status-Mutation Status combinations.*
    + +| Validation Status | Mutation Status | Explanation | +| ------------------ | --------------- | ----------- | +| Valid | Unknown | a valid variant with unknown somatic status due to lack of data from matched normal tissue. | +| Invalid | None | validation attempted, tumor and normal are homozygous reference (formerly described as Wildtype) | +| Inconclusive | Unknown | validation failed, neither the genotype nor its somatic status is certain due to lack of data from matched normal tissue | +| Inconclusive | None | validation failed, tumor genotype appears to be homozygous reference | + + Important Criteria + + **Index column indicates the order in which the columns are expected**. **All + headers are case sensitive.** The Case Sensitive column specifies which values + are case sensitive. The Null column indicates which MAF columns are allowed to + have null values. The Enumerated column indicates which MAF columns have + specified values: an Enumerated value of "No" indicates that there are no + specified values for that column; other values indicate the specific values + allowed; a value of "Set" indicates that the MAF column values come from + a specified set of known values (*e.g.* HUGO gene symbols). + + +MAF file checks +=============== + +The DCC Archive Validator checks the integrity of a MAF file. Validation will +fail if any of the below are not true for a MAF file: + +1. Column header text (including case) and order must match specification + (Table 1) exactly + +2. Values under column headers listed in the specification (Table 1) as not + null must have values + +3. Values that are specified in Table 1 as Case Sensitive must match the + specified case. + +4. If column headers are listed in the specification as having *enumerated* + values (*i.e.* a "Yes" in the "Enumerated" column), then the values under + those columns must come from the enumerated values listed under "Enumerated". + +5.
If column headers are listed in the specification as having *set* values + (*i.e.* a "Set" in the "Enumerated" column), then the values under those + columns must come from the enumerated values of that domain (*e.g.* HUGO gene + symbols). + +6. All Allele-based columns must contain - (deletion), or a string composed of + the following capitalized letters: A, T, G, C. + +7. If Validation_Status == "Untested" + then Tumor_Validation_Allele1, Tumor_Validation_Allele2, Match_Norm_Validation_Allele1, Match_Norm_Validation_Allele2 can + be null (depending on Validation_Status). + + 1. If Validation_Status == "Inconclusive" + then Tumor_Validation_Allele1, Tumor_Validation_Allele2, Match_Norm_Validation_Allele1, Match_Norm_Validation_Allele2 can + be null (depending on Validation_Status)**.** + +8. If Validation_Status == Valid, then Validated_Tumor_Allele1 and + Validated_Tumor_Allele2 must be populated (one of A, C, G, T, and -) + + 1. If Validation_Status == "Valid" then Tumor_Validation_Allele1, + Tumor_Validation_Allele2, Match_Norm_Validation_Allele1, + Match_Norm_Validation_Allele2 cannot be null + + 2. If Validation_Status == "Invalid" + then Tumor_Validation_Allele1, Tumor_Validation_Allele2, Match_Norm_Validation_Allele1, Match_Norm_Validation_Allele2 cannot + be null AND Tumor_Validation_Allele1 == + Match_Norm_Validation_Allele1 AND Tumor_Validation_Allele2 == + Match_Norm_Validation_Allele2 (Added as a replacement for 8a as a + result of breakdown) + +9. Check allele values against Mutation_Status: + Check allele values against Validation_Status: + + 1. If Mutation_Status == "Germline" and Validation_Status == "Valid", then + Tumor_Validation_Allele1 == Match_Norm_Validation_Allele1 and + Tumor_Validation_Allele2 == Match_Norm_Validation_Allele2. + + 2.
If Mutation_Status == "Somatic" and Validation_Status == "Valid", then + Match_Norm_Validation_Allele1 == Match_Norm_Validation_Allele2 == + Reference_Allele and (Tumor_Validation_Allele1 or + Tumor_Validation_Allele2) != Reference_Allele + + 3. If Mutation_Status == "LOH" and Validation_Status=="Valid", then + Tumor_Validation_Allele1 == Tumor_Validation_Allele2 and + Match_Norm_Validation_Allele1 != Match_Norm_Validation_Allele2 and + Tumor_Validation_Allele1 == (Match_Norm_Validation_Allele1 or + Match_Norm_Validation_Allele2). + +10. Check that Start_position \<= End_position + +11. Check for the Start_position and End_position against Variant_Type: + + 1. If Variant_Type == "INS", then (End_position - Start_position + 1 == + length (Reference_Allele) or End_position - Start_position == 1) and + length(Reference_Allele) \<= length(Tumor_Seq_Allele1 and + Tumor_Seq_Allele2) + + 2. If Variant_Type == "DEL", then End_position - Start_position + 1 == + length (Reference_Allele), then length(Reference_Allele) \>= + length(Tumor_Seq_Allele1 and Tumor_Seq_Allele2) + + 3. If Variant_Type == "SNP", then length(Reference_Allele and + Tumor_Seq_Allele1 and Tumor_Seq_Allele2) == 1 and (Reference_Allele and + Tumor_Seq_Allele1 and Tumor_Seq_Allele2) != "-" + + 4. If Variant_Type == "DNP", then length(Reference_Allele and + Tumor_Seq_Allele1 and Tumor_Seq_Allele2) == 2 and (Reference_Allele and + Tumor_Seq_Allele1 and Tumor_Seq_Allele2) !contain "-" + + 5. If Variant_Type == "TNP", then length(Reference_Allele and + Tumor_Seq_Allele1 and Tumor_Seq_Allele2) == 3 and (Reference_Allele and + Tumor_Seq_Allele1 and Tumor_Seq_Allele2) !contain "-" + + 6. If Variant_Type == "ONP", then length(Reference_Allele) == + length(Tumor_Seq_Allele1) == length(Tumor_Seq_Allele2) \> 3 and + (Reference_Allele and Tumor_Seq_Allele1 and Tumor_Seq_Allele2) !contain + "-" + +12. Validation for UUID-based files: + + 1. 
Column \#33 must be Tumor_Sample_UUID containing UUID of the BCR aliquot + for tumor sample + + 2. Column \#34 must be Matched_Norm_Sample_UUID containing UUID of the BCR + aliquot for matched normal sample + + 3. Metadata represented by Tumor_Sample_Barcode and + Matched_Norm_Sample_Barcode should correspond to the UUIDs assigned to + Tumor_Sample_UUID and Matched_Norm_Sample_UUID respectively + +13. If Validation_Status == "Valid" or "Invalid", then Validation_Method != + "none" (case insensitive). + +MAF naming convention +===================== + +In archives uploaded to the DCC, the MAF file name should relate to the +containing archive name in the following way: + +If the archive has the name + + \<domain\>\_\<disease_abbrev\>\.\<platform\>\.Level_2.\<serial_index\>\.\<revision\>\.0.tar.gz + +then a somatic MAF file with the archive should be named according to + + \<domain\>\_\<disease_abbrev\>\.\<platform\>\.Level_2.\<serial_index\>\[.\<optional_tag\>\].somatic.maf + +and a protected MAF with the archive should be named according to + + \<domain\>\_\<disease_abbrev\>\.\<platform\>\.Level_2.\<serial_index\>\[.\<optional_tag\>\].protected.maf + +The \<optional_tag\> may consist of alphanumeric characters, dash, and +underscore; no spaces or periods; or it may be left out altogether. The purpose +of the optional tag is to impart some brief annotation. + +*Example* + +For the archive + + genome.wustl.edu_OV.IlluminaGA_DNASeq.Level_2.7.6.0.tar.gz + +the following are examples of valid maf names + + genome.wustl.edu_OV.IlluminaGA_DNASeq.Level_2.7.somatic.maf + genome.wustl.edu_OV.IlluminaGA_DNASeq.Level_2.7.protected.maf diff --git a/docs/Encyclopedia/pages/TCGA_VCF_1.1v2.md b/docs/Encyclopedia/pages/TCGA_VCF_1.1v2.md new file mode 100644 index 000000000..b8d17d479 --- /dev/null +++ b/docs/Encyclopedia/pages/TCGA_VCF_1.1v2.md @@ -0,0 +1,1351 @@ +TCGA Variant Call Format (VCF) 1.1 Specification +================================================ + +**Document Information** +This document is retained here for reference purposes and should not be considered the current standard.
+ + +**Specification for TCGA Variant Call Format (VCF)** +Version 1.1 + + +Please note that VCF files are treated as **protected** data and must be +submitted to the DCC only in **Level 2** archives. + +About TCGA VCF specification +============================ + +Variant Call Format (VCF) is a format for storing and reporting genomic sequence +variations. VCF files are modular where the annotations and genotype information +for a variant are separated from the call itself. As of May 2011, VCF version +4.1 (described +[here](http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41)) +is the most recent release. GSCs will generate sequence variation data using +high-throughput sequencing technologies and resulting variations will be +submitted to DCC as VCF files. TCGA has adopted VCF 4.1 with certain +modifications to support supplemental information specific to the project. +Subsequent sections describe the format TCGA VCF files should follow and +validation steps that would have to be implemented at the DCC. + +Summary of current version changes +================================== + +Following is a summary of additions/modifications for this version and the corresponding validation rule +number is included in parentheses. + +**UUID compliance**: All TCGA data is currently in the process of being +converted to be UUID-compliant. Until the conversion is complete and all centers +are prepared to start submitting UUID-compliant data, some of the VCF files may +adhere to UUID-based specification whereas some may still have barcodes. +Non-UUID files will follow the specification described here but for +UUID-compliance, VCF files should satisfy the following criteria. + +1. **SampleUUID** and **SampleTCGABarcode** are required tags in each + ##SAMPLE declaration. Please note that **SampleName** will not be a + required tag once submitting center has fully converted to UUIDs. + + 1. 
Metadata represented by SampleTCGABarcode at the DCC should correspond + to the UUID assigned to SampleUUID. + +2. **Individual** is not a required tag in ##SAMPLE declaration. + +3. If ##**INDIVIDUAL** is declared in the header, all SampleUUIDs in the + header must correspond to the same participant, and the corresponding TCGA + barcode for that participant should be assigned to ##INDIVIDUAL. + + + +1. SampleName is a required tag in ##SAMPLE declaration. The value assigned + to SampleName should be a valid [aliquot + barcode](https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/) / [UUID](https://docs.gdc.cancer.gov/Encyclopedia/pages/UUID/) + in the database. (#15b, #15h) + +2. Header declarations for INFO and FORMAT fields should match the values + defined in Tables 4 and 5 respectively. (#7a) + +3. Following FORMAT fields are required for all variant records in a VCF file: + (#10c) + + - Genotype (**GT**) + + - Read depth (**DP**) + + - Reads supporting ALT (**AD** or **DP4**) + + - Average base quality for reads supporting alleles (**BQ**) + + - Somatic status of the variant (**SS**). SS can be 0, 1, 2, 3, 4 or 5 + depending on whether relative to normal the variant is wildtype, + germline, somatic, LOH, post-transcriptional modification, or unknown + respectively. (#23) + +4. Values for INFO field **VLS** (validation status relative to non-adjacent + Normal) will be checked for validity. It can be 0, 1, 2, 3, 4, or 5 based on + whether the mutation is wildtype, germline, somatic, LOH, post + transcriptional modification, or unknown respectively. (#9c) + +5. Validation of tags in PEDIGREE declaration has changed as follows: (#16) + + - Name_0, Name_1, etc. do not have to be these literal strings but instead + represent arbitrary strings. + + - The keys and values used in the should be unique + across assignments in any given PEDIGREE declaration. + + - Value assigned in does not have to be defined as a + SAMPLE in a genotype column or in the header. 
+ +TCGA-specific customizations +============================ + +The VCF 4.1 specification has been customized to support TCGA-specific variant +information. While majority of the steps pertaining to the basic structure of +the file remain the same, checks for supplemental information fields have been +introduced. For example, TCGA VCF specification allows for additional fields to +represent data associated with complex rearrangements, RNA-Seq variants, and +sample-specific metadata. + +All TCGA-specific additions and modifications in [validation +steps](#validation-rules) are prefixed with a + tag for convenient comparison with 1000Genomes VCF 4.1. The +following table summarizes TCGA-specific customizations that have been added to +the VCF 4.1 specification. The first column, "Customization type", indicates +whether a new validation step has been introduced or if an existing step has +been modified + +**Table 1: TCGA-specific validation steps** + +| **Customization type** | **Description** | **Validation step # in TCGA-VCF 1.1 spec** | **Corresponding validation step # in VCF 4.1 spec** | +|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------|------------------------------------------------------| +| New | Validate that file contains ##tcgaversion HEADER line. 
Its presence indicates that the file is TCGA VCF and the value assigned to the field contains format version number | \--- | \--- | +| New | Additional mandatory header lines (Please refer to [Table 2](#TCGAVariantCallFormat(VCF)1.1Specificat)) | \#1 | \#1 | +| New | Validation of SAMPLE meta-information lines | \#15 | \--- | +| New | Validation of PEDIGREE meta-information lines | \#16 | \--- | +| Modification | Acceptable value set for CHROM has been modified | \#18a,b | \#16a | +| Modification | Acceptable value set for ALT has been modified | \#19 | \#17 | +| New | Validation for INFO sub-field "VT" has been added | \#22 | \--- | +| New | Validation for FORMAT sub-field "SS" has been added | \#23 | \--- | +| New | Validation for INFO/FORMAT sub-field "DP" has been added | \#24 | \--- | +| New | Validation for complex rearrangement records has been added | \#25 | \--- | +| New | Validation for RNA-Seq annotation fields has been added | \#26 | \--- | +| New | Mandatory FORMAT fields have been added | \#10c | \--- | +| New | Check for consistent definitions for INFO and FORMAT fields | \#7a | \--- | + +File format +=========== + +The following example (based on [VCF version +4.1)](http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41) +shows different components of a TCGA VCF file. Any VCF file contains two main +sections. The HEADER section contains meta-information for variant records that +are reported as individual rows in the BODY of the VCF file. Both sections are +described below. + +**Case-sensitivity**: Please note that all fields and their associated +validation rules are case-sensitive (as given in the specification) unless noted +otherwise. 
+ +**Figure 1: Components of a sample TCGA VCF file** + +| ![images](images/vcfExample_VCF.png) | +|------------------------------------------| + + + + +HEADER +------ + +The HEADER contains meta-information lines that provide supplemental information +about variants contained in BODY of the file. HEADER lines could be formatted in +the following two ways: + + + ##key=value + + Example: + + ##fileformat=VCFv4.1 + + ##fileDate=20090805 + + +or + + ##FIELDTYPE= + + Example: + + ##INFO= + +Meta-information could be applicable either to all variant records in the file +(e.g., date of creation of file) or to individual variants (e.g., flag to +indicate whether a given variant exists in dbSNP). + +### Generic meta-information + +**Format**: *##key=value* OR *##FIELDTYPE=* + +The following table lists some of the reserved field names. Files can be +customized to contain additional meta-information fields as long as they are not +in conflict with reserved field names. The first field in Table 2 (fileformat) +is mandatory and lists the VCF version number of the file. + +**Table 2: Examples of generic meta-information fields** + +| **Field** | **Case-Sensitive** | **Description** | **Sample values** | Required (fields in red are TCGA-specific requirements) +| ------------- | --------------------- | --------------- | ------------------ | ----------------------------------------------------- | +| Fileformat | No | Lists the VCF version number the file is based on; must be the first line in the file | ##fileformat=VCFv4.1 | Yes | +| fileDate | No | Date file was created; should be in yyyymmdd format | ##fileDate=20090805 | Yes | +| Tcgaversion | No | Indicates that the file follows TCGA-VCF specification. Format version number is assigned to the field. 
| ##tcgaversion=1.1 | Yes | +| Reference | No | Reference build used for variant calling and against which variant coordinates are shown | ##reference=1000GenomesPilot-NCBI36 | Yes | +| | | | | | +| | | | OR | | +| | | | | | +| | | | ##reference= | | +| Assembly | No | External assembly file. The field can be assigned a file name if assembly file is included in the archive submitted to the DCC or it can be a URL pointing to the file location. | ##assembly=[ftp://ftp-trace.ncbi.nih.gov/](ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/sv/breakpoint_assemblies.fasta) | Yes | +| | | | [1000genomes/ftp/release/sv/breakpoint_assemblies.fasta](ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/sv/breakpoint_assemblies.fasta) | (if a contig from an assembly file is being referred to in the VCF file, especially for breakends) | +| center | No | Name of the center where VCF file is generated. A comma-separated list can be provided if files from multiple centers are merged. | ##center="Broad" | Yes | +| | | | | | +| | | | OR | | +| | | | | | +| | | | ##center="Broad,UCSC,BCM" | | +| phasing | No | Indicates whether genotype calls are partially phased (phasing=partial) or unphased (phasing=none) | ##phasing=none | Yes | +| geneAnno | No | URL of the gene annotation source e.g., Generic Annotation File (GAF) | ##geneAnno=[https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files](https://api.gdc.cancer.gov/legacy/data/95c3618c-bd9e-4df4-96e4-ef8d54710e51) | Yes (if annotation tags like GENE, SID and RGN are used) | +| vcfProcessLog | No | Lists algorithm, version and settings used to generate variant calls in a VCF file. If multiple VCF files are processed to produce a single merged file, the field records attributes for individual VCF files and the programs used to merge the files along with the associated version, parameters and contact information of the person who produced the merged file. 
| | | | | **Note**: If VCF file does not represent a set of merged files, *MergeSoftware*, *MergeParam*, *MergeVer* and *MergeContact* tags will not be applicable and can be omitted. + | | | | ##vcfProcessLog=, InputVCFSource=, | | +| | | **Note**: If VCF file does not represent a set of merged files, *MergeSoftware*, *MergeParam*, *MergeVer* and *MergeContact* tags will not be applicable and can be omitted. | InputVCFVer=<1.0>, | | +| | | | InputVCFParam= | | +| | | **Note**: If multiple parameters need to be declared in *InputVCFParam*, key=value pairs can be used to name these parameters. For example: | InputVCFgeneAnno=> | | +| | | InputVCFParam= | | | +| | | If there are multiple files for which parameters have to be declared, following format can be used: | OR | | +| | | InputVCFParam= | | | +| | | | ##vcfProcessLog=, | | +| | | | InputVCFSource=, | | +| | | | InputVCFVer=<1.0,2.1,2.0>, | | +| | | | InputVCFParam=, | | +| | | | InputVCFgeneAnno=, | | +| | | | MergeSoftware=, | | +| | | | MergeParam=, | | +| | | | MergeVer=<2.1,3.0>, | | +| | | | MergeContact=> | | +| INDIVIDUAL | No | Specifies the individual for which data is presented in the file | ##INDIVIDUAL=TCGA-24-0980 | No | + +### INFO/FORMAT/FILTER meta-information + +**Format**: *##FIELDTYPE=* + +INFO, FORMAT and FILTER (case-sensitive values) are optional fields that have to +be declared in the HEADER if they are being referred to in BODY of the file. +Different *keys* that can be used to define them are described in Table 3. All +three fields do not use the same set of keys. Please refer to individual field +definitions for further details. + +**Important**: TCGA VCF 1.1 requires all VCF files to follow consistent header +declarations for standard INFO and FORMAT sub-fields. Please refer to Tables 4 +and 5 for details. 
If a sub-field exists in these tables and is used in a TCGA +VCF file, then all pairs in the definition should match entries in +the corresponding table for the file to pass validation. + +**Table 3: Description of keys used in INFO/FORMAT/FILTER meta-information +declarations** + +| **Key** | **Case-sensitive** | **Description** | **Data Type (Possible values)** | **Additional Notes** | +| ----------- | ------------------ | --------------- | ------------------------------- | -------------------- | +| ID | Yes | name of the field; also used in BODY of the file to assign values for individual variant records | String, no whitespaces, no comma | \--- | +| Number | Yes | specifies the number of values that can be associated with the corresponding field | Set | Any integer \>= 0 indicating number of values; | +| | | | *(Integer \>= 0, "A", "G", ".")* | "A", if the field has one value per alternate allele; | +| | | | | "G", if the field has one value per genotype; | +| | | | | ".", if number of values varies, is unknown, or is unbounded | +| Type | Yes | indicates data type of the value associated with the field | Set | "Flag" type indicates that the field does not contain a value entry, and hence the *Number* should be 0 in this case. FORMAT fields cannot have a "Flag" *Type* assigned to them. | +| | | | *(Integer, Float, Flag, Character, String)* | | +| Description | Yes | provides a brief description of the field | String, surrounded by double-quotes, cannot itself contain a double-quote, cannot contain trailing whitespace at the end of string before closing quotes | \--- | + +#### INFO lines + +**Format**: *##INFO=* +**Required keys**: ID, Type, Number, Description + +INFO fields are optional and contain additional annotations for a variant. +Certain INFO fields have already been created and exist as reserved fields in +the current VCF standard. Custom INFO fields can be added based on study +requirements as long as they do not use the reserved field names. 
If an INFO +field is declared in the header, it needs to be described further using the +following format: + + ##INFO= + + Example: + + ##INFO= + + ##INFO= + +#### FORMAT lines + +**Format**: *##FORMAT=* +**Required keys**: ID, Type, Number, Description + +FORMAT declaration lines are used when annotations need to be added for +individual genotypes associated with each sample in the file. FORMAT sub-fields +are declared precisely as the INFO sub-fields with the exception that a FORMAT +sub-field cannot be assigned a "Flag" *Type.* + + ##FORMAT= + + Example: + + ##FORMAT= + + ##FORMAT= + +**Important**: TCGA VCF 1.1 requires the following FORMAT sub-fields to be +defined for all variant records. Therefore, these FORMAT lines are not optional +for TCGA VCF files and should be declared in the header. Please refer to Table 5 +for definitions for these sub-fields. + +- Genotype (**GT**) + +- Read depth (**DP**) + +- Reads supporting ALT (**AD** or **DP4**). Either AD or DP4 is required to be + defined although DP4 is preferred. + +- Average base quality for reads supporting alleles (**BQ**) + +- Somatic status of the variant (**SS**). SS can be 0, 1, 2, 3, 4, or 5 + depending on whether relative to normal the variant is wildtype, germline, + somatic, LOH, post-transcriptional modification, or unknown respectively. + +These should be considered as required fields so that they are included by +default unless there is an exceptional scenario where the information for a +field cannot be obtained. In such a case, "." can be used to indicate missing +value. + +#### FILTER lines + +**Format**: *##FILTER=* +**Required keys**: ID, Description + +FILTER fields are defined to list filtering criteria used for generating variant +calls. Custom filters can be applied as long as a definition is provided in the +HEADER. FILTERs that have been applied to the data should be described as +follows. Please note that FILTER declarations do not include *Type* or *Number* +keys. 
+ + ##FILTER= + + Example: + + ##FILTER= + + ##FILTER= + +#### Consistent definitions for reserved INFO and FORMAT fields + +To ensure that all TCGA VCF files have consistent definitions for standard +fields and to avoid merging errors due to contradicting definitions, following +header declarations for common fields are proposed. The 'Source' column in the +tables below indicates whether the field is from 1000Genomes VCF or if it is +specific to TCGA-VCF. By adhering to these definitions, we can ensure that a +given field is interpreted the same way across all centers and that same +'Number', 'Type' and 'Description' values are used for these IDs. + +##### Table 4: INFO sub-field definitions + +| **Sub-field** | **Source** | **Formatted declaration** | +|---------------|------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| AA | VCF | ##INFO= | +| AC | VCF | ##INFO= | +| AF | VCF | ##INFO= | +| AN | VCF | ##INFO= | +| BQ | VCF | ##INFO= | +| CIGAR | VCF | ##INFO= | +| DB | VCF | ##INFO= | +| DP | VCF | ##INFO= | +| END | VCF | ##INFO= | +| H2 | VCF | ##INFO= | +| H3 | VCF | ##INFO= | +| MQ | VCF | ##INFO= | +| MQ0 | VCF | ##INFO= | +| NS | VCF | ##INFO= | +| SB | VCF | ##INFO= | +| SOMATIC | VCF | ##INFO= | +| VALIDATED | VCF | ##INFO= | +| 1000G | VCF | ##INFO= | +| IMPRECISE | VCF | ##INFO= | +| NOVEL | VCF | ##INFO= | +| SVTYPE | VCF | ##INFO= | +| SVLEN | VCF | ##INFO= | +| CIPOS | VCF | ##INFO= | +| CIEND | VCF | ##INFO= | +| HOMLEN | VCF | ##INFO= | +| HOMSEQ | VCF | ##INFO= | +| BKPTID | VCF | ##INFO= | +| MEINFO | VCF | ##INFO= | +| METRANS | VCF | ##INFO= | +| DGVID | VCF | ##INFO= | +| DBVARID | VCF | ##INFO= | +| DBRIPID | VCF | ##INFO= | +| MATEID | VCF | ##INFO= | +| PARID | VCF | ##INFO= | +| EVENT | VCF | ##INFO= | +| CILEN | VCF | ##INFO= | +| DPADJ | VCF | ##INFO= | +| CN | VCF | ##INFO= | 
+| CNADJ | VCF | ##INFO= | +| CICN | VCF | ##INFO= | +| CICNADJ | VCF | ##INFO= | +| VLS | TCGA-VCF | ##INFO= | +| SID | TCGA-VCF | ##INFO= | +| GENE | TCGA-VCF | ##INFO= | +| RGN | TCGA-VCF | ##INFO= | +| RE | TCGA-VCF | ##INFO= | +| VT | TCGA-VCF | ##INFO= | + +##### Table 5: FORMAT sub-field definitions + +| **Sub-field** | **Source** | **Formatted declaration** | +|---------------|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| GT | VCF | ##FORMAT= | +| DP | VCF | ##FORMAT= | +| FT | VCF | ##FORMAT= | +| GL | VCF | ##FORMAT= | +| PL | VCF | ##FORMAT= | +| GP | VCF | ##FORMAT= | +| GQ | VCF | ##FORMAT= | +| HQ | VCF | ##FORMAT= | +| CN | VCF | ##FORMAT= | +| CNQ | VCF | ##FORMAT= | +| CNL | VCF | ##FORMAT= | +| MQ | VCF | ##FORMAT= | +| HAP | VCF | ##FORMAT= | +| AHAP | VCF | ##FORMAT= | +| SS | TCGA-VCF | ##FORMAT= | +| TE | TCGA-VCF | ##FORMAT= | +| AD | TCGA-VCF | ##FORMAT= | +| DP4 | TCGA-VCF | ##FORMAT= | +| BQ | TCGA-VCF | ##FORMAT= | +| VAQ | TCGA-VCF | ##FORMAT= | + + + + + +### TCGA-specific meta-information + +#### PEDIGREE lines + +**Format**: *##PEDIGREE=* +**Required keys**: Name_0,..,Name_N where N \>= 1; + +PEDIGREE lines are used to specify derivation relationships between different +genomes. *Name_0* is associated with the derived genome and *Name_1* through +*Name_N* represent the genomes from which it is derived. In the case of tumor +clonal populations, one population is clonally derived from another. In the +example below, PRIMARY-TUMOR-GENOME is derived from GERMLINE-GENOME. 
+ + ##PEDIGREE=,Name_1=,...,Name_N=> + + where N is \>= 1; + + Example: + + ##PEDIGREE= + +#### SAMPLE lines + +**Format**: *##SAMPLE=* +**Required keys**: ID, SampleName, Individual, File, Platform, Source, Accession + +For UUID-compliant files, the following rules should be followed: + +**Required keys**: ID, SampleName, Individual, SampleUUID, SampleTCGABarcode, +File, Platform, Source, Accession + +- Value assigned to "SampleUUID" should be a valid [aliquot + UUID](https://docs.gdc.cancer.gov/Encyclopedia/pages/UUID/) in the database. + +- Value assigned to "SampleTCGABarcode" should represent the aliquot-level + metadata associated with SampleUUID. This metadata mapping is originally + received by the DCC from BCR. + +> Example: + +> ##SAMPLE=,Mixture=<0.1,0.9>,Genome_Description=<"Germline +> contamination","Tumor genome">> + + + +SAMPLE lines are used to include additional metadata about each sample for which +data is represented in the VCF file. All samples are listed in the column header +line following the FORMAT column (Figure 1). Each of these samples should have +its own HEADER declaration where the sample identifier in the column header +should be the same as the value assigned to "ID" key in the corresponding +declaration. Value assigned to "SampleName" should be a valid [aliquot +barcode](https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/) / [UUID](https://docs.gdc.cancer.gov/Encyclopedia/pages/UUID/) +in the database. The declaration lists information about the sample (source, +platform, source file, etc.) and can also be used to indicate if the sample is a +mixture of different kinds of genomes. In the example below, "Genomes", "Mixture" +and "Genome_Description" tags represent comma-separated list of different +genomes that a sample contains, proportion of each genome in the sample, and a +brief description of each genome respectively.
+ +##SAMPLE=,Mixture= +,Genome_Description=<"S1","S2",..,"SK">> + +Example: + +##SAMPLE=,Mixture=<0.1,0.9>,Genome_Description=<"Germline +contamination","Tumor genome">> + +- "Description" field for genome mixture has been renamed to + "Genome_Description" to distinguish it from sample description. + +- Values for tags related to genome mixture (Genomes, Mixture, + Genome_Description) are within angle brackets. + +### Column header meta-information + +**Format**: Tab-delimited line starting with "#" and containing headers for all +columns in the BODY as shown below. + +This is a mandatory header line where the first 8 fields are fixed and have to +be defined in the column header. "FORMAT" onwards are optional and are included to +encapsulate per-sample/genome genotype data. + +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ... + +BODY +### Variant records + +Data lines are tab-delimited and list information about individual variants and +associated genotypes across samples. The first 8 fields (Figure 1) are required +to be listed in the VCF column header line. Some of these fields require +non-null values (see Table 6) for each record. For the remaining fixed fields, +even if the field does not have an associated value, it still needs to be +specified with a missing value identifier ("." in VCF 4.1). Subsequent fields +are optional. + +**Table 6: Description of fields in the BODY of a VCF file** + +| **Index** | **Field** | **Case-sensitive** | **Description** | **Data type** | **Sample values** | **Required\*** | **Additional notes** | +| --------- | ---------- | ------------------ | --------------- | ------------- | --------------------- | -------------- | -------------------- | +| 1 | CHROM | Yes | *Chromosome*: an identifier from the reference genome or the assembly file defined in the HEADER.
| Alphanumeric string | 20 | Yes | Chromosome name should not contain "chr" prefix, e.g., "chr10" will be an invalid entry | +| | | | | *([1-22], X, Y, MT, )* | | | | +| 2 | POS | Yes | *Position*: The reference position, with the 1st base having position 1. | Non-negative integer | 1110696 | Yes | \--- | +| 3 | ID | Yes | *Identifier*: Semi-colon separated list of unique identifiers if available. | String, no white-space or semi-colons | rs6054257_66370 | No | **Important**: When using an rsID as the variant identifier, please append chromosomal location of the variant to the ID. For example, if the variant is at chr7:6013153 and the corresponding rsID is rs10000, then the variant ID should be rs10000_6013153. This is to ensure that there is a consistent rule for satisfying the condition for unique IDs even if a file contains single rsID that maps to multiple variants. | +| 4 | REF | Yes | *Reference allele(s)*: Reference allele at the position. | String | GTCT | Yes | Value in POS field refers to the position of the first base in the REF string. | +| | | | | *([ACGTN]+* ) | | | | +| 5 | ALT | Yes | *Alternate allele(s)*: Comma separated list of alternate non-reference alleles called on at least one of the samples. Angle-bracketed ID String (\) can also be used for symbolically representing alternate alleles. | String; no whitespace, commas, or angle-brackets in the ID string | G,GTCT | No | if ALT==, ID needs to be defined in the header as | +| | | | | *([ACGTN]+, , .)* | . | | ##ALT= | +| | | | | | | | | +| 6 | QUAL | Yes | *Quality score*: Phred-scaled quality score for the assertion made in ALT. | Integer \>= 0 | 50 | No | Scores should be non-negative integers or missing values | +| 7 | FILTER | Yes | *Filtering results*: PASS if this position has passed all filters, Otherwise, if the site has not passed all filters, a semicolon-separated list of codes for filters that fail. 
| String, no whitespace or semi-colon | PASS | No | "0" is reserved and cannot be used as a filter String. | +| | | | | | q10;s50 | | | +| 8 | INFO | Yes | *Additional information*: INFO fields are encoded as a semicolon-separated series of keys (same as ID in an INFO declaration) with optional values in the format **. | String, no whitespace, semi-colons, or equal-signs | NS=3;DP=14; | No | \--- | +| 9 | FORMAT | Yes | *Genotype sub-fields*: If genotype data is present in the file, the fixed fields are followed by a FORMAT column. The field contains a colon-separated list of all pre-defined FORMAT sub-fields (same as ID in a FORMAT declaration) that are applicable to all samples that follow. | String, no whitespace, sub-fields cannot contain colon | GT:GQ:DP:HQ | No | "GT" must be the first sub-field if it is present in the FORMAT field. | +| 10 | | Case should be same as in "ID" tag of \#\#SAMPLE declaration in the header | *Per-sample genotype information*: An arbitrary number of sample IDs can be added to the column header line and a variant record in the BODY can contain genotype information corresponding to FORMAT column for each sample. Contains a colon-separated list of values assigned to each of the sub-fields in FORMAT column. | String, no whitespace, sub-fields cannot contain colon | 0\|0:48:1:51,51 | No | Values are assigned to FORMAT sub-fields in the SAME order as specified in "FORMAT" column. All samples in any given row for a variant record MUST contain values for all sub-fields as defined in "FORMAT" column. If any of the fields does not have an associated value, then missing value identifier (".") should be used for that field. However, "." cannot be used as a value for any of the IDs in the FORMAT field (e.g., GT:.:DP would lead to an error). | + +* A "Required" field cannot contain missing value identifier for any record +listed in data lines. 
+ +Extensions for TCGA data +======================== + +TCGA data includes but is not limited to SNPs and small indels. A variant +representation format for cancer data should be able to support more complex +variation types such as structural variants, complex rearrangements and RNA-Seq +variants. The following sub-sections present an overview of the extensions that +have been added to clearly describe such variations in a VCF file. + +Structural variants +------------------- + +A [structural variant](http://www.ncbi.nlm.nih.gov/dbvar/content/overview/) (SV) +can be defined as a region of DNA that includes a variation in the structure of +the chromosome. Such variations could be due to inversions and balanced +translocations or genomic imbalances (insertions and deletions), also referred +to as copy number variants (CNVs). Certain features have been added to the +format in order to clearly describe structural variants in a VCF file. A +detailed description of the extensions is available +[here](http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41). + +Complex rearrangements +---------------------- + +Chromosomal rearrangements are caused by breakage of DNA double helices at two +different locations. The broken ends in turn rejoin to produce a new chromosomal +arrangement. Complex rearrangements involving more than two breaks are +frequently observed in cancer genomes. Certain modifications need to be made to +the VCF standard to adequately represent such variations in a VCF file. A +detailed specification of the proposed extensions to describe rearrangements in +a VCF file is available +[here](http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41). +Figure 2 illustrates some of the concepts relevant to VCF records for complex +rearrangements.
+ +**Figure 2: Adjacencies and breakends in a chromosomal rearrangement** (adapted +from VCF 4.1 specification) + +| ![media](images/ccr_VCF.png) | +| ------------------------------------------ | + + + +A VCF file has one line for each of the two breakends in an adjacency. Table 7 +provides a list of sub-fields that have been added to describe breakends. An +INFO sub-field (**SVTYPE=BND**) is used to indicate a breakend record. +Sub-fields MATEID and PARID are used to represent variant record IDs of +corresponding mates and partners respectively. + +**Table 7: Fields added for breakends** + +| **Field:Sub-field** | **Description** | **Declaration in HEADER** | **Required** | **(Sample values in BODY)** | +| ------------------- | --------------- | ------------------------- | ------------ | --------------------------- | +| INFO:**SVTYPE** | Type of structural variant; SVTYPE is set to "BND" for breakend records | ##INFO= | Yes | +| | | *SVTYPE=BND* | (SVTYPE=BND for breakend records) | +| INFO:**MATEID** | ID of corresponding mate of the breakend record | ##INFO= | No | +| | | *MATEID=bnd_U* | | +| INFO:**PARID** | ID of corresponding partner of the breakend record | ##INFO= | No | +| | | *PARID=bnd_V* | | +| INFO:**EVENT** | ID of event associated to breakend | ##INFO= | No | +| | | *EVENT=RR0* | | + +The specification for ALT field deviates from the standard format for breakend +records. ALT field for a breakend record can be represented in four possible +ways based on the type of replacement. 
+ +REF ALT Description + +s t[p[ piece extending to the right of p is joined after t + +s t]p] reverse comp piece extending left of p is joined after t + +s ]p]t piece extending to the left of p is joined before t + +s [p[t reverse comp piece extending right of p is joined before t + +Legend: + +s: sequence of REF bases beginning at position POS + +t: sequence of bases that replaces "s" + +p: position of the breakend mate indicating the first mapped base that joins at +the adjacency; represented as a string of the form "chr:pos" + +[]: square brackets indicate direction that the joined sequence continues in, +starting from p + +RNA-Seq variants +---------------- + +VCF specifications have been extended to address expressed variants obtained +from RNA-Seq. Features added for structural variants from genome/exome +sequencing are applicable to RNA-Seq structural variants. However, RNA-Seq +breakends are represented by setting **SVTYPE=FND** instead of BND (Table 8) +since they can be different from those observed in DNA-Seq. + +**Table 8: Fields added for RNA-Seq variants** + +| **Field:Sub-field** | **Description** | **Declaration in HEADER** | **Required** | +| ------------------- | --------------- | ------------------------- | ------------ | +| INFO:**SVTYPE** | Type of structural variant; SVTYPE is set to "FND" for breakends associated with RNA-Seq | ##INFO= | Yes | +| | | *SVTYPE=FND* | (required for RNA-Seq breakend records; SVTYPE=FND) | + +VCF files for RNA-Seq variants may include gene-related annotations. However, +this is not a standard feature of VCF files as eventually all VCF variants will +be annotated using information in Generic Annotation File (GAF). Additional INFO +and FORMAT sub-fields have been included to describe the characteristics of +expressed nucleotide variants (Table 8a). 
+ +**Table 8a: Annotation fields added for RNA-Seq variants** + +| **Field:Sub-field** | **Description** | **Declaration in HEADER** | **Required** | +| ------------------- | --------------- | ------------------------- | ------------ | +| INFO:**SID** | Unique identifiers from the gene annotation source as specified in ##geneAnno; "unknown" should be used if identifier is not known; comma-separated list of IDs can be used if variant overlaps with multiple features | ##INFO= | No | +| | | *SID=13,198* | | +| INFO:**GENE** | HUGO gene symbol; "unknown" should be used when gene symbol is unknown; comma-separated list of genes can be used if variant overlaps with multiple transcripts/genes | ##INFO= | No | +| | | *GENE=ERBB2,ERBB2* | | +| INFO:**RGN** | Region where a nucleotide variant occurs in relation to a gene | ##INFO= | No | +| | | *RGN=exon,3_utr* | | +| INFO:**RE** | Flag to indicate if position is known to have RNA-edits occur | ##INFO= | No | +| | | *RE* | | +| FORMAT:**TE** | Translational effect of a nucleotide variant in a codon | ##FORMAT= | | +| | | *MIS,NA* | | + +Including validation status in VCF file +--------------------------------------- + +Somatic variations are often validated using follow-up experiments to confirm +the variant is not due to sequencing errors. Following points need to be +considered while including validation status in VCF file: + +- A single VCF file will contain sequence data for a single case. The file + could be the result of merging calls from different centers so validation + can be performed on a set of variants reported in a merged VCF file. + +- Validation with secondary technology is performed after obtaining results + from primary sequencing method. Therefore, validation is a confirmation step + and may or may not be performed before a first-pass VCF file with all + candidate mutations is generated and submitted to the DCC. 
+ +- A single mutation can be verified with multiple independent methods and the + results may or may not be in agreement. + +- If results from different methods are in conflict, the final validation + status of the variant call needs to be inferred based on available + information. This could be done manually or programmatically. + +**Format validation** + +Since validation data is added as additional genotype/sample columns, the file +will pass validation as long as all existing format rules are followed and +header declarations are correct. + +**Sample TCGA VCF file with validation status** + +Line1 ##fileformat=VCFv4.1 + +Line2 ##tcgaversion=1.1 + +Line3 ##fileDate=20120205 + +Line4 ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta + +Line5 ##FORMAT= + +Line6 ##FORMAT= + +Line7 ##FORMAT= + +Line8 ##INFO= + +Line9 ##FILTER= + +Line10 +##SAMPLE= + +Line11 +##SAMPLE= + +Line12 +##SAMPLE= + +Line13 +##SAMPLE= + +Line14 +##SAMPLE= + +Line15 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR NORMAL_454 +TUMOR_454 TUMOR_Sanger + +Line16 20 14370 var1 G A 29 PASS VLS=2 GT:GQ:SS 0/0:48:. 0/1:50:2 0/0:20:. +0/1:20:2 0/1:.:2 + +Line17 5 15000 var2 T C 35 PASS VLS=1 GT:GQ:SS 0/1:48:. 1/1:51:3 0/1:60:. +0/1:50:1 0/1:13:1 + +Line18 3 170089 var2 G T 30 PASS . GT:GQ:SS 0/1:48:. 0/1:51:1 .:.:. .:.:. .:.:. + +The format follows these guidelines: + +1. **Sample columns** + + - An additional column is included for every line of evidence used for + validation. In the example above, tumor calls are verified with 454 and + Sanger sequencing and normal calls are validated with 454. Therefore, 3 + genotype columns exist in addition to the NORMAL and TUMOR sequencing + calls obtained with the primary sequencing method. + + - The validation platform name is appended to the original sample to + distinguish the validation results from primary sequencing. + _ is used in the example above.
+ + - **Note**: \ can be obtained from DCC [Code Tables + Report](http://tcga-data.nci.nih.gov/datareports/codeTablesReport.htm). + The ##SAMPLE meta-information line also includes a 'Platform' tag + where platform name is defined. + + - Each new genotype column header added to the file (e.g., TUMOR_454, + TUMOR_Sanger) has to be defined in the header using the ##SAMPLE + meta-information line (e.g., Lines 13 and 14). + + - As per VCF specification, the order of FORMAT sub-fields is defined by + the FORMAT column and all calls from primary and validation sequencing + should comply with this order. + + - If a sub-field does not apply to any given validation call, it should be + assigned a missing value ("."). + +2. **FORMAT sub-field "SS"** + + - For any given tumor genotype call, sub-field SS indicates variant status + with respect to non-adjacent normal counterpart (0, 1, 2, 3, 4 or 5 + based on whether the variant is wildtype, germline, somatic, LOH, + post-transcriptional modification, or unknown respectively). Therefore, + each tumor genotype call (primary and secondary sequencing) will have + its own corresponding SS sub-field. + +3. **INFO sub-field "VLS"** + + - Sub-field VLS represents an inferred decision for a tumor genotype call + and is based on the calls obtained with validation. In the example + above, var1 shows a somatic call (SS=2) for the tumor sample based on + primary sequencing, and both validation methods confirm this call. + Therefore, the final validation status of var1 is a somatic variation + (VLS=2). However, var2 has a LOH variant in tumor sample (SS=3) based on + primary sequencing whereas both validation methods indicate that it is a + germline variant (SS=1). In such a case, "VLS" has to be inferred from + available information and could differ from the SS value assigned to the + tumor sample based on primary sequencing. + +Validation rules +================ + +At the minimum, every file needs to go through the checks listed below. 
+Following is an example of a VCF file that shows certain violations cited in the +listed validation steps. Please note that line numbers in the file segment below +are added for illustration purposes alone and are not expected to be found in an +actual VCF file. + +Line1 ##fileformat=VCFv4.1 + +Line2 ##fileDate=20090805 + +Line3 ##source=myImputationProgramV3.1 + +Line4 ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta + +Line5 ##INFO= + +Line6 ##INFO= + +Line7 ##FORMAT= + +Line8 ##FORMAT= + +Line9 ##FORMAT= + +Line10 ##FORMAT= + +Line11 ##FILTER= + +Line12 ##FILTER= + +Line13 FILTER= + +Line14 ##ALT= + +Line15 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TCGA-02-0001-01 +TCGA-02-0001-02 + +Line16 20 14370 var1 G A 29 q10 NS=2;DP=14 GT:GQ:DP 0|0:48 0|1:48:3 + +Line17 19 15000 var2 G A 35 q10;s50 NS=2.5 GQ:GT 48:0|0 51:0|1 + +Line18 19 16000 var3 C T 30 q10;s10 NS=2 GT:GQ:DP 0/2:48:3 0/1:51:4 + +Line19 2 14477 rs123 C \ 12 PASS NS=3;DB GT:GQ 0/1:50 1/1:40 + +Line20 9 13567 . A \ 20 PASS NS=3 GT:GQ:PL 0/1:49:42,3 1/1:38:96,47/70 + +Line21 3 18901 rs456 T C 15 PASS NS=3/DB GT 0/1 1/1 + +**Important**: A file will be validated as a TCGA VCF file only if it contains +##tcgaversion HEADER line (e.g., ##tcgaversion=1.1). The current acceptable +version is 1.1. + +1. Mandatory [header lines](#TCGAVariantCallFormat(VCF)1.1Specificat) should be + present. + +2. All meta-information header lines should be prefixed with "\#\#". + +3. [Column header](#TCGAVariantCallFormat(VCF)1.1Specificat) line should be + prefixed with "\#". A VCF file can contain only a single column header line + that must contain all required field names. + +4. Any line lacking the "##" or "#" prefix will be assumed to be a BODY data + line and will have to follow the specified format. For example, Line13 leads + to a violation as it lacks "##" or "#" but is not a tab-delimited row + containing variant information. + +5. 
HEADER lines cannot be present within the BODY of a file and vice-versa. + +6. [INFO](#TCGAVariantCallFormat(VCF)1.1Specificat), + [FORMAT](#TCGAVariantCallFormat(VCF)1.1Specificat) and + [FILTER](#TCGAVariantCallFormat(VCF)1.1Specificat) declarations should follow + the format below where all keys are required but the order of keys is + irrelevant. + +7. ##INFO= + +8. ##FORMAT= + +9. ##FILTER= + +10. Values assigned to *ID, Number, Type* and *Description* in INFO, FORMAT or + FILTER declarations should follow the rules listed below. A detailed + description of the declaration format is provided + [here](#TCGAVariantCallFormat(VCF)1.1Specificat). + + 1. If an INFO or FORMAT sub-field exists in Table 4 or 5 respectively (i.e. + ID of the sub-field matches value in "Sub-field" column of the table) + then *ID*, *Number*, *Type* and *Description* values for that sub-field + declaration must match the corresponding value in "Formatted + declaration" column of the table for that sub-field. (TCGA VCF 1.1) + + 2. ID, Number, Type !\~ /(\\s\|,\|=\|;)/ + + 3. *Number* is in {Integer\>=0, "A", "G", "."} + + 4. *Type* is in {Integer, String, Float, Flag, Character} + + 5. *Description* should be within double quotes and cannot itself contain a + double quote + + 6. *Description* string cannot contain leading or trailing whitespace after + opening or before closing quotation marks; Line10 shows a violation as + *Description* string contains leading and trailing whitespace. + + 7. For FORMAT declarations, Type != "Flag" + +11. Any INFO, FORMAT or FILTER sub-fields used in the BODY are required to be + defined in the HEADER. For example, var1 (Line16) shows an example of a + violation as read depth "DP" is assigned a value (DP=14) without being + defined as an INFO sub-field in the HEADER. + +12. Validation of **INFO** sub-fields: + + 1.
An INFO sub-field should be included for a variant record in the BODY as + *key=value* (e.g., NS=2) where *key* is the "ID" value of the + sub-field in the HEADER declaration. + + - **Exception**: An INFO field of "Flag" *Type* will not be assigned a + value in the BODY. The presence of a flag in INFO column merely + indicates that the variant record satisfies a condition associated + with the flag. For example, Line19 has a "DB" flag without a value + entry in the INFO column. "DB" in the INFO column indicates that the + variant exists in dbSNP. + + 2. Multiple INFO sub-fields can be associated with a single variant record + using ";" as a separator (e.g., Line16). Line21 has a violation as "/" + is used as a separator in INFO column. + + 3. If INFO field "VLS" is defined for a record, its value can only be 0, 1, + 2, 3, 4, or 5 based on whether the mutation is wildtype, germline, + somatic, LOH, post-transcriptional modification, or unknown. + +13. Validation of **FORMAT** sub-fields: + + 1. FORMAT column for a variant record contains a colon-separated list of + all pre-defined FORMAT sub-fields (identified by "ID" value in the + HEADER declaration) that are applicable to all samples that follow. A + ":" is the only valid separator for sub-fields. + + 2. Number of colon-separated sub-fields in FORMAT column should equal the + number of colon-separated values assigned to each sample. For example, + var1 (Line16) violates this rule for the sample TCGA-02-0001-01 as there + are 3 sub-fields in FORMAT column but only 2 values in the sample + column. + + 3. Following FORMAT fields are required for all variant records in a VCF + file. Missing value (".") is allowed for these fields. + + - Genotype (**GT**) + + - Read depth (**DP**) + + - Reads supporting ALT (**AD** or **DP4**) + + - Average base quality for reads supporting alleles (**BQ**) + + - Somatic status of the variant (**SS**).
SS can be 0, 1, 2, 3, 4, or + 5 depending on whether relative to normal the variant is wildtype, + germline, somatic, LOH, post-transcriptional modification, or + unknown respectively + + 4. *GT* must be the first sub-field in the string FORMAT. For example, var2 + (Line17) violates this rule as GT is not the first sub-field even though + it is present in the FORMAT field. + + - GT is a required sub field for all variants. Missing value (".") is + allowed for GT. GT is not a required sub field and can be omitted + for a variant row if none of the samples have genotype calls + available (TCGA VCF 1.1) + + - *GT* represents the genotype, encoded as allele values separated by + either of / (genotype unphased) or \| (genotype phased). The + allele values are 0 for the reference allele (in REF field), 1 for + the first allele listed in ALT, 2 for the second allele listed in ALT + and so on. Examples: 0/1, 1\|0, or 1/2, etc. + + - *GT* is assigned only one allele value for haploid calls (e.g. on Y + chromosome). Therefore, if CHROM=="Y" then *GT* should have only one + allele value assigned to it (e.g., "1", "0", ".", etc.) instead of + two alleles (e.g., "1/1", "0\|0"). If CHROM=="MT" then there is no + constraint on the number of alleles as long as the number is bounded + within the alleles listed in REF and/or ALT (e.g., 0/1, 0/1/2, 1 are + all valid values for MT if REF and ALT have one and two allele + values respectively). + + - All samples should have values assigned to *GT* for any given + variant. If an allele cannot be called for a sample at a given + locus, . will be specified for each missing allele in the *GT* + field (for example "./." for a diploid genotype and "." for haploid + genotype). + + - Validation should include ensuring that allele number in *GT* is + within the range of alleles specified in ALT and REF.
For example, + var3 (Line18) violates this rule as it lists GT as "0/2" for sample + TCGA-02-0001-01 but ALT contains only one allele so the only + acceptable allele numbers are 0 (REF) and 1 (ALT). + +14. If an INFO or FORMAT sub-field is declared in the header AND is assigned a + value for a variant record in the body, the data type should be consistent + with the expected type defined in the *Type* key of the corresponding + declaration. For example, var2 (Line17) violates this rule as the definition + for "NS" INFO sub-field states the data type is integer whereas the variant + record contains a float value (2.5) assigned to the sub-field. + + 1. **Exception**: The rule does not apply if *Type* of a field is not + defined or is incorrectly defined (e.g., field not declared in HEADER, + *Type* not included in declaration, incorrect value for *Type*). It also + does not apply to any missing values (denoted with ".") in the record as + they do not have an associated data type. + +15. Multiple comma-separated values (corresponding to value assigned to *Number* + key in declaration) can be specified for an INFO or FORMAT sub-field for a + variant record. No other character can be used as separator. Line20 shows a + violation as a "/" is used as separator between 2nd and 3rd values for + *"PL"* FORMAT sub-field in the second sample column. + +16. If *Number* tag is assigned a known bounded value (an integer, "A", "G") for + an INFO/FORMAT sub-field, it should be consistent with number of values + specified for any variant record in BODY of file. For example, Line20 shows + a violation as *"PL"* is associated with 3 integer values (Line10) but the + variant record has only 2 comma-separated integer values (42,3) for + TCGA-02-0001-01. + +17. Validation of **FILTER** sub-fields: + + 1. Valid values for FILTER column are "PASS" or a code for the filter that + the variant call fails (e.g., "q10" in Line16).
The code must correspond + to the "ID" value of the corresponding FILTER declaration. + + 2. If a call fails multiple filters, FILTER column should contain + semicolon-separated list of all failed filter codes (e.g., "q10;s50" in + Line17). A ";" is the only valid separator. + + 3. All codes listed in the FILTER column must have a well-formed + declaration in the HEADER. Line18 shows a violation as "q10" does not + have an associated definition in the HEADER. + +18. \ Validation of + [SAMPLE](#TCGAVariantCallFormat(VCF)1.1Specificat) meta-information lines: + + 1. Each sample ID in the column header (immediately after FORMAT column) + must have an associated HEADER declaration where value assigned to "ID" + tag in the declaration is the same as sample ID used in the column name. + + 2. Declaration must contain all required fields. + + 3. Genome mixture tags (Genomes, Mixture, Genome_Description) are enclosed + within angle brackets (<>) and can have multiple comma-separated + values. + + 4. If more than one of the genome mixture tags (Genomes, Mixture, + Genome_Description) are defined in a SAMPLE meta-information line, then + number of comma-separated values should be the same for all defined + tags. For example, "Genomes=,Mixture=<0.1,0.8,0.1>" would + lead to a violation as Mixture has 3 values while Genomes has only 2 + values. + + 5. Individual values in "Genomes" are strings without white-space, comma or + angle brackets. + + 6. Individual values in "Mixture" represent proportion (floating point + number >= 0 and <= 1) of each genome in the sample and all + comma-separated values should add up to a sum of 1. + + 7. Individual values in "Genome_Description" are strings surrounded by + double quotes where the string itself cannot contain a double quote. + + 8. 
The value assigned to "SampleName" must be a valid [aliquot + barcode](https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/) / [UUID](https://docs.gdc.cancer.gov/Encyclopedia/pages/UUID/) + in the database (TCGA VCF 1.1). + +19. Validation of + [PEDIGREE](#TCGAVariantCallFormat(VCF)1.1Specificat) meta-information lines: + + 1. Declaration line should follow the format: + + 2. ##PEDIGREE= + +> where: + +- N \>= 1 + +- Name_0 through Name_N are arbitrary (not literal) strings that cannot + contain white-space, comma, or angle brackets (TCGA VCF 1.1) + +- G0-ID through GN-ID are strings that cannot contain white-space, comma, or + angle brackets. Each of these should be a header for the genotype columns + immediately after FORMAT column and should be defined using "ID" tag in the + corresponding ##SAMPLE meta-information line. (TCGA VCF 1.1) + +- The keys and values used in the should be unique across + assignments in any given PEDIGREE declaration. + +1. Validation of custom meta-information fields: + + 1. If a user-created custom meta-information declaration is encountered and + the corresponding key/value structure and content have not been defined + in this specification, the line should be validated to ensure it follows + one of the following two formats: + + 2. ##key=value + + 3. Example: + + 4. ## + + 5. OR + + 6. ##FIELDTYPE= + + 7. Example: + + 8. ##contig= + +> where: + +- key !\~ /(\\s\|,\|=\|;)/ + +- value !\~ /(\\s\|,\|=\|;)/ UNLESS *value* is within double quotes, in which + case it cannot itself contain a double quote or leading/trailing whitespace + OR if *value* is within angle brackets. + +1. *CHROM*, *POS*, and *REF* are required fields and cannot contain missing + value identifiers. Please refer to [Table + 6](#TCGAVariantCallFormat(VCF)1.1Specificat) for acceptable values. + + 1. *CHROM* is in {[1-22], X, Y, MT,} where chr_ID + cannot contain whitespace or <> + + 2. 
If CHROM == \<chr_ID\> then the VCF file MUST have a declaration for + assembly file in the HEADER. Please note that values assigned to the + field are currently not being validated. + + 3. ##assembly=url or filename + + 4. Example: + + 5. ##assembly=ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/sv/breakpoint_assemblies.fasta + + 6. ##assembly=breakpoint_assemblies.fasta + + 7. *POS* is a non-negative integer + + 8. *REF* =\~ /[ACGTN]+/ + +2. *ALT* is in {[ACGTN]+, ".", , **SV_ALT**}; + + 1. String SV_ALT can be in one of the following four formats and can be + used in the *ALT* field ONLY when the corresponding INFO field has the + key-value pair "SVTYPE=BND" or "SVTYPE=FND". + + 2. Format Example + + 3. seq[chr:pos[ G[17:198982[ + + 4. seq]chr:pos] GC]1:238909] + + 5. ]chr:pos]seq ]\:235788]GCNA + + 6. [chr:pos[seq [1:2812734[ACT + +> where: + +- *seq* is in {[ACGTN]+, "."} + +- *chr* is in {\<chr_ID\>, [1-22], X, Y, MT} where *chr_ID* is a string + +- *pos* is a non-negative integer + +1. Similar to 18b, if chr == \<chr_ID\> (where *chr_ID* is a string) then the + VCF file must have an ##assembly declaration in the HEADER. + +2. If *ALT* is assigned a value in format, (e.g., rs123 in Line19), + should be defined in the HEADER as + ##ALT= (Line14) where ID cannot + contain white-space or angle brackets. Line20 shows a violation of this rule + as *ALT==* but there is no corresponding *ALT* declaration in the + HEADER with . + +3. ALT can contain multiple comma-separated values. No other character can be + used as a separator. + +4. No two records are allowed to have the same *ID* value. Two records can, + however, have the same *CHROM* and *POS* values. + + 1. **Exception**: Multiple records in a file are allowed to have the same + missing value identifier (".") as *ID*. + +5. *QUAL* field can only contain non-negative integers or "." (missing value). + +6. If INFO sub-field "VT" is declared and used in the BODY, its + value can only be in {SNP, INS, DEL} + +7.
If FORMAT sub-field "SS" is declared and used in the BODY, its + value can be 0, 1, 2, 3, 4 or 5 depending on whether relative to normal the + variant is wildtype, germline, somatic, LOH, post-transcriptional + modification, or unknown respectively. + +8. "DP" sub-field for read depth can be defined in INFO (combined + depth across all samples) or FORMAT (depth in a specific sample) field. If + both INFO and FORMAT have values for the sub-field, then sum of DP values + across all FORMAT sample columns should be equal to DP value in the INFO + field. + +9. Validation of **complex rearrangement** records: + + 1. If INFO field includes key-value pairs "SVTYPE=BND" or "SVTYPE=FND" and + has values for "MATEID" and/or "PARID", then the value (or multiple + comma-separated values) assigned to MATEID or PARID should exist in the + file as "ID" field for another variant record. + +10. Validation of RNA-Seq **annotation fields**: + + 1. If INFO field includes "SID", "GENE" or "RGN" keys with associated + values, then file MUST contain a declaration for ##geneAnno in the + HEADER. + + 2. Number of comma-separated values in the optional INFO sub-fields "SID", + "GENE" and "RGN" and the FORMAT sub-field "TE" must be the same if more + than one of these sub-fields are defined for a record. + + 3. INFO sub-field "RGN" is in {5_utr, 3_utr, exon, intron, ncds, sp}. + + 4. FORMAT sub-field "TE" is in {SIL, MIS, NSNS, NSTP, FSH, NA} + + 5. If "RGN" and "TE" have the same number of comma-separated values, then + "RGN" must be "exon" for "TE" to have any value other than "NA". For + example, if "RGN=exon,intron,intron" then having "MIS,SIL,NA" for TE + would lead to a violation as the 2nd value for RGN is "intron" but the + corresponding TE value is "SIL" instead of "NA". + +11. 
Validation of + [vcfProcessLog](#TCGAVariantCallFormat(VCF)1.1Specificat) tags: + +##vcfProcessLog=,InputVCFSource=,InputVCFVer=<1.0>,InputVCFParam=,InputVCFgeneAnno=> + +OR + +##vcfProcessLog=,InputVCFSource=,InputVCFVer=<1.0,2.1,2.0>, + +InputVCFParam=,InputVCFgeneAnno=, + +MergeSoftware=,MergeParam=,MergeVer=<2.1,3.0>,MergeContact=> + +1. Individual values for each tag are enclosed within angle brackets (<>) + instead of double quotes. + + 1. If a field contains multiple values, they are separated by comma. + **Exception**: Separator for multiple values in *InputVCFParam* and + *MergeParam* is a ";" instead of ",". Individual values within these + tags can contain comma-separated parameters (e.g., . + +**Table 9: Test files known to pass/fail validation steps** + +| **Expected result** | **Validation** | **Test file** | +|---------------------|--------------------------------------------------------------|----------------------------------------------------------------| +| Success | file with no failures | /vcfFormat/TCGA-24-0980_IlluminaGA-DNASeq_exome.format2.vcf | +| Failure | "chr" prefix for chromosome names | /BI/20110324/BCM-GBM_solid.TCGA-06-0208.mut.vcf | +| Failure | column header line has no "\#" as beginning | /genome.wustl.edu/20110420/TCGA-06-0145-01A-01W-0224-08.vcf.gz | +| Failure | double quotes missing from SAMPLE metadata lines Description | /BCM/TCGA-06-0145_IlluminaGA-DNASeq_exome.vcf | +| Failure | scores in QUAL column are negative integers | /UCSC/20110420/TCGA-13-0723_W_capture.vcf | + +- missing values for FORMAT fields + +- trailing whitespace at the end of description strings in metadata + +- filter not listed in FILTER metadata line + +- in multi-allelic sites, different alternative alleles in the ALT field are + separated by "/" instead of "," diff --git a/docs/Encyclopedia/pages/Variant_Call_Format.md b/docs/Encyclopedia/pages/Variant_Call_Format.md index 26c2b6b33..151c26bba 100644 --- 
a/docs/Encyclopedia/pages/Variant_Call_Format.md +++ b/docs/Encyclopedia/pages/Variant_Call_Format.md @@ -18,7 +18,7 @@ Details about the structure of the VCF is available in the VCF 4.1 Specification ## References ## 1. [VCF 4.1 Specification](https://samtools.github.io/hts-specs/VCFv4.1.pdf) -2. [GDC VCF Format](https://gdc-docs.nci.nih.gov/Data/File_Formats/VCF_Format/) +2. [GDC VCF Format](https://docs.gdc.cancer.gov/Data/File_Formats/VCF_Format/) ## External Links ## * [VCF 4.1 Specification](https://samtools.github.io/hts-specs/VCFv4.1.pdf) diff --git a/docs/index.md b/docs/index.md index 127c60055..88191c0b1 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,18 +1 @@ -# Welcome - -This is the official project documentation site of the NCI [Genomic Data Commons (GDC)](https://gdc.cancer.gov). - -## Contents - -The GDC documentation is divided into the following top-level sections: - -* GDC API: [PDF](API/PDF/API_UG.pdf) - [HTML](API/Users_Guide/Getting_Started.md) -* GDC Data Portal: [PDF](Data_Portal/PDF/Data_Portal_UG.pdf) - [HTML](Data_Portal/Users_Guide/Getting_Started.md) -* GDC Data Submission Portal: [PDF](Data_Submission_Portal/PDF/Data_Submission_Portal_UG.pdf) - [HTML](Data_Submission_Portal/Users_Guide/Getting_Started.md) -* GDC Data Transfer Tool: [PDF](Data_Transfer_Tool/PDF/Data_Transfer_Tool_UG.pdf) - [HTML](Data_Transfer_Tool/Users_Guide/Getting_Started.md) -* GDC Dictionary: [HTML](Data_Dictionary/index.md) -* GDC Data: [PDF](Data/PDF/Data_UG.pdf) - [HTML](Data/File_Formats/VCF_Format.md) - -## Learn More - -To learn more about GDC, please visit our official [GDC Website](https://gdc.cancer.gov). 
+# Homepage \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 7d2f7ab26..89d37b4f5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -2,6 +2,8 @@ copyright: '© 2015-2016' extra: dictionary_app_root: /apps/dictionary dictionary_viewer_page_id: Viewer + gdcmvs_app_root: /apps/gdcmvs + gdcmvs_viewer_page_id: Search encyclopedia_entries_page_title: EncyclopediaEntries encyclopedia_page_title: Encyclopedia nav_exclude: @@ -28,6 +30,7 @@ pages: - BAM Slicing: API/Users_Guide/BAM_Slicing.md - Submission: API/Users_Guide/Submission.md - Python Examples: API/Users_Guide/Python_Examples.md + - GraphQL Examples: API/Users_Guide/GraphQL_Examples.md - System Information: API/Users_Guide/System_Information.md - Additional Examples: API/Users_Guide/Additional_Examples.md - 'Appendix A: Available Fields': API/Users_Guide/Appendix_A_Available_Fields.md @@ -39,46 +42,35 @@ pages: - Getting Started: Data_Portal/Users_Guide/Getting_Started.md - Projects: Data_Portal/Users_Guide/Projects.md - Exploration: Data_Portal/Users_Guide/Exploration.md + - Analysis: Data_Portal/Users_Guide/Custom_Set_Analysis.md - Repository: Data_Portal/Users_Guide/Repository.md - - Genes and Mutations: Data_Portal/Users_Guide/Genes_and_Mutations.md - - Custom Set Analysis: Data_Portal/Users_Guide/Custom_Set_Analysis.md - - Annotations: Data_Portal/Users_Guide/Annotations.md - Advanced Search: Data_Portal/Users_Guide/Advanced_Search.md - - Authentication: Data_Portal/Users_Guide/Authentication.md - - Downloading Files: Data_Portal/Users_Guide/Cart.md - - File Cart: Data_Portal/Users_Guide/Cart.md - - Image Viewer: Data_Portal/Users_Guide/Image_viewer.md + - Cart and File Download: Data_Portal/Users_Guide/Cart.md - Legacy Archive: Data_Portal/Users_Guide/Legacy_Archive.md - Release Notes: Data_Portal/Release_Notes/Data_Portal_Release_Notes.md - fa-file-text Download PDF /Data_Portal/PDF/Data_Portal_UG.pdf: Data_Portal/PDF/index.md -- Data Submission Portal: - - Getting Started: 
Data_Submission_Portal/Users_Guide/Getting_Started.md - - Submission Workflow: Data_Submission_Portal/Users_Guide/Submission_Workflow.md - - Authentication: Data_Submission_Portal/Users_Guide/Authentication.md - - Homepage: Data_Submission_Portal/Users_Guide/Homepage.md - - Dashboard: Data_Submission_Portal/Users_Guide/Dashboard.md - - Upload Data: Data_Submission_Portal/Users_Guide/Data_Upload_UG.md - - Submit Data: Data_Submission_Portal/Users_Guide/Submit_Data.md - - Release Data: Data_Submission_Portal/Users_Guide/Release_Data.md - - Transactions: Data_Submission_Portal/Users_Guide/Transactions.md - - Browse Data: Data_Submission_Portal/Users_Guide/Browse_Data.md - - Pre-Release Data Review: Data_Submission_Portal/Users_Guide/Pre_Release_QC.md - - Best Practices: Data_Submission_Portal/Users_Guide/Best_Practices.md +- Data Submission: + - Before Submitting Data to the GDC Portal: Data_Submission_Portal/Users_Guide/Checklist.md + - Data Submission Overview: Data_Submission_Portal/Users_Guide/Data_Submission_Overview.md + - Data Submission Portal: Data_Submission_Portal/Users_Guide/Data_Submission_Process.md + - Data Upload Walkthrough: Data_Submission_Portal/Users_Guide/Data_Submission_Walkthrough.md + - Pre-Release Data Portal: Data_Submission_Portal/Users_Guide/Pre_Release_QC.md + - Submission Best Practices: Data_Submission_Portal/Users_Guide/Best_Practices.md - Release Notes: Data_Submission_Portal/Release_Notes/Data_Submission_Portal_Release_Notes.md - fa-file-text Download PDF /Data_Submission_Portal/PDF/Data_Submission_Portal_UG.pdf: Data_Submission_Portal/PDF/index.md - Data Transfer Tool: - Getting Started: Data_Transfer_Tool/Users_Guide/Getting_Started.md - - Accessing Built-in Help: Data_Transfer_Tool/Users_Guide/Accessing_Built-in_Help.md - Preparing for Data Download and Upload: Data_Transfer_Tool/Users_Guide/Preparing_for_Data_Download_and_Upload.md - - Data Download and Upload - Command Line: 
Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload.md - - Data Download - UI: Data_Transfer_Tool/Users_Guide/Data_Download_DTT_UI.md - - Key Terms: Data_Transfer_Tool/Users_Guide/Appendix_A_-_Key_Terms.md + - Data Transfer Tool Command Line Documentation: Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload.md - Release Notes - Command Line: Data_Transfer_Tool/Release_Notes/DTT_Release_Notes.md + - Data Transfer Tool UI Documentation: Data_Transfer_Tool/Users_Guide/Data_Download_DTT_UI.md - Release Notes - UI: Data_Transfer_Tool/Release_Notes/DTT_UI_Release_Notes.md + - Troubleshooting Guide: Data_Transfer_Tool/Users_Guide/Appendix_B_TroubleShooting.md - fa-file-text Download PDF /Data_Transfer_Tool/PDF/Data_Transfer_Tool_UG.pdf: Data_Transfer_Tool/PDF/index.md - Data Dictionary: - About: Data_Dictionary/index.md - Viewer: Data_Dictionary/viewer.md + - Search: Data_Dictionary/gdcmvs.md - Release Notes: Data_Dictionary/Release_Notes/Data_Dictionary_Release_Notes.md - Data: - Introduction: Data/Introduction.md @@ -91,14 +83,16 @@ pages: - 'Bioinformatics Pipeline: miRNA Analysis': Data/Bioinformatics_Pipelines/miRNA_Pipeline.md - 'Bioinformatics Pipeline: Copy Number Variation Analysis': Data/Bioinformatics_Pipelines/CNV_Pipeline.md - 'Bioinformatics Pipeline: Methylation Liftover Pipeline': Data/Bioinformatics_Pipelines/Methylation_LO_Pipeline.md + - Aligned Reads Summary Metrics: Data/Bioinformatics_Pipelines/Aligned_reads_summary_metrics.md - Release Notes: Data/Release_Notes/Data_Release_Notes.md - fa-file-text Download PDF /Data/PDF/Data_UG.pdf: Data/PDF/index.md - Encyclopedia: Encyclopedia/index.md - EncyclopediaEntries: - Affymetrix SNP 6.0: Encyclopedia/pages/Affymetrix_SNP_6.0.md + - Annotations: Encyclopedia/pages/Annotations.md + - Annotations TCGA: Encyclopedia/pages/Annotations_TCGA.md - Aggregated Somatic Mutation: Encyclopedia/pages/Aggregated_Somatic_Mutation.md - Aliquot: Encyclopedia/pages/Aliquot.md - - Annotations: 
Encyclopedia/pages/Introduction+to+Annotations.md - Biospecimen Data: Encyclopedia/pages/Biospecimen_Data.md - Case: Encyclopedia/pages/Case.md - Cancer Genomics Hub: Encyclopedia/pages/Cancer_Genomics_Hub.md @@ -126,6 +120,7 @@ pages: - Manifest File: Encyclopedia/pages/Manifest_File.md - MD5 Checksum: Encyclopedia/pages/MD5_Checksum.md - Mutation Annotation Format: Encyclopedia/pages/Mutation_Annotation_Format.md + - Mutation Annotation Format Legacy: Encyclopedia/pages/Mutation_Annotation_Format_TCGAv2.md - Redaction: Encyclopedia/pages/Redaction.md - Release Number: Encyclopedia/pages/Release_Number.md - REST API: Encyclopedia/pages/REST_API.md @@ -136,6 +131,7 @@ pages: - TCIA: Encyclopedia/pages/TCIA.md - UUID: Encyclopedia/pages/UUID.md - Variant Call Format: Encyclopedia/pages/Variant_Call_Format.md + - Variant Call Format Legacy: Encyclopedia/pages/Mutation_Annotation_Format_TCGAv2.md - Variant Type: Encyclopedia/pages/Variant_Type.md repo_url: https://github.com/NCI-GDC/gdc-docs site_name: GDC Docs diff --git a/requirements.txt b/requirements.txt index e1bd1df40..dc48490de 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,6 @@ mkdocs==0.15.1 +mkdocs-bootstrap==0.1.1 +mkdocs-bootswatch==0.4 BSCodeTabs +Markdown==2.6.11 +MarkupSafe==1.0 \ No newline at end of file diff --git a/theme/404.html b/theme/404.html index 73267ef57..4576d07d8 100644 --- a/theme/404.html +++ b/theme/404.html @@ -1,28 +1,52 @@ + +{# + 404 error page with optional static redirects to fix broken links. + Configure redirects in theme/config/redirects.json. + + Most of the page is copied from base.html and then modified to + remove all references to current_page since current_page doesn't + work in a 404 page, at least not with mkdocs 0.15. 
+#} + +{# Disable adding the / to all navs #} +{% set disableRootNavPrefixing = true %} + {% if page_description %}{% endif %} {% if site_author %}{% endif %} {% if canonical_url %}{% endif %} {% if favicon %} {% else %}{% endif %} - + + + Not Found - {{ site_name }} - Not Found - {{ site_name }} + + {%- for path in extra_css %} {%- endfor %} - + {% set encyclopedia_pages = [] %} + {% for page in config.pages %} + {% if config.extra.encyclopedia_entries_page_title in page %} + {% for page in page[config.extra.encyclopedia_entries_page_title] %} + {% if encyclopedia_pages.append(page) %} + {% endif %} + {% endfor %} + {% endif %} + {% endfor %} + + + + {% endif %} + + {% if current_page.title == "Home" %} + + {% endif %} +
    - {% if current_page.title != config.extra.dictionary_viewer_page_id %} + {% if current_page.title != config.extra.dictionary_viewer_page_id and current_page.title != config.extra.gdcmvs_viewer_page_id %}
    @@ -79,14 +94,18 @@ {% include "nav.html" %}
    -
    +
    + + {% if current_page.title == "Home" %} + {% include "homepage.html" %} + {% else %} {% if "/Encyclopedia" in current_page.abs_url %} {% include "encyclopedia/menu.html" %} - {% elif current_page.title != config.extra.dictionary_viewer_page_id and current_page.title != 'Home' %} + {% elif current_page.title != config.extra.dictionary_viewer_page_id and current_page.title != 'Home' and current_page.title != config.extra.gdcmvs_viewer_page_id %} {% include "nav-section-map.html" %} {% elif current_page.title == 'Home' %} {% if toc and ((toc | first) is defined) %} @@ -94,7 +113,7 @@ {% endif %} {% endif %} - {% if current_page.title == config.extra.dictionary_viewer_page_id%} + {% if current_page.title == config.extra.dictionary_viewer_page_id or current_page.title == config.extra.gdcmvs_viewer_page_id %}
    {% else %}
    @@ -107,18 +126,19 @@
    {% if next_page or previous_page %} {% endif %}
    + {% endif %}
    @@ -172,6 +192,16 @@ {% if current_page.title == config.extra.encyclopedia_page_title %} {% endif %} + {% if current_page.title == config.extra.gdcmvs_viewer_page_id %} + + + + + + {% endif %} + {% if current_page.title == "Home" %} + + {% endif %}
    - + \ No newline at end of file diff --git a/theme/config/redirects.json b/theme/config/redirects.json index 38b2dc5eb..e0e06235a 100644 --- a/theme/config/redirects.json +++ b/theme/config/redirects.json @@ -1,12 +1,8 @@ { - "defaultBaseFromURL": "http://gdc-docs.nci.nih.gov", - "defaultBaseToURL": "http://gdc-docs.nci.nih.gov", - "redirects": [ { - "from": "/", - "to": "/" + "from": "/Data_Portal/Users_Guide/Authentication", + "to": "/Data_Portal/Users_Guide/Cart/#gdc-authentication-tokens" } ] - } diff --git a/theme/css/bootstrap-custom.css b/theme/css/bootstrap-custom.css index d3b132614..9fde5db9d 100644 --- a/theme/css/bootstrap-custom.css +++ b/theme/css/bootstrap-custom.css @@ -2269,6 +2269,8 @@ th { width: 100%; max-width: 100%; margin-bottom: 21px; + table-layout: fixed; + word-wrap: break-word; } .table > thead > tr > th, .table > tbody > tr > th, diff --git a/theme/css/homepage.css b/theme/css/homepage.css new file mode 100644 index 000000000..37a989b11 --- /dev/null +++ b/theme/css/homepage.css @@ -0,0 +1,326 @@ +#body[data-current-page="Home"] { + max-width: 100%; + min-height: auto; +} + +.hp-hero { + background-color: #2a72a4; + padding: 28px 24px 38px; + margin-bottom: 40px; +} + +.hp-hero__site-title, .hp-hero__page-title, .hp-hero__subtitle { + color: #fff; + text-align: center; + margin: 0 auto; +} + +.hp-hero__site-title { + font-size: 20px; + line-height: 32px; + margin-bottom: 4px; +} + +.hp-hero__page-title { + font-size: 44px; + line-height: 38px; + margin-bottom: 16px; + font-weight: bold; +} + +.hp-hero__page-title .header-badge { + display: none; +} + +.hp-hero__page-title:after { + display: none; +} + + +.hp-hero__subtitle { + font-size: 16px; + line-height: 26px; + font-style: italic; + max-width: 28em; + margin-bottom: 20px; +} + +.hp-search { + max-width: 600px; + margin: 0 auto; + position: relative; +} + +.hp-search__wrapper-input { + display: flex; +} + +.hp-search__wrapper-results { + display: none; + position: absolute; + 
width: 100%; + background-color: #fff; + border: 1px solid #dedede; + border-radius: 0 0 5px 5px; + border-top-width: 0; +} + +.hp-search__label { + border: 1px solid #dedede; + border-right: 0; + background-color: #fff; + width: 50px; + height: 50px; + display: inline-block; + text-align: center; + border-radius: 5px 0 0 5px; + margin-bottom: 0; +} + +.hp-search.search-active .hp-search__label { + border-radius: 5px 0 0 0; +} + +.hp-search .fa { + color: #6b6262; + font-size: 25px; + line-height: 48px; +} + +.hp-search__cancel { + position: absolute; + right: 0; + top: 0; + padding: 0; + background: transparent; + border: none; + width: 50px; + height: 50px; + text-align: center; +} + +.hp-search__input { + display: inline-block; + border: none; + flex: 1; + font-size: 18px; + padding: 0 50px 0 10px; + border-radius: 0 5px 5px 0; + border: 1px solid #dedede; + border-left-width: 0; +} + +.hp-search.search-active .hp-search__input { + border-radius: 0 5px 0 0; +} + +.hp-search__body { + padding: 5px 10px; +} + +/* rewriting bootstrap rules in order to make the cards all the same height */ + +.hp-container { + margin-right: auto; + margin-left: auto; + padding-left: 15px; + padding-right: 15px; +} + +@media (min-width: 925px) { + .hp-container { + width: 750px; + } +} + +@media (min-width: 992px) { + .hp-container { + width: 970px; + } +} + +@media (min-width: 1200px) { + .hp-container { + width: 1170px; + } +} + +.hp-container .row { + display: flex; + flex-flow: row wrap; +} + +@media (max-width: 992px) { + .hp-container .col-md-4 { + width: 100%; + max-width: 630px; + margin-left: auto; + margin-right: auto; + } +} + +@media (min-width: 992px) { + .hp-container .col-md-4 { + padding-right: 10px; + padding-left: 10px; + } +} + +.hp-container .col-md-4 { + display: flex; + flex-flow: column; +} + +.hp-card { + display: flex; + align-items: stretch; + flex-direction: column; + border: 1px solid #dddddd; + background-color: #fff; + padding: 0 24px 24px; + height: 
100%; + width: 100%; + margin-bottom: 24px; + position: relative; + border-radius: 0 0 5px 5px; +} + +.hp-card__title { + color: #3a3a3a; + font-size: 22px; + line-height: 34px; + border-top: 5px solid #000; + margin: 0 -24px 20px -24px; + padding: 16px 80px 0 24px; +} + +.hp-card__image { + height: 40px; + width: 50px; + position: absolute; + top: 16px; + right: 8px; +} + +.hp-card__links { + padding-left: 0; +} + +.hp-card__links-item { + margin-bottom: 8px; + list-style: none; + margin-left: 20px; +} + +.hp-card__links-item:before { + content: '\f061'; + font-family: 'fontawesome'; + display: inline-block; + color: #ccc; + margin-left: -20px; + padding-right: 6px; +} + +.hp-card__links-item a { + font-size: 16px; + line-height: 24px; + color: #24618d; +} + +.hp-footer { + margin: 6px auto 36px; + color: #555555; +} + +.hp-search__item h3 { + margin: 0 0 5px 0; + line-height: 1.5; + color: #3a3a3a; + font-size: 18px; +} + +.hp-search__item { + padding: 0; + display: table; + background-color: transparent; + word-wrap: break-word; + overflow: hidden; + width: 100%; + background-color: #fff; + border-top: solid 1px #eee; + -webkit-transition: -webkit-transform .15s linear; + -moz-transition: -moz-transform .15s linear; + -o-transition: -o-transform .15s linear; + transition: transform .15s linear; + position: relative; + z-index: 1; + text-decoration: none; +} + +.hp-search__item:first-child { + border-top-width: 0; +} + +.hp-search__item .doc-type-icon-container .header-badge { + font-size: 1.5rem; + padding: 0.75rem 1.2rem 0.75rem 1.2rem; +} + +.hp-search__item>div { + display: table-cell; + padding: 1rem 0rem; +} + +.hp-search__item .doc-type-icon-container { + vertical-align: middle; + font-size: 1.8rem; + width: 52px; +} + +.hp-search__item .search-body { + padding-left: 1.5rem; +} + +.hp-search__item .search-body p { + margin: 0; + word-wrap: break-word; + white-space: normal; + font-size: 14px; + line-height: 1.5; + color: #3a3a3a; +} + 
+.hp-search__item:hover .location-field, .hp-search__item:hover a, .hp-search__item:hover, +.hp-search__item:focus .location-field, .hp-search__item:focus a, .hp-search__item:focus { + background-color: #1f486c; + cursor: pointer; + color: #fff; + text-decoration: none; +} + +.hp-search__item:hover .search-body *, +.hp-search__item:focus .search-body * { + color: #fff; +} + +.hp-search__item .search-body .highlight { + background: yellow; + color: #000; + margin: 0; + padding: 2px; +} + +.hp-search__results { + max-height: 400px; + overflow-y: auto; + width: 100%; + background-color: #fff; +} + +.hp-search__wrapper-results { + z-index: 10; +} + +.hp-search__results-container { + border-top: 1px solid #dedede; +} \ No newline at end of file diff --git a/theme/homepage.html b/theme/homepage.html new file mode 100644 index 000000000..afccd8516 --- /dev/null +++ b/theme/homepage.html @@ -0,0 +1,76 @@ +{% set site_title = "NCI Genomic Data Commons (GDC)" %} +{% set page_title = "Documentation" %} +{% set page_subtitle = "A place where researchers, data submitters and developers can find detailed information on GDC processes and tools." %} + +{% set search_placeholder = "What are you looking for?" 
%} + +{% set cards = [ + ('For Researchers', '#1db888', '/img/hp-icons-researcher.png', [ + ('Understand the Data in the GDC', '/Data/Introduction/'), + ('Access Controlled Data in the GDC', '/Data/Data_Security/Data_Security/'), + ('Explore Data in the GDC', '/Data_Portal/Users_Guide/Exploration/'), + ('Download Files in the GDC', '/Data_Transfer_Tool/Users_Guide/Preparing_for_Data_Download_and_Upload/'), + ('Analyze Data in the GDC', '/Data_Portal/Users_Guide/Custom_Set_Analysis/') + ]), + ('For Data Submitters', '#676bc0', '/img/hp-icons-submitter.png', [ + ('View the GDC Data Dictionary', '/Data_Dictionary/viewer/'), + ('Search the GDC Data Dictionary', '/Data_Dictionary/gdcmvs/'), + ('Understand GDC Harmonized Data', 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/genomic-data-harmonization-0'), + ('Submit Data', '/Data_Submission_Portal/Users_Guide/Data_Submission_Overview/'), + ('Review Submitted Data', '/Data_Submission_Portal/Users_Guide/Data_Submission_Process/#review') + ]), + ('For Developers', '#743983', '/img/hp-icons-developer.png', [ + ('Understand the GDC Data Model', '/Data/Data_Model/GDC_Data_Model/'), + ('Use the GDC API', '/API/Users_Guide/Getting_Started/'), + ('Access the GDC Codebase', 'https://github.com/NCI-GDC'), + ]), +] %} + +{% set footer_text = 'To learn more about the GDC, please visit the GDC Website or Contact Us.' %} + +
    +

    {{ site_title }}

    +

    {{ page_title }}

    +

    {{ page_subtitle }}

    + +
    +
    +
    + {% for card_title, border_color, card_image, links in cards %} +
    +
    +

    {{card_title}}

    + + +
    +
    + {% endfor %} +
    +
    +
    + +
    \ No newline at end of file diff --git a/theme/img/hp-icons-computer.png b/theme/img/hp-icons-computer.png new file mode 100644 index 000000000..f52b35bb4 Binary files /dev/null and b/theme/img/hp-icons-computer.png differ diff --git a/theme/img/hp-icons-developer.png b/theme/img/hp-icons-developer.png new file mode 100644 index 000000000..c1a64445e Binary files /dev/null and b/theme/img/hp-icons-developer.png differ diff --git a/theme/img/hp-icons-researcher.png b/theme/img/hp-icons-researcher.png new file mode 100644 index 000000000..198c21cb7 Binary files /dev/null and b/theme/img/hp-icons-researcher.png differ diff --git a/theme/img/hp-icons-submitter.png b/theme/img/hp-icons-submitter.png new file mode 100644 index 000000000..6a1f3c585 Binary files /dev/null and b/theme/img/hp-icons-submitter.png differ diff --git a/theme/js/404.js b/theme/js/404.js index 2efc5f532..0079435b1 100644 --- a/theme/js/404.js +++ b/theme/js/404.js @@ -16,14 +16,14 @@ $(function() { var redirectRule = redirects[i], fromURLRedirect = redirectRule.from.toLowerCase(); - // Note this is a case insensitive (exact) string match trailing slashes may break the comparison + // Note this is a case-insensitive match for the given pattern. 
if (_currentPath.indexOf(fromURLRedirect) >= 0 && typeof redirectRule.to === 'string') { - var targetURL = _redirectMap.defaultBaseToURL + redirectRule.to; + var targetURL = redirectRule.to; defer.resolve(targetURL); setTimeout(function(){ - window.location.href = _redirectMap.defaultBaseToURL + redirectRule.to; + window.location.href = redirectRule.to; }, _timeOutMS); return; diff --git a/theme/js/gdc-common.js b/theme/js/gdc-common.js index 0c125de18..49f125a02 100644 --- a/theme/js/gdc-common.js +++ b/theme/js/gdc-common.js @@ -356,6 +356,8 @@ $(function() { function _initScrollSpy() { + if ($('#body').data('current-page') === 'Home') return; + var scrollSpyTarget = '.bs-sidebar', scrollBody = $('html, body'); diff --git a/theme/js/homepage.js b/theme/js/homepage.js new file mode 100644 index 000000000..82fc5ffbd --- /dev/null +++ b/theme/js/homepage.js @@ -0,0 +1,107 @@ +$(function () { + var _searchItemClass = 'hp-search__item'; + var _VALID_QUERY_LENGTH = 3; + var $cancelButton = $('.hp-search__cancel'); + var $inputBox = $('.hp-search__input'); + var $results = $('.hp-search__results'); + var $resultsWrapper = $('.hp-search__wrapper-results'); + var $resultsContainer = $('.hp-search__results-container'); + var $searchContainer = $('.hp-search'); + var $searchContentBody = $('.hp-search__body'); + + $inputBox.focus(); + + function _debounce(func, wait, immediate) { + var _timeout; + return function () { + var context = this; + var args = arguments; + var later = function () { + _timeout = null; + if (!immediate) func.apply(context, args); + }; + var callNow = immediate && !_timeout; + clearTimeout(_timeout); + _timeout = setTimeout(later, wait); + if (callNow) { + func.apply(context, args); + } + }; + } + + $.get(base_url + '/mkdocs/search_index.json', function (data) { + var index = lunr(function () { + this.field('title', { + boost: 10 + }); + this.field('text'); + this.ref('location'); + }); + + var documents = {}; + var doc; + + for (var i = 0; i < 
data.docs.length; i++) { + doc = data.docs[i]; + doc.location = base_url + doc.location; + index.add(doc); + documents[doc.location] = doc; + } + + function _search() { + var query = $.trim($inputBox.val()); + $results.empty(); + + if (query.length > 0) { + $cancelButton.show(); + } else { + $cancelButton.hide(); + } + + if (query.length < _VALID_QUERY_LENGTH || query === '') { + $resultsWrapper.hide(); + $searchContainer.removeClass('search-active'); + return; + } + + $resultsWrapper.show(); + $searchContainer.addClass('search-active'); + + var results = index.search(query); + var resultsHTML = ''; + + $searchContentBody.html(' ' + results.length + ' ' + (results.length === 1 ? 'result' : 'results') + ' found for ' + query + ''); + + if (results.length === 0) { + $resultsContainer.hide(); + } else { + $resultsContainer.show(); + for (var i = 0; i < results.length; i++) { + var result = results[i]; + var resultDoc = documents[result.ref]; + resultDoc.base_url = base_url; + resultDoc.summary = resultDoc.text.substring(0, 200); + + resultsHTML += '' + + '
    ' + + '

    ' + resultDoc.title + '

    ' + + '

    ' + resultDoc.summary + '

    ' + + '
    '; + } + + $results.append(resultsHTML); + $results.highlight(query); + + setTimeout(function () { $('.' + _searchItemClass).removeClass('animated fadeInTop'); }, 500); + } + } + + $inputBox.on('keyup', _debounce(_search, 300)); + + $cancelButton.on('click', function () { + $inputBox.val(''); + _search(); + }) + }); + + }); \ No newline at end of file diff --git a/theme/nav-section-map.html b/theme/nav-section-map.html index b5d4ed5ca..0e7ae815a 100644 --- a/theme/nav-section-map.html +++ b/theme/nav-section-map.html @@ -13,11 +13,14 @@ {% if toc and ((toc | first) is defined) and sub_nav_item.active %} {% include "nav-section-map-toc.html" %} {% endif %} - - {% endfor %} - - {% endif %} + {% if sub_nav_item.title == "Aligned Reads Summary Metrics" %} +
  • + GDC Reference Files +
  • + {% endif %} + {% endfor %} + {% endif %} {% endfor %}
diff --git a/theme/nav-sub.html b/theme/nav-sub.html index 749184ee3..e56e9f587 100644 --- a/theme/nav-sub.html +++ b/theme/nav-sub.html @@ -1,11 +1,16 @@ {% if not nav_item.children %} -
  • +
  • {% if disableRootNavPrefixing %} {{ nav_item.title }} {% else %} {{ nav_item.title }} {% endif %} -
  • + + {% if nav_item.title == "Aligned Reads Summary Metrics" %} +
  • + GDC Reference Files +
  • + {% endif %} {% else %}
  • - +