diff --git a/aws_open_datasets.json b/aws_open_datasets.json index 8a60745..a8575a0 100644 --- a/aws_open_datasets.json +++ b/aws_open_datasets.json @@ -25255,8 +25255,8 @@ }, { "Name": "Oxford Nanopore Technologies Benchmark Datasets", - "Description": "CpG dinucleotides frequently occur in high-density clusters called CpG islands (CGI) and >60% of human genes have their promoters embedded within CGIs Determining the methylation status of cytosines within CpGs is of substantial biological interest: alterations in methylation patterns within promoters is associated with changes in gene expression and disease states such as cancer Exploring methylation differences between tumour samples and normal samples can help to elucidate mechanisms associated with tumour formation and development Nanopore sequencing enables direct detection of methylated cytosines (eg at CpG sites), without the need for bisulfite conversionOxford Nanopore\u2019s Adaptive Sampling offers a flexible method to enrich regions of interest (eg CGIs) by depleting off-target regions during the sequencing run itself with no upfront sample manipulation Here we introduce Reduced Representation Methylation Sequencing (RRMS) to target 310 Mb of the human genome including regions which are highly enriched for CpGs including ~28,000 CpG islands, ~50,600 shores and ~42,700 shelves as well as ~21,600 promoter regions", - "ARN": "arn:aws:s3:::ont-open-data/rrms_2022.07", + "Description": "Using nanopore sequencing, researchers have directly identified DNA and RNA base modifications at nucleotide resolution, including 5-methylycytosine, 5-hydroxymethylcytosine, N6-methyladenosine, 5-bromodeoxyuridine in DAN; and N6-methyladenosine in RNA, with detection of other natural or synthetic epigenetic modifications possible through training basecalling algorithms One of the most widespread genomic modifications is 5-methylcytosine (5mC), which most frequently occurs at dinucleotides Compared to whole-genome bisulfite sequencing, the traditional method of 5mC detection, nanopore technology can offer many advantagesThe following cell lines/DNA samples were obtained from the NIGMS Human Genetic Cell Repository at the Coriell Institute for Medical Research: GM24385", + "ARN": "arn:aws:s3:::ont-open-data/gm24385_mod_2021.09/extra_analysis/bonito_remora", "Region": "eu-west-1", "Type": "S3 Bucket", "Documentation": "https://labs.epi2me.io/dataindex/", @@ -25311,8 +25311,8 @@ }, { "Name": "Oxford Nanopore Technologies Benchmark Datasets", - "Description": "Using nanopore sequencing, researchers have directly identified DNA and RNA base modifications at nucleotide resolution, including 5-methylycytosine, 5-hydroxymethylcytosine, N6-methyladenosine, 5-bromodeoxyuridine in DAN; and N6-methyladenosine in RNA, with detection of other natural or synthetic epigenetic modifications possible through training basecalling algorithms One of the most widespread genomic modifications is 5-methylcytosine (5mC), which most frequently occurs at dinucleotides Compared to whole-genome bisulfite sequencing, the traditional method of 5mC detection, nanopore technology can offer many advantagesThe following cell lines/DNA samples were obtained from the NIGMS Human Genetic Cell Repository at the Coriell Institute for Medical Research: GM24385", - "ARN": "arn:aws:s3:::ont-open-data/gm24385_mod_2021.09/extra_analysis/bonito_remora", + "Description": "CpG dinucleotides frequently occur in high-density clusters called CpG islands (CGI) and >60% of human genes have their promoters embedded within CGIs Determining the methylation status of cytosines within CpGs is of substantial biological interest: alterations in methylation patterns within promoters is associated with changes in gene expression and disease states such as cancer Exploring methylation differences between tumour samples and normal samples can help to elucidate mechanisms associated with tumour formation and development Nanopore sequencing enables direct detection of methylated cytosines (eg at CpG sites), without the need for bisulfite conversionOxford Nanopore\u2019s Adaptive Sampling offers a flexible method to enrich regions of interest (eg CGIs) by depleting off-target regions during the sequencing run itself with no upfront sample manipulation Here we introduce Reduced Representation Methylation Sequencing (RRMS) to target 310 Mb of the human genome including regions which are highly enriched for CpGs including ~28,000 CpG islands, ~50,600 shores and ~42,700 shelves as well as ~21,600 promoter regions", + "ARN": "arn:aws:s3:::ont-open-data/rrms_2022.07", "Region": "eu-west-1", "Type": "S3 Bucket", "Documentation": "https://labs.epi2me.io/dataindex/", @@ -25538,6 +25538,31 @@ "AccountRequired": null, "Host": null }, + { + "Name": "PD12M", + "Description": "Image files", + "ARN": "arn:aws:s3:::pd12m", + "Region": "us-west-2", + "Type": "S3 Bucket", + "Documentation": "https://huggingface.co/datasets/Spawning/PD12M", + "Contact": "info@spawning.ai", + "ManagedBy": "Spawning", + "UpdateFrequency": "Data will be adjusted as infringing works are discovered, improved provenance is acquired, or infringing captions are discovered.", + "License": "https://cdla.dev/permissive-2-0/", + "Tags": [ + "image processing", + "machine learning", + "media", + "art", + "deep learning", + "labeled" + ], + "Explore": null, + "RequesterPays": null, + "ControlledAccess": null, + "AccountRequired": null, + "Host": null + }, { "Name": "PROJ datum grids", "Description": "Horizontal and vertical adjustment datasets", @@ -25562,8 +25587,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "machine learning models", - "ARN": "arn:aws:s3:::pacific-sound-models", + "Description": "original 256 kHz audio recordings year 2025", + "ARN": "arn:aws:s3:::pacific-sound-256khz-2025", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -25594,8 +25619,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "decimated 16 kHz audio recordings", - "ARN": "arn:aws:s3:::pacific-sound-16khz", + "Description": "original 256 kHz audio recordings year 2024", + "ARN": "arn:aws:s3:::pacific-sound-256khz-2024", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -25626,8 +25651,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "decimated 2 kHz audio recordings", - "ARN": "arn:aws:s3:::pacific-sound-2khz", + "Description": "original 256 kHz audio recordings year 2023", + "ARN": "arn:aws:s3:::pacific-sound-256khz-2023", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -25658,8 +25683,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "original 256 kHz audio recordings year 2025", - "ARN": "arn:aws:s3:::pacific-sound-256khz-2025", + "Description": "original 256 kHz audio recordings year 2022", + "ARN": "arn:aws:s3:::pacific-sound-256khz-2022", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -25690,8 +25715,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "original 256 kHz audio recordings year 2024", - "ARN": "arn:aws:s3:::pacific-sound-256khz-2024", + "Description": "decimated 16 kHz audio recordings", + "ARN": "arn:aws:s3:::pacific-sound-16khz", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -25722,8 +25747,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "original 256 kHz audio recordings year 2016", - "ARN": "arn:aws:s3:::pacific-sound-256khz-2016", + "Description": "original 256 kHz audio recordings year 2020", + "ARN": "arn:aws:s3:::pacific-sound-256khz-2020", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -25754,8 +25779,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "original 256 kHz audio recordings year 2023", - "ARN": "arn:aws:s3:::pacific-sound-256khz-2023", + "Description": "original 256 kHz audio recordings year 2019", + "ARN": "arn:aws:s3:::pacific-sound-256khz-2019", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -25786,8 +25811,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "original 256 kHz audio recordings year 2022", - "ARN": "arn:aws:s3:::pacific-sound-256khz-2022", + "Description": "original 256 kHz audio recordings year 2018", + "ARN": "arn:aws:s3:::pacific-sound-256khz-2018", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -25850,8 +25875,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "original 256 kHz audio recordings year 2015", - "ARN": "arn:aws:s3:::pacific-sound-256khz-2015", + "Description": "original 256 kHz audio recordings year 2016", + "ARN": "arn:aws:s3:::pacific-sound-256khz-2016", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -25882,8 +25907,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "original 256 kHz audio recordings year 2018", - "ARN": "arn:aws:s3:::pacific-sound-256khz-2018", + "Description": "original 256 kHz audio recordings year 2015", + "ARN": "arn:aws:s3:::pacific-sound-256khz-2015", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -25914,8 +25939,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "original 256 kHz audio recordings year 2021", - "ARN": "arn:aws:s3:::pacific-sound-256khz-2021", + "Description": "machine learning models", + "ARN": "arn:aws:s3:::pacific-sound-models", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -25946,8 +25971,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "original 256 kHz audio recordings year 2020", - "ARN": "arn:aws:s3:::pacific-sound-256khz-2020", + "Description": "decimated 2 kHz audio recordings", + "ARN": "arn:aws:s3:::pacific-sound-2khz", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -25978,8 +26003,8 @@ }, { "Name": "Pacific Ocean Sound Recordings", - "Description": "original 256 kHz audio recordings year 2019", - "ARN": "arn:aws:s3:::pacific-sound-256khz-2019", + "Description": "original 256 kHz audio recordings year 2021", + "ARN": "arn:aws:s3:::pacific-sound-256khz-2021", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://docs.mbari.org/pacific-sound/", @@ -26201,8 +26226,8 @@ }, { "Name": "PoroTomo", - "Description": "PoroTomo Horizontal Distributed Acoustic Sensing (DASH) Data in HDF5 format", - "ARN": "arn:aws:s3:::nrel-pds-porotomo/DAS/H5/DASH/", + "Description": "PoroTomo Horizontal Distributed Acoustic Sensing (DASH) Data in SEG-Y format", + "ARN": "arn:aws:s3:::nrel-pds-porotomo/DAS/SEG-Y/DASH/", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md", @@ -26218,7 +26243,7 @@ "geospatial" ], "Explore": [ - "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FH5%2FDASH%2F)" + "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FSEG-Y%2FDASH%2F)" ], "RequesterPays": null, "ControlledAccess": null, @@ -26227,8 +26252,8 @@ }, { "Name": "PoroTomo", - "Description": "PoroTomo Vertical Distributed Acoustic Sensing (DASV) Data in HDF5 format", - "ARN": "arn:aws:s3:::nrel-pds-porotomo/DAS/H5/DASV/", + "Description": "PoroTomo Datasets", + "ARN": "arn:aws:s3:::nrel-pds-porotomo/", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md", @@ -26244,7 +26269,7 @@ "geospatial" ], "Explore": [ - "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FH5%2FDASV%2F)" + "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo)" ], "RequesterPays": null, "ControlledAccess": null, @@ -26253,8 +26278,8 @@ }, { "Name": "PoroTomo", - "Description": "PoroTomo Vertical Distributed Acoustic Sensing (DASV) Data in SEG-Y format", - "ARN": "arn:aws:s3:::nrel-pds-porotomo/DAS/SEG-Y/DASV/", + "Description": "PoroTomo Horizontal Distributed Acoustic Sensing (DASH) Data in HDF5 format", + "ARN": "arn:aws:s3:::nrel-pds-porotomo/DAS/H5/DASH/", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md", @@ -26270,7 +26295,7 @@ "geospatial" ], "Explore": [ - "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FSEG-Y%2FDASV%2F)" + "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FH5%2FDASH%2F)" ], "RequesterPays": null, "ControlledAccess": null, @@ -26279,8 +26304,8 @@ }, { "Name": "PoroTomo", - "Description": "PoroTomo Horizontal Distributed Acoustic Sensing (DASH) Data in SEG-Y format", - "ARN": "arn:aws:s3:::nrel-pds-porotomo/DAS/SEG-Y/DASH/", + "Description": "PoroTomo Horizontal Distributed Acoustic Sensing (DASH) Data Resampled in Time MATLAB Files", + "ARN": "arn:aws:s3:::nrel-pds-porotomo/DAS/SEG-Y/DASH/Resampled/", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md", @@ -26296,7 +26321,7 @@ "geospatial" ], "Explore": [ - "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FSEG-Y%2FDASH%2F)" + "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FSEG-Y%2FDASH%2FResampled%2F)" ], "RequesterPays": null, "ControlledAccess": null, @@ -26305,8 +26330,8 @@ }, { "Name": "PoroTomo", - "Description": "PoroTomo Horizontal Distributed Acoustic Sensing (DASH) Data Resampled in Time MATLAB Files", - "ARN": "arn:aws:s3:::nrel-pds-porotomo/DAS/SEG-Y/DASH/Resampled/", + "Description": "PoroTomo Nodal Seismometer Sweep Data", + "ARN": "arn:aws:s3:::nrel-pds-porotomo/Nodal/nodal_sac_sweep/", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md", @@ -26322,7 +26347,7 @@ "geospatial" ], "Explore": [ - "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FSEG-Y%2FDASH%2FResampled%2F)" + "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=Nodal%2Fnodal_sac_sweep%2F)" ], "RequesterPays": null, "ControlledAccess": null, @@ -26331,8 +26356,8 @@ }, { "Name": "PoroTomo", - "Description": "PoroTomo Datasets", - "ARN": "arn:aws:s3:::nrel-pds-porotomo/", + "Description": "PoroTomo Nodal Seismometer Field Notes and Metadata", + "ARN": "arn:aws:s3:::nrel-pds-porotomo/Nodal/nodal_metadata/", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md", @@ -26348,7 +26373,7 @@ "geospatial" ], "Explore": [ - "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo)" + "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=Nodal%2Fnodal_metadata%2F)" ], "RequesterPays": null, "ControlledAccess": null, @@ -26357,8 +26382,8 @@ }, { "Name": "PoroTomo", - "Description": "HSDS PoroTomo domains", - "ARN": "arn:aws:s3:::nrel-pds-hsds/nrel/porotomo/", + "Description": "PoroTomo Nodal Seismometer Continuous Data", + "ARN": "arn:aws:s3:::nrel-pds-porotomo/Nodal/nodal_sac/", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md", @@ -26374,7 +26399,7 @@ "geospatial" ], "Explore": [ - "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-hsds&prefix=nrel%2Fporotomo%2F)" + "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=Nodal%2Fnodal_sac%2F)" ], "RequesterPays": null, "ControlledAccess": null, @@ -26383,8 +26408,8 @@ }, { "Name": "PoroTomo", - "Description": "PoroTomo Nodal Seismometer Sweep Data", - "ARN": "arn:aws:s3:::nrel-pds-porotomo/Nodal/nodal_sac_sweep/", + "Description": "HSDS PoroTomo domains", + "ARN": "arn:aws:s3:::nrel-pds-hsds/nrel/porotomo/", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md", @@ -26400,7 +26425,7 @@ "geospatial" ], "Explore": [ - "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=Nodal%2Fnodal_sac_sweep%2F)" + "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-hsds&prefix=nrel%2Fporotomo%2F)" ], "RequesterPays": null, "ControlledAccess": null, @@ -26409,8 +26434,8 @@ }, { "Name": "PoroTomo", - "Description": "PoroTomo Nodal Seismometer Field Notes and Metadata", - "ARN": "arn:aws:s3:::nrel-pds-porotomo/Nodal/nodal_metadata/", + "Description": "PoroTomo Vertical Distributed Acoustic Sensing (DASV) Data in HDF5 format", + "ARN": "arn:aws:s3:::nrel-pds-porotomo/DAS/H5/DASV/", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md", @@ -26426,7 +26451,7 @@ "geospatial" ], "Explore": [ - "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=Nodal%2Fnodal_metadata%2F)" + "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FH5%2FDASV%2F)" ], "RequesterPays": null, "ControlledAccess": null, @@ -26435,8 +26460,8 @@ }, { "Name": "PoroTomo", - "Description": "PoroTomo Nodal Seismometer Continuous Data", - "ARN": "arn:aws:s3:::nrel-pds-porotomo/Nodal/nodal_sac/", + "Description": "PoroTomo Vertical Distributed Acoustic Sensing (DASV) Data in SEG-Y format", + "ARN": "arn:aws:s3:::nrel-pds-porotomo/DAS/SEG-Y/DASV/", "Region": "us-west-2", "Type": "S3 Bucket", "Documentation": "https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md", @@ -26452,7 +26477,7 @@ "geospatial" ], "Explore": [ - "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=Nodal%2Fnodal_sac%2F)" + "[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FSEG-Y%2FDASV%2F)" ], "RequesterPays": null, "ControlledAccess": null, @@ -26589,10 +26614,10 @@ }, { "Name": "Protein Data Bank 3D Structural Biology Data", - "Description": "Globally cached distribution of the dataset Web frontend also available to browse the dataset and file directory", - "ARN": null, + "Description": "Historical snapshots of archival datasets from 2005 onwards Snapshots are generated annually and at major milestone", + "ARN": "arn:aws:s3:::pdbsnapshots", "Region": "us-west-2", - "Type": "CloudFront Distribution", + "Type": "S3 Bucket", "Documentation": "https://www.wwpdb.org/documentation/file-format", "Contact": "https://www.wwpdb.org/about/contact", "ManagedBy": "[Worldwide Protein Data Bank Partnership](wwpdb.org)", @@ -26621,7 +26646,7 @@ "x-ray crystallography" ], "Explore": [ - "[Browse Dataset](https://s3.rcsb.org)" + "[Browse Bucket](https://pdbsnapshots.s3.us-west-2.amazonaws.com/index.html)" ], "RequesterPays": null, "ControlledAccess": null, @@ -26630,10 +26655,10 @@ }, { "Name": "Protein Data Bank 3D Structural Biology Data", - "Description": "Historical snapshots of archival datasets from 2005 onwards Snapshots are generated annually and at major milestone", - "ARN": "arn:aws:s3:::pdbsnapshots", + "Description": "Globally cached distribution of the dataset Web frontend also available to browse the dataset and file directory", + "ARN": null, "Region": "us-west-2", - "Type": "S3 Bucket", + "Type": "CloudFront Distribution", "Documentation": "https://www.wwpdb.org/documentation/file-format", "Contact": "https://www.wwpdb.org/about/contact", "ManagedBy": "[Worldwide Protein Data Bank Partnership](wwpdb.org)", @@ -26662,7 +26687,7 @@ "x-ray crystallography" ], "Explore": [ - "[Browse Bucket](https://pdbsnapshots.s3.us-west-2.amazonaws.com/index.html)" + "[Browse Dataset](https://s3.rcsb.org)" ], "RequesterPays": null, "ControlledAccess": null, diff --git a/aws_open_datasets.tsv b/aws_open_datasets.tsv index 761cf12..4b873d6 100644 --- a/aws_open_datasets.tsv +++ b/aws_open_datasets.tsv @@ -928,9 +928,9 @@ Orcasound - bioacoustic data for marine conservation Labeled audio data for ML m Oregon Health & Science University Chronic Neutrophilic Leukemia Dataset RNA-Seq Gene Expression Quantification arn:aws:s3:::gdc-ohsu-cnl-phs001799-2-open us-east-1 S3 Bucket https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs001799.v dcf-support@datacommons.io [Center for Translational Data Science at The University of Chicago](https://ctd Genomic Data Commons (GDC) is source of truth for this dataset; GDC offers month NIH Genomic Data Sharing Policy: https://gdc.cancer.gov/access-data/data-access- aws-pds, cancer, genomic, life sciences Overture Maps Foundation Open Map Data Overture Maps Foundation Data (GeoParquet) arn:aws:s3:::overturemaps-us-west-2/release/ us-west-2 S3 Bucket Documentation is available at [docs.overturemaps.org](https://docs.overturemaps. info@overturemaps.org [Overture Maps Foundation](https://overturemaps.org) Monthly Overture data is licensed under the Community Database License Agreement Permiss aws-pds, geospatial, global, mapping, osm, parquet, transportation Overture Maps Foundation Open Map Data New File Notification arn:aws:sns:us-west-2:913550007193:overturemaps-us-west-2 us-west-2 SNS Topic Documentation is available at [docs.overturemaps.org](https://docs.overturemaps. info@overturemaps.org [Overture Maps Foundation](https://overturemaps.org) Monthly Overture data is licensed under the Community Database License Agreement Permiss aws-pds, geospatial, global, mapping, osm, parquet, transportation -Oxford Nanopore Technologies Benchmark Datasets CpG dinucleotides frequently occur in high-density clusters called CpG islands ( arn:aws:s3:::ont-open-data/rrms_2022.07 eu-west-1 S3 Bucket https://labs.epi2me.io/dataindex/ support@nanoporetech.com Oxford Nanopore Technologies Additional datasets will be added periodically. Updates and amendents will be ma Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) https://creativecommo aws-pds, bioinformatics, biology, fastq, fast5, genomic, life sciences, Homo sapiens, whole genome sequencing False -Oxford Nanopore Technologies Benchmark Datasets Nanopore sequencing data of the Genome in a Bottle samples NA24385, NA24149, and arn:aws:s3:::ont-open-data/giab_lsk114_2022.12 eu-west-1 S3 Bucket https://labs.epi2me.io/dataindex/ support@nanoporetech.com Oxford Nanopore Technologies Additional datasets will be added periodically. Updates and amendents will be ma Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) https://creativecommo aws-pds, bioinformatics, biology, fastq, fast5, genomic, life sciences, Homo sapiens, whole genome sequencing False Oxford Nanopore Technologies Benchmark Datasets Using nanopore sequencing, researchers have directly identified DNA and RNA base arn:aws:s3:::ont-open-data/gm24385_mod_2021.09/extra_analysis/bonito_remora eu-west-1 S3 Bucket https://labs.epi2me.io/dataindex/ support@nanoporetech.com Oxford Nanopore Technologies Additional datasets will be added periodically. Updates and amendents will be ma Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) https://creativecommo aws-pds, bioinformatics, biology, fastq, fast5, genomic, life sciences, Homo sapiens, whole genome sequencing False +Oxford Nanopore Technologies Benchmark Datasets Nanopore sequencing data of the Genome in a Bottle samples NA24385, NA24149, and arn:aws:s3:::ont-open-data/giab_lsk114_2022.12 eu-west-1 S3 Bucket https://labs.epi2me.io/dataindex/ support@nanoporetech.com Oxford Nanopore Technologies Additional datasets will be added periodically. Updates and amendents will be ma Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) https://creativecommo aws-pds, bioinformatics, biology, fastq, fast5, genomic, life sciences, Homo sapiens, whole genome sequencing False +Oxford Nanopore Technologies Benchmark Datasets CpG dinucleotides frequently occur in high-density clusters called CpG islands ( arn:aws:s3:::ont-open-data/rrms_2022.07 eu-west-1 S3 Bucket https://labs.epi2me.io/dataindex/ support@nanoporetech.com Oxford Nanopore Technologies Additional datasets will be added periodically. Updates and amendents will be ma Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) https://creativecommo aws-pds, bioinformatics, biology, fastq, fast5, genomic, life sciences, Homo sapiens, whole genome sequencing False Oxford Nanopore Technologies Benchmark Datasets Oxford Nanopore Open Datasets arn:aws:s3:::ont-open-data eu-west-1 S3 Bucket https://labs.epi2me.io/dataindex/ support@nanoporetech.com Oxford Nanopore Technologies Additional datasets will be added periodically. Updates and amendents will be ma Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) https://creativecommo aws-pds, bioinformatics, biology, fastq, fast5, genomic, life sciences, Homo sapiens, whole genome sequencing False Ozone Monitoring Instrument (OMI) / Aura NO2 Tropospheric Column Density S3 Bucket for OMI NO2 in Cloud-Optimized GeoTiff format arn:aws:s3:::omi-no2-nasa us-west-2 S3 Bucket https://disc.gsfc.nasa.gov/datasets/OMNO2d_003/summary binita.kc@nasa.gov NASA None There are no restrictions on the use of these data. aws-pds, earth observation, geospatial, satellite imagery, air quality, atmosphere, environmental PALSAR-2 ScanSAR CARD4L (L2.2) PALSAR-2 ScanSAR CARD4L arn:aws:s3:::jaxaalos2/palsar2/L2.2/Africa/ us-west-2 S3 Bucket https://www.eorc.jaxa.jp/ALOS/en/dataset/palsar2_l22_e.htm aproject@jaxa.jp [JAXA](https://www.jaxa.jp/) Every month after 42 days observed Data is available for free under the [terms of use](https://earth.jaxa.jp/policy aws-pds, agriculture, earth observation, satellite imagery, geospatial, natural resource, sustainability, disaster response, synthetic aperture radar, deafrica, stac, cog False @@ -938,21 +938,22 @@ PALSAR-2 ScanSAR Flooding in Rwanda (L2.1) PALSAR-2 ScanSAR L11 & L22 arn:aws:s3 PALSAR-2 ScanSAR Tropical Cycolne Mocha (L2.1) PALSAR-2 ScanSAR L22 arn:aws:s3:::jaxaalos2/palsar2-scansar/Bangladesh/ us-west-2 S3 Bucket https://www.eorc.jaxa.jp/ALOS/en/dataset/alos_open_and_free_e.htm, https://www.e aproject@jaxa.jp [JAXA](https://www.jaxa.jp/) As available. Data is available for free under the terms of use. aws-pds, agriculture, cog, disaster response, earth observation, geospatial, natural resource, satellite imagery, stac, sustainability, synthetic aperture radar False PALSAR-2 ScanSAR Turkey & Syria Earthquake (L2.1 & L1.1) PALSAR-2 ScanSAR L11 & L22 arn:aws:s3:::jaxaalos2/palsar2-scansar/Turkey-Syria-earthquake/ us-west-2 S3 Bucket https://www.eorc.jaxa.jp/ALOS/en/dataset/alos_open_and_free_e.htm, https://www.e aproject@jaxa.jp [JAXA](https://www.jaxa.jp/) As available. Data is available for free under the [terms of use](https://earth.jaxa.jp/policy aws-pds, agriculture, earth observation, satellite imagery, geospatial, natural resource, sustainability, disaster response, synthetic aperture radar, deafrica, stac, cog False PASS: Perturb-and-Select Summarizer for Product Reviews A collection of summaries generated by PASS for the FewSum Product Reviews datas arn:aws:s3:::pass-summary-fewsum us-east-1 S3 Bucket https://pass-summary-fewsum.s3.amazonaws.com/README.md noved@amazon.com [Amazon](https://www.amazon.com/) Not updated This data is available for anyone to use under the terms of the CDLA-Sharing lic amazon.science, natural language processing, text analysis ['[pass_generated_summaries.jsonl](https://pass-summary-fewsum.s3.amazonaws.com/pass_gen_summaries_fewsum_amazon_val_test.jsonl)'] +PD12M Image files arn:aws:s3:::pd12m us-west-2 S3 Bucket https://huggingface.co/datasets/Spawning/PD12M info@spawning.ai Spawning Data will be adjusted as infringing works are discovered, improved provenance is https://cdla.dev/permissive-2-0/ image processing, machine learning, media, art, deep learning, labeled PROJ datum grids Horizontal and vertical adjustment datasets us-east-1 CloudFront Distribution https://github.com/OSGeo/proj-datumgrid-geotiff proj@lists.osgeo.org [PROJ](https://proj.org) New grids are added when made available Per file. Under an Open Source Definition compliant license. Consult the READMEs aws-pds, geospatial, mapping cdn.proj.org -Pacific Ocean Sound Recordings machine learning models arn:aws:s3:::pacific-sound-models us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software -Pacific Ocean Sound Recordings decimated 16 kHz audio recordings arn:aws:s3:::pacific-sound-16khz us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software -Pacific Ocean Sound Recordings decimated 2 kHz audio recordings arn:aws:s3:::pacific-sound-2khz us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2025 arn:aws:s3:::pacific-sound-256khz-2025 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2024 arn:aws:s3:::pacific-sound-256khz-2024 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software -Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2016 arn:aws:s3:::pacific-sound-256khz-2016 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2023 arn:aws:s3:::pacific-sound-256khz-2023 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2022 arn:aws:s3:::pacific-sound-256khz-2022 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software +Pacific Ocean Sound Recordings decimated 16 kHz audio recordings arn:aws:s3:::pacific-sound-16khz us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software +Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2020 arn:aws:s3:::pacific-sound-256khz-2020 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software +Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2019 arn:aws:s3:::pacific-sound-256khz-2019 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software +Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2018 arn:aws:s3:::pacific-sound-256khz-2018 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2017 arn:aws:s3:::pacific-sound-256khz-2017 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software +Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2016 arn:aws:s3:::pacific-sound-256khz-2016 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2015 arn:aws:s3:::pacific-sound-256khz-2015 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software -Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2018 arn:aws:s3:::pacific-sound-256khz-2018 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software +Pacific Ocean Sound Recordings machine learning models arn:aws:s3:::pacific-sound-models us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software +Pacific Ocean Sound Recordings decimated 2 kHz audio recordings arn:aws:s3:::pacific-sound-2khz us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2021 arn:aws:s3:::pacific-sound-256khz-2021 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software -Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2020 arn:aws:s3:::pacific-sound-256khz-2020 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software -Pacific Ocean Sound Recordings original 256 kHz audio recordings year 2019 arn:aws:s3:::pacific-sound-256khz-2019 us-west-2 S3 Bucket https://docs.mbari.org/pacific-sound/ dcline@mbari.org [Monterey Bay Aquarium Research Institute](https://www.mbari.org/) daily CC-BY 4.0 aws-pds, acoustics, biodiversity, ecosystems, biology, marine mammals, oceans, climate, coastal, deep learning, machine learning, environmental, open source software Pan-STARRS PS1 Survey PS1 DR1 and DR2 image files arn:aws:s3:::stpubdata/ps1 us-east-1 S3 Bucket https://outerspace.stsci.edu/display/PANSTARRS/ archive@stsci.edu [Space Telescope Science Institute](http://www.stsci.edu/) Never STScI hereby grants the non-exclusive, royalty-free, non-transferable, worldwide aws-pds, astronomy False Pancreatic Cancer Organoid Profiling RNA-Seq Gene Expression Quantification arn:aws:s3:::gdc-organoid-pancreatic-phs001611-2-open us-east-1 S3 Bucket https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs001611.v dcf-support@datacommons.io [Center for Translational Data Science at The University of Chicago](https://ctd Genomic Data Commons (GDC) is source of truth for this dataset; GDC offers month NIH Genomic Data Sharing Policy: https://gdc.cancer.gov/access-data/data-access- aws-pds, cancer, genetic, genomic, transcriptomics, whole genome sequencing, STRIDES Pancreatic Cancer Organoid Profiling WGS/WXS/RNA-Seq Aligned Reads, WXS Annotated Somatic Mutation, WXS Raw Somatic M arn:aws:s3:::gdc-organoid-pancreatic-phs001611-2-controlled us-east-1 S3 Bucket https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs001611.v dcf-support@datacommons.io [Center for Translational Data Science at The University of Chicago](https://ctd Genomic Data Commons (GDC) is source of truth for this dataset; GDC offers month NIH Genomic Data Sharing Policy: https://gdc.cancer.gov/access-data/data-access- aws-pds, cancer, genetic, genomic, transcriptomics, whole genome sequencing, STRIDES https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs001611.v1.p1 @@ -961,23 +962,23 @@ Phrase Clustering Dataset (PCD) Phsrase Clustering Dataset (PCD) arn:aws:s3:::am Physionet https://s3amazonawscom/physionet-pds/indexhtml arn:aws:s3:::physionet-pds us-east-1 S3 Bucket https://physionet.org/ contact@physionet.org [MIT Laboratory for Computational Physiology](https://lcp.mit.edu/) Not updated PhysioBank databases are made available under the ODC Public Domain Dedication a aws-pds, biology, life sciences Platinum Pedigree https://githubcom/Platinum-Pedigree-Consortium/Platinum-Pedigree-Datasets arn:aws:s3:::platinum-pedigree-data us-west-1 S3 Bucket https://github.com/Platinum-Pedigree-Consortium https://github.com/Platinum-Pedigree-Consortium/Platinum-Pedigree-Datasets/issue Platinum Pedigree Consortium As needed [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) genomic, genotyping, long read sequencing, bioinformatics, Homo sapiens, life sciences, whole genome sequencing Pohang Canal Dataset: A Multimodal Maritime Dataset for Autonomous Navigation in Restricted Waters Pohang Canal dataset arn:aws:s3:::pohang-canal-dataset us-west-2 S3 Bucket https://sites.google.com/view/pohang-canal-dataset/home morin-lab@kaist.ac.kr [MORIN](http://morin.kaist.ac.kr) Not updated [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) aws-pds, autonomous vehicles, marine navigation, robotics, computer vision, lidar -PoroTomo PoroTomo Horizontal Distributed Acoustic Sensing (DASH) Data in HDF5 format arn:aws:s3:::nrel-pds-porotomo/DAS/H5/DASH/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FH5%2FDASH%2F)'] -PoroTomo PoroTomo Vertical Distributed Acoustic Sensing (DASV) Data in HDF5 format arn:aws:s3:::nrel-pds-porotomo/DAS/H5/DASV/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FH5%2FDASV%2F)'] -PoroTomo PoroTomo Vertical Distributed Acoustic Sensing (DASV) Data in SEG-Y format arn:aws:s3:::nrel-pds-porotomo/DAS/SEG-Y/DASV/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FSEG-Y%2FDASV%2F)'] PoroTomo PoroTomo Horizontal Distributed Acoustic Sensing (DASH) Data in SEG-Y format arn:aws:s3:::nrel-pds-porotomo/DAS/SEG-Y/DASH/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FSEG-Y%2FDASH%2F)'] -PoroTomo PoroTomo Horizontal Distributed Acoustic Sensing (DASH) Data Resampled in Time M arn:aws:s3:::nrel-pds-porotomo/DAS/SEG-Y/DASH/Resampled/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FSEG-Y%2FDASH%2FResampled%2F)'] PoroTomo PoroTomo Datasets arn:aws:s3:::nrel-pds-porotomo/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo)'] -PoroTomo HSDS PoroTomo domains arn:aws:s3:::nrel-pds-hsds/nrel/porotomo/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-hsds&prefix=nrel%2Fporotomo%2F)'] +PoroTomo PoroTomo Horizontal Distributed Acoustic Sensing (DASH) Data in HDF5 format arn:aws:s3:::nrel-pds-porotomo/DAS/H5/DASH/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FH5%2FDASH%2F)'] +PoroTomo PoroTomo Horizontal Distributed Acoustic Sensing (DASH) Data Resampled in Time M arn:aws:s3:::nrel-pds-porotomo/DAS/SEG-Y/DASH/Resampled/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FSEG-Y%2FDASH%2FResampled%2F)'] PoroTomo PoroTomo Nodal Seismometer Sweep Data arn:aws:s3:::nrel-pds-porotomo/Nodal/nodal_sac_sweep/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=Nodal%2Fnodal_sac_sweep%2F)'] PoroTomo PoroTomo Nodal Seismometer Field Notes and Metadata arn:aws:s3:::nrel-pds-porotomo/Nodal/nodal_metadata/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=Nodal%2Fnodal_metadata%2F)'] PoroTomo PoroTomo Nodal Seismometer Continuous Data arn:aws:s3:::nrel-pds-porotomo/Nodal/nodal_sac/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=Nodal%2Fnodal_sac%2F)'] +PoroTomo HSDS PoroTomo domains arn:aws:s3:::nrel-pds-hsds/nrel/porotomo/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-hsds&prefix=nrel%2Fporotomo%2F)'] +PoroTomo PoroTomo Vertical Distributed Acoustic Sensing (DASV) Data in HDF5 format arn:aws:s3:::nrel-pds-porotomo/DAS/H5/DASV/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FH5%2FDASV%2F)'] +PoroTomo PoroTomo Vertical Distributed Acoustic Sensing (DASV) Data in SEG-Y format arn:aws:s3:::nrel-pds-porotomo/DAS/SEG-Y/DASV/ us-west-2 S3 Bucket https://github.com/openEDI/documentation/blob/master/PoroTomo/PoroTomo.md Thomas Coleman (thomas.coleman@silixa.com) [National Renewable Energy Laboratory](https://www.nrel.gov/) As needed Creative Commons Attribution 3.0 United States License aws-pds, geothermal, seismology, image processing, geospatial ['[Browse Dataset](https://data.openei.org/s3_viewer?bucket=nrel-pds-porotomo&prefix=DAS%2FSEG-Y%2FDASV%2F)'] Poseidon 3D Seismic, Australia Poseidon 3D Seismic MDIO volumes and Reports arn:aws:s3:::tgs-opendata-poseidon us-west-2 S3 Bucket TBD For any questions regarding the datasets and MDIO, email the TGS Open Data Team [TGS](https://www.tgs.com) Dataset is static. CC BY 4.0 seismology, geophysics, exploration ['[Browse Bucket](https://tgs-opendata-poseidon.s3.amazonaws.com/index.html)'] Pre- and post-purchase product questions S3 bucket with dataset arn:aws:s3:::pre-post-purchase-questions us-east-1 S3 Bucket https://pre-post-purchase-questions.s3.amazonaws.com/README.txt litalku@amazon.com [Amazon](https://www.amazon.com/) Not currently being updated [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) amazon.science, natural language processing, machine learning ['[PrePostQuestions.csv](https://pre-post-purchase-questions.s3.amazonaws.com/PrePostQuestions.csv)'] Prefeitura Municipal de São Paulo (PMSP) LiDAR Point Cloud São Paulo city's 3D LiDAR - LAZ Files arn:aws:s3:::laz-m3dc-pmsp sa-east-1 S3 Bucket https://github.com/geoinfo-smdu/M3DC geosampa@prefeitura.sp.gov.br [GeoSampa - o mapa digital da cidade de São Paulo](http://geosampa.prefeitura.sp Local survey executed by demand generates new data as local point clouds. [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.html) cities, land, lidar, urban, geospatial, elevation, mapping, aws-pds Prefeitura Municipal de São Paulo (PMSP) LiDAR Point Cloud São Paulo city's 3D LiDAR - Entwine Point Tiles arn:aws:s3:::ept-m3dc-pmsp sa-east-1 S3 Bucket https://github.com/geoinfo-smdu/M3DC geosampa@prefeitura.sp.gov.br [GeoSampa - o mapa digital da cidade de São Paulo](http://geosampa.prefeitura.sp Local survey executed by demand generates new data as local point clouds. [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.html) cities, land, lidar, urban, geospatial, elevation, mapping, aws-pds Product Comparison Dataset for Online Shopping Product Comparison Dataset for Online Shopping arn:aws:s3:::prod-comp-shopping-dataset us-west-2 S3 Bucket https://prod-comp-shopping-dataset.s3.us-west-2.amazonaws.com/README.md Post any questions to [re:Post](https://repost.aws/tags/questions/TApd0Wl5P8S9O6 [Amazon](https://www.amazon.com/) None [CC-BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/) product comparison, online shopping, amazon.science, natural language processing, machine learning ['[final_prodcomp_dataset_cleaned.tsv](https://prod-comp-shopping-dataset.s3.us-west-2.amazonaws.com/final_prodcomp_dataset_cleaned.tsv)'] -Protein Data Bank 3D Structural Biology Data Globally cached distribution of the dataset Web frontend also available to brow us-west-2 CloudFront Distribution https://www.wwpdb.org/documentation/file-format https://www.wwpdb.org/about/contact [Worldwide Protein Data Bank Partnership](wwpdb.org) New and updated data files are published weekly and released on Wednesdays 0:00 https://creativecommons.org/publicdomain/zero/1.0/ aws-pds, amino acid, archives, bioinformatics, biomolecular modeling, cell biology, chemical biology, COVID-19, electron microscopy, electron tomography, enzyme, life sciences, molecule, nuclear magnetic resonance, pharmaceutical, protein, protein template, SARS-CoV-2, structural biology, x-ray crystallography ['[Browse Dataset](https://s3.rcsb.org)'] Protein Data Bank 3D Structural Biology Data Historical snapshots of archival datasets from 2005 onwards Snapshots are gener arn:aws:s3:::pdbsnapshots us-west-2 S3 Bucket https://www.wwpdb.org/documentation/file-format https://www.wwpdb.org/about/contact [Worldwide Protein Data Bank Partnership](wwpdb.org) New and updated data files are published weekly and released on Wednesdays 0:00 https://creativecommons.org/publicdomain/zero/1.0/ aws-pds, amino acid, archives, bioinformatics, biomolecular modeling, cell biology, chemical biology, COVID-19, electron microscopy, electron tomography, enzyme, life sciences, molecule, nuclear magnetic resonance, pharmaceutical, protein, protein template, SARS-CoV-2, structural biology, x-ray crystallography ['[Browse Bucket](https://pdbsnapshots.s3.us-west-2.amazonaws.com/index.html)'] +Protein Data Bank 3D Structural Biology Data Globally cached distribution of the dataset Web frontend also available to brow us-west-2 CloudFront Distribution https://www.wwpdb.org/documentation/file-format https://www.wwpdb.org/about/contact [Worldwide Protein Data Bank Partnership](wwpdb.org) New and updated data files are published weekly and released on Wednesdays 0:00 https://creativecommons.org/publicdomain/zero/1.0/ aws-pds, amino acid, archives, bioinformatics, biomolecular modeling, cell biology, chemical biology, COVID-19, electron microscopy, electron tomography, enzyme, life sciences, molecule, nuclear magnetic resonance, pharmaceutical, protein, protein template, SARS-CoV-2, structural biology, x-ray crystallography ['[Browse Dataset](https://s3.rcsb.org)'] Provision of Web-Scale Parallel Corpora for Official European Languages (ParaCrawl) Parallel Corpora to/from English for all official EU languages arn:aws:s3:::web-language-models us-east-1 S3 Bucket https://paracrawl.eu/releases.html For questions regarding the datasets contact Kenneth Heafield, email kheafiel@in [ParaCrawl](https://paracrawl.eu) New data is added according to ParaCrawl release schedule. "Creative Commons CC0 license (""no rights reserved"")." aws-pds, machine translation, natural language processing PubSeq - Public Sequence Resource PubSeq submitted datasets (FASTA and JSON metadata) arn:aws:s3:::pubseq-datasets us-east-2 S3 Bucket https://covid19.genenetwork.org/about https://covid19.genenetwork.org/contact [UTHSC GeneNetwork](https://covid19.genenetwork.org/) Rolling dataset. Creative Commons Attribution 4.0 International (CC BY 4.0) unless otherwise spec aws-pds, bam, bioinformatics, biology, coronavirus, COVID-19, fasta, fastq, fast5, genetic, genomic, health, json, life sciences, long read sequencing, open source software, MERS, metadata, medicine, RDF, SARS, SARS-CoV-2, SPARQL ['[Browse Bucket](https://pubseq-datasets.s3.amazonaws.com/)'] PubSeq - Public Sequence Resource Pubseq output data (Arvados Keep) arn:aws:s3:::pubseq-output-data us-east-2 S3 Bucket https://covid19.genenetwork.org/about https://covid19.genenetwork.org/contact [UTHSC GeneNetwork](https://covid19.genenetwork.org/) Rolling dataset. Creative Commons Attribution 4.0 International (CC BY 4.0) unless otherwise spec aws-pds, bam, bioinformatics, biology, coronavirus, COVID-19, fasta, fastq, fast5, genetic, genomic, health, json, life sciences, long read sequencing, open source software, MERS, metadata, medicine, RDF, SARS, SARS-CoV-2, SPARQL ['[Arvados download](https://covid19.genenetwork.org/download)'] diff --git a/datasets/pd12m.yaml b/datasets/pd12m.yaml new file mode 100644 index 0000000..bc13503 --- /dev/null +++ b/datasets/pd12m.yaml @@ -0,0 +1,41 @@ +Name: PD12M +Description: PD12M is a collection of 12.4 million CC0/PD image-caption pairs for the purpose of training generative image models. +Documentation: https://huggingface.co/datasets/Spawning/PD12M +Contact: info@spawning.ai +ManagedBy: Spawning +UpdateFrequency: Data will be adjusted as infringing works are discovered, improved provenance is acquired, or infringing captions are discovered. +Tags: +- image processing +- machine learning +- media +- art +- deep learning +- labeled +License: https://cdla.dev/permissive-2-0/ +Resources: + - Description: Image files + ARN: arn:aws:s3:::pd12m + Region: us-west-2 + Type: S3 Bucket +DataAtWork: + Tutorials: + - Title: Working with the Metadata + URL: https://huggingface.co/datasets/Spawning/PD12M + AuthorName: Spawning + - Title: Downloading Images + URL: https://huggingface.co/datasets/Spawning/PD12M/blob/main/tutorials/images.md + AuthorName: Spawning + Tools & Applications: + - Title: Hugging Face Dataset + URL: https://huggingface.co/datasets/Spawning/PD12M + AuthorName: Spawning + - Title: Source.Plus + URL: https://source.plus/pd12m + AuthorName: Spawning + Publications: + - Title: "PD12M: A Large-Scale Image Captioning Dataset" + URL: https://arxiv.org/abs/2410.23144 + AuthorName: Jordan Meyer, Nick Padgett, Laura Exline, Cullen Miller + - Title: Datasheet + URL: https://huggingface.co/datasets/Spawning/PD12M/blob/main/Datasheet.pdf + AuthorName: Spawning