From bf926c3a45af24d03c9ad8ba9da68a8d48972f83 Mon Sep 17 00:00:00 2001 From: Kate Isaac <41767733+kweav@users.noreply.github.com> Date: Mon, 13 May 2024 20:35:16 -0400 Subject: [PATCH 1/7] add blurb on goals --- 09-DNA.Rmd | 27 +++- book.bib | 412 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 436 insertions(+), 3 deletions(-) diff --git a/09-DNA.Rmd b/09-DNA.Rmd index b7bd7449..9555079b 100644 --- a/09-DNA.Rmd +++ b/09-DNA.Rmd @@ -19,15 +19,36 @@ ottrpal::include_slide("https://docs.google.com/presentation/d/1YwxXy2rnUgbx_7B7 ## What are the goals of analyzing DNA sequences? -```{r, fig.alt = "", out.width = "100%", echo = FALSE} -ottrpal::include_slide("https://docs.google.com/presentation/d/1YwxXy2rnUgbx_7B7ENH9wpDX-j6JpJz6lGVzOkjo0qY/edit#slide=id.g12890ae15d7_0_71") -``` +There are several larger goals behind DNA sequencing experiments ranging from assembling whole genomes, to identifying variation or performing a functional genomic analysis or comparative genomic study. Each of these has implications when studying disease. + +* Assembling whole genomes: + + Because an organism's genome determines how an organism develops and functions [@NHGRIGlossary2024], an important task in the genomics field is assembling the genome of an organism from sequencing reads. This assembly process attempts to reconstruct how the sequencing reads overlap or fit together [@Schatz2010; @Li_Durbin_2024]. Recent examples of genome assembly in the genomics field include a complete 3.055 billion-base pair sequence of the human reference genome which was published by the Telomere-to-Telomere (T2T) Consortium [-@Nurk2022], the T2T-CHM13 version (followed not long after by the complete sequence of the human Y chromosome [-@Rhie2023]). A goal of the field is to better capture human genetic diversity by creating a reference pangenome, assembled from multiple donors within the population [-@Taylor2024]. 
Genome assemblies are an important part of genomics beyond human genomics research; there are reference genomes available for most model organisms as well as many plants, animals, and pathogens, with more and more being published at a high frequency [@Miller2023; @Alonge2022; @Gershman2023; @Sistrom2016]. These reference genomes each act as an extensive compilation of the observed DNA sequence of genes, regulatory elements, etc. and the related coordinate systems for these elements, such that, for the corresponding organism, sequencing reads from other experiments can be mapped or aligned to the reference in order to localize where that read was in the genome. In the case of cancer informatics, a recent approach utilized personalized genome assembly to more accurately detect tumor somatic mutations. This is likely to be an area of future research for application in precision medicine [@Xiao2022; @Ermini_Driguez_2024]. + +* Identifying variation: + + Variant caller software is used within the field of genomics to identify places where reads from a DNA sequencing experiment differ from a comparative reference genome sequence [@NHGRIfactsheet2022]. Variants may be as small as single nucleotide differences (single-nucleotide polymorphisms or SNPs) or much larger (50 base pairs or more) structural variation (SVs) such as duplications, deletions, insertions, inversions, translocations [@Wong2011]. (Shorter insertions or deletions are termed indels.) The SVs involving gains or losses in genomic DNA can lead to copy number variations (CNVs). Mutation and structural variants are very common in cancer as well as larger-scale catastrophic genomic rearrangements [@Zhang2022]. Overall, variants may be rare in a population or fairly common [@Audano2019]. 
Further, variants may be somatic or germline variants: germline variants are hereditary and will be passed down from parent to offspring; in the offspring, the variant will be present in every cell, while somatic variants are generally not hereditary and present only in some cells rather than every cell [@NHSFrost2022]. Because variation, specifically genetic diversity is a necessary part of a healthy species [@GeneticDiversity] and because variation, specifically mutations/variants may cause disease, identifying variation is a common goal in a DNA sequencing workflow. An example of research focusing on studying genetic diversity in humans is the 1000 Genomes Project which recently expanded its resource of sequenced genomes and in doing so discovered even more variation present in the population [@Byrska-Bishop2022]. + +* Functional genomic analysis: + + Genomes contain more than just genes (the coding sequences that will be transcribed and translated into a protein); they also contain functional elements such as promoters, enhancers, or silencers that modulate the expression of genes [@Kellis2014]. Further, differential gene expression is the phenomenon by which cells with the same DNA sequence show different patterns of gene expression. Functional genomic analyses aim to better understand differential gene expression and the impact of genetic variation found in functional elements. For example, many human genetic variants associated with common traits and diseases are localized in or near known functional elements [@Hindorff2009]. These variants may impact gene expression due to either changes in transcription factor binding at that site, or resulting epigenetic changes, which are defined as chemical modifications of chromatin or nucleotides beyond the DNA sequence. 
Such epigenetic modifications, which include histone marks and DNA methylation, can alter DNA compaction and influence a functional element’s accessibility for transcriptional machinery (e.g., if the element isn't accessible, transcription may not occur; while previously the element was accessible and the gene could be transcribed). In later sections, methods that study epigenetic modifications like chromatin accessibility, DNA methylation, or binding of specific proteins will be discussed. All of these methods support functional genomic analyses and are important for better understanding differential gene expression and the impact that genetic variants located in functional elements may have on disease occurrence. A somewhat recent and high-profile example of a functional genomic analysis centers again on work from the T2T Consortium. Not only did they publish a new, complete reference genome, but they also studied the epigenetic landscape in the newly resolved regions of the genome and pointed to potential newly discovered functional elements in a region previously thought to be transcriptionally inactive [@Gershman2022]. + +* Comparative genomics: + + A common saying in the genomics field is that structure determines function and conserved structure may be constrained such that there is an important function which needs to be conserved [@Alföldi_Lindblad-Toh_2013]. Further, similarities in structure may be due to shared ancestry through the processes of evolution; therefore, some comparative genomics studies aim to infer homology or an evolutionary relationship from structural similarity [@Pearson2013]. More pertinent to the topics discussed previously, comparative genomics studies are also useful for identifying functional elements [@Taylor2006] and variants associated with disease (e.g., by comparing the genomes of those with the disease and those without it and identifying differences) [@Alföldi_Lindblad-Toh_2013; @Eichler_2019]. 
## Comparison of DNA methods ```{r, fig.alt = "Comparing DNA Sequencing Techniques. The most common DNA sequencing techniques are described. Whole genome sequencing coverages all genes and non-coding DNA. 3.2 billion bases are covered when applied to human samples. This the most expensive of the techniques. Depth of coverage required for 99.9% sensitivity is 30X. Whole exome sequencing coverage is the exome or expressed genes. Approximately 45 million bases are sequenced. This is a cost-effective technique. The depth of coverage required for 99.9% sensitivity is 100X. Targeted gene panel sequencing coverages 50-500 genes. 20,000 to 62 million bases are sequenced. This is the most cost-effective technique. Depth of coverage is >500X.", out.width = "100%", echo = FALSE} ottrpal::include_slide("https://docs.google.com/presentation/d/1YwxXy2rnUgbx_7B7ENH9wpDX-j6JpJz6lGVzOkjo0qY/edit#slide=id.g138a6ce16b7_35_18") ``` +There are four DNA sequencing methods discussed in this chapter. The above graph compares WGS, WXS, and Targeted gene sequencing. The last section compares all 4. + +1. Whole genome sequencing (WGS) +2. Whole exome sequencing (WXS) +3. Targeted gene sequencing +4. DNA/SNP microarrays + Compared to WXS and Targeted Gene Sequencing, WGS is the most expensive but requires the lowest depth of coverage to achieve 95% sensitivity. In other words, WGS requires sequencing each region of the genome (3.2 billion bases) 30 times in order to confidently be able to pick up all possible meaningful variants. [@Sims2014] goes into more depth on how these depths are calculated. Alternatively, WXS is a more cost effective way to study the genome, focusing places in the genome that have open reading frames -- aka generally genes that are able to be expressed. This focuses on enriching for exons and not introns so splicing variants may be missed. In this case, each gene must be sequenced 80-100x for sufficient sensitivity to pick up meaningful variants. 
diff --git a/book.bib b/book.bib index f007c8fa..36974ce9 100644 --- a/book.bib +++ b/book.bib @@ -5,6 +5,29 @@ @website{AlexsLemonade2022 year = {2022}, } +@article{Alföldi_Lindblad-Toh_2013, + title={Comparative genomics as a tool to understand evolution and disease}, + volume={23}, + ISSN={1088-9051, 1549-5469}, + url={https://genome.cshlp.org/content/23/7/1063}, + DOI={10.1101/gr.157503.113}, + abstractNote={When the human genome project started, the major challenge was how to sequence a 3 billion letter code in an organized and cost-effective manner. When completed, the project had laid the foundation for a huge variety of biomedical fields through the production of a complete human genome sequence, but also had driven the development of laboratory and analytical methods that could produce large amounts of sequencing data cheaply. These technological developments made possible the sequencing of many more vertebrate genomes, which have been necessary for the interpretation of the human genome. They have also enabled large-scale studies of vertebrate genome evolution, as well as comparative and human medicine. In this review, we give examples of evolutionary analysis using a wide variety of time frames—from the comparison of populations within a species to the comparison of species separated by at least 300 million years. 
Furthermore, we anticipate discoveries related to evolutionary mechanisms, adaptation, and disease to quickly accelerate in the coming years.}, + note={Company: Cold Spring Harbor Laboratory Press + Distributor: Cold Spring Harbor Laboratory Press + Institution: Cold Spring Harbor Laboratory Press + Label: Cold Spring Harbor Laboratory Press + publisher: Cold Spring Harbor Lab + PMID: 23817047}, + number={7}, + journal={Genome Research}, + author={Alföldi, Jessica and Lindblad-Toh, Kerstin}, + year={2013}, + month=jul, + pages={1063–1068}, + language={en} +} + + @article{Aljanahi2018, title = {An {Introduction} to the {Analysis} of {Single}-{Cell} {RNA}-{Sequencing} {Data}}, volume = {10}, @@ -20,6 +43,22 @@ @article{Aljanahi2018 pages = {189--196}, } +@article{Alonge2022, + title={Automated assembly scaffolding using RagTag elevates a new tomato system for high-throughput genome editing}, + volume={23}, + ISSN={1474-760X}, + url={https://doi.org/10.1186/s13059-022-02823-7}, + DOI={10.1186/s13059-022-02823-7}, + abstractNote={Advancing crop genomics requires efficient genetic systems enabled by high-quality personalized genome assemblies. Here, we introduce RagTag, a toolset for automating assembly scaffolding and patching, and we establish chromosome-scale reference genomes for the widely used tomato genotype M82 along with Sweet-100, a new rapid-cycling genotype that we developed to accelerate functional genomics and genome editing in tomato. This work outlines strategies to rapidly expand genetic systems and genomic resources in other plant species.}, + number={1}, + journal={Genome Biology}, + author={Alonge, Michael and Lebeigle, Ludivine and Kirsche, Melanie and Jenike, Katie and Ou, Shujun and Aganezov, Sergey and Wang, Xingang and Lippman, Zachary B. and Schatz, Michael C. 
and Soyk, Sebastian}, + year={2022}, + month=dec, + pages={258} +} + + @article{Amezquita2020, title = {Orchestrating single-cell analysis with {Bioconductor}}, volume = {17}, @@ -55,6 +94,23 @@ @article{Angerer2017 pages = {85--91}, } +@article{Audano2019, + title={Characterizing the Major Structural Variant Alleles of the Human Genome}, + volume={176}, + ISSN={00928674}, + url={https://linkinghub.elsevier.com/retrieve/pii/S0092867418316337}, + DOI={10.1016/j.cell.2018.12.019}, + abstractNote={In order to provide a comprehensive resource for human structural variants (SVs), we generated longread sequence data and analyzed SVs for fifteen human genomes. We sequence resolved 99,604 insertions, deletions, and inversions including 2,238 (1.6 Mbp) that are shared among all discovery genomes with an additional 13,053 (6.9 Mbp) present in the majority, indicating minor alleles or errors in the reference. Genotyping in 440 additional genomes confirms the most common SVs in unique euchromatin are now sequence resolved. We report a ninefold SV bias toward the last 5 Mbp of human chromosomes with nearly 55% of all VNTRs (variable number of tandem repeats) mapping to this portion of the genome. We identify SVs affecting coding and noncoding regulatory loci improving annotation and interpretation of functional variation. These data provide the framework to construct a canonical human reference and a resource for developing advanced representations capable of capturing allelic diversity.}, + number={3}, + journal={Cell}, + author={Audano, Peter A. and Sulovari, Arvis and Graves-Lindsay, Tina A. and Cantsilieris, Stuart and Sorensen, Melanie and Welch, AnneMarie E. and Dougherty, Max L. and Nelson, Bradley J. and Shah, Ankeeta and Dutcher, Susan K. and Warren, Wesley C. and Magrini, Vincent and McGrath, Sean D. and Li, Yang I. and Wilson, Richard K. 
and Eichler, Evan E.}, + year={2019}, + month=jan, + pages={663-675.e19}, + language={en} +} + + @article{BaranGale2018, title = {Experimental design for single-cell {RNA} sequencing}, volume = {17}, @@ -104,6 +160,23 @@ @misc{Bruning2021 year = {2021}, } +@article{Byrska-Bishop2022, + title={High-coverage whole-genome sequencing of the expanded 1000 Genomes Project cohort including 602 trios}, + volume={185}, + ISSN={0092-8674, 1097-4172}, + url={https://www.cell.com/cell/abstract/S0092-8674(22)00991-6}, + DOI={10.1016/j.cell.2022.08.004}, + number={18}, + journal={Cell}, + publisher={Elsevier}, + author={Byrska-Bishop, Marta and Evani, Uday S. and Zhao, Xuefang and Basile, Anna O. and Abel, Haley J. and Regier, Allison A. and Corvelo, André and Clarke, Wayne E. and Musunuri, Rajeeva and Nagulapalli, Kshithija and Fairley, Susan and Runnels, Alexi and Winterkorn, Lara and Lowy, Ernesto and Eichler, Evan E. and Korbel, Jan O. and Lee, Charles and Marschall, Tobias and Devine, Scott E. and Harvey, William T. and Zhou, Weichen and Mills, Ryan E. and Rausch, Tobias and Kumar, Sushant and Alkan, Can and Hormozdiari, Fereydoun and Chong, Zechen and Chen, Yu and Yang, Xiaofei and Lin, Jiadong and Gerstein, Mark B. and Kai, Ye and Zhu, Qihui and Yilmaz, Feyza and Xiao, Chunlin and Flicek, Paul and Germer, Soren and Brand, Harrison and Hall, Ira M. and Talkowski, Michael E. 
and Narzisi, Giuseppe and Zody, Michael C.}, + year={2022}, + month=sep, + pages={3426-3440.e19}, + language={English} +} + + @article{Conesa2016, doi = {10.1186/s13059-016-0881-8}, url = {https://doi.org/10.1186/s13059-016-0881-8}, @@ -145,6 +218,84 @@ @article{Ding2020 journal = {Nature Biotechnology} } +@article{Eichler_2019, + title={Genetic Variation, Comparative Genomics, and the Diagnosis of Disease}, + volume={381}, + ISSN={0028-4793}, + url={https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6681822/}, + DOI={10.1056/NEJMra1809315}, + number={1}, + journal={The New England journal of medicine}, + author={Eichler, Evan E.}, + year={2019}, + month=jul, + pages={64–74} +} + + +@article{Ermini_Driguez_2024, + title={The Application of Long-Read Sequencing to Cancer}, + volume={16}, + rights={http://creativecommons.org/licenses/by/3.0/}, + ISSN={2072-6694}, + url={https://www.mdpi.com/2072-6694/16/7/1275}, + DOI={10.3390/cancers16071275}, + abstractNote={Cancer is a multifaceted disease arising from numerous genomic aberrations that have been identified as a result of advancements in sequencing technologies. While next-generation sequencing (NGS), which uses short reads, has transformed cancer research and diagnostics, it is limited by read length. Third-generation sequencing (TGS), led by the Pacific Biosciences and Oxford Nanopore Technologies platforms, employs long-read sequences, which have marked a paradigm shift in cancer research. Cancer genomes often harbour complex events, and TGS, with its ability to span large genomic regions, has facilitated their characterisation, providing a better understanding of how complex rearrangements affect cancer initiation and progression. TGS has also characterised the entire transcriptome of various cancers, revealing cancer-associated isoforms that could serve as biomarkers or therapeutic targets. 
Furthermore, TGS has advanced cancer research by improving genome assemblies, detecting complex variants, and providing a more complete picture of transcriptomes and epigenomes. This review focuses on TGS and its growing role in cancer research. We investigate its advantages and limitations, providing a rigorous scientific analysis of its use in detecting previously hidden aberrations missed by NGS. This promising technology holds immense potential for both research and clinical applications, with far-reaching implications for cancer diagnosis and treatment.}, + number={7}, + journal={Cancers}, + publisher={Multidisciplinary Digital Publishing Institute}, + author={Ermini, Luca and Driguez, Patrick}, + year={2024}, + month=jan, + pages={1275}, + language={en} +} + +@website{GeneticDiversity, + url={https://kids.frontiersin.org/articles/10.3389/frym.2021.656168}, abstractNote={All living things on Earth contain a unique code within them, called DNA. DNA is organised into genes, similar to the way letters are organised into words. Genes give our bodies instructions on how to function. However, the exact DNA code is different even between individuals within the same species. We call this genetic diversity. Genetic diversity causes differences in the shape of bird beaks, in the flavours of tomatoes, and even in the colour of your hair! Genetic diversity is important because it gives species a better chance of survival. However, genetic diversity can be lost when populations get smaller and isolated, which decreases a species’ ability to adapt and survive. 
In this article, we explore the importance of genetic diversity, discuss how it is formed and maintained in wild populations, how it is lost and why that is dangerous, and what we can do to conserve it.}, + journal={Frontiers for Young Minds}, + title = {What is Genetic Diversity and Why Does it Matter?}, + language={en} +} + +@article{Gershman2022, + title={Epigenetic patterns in a complete human genome}, + volume={376}, + url={https://www.science.org/doi/10.1126/science.abj5089}, + DOI={10.1126/science.abj5089}, + abstractNote={The completion of a telomere-to-telomere human reference genome, T2T-CHM13, has resolved complex regions of the genome, including repetitive and homologous regions. Here, we present a high-resolution epigenetic study of previously unresolved sequences, representing entire acrocentric chromosome short arms, gene family expansions, and a diverse collection of repeat classes. This resource precisely maps CpG methylation (32.28 million CpGs), DNA accessibility, and short-read datasets (166,058 previously unresolved chromatin immunoprecipitation sequencing peaks) to provide evidence of activity across previously unidentified or corrected genes and reveals clinically relevant paralog-specific regulation. Probing CpG methylation across human centromeres from six diverse individuals generated an estimate of variability in kinetochore localization. This analysis provides a framework with which to investigate the most elusive regions of the human genome, granting insights into epigenetic regulation.}, + number={6588}, + journal={Science}, + publisher={American Association for the Advancement of Science}, + author={Gershman, Ariel and Sauria, Michael E. G. and Guitart, Xavi and Vollger, Mitchell R. and Hook, Paul W. and Hoyt, Savannah J. and Jain, Miten and Shumate, Alaina and Razaghi, Roham and Koren, Sergey and Altemose, Nicolas and Caldas, Gina V. and Logsdon, Glennis A. and Rhie, Arang and Eichler, Evan E. and Schatz, Michael C. 
and O’Neill, Rachel J. and Phillippy, Adam M. and Miga, Karen H. and Timp, Winston}, + year={2022}, + month=apr, + pages={eabj5089} +} + +@article{Gershman2023, + title={Genomic insights into metabolic flux in hummingbirds}, + volume={33}, + ISSN={1088-9051, 1549-5469}, + url={https://genome.cshlp.org/content/33/5/703}, + DOI={10.1101/gr.276779.122}, + abstractNote={Hummingbirds are very well adapted to sustain efficient and rapid metabolic shifts. They oxidize ingested nectar to directly fuel flight when foraging but have to switch to oxidizing stored lipids derived from ingested sugars during the night or long-distance migratory flights. Understanding how this organism moderates energy turnover is hampered by a lack of information regarding how relevant enzymes differ in sequence, expression, and regulation. To explore these questions, we generated a chromosome-scale genome assembly of the ruby-throated hummingbird (A. colubris) using a combination of long- and short-read sequencing, scaffolding it using existing assemblies. We then used hybrid long- and short-read RNA sequencing of liver and muscle tissue in fasted and fed metabolic states for a comprehensive transcriptome assembly and annotation. Our genomic and transcriptomic data found positive selection of key metabolic genes in nectivorous avian species and deletion of critical genes (SLC2A4, GCK) involved in glucostasis in other vertebrates. We found expression of a fructose-specific version of SLC2A5 putatively in place of insulin-sensitive SLC2A5, with predicted protein models suggesting affinity for both fructose and glucose. Alternative isoforms may even act to sequester fructose to preclude limitations from transport in metabolism. 
Finally, we identified differentially expressed genes from fasted and fed hummingbirds, suggesting key pathways for the rapid metabolic switch hummingbirds undergo.}, + note={Company: Cold Spring Harbor Laboratory Press + Distributor: Cold Spring Harbor Laboratory Press + Institution: Cold Spring Harbor Laboratory Press + Label: Cold Spring Harbor Laboratory Press + publisher: Cold Spring Harbor Lab + PMID: 37156619}, + number={5}, + journal={Genome Research}, + author={Gershman, Ariel and Hauck, Quinn and Dick, Morag and Jamison, Jerrica M. and Tassia, Michael and Agirrezabala, Xabier and Muhammad, Saad and Ali, Raafay and Workman, Rachael E. and Valle, Mikel and Wong, G. William and Welch, Kenneth C. and Timp, Winston}, + year={2023}, + month=may, + pages={703–714}, + language={en} +} + + @article{Kochmanski2019, author={Kochmanski, Joseph and Savonen, Candace and Bernstein, Alison I.}, article={A Novel Application of Mixed Effects Models for Reconciling Base-Pair Resolution 5-Methylcytosine and 5-Hydroxymethylcytosine Data in Neuroepigenetics}, @@ -170,6 +321,21 @@ @article{Hicks2017 journal = {Biostatistics} } +@article{Hindorff2009, + title={Potential etiologic and functional implications of genome-wide association loci for human diseases and traits}, + volume={106}, + url={https://www.pnas.org/doi/full/10.1073/pnas.0903103106}, + DOI={10.1073/pnas.0903103106}, + abstractNote={We have developed an online catalog of SNP-trait associations from published genome-wide association studies for use in investigating genomic characteristics of trait/disease-associated SNPs (TASs). Reported TASs were common [median risk allele frequency 36%, interquartile range (IQR) 21%−53%] and were associated with modest effect sizes [median odds ratio (OR) 1.33, IQR 1.20–1.61]. 
Among 20 genomic annotation sets, reported TASs were significantly overrepresented only in nonsynonymous sites [OR = 3.9 (2.2−7.0), p = 3.5 × 10−7] and 5kb-promoter regions [OR = 2.3 (1.5−3.6), p = 3 × 10−4] compared to SNPs randomly selected from genotyping arrays. Although 88% of TASs were intronic (45%) or intergenic (43%), TASs were not overrepresented in introns and were significantly depleted in intergenic regions [OR = 0.44 (0.34−0.58), p = 2.0 × 10−9]. Only slightly more TASs than expected by chance were predicted to be in regions under positive selection [OR = 1.3 (0.8−2.1), p = 0.2]. This new online resource, together with bioinformatic predictions of the underlying functionality at trait/disease-associated loci, is well-suited to guide future investigations of the role of common variants in complex disease etiology.}, + number={23}, + journal={Proceedings of the National Academy of Sciences}, + publisher={Proceedings of the National Academy of Sciences}, + author={Hindorff, Lucia A. and Sethupathy, Praveen and Junkins, Heather A. and Ramos, Erin M. and Mehta, Jayashri P. and Collins, Francis S. and Manolio, Teri A.}, + year={2009}, + month=jun, + pages={9362–9367} +} + @article{Hodges2007, doi = {10.1038/ng.2007.42}, url = {https://doi.org/10.1038/ng.2007.42}, @@ -184,6 +350,34 @@ @article{Hodges2007 journal = {Nature Genetics} } +@article{Karczewski2020, + doi = {10.1038/s41586-020-2308-7}, + url = {https://www.nature.com/articles/s41586-020-2308-7#change-history}, + year = {2020}, + month = may, + journal = {Nature}, + language = {en}, + author = {Konrad J. Karczewski and Laurent C. Francioli and Grace Tiao and Beryl B. Cummings and Jessica Alföldi and Qingbo Wang and Ryan L. Collins and Kristen M. Laricchia and Andrea Ganna and Daniel P. Birnbaum and Laura D. Gauthier and Harrison Brand and Matthew Solomonson and Nicholas A. Watts and Daniel Rhodes and Moriel Singer-Berk and Eleina M. England and Eleanor G. Seaby and Jack A. Kosmicki and Raymond K. 
Walters and Katherine Tashman and Yossi Farjoun and Eric Banks and Timothy Poterba and Arcturus Wang and Cotton Seed and Nicola Whiffin and Jessica X. Chong and Kaitlin E. Samocha and Emma Pierce-Hoffman and Zachary Zappala and Anne H. O'Donnell-Luria and Eric Vallabh Minikel and Ben Weisburd and Monkol Lek and James S. Ware and Christopher Vittal and Irina M. Armean and Louis Bergelson and Kristian Cibulskis and Kristen M Connolly and Miguel Covarrubias and Stacey Donnelly and Steven Ferriera and Stacey Gabriel and Jeff Gentry and Namrata Gupta and Thibault Jeandet and Diane Kaplan and Christopher Llanwarne and Ruchi Munshi and Sam Novod and Nikelle Petrillo and David Roazen and Valentin Ruano-Rubio and Andrea Saltzman and Molly Schleicher and Jose Soto and Kathleen Tibbetts and Charlotte Tolonen and Gordon Wade and Michael E. Talkowski and {Genome Aggregation Database Consortium} and Benjamin M. Neale and Mark J. Daly and Daniel G. MacArthur}, + title = {The mutational constraint spectrum quantified from variation in 141,456 humans}, + volume = {581} +} + +@article{Kellis2014, + title={Defining functional DNA elements in the human genome}, + volume={111}, + url={https://www.pnas.org/doi/10.1073/pnas.1318948111}, + DOI={10.1073/pnas.1318948111}, + abstractNote={With the completion of the human genome sequence, attention turned to identifying and annotating its functional DNA elements. As a complement to genetic and comparative genomics approaches, the Encyclopedia of DNA Elements Project was launched to contribute maps of RNA transcripts, transcriptional regulator binding sites, and chromatin states in many cell types. The resulting genome-wide data reveal sites of biochemical activity with high positional resolution and cell type specificity that facilitate studies of gene regulation and interpretation of noncoding variants associated with human disease. 
However, the biochemically active regions cover a much larger fraction of the genome than do evolutionarily conserved regions, raising the question of whether nonconserved but biochemically active regions are truly functional. Here, we review the strengths and limitations of biochemical, evolutionary, and genetic approaches for defining functional DNA segments, potential sources for the observed differences in estimated genomic coverage, and the biological implications of these discrepancies. We also analyze the relationship between signal intensity, genomic coverage, and evolutionary conservation. Our results reinforce the principle that each approach provides complementary information and that we need to use combinations of all three to elucidate genome function in human biology and disease.}, + number={17}, + journal={Proceedings of the National Academy of Sciences}, + publisher={Proceedings of the National Academy of Sciences}, + author={Kellis, Manolis and Wold, Barbara and Snyder, Michael P. and Bernstein, Bradley E. and Kundaje, Anshul and Marinov, Georgi K. and Ward, Lucas D. and Birney, Ewan and Crawford, Gregory E. and Dekker, Job and Dunham, Ian and Elnitski, Laura L. and Farnham, Peggy J. and Feingold, Elise A. and Gerstein, Mark and Giddings, Morgan C. and Gilbert, David M. and Gingeras, Thomas R. and Green, Eric D. and Guigo, Roderic and Hubbard, Tim and Kent, Jim and Lieb, Jason D. and Myers, Richard M. and Pazin, Michael J. and Ren, Bing and Stamatoyannopoulos, John A. and Weng, Zhiping and White, Kevin P. 
and Hardison, Ross C.}, + year={2014}, + month=apr, + pages={6131–6138} +} + + @article{Lafzi2018, title = {Tutorial: guidelines for the experimental design of single-cell {RNA} sequencing studies}, volume = {13}, @@ -204,6 +398,23 @@ @article{Lafzi2018 pages = {2742--2757}, } +@article{Li_Durbin_2024, + title={Genome assembly in the telomere-to-telomere era}, + rights={2024 Springer Nature Limited}, + ISSN={1471-0064}, + url={https://www.nature.com/articles/s41576-024-00718-w}, + DOI={10.1038/s41576-024-00718-w}, + abstractNote={Genome sequences largely determine the biology and encode the history of an organism, and de novo assembly — the process of reconstructing the genome sequence of an organism from sequencing reads — has been a central problem in bioinformatics for four decades. Until recently, genomes were typically assembled into fragments of a few megabases at best, but now technological advances in long-read sequencing enable the near-complete assembly of each chromosome — also known as telomere-to-telomere assembly — for many organisms. Here, we review recent progress on assembly algorithms and protocols, with a focus on how to derive near-telomere-to-telomere assemblies. 
We also discuss the additional developments that will be required to resolve remaining assembly gaps and to assemble non-diploid genomes.}, + journal={Nature Reviews Genetics}, + publisher={Nature Publishing Group}, + author={Li, Heng and Durbin, Richard}, + year={2024}, + month=apr, + pages={1–13}, + language={en} +} + + @article{Luecken2019, title = {Current best practices in single‐cell {RNA}‐seq analysis: a tutorial}, volume = {15}, @@ -246,6 +457,74 @@ @article{Mamanova2010 journal = {Nature Methods} } +@article{Miller2023, + title={Chromosome-level genome and the identification of sex chromosomes in Uloborus diversus}, + volume={12}, + ISSN={2047-217X}, + url={https://doi.org/10.1093/gigascience/giad002}, + DOI={10.1093/gigascience/giad002}, + abstractNote={The orb web is a remarkable example of animal architecture that is observed in families of spiders that diverged over 200 million years ago. While several genomes exist for araneid orb-weavers, none exist for other orb-weaving families, hampering efforts to investigate the genetic basis of this complex behavior. Here we present a chromosome-level genome assembly for the cribellate orb-weaving spider Uloborus diversus. The assembly reinforces evidence of an ancient arachnid genome duplication and identifies complete open reading frames for every class of spidroin gene, which encode the proteins that are the key structural components of spider silks. We identified the 2 X chromosomes for U. diversus and identify candidate sex-determining loci. 
This chromosome-level assembly will be a valuable resource for evolutionary research into the origins of orb-weaving, spidroin evolution, chromosomal rearrangement, and chromosomal sex determination in spiders.}, + journal={GigaScience}, + author={Miller, Jeremiah and Zimin, Aleksey V and Gordus, Andrew}, + year={2023}, + month=jan, + pages={giad002} +} + + +@website{NHGRIfactsheet2022, + year = {2022}, + title = {Genomic Data Science}, + url = {https://www.genome.gov/about-genomics/fact-sheets/Genomic-Data-Science}, + author = {NHGRI} +} + +@website{NHGRIGlossary2024, + year = {2024}, + title = {Genome}, + url = {https://www.genome.gov/genetics-glossary/Genome}, + author = {NHGRI} +} + +@website{NHSFrost2022, + year = {2022}, + title = {Constitutional (germline) vs somatic (tumour) variants}, + url = {https://www.genomicseducation.hee.nhs.uk/genotes/knowledge-hub/constitutional-germline-vs-somatic-tumour-variants/}, + author = {Dr Amy Frost}, + publisher = {NHS} +} + +@article{Nurk2022, + title={The complete sequence of a human genome}, + volume={376}, + url={https://www.science.org/doi/10.1126/science.abj6987}, + DOI={10.1126/science.abj6987}, + abstractNote={Since its initial release in 2000, the human reference genome has covered only the euchromatic fraction of the genome, leaving important heterochromatic regions unfinished. Addressing the remaining 8% of the genome, the Telomere-to-Telomere (T2T) Consortium presents a complete 3.055 billion–base pair sequence of a human genome, T2T-CHM13, that includes gapless assemblies for all chromosomes except Y, corrects errors in the prior references, and introduces nearly 200 million base pairs of sequence containing 1956 gene predictions, 99 of which are predicted to be protein coding. 
The completed regions include all centromeric satellite arrays, recent segmental duplications, and the short arms of all five acrocentric chromosomes, unlocking these complex regions of the genome to variational and functional studies.}, + number={6588}, + journal={Science}, + publisher={American Association for the Advancement of Science}, + author={Nurk, Sergey and Koren, Sergey and Rhie, Arang and Rautiainen, Mikko and Bzikadze, Andrey V. and Mikheenko, Alla and Vollger, Mitchell R. and Altemose, Nicolas and Uralsky, Lev and Gershman, Ariel and Aganezov, Sergey and Hoyt, Savannah J. and Diekhans, Mark and Logsdon, Glennis A. and Alonge, Michael and Antonarakis, Stylianos E. and Borchers, Matthew and Bouffard, Gerard G. and Brooks, Shelise Y. and Caldas, Gina V. and Chen, Nae-Chyun and Cheng, Haoyu and Chin, Chen-Shan and Chow, William and de Lima, Leonardo G. and Dishuck, Philip C. and Durbin, Richard and Dvorkina, Tatiana and Fiddes, Ian T. and Formenti, Giulio and Fulton, Robert S. and Fungtammasan, Arkarachai and Garrison, Erik and Grady, Patrick G. S. and Graves-Lindsay, Tina A. and Hall, Ira M. and Hansen, Nancy F. and Hartley, Gabrielle A. and Haukness, Marina and Howe, Kerstin and Hunkapiller, Michael W. and Jain, Chirag and Jain, Miten and Jarvis, Erich D. and Kerpedjiev, Peter and Kirsche, Melanie and Kolmogorov, Mikhail and Korlach, Jonas and Kremitzki, Milinn and Li, Heng and Maduro, Valerie V. and Marschall, Tobias and McCartney, Ann M. and McDaniel, Jennifer and Miller, Danny E. and Mullikin, James C. and Myers, Eugene W. and Olson, Nathan D. and Paten, Benedict and Peluso, Paul and Pevzner, Pavel A. and Porubsky, David and Potapova, Tamara and Rogaev, Evgeny I. and Rosenfeld, Jeffrey A. and Salzberg, Steven L. and Schneider, Valerie A. and Sedlazeck, Fritz J. and Shafin, Kishwar and Shew, Colin J. and Shumate, Alaina and Sims, Ying and Smit, Arian F. A. and Soto, Daniela C. and Sović, Ivan and Storer, Jessica M. 
and Streets, Aaron and Sullivan, Beth A. and Thibaud-Nissen, Françoise and Torrance, James and Wagner, Justin and Walenz, Brian P. and Wenger, Aaron and Wood, Jonathan M. D. and Xiao, Chunlin and Yan, Stephanie M. and Young, Alice C. and Zarate, Samantha and Surti, Urvashi and McCoy, Rajiv C. and Dennis, Megan Y. and Alexandrov, Ivan A. and Gerton, Jennifer L. and O’Neill, Rachel J. and Timp, Winston and Zook, Justin M. and Schatz, Michael C. and Eichler, Evan E. and Miga, Karen H. and Phillippy, Adam M.}, + year={2022}, + month=apr, + pages={44–53} +} + +@article{Pearson2013, + title={An Introduction to Sequence Similarity (“Homology”) Searching}, + volume={42}, + rights={http://onlinelibrary.wiley.com/termsAndConditions#vor}, + ISSN={1934-3396, 1934-340X}, + url={https://currentprotocols.onlinelibrary.wiley.com/doi/10.1002/0471250953.bi0301s42}, + DOI={10.1002/0471250953.bi0301s42}, + abstractNote={Sequence similarity searching, typically with BLAST (units 3.3, 3.4), is the most widely used, and most reliable, strategy for characterizing newly determined sequences. Sequence similarity searches can identify ”homologous” proteins or genes by detecting excess similarity – statistically significant similarity that reflects common ancestry. This unit provides an overview of the inference of homology from significant similarity, and introduces other units in this chapter that provide more details on effective strategies for identifying homologs.}, + number={1}, + journal={Current Protocols in Bioinformatics}, + author={Pearson, William R.}, + year={2013}, + month=jun, + language={en} +} + @article{Rao2019, doi = {10.3389/fgene.2018.00636}, url = {https://doi.org/10.3389/fgene.2018.00636}, @@ -265,6 +544,25 @@ @website{refinebioexamples2019 author = {CCDL for ALSF}, } +@article{Rhie2023, + title={The complete sequence of a human Y chromosome}, + volume={621}, + rights={2023 This is a U.S. 
Government work and not under copyright protection in the US; foreign copyright protection may apply}, + ISSN={1476-4687}, + url={https://www.nature.com/articles/s41586-023-06457-y}, + DOI={10.1038/s41586-023-06457-y}, + abstractNote={The human Y chromosome has been notoriously difficult to sequence and assemble because of its complex repeat structure that includes long palindromes, tandem repeats and segmental duplications. As a result, more than half of the Y chromosome is missing from the GRCh38 reference sequence and it remains the last human chromosome to be finished. Here, the Telomere-to-Telomere (T2T) consortium presents the complete 62,460,029-base-pair sequence of a human Y chromosome from the HG002 genome (T2T-Y) that corrects multiple errors in GRCh38-Y and adds over 30 million base pairs of sequence to the reference, showing the complete ampliconic structures of gene families TSPY, DAZ and RBMY; 41 additional protein-coding genes, mostly from the TSPY family; and an alternating pattern of human satellite 1 and 3 blocks in the heterochromatic Yq12 region. We have combined T2T-Y with a previous assembly of the CHM13 genome and mapped available population variation, clinical variants and functional genomics data to produce a complete and comprehensive reference sequence for all 24 human chromosomes.}, + number={7978}, + journal={Nature}, + publisher={Nature Publishing Group}, + author={Rhie, Arang and Nurk, Sergey and Cechova, Monika and Hoyt, Savannah J. and Taylor, Dylan J. and Altemose, Nicolas and Hook, Paul W. and Koren, Sergey and Rautiainen, Mikko and Alexandrov, Ivan A. and Allen, Jamie and Asri, Mobin and Bzikadze, Andrey V. and Chen, Nae-Chyun and Chin, Chen-Shan and Diekhans, Mark and Flicek, Paul and Formenti, Giulio and Fungtammasan, Arkarachai and Garcia Giron, Carlos and Garrison, Erik and Gershman, Ariel and Gerton, Jennifer L. and Grady, Patrick G. S. 
and Guarracino, Andrea and Haggerty, Leanne and Halabian, Reza and Hansen, Nancy F. and Harris, Robert and Hartley, Gabrielle A. and Harvey, William T. and Haukness, Marina and Heinz, Jakob and Hourlier, Thibaut and Hubley, Robert M. and Hunt, Sarah E. and Hwang, Stephen and Jain, Miten and Kesharwani, Rupesh K. and Lewis, Alexandra P. and Li, Heng and Logsdon, Glennis A. and Lucas, Julian K. and Makalowski, Wojciech and Markovic, Christopher and Martin, Fergal J. and Mc Cartney, Ann M. and McCoy, Rajiv C. and McDaniel, Jennifer and McNulty, Brandy M. and Medvedev, Paul and Mikheenko, Alla and Munson, Katherine M. and Murphy, Terence D. and Olsen, Hugh E. and Olson, Nathan D. and Paulin, Luis F. and Porubsky, David and Potapova, Tamara and Ryabov, Fedor and Salzberg, Steven L. and Sauria, Michael E. G. and Sedlazeck, Fritz J. and Shafin, Kishwar and Shepelev, Valery A. and Shumate, Alaina and Storer, Jessica M. and Surapaneni, Likhitha and Taravella Oill, Angela M. and Thibaud-Nissen, Françoise and Timp, Winston and Tomaszkiewicz, Marta and Vollger, Mitchell R. and Walenz, Brian P. and Watwood, Allison C. and Weissensteiner, Matthias H. and Wenger, Aaron M. and Wilson, Melissa A. and Zarate, Samantha and Zhu, Yiming and Zook, Justin M. and Eichler, Evan E. and O’Neill, Rachel J. and Schatz, Michael C. and Miga, Karen H. and Makova, Kateryna D. 
and Phillippy, Adam M.}, + year={2023}, + month=sep, + pages={344–354}, + language={en} +} + + @Manual{rmarkdown2021, title = {rmarkdown: Dynamic Documents for R}, author = {JJ Allaire and Yihui Xie and Jonathan McPherson and Javier Luraschi and Kevin Ushey and Aron Atkins and Hadley Wickham and Joe Cheng and Winston Chang and Richard Iannone}, @@ -273,6 +571,44 @@ @Manual{rmarkdown2021 url = {https://github.com/rstudio/rmarkdown}, } +@article{Schatz2010, + title={Assembly of large genomes using second-generation sequencing}, + volume={20}, + ISSN={1088-9051, 1549-5469}, + url={https://genome.cshlp.org/content/20/9/1165}, + DOI={10.1101/gr.101360.109}, + abstractNote={Second-generation sequencing technology can now be used to sequence an entire human genome in a matter of days and at low cost. Sequence read lengths, initially very short, have rapidly increased since the technology first appeared, and we now are seeing a growing number of efforts to sequence large genomes de novo from these short reads. In this Perspective, we describe the issues associated with short-read assembly, the different types of data produced by second-gen sequencers, and the latest assembly algorithms designed for these data. We also review the genomes that have been assembled recently from short reads and make recommendations for sequencing strategies that will yield a high-quality assembly.}, + note={Company: Cold Spring Harbor Laboratory Press + Distributor: Cold Spring Harbor Laboratory Press + Institution: Cold Spring Harbor Laboratory Press + Label: Cold Spring Harbor Laboratory Press + publisher: Cold Spring Harbor Lab + PMID: 20508146}, + number={9}, + journal={Genome Research}, + author={Schatz, Michael C. and Delcher, Arthur L. 
and Salzberg, Steven L.}, + year={2010}, + month=sep, + pages={1165–1173}, + language={en} +} + +@article{Sistrom2016, + title={De Novo Genome Assembly Shows Genome Wide Similarity between Trypanosoma brucei brucei and Trypanosoma brucei rhodesiense}, + volume={11}, + ISSN={1932-6203}, + url={https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0147660}, + DOI={10.1371/journal.pone.0147660}, + abstractNote={Background Trypanosoma brucei is a eukaryotic pathogen which causes African trypanosomiasis. It is notable for its variant surface glycoprotein (VSG) coat, which undergoes antigenic variation enabled by a large suite of VSG pseudogenes, allowing for persistent evasion of host adaptive immunity. While Trypanosoma brucei rhodesiense (Tbr) and T. b gambiense (Tbg) are human infective, related T. b. brucei (Tbb) is cleared by human sera. A single gene, the Serum Resistance Associated (SRA) gene, confers Tbr its human infectivity phenotype. Potential genetic recombination of this gene between Tbr and non-human infective Tbb strains has significant epidemiological consequences for Human African Trypanosomiasis outbreaks. Results Using long and short read whole genome sequencing, we generated a hybrid de novo assembly of a Tbr strain, producing 4,210 scaffolds totaling approximately 38.8 megabases, which comprise a significant proportion of the Tbr genome, and thus represents a valuable tool for a comparative genomics analyses among human and non-human infective T. brucei and future complete genome assembly. We detected 5,970 putative genes, of which two, an alcohol oxidoreductase and a pentatricopeptide repeat-containing protein, were members of gene families common to all T. brucei subspecies, but variants specific to the Tbr strain sequenced in this study. Our findings confirmed the extremely high level of genomic similarity between the two parasite subspecies found in other studies. 
Conclusions We confirm at the whole genome level high similarity between the two Tbb and Tbr strains studied. The discovery of extremely minor genomic differentiation between Tbb and Tbr suggests that the transference of the SRA gene via genetic recombination could potentially result in novel human infective strains, thus all genetic backgrounds of T. brucei should be considered potentially human infective in regions where Tbr is prevalent.}, number={2}, + journal={PLOS ONE}, + publisher={Public Library of Science}, + author={Sistrom, Mark and Evans, Benjamin and Benoit, Joshua and Balmer, Oliver and Aksoy, Serap and Caccone, Adalgisa}, + year={2016}, + month=feb, + pages={e0147660}, + language={en} +} + @article{Svensson2017, doi = {10.1038/nmeth.4220}, url = {https://doi.org/10.1038/nmeth.4220}, @@ -309,6 +645,34 @@ @article{Tarca2006 url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2435252/} } +@article{Taylor2006, + title={ESPERR: Learning strong and weak signals in genomic sequence alignments to identify functional elements}, + volume={16}, + ISSN={1088-9051}, + url={https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1665643/}, + DOI={10.1101/gr.4537706}, + abstractNote={Genomic sequence signals—such as base composition, presence of particular motifs, or evolutionary constraint—have been used effectively to identify functional elements. However, approaches based only on specific signals known to correlate with function can be quite limiting. When training data are available, application of computational learning algorithms to multispecies alignments has the potential to capture broader and more informative sequence and evolutionary patterns that better characterize a class of elements. However, effective exploitation of patterns in multispecies alignments is impeded by the vast number of possible alignment columns and by a limited understanding of which particular strings of columns may characterize a given class. 
We have developed a computational method, called ESPERR (evolutionary and sequence pattern extraction through reduced representations), which uses training examples to learn encodings of multispecies alignments into reduced forms tailored for the prediction of chosen classes of functional elements. ESPERR produces a greatly improved Regulatory Potential score, which can discriminate regulatory regions from neutral sites with excellent accuracy (∼94%). This score captures strong signals (GC content and conservation), as well as subtler signals (with small contributions from many different alignment patterns) that characterize the regulatory elements in our training set. ESPERR is also effective for predicting other classes of functional elements, as we show for DNaseI hypersensitive sites and highly conserved regions with developmental enhancer activity. Our software, training data, and genome-wide predictions are available from our Web site (http://www.bx.psu.edu/projects/esperr).}, + number={12}, + journal={Genome Research}, + author={Taylor, James and Tyekucheva, Svitlana and King, David C. and Hardison, Ross C. and Miller, Webb and Chiaromonte, Francesca}, + year={2006}, + month=dec, + pages={1596–1604} +} + +@article{Taylor_2024, + title={Beyond the Human Genome Project: The Age of Complete Human Genome Sequences and Pangenome References}, + ISSN={1527-8204, 1545-293X}, + url={https://www.annualreviews.org/content/journals/10.1146/annurev-genom-021623-081639}, + DOI={10.1146/annurev-genom-021623-081639}, + abstractNote={The Human Genome Project was an enormous accomplishment, providing a foundation for countless explorations into the genetics and genomics of the human species. Yet for many years, the human genome reference sequence remained incomplete and lacked representation of human genetic diversity. 
Recently, two major advances have emerged to address these shortcomings: complete gap-free human genome sequences, such as the one developed by the Telomere-to-Telomere Consortium, and high-quality pangenomes, such as the one developed by the Human Pangenome Reference Consortium. Facilitated by advances in long-read DNA sequencing and genome assembly algorithms, complete human genome sequences resolve regions that have been historically difficult to sequence, including centromeres, telomeres, and segmental duplications. In parallel, pangenomes capture the extensive genetic diversity across populations worldwide. Together, these advances usher in a new era of genomics research, enhancing the accuracy of genomic analysis, paving the path for precision medicine, and contributing to deeper insights into human biology.}, + journal={Annual Review of Genomics and Human Genetics}, + author={Taylor, Dylan J. and Eizenga, Jordan M. and Li, Qiuhui and Das, Arun and Jenike, Katharine M. and Kenny, Eimear E. and Miga, Karen H. and Monlong, Jean and McCoy, Rajiv C. and Paten, Benedict and Schatz, Michael C.}, + year={2024}, + month=apr, + language={en} +} + @article{Turner2009, doi = {10.1146/annurev-genom-082908-150112}, url = {https://doi.org/10.1146/annurev-genom-082908-150112}, @@ -323,6 +687,22 @@ @article{Turner2009 journal = {Annual Review of Genomics and Human Genetics} } +@article{Wong2011, + title={Unraveling the Genetics of Cancer: Genome Sequencing and Beyond}, + volume={12}, + ISSN={1527-8204, 1545-293X}, + url={https://www.annualreviews.org/doi/10.1146/annurev-genom-082509-141532}, + DOI={10.1146/annurev-genom-082509-141532}, + abstractNote={Advances in next-generation sequencing technology are enabling the systematic analyses of whole cancer genomes, providing insights into the landscape of somatic mutations and the great genetic heterogeneity that defines the unique signature of an individual tumor. 
Moreover, integrated studies of the genome, epigenome, and transcriptome reveal mechanisms of tumorigenesis at multiple levels. Progress in sequencing technologies and bioinformatics will improve the costs, sensitivity, and accuracy of detecting somatic mutations, while large-scale projects are underway to coordinate cancer genome sequencing at the global level to facilitate the generation and dissemination of high-quality uniform genetic data. These developments will create opportunities for deeper studies of cancer genetics and the clinical application of genome sequencing, and will motivate further research in cancer pathogenesis.}, + number={1}, + journal={Annual Review of Genomics and Human Genetics}, + author={Wong, Kit Man and Hudson, Thomas J. and McPherson, John D.}, + year={2011}, + month=sep, + pages={407–430}, + language={en} +} + @Book{Xie2018, title = {R Markdown: The Definitive Guide}, author = {Yihui Xie and J.J. Allaire and Garrett Grolemund}, @@ -333,6 +713,21 @@ @Book{Xie2018 url = {https://bookdown.org/yihui/rmarkdown}, } +@article{Xiao2022, + title={Personalized genome assembly for accurate cancer somatic mutation discovery using tumor-normal paired reference samples}, + volume={23}, + ISSN={1474-760X}, + url={https://doi.org/10.1186/s13059-022-02803-x}, + DOI={10.1186/s13059-022-02803-x}, + abstractNote={The use of a personalized haplotype-specific genome assembly, rather than an unrelated, mosaic genome like GRCh38, as a reference for detecting the full spectrum of somatic events from cancers has long been advocated but has never been explored in tumor-normal paired samples. 
Here, we provide the first demonstrated use of de novo assembled personalized genome as a reference for cancer mutation detection and quantifying the effects of the reference genomes on the accuracy of somatic mutation detection.}, + number={1}, + journal={Genome Biology}, + author={Xiao, Chunlin and Chen, Zhong and Chen, Wanqiu and Padilla, Cory and Colgan, Michael and Wu, Wenjun and Fang, Li-Tai and Liu, Tiantian and Yang, Yibin and Schneider, Valerie and Wang, Charles and Xiao, Wenming}, + year={2022}, + month=nov, + pages={237} +} + @Book{Xie2020, title = {R Markdown Cookbook}, author = {Yihui Xie and Christophe Dervieux and Emily Riederer}, @@ -370,6 +765,23 @@ @article{Zhang2015 journal = {Genome Biology} } +@article{Zhang2022, + title={Cancer Genomic Rearrangements and Copy Number Alterations from Errors in Cell Division}, + volume={6}, + rights={http://creativecommons.org/licenses/by/4.0/}, + ISSN={2472-3428, 2472-3428}, + url={https://www.annualreviews.org/doi/10.1146/annurev-cancerbio-070620-094029}, + DOI={10.1146/annurev-cancerbio-070620-094029}, + abstractNote={Analysis of cancer genomes has shown that a large fraction of chromosomal changes originate from catastrophic events including whole-genome duplication, chromothripsis, breakage-fusion-bridge cycles, and chromoplexy. Through sophisticated computational analysis of cancer genomes and experimental recapitulation of these catastrophic alterations, we have gained significant insights into the origin, mechanism, and evolutionary dynamics of cancer genome complexity. 
In this review, we summarize this progress and survey the major unresolved questions, with particular emphasis on the relative contributions of chromosome fragmentation and DNA replication errors to complex chromosomal alterations.}, + number={1}, + journal={Annual Review of Cancer Biology}, + author={Zhang, Cheng-Zhong and Pellman, David}, + year={2022}, + month=apr, + pages={245–268}, + language={en} +} + @article{Ziemann2016, doi = {10.1186/s13059-016-1044-7}, url = {https://doi.org/10.1186/s13059-016-1044-7}, From fafc32f1f58209076a526bdf4ceaaf39fad26234 Mon Sep 17 00:00:00 2001 From: Kate Isaac <41767733+kweav@users.noreply.github.com> Date: Mon, 13 May 2024 20:49:24 -0400 Subject: [PATCH 2/7] addressed spelling and url check errors --- 11c-ChIP-Seq.Rmd | 2 +- 11d-CUT-and-RUN.Rmd | 2 +- resources/dictionary.txt | 7 +++++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/11c-ChIP-Seq.Rmd b/11c-ChIP-Seq.Rmd index 05e609ef..bb716d64 100644 --- a/11c-ChIP-Seq.Rmd +++ b/11c-ChIP-Seq.Rmd @@ -118,7 +118,7 @@ Annotation ### Tools for preprocessing -- [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic) is a widely used tool for trimming and filtering Illumina sequencing data. It is often used to remove low-quality reads, adapter sequences, and other artifacts that can affect downstream analysis. +- [Trimmomatic](http://www.usadellab.org/cms/index.php?page=trimmomatic) is a widely used tool for trimming and filtering Illumina sequencing data. It is often used to remove low-quality reads, adapter sequences, and other artifacts that can affect downstream analysis. - [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) is another popular tool for trimming adapter sequences from high-throughput sequencing data. It is particularly useful for removing adapters that contain degenerate nucleotides or that have been ligated with variable lengths. 
- [Bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) is a fast and memory-efficient tool for aligning sequencing reads to a reference genome. It is often used to map ChIP-Seq reads to the genome prior to peak calling. - [SAMtools](http://www.htslib.org/) is a suite of tools for manipulating SAM/BAM files, which are commonly used to store alignment data from high-throughput sequencing experiments. It can be used for filtering and sorting reads, as well as for generating summary statistics. diff --git a/11d-CUT-and-RUN.Rmd b/11d-CUT-and-RUN.Rmd index fd446c07..2669e94a 100644 --- a/11d-CUT-and-RUN.Rmd +++ b/11d-CUT-and-RUN.Rmd @@ -48,7 +48,7 @@ ottrpal::include_slide("https://docs.google.com/presentation/d/1YwxXy2rnUgbx_7B7 ### CUT&RUN -**Cleavage Under Targets and Release Using Nuclease**, **CUT&RUN** for short, is an antibody-targeted chromatin profiling method to measure the histone modification enrichment or transcription factor binding. This is a more advanced technology for epigenomic landscape profiling compared to the tradditional ChIP-seq technology and known for its easy implementation and low cost. The procedure is carried out in situ where micrococcal nuclease tethered to protein A binds to an antibody of choice and cuts immediately adjacent DNA, releasing DNA-bound to the antibody target. Therefore, CUT&RUN produces precise transcription factor or histone modification profiles while avoiding crosslinking and solubilization issues. Extremely low backgrounds make profiling possible with typically one-tenth of the sequencing depth required for ChIP-seq and permit profiling using low cell numbers (i.e., a few hundred cells) without losing quality. +**Cleavage Under Targets and Release Using Nuclease**, **CUT&RUN** for short, is an antibody-targeted chromatin profiling method to measure the histone modification enrichment or transcription factor binding. 
This is a more advanced technology for epigenomic landscape profiling compared to the traditional ChIP-seq technology and known for its easy implementation and low cost. The procedure is carried out in situ where micrococcal nuclease tethered to protein A binds to an antibody of choice and cuts immediately adjacent DNA, releasing DNA-bound to the antibody target. Therefore, CUT&RUN produces precise transcription factor or histone modification profiles while avoiding crosslinking and solubilization issues. Extremely low backgrounds make profiling possible with typically one-tenth of the sequencing depth required for ChIP-seq and permit profiling using low cell numbers (i.e., a few hundred cells) without losing quality. diff --git a/resources/dictionary.txt b/resources/dictionary.txt index 5667a16d..a66596e7 100644 --- a/resources/dictionary.txt +++ b/resources/dictionary.txt @@ -1,3 +1,4 @@ +adaptor AutoCUT bacterially basepair @@ -59,6 +60,7 @@ SEACR solubilization sonicated supernatant +SVs tagmentation tagmented TIPseq @@ -83,10 +85,12 @@ MERFISH MERSCOPE mIF NMF +pangenome permeabilization phenotyping piecharts pkc +pseudogene RNAseq ROI ROIs @@ -105,6 +109,9 @@ Statistcial STdata STdeconvolve stroma +Telomere +transcriptionally +translocations TMAs visium Xenium From c8bc8b657a580dbb780d68780d4d76546ea0d4dd Mon Sep 17 00:00:00 2001 From: Kate Isaac <41767733+kweav@users.noreply.github.com> Date: Mon, 13 May 2024 22:09:43 -0400 Subject: [PATCH 3/7] add comma --- book.bib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book.bib b/book.bib index 36974ce9..7c7aa0c5 100644 --- a/book.bib +++ b/book.bib @@ -357,7 +357,7 @@ @article{Karczewski2020 month = {May}, journal = {Nature}, language = {en}, - author = {Konrad J. Karczewski and Laurent C. Francioli and Grace Tiao and Beryl B. Cummings and Jessica Alföldi and Qingbo Wang and Ryan L. Collins and Kristen M. Laricchia and Andrea Ganna and Daniel P. Birnbaum and Laura D. 
Gauthier and Harrison Brand and Matthew Solomonson and Nicholas A. Watts and Daniel Rhodes and Moriel Singer-Berk and Eleina M. England and Eleanor G. Seaby and Jack A. Kosmicki, and Raymond K. Walters and Katherine Tashman and Yossi Farjoun and Eric Banks and Timothy Poterba and Arcturus Wang and Cotton Seed and Nicola Whiffin and Jessica X. Chong and Kaitlin E. Samocha and Emma Pierce-Hoffman and Zachary Zappala and Anne H. O'Donnell-Luria and Eric Vallabh Minikel and ben Weisburd and Monkol Lek and James S. Ware and Christopher Vittal and Irina M. Armean and Louis Bergelson and Kristian Cibulskis and Kristen M Connolly and Miguel Covarrubias and Stacey Donnelly and Steven Ferriera and Stacey Gabriel and Jeff Gentry and Namrata Gupta and Thibault Jeandet and Diane Kaplan and Christopher Llanwarne and Ruchi Munshi and Sam Novod and Nikelle Petrillo and David Roazen and Valentin Ruano-Rubio and Andrea Saltzman and Molly Schleicher and Jose Soto and Kathleen Tibbetts and Charlotte Tolonen and Gordon Wade and Michael E. Talkowski and and Genome Aggregation Database Consortium and Benjamin M. Neale and Mark J. Daly and Daniel G. MacArthur} + author = {Konrad J. Karczewski and Laurent C. Francioli and Grace Tiao and Beryl B. Cummings and Jessica Alföldi and Qingbo Wang and Ryan L. Collins and Kristen M. Laricchia and Andrea Ganna and Daniel P. Birnbaum and Laura D. Gauthier and Harrison Brand and Matthew Solomonson and Nicholas A. Watts and Daniel Rhodes and Moriel Singer-Berk and Eleina M. England and Eleanor G. Seaby and Jack A. Kosmicki and Raymond K. Walters and Katherine Tashman and Yossi Farjoun and Eric Banks and Timothy Poterba and Arcturus Wang and Cotton Seed and Nicola Whiffin and Jessica X. Chong and Kaitlin E. Samocha and Emma Pierce-Hoffman and Zachary Zappala and Anne H. O'Donnell-Luria and Eric Vallabh Minikel and Ben Weisburd and Monkol Lek and James S. Ware and Christopher Vittal and Irina M. 
Armean and Louis Bergelson and Kristian Cibulskis and Kristen M Connolly and Miguel Covarrubias and Stacey Donnelly and Steven Ferriera and Stacey Gabriel and Jeff Gentry and Namrata Gupta and Thibault Jeandet and Diane Kaplan and Christopher Llanwarne and Ruchi Munshi and Sam Novod and Nikelle Petrillo and David Roazen and Valentin Ruano-Rubio and Andrea Saltzman and Molly Schleicher and Jose Soto and Kathleen Tibbetts and Charlotte Tolonen and Gordon Wade and Michael E. Talkowski and {Genome Aggregation Database Consortium} and Benjamin M. Neale and Mark J. Daly and Daniel G. MacArthur}, title = {The mutational constraint spectrum quantified from variation in 141,456 humans}, volume = {581} } From c1378a37be43e5637e1023bdb6c2749a073fcb28 Mon Sep 17 00:00:00 2001 From: Kate Isaac <41767733+kweav@users.noreply.github.com> Date: Mon, 13 May 2024 22:26:24 -0400 Subject: [PATCH 4/7] fix reference link typos --- 09-DNA.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/09-DNA.Rmd b/09-DNA.Rmd index 9555079b..064dab4c 100644 --- a/09-DNA.Rmd +++ b/09-DNA.Rmd @@ -23,7 +23,7 @@ There are several larger goals behind DNA sequencing experiments ranging from as * Assembling whole genomes: - Because an organism's genome determines how an organism develops and functions [@NHGRIGlossary2024], an important task in the genomics field is assembling the genome of an organism from sequencing reads. This assembly process attempts to reconstruct how the sequencing reads overlap or fit together [@Schatz2010; @Li_Durbin_2024]. Recent examples of genome assembly in the genomics field include a complete 3.055 billion-base pair sequence of the human reference genome which was published by the Telomere-to-Telomere (T2T) Consortium [-@Nurk2022], the T2T-CHM13 version (followed not long after by the complete sequence of the human Y chromosome [-@Rhie2023]). 
A goal of the field is to better capture human genetic diversity by creating a reference pangenome, assembled from multiple donors within the population [-@Taylor2024]. Genome assemblies are an important part of genomics beyond human genomics research; there are reference gnomes available for most model organisms as well as many plants, animals, and pathogens, with more and more being published at a high frequency [@Miller2023; @Alonge2022; @Gershman2023; @Sistrom2016]. These reference genomes each act as an extensive compilation of the observed DNA sequence of genes, regulatory elements, etc. and the related coordinate systems for these elements, such that, for the corresponding organism, sequencing reads from other experiments can be mapped or aligned to the reference in order to localize where that read was in the genome. In the case of cancer informatics, a recent approach utilized personalized genome assembly to more accurately detect tumor somatic mutations. This is likely to be an area of future research for application in precision medicine [@Xiao2022; @Ermini_Driguez_2024]. + Because an organism's genome determines how an organism develops and functions [@NHGRIGlossary2024], an important task in the genomics field is assembling the genome of an organism from sequencing reads. This assembly process attempts to reconstruct how the sequencing reads overlap or fit together [@Schatz2010; @Li_Durbin_2024]. Recent examples of genome assembly in the genomics field include a complete 3.055 billion-base pair sequence of the human reference genome which was published by the Telomere-to-Telomere (T2T) Consortium [-@Nurk2022], the T2T-CHM13 version (followed not long after by the complete sequence of the human Y chromosome [-@Rhie2023]). A goal of the field is to better capture human genetic diversity by creating a reference pangenome, assembled from multiple donors within the population [-@Taylor_2024]. 
Genome assemblies are an important part of genomics beyond human genomics research; there are reference genomes available for most model organisms as well as many plants, animals, and pathogens, with more and more being published at a high frequency [@Miller2023; @Alonge2022; @Gershman2023; @Sistrom2016]. These reference genomes each act as an extensive compilation of the observed DNA sequence of genes, regulatory elements, etc. and the related coordinate systems for these elements, such that, for the corresponding organism, sequencing reads from other experiments can be mapped or aligned to the reference in order to localize where that read was in the genome. In the case of cancer informatics, a recent approach utilized personalized genome assembly to more accurately detect tumor somatic mutations. This is likely to be an area of future research for application in precision medicine [@Xiao2022; @Ermini_Driguez_2024]. * Identifying variation: @@ -35,7 +35,7 @@ There are several larger goals behind DNA sequencing experiments ranging from as * Comparative genomics - A common saying in the genomics field is that structure determines function and conserved structure may be constrained such that there is an important function which needs to be conserved [@Alföldi_Linblad-Toh_2013]. Further, similarities in structure may be due to shared ancestry through the processes of evolution; therefore, some comparative genomics studies aim to infer homology or an evolutionary relationship from structural similarity [@Pearson2013]. More pertinent to the topics discussed previously, comparative genomics studies are also useful for identifying functional elements [@Taylor2006] and variants associated with disease (e.g., by comparing the genomes of those with the disease and those without it and identifying differences) [@Alföldi_Lindblad-Toh_2013; @Eichler_2019].
+ A common saying in the genomics field is that structure determines function and conserved structure may be constrained such that there is an important function which needs to be conserved [@Alföldi_Lindblad-Toh_2013]. Further, similarities in structure may be due to shared ancestry through the processes of evolution; therefore, some comparative genomics studies aim to infer homology or an evolutionary relationship from structural similarity [@Pearson2013]. More pertinent to the topics discussed previously, comparative genomics studies are also useful for identifying functional elements [@Taylor2006] and variants associated with disease (e.g., by comparing the genomes of those with the disease and those without it and identifying differences) [@Alföldi_Lindblad-Toh_2013; @Eichler_2019]. ## Comparison of DNA methods From 79ad0dd0139b12ec2e8d5c62076009ffc6c4d12c Mon Sep 17 00:00:00 2001 From: Kate Isaac <41767733+kweav@users.noreply.github.com> Date: Mon, 13 May 2024 22:27:12 -0400 Subject: [PATCH 5/7] add back colon --- 09-DNA.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/09-DNA.Rmd b/09-DNA.Rmd index 064dab4c..f40b849f 100644 --- a/09-DNA.Rmd +++ b/09-DNA.Rmd @@ -33,7 +33,7 @@ There are several larger goals behind DNA sequencing experiments ranging from as Genomes contain more than just genes (the coding sequences that will be transcribed and translated into a protein); they also contain functional elements such as promoters, enhancers, or silencers that modulate the expression of genes [@Kellis2014]. Further, differential gene expression is the phenomenon by which cells with the same DNA sequence show different patterns of gene expression. Functional genomic analyses aim to better understand differential gene expression and the impact of genetic variation found in functional elements. For example, many human genetic variants associated with common traits and diseases are localized in or near known functional elements [@Hindorff2009]. 
These variants may impact gene expression due to either changes in transcription factor binding at that site, or resulting epigenetic changes, which are defined as chemical modifications of chromatin or nucleotides beyond the DNA sequence. Such epigenetic modifications, which include histone marks and DNA methylation, can alter DNA compaction and influence a functional element’s accessibility for transcriptional machinery (e.g., if the element isn't accessible, transcription may not occur; while previously the element was accessible and the gene could be transcribed). In later sections, methods that study epigenetic modifications like chromatin accessibility, DNA methylation, or binding of specific proteins will be discussed. All of these methods support functional genomic analyses and are important for better understanding differential gene expression and the impact of genetic variants located in functional elements may have on disease occurrence. A somewhat recent and high profile example of a functional genomic analysis centers again on work from the T2T Consortium. Not only did they publish a new, complete reference genome, but they also studied the epigenetic landscape in the newly resolved regions of the genome and pointed to potential newly discovered functional elements in a region previously thought to be transcriptionally inactive [@Gershman2022]. -* Comparative genomics +* Comparative genomics: A common saying in the genomics field is that structure determines function and conserved structure may be constrained such that there is an important function which needs to be conserved [@Alföldi_Lindblad-Toh_2013]. Further, similarities in structure may be due to shared ancestry through the processes of evolution; therefore, some comparative genomics studies aim to infer homology or an evolutionary relationship from structural similarity [@Pearson2013]. 
More pertinent to the topics discussed previously, comparative genomics studies are also useful for identifying functional elements [@Taylor2006] and variants associated with disease (e.g., by comparing the genomes of those with the disease and those without it and identifying differences) [@Alföldi_Lindblad-Toh_2013; @Eichler_2019]. From 4793948b659a6cb4b5b0de54b1799bcfd02bdfdd Mon Sep 17 00:00:00 2001 From: Kate Isaac <41767733+kweav@users.noreply.github.com> Date: Mon, 13 May 2024 23:00:01 -0400 Subject: [PATCH 6/7] fix reference links or fill in as needed --- 07-microarray-data.Rmd | 6 +-- 10b-single-cell-RNA-seq.Rmd | 10 ++-- book.bib | 105 ++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 8 deletions(-) diff --git a/07-microarray-data.Rmd b/07-microarray-data.Rmd index ddedbb94..e1ecd3a8 100644 --- a/07-microarray-data.Rmd +++ b/07-microarray-data.Rmd @@ -39,7 +39,7 @@ On a basic principle, oligonucleotide probes are designed for different targets ### Cons: - Microarray chips can only measure the targets they are designed for, and cannot be used for exploratory purposes [@Zhang2015]. -- Microarrays' probe designs can only be as up to date as the genome they were designed against at the time [@Mantione2014; @refinebioexamples]. +- Microarrays' probe designs can only be as up to date as the genome they were designed against at the time [@Mantione2014; @refinebioexamples2019]. - Microarray does not escape oligonucleotide biases like GC content and sequence composition biases[@refinebioexamples2019]. @@ -66,8 +66,8 @@ Gene expression arrays are designed to measure gene expression. They are designe #### Examples: - [refine.bio](https://www.refine.bio/) is the largest collection of publicly available, already normalized gene expression data (including gene expression microarrays). -- [Getting started in gene expression microarray analysis](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1000543) [@Slonim2009]. 
-- [Microarray and its applications](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3467903/) [@Govindarajan2012]. +- [Getting started in gene expression microarray analysis](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1000543) [@Slonim_Yanai_2009]. +- [Microarray and its applications](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3467903/) [-@Govindarajan2012]. - [Analysis of microarray experiments of gene expression profiling](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2435252/) [@Tarca2006]. ### DNA methylation arrays diff --git a/10b-single-cell-RNA-seq.Rmd b/10b-single-cell-RNA-seq.Rmd index de1fcaf1..09819861 100644 --- a/10b-single-cell-RNA-seq.Rmd +++ b/10b-single-cell-RNA-seq.Rmd @@ -167,13 +167,13 @@ These tutorials cover explicit steps, code, tool recommendations and other consi - [Processing raw 10X Genomics single-cell RNA-seq data (with cellranger)](https://swaruplab.bio.uci.edu/tutorial/cellranger/cellranger-rna.html) - a tutorial based on using CellRanger. ## Useful readings -- [An Introduction to the Analysis of Single-Cell RNA-Sequencing Data](https://doi.org/10.1016/j.omtm.2018.07.003) [@AlJanahi2018]. -- [Orchestrating single-cell analysis with Bioconductor](https://www.nature.com/articles/s41592-019-0654-x) [@Amezquita2019]. +- [An Introduction to the Analysis of Single-Cell RNA-Sequencing Data](https://doi.org/10.1016/j.omtm.2018.07.003) [@Aljanahi2018]. +- [Orchestrating single-cell analysis with Bioconductor](https://www.nature.com/articles/s41592-019-0654-x) [@Amezquita2020]. - [UMIs the problem, the solution and the proof](https://cgatoxford.wordpress.com/2015/08/14/unique-molecular-identifiers-the-problem-the-solution-and-the-proof/) [@Smith2015]. - [Experimental design for single-cell RNA sequencing](https://doi.org/10.1093/bfgp/elx035) [@BaranGale2018]. -- [Tutorial: guidelines for the experimental design of single-cell RNA sequencing studies](https://doi.org/10.1038/s41596-018-0073-y) [@Lafzi2019]. 
-- [Comparative Analysis of Single-Cell RNA Sequencing Methods](http://dx.doi.org/10.1016/j.molcel.2017.01.023) [@Ziegenhain2018]. -- [Comparative Analysis of Droplet-Based Ultra-High-Throughput Single-Cell RNA-Seq Systems](https://doi.org/10.1016/j.molcel.2018.10.020) [@Zhang2018]. +- [Tutorial: guidelines for the experimental design of single-cell RNA sequencing studies](https://doi.org/10.1038/s41596-018-0073-y) [@Lafzi2018]. +- [Comparative Analysis of Single-Cell RNA Sequencing Methods](http://dx.doi.org/10.1016/j.molcel.2017.01.023) [@Ziegenhain2017]. +- [Comparative Analysis of Droplet-Based Ultra-High-Throughput Single-Cell RNA-Seq Systems](https://doi.org/10.1016/j.molcel.2018.10.020) [@Zhang2019]. - [Single cells make big data: New challenges and opportunities in transcriptomics](http://dx.doi.org/10.1016/j.coisb.2017.07.004) [@Angerer2017]. - [Comparative Analysis of common alignment tools for single cell RNA sequencing](https://www.biorxiv.org/content/10.1101/2021.02.15.430948v2) [@Bruning2021]. - [Current best practices in single-cell RNA-seq analysis: a tutorial](https://doi.org/10.15252/msb.20188746) [@Luecken2019]. diff --git a/book.bib b/book.bib index 7c7aa0c5..efaed32c 100644 --- a/book.bib +++ b/book.bib @@ -295,6 +295,21 @@ @article{Gershman2023 language={en} } +@article{Govindarajan2012, + title={Microarray and its applications}, + volume={4}, + ISSN={0976-4879}, + url={https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3467903/}, + DOI={10.4103/0975-7406.100283}, + abstractNote={Microarray is one of the most recent advances being used for cancer research; it provides assistance in pharmacological approach to treat various diseases including oral lesions. Microarray helps in analyzing large amount of samples which have either been recorded previously or new samples; it even helps to test the incidence of a particular marker in tumors. 
Till recently, microarray’s usage in dentistry has been very limited, but in future, as the technology becomes affordable, there may be increase in its usage. Here, we discuss the various techniques and applications of microarray or DNA chip.}, + number={Suppl 2}, + journal={Journal of Pharmacy \& Bioallied Sciences}, + author={Govindarajan, Rajeshwar and Duraiyan, Jeyapradha and Kaliyappan, Karunakaran and Palanisamy, Murugesan}, + year={2012}, + month=aug, + pages={S310–S312} +} + @article{Kochmanski2019, author={Kochmanski, Joseph and Savonen, Candace and Bernstein, Alison I.}, @@ -307,6 +322,30 @@ @article{Kochmanski2019 issn={1664-8021}, } +@website{Hadfield2016, + url={https://bitesizebio.com/13542/what-everyone-should-know-about-rna-seq/}, + author = {James Hadfield}, + year={2016}, + month=jul, + language={en-US} +} + +@article{Hansen2010, + title={Biases in Illumina transcriptome sequencing caused by random hexamer priming}, + volume={38}, + ISSN={0305-1048}, + url={https://doi.org/10.1093/nar/gkq224}, + DOI={10.1093/nar/gkq224}, + abstractNote={Generation of cDNA using random hexamer priming induces biases in the nucleotide composition at the beginning of transcriptome sequencing reads from the Illumina Genome Analyzer. The bias is independent of organism and laboratory and impacts the uniformity of the reads along the transcriptome. We provide a read count reweighting scheme, based on the nucleotide frequencies of the reads, that mitigates the impact of the bias.}, + number={12}, + journal={Nucleic Acids Research}, + author={Hansen, Kasper D. and Brenner, Steven E.
and Dudoit, Sandrine}, + year={2010}, + month=jul, + pages={e131} +} + + @article{Hicks2017, doi = {10.1093/biostatistics/kxx053}, url = {https://doi.org/10.1093/biostatistics/kxx053}, @@ -414,6 +453,31 @@ @article{Li_Durbin_2024 language={en} } +@article{Love2016, + title={Modeling of RNA-seq fragment sequence bias reduces systematic errors in transcript abundance estimation}, + volume={34}, + ISSN={1087-0156}, + url={https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5143225/}, + DOI={10.1038/nbt.3682}, + number={12}, + journal={Nature biotechnology}, + author={Love, Michael I. and Hogenesch, John B. and Irizarry, Rafael A.}, + year={2016}, + month=dec, + pages={1287–1291} +} + +@website{bias-blog, + title={RNA-seq fragment sequence bias}, + url={https://mikelove.wordpress.com/2016/09/26/rna-seq-fragment-sequence-bias/}, + abstractNote={Our paper was just published describing a new method for modeling and correcting fragment sequence bias for estimation of transcript abundances from RNA-seq: “Modeling of RNA-seq fragment seq…}, + journal={Mike Love’s blog}, + author={Mike Love}, + year={2016}, + month=sep, + language={en} +} + @article{Luecken2019, title = {Current best practices in single‐cell {RNA}‐seq analysis: a tutorial}, @@ -525,6 +589,22 @@ @article{Pearson2013 language={en} } +@article{Pepke2009, + title={Computation for ChIP-seq and RNA-seq studies}, + volume={6}, + ISSN={1548-7091}, + url={https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4121056/}, + DOI={10.1038/nmeth.1371}, + abstractNote={Genome-wide measurements of protein-DNA interactions and transcriptomes are increasingly done by deep DNA sequencing methods (ChIP-seq and RNA-seq). The power and richness of these counting-based measurements comes at the cost of routinely handling tens to hundreds of millions of reads. 
While early-adopters necessarily developed their own custom computer code to analyze the first ChIP-seq and RNA-seq datasets, a new generation of more sophisticated algorithms and software tools are emerging to assist in the analysis phase of these projects. This review describes the multilayered analyses of ChIP-seq and RNA-seq datasets, discusses the software packages currently available to perform tasks at each layer, and describes some upcoming challenges and features for future analysis tools. We also discuss how software choices and uses are affected by specific aspects of the underlying biology and data structure, including genome size, positional clustering of transcription factor binding sites, transcript discovery, and expression quantification.}, + number={11 Suppl}, + journal={Nature Methods}, + author={Pepke, Shirley and Wold, Barbara and Mortazavi, Ali}, + year={2009}, + month=nov, + pages={S22–S32} +} + + @article{Rao2019, doi = {10.3389/fgene.2018.00636}, url = {https://doi.org/10.3389/fgene.2018.00636}, @@ -609,6 +689,23 @@ @article{Sistrom2016 language={en} } +@article{Slonim_Yanai_2009, + title={Getting Started in Gene Expression Microarray Analysis}, + volume={5}, + ISSN={1553-7358}, + url={https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1000543}, + DOI={10.1371/journal.pcbi.1000543}, + number={10}, + journal={PLOS Computational Biology}, + publisher={Public Library of Science}, + author={Slonim, Donna K. and Yanai, Itai}, + year={2009}, + month=oct, + pages={e1000543}, + language={en} +} + + @article{Svensson2017, doi = {10.1038/nmeth.4220}, url = {https://doi.org/10.1038/nmeth.4220}, @@ -632,6 +729,14 @@ @website{Smith2015 journal = {CGAT}, } +@website{Starmer2017-rnaseq, + url={https://www.youtube.com/watch?v=tlf6wYJrwKY}, + year={2017}, + month=aug + author = {Josh Starmer} + } + + @article{Tarca2006, author = {Tarca, A. L. and Romero, R. and Draghici, S.
}, title = {Analysis of microarray experiments of gene expression profiling}, From 6bb96273af057ecea0d98e5f2846ea5decfdf1d1 Mon Sep 17 00:00:00 2001 From: Kate Isaac <41767733+kweav@users.noreply.github.com> Date: Mon, 13 May 2024 23:05:06 -0400 Subject: [PATCH 7/7] another comma --- book.bib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book.bib b/book.bib index efaed32c..c3ea75bf 100644 --- a/book.bib +++ b/book.bib @@ -732,7 +732,7 @@ @website{Smith2015 @website{Starmer2017-rnaseq, url={https://www.youtube.com/watch?v=tlf6wYJrwKY}, year={2017}, - month=aug + month=aug, author = {Josh Starmer} }