From c39e375ed150bb2e77ae20a5d1345c42ecbe7a27 Mon Sep 17 00:00:00 2001 From: "Edward S. Rice" Date: Mon, 11 Jan 2021 17:39:46 -0800 Subject: [PATCH] Added code and README. --- 1-generate-sv-calls/manta.nf | 78 +++++++ 1-generate-sv-calls/smoove.nf | 137 ++++++++++++ 2-filter-sv-calls/filter_vcfs.sh | 17 ++ 3-analyze-sv-calls/annotate_deletions.py | 209 ++++++++++++++++++ 3-analyze-sv-calls/commands.sh | 37 ++++ .../count_variants_per_sample.py | 53 +++++ 3-analyze-sv-calls/merge_deletions.py | 208 +++++++++++++++++ README.md | 34 ++- library_list.txt | 43 ++++ sample_keys.tsv | 43 ++++ 10 files changed, 858 insertions(+), 1 deletion(-) create mode 100644 1-generate-sv-calls/manta.nf create mode 100644 1-generate-sv-calls/smoove.nf create mode 100755 2-filter-sv-calls/filter_vcfs.sh create mode 100755 3-analyze-sv-calls/annotate_deletions.py create mode 100644 3-analyze-sv-calls/commands.sh create mode 100755 3-analyze-sv-calls/count_variants_per_sample.py create mode 100755 3-analyze-sv-calls/merge_deletions.py create mode 100644 library_list.txt create mode 100644 sample_keys.tsv diff --git a/1-generate-sv-calls/manta.nf b/1-generate-sv-calls/manta.nf new file mode 100644 index 0000000..c470fc2 --- /dev/null +++ b/1-generate-sv-calls/manta.nf @@ -0,0 +1,78 @@ +#!/usr/bin/env nextflow + +params.reference = 'ref.fa' + +reference_file = file(params.reference) + +Channel + .fromSRA(file('../library_list.txt').readLines()) + .set{reads} + +process bwa_index { + publishDir 'bwa_index' + module 'bwa/bwa-0.7.17' + + input: + file reference from reference_file + + output: + file "${reference}.*" into reference_index + + """ bwa index ${reference} """ +} + +process samtools_faidx { + module 'samtools/samtools-1.9' + + input: + file reference from reference_file + + output: + file "${reference}.fai" into faidx + + """ samtools faidx ${reference} """ +} + +process align { + cpus 16 + module 'bwa/bwa-0.7.17:samtools/samtools-1.9' + publishDir 'alignments' + + input: + file ref 
from reference_file + file index from reference_index + set accession, file(both_ends) from reads + + output: + file "${accession}.bam" into aligned + file "${accession}.bam.bai" into aligned_index + + """ + bwa mem -t ${task.cpus} ${ref} ${both_ends} | samtools view -bh - | \ + samtools fixmate -m - - | samtools sort - | \ + samtools markdup -r - ${accession}.bam + samtools index ${accession}.bam + """ +} + +process manta { + cpus 16 + module 'biocompute/biocompute-modules' + module 'manta/manta-1.6.0' + publishDir 'results' + + input: + file 'ref.fa' from reference_file + file 'ref.fa.fai' from faidx + file bams from aligned.collect() + file bais from aligned_index.collect() + + output: + file "manta*" into results + + """ + bams=""; for bam in ${bams}; do bams+="--bam \$bam "; done + configManta.py \$bams --referenceFasta ref.fa --runDir manta + manta/runWorkflow.py -j ${task.cpus} + """ +} diff --git a/1-generate-sv-calls/smoove.nf b/1-generate-sv-calls/smoove.nf new file mode 100644 index 0000000..c63e969 --- /dev/null +++ b/1-generate-sv-calls/smoove.nf @@ -0,0 +1,137 @@ +#!/usr/bin/env nextflow + +params.reference = 'ref.fa' +params.scratch = '/local/scratch/esrbhb' + +Channel + .fromSRA(file('../library_list.txt').readLines()) + .set{reads} + +reference_file = file(params.reference) + +process bwa_index { + publishDir 'bwa_index' + module 'bwa/bwa-0.7.17' + + input: + file reference from reference_file + + output: + file "${reference}.*" into reference_index + + """ bwa index ${reference} """ +} + +process samtools_faidx { + module 'samtools/samtools-1.9' + + input: + file reference from reference_file + + output: + file "${reference}.fai" into faidx + + """ samtools faidx ${reference} """ +} + +process align { + cpus 16 + module 'bwa/bwa-0.7.17:samtools/samtools-1.9' + publishDir 'alignments' + + input: + file ref from reference_file + file index from reference_index + set accession, file(both_ends) from reads + + output: + set accession, 
"${accession}.bam*" into aligned + + """ + bwa mem -R "@RG\\tID:${accession}\\tSM:${accession}\\tPL:ILLUMINA" \ + -t ${task.cpus} ${ref} ${both_ends} | samtools view -bh - | \ + samtools fixmate -m - - | samtools sort - | \ + samtools markdup -r - ${accession}.bam + samtools index ${accession}.bam + """ +} + +aligned.into { aligned_for_smoove_call; aligned_for_smoove_genotype } +faidx.into { faidx_for_smoove_call; faidx_for_smoove_merge; + faidx_for_smoove_genotype } + +process smoove_call { + container 'brentp/smoove:v0.2.3' + publishDir 'unmerged' + cpus 8 + + input: + file 'ref.fa' from reference_file + file 'ref.fa.fai' from faidx_for_smoove_call + set accession, file(bam) from aligned_for_smoove_call + + output: + file "${accession}-smoove.genotyped.vcf.gz" into unmerged + + """ + smoove call --name ${accession} --fasta ref.fa -p ${task.cpus} \ + --genotype ${accession}.bam + """ +} + +process smoove_merge { + container 'brentp/smoove:v0.2.3' + + input: + file 'ref.fa' from reference_file + file 'ref.fa.fai' from faidx_for_smoove_merge + file all_unmerged from unmerged.collect() + + output: + file "merged.sites.vcf.gz" into merged + + """ + smoove merge --name merged -f ref.fa ${all_unmerged} + """ +} + +merged_vcf_faidx_and_bams = merged.combine(faidx_for_smoove_genotype) + .combine(aligned_for_smoove_genotype) + +process smoove_genotype { + container 'brentp/smoove:v0.2.3' + + input: + file 'ref.fa' from reference_file + set 'merged.sites.vcf.gz', 'ref.fa.fai', accession, + file(bam) from merged_vcf_faidx_and_bams + + output: + file "${accession}-joint-smoove.genotyped.vcf.gz" into joint_genotyped + file "${accession}-joint-smoove.genotyped.vcf.gz.csi" into j_g_index + + """ + export TMPDIR=\$PWD + smoove genotype -d -x --name ${accession}-joint --fasta ref.fa \ + --vcf merged.sites.vcf.gz ${accession}.bam + echo "done!" 
+ """ +} + +process smoove_paste { + container 'brentp/smoove:v0.2.3' + publishDir 'output' + + input: + file all_vcfs from joint_genotyped.collect() + file all_indexes from j_g_index.collect() + + output: + file "pasted.smoove.square.vcf.gz" into pasted + + """ + smoove paste --name pasted ${all_vcfs} + """ +} + + diff --git a/2-filter-sv-calls/filter_vcfs.sh b/2-filter-sv-calls/filter_vcfs.sh new file mode 100755 index 0000000..8ab2a3d --- /dev/null +++ b/2-filter-sv-calls/filter_vcfs.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# filter manta calls +cut -f2,4 ../sample_keys.tsv > sample2id.tsv +bcftools filter -Oz \ + -i 'INFO/SVTYPE="DEL" && INFO/SVLEN < -500 && INFO/SVLEN > -100000' \ + ../1-generate-sv-calls/manta/results/manta/results/variants/diploidSV.vcf.gz \ + | bcftools reheader -s sample2id.tsv - > manta_deletions_only.vcf.gz +rm sample2id.tsv + +# filter lumpy calls +cut -f1,4 ../sample_keys.tsv > srr2id.tsv +bcftools filter -Oz \ + -i 'INFO/SVTYPE="DEL" && INFO/SVLEN < -500 && INFO/SVLEN > -100000' \ + ../1-generate-sv-calls/lumpy/output/pasted.smoove.square.vcf.gz | \ + bcftools reheader -s srr2id.tsv - > lumpy_deletions_only.vcf.gz +rm srr2id.tsv diff --git a/3-analyze-sv-calls/annotate_deletions.py b/3-analyze-sv-calls/annotate_deletions.py new file mode 100755 index 0000000..a93d145 --- /dev/null +++ b/3-analyze-sv-calls/annotate_deletions.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +""" +Given a vcf file containing deletions and a gff annotating the +reference genome, annotate each of the deletions as affecting either +an intergenic region, a regulatory region (+/- 2kb of a gene), an +intron, a coding sequence, or some combination of these. +""" + +import argparse +import itertools +import os +import sys + +import gffutils +import vcf + + +def gff_type(gff_path): + """ + argparse type function for GFF files. Uses gffutils to create + a database if one does not yet exist, and then returns that + database. 
+ """ + db_path = gff_path + ".db" + if gff_path.split(".")[-1] == "db": + return gffutils.FeatureDB(gff_path) + elif os.path.exists(db_path): + return gffutils.FeatureDB(db_path) + else: + print("Creating gff db...", file=sys.stderr) + return gffutils.create_db( + gff_path, + db_path, + # the id_spec is necessary because NCBI gff's do not follow + # the GFF specification + id_spec={"gene": "db_xref"}, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "-r", + "--regulatory-margin", + type=int, + default=2000, + help="distance up/downstream of a gene to consider " + "as affecting that gene, in bp [2000]", + ) + parser.add_argument( + "vcf", help="the vcf file to annotate", type=lambda f: vcf.Reader(filename=f) + ) + parser.add_argument( + "gff", + help="annotations of the reference genome, " + "either in NCBI gff or a pre-created sqlite db", + type=gff_type, + ) + return parser.parse_args() + + +def get_deletion_effects(deletion_record, gff_db, regulatory_margin=2000): + """ + Figure out the effects of a deletion using a gff. 
+ + Args: + deletion_record (vcf.Record): a vcf Record representing the + deletion + gff_db (gffutils.FeatureDB): a gffutils DB of a genome's + annotations + regulatory_margin (int): the amount of sequence on either side + of the deletion to look in for genes to be classified as + having their regulatory regions affected + """ + affected_genes = set() + intergenic = True + regulatory, intronic, coding = [False] * 3 + + # first, go through all the features that overlap the deletion + # and use them to set the above booleans and add any affected + # genes to affected_genes + features_in_deletion = gff_db.region( + seqid=deletion_record.CHROM, + start=deletion_record.POS, + end=deletion_record.sv_end, + ) + for feature in features_in_deletion: + if feature.featuretype == "gene": + affected_genes.add(feature.attributes["Name"][0].upper()) + intergenic = False + intronic = True + elif feature.featuretype == "CDS": + coding = True + intronic = False + + # next, look for any genes *near* the deletion + features_near_deletion = itertools.chain( + gff_db.region( + seqid=deletion_record.CHROM, + start=deletion_record.POS - regulatory_margin, + end=deletion_record.POS, + ), + gff_db.region( + seqid=deletion_record.CHROM, + start=deletion_record.sv_end, + end=deletion_record.sv_end + regulatory_margin, + ), + ) + for feature in features_near_deletion: + if feature.featuretype == "gene": + gene_name = feature.attributes["Name"][0].upper() + # only consider this a deletion of a regulatory region if + # this gene has not been otherwise affected + if gene_name not in affected_genes: + regulatory = True + intergenic = False + affected_genes.add(gene_name) + + return affected_genes, intergenic, regulatory, intronic, coding + + +def annotate_deletion(record, affected_genes, intergenic, regulatory, intronic, coding): + """ adds INFO fields to a vcf record """ + record.INFO["affected_genes"] = list(affected_genes) + # all these if statements are necessary because pyvcf adds + # 
unnecessary semicolons for false flags and empty fields + if intergenic: + record.INFO["intergenic"] = True + if regulatory: + record.INFO["regulatory"] = True + if intronic: + record.INFO["intronic"] = True + if coding: + record.INFO["coding"] = True + return record + + +def add_info_fields_to_header(vcf_reader): + """ + PyVCF uses a vcf.Reader to get a header and output it when a + vcf.Writer is created. This function takes a Reader and adds + some INFO fields to its header so that it can be used as a + template for the vcf this program outputs. + + Args: + vcf_reader (vcf.Reader): reader with a file whose header we + want to add INFO fields to + + Returns: + vcf_reader (vcf.Reader): the same reader that was input, but + with some new INFO fields + """ + vcf_reader.infos["affected_genes"] = vcf.parser._Info( + id="affected_genes", + num=".", + type="String", + desc="List of genes affected by this deletion", + source=None, + version=None, + ) + vcf_reader.infos["intergenic"] = vcf.parser._Info( + id="intergenic", + num="0", + type="Flag", + desc="This deletion does not affect any genes", + source=None, + version=None, + ) + vcf_reader.infos["regulatory"] = vcf.parser._Info( + id="regulatory", + num="0", + type="Flag", + desc="This deletion occurs directly up- or downstream of gene(s)", + source=None, + version=None, + ) + vcf_reader.infos["intronic"] = vcf.parser._Info( + id="intronic", + num="0", + type="Flag", + desc="This deletion affects the introns of one or more genes", + source=None, + version=None, + ) + vcf_reader.infos["coding"] = vcf.parser._Info( + id="coding", + num="0", + type="Flag", + desc="This deletion affects the coding sequence of one or more genes", + source=None, + version=None, + ) + return vcf_reader + + +def main(): + """ __main__ method for this file """ + args = parse_args() + + writer = vcf.Writer(sys.stdout, add_info_fields_to_header(args.vcf)) + for record in args.vcf: + effects_tuple = get_deletion_effects(record, args.gff, 
args.regulatory_margin) + annotated_record = annotate_deletion(record, *effects_tuple) + writer.write_record(annotated_record) + writer.close() + + +if __name__ == "__main__": + main() diff --git a/3-analyze-sv-calls/commands.sh b/3-analyze-sv-calls/commands.sh new file mode 100644 index 0000000..e8284ff --- /dev/null +++ b/3-analyze-sv-calls/commands.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# merge lumpy and manta deletions +./merge_deletions.py \ + ../2-filter-sv-calls/lumpy_deletions_only.vcf.gz \ + ../2-filter-sv-calls/manta_deletions_only.vcf.gz | bgzip \ + > merged_deletions.vcf.gz + +# count number of deletions per sample +./count_variants_per_sample.py \ + ../2-filter-sv-calls/lumpy_deletions_only.vcf.gz > lumpy_deletion_counts.tsv +./count_variants_per_sample.py \ + ../2-filter-sv-calls/manta_deletions_only.vcf.gz > manta_deletion_counts.tsv +./count_variants_per_sample.py \ + merged_deletions.vcf.gz > merged_deletion_counts.tsv + +# annotate the vcf +./annotate_deletions.py merged_deletions.vcf.gz ../AstMex.db \ + | bgzip > merged_annotated.vcf.gz + +# filter the vcf to count only deletions of intronic sequence +bcftools filter -i intronic=1 -Oz merged_annotated.vcf.gz \ + > intronic_deletions.vcf.gz +./count_variants_per_sample.py \ + intronic_deletions.vcf.gz > intron_deletion_counts.tsv + +# filter the vcf to count only deletions of regulatory sequence +bcftools filter -i regulatory=1 -Oz merged_annotated.vcf.gz \ + > regulatory_deletions.vcf.gz +./count_variants_per_sample.py \ + regulatory_deletions.vcf.gz > regulatory_deletion_counts.tsv + +# filter the vcf to count only deletions of coding sequence +bcftools filter -i coding=1 -Oz merged_annotated.vcf.gz \ + > coding_deletions.vcf.gz +./count_variants_per_sample.py \ + coding_deletions.vcf.gz > coding_deletion_counts.tsv diff --git a/3-analyze-sv-calls/count_variants_per_sample.py b/3-analyze-sv-calls/count_variants_per_sample.py new file mode 100755 index 0000000..3b6b400 --- /dev/null +++ 
b/3-analyze-sv-calls/count_variants_per_sample.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +""" +Count the number of called variants per sample in a VCF file. +""" + +import argparse +import collections + +import vcf + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "vcf", help="the vcf file to analyze", type=lambda f: vcf.Reader(filename=f) + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + call_counts = collections.Counter() + hom_alt_counts = collections.Counter() + het_counts = collections.Counter() + for record in filter(lambda r: not r.is_filtered, args.vcf): + for call in filter(lambda s: not s.is_filtered, record.samples): + call_counts[call.sample] += 1 + if call.is_variant: + if call.is_het: + het_counts[call.sample] += 1 + else: + hom_alt_counts[call.sample] += 1 + + print("\t".join(["sample", "call_count", "hom_alt_count", "het_count"])) + for sample in call_counts.keys(): + print( + "\t".join( + map( + str, + [ + sample, + call_counts[sample], + hom_alt_counts[sample], + het_counts[sample], + ], + ) + ) + ) + + +if __name__ == "__main__": + main() diff --git a/3-analyze-sv-calls/merge_deletions.py b/3-analyze-sv-calls/merge_deletions.py new file mode 100755 index 0000000..b0a885b --- /dev/null +++ b/3-analyze-sv-calls/merge_deletions.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +""" +Given two vcf files containing deletions, output a new vcf file +containing only the deletions that appear in both inputs. 
+""" + +import argparse +from itertools import starmap +import sys + +import vcf + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "-r", + "--reciprocal-overlap", + type=float, + default=0.5, + help="minimum reciprocal overlap to consider a " + "deletion in both files to be the same deletion [0.5]", + ) + parser.add_argument( + "vcf1", + help="the first vcf file to merge", + type=lambda f: vcf.Reader(filename=f), + ) + parser.add_argument( + "vcf2", + help="the second vcf file to merge", + type=lambda f: vcf.Reader(filename=f), + ) + return parser.parse_args() + + +def compare_records(record1, record2, contig_index): + """ + Compares two vcf._Record instances by location to determine which + would come earlier in a vcf file. + + Args: + record1, record2 (vcf._Record): two vcf records to compare + contig_index (dict): mapping CHROM names to their index in the + VCF header + + Returns: + compare_value (int): >0 if record1 > record2, <0 if record1 < + record2; 0 if record1 and record2 overlap + """ + if record1.CHROM != record2.CHROM: + return contig_index[record1.CHROM] - contig_index[record2.CHROM] + else: + if ( + record1.POS <= record2.sv_end and record2.POS <= record1.sv_end + ): + return 0 + else: + return record1.POS - record2.POS + + +def reciprocal_overlap(record1, record2, min_reciprocal_overlap=0.5): + """ + Given two vcf._Record instances representing deletions, computes + their reciprocal overlap: the size of the overlap between them as + a fraction of the size of the larger deletion. Records on different + contigs have a reciprocal overlap of 0. + + Args: + record1, record2 (vcf._Record): the records to be intersected + min_reciprocal_overlap(float): unused by this function; kept + for backward compatibility. Callers compare the returned + value to their threshold. 
Reciprocal overlap is defined as + size_of_overlap / max(size_of_r1, size_of_r2) + + Returns: + reciprocal_overlap (float): 0 if the two records are on + different contigs; otherwise the reciprocal overlap as + defined above. N.B. this is negative when the records are + on the same contig but do not overlap, so callers should + compare it against a positive threshold + """ + if record1.CHROM != record2.CHROM: + return 0 + + r1_deletion_size = record1.sv_end - record1.POS + r2_deletion_size = record2.sv_end - record2.POS + + overlap_start = max(record1.POS, record2.POS) + overlap_end = min(record1.sv_end, record2.sv_end) + overlap_size = overlap_end - overlap_start + reciprocal_overlap = min( + overlap_size / r1_deletion_size, overlap_size / r2_deletion_size + ) + + return reciprocal_overlap + + +def call_type_key(call): + if call.gt_type is None: + return -1 + else: + return call.gt_type + + +CallData = vcf.model.make_calldata_tuple(["GT"]) + + +def merge_calls(call1, call2): + """ + Given two calls on the same sample, outputs a new call with the + lesser of the two genotypes; i.e., 0/0 < 0/1 < 1/1. Also simplifies + the call data, leaving it with only a GT field. + + N.B. This function is not commutative, as the output call will have + site = call1.site. + """ + return vcf.model._Call( + call1.site, + call1.sample, + CallData(min(call1, call2, key=call_type_key).data.GT), + ) + + +def make_sample_index_key(sample_index): + def sample_index_key(call): + return sample_index[call.sample] + + return sample_index_key + + +def merge_records(record1, record2, sample_index_key): + """ + Takes two VCF records containing deletions and merges them into a + single record. Rather than trying to split hairs over where the + boundaries of the merged record should be when the called boundaries + are imprecise anyway, this function just uses the metadata + associated with the bigger deletion. + + Args: + record1, record2 (vcf.model._Record): deletion records to + merge. 
This function does not check to make sure it's + reasonable to merge the two; it just takes the consensus + for each of the individual sample calls. + sample_index_key (function): function that maps _Call objects + to the index of their sample. `make_sample_index_key()` can + create one of these. + """ + # figure out which record is bigger, so that we can use it as the + # template for the new record + small_record, big_record = sorted( + [record1, record2], key=lambda r: r.sv_end - r.POS + ) + + # sort the calls by sample index, and then merge each pair of calls + # for the same sample into a single call + big_record.samples = list( + starmap( + merge_calls, + zip( + sorted(big_record.samples, key=sample_index_key), + sorted(small_record.samples, key=sample_index_key), + ), + ) + ) + big_record.FORMAT = "GT" + + return big_record + + +def merge_all_deletions(reader1, reader2, min_reciprocal_overlap=0.5): + # god help us if the VCFs have headers in different orders + contig_index = {k: i for i, k in enumerate(reversed(reader1.contigs.keys()))} + + # keep the samples in the order in which they appear in the reader1 + # header + sample_index = {k: i for i, k in enumerate(reader1.samples)} + sample_index_key = make_sample_index_key(sample_index) + + try: + record1, record2 = next(reader1), next(reader2) + while True: # TODO please don't do this + while compare_records(record1, record2, contig_index) < 0: + record1 = next(reader1) + while compare_records(record1, record2, contig_index) > 0: + record2 = next(reader2) + overlap = reciprocal_overlap(record1, record2) + if overlap >= min_reciprocal_overlap: + yield merge_records(record1, record2, sample_index_key) + record1 = next(reader1) + record2 = next(reader2) + except StopIteration: + return + + +def main(): + args = parse_args() + + writer = vcf.Writer(sys.stdout, args.vcf1) + # TODO fix output header + for record in merge_all_deletions(args.vcf1, args.vcf2, args.reciprocal_overlap): + writer.write_record(record) + 
writer.flush() + writer.close() + + +if __name__ == "__main__": + main() diff --git a/README.md b/README.md index cc6e389..b4a1629 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,34 @@ -# cavefish-genome-paper +# astyanax-genome-paper Scripts and workflows used in Warren et al. (2021) + +This repository contains scripts and workflows used to analyze deletions in the +paper: Warren et al. (2021) "A chromosome level genome of _Astyanax mexicanus_ +surface fish for comparing population-specific genetic differences contributing +to trait evolution." _Nature Communications_ (in press) + +## Generating structural variant calls +We generated structural variant calls with short reads for all of the libraries +listed in `library_list.txt`, using two short-read SV-callers: +[manta](https://github.com/Illumina/manta) and +[lumpy](https://github.com/arq5x/lumpy-sv). For lumpy, we used the +[smoove](https://github.com/brentp/smoove) pipeline as recommended. +[nextflow](https://www.nextflow.io/) workflows for both are in +`1-generate-sv-calls`. Please refer to the webpages for these software packages +for requirements and installation instructions. + +## Filtering structural variant calls +We filtered the structural variant calls generated by manta and lumpy to +include only deletions within the size range (500,100000). A shell script for +this is in `2-filter-sv-calls`. + +## Analyzing and counting structural variant calls +We analyzed the deletion calls by counting the numbers of deletions per sample, +merging the lumpy and manta results, and annotating the deletions based on +whether they contain intronic, regulatory, or coding sequence, as described in +the manuscript. Python scripts for counting, merging, and annotating are in +`3-analyze-sv-calls`, as well as a bash script containing the commands we used +to run these programs. 
+ +## More information +Please consult the paper for more information about the methods and results, +and direct questions to the corresponding authors. diff --git a/library_list.txt b/library_list.txt new file mode 100644 index 0000000..fbb3481 --- /dev/null +++ b/library_list.txt @@ -0,0 +1,43 @@ +SRR1575270 +SRR1575271 +SRR1575272 +SRR1575273 +SRR1575274 +SRR1575275 +SRR1575276 +SRR1575277 +SRR1575278 +SRR1575279 +SRR1575280 +SRR1575281 +SRR1575282 +SRR1575283 +SRR1575284 +SRR1575285 +SRR1575286 +SRR1575287 +SRR1575288 +SRR1575289 +SRR1575290 +SRR1575291 +SRR1575292 +SRR1575293 +SRR1575294 +SRR1575295 +SRR1575296 +SRR1575297 +SRR1575298 +SRR1927184 +SRR1927212 +SRR1927214 +SRR1927215 +SRR1927218 +SRR1927221 +SRR1927224 +SRR1927228 +SRR1927232 +SRR1927233 +SRR1927234 +SRR1927235 +SRR1927236 +SRR1927237 diff --git a/sample_keys.tsv b/sample_keys.tsv new file mode 100644 index 0000000..e2afbbf --- /dev/null +++ b/sample_keys.tsv @@ -0,0 +1,43 @@ +SRR1575270 SAMPLE25 Astyanax mexicanus Choy_01 +SRR1575271 SAMPLE36 Astyanax mexicanus Choy_05 +SRR1575272 SAMPLE12 Astyanax mexicanus Choy_06 +SRR1575273 SAMPLE1 Astyanax mexicanus Choy_09 +SRR1575274 SAMPLE34 Astyanax mexicanus Choy_10 +SRR1575275 SAMPLE31 Astyanax mexicanus Choy_11 +SRR1575276 SAMPLE4 Astyanax mexicanus Choy_12 +SRR1575277 SAMPLE29 Astyanax mexicanus Choy_13 +SRR1575278 SAMPLE21 Astyanax mexicanus Choy_14 +SRR1575279 SAMPLE30 Astyanax mexicanus Pach_3 +SRR1575280 SAMPLE33 Astyanax mexicanus Pach_7 +SRR1575281 SAMPLE32 Astyanax mexicanus Pach_8 +SRR1575282 SAMPLE6 Astyanax mexicanus Pach_9 +SRR1575283 SAMPLE8 Astyanax mexicanus Pach_11 +SRR1575284 SAMPLE38 Astyanax mexicanus Pach_12 +SRR1575285 SAMPLE23 Astyanax mexicanus Pach_14 +SRR1575286 SAMPLE2 Astyanax mexicanus Pach_15 +SRR1575287 SAMPLE27 Astyanax mexicanus Pach_17 +SRR1575288 SAMPLE42 Astyanax mexicanus Molino_2a +SRR1575289 SAMPLE41 Astyanax mexicanus Molino_7a +SRR1575290 SAMPLE7 Astyanax mexicanus Molino_9b +SRR1575291 SAMPLE19 Astyanax 
mexicanus Molino_10b +SRR1575292 SAMPLE18 Astyanax mexicanus Molino_11a +SRR1575293 SAMPLE20 Astyanax mexicanus Molino_12a +SRR1575294 SAMPLE13 Astyanax mexicanus Molino_13b +SRR1575295 SAMPLE16 Astyanax mexicanus Molino_14a +SRR1575296 SAMPLE11 Astyanax mexicanus Molino_15b +SRR1575297 SAMPLE28 Astyanax mexicanus Rascon_02 +SRR1575298 SAMPLE26 Astyanax mexicanus Rascon_04 +SRR1927184 SAMPLE5 Astyanax mexicanus Tinaja_11 +SRR1927212 SAMPLE39 Astyanax mexicanus Tinaja_6 +SRR1927214 SAMPLE14 Astyanax mexicanus Tinaja_12 +SRR1927215 SAMPLE15 Astyanax mexicanus Tinaja_B +SRR1927218 SAMPLE22 Astyanax mexicanus Tinaja_2 +SRR1927221 SAMPLE37 Astyanax mexicanus Tinaja_C +SRR1927224 SAMPLE43 Astyanax mexicanus Tinaja_3 +SRR1927228 SAMPLE35 Astyanax mexicanus Tinaja_D +SRR1927232 SAMPLE10 Astyanax mexicanus Tinaja_5 +SRR1927233 SAMPLE3 Astyanax mexicanus Tinaja_E +SRR1927234 SAMPLE40 Astyanax mexicanus Rascon_13 +SRR1927235 SAMPLE17 Astyanax mexicanus Rascon_15 +SRR1927236 SAMPLE9 Astyanax mexicanus Rascon_8 +SRR1927237 SAMPLE24 Astyanax mexicanus Rascon_6