diff --git a/data/motifs/createPwm.py b/data/motifs/createPwm.py index b0c263a95..55256f0d1 100644 --- a/data/motifs/createPwm.py +++ b/data/motifs/createPwm.py @@ -2,20 +2,28 @@ import re import argparse +from rgt.Util import npath + parser = argparse.ArgumentParser() parser.add_argument('-i', '--input-file', type=str, required=True, help='name of the input file') -parser.add_argument('-f', '--input-format', choices=['jaspar-2014', 'jaspar-2016', 'hocomoco-pcm'], type=str, required=True, help='format of the input file') +parser.add_argument('-f', '--input-format', choices=['jaspar-2014', 'jaspar-2016', 'hocomoco-pcm'], + type=str, required=True, help='format of the input file') parser.add_argument('-o', '--output-folder', type=str, required=True, help='name of output Folder') args = parser.parse_args() # read the input file -with open(args.input_file, "r") as f: +with open(npath(args.input_file), "r") as f: content = f.readlines() n_lines = len(content) +output_folder = npath(args.output_folder) + +# make output directory path, if it doesn't exist +os.makedirs(output_folder) + ################################################################################################### # JASPAR 2014 ################################################################################################### @@ -32,7 +40,7 @@ count_g = re.sub('\s+', ' ', count_g) count_t = re.sub('\s+', ' ', count_t) - outputFileName = os.path.join(args.output_folder, "{}.pwm".format(motif_name.replace(">", ""))) + outputFileName = os.path.join(output_folder, "{}.pwm".format(motif_name.replace(">", ""))) with open(outputFileName, "w") as f: f.write(count_a + "\n") f.write(count_c + "\n") @@ -55,7 +63,7 @@ count_g = re.sub('\s+', ' ', count_g) count_t = re.sub('\s+', ' ', count_t) - outputFileName = os.path.join(args.output_folder, "{}.pwm".format(motif_name)) + outputFileName = os.path.join(output_folder, "{}.pwm".format(motif_name)) with open(outputFileName, "w") as f: f.write(count_a + "\n") f.write(count_c + "\n") @@ -97,7 +105,7 @@ count_g = ' '.join(count_g) count_t = ' '.join(count_t) - outputFileName = os.path.join(args.output_folder, "{}.pwm".format(motif_name)) + outputFileName = os.path.join(output_folder, "{}.pwm".format(motif_name)) with open(outputFileName, "w") as f: f.write(count_a + "\n") f.write(count_c + "\n") diff --git a/requirements.txt b/requirements.txt index 34447a88f..50d23ec11 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ HTSeq hmmlearn configparser matplotlib -pyx natsort moods-python +pandas +logomaker diff --git a/rgt/HINT/DifferentialAnalysis.py b/rgt/HINT/DifferentialAnalysis.py index e09158e81..882140824 100644 --- a/rgt/HINT/DifferentialAnalysis.py +++ b/rgt/HINT/DifferentialAnalysis.py @@ -1,18 +1,14 @@ from __future__ import print_function - import os import numpy as np +import pandas as pd import pysam + +import logomaker from pysam import Samfile, Fastafile from math import ceil, floor -from Bio import motifs -import matplotlib import logging -matplotlib.use('Agg') -import matplotlib.pyplot as plt - -import pyx from scipy.stats import zscore from scipy.stats import norm from argparse import SUPPRESS @@ -24,10 +20,12 @@ from rgt.GenomicRegionSet import GenomicRegionSet from rgt.HINT.biasTable import BiasTable +import matplotlib.pyplot as plt + """ Perform differential footprints analysis based on the prediction of transcription factor binding sites. -Authors: Eduardo G. Gusmao, Zhijian Li +Authors: Zhijian Li """ dic = {"A": 0, "C": 1, "G": 2, "T": 3} @@ -38,31 +36,18 @@ def diff_analysis_args(parser): parser.add_argument("--organism", type=str, metavar="STRING", default="hg19", help="Organism considered on the analysis. Must have been setup in the RGTDATA folder. " "Common choices are hg19, hg38. mm9, and mm10. DEFAULT: hg19") - parser.add_argument("--mpbs-file1", type=str, metavar="FILE", default=None, - help="motif predicted binding sites file for condition 1, must be .bed file. DEFAULT: None") - parser.add_argument("--mpbs-file2", type=str, metavar="FILE", default=None, - help="motif predicted binding sites file for condition 2, must be .bed file. DEFAULT: None") - parser.add_argument("--reads-file1", type=str, metavar="FILE", default=None, - help="The BAM file containing the DNase-seq or ATAC-seq reads for condition 1. DEFAULT: None") - parser.add_argument("--reads-file2", type=str, metavar="FILE", default=None, - help="The BAM file containing the DNase-seq or ATAC-seq reads for condition 2. DEFAULT: None") - + parser.add_argument("--mpbs-files", metavar='FILE1,FILE2...', type=str, + help='Predicted motif binding sites for each condition.' + 'Files should be separated with comma.') + parser.add_argument("--reads-files", metavar='FILE1,FILE2...', type=str, + help='Reads for each condition. Files should be separated with comma.') + parser.add_argument("--conditions", metavar='STRING', type=str, + help='Name for each condition. DEFAULT: condition1,condition2, ...') + parser.add_argument("--colors", metavar='STRING', type=str, + help='Set color in line plot. DEFAULT: None, ...') parser.add_argument("--window-size", type=int, metavar="INT", default=200, help="The window size for differential analysis. DEFAULT: 200") - parser.add_argument("--factor1", type=float, metavar="FLOAT", default=None, - help="The normalization factor for condition 1. DEFAULT: None") - parser.add_argument("--factor2", type=float, metavar="FLOAT", default=None, - help="The normalization factor for condition 1. DEFAULT: None") - - parser.add_argument("--forward-shift", type=int, metavar="INT", default=5, help=SUPPRESS) - parser.add_argument("--reverse-shift", type=int, metavar="INT", default=-4, help=SUPPRESS) - parser.add_argument("--bias-table1", type=str, metavar="FILE1_F,FILE1_R", default=None, help=SUPPRESS) - parser.add_argument("--bias-table2", type=str, metavar="FILE2_F,FILE2_R", default=None, help=SUPPRESS) - parser.add_argument("--condition1", type=str, metavar="STRING", default="condition1", - help="The name of condition1. DEFAULT: condition1") - parser.add_argument("--condition2", type=str, metavar="STRING", default="condition1", - help="The name of condition2. DEFAULT: condition2") parser.add_argument("--fdr", type=float, metavar="FLOAT", default=0.05, help="The false discovery rate. DEFAULT: 0.05") parser.add_argument("--bc", action="store_true", default=False, @@ -70,6 +55,9 @@ def diff_analysis_args(parser): parser.add_argument("--nc", type=int, metavar="INT", default=1, help="The number of cores. DEFAULT: 1") + parser.add_argument("--forward-shift", type=int, metavar="INT", default=5, help=SUPPRESS) + parser.add_argument("--reverse-shift", type=int, metavar="INT", default=-4, help=SUPPRESS) + # Output Options parser.add_argument("--output-location", type=str, metavar="PATH", default=os.getcwd(), help="Path where the output bias table files will be written. DEFAULT: current directory") @@ -82,62 +70,195 @@ def diff_analysis_args(parser): "specific instance of the given motif. DEFAULT: False") -def get_raw_signal(arguments): - (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism, - window_size, forward_shift, reverse_shift) = arguments +def diff_analysis_run(args): + # Initializing Error Handler + err = ErrorHandler() - mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") - mpbs1.read(mpbs_file1) + output_location = os.path.join(args.output_location, "Lineplots") + try: + if not os.path.isdir(output_location): + os.makedirs(output_location) + except Exception: + err.throw_error("MM_OUT_FOLDER_CREATION") + + # check if they have same length + mpbs_files = args.mpbs_files.strip().split(",") + reads_files = args.reads_files.strip().split(",") + conditions = args.conditions.strip().split(",") + + if args.colors is not None: + colors = args.colors.strip().split(",") + else: + colors = ["#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33", + "#a65628", "#f781bf", "#999999"] + + assert len(mpbs_files) == len(reads_files) == len(conditions), \ + "Number of motif, read and condition names are not same: {}, {}, {}".format(len(mpbs_files), len(reads_files), + len(conditions)) - mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") - mpbs2.read(mpbs_file2) + # Check if the index file exists + for reads_file in reads_files: + base_name = "{}.bai".format(reads_file) + if not os.path.exists(base_name): + pysam.index(reads_file) + + mpbs = GenomicRegionSet("Motif Predicted Binding Sites of All Conditions") + for i, mpbs_file in enumerate(mpbs_files): + mpbs.read(mpbs_file) - mpbs = mpbs1.combine(mpbs2, output=True) mpbs.sort() + mpbs.remove_duplicates() + mpbs_name_list = list(set(mpbs.get_names())) - bam1 = Samfile(reads_file1, "rb") - bam2 = Samfile(reads_file2, "rb") + signals = np.zeros(shape=(len(conditions), len(mpbs_name_list), args.window_size), dtype=np.float32) + motif_len = list() + motif_num = list() + motif_pwm = list() - genome_data = GenomeData(organism) + genome_data = GenomeData(args.organism) fasta = Fastafile(genome_data.get_genome()) - signal_1 = np.zeros(window_size) - signal_2 = np.zeros(window_size) - motif_len = None - pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size), - ("G", [0.0] * window_size), ("T", [0.0] * window_size), - ("N", [0.0] * window_size)]) + print("generating signal for each motif and condition...\n") + # differential analysis using bias corrected signal + if args.bc: + hmm_data = HmmData() + table_forward = hmm_data.get_default_bias_table_F_ATAC() + table_reverse = hmm_data.get_default_bias_table_R_ATAC() + bias_table = BiasTable().load_table(table_file_name_F=table_forward, table_file_name_R=table_reverse) + + # do not use multi-processing + if args.nc == 1: + for i, condition in enumerate(conditions): + for j, mpbs_name in enumerate(mpbs_name_list): + mpbs_regions = mpbs.by_names([mpbs_name]) + arguments = (mpbs_regions, reads_files[i], args.organism, args.window_size, args.forward_shift, + args.reverse_shift, bias_table) + try: + signals[i, j, :] = get_bc_signal(arguments) + except Exception: + logging.exception("get bias corrected signal failed") + + # get motif length, number and pwm matrix + motif_len.append(mpbs_regions[0].final - mpbs_regions[0].initial) + motif_num.append(len(mpbs_regions)) + motif_pwm.append(get_pwm(fasta, mpbs_regions, args.window_size)) + + # use multi-processing + else: + pool = Pool(processes=args.nc) + for i, condition in enumerate(conditions): + arguments_list = list() + for mpbs_name in mpbs_name_list: + mpbs_regions = mpbs.by_names([mpbs_name]) + arguments = (mpbs_regions, reads_files[i], args.organism, args.window_size, args.forward_shift, + args.reverse_shift, bias_table) + arguments_list.append(arguments) + + # get motif length, number and pwm matrix + motif_len.append(mpbs_regions[0].final - mpbs_regions[0].initial) + motif_num.append(len(mpbs_regions)) + motif_pwm.append(get_pwm(fasta, mpbs_regions, args.window_size)) + + try: + res = pool.map(get_bc_signal, arguments_list) + signals[i] = np.array(res) + except Exception: + logging.exception("get bias corrected signal failed") - mpbs_regions = mpbs.by_names([mpbs_name]) - num_motif = len(mpbs_regions) + # differential analysis using raw signal + else: + # do not use multi-processing + if args.nc == 1: + for i, condition in enumerate(conditions): + for j, mpbs_name in enumerate(mpbs_name_list): + mpbs_regions = mpbs.by_names([mpbs_name]) + arguments = (mpbs_regions, reads_files[i], args.organism, args.window_size, args.forward_shift, + args.reverse_shift) + try: + signals[i, j, :] = get_raw_signal(arguments) + except Exception: + logging.exception("get bias corrected signal failed") + + # get motif length, number and pwm matrix + motif_len.append(mpbs_regions[0].final - mpbs_regions[0].initial) + motif_num.append(len(mpbs_regions)) + motif_pwm.append(get_pwm(fasta, mpbs_regions, args.window_size)) + + # use multi-processing + else: + pool = Pool(processes=args.nc) + for i, condition in enumerate(conditions): + arguments_list = list() + for mpbs_name in mpbs_name_list: + mpbs_regions = mpbs.by_names([mpbs_name]) + arguments = (mpbs_regions, reads_files[i], args.organism, args.window_size, args.forward_shift, + args.reverse_shift) + arguments_list.append(arguments) + + # get motif length, number and pwm matrix + motif_len.append(mpbs_regions[0].final - mpbs_regions[0].initial) + motif_num.append(len(mpbs_regions)) + motif_pwm.append(get_pwm(fasta, mpbs_regions, args.window_size)) + + try: + res = pool.map(get_raw_signal, arguments_list) + signals[i] = np.array(res) + except Exception: + logging.exception("get bias corrected signal failed") + + print("signal generation is done!") + + # compute normalization facotr for each condition + factors = compute_factors(signals) + output_factor(args, factors, conditions) + + # normalize signals by factor and number of motifs + for i in range(len(conditions)): + for j in range(len(mpbs_name_list)): + signals[i, j, :] = signals[i, j, :] / (factors[i] * motif_num[j]) + + if args.output_profiles: + output_profiles(mpbs_name_list, signals, conditions, args.output_location) + + print("generating line plot for each motif...\n") + if args.nc == 1: + for i, mpbs_name in enumerate(mpbs_name_list): + output_line_plot((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location, + args.window_size, colors)) + else: + pool = Pool(processes=args.nc) + arguments_list = list() + for i, mpbs_name in enumerate(mpbs_name_list): + arguments_list.append((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location, + args.window_size, colors)) + pool.map(output_line_plot, arguments_list) + + ps_tc_results = list() + for i, mpbs_name in enumerate(mpbs_name_list): + ps_tc_results.append(get_ps_tc_results(signals[:, i, :], motif_len[i], args.window_size)) + + # find the significant motifs and generate a scatter plot if two conditions are given + if len(conditions) == 2: + ps_tc_results = scatter_plot(args, ps_tc_results, mpbs_name_list, conditions) + + output_stat_results(ps_tc_results, conditions, mpbs_name_list, motif_num, args) - for region in mpbs_regions: - if motif_len is None: - motif_len = region.final - region.initial +def get_raw_signal(arguments): + (mpbs_region, reads_file, organism, window_size, forward_shift, reverse_shift) = arguments + + bam = Samfile(reads_file, "rb") + signal = np.zeros(window_size) + + for region in mpbs_region: mid = (region.final + region.initial) / 2 p1 = mid - window_size / 2 p2 = mid + window_size / 2 if p1 <= 0: continue - # Fetch raw signal - for read in bam1.fetch(region.chrom, p1, p2): - # check if the read is unmapped, according to issue #112 - if read.is_unmapped: - continue - - if not read.is_reverse: - cut_site = read.pos + forward_shift - if p1 <= cut_site < p2: - signal_1[cut_site - p1] += 1.0 - else: - cut_site = read.aend + reverse_shift - 1 - if p1 <= cut_site < p2: - signal_1[cut_site - p1] += 1.0 - - for read in bam2.fetch(region.chrom, p1, p2): + for read in bam.fetch(region.chrom, p1, p2): # check if the read is unmapped, according to issue #112 if read.is_unmapped: continue @@ -145,51 +266,23 @@ def get_raw_signal(arguments): if not read.is_reverse: cut_site = read.pos + forward_shift if p1 <= cut_site < p2: - signal_2[cut_site - p1] += 1.0 + signal[cut_site - p1] += 1.0 else: cut_site = read.aend + reverse_shift - 1 if p1 <= cut_site < p2: - signal_2[cut_site - p1] += 1.0 - - update_pwm(pwm, fasta, region, p1, p2) + signal[cut_site - p1] += 1.0 - return signal_1, signal_2, motif_len, pwm, num_motif + return signal def get_bc_signal(arguments): - (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism, - window_size, forward_shift, reverse_shift, bias_table1, bias_table2) = arguments - - mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") - mpbs1.read(mpbs_file1) - - mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") - mpbs2.read(mpbs_file2) - - mpbs = mpbs1.combine(mpbs2, output=True) - mpbs.sort() - - bam1 = Samfile(reads_file1, "rb") - bam2 = Samfile(reads_file2, "rb") + (mpbs_region, reads_file, organism, window_size, forward_shift, reverse_shift, bias_table) = arguments + bam = Samfile(reads_file, "rb") genome_data = GenomeData(organism) - fasta = Fastafile(genome_data.get_genome()) - - signal_1 = np.zeros(window_size) - signal_2 = np.zeros(window_size) - motif_len = None - pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size), - ("G", [0.0] * window_size), ("T", [0.0] * window_size), - ("N", [0.0] * window_size)]) - - mpbs_regions = mpbs.by_names([mpbs_name]) - num_motif = len(mpbs_regions) - + signal = np.zeros(window_size) # Fetch bias corrected signal - for region in mpbs_regions: - if motif_len is None: - motif_len = region.final - region.initial - + for region in mpbs_region: mid = (region.final + region.initial) / 2 p1 = mid - window_size / 2 p2 = mid + window_size / 2 @@ -197,130 +290,16 @@ def get_bc_signal(arguments): if p1 <= 0: continue # Fetch raw signal - signal1 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam1, - bias_table=bias_table1, genome_file_name=genome_data.get_genome(), - forward_shift=forward_shift, reverse_shift=reverse_shift) - - signal2 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam2, - bias_table=bias_table2, genome_file_name=genome_data.get_genome(), + _signal = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam, + bias_table=bias_table, genome_file_name=genome_data.get_genome(), forward_shift=forward_shift, reverse_shift=reverse_shift) - - if len(signal1) != len(signal_1) or len(signal2) != len(signal_2): + if len(_signal) != window_size: continue # smooth the signal - signal_1 = np.add(signal_1, np.array(signal1)) - signal_2 = np.add(signal_2, np.array(signal2)) - - update_pwm(pwm, fasta, region, p1, p2) - - return signal_1, signal_2, motif_len, pwm, num_motif - - -def diff_analysis_run(args): - # Initializing Error Handler - err = ErrorHandler() - - output_location = os.path.join(args.output_location, "Lineplots") - try: - if not os.path.isdir(output_location): - os.makedirs(output_location) - except Exception: - err.throw_error("MM_OUT_FOLDER_CREATION") - - # Check if the index file exists - base_name1 = "{}.bai".format(args.reads_file1) - if not os.path.exists(base_name1): - pysam.index(args.reads_file1) - - base_name2 = "{}.bai".format(args.reads_file2) - if not os.path.exists(base_name2): - pysam.index(args.reads_file2) - - mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") - mpbs1.read(args.mpbs_file1) - - mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") - mpbs2.read(args.mpbs_file2) - - mpbs = mpbs1.combine(mpbs2, output=True) - mpbs.sort() - mpbs.remove_duplicates() - mpbs_name_list = list(set(mpbs.get_names())) - - signal_dict_by_tf_1 = dict() - signal_dict_by_tf_2 = dict() - motif_len_dict = dict() - motif_num_dict = dict() - pwm_dict_by_tf = dict() - - pool = Pool(processes=args.nc) - # differential analysis using bias corrected signal - if args.bc: - hmm_data = HmmData() - table_F = hmm_data.get_default_bias_table_F_ATAC() - table_R = hmm_data.get_default_bias_table_R_ATAC() - bias_table1 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) - bias_table2 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) - - mpbs_list = list() - for mpbs_name in mpbs_name_list: - mpbs_list.append((mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2, - args.organism, args.window_size, args.forward_shift, args.reverse_shift, - bias_table1, bias_table2)) - try: - res = pool.map(get_bc_signal, mpbs_list) - except Exception: - logging.exception("get bias corrected signal failed") - - # differential analysis using raw signal - else: - mpbs_list = list() - for mpbs_name in mpbs_name_list: - mpbs_list.append((mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2, - args.organism, args.window_size, args.forward_shift, args.reverse_shift)) - try: - res = pool.map(get_raw_signal, mpbs_list) - except Exception: - logging.exception("get raw signal failed") + signal = np.add(signal, np.array(_signal)) - for idx, mpbs_name in enumerate(mpbs_name_list): - signal_dict_by_tf_1[mpbs_name] = res[idx][0] - signal_dict_by_tf_2[mpbs_name] = res[idx][1] - motif_len_dict[mpbs_name] = res[idx][2] - pwm_dict_by_tf[mpbs_name] = res[idx][3] - motif_num_dict[mpbs_name] = res[idx][4] - - if args.factor1 is None or args.factor2 is None: - args.factor1, args.factor2 = compute_factors(signal_dict_by_tf_1, signal_dict_by_tf_2) - output_factor(args, args.factor1, args.factor2) - - if args.output_profiles: - output_profiles(mpbs_name_list, signal_dict_by_tf_1, output_location, args.condition1) - output_profiles(mpbs_name_list, signal_dict_by_tf_2, output_location, args.condition2) - - ps_tc_results_by_tf = dict() - - plots_list = list() - for mpbs_name in mpbs_name_list: - plots_list.append((mpbs_name, motif_num_dict[mpbs_name], signal_dict_by_tf_1[mpbs_name], - signal_dict_by_tf_2[mpbs_name], args.factor1, args.factor2, args.condition1, - args.condition2, pwm_dict_by_tf[mpbs_name], output_location, args.window_size, - args.standardize)) - - pool.map(line_plot, plots_list) - - for mpbs_name in mpbs_name_list: - res = get_ps_tc_results(signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name], - args.factor1, args.factor2, motif_num_dict[mpbs_name], motif_len_dict[mpbs_name]) - # - # # only use the factors whose protection scores are greater than 0 - # if res[0] > 0 and res[1] < 0: - ps_tc_results_by_tf[mpbs_name] = res - # - # stat_results_by_tf = get_stat_results(ps_tc_results_by_tf) - ps_tc_results_by_tf = scatter_plot(args, ps_tc_results_by_tf) - output_stat_results(args, ps_tc_results_by_tf, motif_num_dict) + return signal def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift): @@ -339,7 +318,7 @@ def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forwar p2_w = p2 + (window / 2) p1_wk = p1_w - int(floor(k_nb / 2.)) p2_wk = p2_w + int(ceil(k_nb / 2.)) - if p1 <= 0 or p1_w <= 0 or p2_wk <= 0: + if p1 <= 0 or p1_w <= 0 or p1_wk <= 0 or p2_wk <= 0: # Return raw counts bc_signal = [0.0] * (p2 - p1) for read in bam.fetch(chrom, p1, p2): @@ -433,119 +412,85 @@ def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forwar return bc_signal -def get_ps_tc_results(signal_1, signal_2, factor1, factor2, num_motif, motif_len): - signal_1 = (signal_1 / factor1) / num_motif - signal_2 = (signal_2 / factor2) / num_motif - - # signal_1, signal_2 = standard(signal_1, signal_2) - - signal_half_len = len(signal_1) / 2 - - nc = sum(signal_1[signal_half_len - motif_len / 2:signal_half_len + motif_len / 2]) - nr = sum(signal_1[signal_half_len + motif_len / 2:signal_half_len + motif_len / 2 + motif_len]) - nl = sum(signal_1[signal_half_len - motif_len / 2 - motif_len:signal_half_len - motif_len / 2]) +def get_ps_tc_results(signals, motif_len, window_size): + signal_half_len = window_size / 2 + nc = np.sum(signals[:, signal_half_len - motif_len / 2:signal_half_len + motif_len / 2], axis=1) + nr = np.sum(signals[:, signal_half_len + motif_len / 2:signal_half_len + motif_len / 2 + motif_len], axis=1) + nl = np.sum(signals[:, signal_half_len - motif_len / 2 - motif_len:signal_half_len - motif_len / 2], axis=1) - protect_score1 = (nr - nc) / motif_len + (nl - nc) / motif_len - tc1 = (sum(signal_1) - nc) / (len(signal_1) - motif_len) + protect_scores = (nr - nc) / motif_len + (nl - nc) / motif_len - nc = sum(signal_2[signal_half_len - motif_len / 2:signal_half_len + motif_len / 2]) - nr = sum(signal_2[signal_half_len + motif_len / 2:signal_half_len + motif_len / 2 + motif_len]) - nl = sum(signal_2[signal_half_len - motif_len / 2 - motif_len:signal_half_len - motif_len / 2]) + tcs = (np.sum(signals, axis=1) - nc) / (window_size - motif_len) - protect_score2 = (nr - nc) / motif_len + (nl - nc) / motif_len - tc2 = (sum(signal_2) - nc) / (len(signal_2) - motif_len) + return [protect_scores, tcs] - protect_diff = protect_score2 - protect_score1 - tc_diff = tc2 - tc1 - return [protect_score1, protect_score2, protect_diff, tc1, tc2, tc_diff] +def get_pwm(fasta, regions, window_size): + pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size), + ("G", [0.0] * window_size), ("T", [0.0] * window_size), + ("N", [0.0] * window_size)]) -def update_pwm(pwm, fasta, region, p1, p2): - # Update pwm - aux_plus = 1 - dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper() - - if (region.final - region.initial) % 2 == 0: - aux_plus = 0 - - dna_seq_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(region.chrom, - p1 + aux_plus, p2 + aux_plus)).upper()) - if region.orientation == "+": - for i in range(0, len(dna_seq)): - pwm[dna_seq[i]][i] += 1 - - elif region.orientation == "-": - for i in range(0, len(dna_seq_rev)): - pwm[dna_seq_rev[i]][i] += 1 - - -def compute_factors(signal_dict_by_tf_1, signal_dict_by_tf_2): - keys = signal_dict_by_tf_1.keys() - - signal_1 = np.zeros(len(keys)) - signal_2 = np.zeros(len(keys)) + for region in regions: + middle = (region.initial + region.final) / 2 + p1 = middle - window_size / 2 + p2 = middle + window_size / 2 - for idx, key in enumerate(keys): - signal_1[idx] = sum(signal_dict_by_tf_1[key]) - signal_2[idx] = sum(signal_dict_by_tf_2[key]) + if p1 <= 0: + continue - # Take log - log_tc1 = np.log(signal_1) - log_tc2 = np.log(signal_2) + aux_plus = 1 + dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper() - # Average - average_log_tc = np.add(log_tc1, log_tc2) / 2 + if window_size % 2 == 0: + aux_plus = 0 - # Filter - filter_log_tc1 = log_tc1[~np.isnan(log_tc1)] - filter_log_tc2 = log_tc2[~np.isnan(log_tc2)] - filter_log_tc = average_log_tc[~np.isnan(average_log_tc)] + dna_seq_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(region.chrom, + p1 + aux_plus, p2 + aux_plus)).upper()) + if region.orientation == "+": + for i in range(len(dna_seq)): + pwm[dna_seq[i]][i] += 1 - # Subtract - sub_tc1 = np.subtract(filter_log_tc1, filter_log_tc) - sub_tc2 = np.subtract(filter_log_tc2, filter_log_tc) + elif region.orientation == "-": + for i in range(len(dna_seq_rev)): + pwm[dna_seq_rev[i]][i] += 1 - median_tc1 = np.median(sub_tc1) - median_tc2 = np.median(sub_tc2) + return pwm - factor1 = np.exp(median_tc1) - factor2 = np.exp(median_tc2) - return factor1, factor2 +def compute_factors(signals): + signals = np.sum(signals, axis=2) + signals_log = np.log(signals) + signals_log = signals_log[:, ~np.isnan(signals_log).any(axis=0)] + signals_log = signals_log - np.mean(signals_log, axis=0, keepdims=True) + factors = np.exp(np.median(signals_log, axis=1)) + return factors -def line_plot(arguments): - (mpbs_name, num_fp, signal_1, signal_2, factor1, factor2, condition1, condition2, - pwm_dict, output_location, window_size, standardize) = arguments - mpbs_name = mpbs_name.replace("(", "_") - mpbs_name = mpbs_name.replace(")", "") - mean_signal_1 = (signal_1 / num_fp) / factor1 - mean_signal_2 = (signal_2 / num_fp) / factor2 +def output_line_plot(arguments): + (mpbs_name, mpbs_num, signals, conditions, pwm, output_location, window_size, colors) = arguments + mpbs_name = mpbs_name.replace("(", "_").replace(")", "") # output signal - signal_fname = os.path.join(output_location, "{}.txt".format(mpbs_name)) - with open(signal_fname, "w") as f: - f.write(condition1 + "\t" + condition2 + "\n") + output_filename = os.path.join(output_location, "{}.txt".format(mpbs_name)) + with open(output_filename, "w") as f: + f.write("\t".join(conditions) + "\n") for i in range(window_size): - f.write(str(mean_signal_1[i]) + "\t" + str(mean_signal_2[i]) + "\n") - - if standardize: - mean_signal_1, mean_signal_2 = standard(mean_signal_1, mean_signal_2) - - # Output PWM and create logo - pwm_fname = os.path.join(output_location, "{}.pwm".format(mpbs_name)) - pwm_file = open(pwm_fname, "w") - for e in ["A", "C", "G", "T"]: - pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n") - pwm_file.close() - - logo_fname = os.path.join(output_location, "{}.logo.eps".format(mpbs_name)) - pwm = motifs.read(open(pwm_fname), "pfm") - pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line=str(window_size), - color_scheme="color_classic", unit_name="", show_errorbars=False, logo_title="", - show_xaxis=False, xaxis_label="", show_yaxis=False, yaxis_label="", - show_fineprint=False, show_ends=False) + res = [] + for j, condition in enumerate(conditions): + res.append(signals[j][i]) + + f.write("\t".join(map(str, res)) + "\n") + + # to create a motif loge, we only use A, C, G, T + pwm = {k: pwm[k] for k in ('A', 'C', 'G', 'T')} + pwm = pd.DataFrame(data=pwm) + pwm = pwm.add(1) + pwm_prob = (pwm.T / pwm.T.sum()).T + pwm_prob_log = np.log2(pwm_prob) + pwm_prob_log = pwm_prob_log.mul(pwm_prob) + info_content = pwm_prob_log.T.sum() + 2 + icm = pwm_prob.mul(info_content, axis=0) start = -(window_size / 2) end = (window_size / 2) - 1 @@ -553,9 +498,10 @@ def line_plot(arguments): plt.close('all') fig, ax = plt.subplots() - ax.plot(x, mean_signal_2, color='red', label=condition2) - ax.plot(x, mean_signal_1, color='blue', label=condition1) - ax.text(0.15, 0.9, 'n = {}'.format(num_fp), verticalalignment='bottom', horizontalalignment='right', + for i, condition in enumerate(conditions): + ax.plot(x, signals[i], color=colors[i], label=condition) + + ax.text(0.15, 0.9, 'n = {}'.format(mpbs_num), verticalalignment='bottom', horizontalalignment='right', transform=ax.transAxes, fontweight='bold') ax.xaxis.set_ticks_position('bottom') @@ -566,8 +512,8 @@ def line_plot(arguments): ax.tick_params(direction='out') ax.set_xticks([start, 0, end]) ax.set_xticklabels([str(start), 0, str(end)]) - min_signal = min(min(mean_signal_1), min(mean_signal_2)) - max_signal = max(max(mean_signal_1), max(mean_signal_2)) + min_signal = np.min(signals) + max_signal = np.max(signals) ax.set_yticks([min_signal, max_signal]) ax.set_yticklabels([str(round(min_signal, 2)), str(round(max_signal, 2))], rotation=90) @@ -577,41 +523,31 @@ def line_plot(arguments): ax.legend(loc="upper right", frameon=False) ax.spines['bottom'].set_position(('outward', 70)) - figure_name = os.path.join(output_location, "{}.line.eps".format(mpbs_name)) + ax = plt.axes([0.105, 0.085, 0.85, .2]) + logo = logomaker.Logo(icm, ax=ax, show_spines=False, baseline_width=0) + ax.set_xticks([]) + ax.set_yticks([]) fig.tight_layout() - fig.savefig(figure_name, format="eps", dpi=300) - - # Creating canvas and printing eps / pdf with merged results - output_fname = os.path.join(output_location, "{}.eps".format(mpbs_name)) - c = pyx.canvas.canvas() - c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0)) - c.insert(pyx.epsfile.epsfile(0.45, 0.8, logo_fname, width=16.5, height=3)) - c.writeEPSfile(output_fname) - os.system(" ".join(["epstopdf", output_fname])) + output_filename = os.path.join(output_location, "{}.pdf".format(mpbs_name)) + plt.savefig(output_filename) - os.remove(figure_name) - os.remove(logo_fname) - os.remove(output_fname) - os.remove(pwm_fname) +def scatter_plot(args, ps_tc_results, mpbs_name_list, conditions): + tf_activity_score1 = np.zeros(len(mpbs_name_list)) + tf_activity_score2 = np.zeros(len(mpbs_name_list)) -def scatter_plot(args, ps_tc_results_by_tf): - tf_activity_score = list() - mpbs_name_list = ps_tc_results_by_tf.keys() - - for mpbs_name in mpbs_name_list: - tf_activity_score.append(float(ps_tc_results_by_tf[mpbs_name][2]) + float(ps_tc_results_by_tf[mpbs_name][-1])) + for i, mpbs_name in enumerate(mpbs_name_list): + tf_activity_score1[i] = float(ps_tc_results[i][0][0]) + float(ps_tc_results[i][1][0]) + tf_activity_score2[i] = float(ps_tc_results[i][0][1]) + float(ps_tc_results[i][1][1]) - tf_activity_score = np.array(tf_activity_score) + tf_activity_score = np.subtract(tf_activity_score2, tf_activity_score1) z_score = zscore(tf_activity_score) p_values = norm.sf(abs(z_score)) * 2 # add TF activity score, z score and p values to the result dictionary for i, mpbs_name in enumerate(mpbs_name_list): - ps_tc_results_by_tf[mpbs_name].append(tf_activity_score[i]) - ps_tc_results_by_tf[mpbs_name].append(z_score[i]) - ps_tc_results_by_tf[mpbs_name].append(p_values[i]) + ps_tc_results[i].append([tf_activity_score[i], z_score[i], p_values[i]]) # plot TF activity score x_axis = np.random.uniform(low=-0.1, high=0.1, size=len(p_values)) @@ -626,57 +562,51 @@ def scatter_plot(args, ps_tc_results_by_tf): ax.margins(0.05) ax.set_xticks([]) - ax.set_ylabel("Activity Score \n {} $\longleftrightarrow$ {}".format(args.condition1, args.condition2), + ax.set_ylabel("Activity Score \n {} $\longleftrightarrow$ {}".format(conditions[0], conditions[1]), rotation=90, fontsize=20) - figure_name = os.path.join(args.output_location, "{}_{}_statistics.pdf".format(args.condition1, args.condition2)) + figure_name = os.path.join(args.output_location, "{}_statistics.pdf".format(args.output_prefix)) fig.savefig(figure_name, format="pdf", dpi=300) - return ps_tc_results_by_tf - - -def output_results(args, ps_tc_results_by_tf): - mpbs_name_list = ps_tc_results_by_tf.keys() - header = ["Motif", - "Protection_Score_{}".format(args.condition1), "Protection_Score_{}".format(args.condition2), - "Protection_Diff_{}_{}".format(args.condition1, args.condition2), - "TC_{}".format(args.condition1), "TC_{}".format(args.condition2), - "TC_Diff_{}_{}".format(args.condition1, args.condition2)] - output_fname = os.path.join(args.output_location, "{}_{}_results.txt".format(args.condition1, args.condition2)) - with open(output_fname, "w") as f: - f.write("\t".join(header) + "\n") - for mpbs_name in mpbs_name_list: - f.write(mpbs_name + "\t" + "\t".join(map(str, ps_tc_results_by_tf[mpbs_name])) + "\n") - - -def output_stat_results(args, stat_results_by_tf, motif_num_dict): - output_fname = os.path.join(args.output_location, "{}_{}_statistics.txt".format(args.condition1, args.condition2)) - header = ["Motif", "Num", - "Protection_Score_{}".format(args.condition1), "Protection_Score_{}".format(args.condition2), - "Protection_Diff_{}_{}".format(args.condition1, args.condition2), - "TC_{}".format(args.condition1), "TC_{}".format(args.condition2), - "TC_Diff_{}_{}".format(args.condition1, args.condition2), "TF_Activity", "Z_score", "P_values"] - with open(output_fname, "w") as f: - f.write("\t".join(header) + "\n") - for mpbs_name in stat_results_by_tf.keys(): - f.write(mpbs_name + "\t" + str(motif_num_dict[mpbs_name]) + "\t" + - "\t".join(map(str, stat_results_by_tf[mpbs_name])) + "\n") - - -def output_factor(args, factor1, factor2): - output_file = os.path.join(args.output_location, "{}_{}_factor.txt".format(args.condition1, args.condition2)) - f = open(output_file, "w") - f.write("Factor1: " + str(factor1) + "\n") - f.write("Factor2: " + str(factor2) + "\n") - f.close() - - -def output_mu(args, median_diff_prot, median_diff_tc): - output_file = os.path.join(args.output_location, "{}_{}_mu.txt".format(args.condition1, args.condition2)) - f = open(output_file, "w") - f.write("median_diff_prot: " + str(median_diff_prot) + "\n") - f.write("median_diff_tc: " + str(median_diff_tc) + "\n") - f.close() + return ps_tc_results + + +def output_stat_results(ps_tc_results, conditions, mpbs_name_list, motif_num, args): + output_filename = os.path.join(args.output_location, "{}_statistics.txt".format(args.output_prefix)) + + if len(conditions) == 2: + header = ["Motif", "Num", + "Protection_Score_{}".format(conditions[0]), "Protection_Score_{}".format(conditions[1]), + "TC_{}".format(conditions[0]), "TC_{}".format(conditions[1]), "TF_Activity", "Z_score", "P_values"] + + with open(output_filename, "w") as f: + f.write("\t".join(header) + "\n") + for i, mpbs_name in enumerate(mpbs_name_list): + f.write(mpbs_name + "\t" + str(motif_num[i]) + "\t" + + "\t".join(map(str, ps_tc_results[i][0])) + "\t" + + "\t".join(map(str, ps_tc_results[i][1])) + "\t" + + "\t".join(map(str, ps_tc_results[i][2])) + "\n") + + else: + header = ["Motif", "Num"] + for condition in conditions: + header.append("Protection_Score_{}".format(condition)) + for condition in conditions: + header.append("TC_{}".format(condition)) + + with open(output_filename, "w") as f: + f.write("\t".join(header) + "\n") + for i, mpbs_name in enumerate(mpbs_name_list): + f.write(mpbs_name + "\t" + str(motif_num[i]) + "\t" + + "\t".join(map(str, ps_tc_results[i][0])) + "\t" + + "\t".join(map(str, ps_tc_results[i][1])) + "\n") + + +def output_factor(args, factors, conditions): + output_file = os.path.join(args.output_location, "{}_factor.txt".format(args.output_prefix)) + with open(output_file, "w") as f: + f.write("\t".join(conditions) + "\n") + f.write("\t".join(map(str, factors)) + "\n") def standard(vector1, vector2): @@ -697,8 +627,9 @@ def adjust_p_values(p_values): return q[by_orig] -def output_profiles(mpbs_name_list, signal_dict_by_tf, output_location, condition): - for mpbs_name in mpbs_name_list: - output_fname = os.path.join(output_location, "{}_{}.txt".format(mpbs_name, condition)) - with open(output_fname, "w") as f: - f.write("\t".join(map(str, signal_dict_by_tf[mpbs_name])) + "\n") +def output_profiles(mpbs_name_list, signals, conditions, output_location): + for i, condition in enumerate(conditions): + for j, mpbs_name in enumerate(mpbs_name_list): + output_filename = os.path.join(output_location, "{}_{}.txt".format(condition, mpbs_name)) + with open(output_filename, "w") as f: + f.write("\t".join(map(str, signals[i][j])) + "\n") diff --git a/rgt/HINT/Main.py b/rgt/HINT/Main.py index 1788f464f..3ad588881 100644 --- a/rgt/HINT/Main.py +++ b/rgt/HINT/Main.py @@ -4,6 +4,11 @@ from random import seed from argparse import ArgumentParser +import warnings + +if not sys.warnoptions: + warnings.simplefilter("ignore") + # Internal from rgt import __version__ from rgt.HINT.Training import training_args, training_run diff --git a/rgt/HINT/Plotting.py b/rgt/HINT/Plotting.py index d326d8ad7..415b90888 100644 --- a/rgt/HINT/Plotting.py +++ b/rgt/HINT/Plotting.py @@ -47,6 +47,7 @@ def plotting_args(parser): parser.add_argument("--forward-shift", type=int, metavar="INT", default=5, help=SUPPRESS) parser.add_argument("--reverse-shift", type=int, metavar="INT", default=-5, help=SUPPRESS) parser.add_argument("--k-nb", type=int, metavar="INT", default=6, help=SUPPRESS) + parser.add_argument("--y-lim", type=float, metavar="FLOAT", default=0.3, help=SUPPRESS) # Output Options parser.add_argument("--output-location", type=str, metavar="PATH", default=os.getcwd(), @@ -130,7 +131,7 @@ def seq_logo(args): pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line=str(args.window_size), color_scheme="color_classic", unit_name="", show_errorbars=False, logo_title="", show_xaxis=False, xaxis_label="", show_yaxis=False, yaxis_label="", - show_fineprint=False, show_ends=False, yaxis_scale=0.3) + show_fineprint=False, show_ends=False, yaxis_scale=args.y_lim) start = -(args.window_size / 2) end = (args.window_size / 2) - 1 @@ -151,9 +152,9 @@ def seq_logo(args): ax.set_xticklabels(map(str, x1), rotation=90) ax.set_xlabel("Coordinates from Read Start", fontweight='bold') - ax.set_ylim([0, 0.3]) - ax.yaxis.set_ticks([0, 0.3]) - ax.set_yticklabels([str(0), str(0.3)], rotation=90) + ax.set_ylim([0, args.y_lim]) + ax.yaxis.set_ticks([0, args.y_lim]) + ax.set_yticklabels([str(0), str(args.y_lim)], rotation=90) ax.set_ylabel("bits", rotation=90) figure_name = os.path.join(args.output_location, "{}.line.eps".format(args.output_prefix)) @@ -164,7 +165,7 @@ def seq_logo(args): output_fname = os.path.join(args.output_location, "{}.eps".format(args.output_prefix)) c = pyx.canvas.canvas() c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0)) - c.insert(pyx.epsfile.epsfile(1.68, 1.5, logo_fname, width=18.8, height=3.5)) + c.insert(pyx.epsfile.epsfile(1.5, 1.5, logo_fname, width=18.8, height=3.5)) c.writeEPSfile(output_fname) os.system("epstopdf " + output_fname) @@ -1077,8 +1078,7 @@ def unstrand_line(args): if args.bias_table is not None: bias_table = BiasTable() bias_table_list = args.bias_table.split(",") - table = bias_table.load_table(table_file_name_F=bias_table_list[0], - table_file_name_R=bias_table_list[1]) + table = bias_table.load_table(table_file_name_F=bias_table_list[0], table_file_name_R=bias_table_list[1]) genome_data = GenomeData(args.organism) fasta = Fastafile(genome_data.get_genome()) @@ -1091,9 +1091,10 @@ def unstrand_line(args): mean_signal = np.zeros(args.window_size) pwm_dict = None - for region in mpbs_regions: - if str(region.name).split(":")[-1] == "Y": - # Extend by 50 bp + output_fname = os.path.join(args.output_location, "{}.txt".format(args.output_prefix)) + + with open(output_fname, "w") as output_f: + for region in mpbs_regions: mid = (region.initial + region.final) / 2 p1 = mid - (args.window_size / 2) p2 = mid + (args.window_size / 2) @@ -1112,12 +1113,16 @@ def unstrand_line(args): reverse_shift=args.reverse_shift, strand=False) + if region.orientation == "-": + signal = np.flip(signal) + + name = "{}_{}_{}".format(region.chrom, str(region.initial), str(region.final)) + output_f.write(name + "\t" + "\t".join(map(str, map(int, signal))) + "\n") num_sites += 1 mean_signal = np.add(mean_signal, signal) # Update pwm - if pwm_dict is None: pwm_dict = dict([("A", [0.0] * (p2 - p1)), ("C", [0.0] * (p2 - p1)), ("G", [0.0] * (p2 - p1)), ("T", [0.0] * (p2 - p1)), @@ -1127,8 +1132,8 @@ def unstrand_line(args): dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper() if (region.final - region.initial) % 2 == 0: aux_plus = 0 - dna_seq_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(region.chrom, - p1 + aux_plus, p2 + aux_plus)).upper()) + dna_seq_rev = AuxiliaryFunctions.revcomp( + str(fasta.fetch(region.chrom, p1 + aux_plus, p2 + aux_plus)).upper()) if region.orientation == "+": for i in range(0, len(dna_seq)): pwm_dict[dna_seq[i]][i] += 1 @@ -1138,20 +1143,6 @@ def unstrand_line(args): mean_signal = mean_signal / num_sites - mean_norm_signal = genomic_signal.boyle_norm(mean_signal) - perc = scoreatpercentile(mean_norm_signal, 98) - std = np.std(mean_norm_signal) - mean_norm_signal = genomic_signal.hon_norm_atac(mean_norm_signal, perc, std) - - mean_slope_signal = genomic_signal.slope(mean_norm_signal, genomic_signal.sg_coefs) - - # Output the norm and slope signal - output_fname = os.path.join(args.output_location, "{}.txt".format(args.output_prefix)) - f = open(output_fname, "w") - f.write("\t".join((map(str, mean_norm_signal))) + "\n") - f.write("\t".join((map(str, mean_slope_signal))) + "\n") - f.close() - # Output PWM and create logo pwm_fname = os.path.join(args.output_location, "{}.pwm".format(args.output_prefix)) pwm_file = open(pwm_fname, "w") @@ -1171,28 +1162,28 @@ def unstrand_line(args): x = np.linspace(start, end, num=args.window_size) fig = plt.figure(figsize=(8, 4)) - ax2 = fig.add_subplot(111) + ax = fig.add_subplot(111) min_signal = min(mean_signal) max_signal = max(mean_signal) - ax2.plot(x, mean_signal, color='red') - ax2.set_title(args.output_prefix, fontweight='bold') + ax.plot(x, mean_signal, color='red') + ax.set_title(args.output_prefix, fontweight='bold') - ax2.xaxis.set_ticks_position('bottom') - ax2.yaxis.set_ticks_position('left') - ax2.spines['top'].set_visible(False) - ax2.spines['right'].set_visible(False) - ax2.spines['left'].set_position(('outward', 15)) - ax2.tick_params(direction='out') - ax2.set_xticks([start, 0, end]) - ax2.set_xticklabels([str(start), 0, str(end)]) - ax2.set_yticks([min_signal, max_signal]) - ax2.set_yticklabels([str(round(min_signal, 2)), str(round(max_signal, 2))], rotation=90) - ax2.set_xlim(start, end) - ax2.set_ylim([min_signal, max_signal]) - ax2.legend(loc="upper right", frameon=False) + ax.xaxis.set_ticks_position('bottom') + ax.yaxis.set_ticks_position('left') + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['left'].set_position(('outward', 15)) + ax.tick_params(direction='out') + ax.set_xticks([start, 0, end]) + ax.set_xticklabels([str(start), 0, str(end)]) + ax.set_yticks([min_signal, max_signal]) + ax.set_yticklabels([str(round(min_signal, 2)), str(round(max_signal, 2))], rotation=90) + ax.set_xlim(start, end) + ax.set_ylim([min_signal, max_signal]) + ax.legend(loc="upper right", frameon=False) - ax2.spines['bottom'].set_position(('outward', 40)) + ax.spines['bottom'].set_position(('outward', 40)) figure_name = os.path.join(args.output_location, "{}.line.eps".format(args.output_prefix)) fig.subplots_adjust(bottom=.2, hspace=.5) @@ -1209,11 +1200,11 @@ def unstrand_line(args): os.system("epstopdf " + logo_fname) os.system("epstopdf " + output_fname) - # os.remove(pwm_fname) + os.remove(pwm_fname) os.remove(os.path.join(args.output_location, "{}.line.eps".format(args.output_prefix))) os.remove(os.path.join(args.output_location, "{}.logo.eps".format(args.output_prefix))) os.remove(os.path.join(args.output_location, "{}.line.pdf".format(args.output_prefix))) - os.remove(os.path.join(args.output_location, "{}.logo.pdf".format(args.output_prefix))) + # os.remove(os.path.join(args.output_location, "{}.logo.pdf".format(args.output_prefix))) os.remove(os.path.join(args.output_location, "{}.eps".format(args.output_prefix))) @@ -1550,153 +1541,6 @@ def __init__(self, organism, reads_file, motif_file, window_size, self.output_loc = output_loc self.output_prefix = output_prefix - def unstrand_line(self): - genomic_signal = GenomicSignal(self.reads_file) - genomic_signal.load_sg_coefs(slope_window_size=9) - - table = None - if self.bias_table is not None: - bias_table = BiasTable() - bias_table_list = self.bias_table.split(",") - table = bias_table.load_table(table_file_name_F=bias_table_list[0], - table_file_name_R=bias_table_list[1]) - - genome_data = GenomeData(self.organism) - fasta = Fastafile(genome_data.get_genome()) - - num_sites = 0 - mpbs_regions = GenomicRegionSet("Motif Predicted Binding Sites") - mpbs_regions.read(self.motif_file) - bam = Samfile(self.reads_file, "rb") - - mean_signal = np.zeros(self.window_size) - - pwm_dict = None - for region in mpbs_regions: - if str(region.name).split(":")[-1] == "Y": - # Extend by 50 bp - mid = (region.initial + region.final) / 2 - p1 = mid - (self.window_size / 2) - p2 = mid + (self.window_size / 2) - - if self.bias_table is not None: - signal = genomic_signal.get_bc_signal_by_fragment_length(ref=region.chrom, start=p1, end=p2, - bam=bam, fasta=fasta, - bias_table=table, - forward_shift=self.forward_shift, - reverse_shift=self.reverse_shift, - strand=False) - else: - signal = genomic_signal.get_raw_signal_by_fragment_length(ref=region.chrom, start=p1, end=p2, - bam=bam, - forward_shift=self.forward_shift, - reverse_shift=self.reverse_shift, - strand=False) - - num_sites += 1 - - mean_signal = np.add(mean_signal, signal) - - # Update pwm - - if pwm_dict is None: - pwm_dict = dict([("A", [0.0] * (p2 - p1)), ("C", [0.0] * (p2 - p1)), - ("G", [0.0] * (p2 - p1)), ("T", [0.0] * (p2 - p1)), - ("N", [0.0] * (p2 - p1))]) - - aux_plus = 1 - dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper() - if (region.final - region.initial) % 2 == 0: - aux_plus = 0 - dna_seq_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(region.chrom, - p1 + aux_plus, p2 + aux_plus)).upper()) - if region.orientation == "+": - for i in range(0, len(dna_seq)): - pwm_dict[dna_seq[i]][i] += 1 - elif region.orientation == "-": - for i in range(0, len(dna_seq_rev)): - pwm_dict[dna_seq_rev[i]][i] += 1 - - mean_signal = mean_signal / num_sites - - mean_norm_signal = genomic_signal.boyle_norm(mean_signal) - perc = scoreatpercentile(mean_norm_signal, 98) - std = np.std(mean_norm_signal) - mean_norm_signal = genomic_signal.hon_norm_atac(mean_norm_signal, perc, std) - - mean_slope_signal = genomic_signal.slope(mean_norm_signal, genomic_signal.sg_coefs) - - # Output the norm and slope signal - output_fname = os.path.join(self.output_loc, "{}.txt".format(self.output_prefix)) - f = open(output_fname, "w") - f.write("\t".join((map(str, mean_norm_signal))) + "\n") - f.write("\t".join((map(str, mean_slope_signal))) + "\n") - f.close() - - # Output PWM and create logo - pwm_fname = os.path.join(self.output_loc, "{}.pwm".format(self.output_prefix)) - pwm_file = open(pwm_fname, "w") - for e in ["A", "C", "G", "T"]: - pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n") - pwm_file.close() - - logo_fname = os.path.join(self.output_loc, "{}.logo.eps".format(self.output_prefix)) - pwm = motifs.read(open(pwm_fname), "pfm") - pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line=str(self.window_size), - color_scheme="color_classic", unit_name="", show_errorbars=False, logo_title="", - show_xaxis=False, xaxis_label="", show_yaxis=False, yaxis_label="", - show_fineprint=False, show_ends=False) - - start = -(self.window_size / 2) - end = (self.window_size / 2) - 1 - x = np.linspace(start, end, num=self.window_size) - - fig = plt.figure(figsize=(8, 4)) - ax2 = fig.add_subplot(111) - - min_signal = min(mean_signal) - max_signal = max(mean_signal) - ax2.plot(x, mean_signal, color='red') - ax2.set_title(self.output_prefix, fontweight='bold') - - ax2.xaxis.set_ticks_position('bottom') - ax2.yaxis.set_ticks_position('left') - ax2.spines['top'].set_visible(False) - ax2.spines['right'].set_visible(False) - ax2.spines['left'].set_position(('outward', 15)) - ax2.tick_params(direction='out') - ax2.set_xticks([start, 0, end]) - ax2.set_xticklabels([str(start), 0, str(end)]) - ax2.set_yticks([min_signal, max_signal]) - ax2.set_yticklabels([str(round(min_signal, 2)), str(round(max_signal, 2))], rotation=90) - ax2.set_xlim(start, end) - ax2.set_ylim([min_signal, max_signal]) - ax2.legend(loc="upper right", frameon=False) - - ax2.spines['bottom'].set_position(('outward', 40)) - - figure_name = os.path.join(self.output_loc, "{}.line.eps".format(self.output_prefix)) - fig.subplots_adjust(bottom=.2, hspace=.5) - fig.tight_layout() - fig.savefig(figure_name, format="eps", dpi=300) - - # Creating canvas and printing eps / pdf with merged results - output_fname = os.path.join(self.output_loc, "{}.eps".format(self.output_prefix)) - c = pyx.canvas.canvas() - c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0)) - c.insert(pyx.epsfile.epsfile(1.31, 0.89, logo_fname, width=18.5, height=1.75)) - c.writeEPSfile(output_fname) - os.system("epstopdf " + figure_name) - os.system("epstopdf " + logo_fname) - os.system("epstopdf " + output_fname) - - # os.remove(pwm_fname) - os.remove(os.path.join(self.output_loc, "{}.line.eps".format(self.output_prefix))) - os.remove(os.path.join(self.output_loc, "{}.logo.eps".format(self.output_prefix))) - os.remove(os.path.join(self.output_loc, "{}.line.pdf".format(self.output_prefix))) - os.remove(os.path.join(self.output_loc, "{}.logo.pdf".format(self.output_prefix))) - os.remove(os.path.join(self.output_loc, "{}.eps".format(self.output_prefix))) - def line3(self, bias_table1, bias_table2): signal = GenomicSignal(self.reads_file) signal.load_sg_coefs(slope_window_size=9) @@ -1865,91 +1709,6 @@ def line3(self, bias_table1, bias_table2): os.remove(os.path.join(self.output_loc, "{}.logo.pdf".format(self.output_prefix))) os.remove(os.path.join(self.output_loc, "{}.eps".format(self.output_prefix))) - def get_signal3(self, ref, start, end, bam, fasta, bias_table): - # Parameters - window = 50 - defaultKmerValue = 1.0 - - # Initialization - fBiasDict = bias_table[0] - rBiasDict = bias_table[1] - k_nb = len(fBiasDict.keys()[0]) - p1 = start - p2 = end - p1_w = p1 - (window / 2) - p2_w = p2 + (window / 2) - p1_wk = p1_w - int(k_nb / 2.) - p2_wk = p2_w + int(k_nb / 2.) - - currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper() - currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper()) - - # Iterating on sequence to create the bias signal - signal_bias_f = [] - signal_bias_r = [] - for i in range(int(k_nb / 2.), len(currStr) - int(k_nb / 2) + 1): - fseq = currStr[i - int(k_nb / 2.):i + int(k_nb / 2.)] - rseq = currRevComp[len(currStr) - int(k_nb / 2.) - i:len(currStr) + int(k_nb / 2.) - i] - try: - signal_bias_f.append(fBiasDict[fseq]) - except Exception: - signal_bias_f.append(defaultKmerValue) - try: - signal_bias_r.append(rBiasDict[rseq]) - except Exception: - signal_bias_r.append(defaultKmerValue) - - # Raw counts - signal_raw_f = [0.0] * (p2_w - p1_w) - signal_raw_r = [0.0] * (p2_w - p1_w) - for read in bam.fetch(ref, p1_w, p2_w): - if not read.is_reverse: - cut_site = read.pos + self.forward_shift - if p1_w <= cut_site < p2_w: - signal_raw_f[cut_site - p1_w] += 1.0 - else: - cut_site = read.aend + self.reverse_shift - 1 - if p1_w <= cut_site < p2_w: - signal_raw_r[cut_site - p1_w] += 1.0 - - # Smoothed counts - Nf = [] - Nr = [] - fSum = sum(signal_raw_f[:window]) - rSum = sum(signal_raw_r[:window]) - fLast = signal_raw_f[0] - rLast = signal_raw_r[0] - for i in range((window / 2), len(signal_raw_f) - (window / 2)): - Nf.append(fSum) - Nr.append(rSum) - fSum -= fLast - fSum += signal_raw_f[i + (window / 2)] - fLast = signal_raw_f[i - (window / 2) + 1] - rSum -= rLast - rSum += signal_raw_r[i + (window / 2)] - rLast = signal_raw_r[i - (window / 2) + 1] - - # Calculating bias and writing to wig file - fSum = sum(signal_bias_f[:window]) - rSum = sum(signal_bias_r[:window]) - fLast = signal_bias_f[0] - rLast = signal_bias_r[0] - signal_raw = [] - signal_bc = [] - for i in range((window / 2), len(signal_bias_f) - (window / 2)): - nhatf = Nf[i - (window / 2)] * (signal_bias_f[i] / fSum) - nhatr = Nr[i - (window / 2)] * (signal_bias_r[i] / rSum) - signal_raw.append(signal_raw_f[i] + signal_raw_r[i]) - signal_bc.append(nhatf + nhatr) - fSum -= fLast - fSum += signal_bias_f[i + (window / 2)] - fLast = signal_bias_f[i - (window / 2) + 1] - rSum -= rLast - rSum += signal_bias_r[i + (window / 2)] - rLast = signal_bias_r[i - (window / 2) + 1] - - return signal_raw, signal_bc - def line4(self): genome_data = GenomeData(self.organism) fasta = Fastafile(genome_data.get_genome()) diff --git a/rgt/__version__.py b/rgt/__version__.py index def467e07..76da4a988 100644 --- a/rgt/__version__.py +++ b/rgt/__version__.py @@ -1 +1 @@ -__version__ = "0.12.1" +__version__ = "0.12.2" diff --git a/setup.py b/setup.py index a2ae48e0e..1212bd197 100644 --- a/setup.py +++ b/setup.py @@ -106,13 +106,13 @@ def find_version(*file_paths): "motifanalysis": ( "rgt-motifanalysis", "rgt.motifanalysis.Main:main", - ["Biopython>=1.64", "fisher>=0.1.5", "moods-python>=1.9.3.1"], + ["Biopython>=1.64", "fisher>=0.1.5", "moods-python>=1.9.4.1"], ["data/bin/" + bin_dir + "/bedToBigBed", "data/bin/" + bin_dir + "/bigBedToBed"] ), "hint": ( "rgt-hint", "rgt.HINT.Main:main", - ["scikit-learn>=0.19.0", "hmmlearn>=0.2", "pyx" if p3_supported else "pyx==0.12.1"], + ["scikit-learn>=0.19.0", "hmmlearn>=0.2", "pandas", "logomaker"], [] ), "THOR": (