diff --git a/AFSDistFromSite.py b/AFSDistFromSite.py
new file mode 100644
index 0000000..d965672
--- /dev/null
+++ b/AFSDistFromSite.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+import sys
+
+msFile, targetPos, maxDistanceFromTarget = sys.argv[1:]
+targetPos = float(targetPos)
+maxDistanceFromTarget = float(maxDistanceFromTarget)
+
+def processSimulation(samples, positions, targetPos, maxDistanceFromTarget, afs):
+    """Add the '1'-allele counts of sites near targetPos to afs.
+
+    samples is a sequence of strings with one character per site (the
+    number of sites is taken from samples[0]) and positions[i] is the
+    position of site i.  Only sites less than maxDistanceFromTarget away
+    from targetPos are examined.  An examined site is counted only when
+    the characters seen at it are exactly '0' and '1'; afs[k] is then
+    incremented, k being the number of samples carrying '1'.  afs is
+    modified in place and nothing is returned.
+    """
+    for i in range(len(samples[0])):
+        if abs(targetPos - positions[i]) < maxDistanceFromTarget:
+            # dict.get() replaces dict.has_key(), which Python 3 removed.
+            alleleCounts = {}
+            for sample in samples:
+                alleleCounts[sample[i]] = alleleCounts.get(sample[i], 0) + 1
+            # Monomorphic sites and sites with any other character are skipped.
+            if sorted(alleleCounts) == ['0', '1']:
+                afs[alleleCounts['1']] += 1
+
+if msFile == "stdin":
+    isFile = False
+    msStream = sys.stdin
+else:
+    isFile = True
+    msStream = open(msFile)
+
+header = msStream.readline()
+program,numSamples,numSims = header.strip().split()[:3]
+numSamples, numSims = int(numSamples), int(numSims)
+
+processedSims = 0
+#advance to first simulation
+line = msStream.readline()
+while line.strip() != "//":
+    line = msStream.readline()
+afs = -1
+while line:
+    if line.strip() != "//":
+        sys.exit("Malformed ms-style output file: read '%s' instead of '//'. AAAARRRRGGHHH!!!!!\n" %(line.strip()))
+    segsitesBlah,segsites = msStream.readline().strip().split()
+    segsites = int(segsites)
+    if segsitesBlah != "segsites:":
+        sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n")
+
+    if segsites != 0:
+        positionsLine = msStream.readline().strip().split()
+        if not positionsLine[0] == "positions:":
+            sys.exit("Malformed ms-style output file. 
AAAARRRRGGHHH!!!!!\n") + positions = [float(x) for x in positionsLine[1:]] + + samples = [] + for i in range(numSamples): + sampleLine = msStream.readline().strip() + if len(sampleLine) != segsites: + sys.exit("Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples AAAARRRRGGHHH!!!!!\n" %(segsites, len(sampleLine), sampleLine, i, numSamples)) + samples.append(sampleLine) + if len(samples) != numSamples: + raise Exception + if afs == -1: + afs = [0]*len(samples) + processSimulation(samples, positions, targetPos, maxDistanceFromTarget, afs) + processedSims += 1 + line = msStream.readline() + #advance to the next non-empty line or EOF + while line and line.strip() == "": + line = msStream.readline() +#print afs[1:] +denom = float(sum(afs[1:])) +for i in range(1,len(afs)): + print "%i %le" %(i, afs[i]/denom) +if processedSims != numSims: + sys.exit("Malformed ms-style output file: %s of %s sims processed. AAAARRRRGGHHH!!!!!\n" %(processedSims, numSims)) + +if isFile: + msStream.close() diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3ed47c1 --- /dev/null +++ b/Makefile @@ -0,0 +1,98 @@ +CC = gcc +CFLAGS = -O3 -Wall -lm + +msMask: msMask.c + $(CC) msMask.c -o msMask $(CFLAGS) + +msMaskAllRows: msMaskAllRows.c + $(CC) msMaskAllRows.c -o msMaskAllRows $(CFLAGS) + +maskedStats: maskedStats.c msGeneralStats.c + $(CC) maskedStats.c msGeneralStats.c -o maskedStats $(CFLAGS) + +niceStatsDan: niceStatsDan.c msGeneralStats.c + $(CC) niceStatsDan.c msGeneralStats.c -g -o niceStatsDan $(CFLAGS) + +niceStatsSFSRegular: niceStatsSFSRegular.c msGeneralStats.c + $(CC) niceStatsSFSRegular.c msGeneralStats.c -g -o niceStatsSFSRegular $(CFLAGS) + +niceStats: niceStats.c msGeneralStats.c + $(CC) niceStats.c msGeneralStats.c -o niceStats $(CFLAGS) + +maskedStatsSubpop: maskedStatsSubpop.c msGeneralStats.c + $(CC) maskedStatsSubpop.c msGeneralStats.c -o maskedStatsSubpop $(CFLAGS) + +twoPopnNiceStats: twoPopnNiceStats.c 
msGeneralStats.c + $(CC) twoPopnNiceStats.c msGeneralStats.c -o twoPopnNiceStats $(CFLAGS) + +twoPopnStats_forML: twoPopnStats_forML.c msGeneralStats.c + $(CC) twoPopnStats_forML.c msGeneralStats.c -o twoPopnStats_forML $(CFLAGS) + +onePopnStats_forGhostIntroML: onePopnStats_forGhostIntroML.c msGeneralStats.c + $(CC) onePopnStats_forGhostIntroML.c msGeneralStats.c -o onePopnStats_forGhostIntroML $(CFLAGS) + +threePopnStats: threePopnStats.c msGeneralStats.c + $(CC) threePopnStats.c msGeneralStats.c -o threePopnStats $(CFLAGS) +msParams: msParams.c + $(CC) msParams.c ../coalLib/ranlibComplete.c ../pgLib/bedFile.c -o msParams $(CFLAGS) + +msParamsSubpop: msParamsSubpop.c + $(CC) msParamsSubpop.c ../coalLib/ranlibComplete.c ../pgLib/bedFile.c -o msParamsSubpop $(CFLAGS) + +msParamsSubpopNoAd: msParamsSubpopNoAd.c + $(CC) msParamsSubpopNoAd.c ../coalLib/ranlibComplete.c ../pgLib/bedFile.c -o msParamsSubpopNoAd $(CFLAGS) + +msParamsSubpopTrans: msParamsSubpopTrans.c + $(CC) msParamsSubpopTrans.c ../coalLib/ranlibComplete.c ../pgLib/bedFile.c -o msParamsSubpopTrans $(CFLAGS) + +msParamsTest: msParamsTest.c + $(CC) msParamsTest.c ../coalLib/ranlibComplete.c ../pgLib/bedFile.c -o msParamsTest $(CFLAGS) + +pairDist: pairDist.c msGeneralStats.c + $(CC) pairDist.c msGeneralStats.c -o pairDist $(CFLAGS) +pairwiseDists: pairwiseDists.c msGeneralStats.c + $(CC) pairwiseDists.c msGeneralStats.c -o pairwiseDists $(CFLAGS) + +pairwiseIBSTracts: pairwiseIBSTracts.c msGeneralStats.c + $(CC) pairwiseIBSTracts.c msGeneralStats.c -o pairwiseIBSTracts $(CFLAGS) + +msHKA: msHKA.c msGeneralStats.c + $(CC) msHKA.c msGeneralStats.c -o msHKA $(CFLAGS) + +ms2TwoSite: ms2TwoSite.c msGeneralStats.c + $(CC) ms2TwoSite.c msGeneralStats.c -o ms2TwoSite $(CFLAGS) + +ms2TwoSite2Popn: ms2TwoSite2Popn.c msGeneralStats.c + $(CC) ms2TwoSite2Popn.c msGeneralStats.c -o ms2TwoSite2Popn $(CFLAGS) + +ms2SFS2D: ms2SFS2D.c msGeneralStats.c + $(CC) ms2SFS2D.c msGeneralStats.c -o ms2SFS2D $(CFLAGS) 
+ms2SFS2D_discoal: ms2SFS2D_discoal.c msGeneralStats.c + $(CC) ms2SFS2D_discoal.c msGeneralStats.c -o ms2SFS2D_discoal $(CFLAGS) + +ms2SFSVector: ms2SFSVector.c msGeneralStats.c + $(CC) ms2SFSVector.c msGeneralStats.c -o ms2SFSVector $(CFLAGS) + +ms2SFSVectorWindow: ms2SFSVectorWindow.c msGeneralStats.c + $(CC) ms2SFSVectorWindow.c msGeneralStats.c -o ms2SFSVectorWindow $(CFLAGS) + +discoal_mig2hmm: discoal_mig2hmm.c msGeneralStats.c + $(CC) discoal_mig2hmm.c msGeneralStats.c -o discoal_mig2hmm $(CFLAGS) + +slideFST: slideFST.c msGeneralStats.c + $(CC) slideFST.c msGeneralStats.c -o slideFST $(CFLAGS) + +niceStatsNoOmega: niceStatsNoOmega.c msGeneralStats.c + $(CC) niceStatsNoOmega.c msGeneralStats.c -g -o niceStatsNoOmega $(CFLAGS) + +niceStatsShanku: niceStatsShanku.c msGeneralStats.c + $(CC) niceStatsShanku.c msGeneralStats.c -o niceStatsShanku $(CFLAGS) + +niceStatsAchazSystem: niceStatsAchazSystem.c msGeneralStats.c + $(CC) niceStatsAchazSystem.c msGeneralStats.c -O3 -o niceStatsAchazSystem $(CFLAGS) + +niceStatsDiploid: niceStatsDiploid.c msGeneralStats.c msDiploidStats.c + $(CC) niceStatsDiploid.c msGeneralStats.c msDiploidStats.c -o niceStatsDiploid $(CFLAGS) +niceStats4Gamete: niceStats4Gamete.c msGeneralStats.c + $(CC) niceStats4Gamete.c msGeneralStats.c -o niceStats4Gamete $(CFLAGS) + diff --git a/bitStuff.c b/bitStuff.c new file mode 100644 index 0000000..a5f4492 --- /dev/null +++ b/bitStuff.c @@ -0,0 +1,35 @@ +//bitStuff.c +// basic bit manipulations + +#include "bitStuff.h" + +int setBit(int x, unsigned char position) +{ + int mask = 1 << position; + return x | mask; +} + +int clearBit(int x, unsigned char position) +{ + int mask = 1 << position; + return x & ~mask; +} + +int modifyBit(int x, unsigned char position, bool newState) +{ + int mask = 1 << position; + int state = (int) newState; // relies on true = 1 and false = 0 + return (x & ~mask) | (-state & mask); +} + +int flipBit(int x, unsigned char position) +{ + int mask = 1 << position; + 
return x ^ mask;
+}
+
+bool isBitSet(int x, unsigned char position)
+ {
+    x >>= position;
+    return (x & 1) != 0;
+ }
\ No newline at end of file
diff --git a/bitStuff.h b/bitStuff.h
new file mode 100644
index 0000000..a99a380
--- /dev/null
+++ b/bitStuff.h
@@ -0,0 +1,9 @@
+#include
+#include
+#include
+
+int setBit(int x, unsigned char position);
+int clearBit(int x, unsigned char position);
+int modifyBit(int x, unsigned char position, bool newState);
+int flipBit(int x, unsigned char position);
+bool isBitSet(int x, unsigned char position);
diff --git a/combineMSFileDir.py b/combineMSFileDir.py
new file mode 100755
index 0000000..e5db8d9
--- /dev/null
+++ b/combineMSFileDir.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+import os, sys, gzip, random
+
+msFileDir, shuffle = sys.argv[1:]
+if not shuffle in ["shuffle", "no_shuffle"]:
+    sys.exit("shuffle must be set to either 'shuffle' or 'no_shuffle'. AAAARRRRGGGGHHHHHHHHHH!!!!\n")
+
+def readAllMSRepsFromFile(msFileName):
+    """Read every replicate from one ms-style output file.
+
+    Names ending in ".gz" are opened with gzip.open, all others with
+    open.  The header's second and third fields are parsed as integers;
+    the first of them is the sample count.  Each replicate string is a
+    newline followed by "//", then the stripped segsites line, the
+    stripped positions line and numSamples stripped sample lines, joined
+    by newlines.  Exits through sys.exit() when a replicate does not
+    start with "//" or its third line does not start with "positions:".
+
+    Returns (numSamples, list of replicate strings); the list is empty
+    when the file holds no line starting with "//".
+    """
+    if msFileName.endswith(".gz"):
+        fopen = gzip.open
+    else:
+        fopen = open
+    msStream = fopen(msFileName)
+    # try/finally closes the stream even when sys.exit() fires below.
+    try:
+        header = msStream.readline().strip().split()
+        program,numSamples,numSims = header[:3]
+        numSamples, numSims = int(numSamples),int(numSims)
+
+        #advance to first simulation; "line and" stops the scan at EOF,
+        #where readline() keeps returning "" and the loop never ended
+        line = msStream.readline()
+        while line and not line.strip().startswith("//"):
+            line = msStream.readline()
+        repLs = []
+        while line:
+            if not line.strip().startswith("//"):
+                sys.exit("Malformed ms-style output file: read '%s' instead of '//'. AAAARRRRGGHHH!!!!!\n" %(line.strip()))
+            repStr = ["\n//"]
+            repStr.append(msStream.readline().strip()) #segsites line
+            positionsLine = msStream.readline().strip()
+            if not positionsLine.startswith("positions:"):
+                sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n")
+            repStr.append(positionsLine) #positions line
+
+            for i in range(numSamples):
+                currLine = msStream.readline()
+                repStr.append(currLine.strip())
+            line = msStream.readline()
+            #advance to the next non-empty line or EOF
+            while line and line.strip() == "":
+                line = msStream.readline()
+            repStr = "\n".join(repStr)
+            repLs.append(repStr)
+    finally:
+        msStream.close()
+
+    return numSamples, repLs
+
+repLs = []
+allNumSamples = {}
+for msFileName in os.listdir(msFileDir):
+    sys.stderr.write("%s\n" %(msFileName))
+    currNumSamples, currRepLs = readAllMSRepsFromFile(msFileDir + "/" + msFileName)
+    allNumSamples[currNumSamples] = 1
+    repLs += currRepLs
+assert len(allNumSamples) == 1
+print "./msStyle %s %s\nblah\n" %(currNumSamples, len(repLs))
+if shuffle == "shuffle":
+    random.shuffle(repLs)
+print "\n".join(repLs)
diff --git a/combineMSFileSubsets.py b/combineMSFileSubsets.py
new file mode 100755
index 0000000..9f9ea48
--- /dev/null
+++ b/combineMSFileSubsets.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+import sys, gzip, random
+
+msFileName1, numReps1, msFileName2, numReps2, shuffle = sys.argv[1:]
+numReps1, numReps2 = int(numReps1), int(numReps2)
+if not shuffle in ["shuffle", "no_shuffle"]:
+    sys.exit("shuffle must be set to either 'shuffle' or 'no_shuffle'. AAAARRRRGGGGHHHHHHHHHH!!!!\n")
+
+def readAllMSRepsFromFile(msFileName):
+    """Read every replicate from one ms-style output file.
+
+    The header's second and third fields are parsed as integers; the
+    first of them is the sample count.  Each replicate string is a
+    newline followed by "//", then the stripped segsites line, the
+    stripped positions line and numSamples stripped sample lines, joined
+    by newlines.  Exits through sys.exit() when a replicate separator is
+    not exactly "//" or its third line does not start with "positions:".
+
+    Returns (numSamples, list of replicate strings); the list is empty
+    when the file holds no "//" line.
+    """
+    msStream = open(msFileName)
+    # try/finally closes the stream even when sys.exit() fires below.
+    try:
+        header = msStream.readline().strip().split()
+        program,numSamples,numSims = header[:3]
+        numSamples, numSims = int(numSamples),int(numSims)
+
+        #advance to first simulation; "line and" stops the scan at EOF,
+        #where readline() keeps returning "" and the loop never ended
+        line = msStream.readline()
+        while line and line.strip() != "//":
+            line = msStream.readline()
+        repLs = []
+        while line:
+            if line.strip() != "//":
+                sys.exit("Malformed ms-style output file: read '%s' instead of '//'. AAAARRRRGGHHH!!!!!\n" %(line.strip()))
+            repStr = ["\n//"]
+            repStr.append(msStream.readline().strip()) #segsites line
+            positionsLine = msStream.readline().strip()
+            if not positionsLine.startswith("positions:"):
+                sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n")
+            repStr.append(positionsLine) #positions line
+
+            for i in range(numSamples):
+                currLine = msStream.readline()
+                repStr.append(currLine.strip())
+            line = msStream.readline()
+            #advance to the next non-empty line or EOF
+            while line and line.strip() == "":
+                line = msStream.readline()
+            repStr = "\n".join(repStr)
+            repLs.append(repStr)
+    finally:
+        msStream.close()
+
+    return numSamples, repLs
+
+numSamples1, repLs1 = readAllMSRepsFromFile(msFileName1)
+numSamples2, repLs2 = readAllMSRepsFromFile(msFileName2)
+if numSamples1 != numSamples2:
+    sys.exit("sample size differs between %s (%s) and %s (%s). AAAARRRRGGGGHHHHHHHH!\n" %(msFileName1, numSamples1, msFileName2, numSamples2))
+print "./msStyle %s %s\nblah\n" %(numSamples1, numReps1+numReps2)
+if shuffle == "shuffle":
+    random.shuffle(repLs1)
+    random.shuffle(repLs2)
+print "\n".join(repLs1[:numReps1])
+print "\n".join(repLs2[:numReps2])
diff --git a/discoal_mig2hmm.c b/discoal_mig2hmm.c
new file mode 100644
index 0000000..60f2588
--- /dev/null
+++ b/discoal_mig2hmm.c
@@ -0,0 +1,112 @@
+/******* ms2TwoSite2Popn.c ********
+converts ms output to list of all
+pairwise two-site comparisons for 2 popns. 
+The AFS in a two site 2 popn setting is a 6D +Matrix with the following entries +x = {p1, p2, x11, p3, p4, y11} +where p1 and p2 represent the frequency of +derived allele at locus one and locus two in popn 1, +X11 is the number of p1p2 haps in popn 1, +p3 and p4 are respective freqs in popn2, +and y11 is the number of p3,p4 haps in popn2 + +Seven column output: p1 p2 x11 p3 p4 y11 dist +********************************/ + +#include +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); +void printSFS2D(int segsites, int nsam, int n1, int n2, char **list, int *v); + + +int maxsites = 100000 ; + int nsites; +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit, rho, mig ; + int segsites, count , nadv, npops,n1, n2, *positSites; + + double ss, t; + char dum[20], astr[100] ; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d %d -t %lf -r %lf -p %d %d %d %s", dum, &nsam, &howmany, &nsites, &t, &rho ,&npops,&n1,&n2, astr); +// printf("dum: %s, theta: %lf npops: %d n1: %d rho: %f nsites: %d\n",dum,ss,npops,n1,rho,nsites); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + positSites = (int *)malloc(nsites*sizeof(int)); + count=0; + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i 3: + otherParams = " " + " ".join(header[3:]) + else: + otherParams = "" + 
numSamples, numSims = int(numSamples),int(numSims) + + #advance to first simulation + line = msStream.readline() + while line.strip() != "//": + line = msStream.readline() + repLs = [] + while line: + if line.strip() != "//": + sys.exit("Malformed ms-style output file: read '%s' instead of '//'. AAAARRRRGGHHH!!!!!\n" %(line.strip())) + repStr = ["\n//"] + repStr.append(msStream.readline().strip()) #segsites line + positionsLine = msStream.readline().strip() + if not positionsLine.startswith("positions:"): + sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n") + repStr.append(positionsLine) #positions line + + for i in range(numSamples): + currLine = msStream.readline() + repStr.append(currLine.strip()) + line = msStream.readline() + #advance to the next non-empty line or EOF + while line and line.strip() == "": + line = msStream.readline() + repStr = "\n".join(repStr) + repLs.append(repStr) + msStream.close() + + return numSamples, repLs + +numSamples, repLs = readAllMSRepsFromFile(msFileName) +if header == "header": + if headerNumReps == "actual": + print "./msStyle %s %s\nblah" %(numSamples, numReps) + else: + print "./msStyle %s %s\nblah" %(numSamples, headerNumReps) + +if numReps > 0: + random.shuffle(repLs) + print "\n".join(repLs[:numReps]) diff --git a/getMSRepSubrange.py b/getMSRepSubrange.py new file mode 100755 index 0000000..28293fc --- /dev/null +++ b/getMSRepSubrange.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +import sys, gzip + +#example: python ~/kerncode/msUtils/getMSRepSubrange.py test.msout 1 2 | less +msFileName, firstRepNumber, lastRepNumber = sys.argv[1:] #firstRepNumber and lastRepNumber are zero-based incidces of the first and last reps we want to print +firstRepNumber, lastRepNumber = int(firstRepNumber), int(lastRepNumber) +if lastRepNumber < firstRepNumber: + sys.exit("lastRepNumber must be >= firstRepNumber. 
AAAAARRRRRGGGGGGGHHHHHH!!!\n") + +if msFileName.endswith(".gz"): + msStream = gzip.open(msFileName) +elif msFileName == "stdin": + msStream = sys.stdin +else: + msStream = open(msFileName) + +header = msStream.readline().strip().split() +program,numSamples,numSims = header[:3] +if len(header) > 3: + otherParams = " " + " ".join(header[3:]) +else: + otherParams = "" +numSamples,numSims = int(numSamples),int(numSims) +sys.stdout.write("./msStyle %s %s%s\nblah" %(numSamples, lastRepNumber-firstRepNumber+1, otherParams)) + +processedSims = 0 +#advance to first simulation +line = msStream.readline() +while not line.strip().startswith("//"): + line = msStream.readline() +while line: + if not line.strip().startswith("//"): + sys.exit("Malformed ms-style output file: read '%s' instead of '//'. AAAARRRRGGHHH!!!!!\n" %(line.strip())) + repStr = "\n\n//\n" + repStr += msStream.readline() #segsites line + positionsLine = msStream.readline() + if not positionsLine.startswith("positions:"): + sys.exit("Malformed ms-style output file. 
AAAARRRRGGHHH!!!!!\n")
+    repStr += positionsLine #positions line
+
+    for i in range(numSamples):
+        currLine = msStream.readline()
+        repStr += currLine
+    if processedSims >= firstRepNumber:
+        sys.stdout.write(repStr.rstrip())
+    if processedSims == lastRepNumber:
+        break
+    processedSims += 1
+    line = msStream.readline()
+    #advance to the next non-empty line or EOF
+    while line and line.strip() == "":
+        #if processedSims > firstRepNumber:
+        #    print line.strip()
+        line = msStream.readline()
+
+if msFileName != "stdin":
+    msStream.close()
diff --git a/infiniteSitesToFinite.py b/infiniteSitesToFinite.py
new file mode 100644
index 0000000..a7440b5
--- /dev/null
+++ b/infiniteSitesToFinite.py
@@ -0,0 +1,60 @@
+import sys, gzip
+import numpy as np
+from msTools import *
+
+def main():
+    """Convert the positions of an ms-style file to integer sites.
+
+    Reads msOutFileName, physLen and outFileName from sys.argv, maps the
+    positions of every "positions:" line through
+    msPositionsToIntegerPositions() and writes one row per such line,
+    either as a boolean matrix under key 'm' (.npz) or as tab-separated
+    integers (gzip-compressed when outFileName ends with .gz).
+    """
+    usageStr="""usage:
+python infiniteSitesToFinite.py msOutFileName physLen outFileName
+
+This script takes simulation results in ms-style format (whose path is the first argument: msOutFileName) and converts the positions from continuous values into descrete chromosomal locations ranging from [0, physLen-1], where physLen is the second argument. The script then emits the positions of polymorphisms, with one line for each rep in the simulation file. Output is written to outFileName (third argument). If outFileName ends with .npz, a matrix of zeros and ones is saved in npz format, associated with the key 'm'. The length of each row in the matrix is equal to physLen, and positions that are polymorphic are denoted with 1, and those that are monomorphic are 0, with one row for each simulation replicate. If outFileName ends with .gz then for each simulation rep a tab-separated list of integer locations of polymorphisms is written on a separate line, and this file is compressed in gzip format. Otherwise, the output is the same as for .gz but is not compressed. The number of polymorphisms in each rep must be <= physLen, if you know what's good for you!
+"""
+
+    if len(sys.argv) != 4:
+        sys.exit(usageStr)
+
+    msOutFileName, physLen, outFileName = sys.argv[1:]
+    physLen = int(physLen)
+    # NOTE(review): gzip.open() defaults to binary mode; on Python 3 the
+    # str handling below would need "rt"/"wt" - confirm the interpreter.
+    if msOutFileName.endswith(".gz"):
+        fopen = gzip.open
+    else:
+        fopen = open
+
+    positionsTag = "positions:"
+    newPositionLists = []
+    with fopen(msOutFileName) as msOutFile:
+        for line in msOutFile:
+            if line.startswith(positionsTag):
+                # Slice the tag off: str.lstrip() takes a set of
+                # characters, not a prefix, and would also eat the start
+                # of a value that directly follows the colon.
+                positions = [float(x) for x in line[len(positionsTag):].split()]
+                newPositions = msPositionsToIntegerPositions(positions, physLen)
+                newPositionLists.append(newPositions)
+
+    if outFileName.endswith(".npz"):
+        m = np.zeros((len(newPositionLists), physLen),dtype="bool")
+        for i in range(len(newPositionLists)):
+            for j in newPositionLists[i]:
+                m[i, j] = 1
+        np.savez(outFileName, m=m)
+    else:
+        if outFileName.endswith(".gz"):
+            fopen = gzip.open
+        else:
+            fopen = open
+        with fopen(outFileName, "w") as outFile:
+            for newPositions in newPositionLists:
+                outFile.write("\t".join([str(x) for x in newPositions]) + "\n")
+
+if __name__ == "__main__":
+    main()
diff --git a/maskedStats.c b/maskedStats.c
new file mode 100644
index 0000000..a30724f
--- /dev/null
+++ b/maskedStats.c
@@ -0,0 +1,88 @@
+/******* maskedStats.c ********
+for calculating sample stats from MS output
+after it has been filtered by msMask
+********************************/
+
+#include
+#include
+#include
+#include
+#include "msGeneralStats.h"
+
+#define LINEBUF 1000000
+
+
+
+void usage();
+
+
+
+int maxsites = 100000 ;
+
+int main(argc,argv)
+    int argc;
+char *argv[];
+{
+    int nsam, i, howmany ;
+    char **list, **cmatrix(), line[LINEBUF+1] ;
+    FILE *fopen(), *pfin ;
+    double *posit ;
+    int segsites, count , nadv, iss,h ;
+    double pi , th, z;
+    char dum[20], astr[100] ;
+
+
+
+/* read in first two lines of output (parameters and seed) */
+    pfin = stdin ;
+    fgets( line, LINEBUF, pfin);
+    sscanf(line," %s %d %d", dum, &nsam, &howmany);
+    fgets( line, LINEBUF, pfin);
+
+    if( argc > 1 ) {
+        nadv = atoi( argv[1] ) ;
+    }
+
+    list = cmatrix(nsam,maxsites+1);
+    posit 
= (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 +int maxsites = 100000 ; +void usage(); + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count , n1, n2, iss,h ; + double pi , th, z, f, snn,dxy, dxy_min, dxy_mean, H, tajD; + char dum[20], astr[100] ; + + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + n1 = atoi( argv[1] ) ; + n2 = atoi( argv[2] ) ; + } + else{ + usage(); + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); +void printSFS2D(int segsites, int nsam, int n1, int n2, char **list); + + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ; + char **list, **cmatrix(), 
line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit, rho, mig ; + int segsites, count , nadv, npops,n1, n2; + int nsites; + double ss; + char dum[20], astr[100] ; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d %s %lf -r %lf %d -I %d %d %d %lf", dum, &nsam, &howmany, astr, &ss, &rho, &nsites,&npops,&n1,&n2, &mig); +// printf("dum: %s, theta: %lf npops: %d n1: %d rho: %f nsites: %d\n",dum,ss,npops,n1,rho,nsites); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + count=0; + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); +void printSFS2D(int segsites, int nsam, int n1, int n2, char **list); +void tallySFS2D(int segsites, int nsam, int n1, int n2, char **list, int **jSFS); + + +int maxsites = 100000 ; +int sum=0; +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i,j, howmany, **jSFS; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit, theta ; + int segsites, count , nadv, npops,n1, n2; + int nsites; + char dum[20], astr[100] ; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d %d -t %lf -p %d %d %d %s", dum, &nsam, &howmany, &nsites, &theta, &npops,&n1,&n2, astr); +// printf("dum: %s, theta: %lf npops: %d n1: %d rho: %f nsites: %d\n",dum,ss,npops,n1,rho,nsites); 
+ fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + jSFS = imatrix(n1+1,n2+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + count=0; + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); +int maxSampleSize = 5000; + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit, rho, mig ; + int segsites, count , nadv, npops,n1, n2; + int nsites, derived_counts[maxSampleSize]; + double ss; + char dum[20], astr[100] ; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d %s", dum, &nsam, &howmany, astr); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + count=0; + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); +int maxSampleSize = 5000; + +int maxsites = 
100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit, rho, mig ; + int segsites, count , npops,n1, n2; + double high, low; + int nsites, derived_counts[maxSampleSize], totalSNPs; + double ss, totalSFS[maxSampleSize]; + char dum[20], astr[100] ; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d %s", dum, &nsam, &howmany, astr); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + low = atof( argv[1] ) ; + high = atof( argv[2] ) ; + + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + count=0; + + for(i=0;i= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); + + + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit, rho ; + int segsites, count , nadv, ss, nsites; + char dum[20], astr[100] ; + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + //sscanf(line," %s %d %d", dum, &nsam, &howmany); + sscanf(line," %s %d %d %s %d -r %lf %d", dum, &nsam, &howmany, astr, &ss, &rho, &nsites); + + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + count=0; + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= 
maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); + + + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit, rho, mig ; + int segsites, count , nadv, npops,n1, n2; + int nsites; + double ss; + char dum[20], astr[100] ; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d %s %lf -r %lf %d -I %d %d %d %lf", dum, &nsam, &howmany, astr, &ss, &rho, &nsites,&npops,&n1,&n2, &mig); +// printf("dum: %s, theta: %lf npops: %d n1: %d rho: %f nsites: %d\n",dum,ss,npops,n1,rho,nsites); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + count=0; + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" +/* allocates space for gametes (character strings) */ + + +void cmatrix_free(char **m, int nsam) +{ + int i; + for(i=0;i 1){ + nnm1 = nd/(nd-1.0) ; + p1 = frequency('1', s,nsam,list)/nd ; + pi += 2.0*p1*(1.0 -p1)*nnm1 ; + } + } + return( pi ) ; +} + +double maxFDA( int nsam, int segsites, char **list) +{ + int s, frequency( char, int, int, char**); + double mfda, p1, nd; + + mfda 
= 0.0 ; + + + for( s = 0; s 1){ + p1 = frequency('1', s,nsam,list)/nd ; + if (p1 < 1.0 && p1 > mfda){ + mfda = p1; + } + } + } + return( mfda ) ; +} + +//fills a vector size l with values of nucdiv in "windows" +void nucdivWindow( int nwins, double *posit, double *output, int nsam, int segsites, char **list) +{ + int s, frequency( char, int, int, char**); + int wcount = 0; + double pi, p1, nd, nnm1 ; + double start, end, delta; + start = 0; + delta = 1.0 / nwins; + end = delta; + + while(start < 1.0){ + pi = 0.0 ; + for( s = 0; s start){ + nd = sampleSizeSite(s,nsam,list); + if (nd > 1){ + nnm1 = nd/(nd-1.0) ; + p1 = frequency('1', s,nsam,list)/nd ; + pi += 2.0*p1*(1.0 -p1)*nnm1 ; + } + } + } + output[wcount++]=pi; + start += delta; + end += delta; + } +} + + +//fills a vector size l with values of Tajima's D in "windows" +void tajdWindow(int nwins, double *posit, double *output, int nsam, int segsites, char **list) +{ + int s, frequency( char, int, int, char**); + int wcount = 0; + double pi, p1, nd, nnm1 ; + double start, end, delta; + int segsitesInWin; + start = 0; + delta = 1.0 / nwins; + end = delta; + + while(start < 1.0){ + pi = 0.0 ; + segsitesInWin = 0; + for( s = 0; s start){ + nd = sampleSizeSite(s,nsam,list); + if (nd > 1){ + segsitesInWin += 1; + nnm1 = nd/(nd-1.0) ; + p1 = frequency('1', s,nsam,list)/nd ; + pi += 2.0*p1*(1.0 -p1)*nnm1 ; + } + } + } + output[wcount++]=tajd(nsam,segsitesInWin,pi); + start += delta; + end += delta; + } +} + + +double achazThetaExponentWeights(int nsam, int segsites, char **list, int exponent) +{ + int s, frequency( char, int, int, char**); + double thetaA, i, wi, nd, nnm1, wSum ; + + wSum = 0.0 ; + for ( i = 1; i < nsam; i++){ + wi = pow(i,exponent); + wSum += wi; + } + + thetaA = 0.0 ; + for( s = 0; s 1){ + nnm1 = nd/(nd-1.0); + i = frequency('1', s,nsam,list); + wi = pow(i,exponent); + thetaA += wi*i; + } + } + + thetaA = thetaA/wSum; + return thetaA; +} + +double achazThetaParabolicWeights(int nsam, int segsites, 
char **list, int exponent,double center) +{ + int s, frequency( char, int, int, char**); + double thetaA, i, wi, nd, nnm1, wSum ; + + wSum = 0.0 ; + for ( i = 1; i < nsam; i++){ + wi = pow((center - i),exponent); + wSum += wi; + } + + thetaA = 0.0 ; + for( s = 0; s 1){ + nnm1 = nd/(nd-1.0); + i = frequency('1', s,nsam,list); + wi = pow((center - i), (double) exponent); + thetaA += wi*i; + } + } + + thetaA = thetaA/wSum; + return thetaA; +} + +//pi with a twist of theta H +double achazThetaHPi(int nsam, int segsites, char **list) +{ + int s, frequency( char, int, int, char**); + double thetaA, i, wi, nd, nnm1, wSum ; + + wSum = 0.0 ; + for ( i = 1; i < nsam; i++){ + wi = i*(nsam-i); + wSum += wi; + } + + thetaA = 0.0 ; + for( s = 0; s 1){ + nnm1 = nd/(nd-1.0); + i = frequency('1', s,nsam,list); + wi = 1/(i*i); //i^2 is tw4 + thetaA += wi*i; + } + } + + thetaA = thetaA/wSum; + return thetaA; +} + +//pi minus a sumary stat that is highest when your SFS is U-shaped +double achazTajimasDExtreme(int nsam, int segsites, char **list) +{ + int s, frequency( char, int, int, char**); + double pi, upsideDownPi, i, wi1, wi2, nd, nnm1, wSum1, wSum2 ; + + wSum1 = 0.0 ; + wSum2 = 0.0; + for ( i = 1; i < nsam; i++){ + wi1 = (nsam-i); + wi2 = pow(((nsam/2.0)-i),2)/i; + wSum1 += wi1; + wSum2 += wi2; + } + + pi = 0.0 ; + upsideDownPi = 0.0; + for( s = 0; s 1){ + nnm1 = nd/(nd-1.0); + i = frequency('1', s,nsam,list); + wi1 = (nsam-i); + wi2 = pow(((nsam/2.0)-i),2)/i; + pi += wi1*i; + upsideDownPi += wi2*i; + } + } + + pi = pi/wSum1; + upsideDownPi = upsideDownPi/wSum2; + return pi-upsideDownPi; +} + +double sigmaAlpha(int n) +{ + int i; + int sum = 0; + for (i=1; i 1){ + i = frequency('1', s,nsam,list); + afs[i] += 1; + } + } + + //calculate neutrality test statistic + Tnum = 0.0 ; + betaNpart1 = 0.0; + betaNpart2 = 0.0; + alphaN = 0.0; + for ( i = 1; i < nsam; i++){ + + Tnum += omega[i]*(double)(i*afs[i]); + + alphaN += i*omega[i]*omega[i]; + betaNpart1 += 
(double)(i*i)*omega[i]*omega[i]*sigma_ii_FU1995(i,harmonicSums,nsam); + for (j=i+1; j 1){ + pi += (p1*p1)/( nd*(nd-1.0) ) ; + } + } + return(pi*2.0) ; +} + + +int frequency( char allele,int site,int nsam, char **list){ + int i, count=0; + for( i=0; i 0) && (frequency('0', i, nsam, list) >0 )) ? 1:0); + } + return(ss); +} + +//gets the derived site frequency spectrum; fixations treated as monomorphic +//derived_counts must be an int array of length nsam +//derived_counts[i] is the fraction of sites with derived allele present in i chromosomes +//number of monomorphic sites is sorted in derived_counts[0] +void getSiteFreqSpec(int segsites, int nsam, char**list, int nSites, int *derived_counts) +{ + int i; + int freq; + + for (i=0; i 0 && freq < nsam) + { + polycount++; + derived_counts[freq] += 1; + } + } + derived_counts[0] = nSites-polycount; +} + +//gets the derived site frequency spectrum; fixations treated as monomorphic +//derived_counts must be an int array of length nsam +//derived_counts[i] is the fraction of sites with derived allele present in i chromosomes +//number of monomorphic sites is sorted in derived_counts[0] +void getSiteFreqSpecWindow(int segsites, int nsam, char**list, int nSites, int *derived_counts, double *pos, double low, double high) +{ + int i; + int freq; + + for (i=0; ilow && pos[i] <= high) + { + freq = frequency('1', i, nsam, list); + if (freq > 0 && freq < nsam) + { + polycount++; + derived_counts[freq] += 1; + } + } + } + derived_counts[0] = nSites-polycount; +} + +//counts the number of haplotypes, and gets their frequencies (stored in haplotype_counts) +//haplotype_counts must be an int array of length nsam +//haplotype_counts[i] is the number of haplotypes found in exactly i+1 individuals +int getHaplotypeFreqSpec(int segsites, int nsam, char **list, int *haplotype_counts) +{ + int i; + int j; + int k; + int haplotype_found; + int allsame; + int freq; + + int n_haplotypes = 0; + char haplotypes[nsam][segsites+1]; + int 
haplotype_occurrences[nsam]; + + for(i=0; i 0 && freq <= nsam) + { + haplotype_counts[freq-1] += 1; + } + } + + return n_haplotypes; +} + +double petrovH1(int *haplotype_counts, int nsam) +{ + int hapFreq; + double pi; + double h1 = 0.0; + + for (hapFreq=nsam; hapFreq>0; hapFreq--) + { + pi = hapFreq/ (double)nsam; + h1 += haplotype_counts[hapFreq-1]*pi*pi; + } + return h1; +} + +double petrovH2(int *haplotype_counts, int nsam) +{ + int hapFreq; + double pi; + double h2 = 0.0; + int first = 1; + + for (hapFreq=nsam; hapFreq>0; hapFreq--) + { + pi = hapFreq/ (double)nsam; + if (haplotype_counts[hapFreq-1] > 0) + { + if (first) + { + first = 0; + h2 += (haplotype_counts[hapFreq-1]-1)*pi*pi; + } + else + { + h2 += haplotype_counts[hapFreq-1]*pi*pi; + } + } + } + return h2; +} + +double petrovH12(int *haplotype_counts, int nsam) +{ + int hapFreq, i; + double pi; + double part1 = 0.0; + double part2 = 0.0; + int totalAdded = 0; + + for (hapFreq=nsam; hapFreq>0; hapFreq--) + { + pi = hapFreq/ (double)nsam; + for (i = 0;i < haplotype_counts[hapFreq-1];i++) + { + if (totalAdded < 2) + { + part1 += pi; + } + else + { + part2 += pi*pi; + + } + totalAdded++; + } + } + + part1 = part1*part1; + return part1+part2; +} + +//gets H12, H1, and H2 in windows +void petrovHStatsWindow(int segsites, int nwins, double *posit, double *winsH12, double *winsH1, double *winsH2, int nsam, char **list) +{ + int i; + int j; + int k; + int haplotype_found, freq; + int allsame; + float start, delta, end; + int n_haplotypes = 0; + char haplotypes[nsam][segsites+1]; + int haplotype_occurrences[nsam]; + int haplotype_counts[nsam]; + int wcount = 0; + start = 0; + delta = 1.0 / nwins; + end = delta; + + while(start < 1.0) + { + for(i=0; i start && posit[k] <= end) + { + if(haplotypes[j][k] != list[i][k]) + { + if(haplotypes[j][k] == 'N' ) + haplotypes[j][k] = list[i][k]; + else if(list[i][k] != 'N') + { + allsame = 0; + break; + } + } + } + } + if(allsame) + { + haplotype_found = 1; + 
haplotype_occurrences[j]+=1; + break; + } + } + if(!haplotype_found) + { + n_haplotypes++; + for(j=0; j 0 && freq <= nsam) + { + haplotype_counts[freq-1] += 1; + } + } + winsH12[wcount]=petrovH12(haplotype_counts,nsam); + winsH1[wcount]=petrovH1(haplotype_counts,nsam); + winsH2[wcount]=petrovH2(haplotype_counts,nsam); + wcount++; + start += delta; + end += delta; + n_haplotypes = 0; + } +} + +double meanEHH(int segsites, double *posit, double delta, int nsam, char **list) +{ + int i, j, k, l; + int allsame; + float start, end; + int sameCount; + int totalCount; + float ehhSum = 0; + float ehhCount = 0; + for(i=0;i= 0 && end <= 1) + { + for(j=0; j= start && posit[l] <= end && l != i) + { + if (list[j][l] != list[k][l]) + { + allsame = 0; + break; + } + } + } + if (allsame) + { + sameCount++; + } + totalCount++; + } + } + } + } + if (totalCount > 0) + { + ehhSum += sameCount/(float)totalCount; + } + ehhCount += 1; + } + } + return ehhSum/ehhCount; +} + +double meanREHH(int segsites, double *posit, double delta, int nsam, char **list) +{ + int i, j, k, l; + int allsame; + float start, end; + int sameCountDer, sameCountAnc; + int totalCountDer, totalCountAnc; + float ehhSum = 0; + float ehhCount = 0; + for(i=0;i= 0 && end <= 1) + { + for(j=0; j= start && posit[l] <= end && l != i) + { + if (list[j][l] != list[k][l]) + { + allsame = 0; + break; + } + } + } + if (allsame) + { + sameCountDer++; + } + totalCountDer++; + } + } + } + else if (list[j][i] == '0') + { + for(k=0;k= start && posit[l] <= end && l != i) + { + if (list[j][l] != list[k][l]) + { + allsame = 0; + break; + } + } + } + if (allsame) + { + sameCountAnc++; + } + totalCountAnc++; + } + } + } + } + if (totalCountAnc > 0 && totalCountDer > 0 && sameCountAnc > 0) + { + ehhSum += (sameCountDer/(float)totalCountDer) / (sameCountAnc/(float)totalCountAnc); + } + ehhCount += 1; + } + } + return ehhSum/ehhCount; +} + +//counts number of haplotypes +int nHaplotypes(int segsites, int nsam, char **list) +{ + int i; + 
int j; + int k; + int haplotype_found; + int allsame; + + int n_haplotypes = 0; + char haplotypes[nsam][segsites+1]; + + for(i=0; i maxVal) + { + maxVal = currVal; + } + } + sStarArray[i] = maxVal; + if (maxVal > globalMaxVal) + { + globalMaxVal = maxVal; + } + } + return globalMaxVal; +} + +double dij(int i, int j, int nsam, char** list){ + + double pi = 0.0; + double pj = 0.0; + double pij = 0.0; + double count = 0.0; + int k; + for(k=0; k= left)sum += comp; + if(i < left && j < left) sumL += comp; + if(i >= left && j >= left) sumR += comp; + } + } + denom = sum * (1.0/(left*(s-left))); + numer = 1.0 / ((left*(left-1)/2) + ((s-left)*(s-left-1)/2)); + numer *= (sumL+sumR); + //printf("n/d: %f d: %f n %f sumL: %f sumR: %f term: %f left: %d\n",numer/denom,denom,numer,sumL,sumR,1.0 / ((left*(left-1)/2) + ((s-left)*(s-left-1)/2)),left); + return(numer/denom); +} + +/*omega statistic from Kim and Nielsen (2003) +** not robust to missing data +*/ +double omega(int left, int segsites,int nsam, char** list){ + int i,j, s; + double sum,sumL,sumR,comp,denom; + double numer; + + sum = sumL = sumR =comp=denom=0; + if(segsites < 3) + return 0.0; + s = segSites(segsites, nsam, list); + //calculate: + // sum for denom-- all pairwise r2 + // sumL and sumR for numerator -- blockwise r2 + for(i=0; i= left)sum += comp; + if(i < left && j < left) sumL += comp; + if(i >= left && j >= left) sumR += comp; + } + } + denom = sum * (1.0/(left*(s-left))); + numer = 1.0 / ((left*(left-1)/2) + ((s-left)*(s-left-1)/2)); + numer *= (sumL+sumR); + return(numer/denom); +} + +/*Get omega at known fixation position. I.E. 
center of simulation*/ +double omegaCenter(int siteIdx , int segsites,int nsam, char** list){ + int i,j, s; + double sum,sumL,sumR,comp,denom; + double numer; + + sum = sumL = sumR =comp=denom=0; + if(segsites < 3) + return 0.0; + s = segSites(segsites, nsam, list); + //calculate: + // sum for denom-- all pairwise r2 + // sumL and sumR for numerator -- blockwise r2 + for(i=0; i= siteIdx)sum += comp; + if(i < siteIdx && j < siteIdx) sumL += comp; + if(i >= siteIdx && j >= siteIdx) sumR += comp; + } + } + denom = sum * (1.0/(siteIdx*(s-siteIdx))); + numer = 1.0 / ((siteIdx*(siteIdx-1)/2) + ((s-siteIdx)*(s-siteIdx-1)/2)); + numer *= (sumL+sumR); +// printf("d: %f n %f sumL: %f sumR: %f term: %f left: %d\n",denom,numer,sumL,sumR,1.0 / ((left*(left-1)/2) + ((s-left)*(s-left-1)/2)),left); + if (isnan(denom)){ + return 0.0; + } + else{ + return(numer/denom); + } +} + + +/*omegaMax -- goes through all possible site divisions to maximize omega +// Kim and Nielsen (2003) +// This version builds a table of all pairwise r^2 vals for faster downstream computation +*/ +double omegaMax(int segsites,int nsam, char** list){ + int l, i, j; + double max= 0; + double tmp=0; + double **dijTable; + + dijTable = (double **)malloc( sizeof(double)*segsites ); + + if(segsites < 3) + return(0); + for(i=0; i max){ + max = tmp; + } + } + + for(i=0; i < segsites-1; i++){ + free(dijTable[i]); + } + free(dijTable); + + return(max); +} + +///////////////////////// +//Two Site Utils +////// +//// +// +//sampleConfig-- fills vector with the sample configuration +// for sites i and j +void sampleConfig(int i, int j, int nsam, char** list, int *config){ + int p1, p2, x11, k; + p1 = p2 = x11 = 0; + + for(k=0; k p2){ + config[0] = p1; + config[1] = p2; + } + else{ + config[0] = p2; + config[1] = p1; + } + config[2] = x11; +} + +void printPairwiseSampleConfigs(int segsites, int nsam, char **list, double *posit, int nsites){ + int i,j, config[3]; + + if(segsites < 2){ + return; + } + else{ + for(i=0; i 
p2){ + config[0] = p1; + config[1] = p2; +// } +// else{ +// config[0] = p2; +// config[1] = p1; +// } + config[2] = x11; +// if(p3 > p4){ + config[3] = p3; + config[4] = p4; +// } +// else{ +// config[3] = p4; +// config[4] = p3; +// } + config[5] = y11; +} + +void printPairwiseSampleConfigs2Popn(int segsites, int nsam, int popnSize1, char **list, double *posit, int nsites){ + int i,j, config[6]; + + if(segsites < 2){ + return; + } + else{ + for(i=0; i0) return(1); + else return(0); +} + +/**************** +/// Sub popn versions +******************/ + +//frequencySub-- allows for arbitrary allele indexes as you would need for sub pops +int frequencySub(char allele, int site, int startAllele, int stopAllele, char **list){ + int i, count=0; + for( i=startAllele; i 0) && (frequencySub('0', i, startAllele, stopAllele, list) >0 )) ? 1:0); + } + return(ss); +} + +//gets the right number of segSites when there are Ns +void privateSegSitesInTwoPopns(int segsites, int nsam, int stopAllele, int *private1, int *private2, char **list){ + int i, isSeg1, isSeg2; + (*private1) = 0; + (*private2) = 0; + + for(i=0; i < segsites; i++){ + isSeg1 = 0; + isSeg2 = 0; + if ((frequencySub('1', i, 0, stopAllele, list) > 0) && (frequencySub('0', i, 0, stopAllele, list) > 0)) + isSeg1 = 1; + if ((frequencySub('1', i, stopAllele, nsam, list) > 0) && (frequencySub('0', i, stopAllele, nsam, list) > 0)) + isSeg2 = 1; + if (isSeg1 && !isSeg2) + (*private1)++; + else if (isSeg2 && !isSeg1) + (*private2)++; + } +} + + +//sampleSizeSiteSub -- returns the sampleSize at a site corrected for missing data +//in startAllele to stopAllele rows +int sampleSizeSiteSub(int site, int nsam, int startAllele, int stopAllele, char **list){ + return(stopAllele - startAllele - frequencySub('N',site,startAllele, stopAllele,list)); +} + +double *hetVec1Popn(int segsites, int n1, int physLen, int *vecLen, double *posit, char **list) +{ + (*vecLen) = 0; + int i, j, k; + double diffs; + double *vec; + + vec = (double 
*) malloc(sizeof(double) *n1*(n1-1)/2); + for(i=0; i= g1Size) + { + //printf("indeed we must (%d >= %d)! Looking for a suitable parter starting with %d which is %d\n", membership1[i], g1Size, j, membership2[j]); + while (membership2[j] >= g1Size) + { + j++; + //printf("nope! moving on to %d which is %d\n", j, membership2[j]); + } + if (j >= g2Size) fprintf(stderr, "reorderListIntoClusters has a bug!\n"), exit(-1); + //printf("okay we have a winner! (%d, which is %d)\n", j, membership2[j]); + tmp = list[membership1[i]]; + list[membership1[i]] = list[membership2[j]]; + list[membership2[j]] = tmp; + j++; + } + } +} + +int breakClusterAssignmentTie(double **hetMatrix, int targIndex, int *membership1, int m1Index, int *membership2, int m2Index){ + double d1 = 0., d2 = 0.; + int i; + for (i=0; i 0.5) + { + return 1; + } + else + { + return 2; + } + }*/ + //else if (d1 > d2) + if (d1 > d2) + { + return 2; + } + else + { + return 1; + } +} + +//returns 1 if we have to swap the resulting clusters (in order to make the first cluster the larger one +//which we assume to be the case later one). 
+int assignClusters(double *hetVec, int n1, int *g1Size, int *membership1, int *g2Size, int *membership2){ + int i, j, minI, maxPairI = -1, maxPairJ = -1, pairIndex = 0, m1Index = 0, m2Index = 0; + double minHet, maxHet = -1; + double **hetMatrix; + //printf("n1: %d\n", n1); + hetMatrix = (double **) malloc( (unsigned)( n1*sizeof( double* ))); + for(i=0; i maxHet) + { + //printf("maxHet: %f\n", maxHet); + maxHet = hetVec[pairIndex]; + maxPairI = i; + maxPairJ = j; + } + pairIndex++; + } + } + membership1[m1Index++] = maxPairI; + membership2[m2Index++] = maxPairJ; + //printf("%d: 1\n", maxPairI); + //printf("%d: 2\n", maxPairJ); + + for(i=0; i hetMatrix[i][maxPairJ] - 1e-9) + { + //printf("%d: 2, %f, %f\n", i, hetMatrix[i][maxPairI], hetMatrix[i][maxPairJ]); + membership2[m2Index++] = i; + } + else if (hetMatrix[i][maxPairI] < hetMatrix[i][maxPairJ] - 1e-9 || breakClusterAssignmentTie(hetMatrix, i, membership1, m1Index, membership2, m2Index) == 1) + { + //printf("%d: 1, %f, %f\n", i, hetMatrix[i][maxPairI], hetMatrix[i][maxPairJ]); + membership1[m1Index++] = i; + } + else + { + //printf("%d: 2, %f, %f\n", i, hetMatrix[i][maxPairI], hetMatrix[i][maxPairJ]); + membership2[m2Index++] = i; + } + } + } + if (m1Index == 1 && m2Index > 1) + { + minHet = 10e10; + //printf("not enough samples in group 1, going to take one from group 2\n"); + for (i=0; i 1) + { + minHet = 10e10; + //printf("not enough samples in group 2, going to take one from group 1\n"); + for (i=0; i 1){ + nnm1 = nd/(nd-1.0) ; + p1 = frequencySub('1', s,startAllele,stopAllele,list)/nd ; + pi += 2.0*p1*(1.0 -p1)*nnm1 ; + } + } + return( pi ) ; +} + + +//fills a vector size l with values of nucdiv in "windows". 
for subpops +void nucdivSubWindow( int nwins, double *posit, double *output, int nsam, int segsites,int startAllele, int stopAllele, char **list) +{ + int s, frequency( char, int, int, char**); + int wcount = 0; + double pi, p1, nd, nnm1 ; + double start, end, delta; + start = 0; + delta = 1.0 / nwins; + end = delta; + + while(start < 1.0){ + pi = 0.0 ; + for( s = 0; s start){ + nd = sampleSizeSiteSub(s,nsam,startAllele,stopAllele,list); + if (nd > 1){ + nnm1 = nd/(nd-1.0) ; + p1 = frequencySub('1', s,startAllele,stopAllele,list)/nd ; + pi += 2.0*p1*(1.0 -p1)*nnm1 ; + } + } + } + output[wcount++]=pi; + start += delta; + end += delta; + } +} + + +void fst2SubsWindow(int nwins, double *posit, double *output,int segsites, int nsam, int start1, int stop1, int start2, int stop2, char **list){ + double h1[nwins], h2[nwins], hTot[nwins], hW; + + int i; + + nucdivSubWindow(nwins,posit,h1,nsam, segsites, start1, stop1, list); + nucdivSubWindow(nwins,posit,h2,nsam, segsites,start2,stop2,list); + nucdivWindow(nwins,posit,hTot,nsam, segsites,list); + + for(i=0;i 1 && p1 != (stopAllele-startAllele)){ + pi += (p1*p1)/( nd*(nd-1.0) ) ; + } + } + return(pi*2.0) ; +} + +//counts number of haplotypes +int nHaplotypesSub(int segsites, int nsam, int startAllele, int stopAllele, char **list) +{ + int i; + int j; + int k; + int haplotype_found; + int allsame; + + int n_haplotypes = 0; + char haplotypes[ stopAllele - startAllele][segsites]; + + for(i=startAllele; i= 0.0){ + sum += tmpDist; + denom++; + } + } + } + return(sum / (float) denom); +} + +double Dxy_mean(int segsites,int nsam, int n1, int n2, char **list){ + int i,j,tmp; + double sum = 0.0; + double ncomps = 0.0; + double tmpDist; + tmp=n1+n2; + + for(i=0;i= 0.0){ + sum += tmpDist; + ncomps++; + } + } + } + return(sum /ncomps); +} + +//Dxy_min statistic used in Garrigan's Gmin +double Dxy_min(int segsites,int nsam, int n1, int n2, char **list){ + int i,j,tmp; + double min = 666666666.0; + double tmpVal; + + tmp=n1+n2; + 
for(i=0;i= 0.0) min = tmpVal; + } + } + return(min); +} + +//pairwiseDistRankAmongSampleRange takes a number of pairwise differences and a range of rows in the alignment +//and returns the number of pairs of samples within this range that have a lesser or equal number of differences +double pairwiseDistRankAmongSampleRange(int segsites, int diffs, int firstSample, int numSamples, double *hetVar, char **list){ + int i, j, currDiffs, leCount, numComps, sumDiffs; + double numer, meanDiffs; + double *hetLs; + sumDiffs = leCount = numComps = 0; + (*hetVar)=0.0; + + hetLs = (double *)malloc(sizeof(double)*(numSamples*(numSamples-1)/2)); + for(i=firstSample; i= 0.0){ + if (currDiffs <= diffs){ + leCount += 1; + } + hetLs[numComps]=currDiffs; + numComps += 1; + sumDiffs += currDiffs; + } + } + } + + numer = 0.0; + meanDiffs = sumDiffs/(float)numComps; + for(i=0; i yy) return 1; + return 0; +} + + +//pairwiseDistances-- fills an array of doubles with all pairwise dists per site +void pairwiseDistances(int segsites,int nsam, double *dists, char **list){ + int i,j, count; + + count = 0; + for(i=0;i= 0.0){ + if( tmp< minWith){ + minCountW = 1; + minWith = tmp; + } + if( tmp == minWith) + minCountW += 1; + } + } + } + //get between min and count + for(i = n1; i < nsam; i++){ + tmp = seqDist_Snn(segsites,seqIndex1,i,list); + if (tmp >= 0.0){ + if( tmp< minBet){ + minCountB = 1; + minBet = tmp; + } + if( tmp == minBet) + minCountB += 1; + } + } + } + else{ + //get within min and count + for(i = n1; i < nsam; i++){ + if(i != seqIndex1){ + tmp = seqDist_Snn(segsites,seqIndex1,i,list); + if (tmp >= 0.0){ + if( tmp< minWith){ + minCountW = 1; + minWith = tmp; + } + if( tmp == minWith) + minCountW += 1; + } + } + } + //get between min and count + for(i = 0; i < n1; i++){ + tmp = seqDist_Snn(segsites,seqIndex1,i,list); + if (tmp >= 0.0){ + if( tmp< minBet){ + minCountB = 1; + minBet = tmp; + } + if( tmp == minBet) + minCountB += 1; + } + } + } + + if(minWith < minBet){ + return(1.0); + 
} + if(minWith == minBet){ + return(minCountW / (double) (minCountW+minCountB)); + } + return(0); + +} + +//seqDist_Snn is the metric for Snn; note that a lot of functions use this thing +//and it might not behave the way they want when there are Ns in the alignment. +double seqDist_Snn(int segsites, int index1, int index2, char **list){ + + int i; + double compareCount, nCount; + double count = 0.0; + char c1, c2; + + compareCount = 0.0; + nCount = 0.0; + + for(i = 0; i < segsites; i++){ + c1 = list[index1][i]; + c2 = list[index2][i]; + if(c1 == 'N' || c1 == 'N'){ + nCount += 1; //uncertainty about states? + } + else{ + if(c1 != c2) + count += 1; + } + compareCount +=1; + } + //arbitrary coverage requirement? disabled for now + if (nCount / compareCount > 1.0){ + //return large negative number + return(-666.0); + } + return(count); +} + +//meanRefDist-- calculates the mean dist of all alleles to a ref--sequence 1 +double meanRefDist(int segsites, int nsam, char **list){ + int i; + double sum = 0.0; + double denom = 0.0; + double tmp; + + for(i = 1; i < nsam; i++){ + tmp = seqDist_Snn(segsites,0,i,list); + if (tmp >= 0.0){ + sum += tmp; + denom++; + } + } + return(sum/denom); +} + + +//Pairwise IBS stuff +double *pairwiseIBSVec1Popn(int segsites, int n1, int *vecLen, double *posit, char **list){ + double tmpLen,start; + (*vecLen)=0; + int i, j, k; + double *vec; + + vec = (double *) malloc(sizeof(double) * (segsites+1)*n1*(n1-1)/2); + for(i=0; i max){ + max = tmpLen; + } + start = posit[k]; + } + } + tmpLen = 1.0-start; + //printf("%d %d %d %f\n", i, j, k, tmpLen); + if(tmpLen > max){ + max = tmpLen; + } + } + } + return max; +} + +double pairwiseIBSMeanWithin(int segsites,int first, int last, double *posit, char **list){ + int i, j,k; + double tmpLenSum,start; + float comp =0.0; + tmpLenSum =0.0; + for( i=first; i +#include +#include +#include + + +void printSFS(int segsites, int nsam, double *posit, int nsites, char **list); +void getParameters(int argc, char 
*argv[]); +void usage(); + +int maxsites = 1000 ; +int sites; + + +main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, j ,nsites, i, howmany ; + char **list, line[1001] ; + FILE *fopen(), *pfin ; + double *posit,prob ; + int segsites, count,probflag , nadv ; + char dum[20], astr[100] ; + int segsub( int nsam, int segsites, char **list ) ; + +//read in args + getParameters(argc, argv); + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, 1000, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, 1000, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + probflag = 0 ; + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, 1000, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + if( astr[1] == 'r' ){ + fscanf(pfin," %lf", &prob ) ; + probflag = 1; + fscanf(pfin," %*s"); + } + for( i=0; i 0) && (frequencySub('0', i, 1, nsam, fsList) >0 )) ? 
1:0); + if(freq == 0 && fdFlag == 0){ printf("%d\t%d\t%d\n",i,0,nsam); } + if(freq == 0 && fdFlag == 1){ printf("%d\t%d\t%d\n",i,1,nsam); } + if(freq == 1 && fdFlag == 0){ printf("%d\t%d\t%d\n",i,2,nsam); } + if(freq == 1 && fdFlag == 1){ printf("%d\t%d\t%d\n",i,3,nsam); } + + } + + for(j=0;j +#include +#include +#include +#include "msMask.h" + +#define LINEBUF 1000000 + +int maxsites = 100000 ; +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, j, howmany,n ; + char **list, **cmatrix(), line[LINEBUF+1] ; + char *curLine; + FILE *fopen(), *pfin, *maskFile ; + double *posit ; + int segsites, count , nadv, probflag ; + char dum[20]; + int segsub( int nsam, int segsites, char **list ) ; + mask *aMask; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + //print line + printf("%s",line); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + + //now have the info to deal with the maskfiles + aMask = mask_new(nsam); + + //open mask file + maskFile = fopen(argv[1], "r"); + if (maskFile == NULL){ + fprintf(stderr,"Error opening maskfile! 
ARRRRR!!!!\n"); + exit(1); + } + + + + fgets( line, LINEBUF, pfin); + //print line + printf("%s",line); + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + probflag = 0 ; + + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + printf("%s",line); + }while ( line[0] != '/' ); + fgets( line, LINEBUF, pfin); + printf("%s",line); + sscanf(line," segsites: %d", &segsites ); + // fgets( line, LINEBUF, pfin); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + //read the next mask + readNextMask(maskFile,aMask, nsam); + if( segsites > 0) { + //print positions + fgets( line, LINEBUF, pfin); + printf("%s",line); + //need this stupid dummy ptr to advance buffer + curLine = line; + sscanf(curLine, "%*s %lf%n",posit,&n); + curLine += n; + for( i=1; ibitmap[j][(int)(posit[i]*LENGTH)] == 0){ + list[j][i] = 'N'; + } + } + } + //print adjusted output + for( i=0; ibitmap = tmpBits; + return(tmp); +} + +//reads the mask file and stores them all in the ptr masks +int readMaskFiles(char *fileName,mask **masks, int nsam){ + int i,j,count; + FILE *infile; + char line[LINEBUF]; + double start,end; + + //open file + infile = fopen(fileName, "r"); + if (infile == NULL){ + fprintf(stderr,"Error opening maskfile! 
ARRRRR!!!!\n"); + exit(1); + } + //init first mask + count = 0; + masks[count] = mask_new(nsam); + //start going through the file + while(fgets( line, LINEBUF,infile)){ + if (line[0] == '/'){ + count++; + masks[count] = mask_new(nsam); + } + else if(sscanf(line,"%d %lf %lf",&i,&start,&end) == 3){ + //flip some bits + for(j=start*LENGTH;j<=end*LENGTH;j++) + masks[count]->bitmap[i][j] = 0; + } + + + } + return(1); +} + +//reads the next mask, stores it in ptr +int readNextMask(FILE *handle,mask *aMask, int nsam){ + int i,j; + char line[LINEBUF]; + double start,end; + + //init mask + for(i = 0; i < nsam; i++){ + for(j=0; j < LENGTH; j++){ + aMask->bitmap[i][j] = 1; + } + } + + //start going through the file + fgets( line, LINEBUF,handle); + while(line[0] != '/'){ + if(sscanf(line,"%d %lf %lf",&i,&start,&end) == 3){ + //flip some bits + for(j=start*LENGTH;j<=end*LENGTH;j++) + aMask->bitmap[i][j] = 0; + } + fgets( line, LINEBUF,handle); + } + return(1); +} + diff --git a/msMask.h b/msMask.h new file mode 100644 index 0000000..a47c20f --- /dev/null +++ b/msMask.h @@ -0,0 +1,25 @@ +/* msMask.h -- masking for ms ascertainment stuff*/ + +#ifndef MASK_INC +#define MASK_INC + +#define LENGTH 1000000 + +#include "stdio.h" + +// stringWrap Object Definition +typedef struct{ + int **bitmap; // where the masking is stored + int sampleSize; // number of alleles represented +} mask; + + + +int biggerlist(int nsam, unsigned nmax, char **list ); +char **cmatrix(int nsam, int len); +mask *mask_new(int nsam); +int readMaskFiles(char *fileName, mask **masks, int nsam); +int readNextMask(FILE *handle,mask *masks, int nsam); + + +#endif diff --git a/msMaskAllRows.c b/msMaskAllRows.c new file mode 100644 index 0000000..4ef0188 --- /dev/null +++ b/msMaskAllRows.c @@ -0,0 +1,209 @@ +/******* ms2SFS.c ******** +just converts ms output to sfs style input +********************************/ + +#include +#include +#include +#include +#include "msMask.h" + +#define LINEBUF 1000000 + +int maxsites 
= 100000 ; +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, j, howmany,n ; + char **list, **cmatrix(), line[LINEBUF+1] ; + char *curLine; + FILE *fopen(), *pfin, *maskFile ; + double *posit ; + int segsites, count , nadv, probflag ; + char dum[20]; + int segsub( int nsam, int segsites, char **list ) ; + mask *aMask; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + //print line + printf("%s",line); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + //now have the info to deal with the maskfiles + aMask = mask_new(nsam); + + //open mask file + maskFile = fopen(argv[1], "r"); + if (maskFile == NULL){ + fprintf(stderr,"Error opening maskfile! ARRRRR!!!!\n"); + exit(1); + } + + + + fgets( line, LINEBUF, pfin); + //print line + printf("%s",line); + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + probflag = 0 ; + + while( howmany-count++ ) { +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + printf("%s",line); + }while ( line[0] != '/' ); + fgets( line, LINEBUF, pfin); + printf("%s",line); + sscanf(line," segsites: %d", &segsites ); + // fgets( line, LINEBUF, pfin); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + //read the next mask + readNextMask(maskFile,aMask, 1); + if( segsites > 0) { + //print positions + fgets( line, LINEBUF, pfin); + printf("%s",line); + //need this stupid dummy ptr to advance buffer + curLine = line; + sscanf(curLine, "%*s %lf%n",posit,&n); + curLine += n; + for( i=1; ibitmap[0][(int)(posit[j]*LENGTH)] == 1){ + putchar(list[i][j]); + } + else { + putchar('N'); + } + } + printf("\n"); + } + } + + } + fclose(maskFile); + return(1); +} + + +/* allocates space for gametes (character strings) */ +char **cmatrix(int nsam,int len){ + 
int i; + char **m; + + if( ! ( m = (char **) malloc( (unsigned)( nsam*sizeof( char* )) ) ) ) + perror("alloc error in cmatrix") ; + for( i=0; ibitmap = tmpBits; + return(tmp); +} + +//reads the mask file and stores them all in the ptr masks +int readMaskFiles(char *fileName,mask **masks, int nsam){ + int i,j,count; + FILE *infile; + char line[LINEBUF]; + double start,end; + + //open file + infile = fopen(fileName, "r"); + if (infile == NULL){ + fprintf(stderr,"Error opening maskfile! ARRRRR!!!!\n"); + exit(1); + } + //init first mask + count = 0; + masks[count] = mask_new(1); + //start going through the file + while(fgets( line, LINEBUF,infile)){ + if (line[0] == '/'){ + count++; + masks[count] = mask_new(1); + } + else if(sscanf(line,"%d %lf %lf",&i,&start,&end) == 3){ + //flip some bits + for(j=start*LENGTH;j<=end*LENGTH;j++) + masks[count]->bitmap[0][j] = 0; + } + + + } + return(1); +} + +//reads the next mask, stores it in ptr +int readNextMask(FILE *handle,mask *aMask, int nsam){ + int i,j; + char line[LINEBUF]; + double start,end; + + //init mask + for(j=0; j < LENGTH; j++){ + aMask->bitmap[0][j] = 1; + } + + //start going through the file + fgets( line, LINEBUF,handle); + while(line[0] != '/'){ + if(sscanf(line,"%d %lf %lf",&i,&start,&end) == 3){ + //flip some bits + for(j=start*LENGTH;j<=end*LENGTH;j++) + aMask->bitmap[0][j] = 0; + } + fgets( line, LINEBUF,handle); + } + return(1); +} + diff --git a/msParams.c b/msParams.c new file mode 100644 index 0000000..06784cf --- /dev/null +++ b/msParams.c @@ -0,0 +1,71 @@ +/*msParams.c ************** +/ makes a params file for simple +/model for ms input -- this is one popn case +***************************/ + +#include +#include +#include +#include +#include +#include +#include "../coalLib/ranlib.h" +#include "../pgLib/bedFile.h" + +void usage(); +unsigned int devrand(void); + +int main(int argc, char *argv[]){ + int i,n; + long int length, seed1, seed2; + struct bedEl data[50000]; + double t1, t2, n1, n2, theta, 
rho, rhoMax, thetaMax; + + if(argc < 2){ + usage(); + exit(1); + } + seed1 = (long) abs(devrand() % 2147483399); + seed2 = (long) abs(devrand() % 2147483399); + setall(seed1, seed2); + + n = bedFileImport3(argv[1],data); + t1 = genunf(0.0,0.2); + t2 = genunf(t1,0.5); + n1 = genunf(0.0,0.2); + n2 = genunf(n1,1.0); + for(i = 0; i < n; i++){ + length = (data[i].chromEnd) - data[i].chromStart; + thetaMax = length * 0.02 * 10.0; + theta = genunf(0.001,thetaMax); + rhoMax = thetaMax * 5.0; + rho = genunf(0,rhoMax); + //adjust rho + if (rho > 500){ + rho = 500; + } + printf("%f\t%f\t%ld\t%f\t%f\t%f\t%f\n",theta,rho, length, t1,n1,t2,n2); + } + + + return(0); +} + + +void usage(){ + printf("msParams bedFile\n"); +} + +/* used for getting random number seeds */ +unsigned int devrand(void) { + int fn; + unsigned int r; + + fn = open("/dev/urandom", O_RDONLY); + if (fn == -1) + exit(-1); /* Failed! */ + if (read(fn, &r, 4) != 4) + exit(-1); /* Failed! */ + close(fn); + return r; +} diff --git a/msParamsSubpop.c b/msParamsSubpop.c new file mode 100644 index 0000000..f00563a --- /dev/null +++ b/msParamsSubpop.c @@ -0,0 +1,90 @@ +/*msParamsSubpop.c ************** +/ makes a params file for 2 +/ popn scenario +***************************/ + +#include +#include +#include +#include +#include +#include +#include "../coalLib/ranlib.h" +#include "../pgLib/bedFile.h" + +void usage(); +unsigned int devrand(void); + +int main(int argc, char *argv[]){ + int i,n; + long int length, seed1, seed2; + struct bedEl data[50000]; + double t1, t2, n1, n2, theta, rho, rhops, thetaps,nAfr; + double pAdmix, tAdmix,xaRatio; + + + if(argc < 2){ + usage(); + exit(1); + } + seed1 = (long) abs(devrand() % 2147483399); + seed2 = (long) abs(devrand() % 2147483399); + setall(seed1, seed2); + n = bedFileImport3(argv[1],data); + if(argc < 3){ + t1 = genunf(0.0,0.2); + t2 = genunf(t1,0.8); + n1 = genunf(0.0,0.2); + // n2 = genunf(n1,1.0); + tAdmix = genunf(0,t2); + pAdmix = genunf(0,0.75); +// nAfr = 
genunf(0,1.5); + thetaps = genunf(0.00000001,0.03); + rhops = genunf(0,0.3); + } + else{ + xaRatio = atof(argv[7]); + tAdmix = atof(argv[2]) * xaRatio; + pAdmix = atof(argv[3]); + t1 = atof(argv[4]) * xaRatio; + n1 = atof(argv[5]); + t2 = atof(argv[6]) * xaRatio; + thetaps = genunf(0.00000001,0.03); + rhops = genunf(0,0.3); + } + for(i = 0; i < n; i++){ + length = (data[i].chromEnd) - data[i].chromStart; + theta = length *thetaps; + rho = length*rhops; + //adjust rho + if (rho > 600){ + rho = 600; + } + + //want theta, rho, length, trec, tbn, tAdmix, pAdmix, + printf("%lf\t%lf\t%ld\t%0.12lf\t%0.12lf\t%0.12lf\t%0.12lf\t%0.12lf\t%0.12lf\t%0.12lf\n",theta,rho, length, tAdmix, pAdmix, t1,n1,t2, \ + thetaps, rhops); + } + + + return(0); +} + + +void usage(){ + printf("msParamsSubpop bedFile \n"); +} + +/* used for getting random number seeds */ +unsigned int devrand(void) { + int fn; + unsigned int r; + + fn = open("/dev/urandom", O_RDONLY); + if (fn == -1) + exit(-1); /* Failed! */ + if (read(fn, &r, 4) != 4) + exit(-1); /* Failed! 
*/ + close(fn); + return r; +} diff --git a/msParamsSubpopNoAd.c b/msParamsSubpopNoAd.c new file mode 100644 index 0000000..b12a499 --- /dev/null +++ b/msParamsSubpopNoAd.c @@ -0,0 +1,83 @@ +/*msParamsSubpop.c ************** +/ makes a params file for 2 +/ popn scenario +***************************/ + +#include +#include +#include +#include +#include +#include +#include "../coalLib/ranlib.h" +#include "../pgLib/bedFile.h" + +void usage(); +unsigned int devrand(void); + +int main(int argc, char *argv[]){ + int i,n; + long int length, seed1, seed2; + struct bedEl data[50000]; + double t1, t2, n1, n2, theta, rho, rhops, thetaps,nAfr; + double pAdmix, tAdmix,xaRatio; + + + if(argc < 2){ + usage(); + exit(1); + } + seed1 = (long) abs(devrand() % 2147483399); + seed2 = (long) abs(devrand() % 2147483399); + setall(seed1, seed2); + n = bedFileImport3(argv[1],data); + if(argc < 3){ + t1 = genunf(0.0,0.2); + t2 = genunf(t1,0.8); + n1 = genunf(0.0,0.2); + thetaps = genunf(0.00000001,0.03); + rhops = genunf(0,thetaps*10); + } + else{ + xaRatio = atof(argv[5]); + t1 = atof(argv[2]) * xaRatio; + n1 = atof(argv[3]); + t2 = atof(argv[4]) * xaRatio; + thetaps = genunf(0.00000001,0.03); + rhops = genunf(0,0.3); + } + for(i = 0; i < n; i++){ + length = (data[i].chromEnd) - data[i].chromStart; + theta = length *thetaps; + rho = length*rhops; + //adjust rho + if (rho > 600){ + rho = 600; + } + + //want theta, rho, length, trec, tbn, tAdmix, pAdmix, + printf("%lf\t%lf\t%ld\t%0.12lf\t%0.12lf\t%0.12lf\t%0.12lf\t%0.12lf\n",theta,rho, length, t1,n1,t2, \ + thetaps, rhops); + } + + + return(0); +} + +void usage(){ + printf("msParamsSubpopNoAd bedFile \n"); +} + +/* used for getting random number seeds */ +unsigned int devrand(void) { + int fn; + unsigned int r; + + fn = open("/dev/urandom", O_RDONLY); + if (fn == -1) + exit(-1); /* Failed! */ + if (read(fn, &r, 4) != 4) + exit(-1); /* Failed! 
*/ + close(fn); + return r; +} diff --git a/msParamsSubpopTrans.c b/msParamsSubpopTrans.c new file mode 100644 index 0000000..16ec599 --- /dev/null +++ b/msParamsSubpopTrans.c @@ -0,0 +1,76 @@ +/*msParamsSubpop.c ************** +/ makes a params file for 2 +/ popn scenario +***************************/ + +#include +#include +#include +#include +#include +#include +#include "../coalLib/ranlib.h" +#include "../pgLib/bedFile.h" + +void usage(); +unsigned int devrand(void); + +int main(int argc, char *argv[]){ + int i,n; + long int length, seed1, seed2; + struct bedEl data[50000]; + double t1, t2, n1, n2, theta, rho, rhops, thetaps,nAfr; + double pAdmix, tAdmix,xaRatio; + + + if(argc < 2){ + usage(); + exit(1); + } + seed1 = (long) abs(devrand() % 2147483399); + seed2 = (long) abs(devrand() % 2147483399); + setall(seed1, seed2); + n = bedFileImport3(argv[1],data); + + xaRatio = 1.0; + t1 = atof(argv[2]) * xaRatio; + n1 = atof(argv[3]); + t2 = atof(argv[4]) * xaRatio; + thetaps = atof(argv[5]); + rhops = atof(argv[6]); + + for(i = 0; i < n; i++){ + length = (data[i].chromEnd) - data[i].chromStart; + theta = length *thetaps; + rho = length*rhops; + //adjust rho + if (rho > 600){ + rho = 600; + } + + //want theta, rho, length, trec, tbn, tAdmix, pAdmix, + printf("%lf\t%lf\t%ld\t%0.12lf\t%0.12lf\t%0.12lf\t%0.12lf\t%0.12lf\n",theta,rho, length, t1,n1,t2, \ + thetaps, rhops); + } + + + return(0); +} + +void usage(){ + printf("msParamsSubpopTrans bedFile <5 args>\n"); +} + +/* used for getting random number seeds */ +unsigned int devrand(void) { + int fn; + unsigned int r; + + fn = open("/dev/urandom", O_RDONLY); + if (fn == -1) + exit(-1); /* Failed! */ + if (read(fn, &r, 4) != 4) + exit(-1); /* Failed! 
*/ + close(fn); + return r; +} diff --git a/msParamsTest.c b/msParamsTest.c new file mode 100644 index 0000000..aa2b319 --- /dev/null +++ b/msParamsTest.c @@ -0,0 +1,78 @@ +/*msParamsSubpop.c ************** +/ makes a params file for 2 +/ popn scenario +***************************/ + +#include +#include +#include +#include +#include +#include +#include "../coalLib/ranlib.h" +#include "../pgLib/bedFile.h" + +void usage(); +unsigned int devrand(void); + +int main(int argc, char *argv[]){ + int i,n; + long int length, seed1, seed2; + struct bedEl data[50000]; + double t1, t2, n1, n2, theta, rho, rhoMax, thetaMax,nAfr; + double pAdmix, tAdmix; + + + if(argc < 2){ + usage(); + exit(1); + } + seed1 = (long) abs(devrand() % 2147483399); + seed2 = (long) abs(devrand() % 2147483399); + setall(seed1, seed2); + + n = bedFileImport3(argv[1],data); + t1 = genunf(0.0,0.2); + t2 = genunf(0.0,0.8); + n1 = genunf(0.0,0.2); + n2 = genunf(n1,1.0); + tAdmix = genunf(0,t2); + pAdmix = genunf(0,0.75); + nAfr = genunf(0.8,3.0); + for(i = 0; i < n; i++){ + length = (data[i].chromEnd) - data[i].chromStart; + thetaMax = length * 0.02 * 10.0; + theta = genunf(0.001,thetaMax); + rhoMax = thetaMax * 5.0; + rho = genunf(0,rhoMax); + //adjust rho + if (rho > 600){ + rho = 600; + } + + //want theta, rho, length, trec, tbn, tAdmix, pAdmix, + printf("%lf\t%lf\t%ld\t%0.12lf\t%0.12lf\n",theta,rho, length, nAfr,t2); + } + + + return(0); +} + + +void usage(){ + printf("msParams bedFile\n"); +} + +/* used for getting random number seeds */ +unsigned int devrand(void) { + int fn; + unsigned int r; + + fn = open("/dev/urandom", O_RDONLY); + if (fn == -1) + exit(-1); /* Failed! */ + if (read(fn, &r, 4) != 4) + exit(-1); /* Failed! 
*/ + close(fn); + return r; +} diff --git a/msSample.py b/msSample.py new file mode 100644 index 0000000..b459648 --- /dev/null +++ b/msSample.py @@ -0,0 +1,68 @@ +import sys, gzip, random +import numpy as np + +class msSample: + def __init__(self, segsites, pos,haplos): + self.segsites = segsites + self.positions = pos + self.haplotypes = haplos + self.n = len(self.haplotypes) + + def convertHaplosFiniteSites(self,seqlength): + newH = np.zeros((self.n,seqlength)) + if self.segsites > 0: + newPos = [np.floor(x*seqlength) for x in self.positions] + for i in list(range(0,self.n)): + for j in list(range(0,self.segsites)): + newH[i,int(newPos[j])]=self.haplotypes[i][j] + + self.haplotypes=newH + + +class msFile: + def __init__(self): + self.repNumber=0 + self.samples = [] + + def readFile(aFileName): + sampleList=[] + msStream = open(aFileName) + header = msStream.readline().strip().split() + program,numSamples,numSims = header[:3] + if len(header) > 3: + otherParams = " " + " ".join(header[3:]) + else: + otherParams = "" + + numSamples, numSims = int(numSamples),int(numSims) + #advance to first simulation + line = msStream.readline() + while line.strip() != "//": + line = msStream.readline() + repLs = [] + while line: + if line.strip() != "//": + sys.exit("Malformed ms-style output file: read '%s' instead of '//'. AAAARRRRGGHHH!!!!!\n" %(line.strip())) + s = int(msStream.readline().strip().split()[-1]) #segsites line + h=[] + positionsLine=[] + if s > 0: + positionsLine = msStream.readline().strip().split() + if not positionsLine[0] == ("positions:"): + sys.exit("Malformed ms-style output file. 
AAAARRRRGGHHH!!!!!\n") + positionsLine.pop(0) + positionsLine = [float(x) for x in positionsLine] + + + for i in range(numSamples): + currLine = msStream.readline() + h.append([int(x) for x in list(currLine.strip())]) + line = msStream.readline() + #advance to the next non-empty line or EOF + while line and line.strip() == "": + line = msStream.readline() + sampleList.append(msSample(s,positionsLine,h)) + + msStream.close() + return sampleList + diff --git a/msSlidingWindow.rb b/msSlidingWindow.rb new file mode 100644 index 0000000..a2f4cdd --- /dev/null +++ b/msSlidingWindow.rb @@ -0,0 +1,30 @@ +#!/usr/bin/env ruby +# +# + +load "~/rubyStuff/Hudson.rb" + + +#bring in file +m = MSRun.new(ARGV[0]) +m.samples.each{|aSample| + w = 0.0 + i = 0 + while(w<1.0) + count = Array.new(aSample.seqMat.sampleSize - 1,0) + + pos = aSample.positions.select{|x| x.to_f > w && x.to_f < w+0.1} + pos.each{ | aPos | + ind= aSample.positions.index(aPos) + 1.upto(aSample.seqMat.sampleSize-1){ | j | +# print aSample.seqMat.matrix[1][1,1] + if (aSample.seqMat.matrix[0][ind,1] != aSample.seqMat.matrix[j][ind,1]) + count[j-1] = count[j-1].to_i + 1 + end + } + } + w+=0.05 + count.each{ |anAllele| print anAllele,"\t" } + print "\n" + end +} \ No newline at end of file diff --git a/msTools.py b/msTools.py new file mode 100644 index 0000000..6bf4ca8 --- /dev/null +++ b/msTools.py @@ -0,0 +1,174 @@ +import sys, gzip, bisect + +def sortedFlankingPositionsByDistToTargSite(targetPos, flankingPositionsToExamine, desiredNumPositions, physLen): + i=1 + sortedFlankingPositions = [] + + while len(sortedFlankingPositions) < desiredNumPositions: + lPos = targetPos-i + rPos = targetPos+i + if lPos >= 0 and lPos in flankingPositionsToExamine: + sortedFlankingPositions.append(lPos) + if rPos < physLen and rPos in flankingPositionsToExamine and len(sortedFlankingPositions) < desiredNumPositions: + sortedFlankingPositions.append(rPos) + i += 1 + + return sortedFlankingPositions + +def 
getNearestEmptyPositions(donorPos, snpCountAtPos, physLen): + numColliders = snpCountAtPos[donorPos]-1 + + freeSlots = [] + lPos = donorPos - 1 + rPos = donorPos + 1 + while len(freeSlots) < numColliders: + if lPos >= 0: + if snpCountAtPos[lPos] == 0: + freeSlots.append(lPos) + lPos -= 1 + if rPos <= physLen-1: + if snpCountAtPos[rPos] == 0: + freeSlots.append(rPos) + rPos += 1 + + return freeSlots + +def resolveCollision(donorPos, snpCountAtPos, physLen): + for recipientPos in getNearestEmptyPositions(donorPos, snpCountAtPos, physLen): + snpCountAtPos[recipientPos] += 1 + assert snpCountAtPos[recipientPos] == 1 + snpCountAtPos[donorPos] -= 1 + +def msPositionsToIntegerPositions(positions, physLen): + assert physLen >= len(positions) + + snpCountAtPos = {} + for i in range(physLen): + snpCountAtPos[i] = 0 + for position in positions: + intPos = int(physLen*position) + if intPos == physLen: + intPos = physLen-1 + snpCountAtPos[intPos] += 1 + + collisions = {} + for pos in snpCountAtPos: + if snpCountAtPos[pos] > 1: + collisions[pos] = 1 + + midPos = physLen/2 + collisionPositions = [] + midHasCollision=0 + if midPos in collisions: + collisionPositions.append(midPos) + midHasCollision=1 + collisionPositions += sortedFlankingPositionsByDistToTargSite(midPos, collisions, len(collisions)-midHasCollision, physLen) + for pos in collisionPositions: + resolveCollision(pos, snpCountAtPos, physLen) + + assert max(snpCountAtPos.values()) == 1 + newPositions = [x for x in sorted(snpCountAtPos) if snpCountAtPos[x] > 0] + assert newPositions[0] >= 0 and newPositions[-1] < physLen + + return newPositions + +def msRepToHaplotypeArrayIn(samples, positions, totalPhysLen, positionsFirst=True): + for i in range(len(samples)): + assert len(samples[i]) == len(positions) + + positions = msPositionsToIntegerPositions(positions, totalPhysLen) + + hapArrayIn = [] + if positionsFirst: + for j in range(len(positions)): + hapArrayIn.append([]) + for i in range(len(samples)): + 
hapArrayIn[j].append(samples[i][j]) + else: + for i in range(len(samples)): + hapArrayIn.append([]) + for j in range(len(positions)): + hapArrayIn[i].append(samples[i][j]) + + return hapArrayIn, positions + +def msOutToHaplotypeArrayIn(msOutputFileName, totalPhysLen, positionsFirst=True): + if msOutputFileName == "stdin": + isFile = False + msStream = sys.stdin + else: + isFile = True + if msOutputFileName.endswith(".gz"): + msStream = gzip.open(msOutputFileName) + else: + msStream = open(msOutputFileName) + + header = msStream.readline() + program,numSamples,numSims = header.strip().split()[:3] + numSamples,numSims = int(numSamples),int(numSims) + + hapArraysIn = [] + positionArrays = [] + #advance to first simulation + line = msStream.readline() + while not line.strip().startswith("//"): + line = msStream.readline() + while line: + if not line.strip().startswith("//"): + sys.exit("Malformed ms-style output file: read '%s' instead of '//'. AAAARRRRGGHHH!!!!!\n" %(line.strip())) + segsitesBlah,segsites = msStream.readline().strip().split() + segsites = int(segsites) + if segsitesBlah != "segsites:": + sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n") + + positionsLine = msStream.readline().strip().split() + if not positionsLine[0] == "positions:": + sys.exit("Malformed ms-style output file. 
AAAARRRRGGHHH!!!!!\n") + positions = [float(x) for x in positionsLine[1:]] + + samples = [] + for i in range(numSamples): + sampleLine = msStream.readline().strip() + if len(sampleLine) != segsites: + sys.exit("Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples AAAARRRRGGHHH!!!!!\n" %(segsites,len(sampleLine),sampleLine,i,numSamples)) + samples.append(sampleLine) + if len(samples) != numSamples: + raise Exception + hapArrayIn, positions = msRepToHaplotypeArrayIn(samples, positions, totalPhysLen, positionsFirst=positionsFirst) + hapArraysIn.append(hapArrayIn) + positionArrays.append(positions) + line = msStream.readline() + #advance to the next non-empty line or EOF + while line and line.strip() == "": + line = msStream.readline() + #sys.stderr.write("finished rep %d\n" %(len(hapArraysIn))) + if len(hapArraysIn) != numSims: + sys.exit("Malformed ms-style output file: %s of %s sims processed. AAAARRRRGGHHH!!!!!\n" %(len(hapArraysIn), numSims)) + + if isFile: + msStream.close() + return hapArraysIn, positionArrays + +def msOutToHaplotypeMatrices(msOutputFileName, totalPhysLen): + return msOutToHaplotypeArrayIn(msOutputFileName, totalPhysLen, positionsFirst=False) + +def windowHaps(hapArraySamplesFirst, positionArray, winStart, winEnd): + indicesToKeep = [i for i in range(len(positionArray)) if positionArray[i] >= winStart and positionArray[i] <= winEnd] + windowHapArray = [] + for sample in hapArraySamplesFirst: + windowHapArray.append([]) + for i in indicesToKeep: + windowHapArray[-1].append(sample[i]) + windowPositions = [positionArray[i] for i in indicesToKeep] + return windowHapArray, windowPositions + +def msWinStr(hapArraysSamplesFirst, positionArrays, winStart, winEnd): + outStr = "./windowedMSOutput %s %s\nblah\n" %(len(hapArraysSamplesFirst[0]), len(hapArraysSamplesFirst)) + for i in range(len(hapArraysSamplesFirst)): + currHapArray, currPositions = windowHaps(hapArraysSamplesFirst[i], positionArrays[i], winStart, 
winEnd) + currPositions = [(pos-winStart)/(winEnd-winStart+1.0) for pos in currPositions] + outStr += "\n//\nsegsites: %s\n" %(len(currPositions)) + outStr += "positions: " + " ".join([str(x) for x in currPositions]) + "\n" + for sample in currHapArray: + outStr += "".join(sample) + "\n" + return outStr diff --git a/niceStats.c b/niceStats.c new file mode 100644 index 0000000..7f970a1 --- /dev/null +++ b/niceStats.c @@ -0,0 +1,173 @@ +/******* niceStats.c ******** +for calculating sample stats from MS output +********************************/ + +#include +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); + + + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ,nwins, siteIdx; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count , nadv, iss, h, exponent1, exponent2; + double pi , th, z, H, mfda, tajD,w, wins[50],max, min, temp_site, thetaA, thetaHPi, tajDX, achazD, h1, h2, h12, thetaW; + double winsH1[50], winsH2[50], winsH12[50]; + char dum[50], astr[100] ; + double *harmonicSums; + double ehh, rehh; + int *haplotype_counts; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + //print header + nwins=5; + //printf("pi\tss\tthetaH\ttajD\tfayWuH\tHapCount"); + //output used for soft shoulder analysis: + //printf("pi\tss\tthetaH\ttajD\tfayWuH\tHapCount\tH1\tomegaCenter\tZnS"); + //output used for spatial svm: + printf("pi\tss\tthetaH\ttajD\tfayWuH\tmaxFDA\tHapCount\tH1\tH12\tH2/H1\tOmega\tZnS"); + /*for (exponent1 = -20; exponent1 <= 20; exponent1++){ + for (exponent2 = -20; exponent2 < exponent1; 
exponent2++) { + printf("\tachazsD_%d_%d", exponent1,exponent2); + } + }*/ + /*for (exponent1 = -50; exponent1 <= 50; exponent1++){ + printf("\tthetaA_%d", exponent1); + }*/ + //for(i=0;i= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i= 0.5 && fabs(0.5 - temp_site) <= min){ + siteIdx = i; + min = fabs(0.5-temp_site); + } + } + w = omegaCenter(siteIdx, segsites, nsam, list); + */ + w = omegaMax(segsites, nsam,list); + + //printf("%lf\t%d\t%lf\t%lf\t%lf\t%d\t%f\t%f", pi, iss, th ,tajD,H, h, z, w); + //output used for soft shoulders manuscript: + //printf("%lf\t%d\t%lf\t%lf\t%lf\t%d\t%f\t%f\t%f", pi, iss, th ,tajD,H, h, h1, w, z); + //output used for spatial SVM + printf("%lf\t%d\t%lf\t%lf\t%lf\t%lf\t%d\t%d\t%lf\t%lf\t%lf\t%lf\t%lf", pi, iss, th, tajD, H, mfda, h, h1, h12, h2/h1, w, z); + //printf("%lf\t%d\t%lf\t%lf\t%lf\t%d", pi, iss, th ,tajD,H, h); + + /*for (exponent1 = -50; exponent1 <= 50; exponent1++){ + thetaA = achazThetaExponentWeights(nsam,segsites,list,exponent1); + printf("\t%f", thetaA); + }*/ + + /*harmonicSums = compute_HarmonicSums(nsam); + for (exponent1 = -20; exponent1 <= 20; exponent1++){ + for (exponent2 = -20; exponent2 < exponent1; exponent2++) { + achazD = achazNeutTestExponentWeights(nsam,segsites,list,exponent1,exponent2,harmonicSums); + printf("\t%f", achazD); + } + }*/ + + + //window stats + //tajdWindow( nwins, posit, wins, nsam, segsites,list); + //petrovHStatsWindow(segsites, nwins, posit, winsH12, winsH1, winsH2, nsam, list); + /* + //get max for normalizing windows + max=0.0; + for( i=0; i max){ + max = wins[i]; + } + } + //print windows + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); + + + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ,nwins, 
siteIdx; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count , nadv, iss, h, exponent1, exponent2, fourGams; + double pi , th, z, H, mfda, tajD,w, wins[50],max, min, temp_site, thetaA, thetaHPi, tajDX, achazD, h1, h2, h12, thetaW; + double winsH1[50], winsH2[50], winsH12[50]; + char dum[50], astr[100] ; + double *harmonicSums; + double ehh, rehh; + int *haplotype_counts; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + //print header + nwins=5; + //printf("pi\tss\tthetaH\ttajD\tfayWuH\tHapCount"); + //output used for soft shoulder analysis: + //printf("pi\tss\tthetaH\ttajD\tfayWuH\tHapCount\tH1\tomegaCenter\tZnS"); + //output used for spatial svm: + printf("pi\tss\tthetaH\ttajD\tfayWuH\tmaxFDA\tHapCount\tH1\tH12\tH2/H1\tOmega\tZnS\tfourGamete"); + /*for (exponent1 = -20; exponent1 <= 20; exponent1++){ + for (exponent2 = -20; exponent2 < exponent1; exponent2++) { + printf("\tachazsD_%d_%d", exponent1,exponent2); + } + }*/ + /*for (exponent1 = -50; exponent1 <= 50; exponent1++){ + printf("\tthetaA_%d", exponent1); + }*/ + //for(i=0;i= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); + + + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ,nwins, siteIdx; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count , nadv, iss, h, exponent1, 
exponent2; + double pi , th, z, H, tajD,w, wins[50],max, min, temp_site, thetaA, thetaHPi, tajDX, achazD, h1, h2, h12, thetaW; + double winsH1[50], winsH2[50], winsH12[50]; + char dum[50], astr[100] ; + double *harmonicSums; + double ehh, rehh; + int *haplotype_counts; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + //print header + nwins=5; + + + printf("achazsD_1"); + for (exponent1 = 2; exponent1 < nsam; exponent1++){ + printf("\tachazsD_%d", exponent1); + } + + printf("\n"); + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); + + + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ,nwins, siteIdx; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count , nSites, iss,h ; + double pi , th, z, H, tajD,w, wins[50],max, min, temp_site; + char dum[50], astr[100] ; + int *haplotype_counts; + int *derived_counts; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + nSites = atoi( argv[1] ) ; + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( 
double ) ) ; + + count=0; + //print header + nwins=9; + //printf("pi\tss\tthetaH\ttajD\tfayWuH\tHapCount\tZnS\tomega"); + //for(i=0;i= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i max){ + max = wins[i]; + } + }*/ + //for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + +void usage(); + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ,nwins, siteIdx; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count , nadv, iss,h ; + double pi , th, z, H, tajD,w, wins[50],max, min, temp_site; + char dum[50], astr[100] ; + + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + //print header + nwins=9; + printf("pi\tss\tthetaH\ttajD\tfayWuH\tHapCount\tZnS"); + /*printf("pi\tss\tthetaH\ttajD\tfayWuH\tHapCount");*/ + for(i=0;i= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i max){ + max = wins[i]; + } + } + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); + + + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ,nwins, siteIdx; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count, nSites, iss, h ; + double pi , th, z, H, tajD,w, 
wins[50],max, min, temp_site; + char dum[50], astr[100] ; + int *haplotype_counts; + int *derived_counts; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + nSites = atoi( argv[1] ) ; + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + //print header + nwins=9; + //printf("pi\tss\tthetaH\ttajD\tfayWuH\tHapCount\tZnS\tomega"); + //for(i=0;i= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i max){ + max = wins[i]; + } + }*/ + //for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + +void usage(); + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ,nwins, siteIdx; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count , nadv, iss,h ; + double pi , th, z, H, tajD,w, wins[50],max, min, temp_site; + char dum[50], astr[100] ; + + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + //print header + nwins=9; + printf("pi\tss\tthetaH\ttajD\tfayWuH\tHapCount\tZnS\tOmegaCenter"); + for(i=0;i= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i= 0.5 && fabs(0.5 - temp_site) <= min){ + siteIdx = i; + min = 
fabs(0.5-temp_site); + } + } + w = omegaCenter(siteIdx, segsites, nsam, list); + + + printf("%lf\t%d\t%lf\t%lf\t%lf\t%d\t%f\t%f", pi, iss, th ,tajD,H, h, z, w); + + //window stats + + nucdivWindow( nwins, posit, wins, nsam, segsites,list); + //print normalized windows + max=0.0; + for( i=0; i max){ + max = wins[i]; + } + } + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 +int maxsites = 100000 ; +void usage(); + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit, *ibsVec, *hetVec; + int segsites, count, n1, iss, h, ibsVecLen, hetVecLen, g1Size, g2Size, private1, private2; + double pi, th, z, H, tajD, sStarVal; + double ibsMean, ibsVar, ibsSkew, ibsKurt, ibsMin, ibsMed, ibsMax; + double hetMean, hetVar, hetSkew, hetKurt, hetMin, hetMed, hetMax; + double z1, z2, ztot, pi1, pi2, f, snn, dxy_min, dxy_mean, gmin, hetVar1, hetVar2; + double dd1, dd2, ddRank1, ddRank2, ibsMaxBetween, ibsMeanWithin1, ibsMeanWithin2; + char dum[20], astr[100] ; + int starFlag=0; + int migOption=0; + int physLen; +// int bins = 10; +// double hist[bins]; + + if( argc > 2 ) { + n1 = atoi( argv[1] ) ; + physLen = atoi(argv[2]); + if(argc==4){ + if(argv[3][1] == 'c') migOption=1; + } + } + else{ + usage(); + } + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + if (n1 <= 0) n1 = nsam; + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + + //print header line + printf("pi\tss\tthetaH\ttajd\tH\tHapCount\tZnS\t"); + printf("hetVar\thetSkew\thetKurt\thetMin\thetMed\thetMax\t"); + printf("ibsMean\tibsVar\tibsSkew\tibsKurt\tibsMin\tibsMed\tibsMax\tS*\t"); + 
printf("Fst\tsnn\tdxy_mean\tdxy_min\tgmin\tzx\tdd1\tdd2\tddRank1\tddRank2\tibsMaxB\tibsMean1\tibsMean2\tprivate1\tprivate2\n"); +// for(i=0;i= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i\n"); + exit(1); +} + diff --git a/pairDist.c b/pairDist.c new file mode 100644 index 0000000..4b17063 --- /dev/null +++ b/pairDist.c @@ -0,0 +1,81 @@ +/******* pairDist.c ******** +calculates the distribution of pairwise differences +between all seqs and a reference seq +********************************/ + +#include +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); + + + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count , nadv ; + char dum[20], astr[100] ; + + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); + + + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, 
howmany ,nwins, siteIdx; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count , nadv, ncomp; + + char dum[50], astr[100] ; + double *dists; + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + nadv = atoi( argv[1] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + ncomp = nsam * (nsam-1) / 2; + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); + + + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, j, k, howmany; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count; + double tmpLen; + + char dum[50], astr[100] ; + double start; + double sites; + int bins = 10; + double binWidth = 1.0 / ((float) bins); + int hist[bins+1]; + float sumComp = 0; + + + if(argc < 1){ + usage(); + } +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + + sites = atof(argv[1]); + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + + + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," 
segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i 0)printf("%d\n",tmpLen); + start = posit[k]; + } + } + //tmpLen =(int) round((1.0-start) * sites); + tmpLen = 1.0-start; + //printf("%f\n",round(tmpLen/binWidth)); + hist[(int)round(tmpLen/binWidth)]+= 1; + sumComp +=1; + //if(tmpLen > 0)printf("%d\n",tmpLen); + } + } + + for (i=0;i 1 + +def processSimulation(samples, positions): + newPositions = [] + newSamples = [""]*len(samples) + if positions: + for i in range(len(samples[0])): + if isSnp(samples, i): + newPositions.append(positions[i]) + for j in range(len(samples)): + newSamples[j] += samples[j][i] + if len(newPositions) > 0: + newPositionsStr = "positions: " + " ".join([str(newPositions[x]) for x in range(len(newPositions))]) + else: + newPositionsStr = "" + newSamplesStr = "\n".join([str(newSamples[x]) for x in range(len(newSamples))]) + print "\n//\nsegsites: %s\n%s\n%s" %(len(newPositions), newPositionsStr, newSamplesStr) + else: + print "\n//\nsegsites: 0\n" + +if msFile == "stdin": + isFile = False + msStream = sys.stdin +else: + isFile = True + if msFile.endswith(".gz"): + msStream = gzip.open(msFile) + else: + msStream = open(msFile) + +header = msStream.readline() +program,numSamples,numSims = header.strip().split()[:3] +numSamples,numSims = int(numSamples),int(numSims) + +processedSims = 0 +#advance to first simulation +line = msStream.readline().strip() +print header.strip() +while not line.strip().startswith("//"): + if line.strip() != "": + print line.strip() + line = msStream.readline() +while line: + if not line.strip().startswith("//"): + sys.exit("Malformed ms-style output file: read '%s' instead of '//'. 
AAAARRRRGGHHH!!!!!\n" %(line.strip())) + segsitesBlah,segsites = msStream.readline().strip().split() + segsites = int(segsites) + if segsitesBlah != "segsites:": + sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n") + + positionsLine = msStream.readline().strip() + if not positionsLine.startswith("positions:"): + if segsites == 0: + processSimulation([], []) + else: + sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n") + else: + positionsLine = positionsLine.split() + positions = [float(x) for x in positionsLine[1:]] + + samples = [] + for i in range(numSamples): + sampleLine = msStream.readline().strip() + if len(sampleLine) != segsites: + sys.exit("Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples AAAARRRRGGHHH!!!!!\n" %(segsites,len(sampleLine),sampleLine,i,numSamples)) + samples.append(sampleLine) + if len(samples) != numSamples: + raise Exception + processSimulation(samples,positions) + processedSims += 1 + line = msStream.readline() + #advance to the next non-empty line or EOF + while line and line.strip() == "": + line = msStream.readline() +if processedSims != numSims: + sys.exit("Malformed ms-style output file: %s of %s sims processed. 
AAAARRRRGGHHH!!!!!\n" %(processedSims,numSims)) + +if isFile: + msStream.close() diff --git a/slideFST.c b/slideFST.c new file mode 100644 index 0000000..fe8a431 --- /dev/null +++ b/slideFST.c @@ -0,0 +1,89 @@ +/******* slideFST.c ******** + +********************************/ + +#include +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 + + + +void usage(); + + + +int maxsites = 100000 ; + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany, siteIdx; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int nwins=200; + int segsites, count ,n1,n2 ; + double wins[nwins],max, min, temp_site; + char dum[20], astr[100] ; + + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + if( argc > 1 ) { + n1 = atoi( argv[1] ) ; + n2 = atoi( argv[2] ) ; + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + //print header + + printf("fstWin0"); + for(i=1;i= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i 10000: + sys.exit("Let's not get carried away with the number of windows . . 
.\n") + winStart = 0.0 + winEnd = 0+delta + winIndex = 0 + snpWinAssignments = [] + for i in range(len(positions)): + while not ((positions[i] > winStart or positions[i] == 0) and positions[i] <= winEnd): + winStart = winEnd + winEnd += delta + winIndex += 1 + #trying to avoid some floating-point precision weirdness here + if abs(1.0-winEnd) < 1e-9: + winEnd = 1.0 + snpWinAssignments.append(winIndex) + return snpWinAssignments + +def getWinRange(fileIndex,numWins,delta): + winStart = fileIndex/float(numWins) + winEnd = winStart + delta + if abs(1.0-winEnd) < 1e-9: + winEnd = 1.0 + return winStart,winEnd + +def getSegSitesForFiles(snpWindowAssignments,positions,numWins,outFileLs): + segsiteCountLs = [] + segsitePositions = [] + for i in range(len(outFileLs)): + segsiteCountLs.append(0) + segsitePositions.append([]) + + for i in range(len(snpWindowAssignments)): + fileIndex = snpWindowAssignments[i] + delta = 1.0/numWins + winStart,winEnd = getWinRange(fileIndex,numWins,delta) + windowedPosition = (positions[i]-winStart)/delta + segsitePositions[fileIndex].append(windowedPosition) + segsiteCountLs[fileIndex] += 1 + + return segsiteCountLs,segsitePositions + +def processSimulation(samples,snpWindowAssignments,positions,numWins,outFileLs): + #first output the header information for the simulation + segsiteCountLs,segsitePositionsLs = getSegSitesForFiles(snpWindowAssignments,positions,numWins,outFileLs) + for i in range(len(outFileLs)): + outFileLs[i] += "\n//\nsegsites: %s\n" %(segsiteCountLs[i]) + outFileLs[i] += "positions: " + " ".join([str(x) for x in segsitePositionsLs[i]]) + "\n" + for sample in samples: + for i in range(len(sample)): + outFileLs[snpWindowAssignments[i]] += sample[i] + for i in range(len(outFileLs)): + outFileLs[i] += "\n" + +if msFile == "stdin": + isFile = False + msStream = sys.stdin +else: + isFile = True + if msFile.endswith(".gz"): + msStream = gzip.open(msFile) + else: + msStream = open(msFile) + +header = msStream.readline() 
+program,numSamples,numSims = header.strip().split()[:3] +numSamples,numSims = int(numSamples),int(numSims) + +#initialize list of output files +outFileLs = [] +outFileNameLs = [] +for i in range(numWins): + outFileNameLs.append("%s_%s.msWin" %(winFilePrefix,i)) + outFileLs.append("./windowedMSOutput %s %s\nblah\n" %(numSamples,numSims)) + +processedSims = 0 +#advance to first simulation +line = msStream.readline() +while not line.startswith("//"): + line = msStream.readline() +while line: + if not line.startswith("//"): + sys.exit("Malformed ms-style output file: read '%s' instead of '//'. AAAARRRRGGHHH!!!!!\n" %(line.strip())) + segsitesBlah,segsites = msStream.readline().strip().split() + segsites = int(segsites) + if segsitesBlah != "segsites:": + sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n") + + positionsLine = msStream.readline().strip().split() + if not positionsLine[0] == "positions:": + sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n") + positions = [float(x) for x in positionsLine[1:]] + snpWindowAssignments = getSnpWindowAssignments(positions,numWins) + + samples = [] + for i in range(numSamples): + sampleLine = msStream.readline().strip() + if len(sampleLine) != segsites: + sys.exit("Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples AAAARRRRGGHHH!!!!!\n" %(segsites,len(sampleLine),sampleLine,i,numSamples)) + samples.append(sampleLine) + if len(samples) != numSamples: + raise Exception + processSimulation(samples,snpWindowAssignments,positions,numWins,outFileLs) + processedSims += 1 + line = msStream.readline() + #advance to the next non-empty line or EOF + while line and line.strip() == "": + line = msStream.readline() +if processedSims != numSims: + sys.exit("Malformed ms-style output file: %s of %s sims processed. 
AAAARRRRGGHHH!!!!!\n" %(processedSims,numSims)) + +for i in range(len(outFileLs)): + outFile = open(outFileNameLs[i], "w") + outFile.write(outFileLs[i]) + outFile.close() + +if isFile: + msStream.close() diff --git a/test.bed b/test.bed new file mode 100644 index 0000000..b8b405d --- /dev/null +++ b/test.bed @@ -0,0 +1,2 @@ +chrX 1 12 +chrX 20 30 diff --git a/testBits.c b/testBits.c new file mode 100644 index 0000000..cba26b6 --- /dev/null +++ b/testBits.c @@ -0,0 +1,14 @@ +#include +#include +#include "bitStuff.h" + +int main(int argc,char *argv[]){ + int bit = 1000; + + printf("%d\n",bit); + bit = setBit(bit,0); + printf("%d\n",bit); + bit = setBit(bit,1); + printf("%d\n",bit); + return(1); +} diff --git a/testMask b/testMask new file mode 100644 index 0000000..8ca6b44 --- /dev/null +++ b/testMask @@ -0,0 +1,6 @@ +1 0.1 0.5 +2 0.3 0.5 + +// +1 0.2 0.24 +3 0.7 0.8 diff --git a/testMask2 b/testMask2 new file mode 100644 index 0000000..a29b9b5 --- /dev/null +++ b/testMask2 @@ -0,0 +1,15 @@ +0 0.038283 0.038284 +2 0.038283 0.038284 +5 0.038283 0.038283 +6 0.038283 0.038284 +11 0.038284 0.038284 +11 0.038284 0.038284 +13 0.038283 0.038283 +15 0.038284 0.038284 +19 0.038283 0.038283 +21 0.038283 0.038284 +22 0.038283 0.038284 +35 0.038283 0.038284 +36 0.0 0.038283 + +// diff --git a/testMasks b/testMasks new file mode 100644 index 0000000..fc98411 --- /dev/null +++ b/testMasks @@ -0,0 +1,40 @@ +0 0.01 0.55 +1 0.2 0.55 + +// +1 0.1 0.2 +3 0.1 0.2 + +// +1 0.1 0.3 +3 0.1 0.2 + +// +1 0.1 0.2 +3 0.1 0.2 + +// +1 0.1 0.2 +3 0.1 0.2 + +// +1 0.1 0.2 +3 0.1 0.2 + +// +1 0.1 0.2 +3 0.1 0.2 + +// +1 0.1 0.2 +3 0.1 0.2 + +// +1 0.001 0.2 +3 0.001 0.2 + +// +1 0.001 0.2 +3 0.001 0.2 + +// diff --git a/twoPopnNiceStats.c b/twoPopnNiceStats.c new file mode 100644 index 0000000..ed7b632 --- /dev/null +++ b/twoPopnNiceStats.c @@ -0,0 +1,118 @@ +/******* maskedStats.c ******** +for calculating sample stats from MS output +after it has been filtered by msMask 
+********************************/ + +#include +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 +int maxsites = 100000 ; +void usage(); + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count , n1, n2, iss,h,nnm1 ; + double pi , th, z, f, snn,dxy, dxy_min, dxy_mean, H, tajD, *dxyVec; + char dum[20], astr[100] ; + + + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + nnm1=0; + if( argc > 1 ) { + n1 = atoi( argv[1] ) ; + n2 = atoi( argv[2] ) ; + nnm1 = n1 * n2; + } + else{ + usage(); + } + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + dxyVec = (double *)malloc( nnm1*sizeof( double ) ) ; + count=0; + //print header +// printf("pi1\tss1\tthetaH1\ttajd1\tH1\tHapCount1\tZnS1\t") ; +// printf("pi2\tss2\tthetaH2\ttajd2\tH2\tHapCount2\tZnS2\t") ; +// printf("Fst\tsnn\tdxy\tdxy_mean\tdxy_min\n"); + + while( howmany-count++ ) { + +/* read in a sample */ + do { + fgets( line, LINEBUF, pfin); + }while ( line[0] != '/' ); + + fscanf(pfin," segsites: %d", &segsites ); + if( segsites >= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i +#include +#include +#include +#include "msGeneralStats.h" + +#define LINEBUF 1000000 +int maxsites = 100000 ; +void usage(); + +int main(argc,argv) + int argc; +char *argv[]; +{ + int nsam, i, howmany ; + char **list, **cmatrix(), line[LINEBUF+1] ; + FILE *fopen(), *pfin ; + double *posit ; + int segsites, count , n1, n2, iss,h, private1, private2 ; + double pi1, pi2 , th, z1,z2,ztot, f, snn, dxy_min, dxy_mean, H, tajD, 
gmin,dd1,dd2,ddRank1,ddRank2,ibsMaxBetween,ibsMaxWithin1,ibsMaxWithin2,hetVar1,hetVar2; + char dum[20], astr[100] ; + int starFlag=0; + int migOption=0; +// int bins = 10; +// double hist[bins]; + + if( argc > 1 ) { + n1 = atoi( argv[1] ) ; + n2 = atoi( argv[2] ) ; + if(argc==4){ + if(argv[3][1] == 'c') migOption=1; + } + } + else{ + usage(); + } + +/* read in first two lines of output (parameters and seed) */ + pfin = stdin ; + fgets( line, LINEBUF, pfin); + sscanf(line," %s %d %d", dum, &nsam, &howmany); + fgets( line, LINEBUF, pfin); + + + list = cmatrix(nsam,maxsites+1); + posit = (double *)malloc( maxsites*sizeof( double ) ) ; + + count=0; + + //print header line + printf("pi1\thetVar1\tss1\tprivate1\tthetaH1\ttajd1\tH1\tHapCount1\tZnS1\tpi2\thetVar2\tss2\tprivate2\tthetaH2\ttajd2\tH2\tHapCount2\tZnS2\tFst\tsnn\tdxy_mean\tdxy_min\tgmin\tzx\tdd1\tdd2\tddRank1\tddRank2\tibsMaxB\tibsMean1\tibsMean2" ) ; +// for(i=0;i= maxsites){ + maxsites = segsites + 10 ; + posit = (double *)realloc( posit, maxsites*sizeof( double) ) ; + biggerlist(nsam,maxsites, list) ; + } + if( segsites > 0) { + fscanf(pfin," %s", astr); + + for( i=0; i\n"); + exit(1); +} +