-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathcb_local
executable file
·249 lines (221 loc) · 9.89 KB
/
cb_local
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#!/usr/bin/perl -w
##
# Author: Ben Langmead
# Date: March 17, 2010
#
# Use perl Hadoop wrappers to intiate a local Crossbow computation
# similar to the Hadoop version (but without using Hadoop or Java).
#
use strict;
use warnings;
use Getopt::Long;
use FindBin qw($Bin);
use lib $Bin;
use CrossbowIface;
use Cwd 'abs_path';
Getopt::Long::Configure ("pass_through", "no_auto_abbrev", "permute", "prefix_pattern=(--|-)");
my $APP = "Crossbow";
my $app = lc $APP;
my $SCRIPT = "cb_local";
my $VERSION = `cat $Bin/VERSION`; $VERSION =~ s/\s//g;
my $usage = qq{
$SCRIPT: Run $APP v$VERSION on a single computer
Usage: perl $SCRIPT --input <url> --output <url> \
[--reference <url> | --just-preprocess ] [options]
Options (defaults in []):
Job params:
--input <path> Local path to input. Typically points to a directory
containing preprocessed reads, but if --preprocess or
--just-preprocess are enabled, points to a manifest
file. If --resume-* is specified, <path> is the
directory with the intermediate results to be resumed.
--output <path> Path to store final output
--intermediate <path> Path to store intermediate output
[/tmp/$APP/intermediate]
--dry-run Produce and print path to a script for running the
local job, but don't run it.
--bowtie <path> Path to bowtie binary [search BOWTIE_HOME, PATH]
--soapsnp <path> Path to soapsnp binary [search SOAPSNP_HOME, PATH]
$APP params:
--reference <path> Path to directory with reference information (must be
local, with "index", "sequences", "snps" subdirs)
--just-align Don't do SNP calling; --output will contain alignments
--bowtie-args "<args>" Arguments for Bowtie [-M 1] (Note: --partition --mm -t
--hadoopout --startverbose are always set by Crossbow)
--ss-args "<args>" Arguments for SOAPsnp [-2 -u -n -q] (Note: -i -d -o -s
-z -L -T are always set by Crossbow)
--ss-hap-args "<args>" Additional SOAPsnp arguments when reference is haploid
[-r 0.0001] (Note: -m is always set by Crossbow)
--ss-dip-args "<args>" Additional SOAPsnp arguments when reference is diploid
[-r 0.00005 -e 0.0001]
--haploids "<chrs>" Comma-separated names of references to be considered
haploid. Others are considered diploid. [None]
--all-haploids Consider all chromosomes to be haploid when calling
SNPs. [All diploid]
--quality <type> Encoding for sequence quality values; one of: phred33,
phred64, solexa64 [phred33]
--discard-reads <frac> Randomly discard specified fraction of input reads.
[off]
--truncate <int> Truncate reads longer than <int> bases to <int> bases
by trimming from the 3' end.
--truncate-discard <int> Same as --truncate except that reads shorter than
<int> bases are discarded.
--partition-len <int> Partition length in bases [1 million]
Preprocessing params (not necessary if --input contains preprocessed reads):
--preprocess --input URL is a manifest file describing a set of
unpreprocessed, FASTQ read files; preprocess them
before running $APP [off]
--just-preprocess Like --preprocess but $APP isn't run; --output
contains preprocessed reads [off]
--pre-output <url> If --preprocess is on, put preprocessed output here
instead of in the intermediate directory [off]. Has
no effect if --just-preprocess is specified (--output
is used instead). Useful if future jobs use same
input.
--pre-compress <type> Compression type; one of: gzip, none [gzip]
--pre-stop <int> Stop preprocessing after <int> reads/mates [no limit]
--pre-filemax <int> Split preprocessed output such that there are no more
than <int> reads/mates per preprocessed read file;
0 = no limit. [500,000]
Other params:
--cpus <int> # CPUs to use (in parallel) for this job [1].
--max-sort-records <int> Maximum # records dispatched to sort routine at one
time when sorting bins before reduce step. In each
child process, this number is effectively divided by
--cpus. [200000]
--max-sort-files <int> Maximum # total files opened by sort routines. In
each child process, this number is effectively divided
by --cpus. [40]
--dont-overwrite Abort if an intermediate or output directory already
exists, instead of removing it [off]
--tempdir <path> Put temporary scripts in <path>
[/tmp/$APP/invoke.scripts]
(umask 0077 used to protect credentials)
};
# Try to avoid forcing the user to use the equals sign in cases where
# they're specifying a set of arguments, as in --bowtie-args "-n 3 -l 35"
for(my $i = 0; $i < scalar(@ARGV)-1; $i++) {
if($ARGV[$i] =~ /^-.*-args$/) {
$ARGV[$i] = "$ARGV[$i]=\"".$ARGV[$i+1]."\"";
splice @ARGV, $i+1, 1;
}
}
my $input = "";
my $output = "";
my $intermediate = "";
my $ref = "";
my $index = "";
my $sequences = "";
my $snps = "";
my $cmap = "";
my $justPreprocess = 0;
my $verbose = 0;
my $test = 0;
sub dieusage($$$) {
my ($text, $usage, $lev) = @_;
print STDERR "$usage\nError:\n";
print STDERR "$text\n\n";
exit $lev;
}
GetOptions (
"input:s" => \$input,
"output:s" => \$output,
"intermediate:s" => \$intermediate,
"reference:s" => \$ref,
"index:s" => \$index,
"sequences:s" => \$sequences,
"snps:s" => \$snps,
"cmap:s" => \$cmap,
"just-preprocess" => \$justPreprocess,
"test" => \$test,
"verbose" => \$verbose
) || die "Error parsing options";
##
# Take a path and make it absolute. If it has a protocol, assume it's
# already absolute.
#
sub absPath($$$) {
my ($path, $check, $name) = @_;
return $path if $path =~ /^s3n?:\//i;
return $path if $path =~ /^hdfs:\//i;
return $path if $path =~ /^file:\//i;
$path =~ s/^~/$ENV{HOME}/;
die "Error: $name path doesn't exist: $path" unless (!$check || -f $path || -d $path);
return abs_path($path);
}
if($verbose) {
print STDERR "Relative paths:\n";
print STDERR " input: $input\n";
print STDERR " output: $output\n";
print STDERR " intermediate: $intermediate\n";
print STDERR " reference: $ref\n";
print STDERR " index: $index\n";
print STDERR " sequences: $sequences\n";
print STDERR " snps: $snps\n";
print STDERR " cmap: $cmap\n";
}
$input = absPath($input, 1, "--input") if $input ne "";
$output = absPath($output, 0, "--output") if $output ne "";
$intermediate = absPath($intermediate, 0, "--intermediate") if $intermediate ne "";
$index = absPath($index, 1, "--index") if $index ne "";
$ref = absPath($ref, 1, "--reference") if $ref ne "";
$sequences = absPath($sequences, 1, "--sequences") if $sequences ne "";
$snps = absPath($snps, 1, "--snps") if $snps ne "";
$cmap = absPath($cmap, 1, "--cmap") if $cmap ne "";
if($verbose) {
print STDERR "Absolute paths:\n";
print STDERR " input: $input\n";
print STDERR " output: $output\n";
print STDERR " intermediate: $intermediate\n";
print STDERR " reference: $ref\n";
print STDERR " index: $index\n";
print STDERR " sequences: $sequences\n";
print STDERR " snps: $snps\n";
print STDERR " cmap: $cmap\n";
}
if(!$test) {
$input ne "" || dieusage("Must specify --input", $usage, 1);
$output ne "" || dieusage("Must specify --output", $usage, 1);
}
my @args = ();
push @args, "--local-job";
push @args, ("--input-local", $input) if $input ne "";
push @args, ("--output-local", $output) if $output ne "";
push @args, ("--intermediate-local", $intermediate) if $intermediate ne "";
push @args, "--verbose" if $verbose;
if($ref ne "") {
$index = "$ref/index";
$sequences = "$ref/sequences";
$snps = "$ref/snps";
$cmap = "$ref/cmap.txt";
-d $index || die "Index directory under --reference directory, $index doesn't exist\n";
-d $sequences || die "Sequences directory under --reference directory, $sequences doesn't exist\n";
-d $snps || die "SNPs directory under --reference directory, $snps doesn't exist\n";
-f $cmap || die "cmap files under --reference directory, $cmap doesn't exist\n";
my @indexes = <$index/*.rev.1.ebwt>;
scalar(@indexes) <= 1 ||
die "Error: there seems to be more than one index in $index\n";
scalar(@indexes) == 1 || die "Error: no index found in $index\n";
my @is = split(/\//, $indexes[0]);
my $indexBase = $is[-1];
$indexBase =~ s/\.rev\.1\.ebwt$//;
push @args, ("--index-local", "$index/$indexBase");
push @args, ("--sequences-local", $sequences);
push @args, ("--snps-local", $snps);
push @args, ("--cmap-local", $cmap);
} else {
push @args, ("--index-local", $index) if $index ne "";
push @args, ("--sequences-local", $sequences) if $sequences ne "";
push @args, ("--cmap-local", $cmap) if $cmap ne "";
push @args, ("--snps-local", $snps) if $snps ne "";
}
if(!$justPreprocess) {
$index ne "" || $test || die "Must specify --reference or --index\n";
$sequences ne "" || $test || die "Must specify --reference or --sequences\n";
$snps ne "" || $test || die "Must specify --reference or --snps\n";
$cmap ne "" || $test || die "Must specify --reference or --cmap\n";
}
push @args, "--test" if $test;
push @args, "--just-preprocess" if $justPreprocess;
push @args, @ARGV;
CrossbowIface::crossbow(\@args, $SCRIPT, $usage, undef, undef, undef, undef);