-
Notifications
You must be signed in to change notification settings - Fork 8
/
stream_grep
executable file
·252 lines (234 loc) · 7.17 KB
/
stream_grep
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
#!/usr/bin/perl -w
#
#
# Stream_grep Splits a stream file based on tag name/value
#
# Written by Andrew Aquila 2011
#
# Version 1.1 Dec 1 2011:
# Now stdin and stdout work so piping is possible
# Changed input options to match shell script test function for numbers
# Added -v for invert-match option
# Added cell parameter matching options
#
use Getopt::Long;
use Switch;
# Command-line state.  Every variable stays undefined unless its option is
# given on the command line; the code further down tests them with defined().
my ($input_stream_name, $tag_name, $output_stream_name, $help, $v,
    $lt, $le, $eq, $ge, $gt, $ne, $cell_a, $cell_b, $cell_c, $cell_al,
    $cell_be, $cell_ga) ;
# Parse the command line.  $opts is false when GetOptions rejected something.
# NOTE(review): 'g|greater-than' takes no value and is bound to $gt, so it
# presumably stores 1 there (acting like "--gt 1") -- confirm that this
# undocumented flag is still wanted.
my $opts = GetOptions('help|?|h' => \$help, 'i|input=s' => \$input_stream_name,
    'o|output=s' => \$output_stream_name, 'n|tag-name=s' => \$tag_name, 'v|invert-match' => \$v,
    'eq=f'=>\$eq, 'lt=f'=>\$lt, 'le=f'=>\$le, 'ge=f'=>\$ge,'gt=f'=>\$gt,'ne=f'=>\$ne,
    'g|greater-than' => \$gt, 'cell-a' => \$cell_a, 'cell-b' => \$cell_b, 'cell-c' => \$cell_c,
    'cell-alpha' => \$cell_al, 'cell-beta' => \$cell_be, 'cell-gamma' => \$cell_ga);
#sanity check and error message
if (! $opts or defined $help) {
    # Echo whatever arguments are left over, then show the usage text.
    print STDERR "@ARGV\n";
    help_msgs();
    # Asking for help is a success; a rejected command line is a usage
    # error and must be visible to the calling shell as a non-zero status.
    if ($opts) {
        exit 0;
    }
    exit 1;
}
#check if filtering a Cell parameter
# Decide which single line pattern selects the value to filter on: either
# the user's --tag-name or one of the built-in cell patterns below.
# $N_cell_types counts how many selectors were requested so that zero or
# several of them can be rejected.  $cell_type is set to 1 when a built-in
# cell pattern is chosen; it is later handed to check_match() so the value
# is read from the pattern's capture group.
my $N_cell_types = 0;
my $cell_type = 0;
if (defined $tag_name) {
    $N_cell_types++;
}
# Each entry pairs an option flag with the pattern it selects.  The capture
# group marks which of the three lengths (or three angles) is extracted.
my @cell_filters = (
    [ $cell_a,  "^Cell\ parameters\ ([0-9\.]+)\ [0-9\.]+\ [0-9\.]+" ],
    [ $cell_b,  "^Cell\ parameters\ [0-9\.]+\ ([0-9\.]+)\ [0-9\.]+" ],
    [ $cell_c,  "^Cell\ parameters\ [0-9\.]+\ [0-9\.]+\ ([0-9\.]+)" ],
    [ $cell_al, "([0-9\.]+)\ [0-9\.]+\ [0-9\.]+ deg\$" ],
    [ $cell_be, "[0-9\.]+\ ([0-9\.]+)\ [0-9\.]+ deg\$" ],
    [ $cell_ga, "[0-9\.]+\ [0-9\.]+\ ([0-9\.]+) deg\$" ],
);
foreach my $cell_filter (@cell_filters) {
    my ($selected, $pattern) = @{$cell_filter};
    if (defined $selected) {
        $tag_name = $pattern;
        $cell_type = 1;
        $N_cell_types++;
    }
}
# A bit of error checking on number of tags
# Both situations are usage errors, so leave with a non-zero status.
if ($N_cell_types>1) {
    print STDERR "More then one tag-name/cell parameret is used!\n";
    help_msgs();
    exit 1;
}
if (!defined $tag_name) {
    print STDERR "No tag-name/cell parameret is defined!\n";
    help_msgs();
    exit 1;
}
#set type and tag value
# $tag_type is the comparison code later handed to check_match():
#   0 = no comparison, 1 = lt, 2 = le, 3 = eq, 4 = ge, 5 = gt, 6 = ne.
# $tag_value is the number given with that option, and $N_tag_types counts
# how many comparison options were supplied.
my $tag_type = 0;
my $tag_value = 0;
my $N_tag_types = 0;
# Each entry pairs a comparison code with the option value that requests it.
my @comparisons = (
    [ 1, $lt ],
    [ 2, $le ],
    [ 3, $eq ],
    [ 4, $ge ],
    [ 5, $gt ],
    [ 6, $ne ],
);
foreach my $comparison (@comparisons) {
    my ($type_code, $limit) = @{$comparison};
    if (defined $limit) {
        $tag_type = $type_code;
        $tag_value = $limit;
        $N_tag_types++;
    }
}
# sanity check for xor of numeric options
# Supplying several comparisons is a usage error: exit with a non-zero status.
if ($N_tag_types>1) {
    print STDERR "More then one comparison is used!\n";
    help_msgs();
    exit 1;
}
#set inverse value
# $v becomes a sign: -1 when --invert-match was given, +1 otherwise.  It is
# multiplied with the +1/-1 answer of check_match() to decide what to print.
if (defined $v) {$v = -1;} else {$v = 1;}
# set input file handle
# Standard input is the default.  The name "-" keeps meaning standard input,
# which is how the former two-argument open treated it.
my $FHin = \*STDIN;
my $input_is_file = 0;
if (defined $input_stream_name and $input_stream_name ne '-') {
    # Three-argument open: the file name is taken literally, so characters
    # such as ">" or "|" in the name cannot change the open mode.
    open(my $in_handle, '<', $input_stream_name) || die "Can't open file $input_stream_name\n";
    $FHin = $in_handle;
    $input_is_file = 1;
}
# set output file handle
# Standard output is the default; "-" is kept as an alias for it.
my $FHout = \*STDOUT;
my $output_is_file = 0;
if (defined $output_stream_name and $output_stream_name ne '-') {
    open(my $out_handle, '>', $output_stream_name) || die "Can't open file $output_stream_name\n";
    $FHout = $out_handle;
    $output_is_file = 1;
}
# initialize variables
my @chunk =();        # lines of the chunk currently being collected
my $N_chunks = 0;     # number of "Begin chunk" markers seen so far
my $N_matches = 0;    # number of chunks written to the output
my $test_chunk;
my $line;
# loop over file
while ($line = <$FHin>) {
    if ($line =~ /^-----\ Begin\ chunk -----$/) { # new chunk!
        if (@chunk != 0) { # ignore if empty (i.e. first chunk)
            $test_chunk = check_match(\@chunk,$tag_name,$tag_value,$tag_type,$cell_type);
            if (($test_chunk * $v) > 0) { # simple test including inverse
                print_chunk(\@chunk,$FHout);
                $N_matches++;
            }
        }
        $N_chunks++;
        @chunk = (); # clear chunk
    }
    if ($N_chunks == 0) { # check if in header
        print $FHout $line; # print header
    }
    else {
        push(@chunk, $line); # add line to end of the chunk
    }
}
# don't forget the last chunk!
# Only test it when something was collected: with --invert-match an empty
# chunk would otherwise be counted as a match on input without any chunks.
if (@chunk != 0) {
    $test_chunk = check_match(\@chunk,$tag_name,$tag_value,$tag_type,$cell_type);
    if (($test_chunk * $v) > 0) {
        print_chunk(\@chunk,$FHout);
        $N_matches++;
    }
}
# close handles if files
if ($input_is_file) {
    close($FHin);
}
if ($output_is_file) {
    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($FHout) || die "Can't close file $output_stream_name: $!\n";
}
# print useful data on the old and new streams
print STDERR "I have read $N_chunks chunks.\n";
print STDERR "Of those $N_matches matched the criteria.\n";
# function to print the chunk
#   $chunk_ref - reference to the array of lines to write
#   $fh        - output file handle the lines are written to
# Returns the result of the underlying print.
sub print_chunk
{
    # Lexical copies: the arguments must not leak into package globals.
    my ($chunk_ref, $fh) = @_;
    print {$fh} @{$chunk_ref};
}
# function to match chunk
# returns 1 if TRUE and -1 if FALSE
#
# Arguments:
#   $chunk_ref  - reference to the array of lines making up one chunk
#   $name       - pattern used to find the line(s) of interest
#   $ref_value  - number the extracted value is compared with
#   $eq_type    - comparison selector: 0 = none (a matching line is enough),
#                 1 = <, 2 = <=, 3 = ==, 4 = >=, 5 = >, 6 = !=
#   $split_type - true: the value is the first capture group of $name;
#                 false: the value is the text following the first "="
#
# Every line matching $name is tried in turn; the first one that satisfies
# the comparison makes the whole chunk match.  When a comparison is requested,
# lines whose value is missing or not a number are skipped instead of being
# silently compared as 0.
sub check_match
{
    my ($chunk_ref, $name, $ref_value, $eq_type, $split_type) = @_;
    require Scalar::Util;    # core module, supplies looks_like_number()
    foreach my $line (@{$chunk_ref}) {
        next unless ($line =~ $name);
        my $value;
        if ($split_type) {
            $value = $1; # evaluate cell parameter
        } else {
            (undef, $value) = split(/=/, $line); # evaluate everything else
        }
        # Without a comparison the presence of the tag is all that counts.
        return 1 if ($eq_type == 0);
        next unless (defined $value);
        # Strip the blanks around the value and the trailing newline.
        $value =~ s/^\s+//;
        $value =~ s/\s+$//;
        next unless (Scalar::Util::looks_like_number($value));
        my $passes = 0;
        if    ($eq_type == 1) { $passes = ($value <  $ref_value); }
        elsif ($eq_type == 2) { $passes = ($value <= $ref_value); }
        elsif ($eq_type == 3) { $passes = ($value == $ref_value); }
        elsif ($eq_type == 4) { $passes = ($value >= $ref_value); }
        elsif ($eq_type == 5) { $passes = ($value >  $ref_value); }
        elsif ($eq_type == 6) { $passes = ($value != $ref_value); }
        return 1 if ($passes);
    }
    return -1; # chunk is empty and nothing matches
}
# Write the usage text to STDERR.  When arguments are passed they are
# reported first as an unknown option.
sub help_msgs
{
    # Collect the whole message first, then emit it with a single print.
    my @message = ();
    if (@_) {
        push(@message, "Unknown option: @_\n");
    }
    push(@message,
        "Syntax: stream_grep [options] \n",
        "Stream_grep takes in a CrystFEL stream and outputs a stream \n",
        "with only chunks matching the specific tag-name and tag-value.\n\n",
        "-h, --help\t Displays this help message.\n",
        "-i, --input=<file>\t Input CrystFEL stream filename (default is stdin)\n",
        "-o, --output=<file>\t Output CrystFEL stream filename (default is stdout)\n",
        "-n, --tag-name=<name>\t Name of tag to match on\n",
        "-v, --invert-match\t Select non-matching chunks\n",
        "\n",
        "--cell-a\t Use the smallest unit cell length [nm] as the tag-name\n",
        "--cell-b\t Use the middle unit cell length [nm] as the tag-name\n",
        "--cell-c\t Use the largest unit cell length [nm] as the tag-name\n",
        "--cell-alpha\t Use the first rotation angle [deg] as the tag-name\n",
        "--cell-beta\t Use the second rotation angle [deg] as the tag-name\n",
        "--cell-gamma\t Use the third rotation angle [deg] as the tag-name\n",
        "\n",
        "-eq <value>,\t Match all chunks of the stream with tag values equal to the given value\n",
        "-ne <value>,\t Match all chunks of the stream with tag values not equal to the given value\n",
        "-lt <value>,\t Match all chunks of the stream with tag values less then the given value\n",
        "-le <value>,\t Match all chunks of the stream with tag values less then or equal to the given value\n",
        "-gt <value>,\t Match all chunks of the stream with tag values greater then the given value\n",
        "-ge <value>,\t Match all chunks of the stream with tag values greater then or equal to the given value\n",
        "\n",
        "Usage note: if --tag-name is specified without a comparison tag-value then ",
        "all chunks with the tag-name match.\n",
    );
    print STDERR @message;
}