-
Notifications
You must be signed in to change notification settings - Fork 1
/
merge_by_NthCol.py
167 lines (146 loc) · 6.73 KB
/
merge_by_NthCol.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/env python
import sys, argparse
from argparse import RawTextHelpFormatter
#######################################################
### 0. script description, arguments, and functions ###
#######################################################
## 0.0 synopsis
# Help texts handed to argparse: synopsis1 is the short description and
# synopsis2 the detailed epilog. Each literal below is one output line;
# adjacent literals are concatenated by the parser at compile time.
synopsis1 = (
    "- add <file2> to <file1>, by matching values in the 1st column of <file2>,\n"
    "with the <N>st/nd/th column of the <file1>;\n"
    "- similar to the old 'join_files_by_NthCol.py' but with more flexibility;\n"
)
synopsis2 = (
    "detailed description:\n"
    "1. Input files, argument, and options\n"
    "- <file1> and <file2> are tab-delimited text files;\n"
    "2. Merging options:\n"
    "- '-m Mode' [1]: dealing with multiple matches in <file2>;\n"
    "-m 1 : only the first occurence in <file2> is merged (default)\n"
    "-m 2 : merge all matching <file2> lines, repeat the <file1> line\n"
    "- '-n N' [1]: the <file1> column number that acts as keys to merge <file2>;\n"
    "default is to use the 1st column of <file1>;\n"
    "- '-H'|'--header' [False]: first lines of <file1> and <file2> are column headers\n"
    "and merged regardless of matching;\n"
    "- '-k'|'--keep_keys' [False]: include the 1st column of <file2> (keys) in the\n"
    "merged file; default is not to include, since it will be redundant;\n"
    "*** options related to when a line in <file1> does not have any match in <file2>;\n"
    "- '-x' [False]: do not print lines in <file1> without a match in <file2>;\n"
    "- '-e filler_when_empty' ['na']: filler string to use when there is no match\n"
    "in <file2>; -e '' will fill in with empty tabs;\n"
    "- '-N totalCol' [0]: set the total number of columns in <file1> - lines in\n"
    "<file1> with different column numbers will be printed as they are, without\n"
    "looking for a match in <file2>; totalCol must be the same or larger than N;\n"
    "if not given (default), all lines in <file1> will be matched to <file2>;\n"
    "3. Output:\n"
    "- the resulting merged file is printed to STDOUT;\n"
    "- the 1st column of <file2>, used as the key, is omitted in the merged file;\n"
    "by [email protected] 20220609 ver 0.3.2\n\n"
)
#version_history
# 20220609 ver 0.3.2 fix a bug where empty columns at the end of a record were stripped; handle a single-column <file2>
# 20220531 ver 0.3.1 fix a bug where certain warnings are printed to stdout rather than stderr
# 20210429 ver 0.3 add '-x' to remove <file1> lines without a match in <file2>
# 20210124 ver 0.2 add '-N' to set the total column number and skip lines in <file1> with a different number of columns.
# 20210102 ver 0.1 bug fix and add '-k' option
# 20201224 ver 0.0 modified from "join_files_by_NthCol.py"
## 0.1 parsing arguments
# RawTextHelpFormatter keeps the hand-placed line breaks of the description
# and epilog instead of re-wrapping them.
parser = argparse.ArgumentParser(
    description=synopsis1,
    epilog=synopsis2,
    formatter_class=RawTextHelpFormatter,
)
# positional parameters: argparse opens both input files for reading
for _positional in ("file1", "file2"):
    parser.add_argument(_positional, type=argparse.FileType("r"))
# options, registered in the same order as before so the usage line is unchanged
_option_table = (
    (("-m",), {"dest": "mode", "type": int, "default": 1}),
    (("-n",), {"dest": "N", "type": int, "default": 1}),
    (("-H", "--header"), {"action": "store_true", "default": False}),
    (("-k", "--keep_keys"), {"action": "store_true", "default": False}),
    (("-x",), {"dest": "exclude_if_no_match", "action": "store_true", "default": False}),
    (("-e",), {"dest": "filler_when_empty", "type": str, "default": "na"}),
    (("-N",), {"dest": "totalCol", "type": int, "default": 0}),
)
for _flags, _settings in _option_table:
    parser.add_argument(*_flags, **_settings)
args = parser.parse_args()
# parse options
# '-n' is a 1-based column number; anything below 1 falls back to the default.
if args.N > 0:
    N = args.N
else:
    # Trailing newline added so the next stderr message starts on its own line.
    sys.stderr.write("Warning: '-n N' needs a positive integer; set to default (n=1)\n")
    N = 1
# '-m' only knows modes 1 (first match) and 2 (all matches).
if args.mode in {1, 2}:
    mode = args.mode
else:
    sys.stderr.write("Warning: '-m mode' parameter out of range; set to default (mode=1)\n")
    mode = 1
exclude_if_no_match = args.exclude_if_no_match
filler = args.filler_when_empty
number_of_fields_f1 = args.totalCol  # 0 means "do not check the column count"
# The help text requires totalCol >= N. Only warn here; the value is kept as given.
if 0 < number_of_fields_f1 < N:
    sys.stderr.write(
        "Warning: '-N totalCol' (%d) is smaller than '-n N' (%d); "
        "<file1> lines with %d columns have no key column\n"
        % (number_of_fields_f1, N, number_of_fields_f1)
    )
##########################
### 1. reading <file2> ###
##########################
# Index every <file2> line by its (whitespace-stripped) 1st column so that
# <file1> keys can be looked up directly while merging.
sys.stderr.write(" reading %s ...\n" % args.file2.name)
lines_in_f2_dict = dict()  # key = string in the 1st column; value = list of lines
header_f2 = ""
# Field count of the first <file2> line. Initialised here so that an empty
# <file2> leaves it at 0 instead of raising NameError in the checks below.
number_of_fields_in_f2 = 0
header = True  # True only while the first line of <file2> is being processed
for line in args.file2:
    # Strip only spaces/newlines so trailing empty (tab-separated) columns survive.  # v.0.3.2
    stripped = line.strip(" \n")
    tok = stripped.split('\t')
    if args.keep_keys:
        line_to_add = stripped
    else:
        line_to_add = '\t'.join(tok[1:])
    if header and args.header:
        header_f2 = line_to_add
    else:
        lines_in_f2_dict.setdefault(tok[0].strip(), []).append(line_to_add)
    if header:  # counting fields in <file2> to "fill" with "filler" when there is no match,
        number_of_fields_in_f2 = len(tok)
        sys.stderr.write("Reading %d fields from %s \n" % (number_of_fields_in_f2, args.file2.name))
        header = False
    elif number_of_fields_in_f2 != len(tok):
        sys.stderr.write("Warning: a line in %s has a different number of fields:\n" % args.file2.name)
        sys.stderr.write(line)
args.file2.close()
if number_of_fields_in_f2 == 0:
    sys.stderr.write("Warning: %s is empty - no line in <file1> will have a match\n" % args.file2.name)
elif number_of_fields_in_f2 == 1:
    if exclude_if_no_match:
        sys.stderr.write("%s has only one column and '-x' is on == extracting lines rather than merging ;)\n" % args.file2.name)
    else:
        sys.stderr.write("%s has only one column and '-x' is off - exiting since there is nothing to do ;)\n" % args.file2.name)
        sys.exit(0)
#################################################
### 2. merging to <file1> and write to STDOUT ###
#################################################
# Stream <file1> line by line, look its key column up in lines_in_f2_dict
# and print the merged record(s) to STDOUT.
sys.stderr.write(" merging to %s using column #%d ...\n" % (args.file1.name, N) )
awaiting_header = True
num_line = 0
# Number of filler cells appended to a <file1> line that has no match.
cells_to_fill = number_of_fields_in_f2 if args.keep_keys else number_of_fields_in_f2 - 1
for num_line, line in enumerate(args.file1, start=1):
    row = line.strip(" \n")

    # The first line is a header when '-H' is given: merged regardless of matching.
    if args.header and awaiting_header:
        awaiting_header = False
        if number_of_fields_in_f2 > 1:
            print(row + '\t' + header_f2)
        else:
            print(row)
        continue

    cells = line.split('\t')
    # v0.2: lines not matching the set column number are printed as they are.
    if number_of_fields_f1 != 0 and number_of_fields_f1 != len(cells):
        print(row)
        continue

    try:
        key_f1 = cells[N - 1].strip()
    except IndexError:
        # The line has fewer than N columns: warn and print nothing for it.
        sys.stderr.write( "Warning: line %d has a non-valid column in %s\n" % (num_line, args.file1.name) )
        continue

    if key_f1 in lines_in_f2_dict:
        matches = lines_in_f2_dict[key_f1]
        if number_of_fields_in_f2 == 1:  # v.0.3.2: single-column <file2>, nothing to append
            print(row)
        elif mode == 1:
            print(row + '\t' + matches[0])
        else:
            for matched in matches:
                print(row + '\t' + matched)
    elif not exclude_if_no_match:  # v.0.3
        print(row + ('\t' + filler) * cells_to_fill)
args.file1.close()