-
Notifications
You must be signed in to change notification settings - Fork 16
/
tsv2my
executable file
·74 lines (63 loc) · 1.9 KB
/
tsv2my
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python2.7
"""
USAGES
tsv2my mydb.temptable < data.tsv
xlsx2tsv data.xlsx | python loadtsv.py mydb.temptable
Loads TSV-with-header data into a new mysql table.
All columns are varchar's.
Uses strict TSV - no quoting or comments, and no tabs or newlines in values.
"""
import sys,os,re
from collections import defaultdict
if len(sys.argv) == 1:
print "need DBNAME.TABLENAME argument"
sys.exit(1)
db_table_spec = sys.argv[1]
assert '.' in db_table_spec
db_name,table_name = db_table_spec.split('.')
if len(sys.argv)>2:
colspec = sys.argv[2]
else:
colspec = None
def uniq_c(seq):
ret = defaultdict(lambda:0)
for x in seq:
ret[x] += 1
return dict(ret)
input = sys.stdin
header = input.readline()
columns = [re.sub('[,:; -]','_',h.strip()) for h in header[:-1].split("\t")]
dups = set(col for col,count in uniq_c(columns).items() if count>1)
#print dups
if dups:
dup_counts = defaultdict(lambda:0)
for i,col in enumerate(columns):
if col in dups:
dup_counts[col] += 1
columns[i] = "%s%d" % (col, dup_counts[col])
#print columns
#print len(columns)
assert len(set(columns)) == len(columns)
import MySQLdb
conn = MySQLdb.connect(user='root', db=db_name)
curs = conn.cursor()
curs.execute("drop table if exists `%s`" % table_name)
if colspec:
col_types = colspec.split(",")
else:
col_types = ['text'] * len(columns)
max_size = 2000
sql = "create table `%s` (" % table_name
# sql += ",".join( "`%s` varchar(%d)" % (c,max_size) for c in columns )
sql += ",".join( "`%s` %s" % (columns[j], col_types[j]) for j in range(len(columns)))
sql += ")"
curs.execute(sql)
insert_sql = "insert into %s values (%s)" % (table_name, ",".join(["%s"] * len(columns)))
#print insert_sql
for line in input:
values = line[:-1].split("\t")
#print values
#print len(values)
# if any(len(v)>max_size for v in values):
# print "Warning, value truncated"
curs.execute(insert_sql, values)