-
Notifications
You must be signed in to change notification settings - Fork 8
/
csv2redis.awk
133 lines (124 loc) · 4.16 KB
/
csv2redis.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/gawk -f
#
# script to output the rows from a csv file to Redis format. the idea is
# that the CSV data will be translated to a Hash structure in Redis and
# inserted using the Redis HMSET command.
#
# the contained function formats the output according to the redis protocol.
#
# prerequisite is, that the CSV file has a header row. The header row names of
# the columns will be used as keys in redis. The header needs to use the same
# separator as is used in the data/rows. it is assumed that only one header row
# is present.
#
# you may pass a variable "separator" to the script to define the separator between
# the individual columns of the rows. the default separator is a comma.
#
# you may pass a variable "rediskey" to the script to define the base key to be
# used in redis. A devider (":") and a unique record/row id will be appended to
# this redis key. Default is "csvfile".
# you may pass a variable "uidcolumn" to the script to define the column (number)
# which is used as a unique id for the row. first column is number "1". the default
# is to use the record (row) number.
#
# you can pipe the resulting data directly to Redis, which will lightning fast.
#
# NOTE: If you have problem with piping a file through awk to redis which contains
# special characters such as é, à, ö, ä, etc. then the solution is to add a flag "-b"
# to the awk command.
#
#
# examples:
#
# awk -f csv2redis.awk csf_filename.csv
# or awk -v separator="\t" -f csv2redis.awk csf_filename.csv
# or awk -v separator="\t" -v uidcolumn=4 -f csv2redis.awk csv_filename.csv
# or awk -v uidcolumn=2 -v rediskey=myfile -f csv2redis.awk csv_filename.csv
# or awk -b -f csv2redis.awk csf_filename.csv | redis-cli --pipe
#
#
#
# last update: 2019-08-04
#
#
# function to construct the redis message according to the redis protocol
function toRedisProtocol(redisKey, recordUid, numberOfKeysAndValues, keys, values)
{
#linefeed
linefeed="\r\n"
# redis command
redisCommand="HMSET"
# devider between the redis key and the UID of the record/row
devider=":"
# number of parts that will be sent to redis:
# the HMSET command + the redis key + the number of header columns + the number of value columns
numberOfParts= 1 + 1 + numberOfKeysAndValues
# number of total parts of the command
part1="*" numberOfParts linefeed
# length of the redisCommand and the redisCommand itself
part2="$" length(redisCommand) linefeed redisCommand linefeed
# length of the redis key + length of the devider + length of the recordUid plus the constructed
# key of redisKey + devider + recordUid
part3="$" length(redisKey) + length(devider)+ length(recordUid) linefeed redisKey devider recordUid linefeed
# intermediate result
result = part1 part2 part3
# the loop creates the appropriate output for each key and value pair (column) of the CSV file
for(i=1;i<=numberOfValues;i++)
{
result= result "$" length(keys[i]) linefeed keys[i] linefeed
result= result "$" length(values[i]) linefeed values[i] linefeed
}
return result
}
# begin of processing
BEGIN {
# setting the field/column separator
if(separator)
{
FS=separator
}
else
{
FS=",";
}
# the key to use for redis. a devider (":") and the unique record key will be appended
# to this redis key, so that it forms a unique identifier for the row.
if(rediskey)
{
redisKey=rediskey
}
else
{
redisKey="csvfile"
}
# the value of which column (number) to use as the unique id for the row.
# if undefined then the record/row number will be used
if(uidcolumn)
{
recordUidColumnNumber=uidcolumn
}
}
# first row only
NR <= 1 {
# split the header row into its individual fields
numberOfKeys=split($0,keys,FS);
}
# all other rows
NR > 1 && $0 {
# split the record into its individual values
numberOfValues=split($0,values,FS)
# the unique id of the record
if(recordUidColumnNumber)
{
recordUid = values[recordUidColumnNumber]
}
else
{
recordUid = NR -1
}
# number of keys and values
numberOfKeysAndValues = numberOfKeys + numberOfValues
# create the redis command
printf "%s", toRedisProtocol(redisKey, recordUid, numberOfKeysAndValues, keys, values)
}