-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathflatten4mongo.py
80 lines (69 loc) · 3.52 KB
/
flatten4mongo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List

import bson
import yaml
class NoAliasDumper(yaml.SafeDumper):
    """A ``yaml.SafeDumper`` whose ``ignore_aliases`` hook always answers True."""

    def ignore_aliases(self, data):
        """Return True for every object, regardless of what ``data`` is.

        NOTE(review): with PyYAML this should make repeated objects be written
        out in full instead of as ``&anchor`` / ``*alias`` pairs -- confirm
        against the PyYAML representer documentation.
        """
        return True
def _is_long_id(value: Any) -> bool:
    """Return True when ``value`` is a string of exactly 20 characters."""
    # NOTE(review): 20 characters presumably matches Firestore auto-generated
    # document IDs -- confirm against the export format.
    return isinstance(value, str) and len(value) == 20


def _object_id_ref(value: str) -> Dict[str, Any]:
    """Wrap the first 12 characters of ``value`` as an ``{'$oid': ObjectId}`` reference."""
    # The ID is truncated to 12 characters before being encoded as UTF-8 bytes
    # for bson.ObjectId.
    # NOTE(review): non-ASCII characters would encode to more than 12 bytes --
    # verify that IDs are always ASCII.
    return {'$oid': bson.ObjectId(bytes(value[:12], 'utf-8'))}


def _mongo_date(value: datetime) -> Dict[str, str]:
    """Render ``value`` as a ``{'$date': <ISO-8601 string>}`` reference.

    The value's own UTC offset (if any) is discarded and only its wall-clock
    reading is kept.  Readings at 00:00:00 are stamped ``+00:00``; every other
    reading is stamped ``-03:00``.
    """
    # Dropping tzinfo explicitly (instead of slicing six characters off the
    # ISO string) keeps naive datetimes intact: they have no offset suffix, so
    # slicing would cut into their seconds or microseconds.
    naive = value.replace(tzinfo=None)
    at_midnight = (naive.hour, naive.minute, naive.second) == (0, 0, 0)
    # NOTE(review): presumably midnight marks date-only fields and everything
    # else is local time at UTC-3 -- confirm with the data owner.
    offset = '+00:00' if at_midnight else '-03:00'
    return {'$date': naive.isoformat() + offset}


def _convert_fields(record: Dict[str, Any]) -> None:
    """Rewrite ID and datetime fields of ``record`` in place.

    * 20-character strings under keys ending in ``_id`` or ``_by`` become
      ``$oid`` references.
    * 20-character strings inside lists under keys ending in ``_ids`` become
      ``$oid`` references (the list itself is modified in place).
    * ``datetime`` values become ``$date`` references.
    """
    # Snapshot the items so that replacing values cannot disturb the iteration.
    for key, value in list(record.items()):
        if isinstance(value, str):
            if _is_long_id(value) and key.endswith(('_id', '_by')):
                record[key] = _object_id_ref(value)
        elif isinstance(value, list):
            if key.endswith('_ids'):
                for index, item in enumerate(value):
                    if _is_long_id(item):
                        value[index] = _object_id_ref(item)
        elif isinstance(value, datetime):
            record[key] = _mongo_date(value)


def flatten(data: Dict[str, Any], res: Dict[str, Any], parentname: List[str], parentid: List[str], all_ids: bool) -> Dict[str, Any]:
    """Flatten nested collections into ``res``, one flat list per collection name.

    Each record may carry a ``_subcollections`` mapping of nested collections.
    That mapping is removed from the record and flattened recursively, and the
    nested records receive ``<parent collection>_id`` fields pointing back at
    their ancestors.  Records are modified in place and the same objects are
    appended to ``res``.  The name of every visited collection is printed.

    Args:
        data: Mapping of collection name to a list of record dicts.  A
            collection whose value is ``None`` is treated as empty.
        res: Accumulator mapping collection name to the list of flattened
            records; updated in place.
        parentname: Names of the ancestor collections, nearest first.
        parentid: ``_id`` values of the ancestor records, aligned with
            ``parentname``.
        all_ids: When true, nested records reference every ancestor; otherwise
            only their direct parent.

    Returns:
        The ``res`` accumulator, for convenience.

    Raises:
        KeyError: If a record with non-empty subcollections has no ``_id``.
    """
    for coll, records in data.items():
        print('collection', coll)
        bucket = res.setdefault(coll, [])
        for record in records or []:
            # Parent references are only written when both lists line up.
            if parentname and len(parentname) == len(parentid):
                for name, ident in zip(parentname, parentid):
                    record[name + '_id'] = ident
            sub = record.pop('_subcollections', None)
            _convert_fields(record)
            bucket.append(record)
            if sub:
                if all_ids:
                    names = [coll, *parentname]
                    idents = [record['_id'], *parentid]
                else:
                    names = [coll]
                    idents = [record['_id']]
                flatten(sub, res, names, idents, all_ids)
    return res
def main():
    """Flatten a YAML export file and write the result as JSON.

    Command line:
        input_file    YAML file to read.
        -a/--allids   Give nested records a reference to every ancestor, not
                      only to their direct parent.
        -m/--mongo    Write one ``<collection>.json`` file per collection into
                      the current working directory, each containing the
                      collection's records as consecutive JSON documents.

    Without ``--mongo`` a single JSON file is written next to the input file,
    using the input name with its extension replaced by ``.json``.

    Returns:
        None, which ``sys.exit`` treats as a successful exit status.
    """
    cmd = argparse.ArgumentParser(description='Flatten Firestore database export data')
    cmd.add_argument('input_file', help='yaml file to process')
    cmd.add_argument('-a', '--allids', action='store_true',
                     help='include full path references in subcollection documents')
    cmd.add_argument('-m', '--mongo', action='store_true',
                     help='dump collections to separate files with ready-to-import format')
    args = cmd.parse_args()

    with open(args.input_file, mode='r', encoding='utf8') as infile:
        # An empty YAML document loads as None; treat it as "no collections".
        data: Dict[str, Any] = yaml.safe_load(infile) or {}

    res: Dict[str, Any] = {}
    flatten(data, res, [], [], args.allids)

    if args.mongo:
        for coll, records in res.items():
            with open(f'{coll}.json', mode='w', encoding='utf-8') as outputfile:
                for record in records:
                    # default=str stringifies values json cannot encode natively
                    # (for example ObjectId instances inside '$oid' wrappers).
                    json.dump(record, outputfile, indent=2, default=str, ensure_ascii=False)
                    outputfile.write('\n')
    else:
        # with_suffix swaps only the final extension, so dotted directory or
        # file names ('./exports/db.v2.yaml') and extension-less names work.
        output_file = Path(args.input_file).with_suffix('.json')
        with open(output_file, mode='w', encoding='utf8') as outputfile:
            json.dump(res, outputfile, indent=2, default=str, ensure_ascii=False)
if __name__ == '__main__':
    # Run the command-line entry point and hand its return value to the
    # interpreter as the process exit status.
    exit_status = main()
    sys.exit(exit_status)