-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinx2csv.py
125 lines (96 loc) · 3.73 KB
/
inx2csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -*- coding: utf-8 -*-
'''
Created on 2013/02/27
@author: user
'''
import logging
logging.basicConfig(
level=logging.INFO,
format="%(levelname)-8s %(module)-16s %(funcName)-16s@%(lineno)d - %(message)s"
)
import unittest
import sys
import re
import argparse
from datetime import datetime
from inx import INXParser
from fdt import FDTParser, FDTParserVisitor
class FDTParserVisitorDetailFormatter(FDTParserVisitor):
    """Visitor that dumps every visited example and document as a CSV row.

    Rows are written fully quoted with each cell encoded as UTF-8. Two
    counters back the "total / skipped" progress line: ``_total`` is bumped
    by the visit methods, ``_row`` by every row actually written.
    """

    def __init__(self, path):
        """Open *path* for binary writing and reset the progress counters."""
        import csv
        sink = open(path, "wb")
        self._writer = csv.writer(sink, quoting=csv.QUOTE_ALL)
        self._total = 0
        self._row = 0
        self._last_time = datetime.now()

    def print_detail(self):
        """Print "total / skipped" if over a second passed since the last report."""
        current = datetime.now()
        elapsed = (current - self._last_time).total_seconds()
        if elapsed <= 1:
            return
        skipped = self._total - self._row
        print("%s / %s" % (self._total, skipped))
        self._last_time = current

    def _write(self, row):
        """Count *row* as written, report progress, then emit it as CSV."""
        self._row += 1
        self.print_detail()
        encoded = [cell.encode("utf-8") for cell in row]
        self._writer.writerow(encoded)

    def example(self, example):
        """Write an example as main_id, sub_id, heading and an empty body."""
        self._total += 1
        fields = [example.main_id, example.sub_id, example.heading, ""]
        self._write(fields)

    def document(self, document):
        """Write a document as main_id, sub_id, heading and body."""
        self._total += 1
        fields = [document.main_id, document.sub_id, document.heading,
                  document.body]
        self._write(fields)
class FDTParserVisitorAnkiFormatter(FDTParserVisitorDetailFormatter):
def __init__(self, path):
super(FDTParserVisitorAnkiFormatter, self).__init__(path)
def example(self, example):
pass
def document(self, document):
self._total += 1
#エイリアス的な見出しはスキップする
body = re.sub(ur"▲▲(.+)?/(\d{11})△△", "", document.body)
if len(body) < 10:
logging.debug("skipped: %s", document)
return
#▲▲{text}/{main_id}△△ -> {text}
body = re.sub(ur"▲▲(.+)?/(\d{11})△△", ur"\1", document.body)
#<CR> -> <BR /><BR />
body = re.sub(ur"<CR>", "<BR /><BR />", body)
#"{text} | " -> {text}
heading = re.sub(ur" \| $", "", document.heading)
self._write([heading, body])
def get_arg():
    """Parse the command line and return the resulting namespace.

    ``--mode`` is optional and accepts ``all`` (the default) or ``anki``;
    ``input`` and ``output`` are required positionals, in that order.
    """
    cli = argparse.ArgumentParser(
        prog="inx2csv.exe",
        description="inx to csv converter",
    )
    cli.add_argument("--mode", choices=["all", "anki"], default="all")
    for positional in ("input", "output"):
        cli.add_argument(positional)
    return cli.parse_args()
def main():
    """Convert the first ``.fdt`` entry of the input file to CSV.

    Parses the command line, echoes the settings, then scans the entries
    returned by ``INXParser().parse``. The first entry whose name ends with
    ``.fdt`` is run through ``FDTParser`` with the visitor selected by
    ``--mode``, after which the function returns. If no entry matches, an
    error line is printed instead.
    """
    arg = get_arg()
    banner = (
        "",
        "input path: %s" % arg.input,
        "output path: %s" % arg.output,
        "mode: %s" % arg.mode,
        "",
    )
    for line in banner:
        print(line)

    archive = INXParser().parse(arg.input)
    for entry in archive.entries:
        if not entry.name.endswith(".fdt"):
            continue

        print("found %s in %s" % (entry.name, arg.input))
        print("converting fdt to csv ...")
        print("total / skipped")

        if arg.mode == "all":
            visitor = FDTParserVisitorDetailFormatter(arg.output)
        elif arg.mode == "anki":
            visitor = FDTParserVisitorAnkiFormatter(arg.output)

        FDTParser().accept(entry.get_content(), visitor)
        visitor.print_detail()
        print("done.")
        return

    print("ERROR: fdt file was not found")
class TestMain(unittest.TestCase):
    """Smoke test that drives main() through both output modes."""

    def test_main(self):
        """Convert the sample file in anki mode, then in the default mode."""
        paths = ["dicts/U_029/_ozh.inx", "test/_ozh.csv"]
        for options in (["--mode", "anki"], []):
            sys.argv = [""] + options + paths
            main()
# Run the converter only when this file is executed as a script,
# not when it is imported (e.g. by a test runner).
if __name__ == "__main__":
    main()