-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathusfm2usx.py
461 lines (412 loc) · 16 KB
/
usfm2usx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
# -*- coding: utf-8 -*-
# This script produces a set of .usx files in tStudio-compatible resource container format from
# USFM source text.
# The resulting containers are importable to BTT-Writer or tStudio to use as source text.
# Chunk division and paragraph locations are based on \s5 markers in the usfm files.
# Uses parseUsfm module to parse the usfm files.
# This script was originally written for converting the Spanish Reina-Valera 1909 Bible
# so that Bible could be used as a source text in BTT-Writer.
# It has also been used for the Danish 'Hellig Bibel'.
# The input file(s) should be verified, correct USFM.
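# Before running the script, set the values in the 'Usfm2Usx' section of the tools configuration,
# which main() reads through configmanager. The commented-out assignments below show example values.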
# Global variables
# source_dir = r'C:\DCS\Persian\pes_opcb'
# Settings for this script; main() loads the 'Usfm2Usx' section via configmanager.
config = None
# GUI application object passed to main(); stays None when no app is supplied.
gui = None
# The State instance for the current run; created in main().
state = None
# Count of books converted in the current run; reset in main(), incremented in convertFile().
nConverted = 0
# rc_dir = r'C:\Users\lvers\AppData\Local\BTT-Writer\library\resource_containers'
# Values to be written into each of the package.json files
# language_code = 'dan'
# language_name = 'Dansk'
# bible_id = 'det' # lowercase 'ulb' or other bible identifier
# bible_name = 'Hellig Bibel'
# direction = "ltr"
# pub_date = "2022-06-29"
# license = "Public Domain (NT)"
# version = "1"
import configmanager
from pathlib import Path
import sys
import os
import parseUsfm
import usfm_verses
import io
import codecs
import re
import json
import yaml
from shutil import copy
from datetime import date
# Most recent token handled by take(); starts as a placeholder built with UsfmToken(None).
lastToken = parseUsfm.UsfmToken(None)
# Matches a verse range such as "3-4", capturing the first and last verse numbers.
vv_re = re.compile(r'([0-9]+)-([0-9]+)')
class State:
    """Tracks the current position (book, chapter, verse) during a conversion.

    Also holds the folder paths derived from the current book/chapter and the
    currently open .usx output file, if any.
    """

    def __init__(self):
        self.ID = ""                    # book identifier taken from the \id marker
        self.title = ""                 # best book title seen so far
        self.chapter = 0
        self.lastChapter = 0            # initialised here so it exists before the first addChapter()
        self.chapterPad = "00"          # chapter number as text, padded to at least two characters
        self.verse = 0
        self.lastVerse = 0
        self.versePad = "00"            # verse number as text, padded to at least two characters
        self.needingVerseText = False   # True between a verse marker and the text that follows it
        self.pendingVerse = 0
        self.sectionPending = ""        # section heading saved by saveSection()
        self.reference = ""             # human-readable location used in messages, e.g. "GEN 3:7"
        self.lastRef = ""
        self.en_content_dir = ""
        self.en_chapter_dir = ""
        self.target_content_dir = ""
        self.target_chapter_dir = ""
        self.usxOutput = None           # open output file object, or None

    def addID(self, id):
        """Start a new book: reset position counters and compute the content folders.

        Reads 'rc_dir', 'language_code' and 'bible_id' from the module-level config.
        """
        self.ID = id
        self.title = ""
        self.chapter = 0
        self.lastVerse = 0
        self.verse = 0
        self.needingVerseText = False
        self.lastRef = self.reference
        self.reference = id
        rc_dir = config['rc_dir']
        book = id.lower()
        self.en_content_dir = os.path.join(rc_dir, "en_" + book + "_ulb", "content")
        target_name = config['language_code'] + "_" + book + "_" + config['bible_id']
        self.target_content_dir = os.path.join(rc_dir, target_name, "content")

    def addTitle(self, bookTitle, mt):
        """Record bookTitle as the book title if it outranks the current one.

        A title is taken when none is set yet, when it is non-ASCII and the
        current one is ASCII, or when it comes from \\mt (mt is true) and both
        titles are alike in being ASCII or non-ASCII.
        """
        if not self.title:
            self.title = bookTitle
        if self.title.isascii() and not bookTitle.isascii():
            self.title = bookTitle
        if mt and self.title.isascii() == bookTitle.isascii():  # \mt is highest priority title, everything else being equal
            self.title = bookTitle

    def addChapter(self, c):
        """Move to chapter c (a numeric string) and reset the verse state.

        Updates the padded chapter number, the reference, and both chapter folders.
        Raises ValueError if c is not an integer string.
        """
        self.lastChapter = self.chapter
        self.chapter = int(c)
        self.chapterPad = c.zfill(2)    # "3" -> "03"; longer values are unchanged
        self.lastVerse = 0
        self.verse = 0
        self.versePad = ""
        self.needingVerseText = False
        self.lastRef = self.reference
        self.reference = self.ID + " " + c
        self.en_chapter_dir = os.path.join(self.en_content_dir, self.chapterPad)
        self.target_chapter_dir = os.path.join(self.target_content_dir, self.chapterPad)

    def addText(self):
        """Note that text has been seen, so the current verse is no longer awaiting text."""
        self.needingVerseText = False

    def addVerses(self, vv):
        """Record the verse(s) named by vv, which may be a range such as "3-4".

        A range is reported through reportError() and then both of its ends
        are passed to addVerse(), so the state ends up on the last verse.
        Raises AttributeError if vv contains a dash but no "N-M" digit pattern.
        """
        if vv.find('-') > 0:
            reportError("Range of verses encountered at " + self.reference)
            vv_range = vv_re.search(vv)
            self.addVerse(vv_range.group(1))
            self.addVerse(vv_range.group(2))
        else:
            self.addVerse(vv)

    def addVerse(self, v):
        """Move to verse v (a numeric string).

        Sets self.versePad for file naming purposes, updates self.reference,
        and marks the verse as awaiting text.
        Raises ValueError if v is not an integer string.
        """
        self.lastVerse = self.verse
        self.verse = int(v)
        self.versePad = v.zfill(2)      # "7" -> "07"; longer values are unchanged
        self.needingVerseText = True
        self.lastRef = self.reference
        self.reference = self.ID + " " + str(self.chapter) + ":" + v

    def setUsxOutput(self, file):
        """Remember file as the current output file (None when nothing is open)."""
        self.usxOutput = file

    def needVerseText(self):
        """Return True if a verse marker has been seen with no text after it yet."""
        return self.needingVerseText

    def saveSection(self, s):
        """Store section heading s in sectionPending."""
        self.sectionPending = s
# def printToken(token):
# if token.isV():
# print("Verse number " + token.value)
# elif token.isC():
# print("Chapter " + token.value)
# elif token.isS():
# sys.stdout.write("Section heading: " + token.value)
# elif token.isTEXT():
# print("Text: <" + token.value + ">")
# else:
# print(token)
def removeBOM(path):
    """Strip leading UTF-8 byte order marks from the file at path, in place.

    Consecutive BOMs at the very start of the file are removed, looking no
    further than the first 60 bytes. The file is rewritten only when at least
    one BOM was found.
    """
    bom = codecs.BOM_UTF8
    limit = 60      # BOMs are only looked for within this many leading bytes
    with open(path, 'rb') as source:
        head = source.read(limit + 3)
        offset = 0
        while offset < limit and head[offset:offset + 3] == bom:
            offset += 3
        if offset == 0:
            return      # no BOM: leave the file untouched
        source.seek(offset)
        remainder = source.read()
    with open(path, 'wb') as target:
        target.write(remainder)
def takeID(id):
    """Handle an \\id marker: register the first three characters of its value as the book ID."""
    book_code = id[:3]
    state.addID(book_code)
def takeC(c):
    """Handle a chapter marker.

    Closes the current .usx file, switches the state to chapter c, makes sure
    the chapter folder exists, writes a default title.usx holding the chapter
    number, and opens "01.usx" in the new chapter folder as the current output.
    """
    closeUsx()
    state.addChapter(c)
    makeChapterDir(state.chapterPad)
    # Default chapter title, in case no \cl marker follows.
    default_title = str(state.chapter)
    createChapterTitleFile(default_title)
    first_chunk = os.path.join(state.target_chapter_dir, "01.usx")
    chunk_file = io.open(first_chunk, "tw", encoding="utf-8", newline='\n')
    state.setUsxOutput(chunk_file)
def takeCL(value):
    """Handle a \\cl marker: write value as the chapter title file, replacing any earlier one."""
    chapter_label = value
    createChapterTitleFile(chapter_label)
def takeF(value):
    """Handle the start of a footnote by writing the opening <note> tag.

    The value argument is not used.
    """
    opening_tag = '<note style="f" caller="+"> '
    state.usxOutput.write(opening_tag)
def takeFTFQA(type, value):
    """Write footnote content as a <char> element whose style is the token type."""
    element = f'<char style="{type}">\n{value} </char>\n'
    state.usxOutput.write(element)
def takeFE():
    """Handle the end of a footnote by writing the closing </note> tag."""
    closing_tag = '</note>\n'
    state.usxOutput.write(closing_tag)
def takeP(type):
    """Handle a paragraph-type marker; deliberately does nothing.

    Paragraphs are not relevant to tStudio/BTTW (confirmed 3/29/22), so no
    <para> element is written.
    """
    # Former output, kept for reference:
    # state.usxOutput.write('<para style="' + type + '">\n\n')
    return None
def takeS(s):
    """Handle a section heading.

    Before the first verse of a chapter the heading is written straight to the
    current output file; after that it is saved in the state instead.
    """
    at_chapter_start = (state.verse == 0)
    if not at_chapter_start:
        state.saveSection(s)
        return
    state.usxOutput.write(s)
def takeV(v):
    """Handle a verse marker.

    Updates the state, opens a new .usx file named after the padded verse
    number if no output file is open, and writes the <verse> element.
    """
    state.addVerses(v)
    if not state.usxOutput:
        # NOTE(review): for a verse range, versePad holds the last verse of the range.
        chunk_name = state.versePad + ".usx"
        chunk_path = os.path.join(state.target_chapter_dir, chunk_name)
        state.setUsxOutput(io.open(chunk_path, "tw", encoding="utf-8", newline='\n'))
    verse_tag = '<verse number="' + v + '" style="v" />'
    state.usxOutput.write(verse_tag)
def takeText(t):
    """Write text t to the current .usx file, followed by a blank line.

    Text that appears before the first verse of a chapter is not written; it
    is reported as an error instead. Either way the state is told that text
    has been seen.
    """
    if state.verse <= 0:
        reportError("Unhandled text before verse 1. See " + state.reference)
    elif state.usxOutput:
        state.usxOutput.write(t + "\n\n")
    state.addText()
def take(token):
    """Handle one USFM token as the USFM file is parsed.

    Reports an empty verse when a verse marker is followed by a non-text
    token, dispatches the token to its handler, and remembers it in the
    module-level lastToken. Token types without a handler are reported,
    except 'ide' and 'toc3', which are silently ignored.
    """
    global lastToken

    if state.needVerseText() and not token.isTEXT():
        reportError("Empty verse: " + state.reference)

    if token.isID():
        takeID(token.value)
    elif token.isH() or token.isTOC1() or token.isTOC2() or token.isMT():
        # Each of these markers may supply the book title.
        state.addTitle(token.value, token.isMT())
    elif token.isC():
        takeC(token.value)
    elif token.isCL():
        takeCL(token.value)
    # Section headings (token.isS() -> takeS) are ignored currently.
    elif token.isS5():
        # \s5 marks a chunk boundary: finish the current file.
        closeUsx()
    elif token.isV():
        takeV(token.value)
    elif token.isTEXT():
        takeText(token.value)
    elif (token.isP() or token.isPI() or token.isPC() or token.isNB() or token.isQ()
            or token.isQ1() or token.isQA() or token.isSP() or token.isQR() or token.isQC()):
        takeP(token.type)
    elif token.isF_S():
        takeF(token.value)
    elif token.isFT() or token.isFQA():
        takeFTFQA(token.type, token.value)
    elif token.isF_E():
        takeFE()
    elif token.type not in {'ide', 'toc3'}:
        reportError("Unhandled token: " + token.type)

    lastToken = token
def closeUsx():
    """Close the current .usx file, if one is open, and clear it from the state.

    Called when a \\s5 chunk marker occurs, at the start of every chapter, and
    when a file has been fully parsed.
    """
    current = state.usxOutput
    if current:
        current.close()
        state.setUsxOutput(None)
def reportError(msg):
    """Report an error message to the GUI (when there is one) and to stderr."""
    reportToGui(msg, '<<ScriptMessage>>')
    line = msg + '\n'
    sys.stderr.write(line)
def reportStatus(msg):
    """Report a status message to the GUI (when there is one) and print it to stdout.

    The GUI receives it as a '<<ScriptMessage>>' event.
    """
    message_event = '<<ScriptMessage>>'
    reportToGui(msg, message_event)
    print(msg)
def reportProgress(msg):
    """Report a progress message to the GUI (when there is one) and print it to stdout.

    The GUI receives it as a '<<ScriptProgress>>' event.
    """
    progress_event = '<<ScriptProgress>>'
    reportToGui(msg, progress_event)
    print(msg)
def reportToGui(msg, event):
    """Append msg to the GUI's pending progress text and queue the named event.

    Does nothing when no GUI is set. The text is updated while holding
    gui.progress_lock; messages are separated by newlines.
    """
    if not gui:
        return
    with gui.progress_lock:
        if gui.progress:
            gui.progress = f"{gui.progress}\n{msg}"
        else:
            gui.progress = msg
    gui.event_generate(event, when="tail")
def makeTargetDirs(target_book_dir):
    """Create target_book_dir and a "content" folder beneath it, where missing."""
    content_dir = os.path.join(target_book_dir, "content")
    # Parent first, so the content folder can be created inside it.
    for folder in (target_book_dir, content_dir):
        if not os.path.isdir(folder):
            os.mkdir(folder)
def makeChapterDir(chap):
    """Create the folder named chap under the target content directory, if missing."""
    chapter_dir = os.path.join(state.target_content_dir, chap)
    already_there = os.path.isdir(chapter_dir)
    if not already_there:
        os.mkdir(chapter_dir)
# Matches an \id marker at the start of a line and captures the three-character book code.
idcode_re = re.compile(r'\\id +([\w][\w][\w])')


def getBookId(usfmpath):
    """Return the book identifier from the \\id marker on the first line of a USFM file.

    Only the first line is read; a leading UTF-8 BOM is skipped. If that line
    does not begin with an \\id marker followed by a three-character code, an
    error is reported and an empty string is returned.
    """
    # The with-block closes the file even if reading or decoding fails.
    with io.open(usfmpath, "tr", encoding="utf-8-sig") as usfm_file:
        first_line = usfm_file.readline()
    if idcode := idcode_re.match(first_line):
        return idcode.group(1)
    reportError("USFM file does not start with standard \\id marker.")
    return ""
def createManifest(en_book_dir, target_book_dir):
    """Write a customised package.json into target_book_dir.

    The manifest is a modified copy of the package.json in en_book_dir.
    Language, project and resource fields are filled in from the
    module-level config, the current state (book ID and title) and
    usfm_verses.verseCounts. 'modified_at' is set to today's date as the
    integer YYYYMMDD.
    """
    en_manifest_path = os.path.join(en_book_dir, "package.json")
    # with-blocks ensure both file handles are closed, even on a parse or write error.
    with io.open(en_manifest_path, "tr", encoding='utf-8-sig') as jsonFile:
        package = json.load(jsonFile)

    today = date.today()
    package['modified_at'] = today.year * 10000 + today.month * 100 + today.day

    language = package['language']
    language['slug'] = config['language_code']
    language['name'] = config['language_name']
    language['direction'] = config['direction']

    book_code = state.ID.lower()
    sort_order = usfm_verses.verseCounts[state.ID.upper()]['sort']
    # Sort positions below 40 are treated as Old Testament books.
    category = "bible-ot" if sort_order < 40 else "bible-nt"
    project = package['project']
    project['slug'] = book_code
    project['name'] = state.title
    project['sort'] = sort_order
    project['chunks_url'] = "https://api.unfoldingword.org/bible/txt/1/" + book_code + "/chunks.json"
    project['category_slug'] = category
    project['categories'] = [category]

    resource = package['resource']
    resource['slug'] = config['bible_id']
    resource['name'] = config['bible_name']
    resource['status']['pub_date'] = config['pub_date']
    resource['status']['license'] = config['license']
    resource['status']['version'] = config['version']

    path = os.path.join(target_book_dir, "package.json")
    with io.open(path, "tw", encoding='utf-8', newline='\n') as jsonFile:
        json.dump(package, jsonFile, ensure_ascii=False, indent=2)
    removeBOM(path)     # because tStudio/BTTW chokes on BOM
def createChapterTitleFile(title):
    """Create or overwrite title.usx in the current target chapter folder with title."""
    title_path = os.path.join(state.target_chapter_dir, "title.usx")
    with io.open(title_path, "tw", encoding="utf-8", newline='\n') as title_file:
        title_file.write(title)
def createBookTitleFile():
    """Write the book title to front/title.usx under the target content folder.

    The "front" folder is created if missing. The file is written with
    whatever title the state holds, even if that is an empty string.
    """
    frontFolder = os.path.join(state.target_content_dir, 'front')
    if not os.path.isdir(frontFolder):
        os.mkdir(frontFolder)
    # The with-block closes the file even if the write fails.
    with io.open(os.path.join(frontFolder, 'title.usx'), 'tw', encoding="utf-8") as output:
        output.write(state.title)
def createToc(en_content_dir, content_dir):
    """Put a table of contents file into content_dir.

    Temporary implementation: copies the English 'toc.yml' from
    en_content_dir unchanged.
    """
    # TODO: implement a better solution
    copy(os.path.join(en_content_dir, 'toc.yml'), content_dir)     # copy() is from shutil
def convertFile(usfmpath, bookId):
    """Convert a single USFM file into a usx resource container.

    The container folder is <language_code>_<book>_<bible_id> under the
    configured rc_dir. The matching English container (en_<book>_ulb) must
    already exist there, because its LICENSE.md, package.json, config.yml and
    toc.yml are reused; if it is missing, an error is reported and nothing is
    converted. On success the module-level nConverted counter is incremented.
    """
    global nConverted

    rc_dir = config['rc_dir']
    book = bookId.lower()
    en_book_dir = os.path.join(rc_dir, "en_" + book + "_ulb")
    target_book_dir = os.path.join(rc_dir, config['language_code'] + "_" + book + "_" + config['bible_id'])
    if not os.path.isdir(en_book_dir):
        reportError("English book folder not found: " + en_book_dir)
        return

    makeTargetDirs(target_book_dir)
    en_content_dir = os.path.join(en_book_dir, "content")
    reportProgress("CONVERTING " + usfmpath)

    # The with-block closes the input file even if reading or decoding fails.
    with io.open(usfmpath, "tr", encoding="utf-8-sig") as usfm_file:
        usfm_text = usfm_file.read()

    try:
        for token in parseUsfm.parseString(usfm_text):
            take(token)
    finally:
        # Never leave a .usx chunk file open, even when a token handler raises.
        closeUsx()

    copy(os.path.join(en_book_dir, 'LICENSE.md'), target_book_dir)
    createManifest(en_book_dir, target_book_dir)
    copy(os.path.join(en_content_dir, 'config.yml'), state.target_content_dir)     # copy() is from shutil
    createToc(en_content_dir, state.target_content_dir)
    createBookTitleFile()
    nConverted += 1
def processFile(usfmpath):
    """Convert the USFM file at usfmpath, provided its book ID can be determined.

    A file whose first line yields no book ID is reported as invalid and skipped.
    """
    bookId = getBookId(usfmpath)
    if bookId:
        convertFile(usfmpath, bookId)
        return
    reportError("Invalid USFM file: " + usfmpath)
def convertDir(dir):
    """Process a whole folder of USFM files, recursively.

    Subfolders whose names start with '.' are skipped. Regular files whose
    names end in "sfm" are passed to processFile().
    """
    for name in os.listdir(dir):
        full_path = os.path.join(dir, name)
        hidden = name[0] == '.'
        if not hidden and os.path.isdir(full_path):
            convertDir(full_path)
            continue
        if name.endswith("sfm") and os.path.isfile(full_path):
            processFile(full_path)
def make_dir(folder):
    """Create folder if necessary and return whether it exists afterwards.

    Only the last path component is created: if the parent folder does not
    exist, nothing is created and False is returned.
    """
    if os.path.isdir(folder):
        return True
    parent_exists = os.path.isdir(os.path.dirname(folder))
    if parent_exists:
        os.mkdir(folder)
    return os.path.isdir(folder)
def main(app = None):
    """Run the conversion using the 'Usfm2Usx' configuration section.

    app is an optional GUI object; when given, messages are forwarded to it
    and a '<<ScriptEnd>>' event is generated at the end. If the configuration
    names a file, only that file (inside source_dir) is converted; otherwise
    source_dir is converted recursively.
    """
    global nConverted, gui, config, state

    nConverted = 0
    gui = app
    state = State()
    config = configmanager.ToolsConfigManager().get_section('Usfm2Usx')    # configmanager version
    if config:
        source_dir = config['source_dir']
        rc_dir = config['rc_dir']
        if not make_dir(rc_dir):
            reportError("Invalid resource_containers folder: " + rc_dir)
        elif not os.path.isdir(source_dir):
            reportError("Invalid source folder: " + source_dir)
        else:
            filename = config['filename']
            if not filename:
                convertDir(source_dir)
            else:
                path = os.path.join(source_dir, filename)
                if os.path.isfile(path):
                    processFile(path)
                else:
                    reportError(f"No such file: {path}")

    if nConverted > 0:
        reportStatus(f"\nDone. Converted {nConverted} book(s).")
    else:
        reportStatus("No books were successfully converted.")

    sys.stdout.flush()
    if gui:
        gui.event_generate('<<ScriptEnd>>', when="tail")
# Run the conversion without a GUI when this file is executed as a script.
if __name__ == "__main__":
    main()