forked from UMass-Rescue/Speech-to-Text
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.py
47 lines (39 loc) · 1.54 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import sys, os, time, logging, json
import subprocess

from pydub import AudioSegment
def mp3_to_wav(source, destination="tmp.wav"):
    """Convert an MP3 file to WAV and return the path of the WAV file.

    Args:
        source: Path of the MP3 file to decode.
        destination: Path the WAV data is exported to. Defaults to
            "tmp.wav" (relative to the current working directory), which
            is the location existing callers rely on.

    Returns:
        The ``destination`` path that was passed in (or its default).
    """
    sound = AudioSegment.from_mp3(source)
    sound.export(destination, format="wav")
    return destination
# Log transcription results (confidence, timing, cleanup notes) to stt.log.
logging.basicConfig(filename='stt.log', level=logging.INFO)

program_name = sys.argv[0]
arguments = sys.argv[1:]
file_path = ""
file_name = ""

# Transcribe every file named on the command line, one after another.
# For "<name>.mp3" this writes "<name>_transcript.json" (raw DeepSpeech JSON
# output) and "<name>_transcript.txt" (the recognised words, space-separated)
# into the current working directory.
for file_path in arguments:
    start_time = time.time()
    file_name = os.path.basename(file_path)
    name_ext = os.path.splitext(file_name)
    transcript = "{}_transcript".format(name_ext[0])

    if name_ext[1].lower() != ".mp3":
        # Only MP3 input is converted and transcribed. Skipping here avoids
        # logging a missing or stale result for a file that was never
        # processed.
        logging.warning('Skipping %s: only .mp3 input is supported', file_path)
        continue

    try:
        wav_tmp = mp3_to_wav(file_path)
        json_path = "{}.json".format(transcript)

        # Argument list instead of a shell string, so file names containing
        # spaces or shell metacharacters are passed through untouched.
        command = [
            "deepspeech",
            "--json",
            "--model", "deepspeech-0.6.1-models/output_graph.pbmm",
            "--lm", "deepspeech-0.6.1-models/lm.binary",
            "--trie", "deepspeech-0.6.1-models/trie",
            "--audio", wav_tmp,
        ]
        # Capture stdout into the JSON file (replaces the shell "> file"
        # redirect); check_call raises if deepspeech exits non-zero rather
        # than continuing with an empty or partial file.
        with open(json_path, 'w') as json_out:
            subprocess.check_call(command, stdout=json_out)

        # Read back the file that was just produced for THIS input.
        with open(json_path, 'r') as f:
            trans = json.load(f)

        # One recognised word per entry; each word is followed by a space,
        # including the last one (same text layout as before).
        script = "".join(word["word"] + " " for word in trans["words"])
        with open('{}.txt'.format(transcript), 'w') as f:
            f.write(script)

        elapsed_time = time.time() - start_time
        logging.info('Confidence: {}'.format(trans["confidence"]))
        logging.info('Elapsed Time: {:0.2f} seconds'.format(elapsed_time))
    finally:
        # Always remove the intermediate WAV, even when a step above failed,
        # so a leftover file cannot leak into the next run.
        if os.path.exists('tmp.wav'):
            os.remove('tmp.wav')
        else:
            logging.info('tmp.wav did not exist')