Skip to content

Commit

Permalink
refactor: streamline JSON conversion functions and improve error handling for unsupported file types
Browse files Browse the repository at this point in the history
  • Loading branch information
codeperfectplus committed Nov 26, 2024
1 parent 2369b93 commit 341a182
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 138 deletions.
13 changes: 1 addition & 12 deletions audiobook/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,9 @@

from audiobook.config import speed_dict
from audiobook.utils import (
docs_to_json,
epub_to_json,
html_to_json,
load_json,
mobi_to_json,
odt_to_json,
pdf_to_json,
speak_text,
txt_to_json,
write_json_file,
rtf_to_json
)
from audiobook.utils import get_json_metadata

Expand Down Expand Up @@ -73,10 +65,7 @@ def create_json_book(self, input_book_path, password=None, load_from_library=Fal
metadata["pages"] = len(json_book)
return json_book, metadata

if json_book:
json_book, metadata = get_json_metadata(input_book_path=input_book_path, password=password)
else:
raise NotImplementedError("Only PDF, TXT, EPUB, MOBI, ODT, HTTP, RTF, DOCX and DOC files are supported")
json_book, metadata = get_json_metadata(input_book_path=input_book_path, password=password)

write_json_file(json_book, output_file_path)

Expand Down
203 changes: 77 additions & 126 deletions audiobook/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json
import os
import re

import docx2txt
import ebooklib
import html2text
Expand All @@ -11,235 +10,187 @@
from odf import text, teletype
from odf.opendocument import load
from striprtf.striprtf import rtf_to_text

from audiobook.doc_parser.web_parser import ArticleWebScraper
from audiobook.doc_parser.pdf_parser import PyPDF2DocParser
# from audiobook.doc_parser.pdf_parser import PdfMinerDocParser


# Helper function to load JSON data from a file
def load_json(filename):
    """Read the JSON document stored at *filename* and return the parsed object.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(filename, "r", encoding="utf-8") as fp:
        return json.load(fp)


# Helper function to write JSON data to a file
def write_json_file(json_data, filename):
    """Serialise *json_data* as JSON into *filename*, replacing any old content.

    The file is written as UTF-8 explicitly so the output does not depend on
    the platform's default text encoding.

    Args:
        json_data: Any object ``json.dump`` can serialise.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, "w", encoding="utf-8") as fp:
        json.dump(json_data, fp)


# Text preprocessing: removes unwanted characters and extra spaces
def text_preprocessing(input_text):
    """Normalise whitespace in *input_text* so it can be paginated and spoken.

    Newlines, carriage returns and tabs are each turned into a space (not
    deleted) so that words on either side of a line break stay separate.
    Runs of spaces are then collapsed to one and the ends are trimmed.

    Args:
        input_text: Raw text extracted from a document.

    Returns:
        The cleaned, single-line string.
    """
    # Substitute a space rather than "" -- deleting the break would glue
    # "hello\nworld" into "helloworld".
    preprocessed_text = re.sub(r"[\n\r\t]", " ", input_text)
    preprocessed_text = re.sub(" +", " ", preprocessed_text).strip()
    return preprocessed_text


# Extract text content from HTML, preprocess it
def response_to_text(chapter):
    """Convert an HTML fragment to cleaned plain text.

    Used for EPUB chapters (and potentially other HTML input). Only the text
    of ``<p>`` elements is kept; the paragraphs are joined with single spaces
    and passed through ``text_preprocessing``.

    Args:
        chapter: HTML markup accepted by ``BeautifulSoup``.

    Returns:
        The preprocessed text of all paragraphs in document order.
    """
    soup = BeautifulSoup(chapter, "html.parser")
    extracted_text = " ".join(para.get_text() for para in soup.find_all("p"))
    return text_preprocessing(extracted_text)

# Speak the given text using the engine
def speak_text(engine, text, display=True):
    """Optionally print *text*, then have *engine* read it aloud.

    Args:
        engine: Speech object exposing ``say(text)`` and ``runAndWait()``.
        text: The passage to speak.
        display: When truthy, the passage is printed before it is spoken.
    """
    if display:
        print(text)

    # Queue the utterance, then block until the engine has finished it.
    engine.say(text)
    engine.runAndWait()


# Helper function to convert mobi files to JSON format
def mobi_to_json(input_book_path):
    """Create a paginated JSON book from a MOBI file.

    The book is unpacked with ``mobi.extract``, the extracted file is read as
    UTF-8 text, converted with ``html2text`` and cleaned with
    ``text_preprocessing``. The result is split into pages of at most 2000
    characters keyed by the page number as a string.

    Args:
        input_book_path: Path of the ``.mobi`` file.

    Returns:
        ``(json_book, metadata)`` where ``metadata`` holds ``"pages"`` and
        ``"book_name"`` (file name up to its first dot).
    """
    import shutil  # local import: only needed to clean up the extraction dir

    book_name = os.path.basename(input_book_path).split(".")[0]
    tempdir, filepath = mobi.extract(input_book_path)
    try:
        with open(filepath, "r", encoding="utf-8") as fp:
            content = fp.read()
    finally:
        # mobi.extract leaves its working directory behind; remove it once
        # the content is in memory so repeated conversions do not pile up.
        shutil.rmtree(tempdir, ignore_errors=True)

    book_data = text_preprocessing(html2text.html2text(content))

    # Split content into chunks of 2000 characters
    json_book = {
        str(i // 2000): book_data[i:i + 2000]
        for i in range(0, len(book_data), 2000)
    }

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata


# Helper function to convert PDF to JSON format
def pdf_to_json(input_book_path, password=None):
    """Create a paginated JSON book from a PDF file.

    Text is extracted with ``PyPDF2DocParser`` (the only parser wired in
    here), cleaned with ``text_preprocessing`` and split into pages of at
    most 2000 characters keyed by the page number as a string.

    Args:
        input_book_path: Path of the PDF file.
        password: Forwarded to ``PyPDF2DocParser.get_text``; ``None`` by
            default.

    Returns:
        ``(json_book, metadata)`` where ``metadata`` holds ``"book_name"``
        (file name up to its first dot) and ``"pages"``.
    """
    basename = os.path.basename(input_book_path).split(".")[0]

    pdf_parser = PyPDF2DocParser()
    # Named pdf_text so the module-level ``text`` import from odf is not shadowed.
    pdf_text = pdf_parser.get_text(input_book_path, password=password)
    pdf_text = text_preprocessing(pdf_text)

    json_book = {
        str(i // 2000): pdf_text[i:i + 2000]
        for i in range(0, len(pdf_text), 2000)
    }

    metadata = {"book_name": basename, "pages": len(json_book)}
    return json_book, metadata


# Helper function to convert ODT files to JSON format
def odt_to_json(input_book_path):
    """Create a paginated JSON book from an ODT file.

    Every ``text.P`` paragraph of the document is extracted with
    ``teletype.extractText``, the paragraphs are joined with single spaces,
    cleaned with ``text_preprocessing`` and split into pages of at most 2000
    characters keyed by the page number as a string.

    Args:
        input_book_path: Path of the ``.odt`` file.

    Returns:
        ``(json_book, metadata)`` where ``metadata`` holds ``"book_name"``
        (file name up to its first dot) and ``"pages"``.
    """
    basename = os.path.basename(input_book_path).split(".")[0]

    textdoc = load(input_book_path)
    paragraphs = textdoc.getElementsByType(text.P)
    output_text = " ".join(teletype.extractText(para) for para in paragraphs)
    output_text = text_preprocessing(output_text)

    json_book = {
        str(i // 2000): output_text[i:i + 2000]
        for i in range(0, len(output_text), 2000)
    }

    metadata = {"book_name": basename, "pages": len(json_book)}
    return json_book, metadata


# Helper function to convert TXT files to JSON format
def txt_to_json(input_book_path):
    """Create a paginated JSON book from a plain-text file.

    The file content is cleaned with ``text_preprocessing`` and split into
    pages of at most 2000 characters keyed by the page number as a string.

    Args:
        input_book_path: Path of the ``.txt`` file.

    Returns:
        ``(json_book, metadata)`` where ``metadata`` holds ``"pages"`` and
        ``"book_name"`` (file name up to its first dot).
    """
    book_name = os.path.basename(input_book_path).split(".")[0]

    with open(input_book_path, "r") as fp:
        file_txt_data = fp.read()

    file_txt_data = text_preprocessing(file_txt_data)

    json_book = {
        str(i // 2000): file_txt_data[i:i + 2000]
        for i in range(0, len(file_txt_data), 2000)
    }

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata


# Helper function to convert RTF files to JSON format
def rtf_to_json(input_book_path):
    """Create a paginated JSON book from an RTF file.

    The RTF markup is converted to plain text with ``rtf_to_text`` (called
    with ``errors="ignore"``), cleaned with ``text_preprocessing`` and split
    into pages of at most 2000 characters keyed by the page number as a
    string.

    Args:
        input_book_path: Path of the ``.rtf`` file.

    Returns:
        ``(json_book, metadata)`` where ``metadata`` holds ``"pages"`` and
        ``"book_name"`` (file name up to its first dot).
    """
    book_name = os.path.basename(input_book_path).split(".")[0]

    with open(input_book_path, "r") as fp:
        file_rtf_data = fp.read()

    file_txt_data = rtf_to_text(file_rtf_data, errors="ignore")
    file_txt_data = text_preprocessing(file_txt_data)

    json_book = {
        str(i // 2000): file_txt_data[i:i + 2000]
        for i in range(0, len(file_txt_data), 2000)
    }

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata


# Helper function to convert DOCX files to JSON format
def docs_to_json(input_book_path):
    """Create a paginated JSON book from a Word document.

    The text returned by ``docx2txt.process`` is split, as-is, into pages of
    at most 2000 characters keyed by the page number as a string. Unlike the
    other converters in this file, the text is not passed through
    ``text_preprocessing``.

    Args:
        input_book_path: Path of the document.

    Returns:
        ``(json_book, metadata)`` where ``metadata`` holds ``"pages"`` and
        ``"book_name"`` (file name up to its first dot).
    """
    book_name = os.path.basename(input_book_path).split(".")[0]

    book_data = docx2txt.process(input_book_path)

    json_book = {
        str(i // 2000): book_data[i:i + 2000]
        for i in range(0, len(book_data), 2000)
    }

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata


# Helper function to convert EPUB files to JSON format
def epub_to_json(input_book_path):
    """Create a paginated JSON book from an EPUB file.

    Each ``ebooklib.ITEM_DOCUMENT`` item is converted with
    ``response_to_text``; the chapter texts are joined with single spaces and
    split into pages of at most 2000 characters keyed by the page number as a
    string.

    Args:
        input_book_path: Path of the ``.epub`` file.

    Returns:
        ``(json_book, metadata)`` where ``metadata`` holds ``"pages"`` and
        ``"book_name"`` (file name up to its first dot).
    """
    book_name = os.path.basename(input_book_path).split(".")[0]

    book = epub.read_epub(input_book_path)
    # Named book_text so the module-level ``text`` import from odf is not shadowed.
    book_text = " ".join(
        response_to_text(chapter.get_body_content())
        for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    )

    # Start at offset 0: starting at 1 silently dropped the book's first
    # character and shifted every page boundary by one.
    json_book = {
        str(i // 2000): book_text[i:i + 2000]
        for i in range(0, len(book_text), 2000)
    }

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata


# Helper function to convert HTML (web) content to JSON format
def html_to_json(url):
    """Create a paginated JSON book from a web article.

    The page text is obtained from ``ArticleWebScraper(url).get_page_data()``,
    cleaned with ``text_preprocessing`` and split into pages of at most 2000
    characters keyed by the page number as a string.

    Args:
        url: Address of the article.

    Returns:
        ``(json_book, metadata)`` where ``metadata`` holds ``"pages"`` and
        ``"book_name"`` (last path segment of the URL up to its first dot).
    """
    # Drop trailing slashes first, otherwise "https://site/post/" yields an
    # empty basename and therefore an empty book name.
    book_name = os.path.basename(url.rstrip("/")).split(".")[0]

    article_scraper = ArticleWebScraper(url)
    page_data = article_scraper.get_page_data()
    page_data = text_preprocessing(page_data)

    json_book = {
        str(i // 2000): page_data[i:i + 2000]
        for i in range(0, len(page_data), 2000)
    }

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata


# Main function to determine the file type and call respective methods
def get_json_metadata(input_book_path, password=None):
    """Pick the converter for *input_book_path* and return its result.

    Dispatch order:

    1. The lower-cased text after the last dot selects a converter
       (``pdf``, ``odt``, ``txt``, ``epub``, ``mobi``, ``html``, ``docx``,
       ``doc``, ``rtf``).
    2. Otherwise, an ``http://`` / ``https://`` address is treated as a web
       article and handed to ``html_to_json``.

    Args:
        input_book_path: File path or URL of the book.
        password: Only used for PDF files, where it is forwarded to
            ``pdf_to_json``.

    Returns:
        The ``(json_book, metadata)`` tuple produced by the converter.

    Raises:
        NotImplementedError: If no converter matches the input.
    """
    file_extension = input_book_path.split(".")[-1].lower()

    # pdf_to_json is the only converter that accepts a password, so it is
    # called separately instead of passing ``password`` to every converter.
    if file_extension == "pdf":
        return pdf_to_json(input_book_path, password)

    file_to_json = {
        "odt": odt_to_json,
        "txt": txt_to_json,
        "epub": epub_to_json,
        "mobi": mobi_to_json,
        "html": html_to_json,
        "docx": docs_to_json,
        "doc": docs_to_json,
        "rtf": rtf_to_json,
    }

    converter = file_to_json.get(file_extension)
    if converter is not None:
        return converter(input_book_path)

    # URLs without a recognised extension (e.g. "https://site.com/article")
    # are web articles.
    if input_book_path.lower().startswith(("http://", "https://")):
        return html_to_json(input_book_path)

    raise NotImplementedError(f"Unsupported file type: {file_extension}")

0 comments on commit 341a182

Please sign in to comment.