Skip to content

Commit

Permalink
Merge pull request #39 from Cannon07/upload_book_script
Browse files Browse the repository at this point in the history
Upload book script
  • Loading branch information
rbharath authored May 20, 2024
2 parents d6d66bd + 86fdede commit b21c946
Show file tree
Hide file tree
Showing 18 changed files with 49,062 additions and 20 deletions.
3 changes: 2 additions & 1 deletion new-website/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ A detailed description of the working of the scripts is given below.
- ### `build_pdf_book.py`

- The script reads the list of notebooks from `utils/tutorials/website-render-order` and converts the HTML files (downloaded temporarily to `/utils/tutorials/html-notebooks`) to PDF files using `pdfkit` and stores them in `/utils/tutorials/storage/`.
- The script then merged these PDFs and creates the file `merged.pdf`.
- The script then merges these PDFs and creates the file `merged.pdf`.
- The `merged.pdf` file is then uploaded to the S3 bucket.
- Please note: the `pdfunite` tool is required for merging the PDFs. It is provided by the `poppler-utils` package and can be installed with `apt install poppler-utils`.


Expand Down
Binary file modified new-website/utils/requirements.txt
Binary file not shown.
130 changes: 112 additions & 18 deletions new-website/utils/tutorials/build_pdf_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- pdfunite
- pdfkit
- mdpdf
- boto3
Example Usage:
- Run the script "fetch_tutorials.py" // It will fetch all the tutorials.
Expand All @@ -23,42 +24,131 @@
import pdfkit
from utils import numeric_sorter
from typing import List
import signal
import logging
import boto3
from botocore.exceptions import ClientError

# Directory containing the tutorial render-order CSV files read by
# html_to_pdf() and merge_pdf().
INFO_PATH = "website-render-order/"
# Directory containing the HTML notebooks that are converted to PDF.
DATA_PATH = "html-notebooks/"
# Directory where the generated PDF files are written.
# NOTE(review): all three paths are relative and are joined by plain string
# concatenation, so the trailing slash is required — presumably the script is
# meant to be run from the tutorials directory; confirm.
PDF_PATH = "storage/"

INFO_PATH = "/workspaces/deepchem.github.io/new-website/utils/tutorials/website-render-order/"
DATA_PATH = "/workspaces/deepchem.github.io/new-website/utils/tutorials/html-notebooks/"
PDF_PATH = "/workspaces/deepchem.github.io/new-website/utils/tutorials/storage/"

files = os.listdir(INFO_PATH)
files = sorted(files)
def timeout_handler(signum, frame):
    """Signal handler that aborts a conversion which has run for too long.

    Intended to be registered for ``SIGALRM``; raising from the handler
    interrupts whatever call is executing when the alarm fires.

    Parameters
    ----------
    signum: int
        Number of the signal that was delivered (unused).
    frame: frame object
        Stack frame that was executing when the signal arrived (unused).

    Raises
    ------
    TimeoutError
        Always. ``TimeoutError`` is a subclass of ``Exception``, so callers
        that catch ``Exception`` continue to work.
    """
    raise TimeoutError("Conversion timed out")


def html_to_pdf(data_path=DATA_PATH, info_path=INFO_PATH, pdf_path=PDF_PATH):
    """Convert the HTML notebooks listed in the render order to PDF files.

    Every file in ``info_path`` is read as a CSV and each entry of its
    "File Name" column is converted with ``pdfkit``. A conversion is given
    at most 60 seconds; failures (including timeouts) are printed and the
    loop moves on to the next notebook.

    Parameters
    ----------
    data_path: str
        Path of the html files to be converted. Defaults to DATA_PATH.
    info_path: str
        Path for Tutorial Render Order. Defaults to INFO_PATH.
    pdf_path: str
        Path where the converted pdf files will be stored. Defaults to PDF_PATH.

    Notes
    -----
    Errors raised while converting a single notebook (missing HTML file,
    pdfkit errors, the 60 second timeout) are caught and reported, not
    propagated. Errors from listing ``info_path`` or reading a render-order
    CSV are not caught.
    """
    files_list = numeric_sorter(sorted(os.listdir(info_path)))

    # Install the timeout handler once and restore the previous one on exit
    # so the rest of the program is not affected by it.
    previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
    try:
        for i in files_list:
            # NOTE(review): numeric_sorter appears to return the file names
            # split on "-"; joining restores the original name — confirm.
            chapter = pd.read_csv(info_path + "-".join(i))
            for j in chapter["File Name"]:
                # Drop surrounding whitespace and the trailing 5 characters
                # (presumably "ipynb"), keeping the dot before the extension.
                stem = j.strip()[:-5]
                signal.alarm(60)
                try:
                    print(i, j)
                    pdfkit.from_file(data_path + stem + "html", pdf_path + stem + "pdf")
                    print("Conversion Successful")
                except Exception as e:
                    print("Exception occured: {}".format(e))
                finally:
                    # Cancel the pending alarm; otherwise it fires later, outside
                    # this try block, and aborts unrelated code.
                    signal.alarm(0)
    finally:
        signal.signal(signal.SIGALRM, previous_handler)


def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket.

    Parameters
    ----------
    file_name: str
        Path of the File to be uploaded.
    bucket: str
        Name of the Bucket to upload the file to.
    object_name: str
        S3 object name. If not specified then the base name of ``file_name``
        (the path without its directory part) is used.

    Returns
    -------
    boolean:
        True if file was uploaded, else False
    """
    # Imported locally so this function does not depend on the module's
    # import block. boto3's managed transfer wraps failed uploads in
    # S3UploadFailedError rather than letting ClientError propagate.
    from boto3.exceptions import S3UploadFailedError

    # If S3 object_name was not specified, use the base name of file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except (ClientError, S3UploadFailedError) as e:
        logging.error(e)
        return False
    return True


def merge_pdf(info_path=INFO_PATH, pdf_path=PDF_PATH):
    """Merge the compiled tutorial PDFs into ``<pdf_path>merged.pdf``.

    The render-order CSVs in ``info_path`` determine the order. Entries
    whose PDF does not exist in ``pdf_path`` (for example because the
    conversion failed) are skipped. Merging is done by the external
    ``pdfunite`` tool.

    Parameters
    ----------
    info_path: str
        Path for Tutorial Render Order. Defaults to INFO_PATH.
    pdf_path: str
        Path where the merged pdf file will be stored. Defaults to PDF_PATH.
    """
    # Local import keeps this function independent of the module's import block.
    import subprocess

    files_list = numeric_sorter(sorted(os.listdir(info_path)))

    pdf_files = []
    for i in files_list:
        print(i)
        chapter = pd.read_csv(info_path + "-".join(i))
        for j in chapter["File Name"]:
            # Strip whitespace exactly as html_to_pdf does when it names the
            # output, so padded entries are found here as well.
            file_path = pdf_path + j.strip()[:-5] + "pdf"
            if os.path.exists(file_path):
                print(i, j)
                pdf_files.append(file_path)

    if not pdf_files:
        # pdfunite needs at least one input file; nothing to merge.
        print("No PDF files found to merge")
        return

    # Pass arguments as a list (no shell) so file names containing spaces or
    # shell metacharacters are handled correctly.
    command = ["pdfunite", *pdf_files, pdf_path + "merged.pdf"]
    try:
        result = subprocess.run(command, check=False)
    except FileNotFoundError:
        print("pdfunite not found; install poppler-utils to merge PDFs")
        return
    if result.returncode != 0:
        print("pdfunite exited with status {}".format(result.returncode))


def merge_pdf_pages(a: List[str]):
"""Merges the PDFs.
Expand All @@ -73,7 +163,8 @@ def merge_pdf_pages(a: List[str]):
command = "pdfunite "
for i in a:
command = command + i + ' '
os.system(command + "storage/merged.pdf")
os.system(command + "storage/full_pdf.pdf")


def compile_information_pages():
"""Converts the Acknowledgent page and content page from
Expand All @@ -85,9 +176,12 @@ def compile_information_pages():
pdfkit.from_file('contents.html', 'storage/contents.pdf')
pdfkit.from_file('acknowledgement.html', 'storage/acknowledgement.pdf')


if __name__ == "__main__":
    # Create the output directory; a no-op when it already exists.
    os.makedirs(PDF_PATH, exist_ok=True)
    # Convert every notebook, then merge them into storage/merged.pdf.
    html_to_pdf()
    merge_pdf()
    # Build the front matter and prepend it to the merged tutorials;
    # merge_pdf_pages writes the result to storage/full_pdf.pdf.
    compile_information_pages()
    merge_pdf_pages(['storage/title.pdf', 'storage/acknowledgement.pdf', 'storage/contents.pdf', 'storage/merged.pdf'])
    # Publish the finished book and fail the run if the upload did not succeed.
    if not upload_file('storage/full_pdf.pdf', 'deepchemtutorials', 'TutorialsBook.pdf'):
        raise SystemExit("Upload of storage/full_pdf.pdf failed")

Loading

0 comments on commit b21c946

Please sign in to comment.