Merge pull request #33 from GreatRSingh/main

rbharath · web-flow · commit ccd63e6ed3ac · 2023-12-12T09:32:31.000-08:00
Build PDF Book
diff --git a/new-website/.gitignore b/new-website/.gitignore
@@ -2,5 +2,6 @@
 /docs
 /utils/tutorials/html-notebooks
 /utils/tutorials/ipynb-notebooks
+/utils/tutorials/storage
 /utils/tutorials/website-render-order
 /utils/tutorials/notebooks.txt
diff --git a/new-website/README.md b/new-website/README.md
@@ -80,13 +80,18 @@ A detailed description of the working of the scripts is given below.
     - The CSV file itself contains the Titles and File names of the tutorials in the order in which they should be read.
 
 - ### `export_tutorials.py`
+
   - This script reads the list of notebooks from `/utils/tutorials/notebooks.txt` and parses the HTML files (downloaded temporarily to `/utils/tutorials/html-notebooks`) using `BeautifulSoup`.
   - The script then creates a react component for each tutorial and exports it to the `/deepchem/pages/tutorials` directory.
   - The script also creates a json data file for each tutorial and exports it to the `/deepchem/data/tutorials` directory.
   - The template for the react components is stored in `utils/tutorials/tutorial_component_template.py`.
     Please note, that any files required by scripts are generated by the scripts themselves and are not stored in the repository.
 
+- ### `build_pdf_book.py`
 
+  - The script reads the list of notebooks from `utils/tutorials/website-render-order` and converts the HTML files (downloaded temporarily to `/utils/tutorials/html-notebooks`) to PDF files using `pdfkit` and stores them in `/utils/tutorials/storage/`.
+  - The script then merges these PDFs and creates the file `merged.pdf`.
+    - Please note that the `pdfunite` tool must be installed for merging; it is provided by the `poppler-utils` package (`apt install poppler-utils`).
 
 
 ## Deployment
diff --git a/new-website/utils/requirements.txt b/new-website/utils/requirements.txt
diff --git a/new-website/utils/tutorials/build_pdf_book.py b/new-website/utils/tutorials/build_pdf_book.py
@@ -0,0 +1,65 @@
+"""
+This script is used to build the pdf book from DeepChem Tutorials.
+
+Requirements:
+    - pdfunite
+    - pdfkit
+
+Example Usage:
+    - Run the script "fetch_tutorials.py" // It will fetch all the tutorials.
+    - Run the script "build_pdf_book.py"
+    - It may fail, mostly because some tutorials contain graphics that do not
+    convert properly; remove those tutorials from the website-render-order or fix
+    them, and run this script again.
+    
+NOTE:
+    - NO FILES OR DIRECTORIES HAVE TO BE CREATED MANUALLY. The script will create the required directories and files.
+    - Run scripts in the Top-Level folder.
+
+"""
+import os
+import pandas as pd
+import pdfkit
+from utils import numeric_sorter
+
+
+INFO_PATH = "/workspaces/deepchem.github.io/new-website/utils/tutorials/website-render-order/"
+DATA_PATH = "/workspaces/deepchem.github.io/new-website/utils/tutorials/html-notebooks/"
+PDF_PATH = "/workspaces/deepchem.github.io/new-website/utils/tutorials/storage/"
+
+files = os.listdir(INFO_PATH)
+files = sorted(files)
+
+files_list = numeric_sorter(files)
+
+def html_to_pdf():
+    """
+    Converts HTML files to PDF files.
+
+    Raises
+    ------
+    ProtocolUnknownError
+        If it faces some unknown kind of graphic.
+
+    """
+    for i in files_list:
+        chapter = pd.read_csv(INFO_PATH + "-".join(i))
+        for j in chapter["File Name"]:
+            print(i, j)
+            pdfkit.from_file(DATA_PATH + j[:-5] + "html", PDF_PATH + j[:-5] + "pdf")
+
+def merge_pdf():
+    """Merges the compiled PDFs."""
+    command = "pdfunite "
+    for i in files_list:
+        chapter = pd.read_csv(INFO_PATH + "-".join(i))
+        for j in chapter["File Name"]:
+            print(i, j)
+            command = command + PDF_PATH + j[:-5] + "pdf "
+    os.system(command + "merged.pdf")
+
+
+if __name__ == "__main__":
+    os.system("mkdir " + PDF_PATH)
+    html_to_pdf()
+    merge_pdf()
diff --git a/new-website/utils/tutorials/utils.py b/new-website/utils/tutorials/utils.py
@@ -8,6 +8,32 @@
 import re
 
 
+def numeric_sorter(s):
+    """
+    Sorts the tutorials according to their serial number.
+
+    Parameters
+    ----------
+    s: List[str]
+        The List to be sorted.
+
+    Returns
+    -------
+    s_sorted: List[List[str]]
+        The sorted and Broken into parts list.
+
+    """
+    s_splitted_list = []
+    s_sorted = []
+    for i in s:
+        s_splitted_list.append(i.split("-"))
+    for i in range(len(s_splitted_list)+1):
+        for j in s_splitted_list:
+            if i == int(j[0]):
+                s_sorted.append(j)
+    return s_sorted
+
+
 def to_valid_identifier(s):
     """
     Converts a given string into a valid identifier.