scripts/gen_libdevice_json.py

# Takes the PDF of the libdevice docs and generates a JSON file representing it.
# That JSON file is then used to generate internal intrinsics as well as intrinsics docs.
# libdevice has 300+ intrinsics, so generating them with a script is better for developer
# sanity as well as extensibility for any future versions of libdevice.

import pdfplumber
import os
import re
import json

# Resolve every data file relative to this script so the result does not
# depend on the directory the script is launched from.
dirname = os.path.dirname(__file__)
filename = os.path.join(dirname, 'data/libdevice.pdf')
out_filename = os.path.join(dirname, 'data/libdevice.json')

# Pages are joined back-to-back with no separator on purpose: the footer
# pattern applied later matches across the page break, so inserting a newline
# here would stop it from matching.
# `or ""` guards against extract_text() yielding None for a page without any
# text (NOTE(review): seen in older pdfplumber releases - confirm for the
# pinned version).
with pdfplumber.open(filename) as pdf:
    text = "".join(page.extract_text() or "" for page in pdf.pages)

# Keep a dump of the raw extracted text next to the other data files; handy
# when the parsing regexes need debugging.
with open(os.path.join(dirname, 'data/libdevice.txt'), "w", encoding="utf8") as raw_dump:
    raw_dump.write(text)

# One regex per documented function instead of a real parser: crude, but far
# less work. Capture groups, in order:
#   1 - function name (the word after the "3.N." section number)
#   2 - the single prototype line
#   3 - description body (everything up to "Returns:")
#   4 - returns body (everything up to "Library Availability")
#   5 - availability body (up to the next "3.N." heading, the end of the text
#       or a leftover "www.nvidia.com" footer)
regex = (
    r"3\.\d+\.\s(\w+)(?!\.)\n"
    r"Prototype:\n(.+)\n"
    r"Description:\n([\s\S]*?(?=Returns:))"
    r"Returns:\n([\s\S]*?(?=Library Availability))"
    r"Library Availability:\n([\s\S]*?(?=(3\.\d+\.)|\Z|www\.nvidia\.com))"
)

# The page footer is part of the extracted text and would break the entry
# regex, so it is stripped out first.
sanitize_regex = r"www.nvidia.com\nLibdevice User's Guide Part 000 _v8.0 \| \d+Function Reference\n"
text = re.sub(sanitize_regex, "", text)

# Swap the bullet glyph for "-" (renders better in markdown), then squeeze
# every run of spaces down to a single space.
text = re.sub(" +", " ", text.replace("\u2023", "-"))

# The PDF-to-text conversion trips over the math symbols: x and y come out as
# "\nx" / "\ny", so the stray newline is dropped. The last replacement moves a
# period from before an "x" to after it (cause unclear - presumably another
# extraction artifact).
text = text.replace("\nx", "x").replace("\ny", "y").replace(".x", "x.")

# One match per documented intrinsic found in the cleaned-up text.
matches = re.finditer(regex, text)
intrinsics = []
# Maps the type spellings used in the documented prototypes to the type
# spellings written to the JSON output.
type_map = {
    "float": "f32",
    "double": "f64",
    "i8": "i8",
    "i16": "i16",
    "i32": "i32",
    "i64": "i64",
    "void": "()",
    "i8*": "*mut i8",
    "i16*": "*mut i16",
    "i32*": "*mut i32",
    "i64*": "*mut i64",
    "float*": "*mut f32",
    "double*": "*mut f64",
}

# Compiled once, outside the loop. Both are raw strings: "\w" and "\*" inside
# a plain literal are invalid escape sequences that newer Python versions warn
# about.
# Return type: everything in the prototype before the " @" that introduces
# the function name.
return_ty_re = re.compile(r".*(?= @)")
# Parameter: a type (optionally a pointer) followed by " %" and the name.
param_re = re.compile(r"(\w+\*?)(?= %) %(\w+)")

for match in matches:
    name = match.group(1).strip()
    sig_txt = match.group(2).strip()

    return_match = return_ty_re.search(sig_txt)
    if return_match is None:
        # Fail with the offending entry instead of an opaque AttributeError
        # from calling .group() on None.
        raise ValueError(
            "no return type found in prototype of {}: {!r}".format(name, sig_txt)
        )

    # An unknown type spelling raises KeyError naming that spelling, so a new
    # libdevice type is noticed instead of being silently mis-mapped.
    return_ty = type_map[return_match.group()]
    params = [
        {
            "name": param.group(2).strip(),
            "type": type_map[param.group(1).strip()],
        }
        for param in param_re.finditer(sig_txt)
    ]

    intrinsics.append(
        {
            "name": name,
            "sig": {"params": params, "returns": return_ty},
            "description": match.group(3).strip(),
            "returns": match.group(4).strip(),
            "availability": match.group(5).strip(),
        }
    )

# Write the collected intrinsics as pretty-printed JSON. The context manager
# guarantees the file is flushed and closed even if serialization fails,
# instead of relying on interpreter shutdown to do it.
with open(out_filename, "w", encoding="utf8") as out:
    json.dump(intrinsics, out, indent=2)