# summarize.py

import datetime
import os
import platform


class SFile:
    """One entry of the generated summary.

    An entry is either a markdown file found on disk (``path``/``filename``
    set, ``is_additional_file`` false) or an "additional file": an external
    link described only by ``header`` and ``url``.

    All constructor arguments are stored unchanged as public attributes.
    """

    def __init__(
        self,
        path: str,
        filename: str,
        ctime: int = 0,
        is_additional_file: bool = False,
        header: str = "",
        last_modified_time: int = 0,
        url: str = "",
    ) -> None:
        """Store the given values.

        Args:
            path: Path of the markdown file ("" for additional files).
            filename: Base name of the markdown file ("" for additional files).
            ctime: Creation timestamp used for ordering entries.
            is_additional_file: True when the entry is an external link.
            header: Title shown as the link text.
            last_modified_time: Last-modification timestamp.
            url: Link target, used only for additional files.
        """
        self.path = path
        self.filename = filename
        self.ctime = ctime
        self.last_modified_time = last_modified_time
        self.header = header
        self.is_additional_file = is_additional_file
        self.url = url

    def setHeader(self, header: str) -> None:
        """Replace the title shown as the link text."""
        self.header = header

    def __repr__(self) -> str:
        """Return a debugging representation listing every attribute."""
        return (
            f"{type(self).__name__}("
            f"path={self.path!r}, "
            f"filename={self.filename!r}, "
            f"ctime={self.ctime!r}, "
            f"is_additional_file={self.is_additional_file!r}, "
            f"header={self.header!r}, "
            f"last_modified_time={self.last_modified_time!r}, "
            f"url={self.url!r})"
        )


def genAdditionalFiles(header, url, c_time, last_modified_time):
    """Build an SFile describing an external link.

    The entry has an empty path and filename, is flagged as an additional
    file, and carries the given title, link target and timestamps.
    """
    link_fields = {
        "is_additional_file": True,
        "header": header,
        "last_modified_time": last_modified_time,
        "url": url,
    }
    return SFile("", "", c_time, **link_fields)

import re
def process_filename(filename: str) -> str:
    """Strip a leading ``<digits>-`` prefix from *filename*.

    The prefix is dropped only when the character right after the dash is
    neither a digit nor another dash, so ``"01-intro"`` becomes ``"intro"``
    while ``"2024-07-notes"`` is returned unchanged.
    """
    numbered_name = re.compile(r'^(\d+)-([^\d-].*)$')
    return numbered_name.sub(lambda found: found.group(2), filename)

def getFileMarkdownLink(file):
    """Return a newline-terminated markdown link for *file*.

    Additional files link straight to ``file.url``. Local files link to
    ``https://doc.fenglyulin.com/docs/<relative path>``, where the relative
    path is everything after the first ``docs/`` in ``file.path`` with the
    ``.md`` suffix and any leading ``<digits>-`` prefix of the file name
    removed.

    Args:
        file: An object exposing ``is_additional_file``, ``header`` and
            either ``url`` (additional files) or ``path`` (local files).

    Returns:
        The link as ``"[header](target)\\n"``.

    Raises:
        IndexError: If a local file's path does not contain ``docs/``.
    """
    if file.is_additional_file:
        return f"[{file.header}]({file.url})\n"

    # Paths built with os.path.join use os.sep, which is "\\" on Windows;
    # normalise it so the "docs/" marker is found and the URL uses "/".
    normalized = file.path.replace(os.sep, "/")
    # maxsplit=1 keeps the whole remainder after the first "docs/", even
    # when a deeper folder is itself named "docs".
    relative = normalized.split("docs/", 1)[1]

    segments = relative.split("/")
    # Drop the extension first, then the leading "<digits>-" prefix.
    segments[-1] = process_filename(segments[-1].removesuffix(".md"))
    # A second ".md" strip is kept so names such as "x.md.md" still resolve
    # to the same link as before.
    doc_path = "/".join(segments).removesuffix(".md")
    return f"[{file.header}](https://doc.fenglyulin.com/docs/{doc_path})\n"


def getFileCreationTime(path_to_file):
    """
    Return the creation time of ``path_to_file`` as whole seconds.

    On Windows the value comes from ``os.path.getctime``. Elsewhere the
    ``st_birthtime`` field of the stat result is used when it exists;
    otherwise the last-modified time is returned instead.
    See http://stackoverflow.com/a/39501288/1709587 for explanation.
    """
    if platform.system() == "Windows":
        created = os.path.getctime(path_to_file)
    else:
        info = os.stat(path_to_file)
        # Not every platform exposes st_birthtime (probably Linux when it is
        # missing), so settle for when the content was last modified.
        created = getattr(info, "st_birthtime", info.st_mtime)
    return int(created)


def getFileLastModifiedTime(path_to_file):
    """Return the last-modification time of ``path_to_file`` as whole seconds."""
    modified = os.stat(path_to_file).st_mtime
    return int(modified)


# e.g. (2024, 7, 7) -> 1720310400 when the local timezone is UTC
def getTimestampFromYearMonthDay(year, month, day):
    """Return the timestamp, in whole seconds, of midnight on the given date.

    The date is built as a naive ``datetime``, so the result depends on the
    local timezone of the machine running the script.
    """
    midnight = datetime.datetime(year=year, month=month, day=day)
    return int(midnight.timestamp())


# Fixed text placed at the very top of the generated summary file: a
# front-matter block (between the "---" lines) followed by an "auto-generated"
# notice that links to this script.
summaryPrePart = """---
sidebar_position: 1
id: practise-coding-record
title: Docs Summary
tags: [intro]
---

> Auto-generated by [summarize.py](https://github.com/ColaLinN/cs-notes/blob/main/summarize.py)

"""

# The portfolio text below is kept for reference only: it is commented out, so it is not part of the generated file.
# > 本文主要总结本人所输出的技术文章。
# ## 作品集 Portfolio

# - 我用一周时间开发了一个 🇸🇬 新加坡 pr 申请查询的应用 https://sgpass.info/
#   - 小红书宣传帖子： [link](http://xhslink.com/0p7H7Q)
#   - 自己一个人 solo 的项目
#     1. 基于 Nextjs 搭建前后端
#     2. 集成 Clerk 的用户鉴权
#     3. 使用 Supabase 的 Postgres 数据库，目前还是免费版本
#     4. 部署在 Vercel 上，自带简易的服务监控，开了会员 30 美刀一个月
#     5. 在 Domaincheap 买域名，3美刀一年
#     6. 使用 Chatgpt API 分析 500 条记录，6美刀
#   - 持续更新中`


# Fixed footer text placed at the very end of the generated summary file.
afterPart = """


> 排版参考 [为什么这么设计系列文章](https://draveness.me/whys-the-design/)
"""

# Hand-maintained external links listed under the "sharing" domain.
# Each source tuple is (header, url, (year, month, day)); the date is used for
# both the creation time and the last-modified time of the entry.
SharingDomain = {
    "sharing": [
        genAdditionalFiles(
            title,
            link,
            getTimestampFromYearMonthDay(*published),
            getTimestampFromYearMonthDay(*published),
        )
        for title, link, published in (
            (
                "我在组内做了一次技术分享：分布式系统，I did a Distributed Systems Sharing in our Team",
                "https://docs.google.com/presentation/d/1WhaGyZQZUH905QXI5QKHfKzojU1wFxcuWiNiF2pq994",
                (2024, 7, 7),
            ),
            (
                "【视频】43秒带你速通分布式系统概念 [Video] A quick tour of Distributed System",
                "https://doc.fenglyulin.com/docs/distributed-systems/overview",
                (2024, 3, 24),
            ),
            (
                "训练机器学习模型预测赢家，在 Kaggle 准确度最高 Game Winner Prediction, best 76.9% w EDA&Finetune",
                "https://www.kaggle.com/code/kirklin/game-winner-prediction-best-76-9-w-eda-finetune",
                (2024, 4, 15),
            ),
        )
    ]
}


def gather_markdown_files(folder_path):
    """Print the immediate contents of ``folder_path`` and return ``None``.

    Only the first result of ``os.walk`` is inspected: its root, then each
    directory name, then each file name. Nothing is printed when the walk
    yields nothing.
    """
    top_level = next(os.walk(folder_path), None)
    if top_level is None:
        return None

    root, dir_names, file_names = top_level
    print("idx:", 0)
    print("Root:", root)
    for dir_name in dir_names:
        print("Directory:", dir_name)
    for file_name in file_names:
        print("File:", file_name)
    return None


#testing
# def getDirs2(folder_path):
#     dirs = []
#     for root, dirs, files in os.walk(folder_path):
#         dirs.append(dirs)
#         for dir in dirs:
#             full_path = os.path.join(root, dir)
#             print("full_path:", full_path)
#             print("ctime", os.path.getctime(full_path))
#             print("ctime_int", int(os.path.getctime(full_path)))
#             print("size", os.path.getsize(full_path))
#             print("isfile", os.path.isfile(full_path))
#             print("isdir", os.path.isdir(full_path))
#             print("stat", os.stat(full_path))
#             stat = os.stat(full_path)
#             print("stat.st_ctime", stat.st_ctime)
#         break
#     return dirs


# ['artificial-intelligence', 'data-science'...
def getDirs(folder_path):
    """Return the sorted names of the directories directly inside ``folder_path``.

    Only the top level is inspected; nested directories are not listed.

    Args:
        folder_path: Directory to inspect.

    Returns:
        A sorted list of directory names. The list is empty when ``os.walk``
        yields nothing (for example when ``folder_path`` does not exist), so
        callers can always iterate over the result.
    """
    # os.walk ignores listing errors by default and then yields nothing; the
    # fallback tuple turns that case into an empty list instead of None.
    _, dir_names, _ = next(os.walk(folder_path), (folder_path, [], []))
    return sorted(dir_names)


def getFiles(folder_path):
    """Return the sorted names of the files directly inside ``folder_path``.

    Only the top level is inspected; files in nested directories are not
    listed.

    Args:
        folder_path: Directory to inspect.

    Returns:
        A sorted list of file names. The list is empty when ``os.walk``
        yields nothing (for example when ``folder_path`` does not exist), so
        callers can always iterate over the result.
    """
    # os.walk ignores listing errors by default and then yields nothing; the
    # fallback tuple turns that case into an empty list instead of None.
    _, _, file_names = next(os.walk(folder_path), (folder_path, [], []))
    return sorted(file_names)


# ======================== Main ========================

# Collect the markdown files found one and two levels below `folder_path`,
# grouped by the name of the first-level directory (the "domain").
folder_path = "docs"
files = getFiles(folder_path)  # top-level file names; not referenced again in this script

domains = getDirs(folder_path)

domainToMarkdownFiles = {}
domainToSFiles = {}  # domain name -> list of SFile objects


def _collectMarkdownSFiles(directory):
    """Return one SFile per ``.md`` file located directly inside *directory*."""
    collected = []
    for name in getFiles(directory):
        if not name.endswith(".md"):
            continue
        fullPath = os.path.join(directory, name)
        collected.append(
            SFile(
                fullPath,
                name,
                getFileCreationTime(fullPath),
                last_modified_time=getFileLastModifiedTime(fullPath),
            )
        )
    return collected


for domain in domains:
    # Asset folders and the "misc" folder are not listed in the summary.
    if domain.endswith(".assets") or domain == "misc":
        continue

    domainPath = os.path.join(folder_path, domain)

    # Files directly in the domain folder come first, then those of each
    # sub-folder; the combined list is finally ordered by file name.
    sFileList = _collectMarkdownSFiles(domainPath)
    for subdomain in getDirs(domainPath):
        sFileList.extend(_collectMarkdownSFiles(os.path.join(domainPath, subdomain)))

    domainToSFiles[domain] = sorted(sFileList, key=lambda entry: entry.filename)

# Add the hand-maintained external links, then order the domains by name.
domainToSFiles["sharing"] = SharingDomain["sharing"]
domainToSFiles = dict(sorted(domainToSFiles.items(), key=lambda item: item[0]))

# Build the per-domain table of contents. Along the way:
#   - `summary` collects every listed SFile (used later for the flat list),
#   - `noHeadingOneFiles` collects the names of files without a "# " heading.
domainToc = ""
summary = []
noHeadingOneFiles = []
for domain in domainToSFiles:
    # domain = "artificial-intelligence" -> domaincapitalize = "ARTIFICIAL INTELLIGENCE"
    domaincapitalize = " ".join(word.upper() for word in domain.split("-"))
    domainToc += f"## {domaincapitalize}\n"

    newDomainToSFiles = []
    for sfile in domainToSFiles[domain]:
        # Daily-challenge and introduction pages are left out of the summary.
        if "daily-challenge" in sfile.path or "introduction" in sfile.path:
            continue

        if not sfile.is_additional_file:
            # The title is the text of the first line starting with "# ".
            header = ""
            # Decode explicitly as UTF-8 instead of relying on the locale's
            # default encoding, which differs between platforms.
            with open(sfile.path, "r", encoding="utf-8") as f:
                for line in f:
                    if line.startswith("# "):
                        header = line[2:].removesuffix("\n")
                        break
            if header == "":
                # No level-one heading: report the file and do not list it.
                noHeadingOneFiles.append(sfile.filename)
                continue
            sfile.setHeader(header)
        newDomainToSFiles.append(sfile)

    # Newest first within a domain.
    newDomainToSFiles.sort(key=lambda x: x.ctime, reverse=True)
    domainToSFiles[domain] = newDomainToSFiles

    # getFileMarkdownLink already renders additional files as
    # "[header](url)\n", so both kinds of entry share one code path.
    for idx, sfile in enumerate(newDomainToSFiles, start=1):
        summary.append(sfile)
        domainToc += f"{idx}. " + getFileMarkdownLink(sfile)
    domainToc += "\n"

# print(domainToc)

# Flat list of every article, newest first. Entries are numbered by age, so
# the oldest article is 001 and the first line carries the highest number.
allToc = "## 全部文章 All Articles\n\n"
summary = sorted(summary, key=lambda entry: entry.ctime)
numbered = list(enumerate(summary, start=1))
allToc += "".join(
    f"- {str(rank).zfill(3)} {getFileMarkdownLink(entry)}"
    for rank, entry in reversed(numbered)
)

# print(allToc)
# Assemble the document: header, flat list, per-domain lists, footer.
final = summaryPrePart + allToc + domainToc + afterPart

# The summary contains non-ASCII text, so write it as UTF-8 explicitly rather
# than with the locale's default encoding, which may be unable to encode it.
with open(
    os.path.join(folder_path, "practise-coding-record.md"), "w", encoding="utf-8"
) as f:
    f.write(final)

# Report the files that were skipped because they have no "# " heading.
print(
    "Num of no heading Files",
    len(noHeadingOneFiles),
    "\nNo Heading One Files:",
    noHeadingOneFiles,
)