From 062c71ecfea11116263039490bc74c2c33e1baf8 Mon Sep 17 00:00:00 2001 From: David Wooten Date: Thu, 11 Jul 2024 21:22:48 -0500 Subject: [PATCH 1/9] setup test action --- .github/workflows/autoreplies/__init__.py | 0 .../autoreplies/account_recovery.yml | 29 ++++ .../autoreplies/check_account_recovery.py | 146 ++++++++++++++++ .github/workflows/autoreplies/gh_utils.py | 118 +++++++++++++ .github/workflows/autoreplies/pypi_utils.py | 93 ++++++++++ .../workflows/autoreplies/requirements.txt | 2 + .gitignore | 163 ++++++++++++++++++ 7 files changed, 551 insertions(+) create mode 100644 .github/workflows/autoreplies/__init__.py create mode 100644 .github/workflows/autoreplies/account_recovery.yml create mode 100644 .github/workflows/autoreplies/check_account_recovery.py create mode 100644 .github/workflows/autoreplies/gh_utils.py create mode 100644 .github/workflows/autoreplies/pypi_utils.py create mode 100644 .github/workflows/autoreplies/requirements.txt create mode 100644 .gitignore diff --git a/.github/workflows/autoreplies/__init__.py b/.github/workflows/autoreplies/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.github/workflows/autoreplies/account_recovery.yml b/.github/workflows/autoreplies/account_recovery.yml new file mode 100644 index 0000000..ec08863 --- /dev/null +++ b/.github/workflows/autoreplies/account_recovery.yml @@ -0,0 +1,29 @@ +name: Issue Label Trigger + +on: + issues: + types: [labeled] + +jobs: + parse-issue: + runs-on: ubuntu-latest + if: contains(github.event.issue.labels.*.name, 'account-recovery') + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r .github/workflows/autoreplies/requirements.txt + + - name: Run Python script + run: python .github/workflows/autoreplies/check_account_recovery.py + env: + ISSUE_NUMBER: ${{ 
github.event.issue.number }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/autoreplies/check_account_recovery.py b/.github/workflows/autoreplies/check_account_recovery.py new file mode 100644 index 0000000..eeb4302 --- /dev/null +++ b/.github/workflows/autoreplies/check_account_recovery.py @@ -0,0 +1,146 @@ +"""Parse a GitHub issue to determine the best path forward for account recovery requests. + + +""" + +import os +import sys + +import pypi_utils +import gh_utils + + +PYPI = "pypi" +REPO = "support" +PYPI_USER_HEADER = "PyPI Username" + + +NO_REPO = "None listed" +UNKNOWN_OWERNSHIP = "May not belong to user" +BELONGS = "Belongs to user" + + +def sanitize_pypi_user(username: str) -> str: + """Remove any backticks from the username. + + Some users write their usernames like: + `username` + for pretty markdown purposes, but we don't want the backticks. + """ + return username.strip().replace("`", "") + + +def Xadd_issue_comment(comment: str, gh_user: str, repo_name: str, issue_number, github_token=None): + print() + print("Comment") + print() + print(comment) + print() + + +def Xadd_label_to_issue(label: str, gh_user: str, repo_name: str, issue_number, github_token=None): + print() + print("Label") + print() + print(label) + print() + + +def format_markdown_table(header: list, rows: list) -> str: + """Format a list of rows into a markdown table.""" + row_strings = [] + row_strings.append(" | ".join(header)) + row_strings.append(" | ".join(["---"] * len(header))) + for row in rows: + row_strings.append(" | ".join(row)) + return "\n".join(row_strings) + + +def format_markdown_package_link(package_name: str) -> str: + return f"[{package_name}](https://pypi.org/project/{package_name})" + + +def format_markdown_pypi_user_link(pypi_user: str) -> str: + return f"[{pypi_user}](https://pypi.org/user/{pypi_user}/)" + + +def format_markdown_gh_user_link(gh_user: str) -> str: + return 
f"[{gh_user}](https://github.com/{gh_user}/)" + + +if __name__ == "__main__": + issue_number = os.environ.get("ISSUE_NUMBER", "4343") + github_token = os.environ.get("GITHUB_TOKEN", None) + + issue_data = gh_utils.fetch_issue_details(PYPI, REPO, issue_number, github_token=github_token) + + gh_user = issue_data["user"] + gh_user_link = format_markdown_gh_user_link(gh_user) + + if PYPI_USER_HEADER not in issue_data["body"]: + raise ValueError(f"Issue body does not contain expected header: {PYPI_USER_HEADER}") + + pypi_user = sanitize_pypi_user(issue_data["body"]["PyPI Username"]) + pypi_user_link = format_markdown_pypi_user_link(pypi_user) + + try: + packages = pypi_utils.get_packages_by_user(pypi_user) + except ValueError as e: + raise e + + # If the pypi user is not a maintainer for any packages + if not packages: + Xadd_issue_comment( + f"User {pypi_user_link} has no packages", PYPI, REPO, issue_number, github_token=github_token + ) + sys.exit() + + # Loop over all packages to see if they belong to the user + package_ownership = [] # List of [package_name, repo_url, ownership_status] + for package_name in packages: + package_md_link = format_markdown_package_link(package_name) + package = pypi_utils.get_pypi_project_info(package_name) + + # Package has source code repo listed at PyPI + if "repository_url" not in package: + package_ownership.append([package_md_link, "", NO_REPO]) + continue + + package_repo = package["repository_url"] + + # Package source code may not belong to the user + if not ( + gh_utils.is_github_repo_belonging_to_owner(package_repo, gh_user) + or gh_utils.is_github_pages_belonging_to_owner(package_repo, gh_user) + ): + package_ownership.append([package_md_link, package_repo, UNKNOWN_OWERNSHIP]) + else: + package_ownership.append([package_md_link, package_repo, BELONGS]) + + # Add a comment to the issue with the package ownership information + header = ["Package", "Repository", "Ownership"] + table = format_markdown_table(header, 
package_ownership) + + unknown_ownership = [row[1] for row in package_ownership if row[-1] != BELONGS] + label = None + + if len(unknown_ownership) == 0: + approval_message = f"All projects maintained by {pypi_user_link} belong to the gh user {gh_user_link}" + label = "fasttrack" + else: + approval_message = f"{len(unknown_ownership)} projects may not belong to the gh user {gh_user_link}" + + comment = f"""\ +## Package Ownership + +{table} + +{approval_message} + +## NOTE + +This action was performed by a bot. Account recovery requires manual approval by processing by PyPI.""" + + Xadd_issue_comment(comment, PYPI, REPO, issue_number, github_token=github_token) + if label: + Xadd_label_to_issue(label, PYPI, REPO, issue_number, github_token=github_token) diff --git a/.github/workflows/autoreplies/gh_utils.py b/.github/workflows/autoreplies/gh_utils.py new file mode 100644 index 0000000..50bf9f4 --- /dev/null +++ b/.github/workflows/autoreplies/gh_utils.py @@ -0,0 +1,118 @@ +import re +from urllib.parse import urlparse + +import requests + + +def fetch_issue_details(gh_user: str, repo_name: str, issue_number, github_token=None) -> dict: + """Fetch issue details using the GitHub API.""" + headers = {"Authorization": f"token {github_token}"} if github_token else {} + + url = f"https://api.github.com/repos/{gh_user}/{repo_name}/issues/{issue_number}" + response = requests.get(url, headers=headers) + if response.status_code == 200: + return parse_issue_details(response.json()) + raise ValueError(f"Failed to fetch issue details: {response.status_code}") + + +def parse_issue_details(issue: dict) -> dict: + """Parse a GitHub issue metadata to retrieve relevant fields.""" + body = parse_issue_body(issue["body"]) + return { + "created_at": issue["created_at"], + "user": issue["user"]["login"], + "url": issue["html_url"], + "body": body, + } + + +def parse_issue_body(body: str) -> dict: + """Parse the body of a GitHub issue into a dictionary. 
+ + This function works well with the issue templates, though may run into trouble if users include "### " in their own + body text. + + Parameters + ---------- + body: str + The body of the issue. + + Returns + ------- + dict + A dictionary with the issue text keyed by the markdown headers (h3) + """ + RE_GH_ISSUE_HEADER = re.compile(r"### (?P<key>.+)") + body_dict = {} + cur_key = None + cur_lines = [] + for line in body.strip().split("\n"): + line = line.strip() + if not line: + continue + header_match = RE_GH_ISSUE_HEADER.match(line) + if header_match: + if cur_key: + body_dict[cur_key] = "\n".join(cur_lines) + cur_lines = [] + cur_key = header_match.group("key") + else: + cur_lines.append(line) + return body_dict + + +def _sanitize_url(url: str) -> str: + """Ensure the URL starts with "http://" or "https://", and lowercases the URL since GitHub is case-insensitive.""" + url = url.lower() + if not url.startswith("http"): + url = f"https://{url}" + return url + + +def is_github_pages_belonging_to_owner(code_repo_url: str, gh_user: str) -> bool: + """Return True if the URL is a GitHub Pages URL for the GitHub user's account.""" + parsed_url = urlparse(_sanitize_url(code_repo_url)) + + # Normalize domain + hostname = parsed_url.hostname or "" + hostname = hostname.replace("www.", "") + return hostname == f"{gh_user}.github.io".lower() + + +def is_github_repo_belonging_to_owner(code_repo_url: str, gh_user: str) -> bool: + """Return True if the URL is a GitHub repo associated to the GitHub user's account.""" + parsed_url = urlparse(_sanitize_url(code_repo_url)) + + # Normalize domain + hostname = parsed_url.hostname or "" + hostname = hostname.replace("www.", "") + + # Check if the domain is github.com + if hostname != "github.com": + return False + + # Split the path to analyze its parts + path_parts = parsed_url.path.strip("/").split("/") + + # Check if the first part of the path is 'gh_user' + return path_parts and path_parts[0] == gh_user.lower() + + +def 
add_issue_comment(comment: str, gh_user: str, repo_name: str, issue_number, github_token=None): + """Add a comment to a GitHub issue.""" + headers = {"Authorization": f"token {github_token}"} if github_token else {} + url = f"https://api.github.com/repos/{gh_user}/{repo_name}/issues/{issue_number}/comments" + response = requests.post(url, json={"body": comment}, headers=headers) + if response.status_code != 201: + raise ValueError(f"Failed to add comment: {response.status_code}") + return response.json() + + +def add_label_to_issue(label: str, gh_user: str, repo_name: str, issue_number, github_token=None): + """Add a label to a GitHub issue.""" + headers = {"Authorization": f"token {github_token}"} if github_token else {} + url = f"https://api.github.com/repos/{gh_user}/{repo_name}/issues/{issue_number}/labels" + response = requests.post(url, json=[label], headers=headers) + if response.status_code != 200: + raise ValueError(f"Failed to add label: {response.status_code}") + return response.json() diff --git a/.github/workflows/autoreplies/pypi_utils.py b/.github/workflows/autoreplies/pypi_utils.py new file mode 100644 index 0000000..d900227 --- /dev/null +++ b/.github/workflows/autoreplies/pypi_utils.py @@ -0,0 +1,93 @@ +import re +import time +from typing import Dict + +import requests +from bs4 import BeautifulSoup + + +RE_GH_ISSUE_HEADER = re.compile(r"###\s*(?P.+)") + + +def get_packages_by_user(username: str) -> list: + """Parse html to get a list of packages for a given PyPI user. + + The pypi api does not provide a way to get a list of packages for a user, hence crawling the html. + + Steps: + 1) Queries the PyPI user page for the given username. + 2) Parses the html to get the number of projects and the list of packages. This assumes that the number of projects + listed on the page is in the first

<h2> tag, in the form "X project" or "X projects". + 3) Loops over all elements of <a class="package-snippet"> to get the package names. + 4) Ensure that the number of packages found is equal to the number of projects reported. If not, raise an error. + 5) Return the list of package names. + + Step 2 is to avoid having to handle pagination of projects. As of now the user with the most projects I have seen + has 43, and there was no pagination. If pagination is detected, this function will raise an error. + + Parameters + ---------- + username: str + The PyPI username to search for. + + Returns + ------- + list + A list of package names + """ + RE_PROJECT_COUNT = re.compile(r"\s*(?P<num_projects>\d+)\s*project(?:s)?") + time.sleep(1) + url = f"https://pypi.org/user/{username}/" + response = requests.get(url) + if response.status_code == 200: + soup = BeautifulSoup(response.content, "html.parser") + + # Get reported + num_projects_text = soup.find("h2").text.lower() + num_projects_text = num_projects_text.replace("No projects", "0 projects") + re_num_project_match = RE_PROJECT_COUNT.match(num_projects_text) + if not re_num_project_match: + raise ValueError(f"Could not determine the bumber of projects for user {username}") + + num_projects = int(re_num_project_match.group("num_projects")) + packages = [a.text.strip().split("\n")[0] for a in soup.find_all("a", class_="package-snippet")] + # Check for pagination: if num_projects > len(packages) then there are probably more pages + # which aren't handled here yet + if len(packages) != num_projects: + raise ValueError(f"num_projects {num_projects} != num_packages {len(packages)} for user {username}") + return packages + raise ValueError(f"Error retrieving project data for user {username}") + + +def get_pypi_project_info(package_name: str) -> Dict[str, str]: + """Retrieve relevant information about a PyPI project. + + Parameters + ---------- + package_name: str + The name of the package to query. 
+ + Returns + ------- + Dict[str, str] + A dictionary containing the following keys: + - repository_url (may be "Not specified" if no repository or homepage is listed) + - author + - author_email + """ + time.sleep(1) + url = f"https://pypi.org/pypi/{package_name}/json" + response = requests.get(url) + if response.status_code != 200: + raise ValueError(f"Error retrieving project info for {package_name}") + + data = response.json() + info = data.get("info", {}) + project_urls = info.get("project_urls", {}) or {} + author = info.get("author") + author_email = info.get("author_email") + return { + "repository_url": project_urls.get("Source", project_urls.get("Homepage", "Not specified")), + "author": author, + "author_email": author_email, + } diff --git a/.github/workflows/autoreplies/requirements.txt b/.github/workflows/autoreplies/requirements.txt new file mode 100644 index 0000000..2c5956f --- /dev/null +++ b/.github/workflows/autoreplies/requirements.txt @@ -0,0 +1,2 @@ +beautifulsoup4>=4.9.1 +requests>=2.24.0 \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9ac2dd8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,163 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + From a5ec1c4f8c4fd8239660bfdeef1c1771adcd411a Mon Sep 17 00:00:00 2001 From: David Wooten Date: Thu, 11 Jul 2024 21:29:05 -0500 Subject: [PATCH 2/9] moved workflow --- .github/workflows/{autoreplies => }/account_recovery.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{autoreplies => }/account_recovery.yml (100%) diff --git a/.github/workflows/autoreplies/account_recovery.yml b/.github/workflows/account_recovery.yml similarity index 100% rename from .github/workflows/autoreplies/account_recovery.yml rename to .github/workflows/account_recovery.yml From 9cad05af32b6648d0ff7339679fe8339b4818ca1 Mon Sep 17 00:00:00 2001 From: David Wooten Date: Thu, 11 Jul 2024 21:34:39 -0500 Subject: [PATCH 3/9] get the correct issue tracking repo information through the action --- .github/workflows/account_recovery.yml | 4 +++- .../autoreplies/check_account_recovery.py | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/.github/workflows/account_recovery.yml b/.github/workflows/account_recovery.yml index 
ec08863..9405852 100644 --- a/.github/workflows/account_recovery.yml +++ b/.github/workflows/account_recovery.yml @@ -26,4 +26,6 @@ jobs: run: python .github/workflows/autoreplies/check_account_recovery.py env: ISSUE_NUMBER: ${{ github.event.issue.number }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_ISSUE_OWNER: ${{ github.repository_owner }} + GITHUB_ISSUE_REPO: ${{ github.event.repository.name }} \ No newline at end of file diff --git a/.github/workflows/autoreplies/check_account_recovery.py b/.github/workflows/autoreplies/check_account_recovery.py index eeb4302..99f92a8 100644 --- a/.github/workflows/autoreplies/check_account_recovery.py +++ b/.github/workflows/autoreplies/check_account_recovery.py @@ -10,8 +10,6 @@ import gh_utils -PYPI = "pypi" -REPO = "support" PYPI_USER_HEADER = "PyPI Username" @@ -71,8 +69,12 @@ def format_markdown_gh_user_link(gh_user: str) -> str: if __name__ == "__main__": issue_number = os.environ.get("ISSUE_NUMBER", "4343") github_token = os.environ.get("GITHUB_TOKEN", None) + github_issue_user = os.environ.get("GITHUB_ISSUE_USER", "pypi") + github_issue_repo = os.environ.get("GITHUB_ISSUE_REPO", "support") - issue_data = gh_utils.fetch_issue_details(PYPI, REPO, issue_number, github_token=github_token) + issue_data = gh_utils.fetch_issue_details( + github_issue_user, github_issue_repo, issue_number, github_token=github_token + ) gh_user = issue_data["user"] gh_user_link = format_markdown_gh_user_link(gh_user) @@ -91,7 +93,11 @@ def format_markdown_gh_user_link(gh_user: str) -> str: # If the pypi user is not a maintainer for any packages if not packages: Xadd_issue_comment( - f"User {pypi_user_link} has no packages", PYPI, REPO, issue_number, github_token=github_token + f"User {pypi_user_link} has no packages", + github_issue_user, + github_issue_repo, + issue_number, + github_token=github_token, ) sys.exit() @@ -141,6 +147,6 @@ def 
format_markdown_gh_user_link(gh_user: str) -> str: This action was performed by a bot. Account recovery requires manual approval by processing by PyPI.""" - Xadd_issue_comment(comment, PYPI, REPO, issue_number, github_token=github_token) + Xadd_issue_comment(comment, github_issue_user, github_issue_repo, issue_number, github_token=github_token) if label: - Xadd_label_to_issue(label, PYPI, REPO, issue_number, github_token=github_token) + Xadd_label_to_issue(label, github_issue_user, github_issue_repo, issue_number, github_token=github_token) From c4008b9f173df647f94719ac9845bfb99e0f2bd1 Mon Sep 17 00:00:00 2001 From: David Wooten Date: Thu, 11 Jul 2024 21:36:25 -0500 Subject: [PATCH 4/9] typo --- .../workflows/autoreplies/check_account_recovery.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/autoreplies/check_account_recovery.py b/.github/workflows/autoreplies/check_account_recovery.py index 99f92a8..68fb674 100644 --- a/.github/workflows/autoreplies/check_account_recovery.py +++ b/.github/workflows/autoreplies/check_account_recovery.py @@ -69,11 +69,11 @@ def format_markdown_gh_user_link(gh_user: str) -> str: if __name__ == "__main__": issue_number = os.environ.get("ISSUE_NUMBER", "4343") github_token = os.environ.get("GITHUB_TOKEN", None) - github_issue_user = os.environ.get("GITHUB_ISSUE_USER", "pypi") + github_issue_owner = os.environ.get("GITHUB_ISSUE_OWNER", "pypi") github_issue_repo = os.environ.get("GITHUB_ISSUE_REPO", "support") issue_data = gh_utils.fetch_issue_details( - github_issue_user, github_issue_repo, issue_number, github_token=github_token + github_issue_owner, github_issue_repo, issue_number, github_token=github_token ) gh_user = issue_data["user"] @@ -94,7 +94,7 @@ def format_markdown_gh_user_link(gh_user: str) -> str: if not packages: Xadd_issue_comment( f"User {pypi_user_link} has no packages", - github_issue_user, + github_issue_owner, github_issue_repo, issue_number, 
github_token=github_token, @@ -147,6 +147,6 @@ def format_markdown_gh_user_link(gh_user: str) -> str: This action was performed by a bot. Account recovery requires manual approval by processing by PyPI.""" - Xadd_issue_comment(comment, github_issue_user, github_issue_repo, issue_number, github_token=github_token) + Xadd_issue_comment(comment, github_issue_owner, github_issue_repo, issue_number, github_token=github_token) if label: - Xadd_label_to_issue(label, github_issue_user, github_issue_repo, issue_number, github_token=github_token) + Xadd_label_to_issue(label, github_issue_owner, github_issue_repo, issue_number, github_token=github_token) From f3649b80b70356eee7d438af9ad49075acfa73ae Mon Sep 17 00:00:00 2001 From: David Wooten Date: Thu, 11 Jul 2024 21:38:20 -0500 Subject: [PATCH 5/9] actually post comment --- .github/workflows/autoreplies/check_account_recovery.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/autoreplies/check_account_recovery.py b/.github/workflows/autoreplies/check_account_recovery.py index 68fb674..0d5e89c 100644 --- a/.github/workflows/autoreplies/check_account_recovery.py +++ b/.github/workflows/autoreplies/check_account_recovery.py @@ -92,7 +92,7 @@ def format_markdown_gh_user_link(gh_user: str) -> str: # If the pypi user is not a maintainer for any packages if not packages: - Xadd_issue_comment( + gh_utils.add_issue_comment( f"User {pypi_user_link} has no packages", github_issue_owner, github_issue_repo, @@ -147,6 +147,8 @@ def format_markdown_gh_user_link(gh_user: str) -> str: This action was performed by a bot. 
Account recovery requires manual approval by processing by PyPI.""" - Xadd_issue_comment(comment, github_issue_owner, github_issue_repo, issue_number, github_token=github_token) + gh_utils.add_issue_comment(comment, github_issue_owner, github_issue_repo, issue_number, github_token=github_token) if label: - Xadd_label_to_issue(label, github_issue_owner, github_issue_repo, issue_number, github_token=github_token) + gh_utils.add_label_to_issue( + label, github_issue_owner, github_issue_repo, issue_number, github_token=github_token + ) From 06ad72b8e46b53df5177ac6d90e14b8f5e6afaf0 Mon Sep 17 00:00:00 2001 From: David Wooten Date: Sat, 13 Jul 2024 12:55:38 -0500 Subject: [PATCH 6/9] improve summary table and clean utils --- .../autoreplies/check_account_recovery.py | 157 +++++++++++------- .github/workflows/autoreplies/gh_utils.py | 90 +++++++++- .github/workflows/autoreplies/pypi_utils.py | 12 +- 3 files changed, 192 insertions(+), 67 deletions(-) diff --git a/.github/workflows/autoreplies/check_account_recovery.py b/.github/workflows/autoreplies/check_account_recovery.py index 0d5e89c..3e10590 100644 --- a/.github/workflows/autoreplies/check_account_recovery.py +++ b/.github/workflows/autoreplies/check_account_recovery.py @@ -1,6 +1,26 @@ -"""Parse a GitHub issue to determine the best path forward for account recovery requests. +"""Parse a GitHub issue to automatically aggregate package ownership information to facilitate account recovery. +Steps +1) finds all PyPI packages maintained by the user +2) checks each PyPI package to see if its source code repository listed at PyPI belongs to the github user +3) adds a comment to the issue summarizing the package ownership information +If the github user owns the source code repositories for all of the PyPI packages, or is an administrator for the github +organization that owns them, then the issue is automatically labeled with "fasttrack". 
+ +Environment Variables +--------------------- +GITHUB_ISSUE_OWNER + The owner (e.g., "pypi") of the issue repository + +GITHUB_ISSUE_REPO + The repository (e.g., "support") where the issue is located + +ISSUE_NUMBER + The number of the issue to process + +GITHUB_TOKEN + (Optional) A GitHub token with permissions to comment on the issue and read the repository. """ import os @@ -10,12 +30,22 @@ import gh_utils +# Issue body headers PYPI_USER_HEADER = "PyPI Username" +# Ownership status levels +BELONGS = 0 +ORG_ADMIN = 1 +ORG_MEMBER = 2 +UNKNOWN_OWERNSHIP = 3 +NO_REPO = 4 -NO_REPO = "None listed" -UNKNOWN_OWERNSHIP = "May not belong to user" -BELONGS = "Belongs to user" +# This notice indicates that the final determination of account recovery rests with the PyPI team +BOT_NOTICE = ( + "### NOTE\n\n" + "_This action was performed automatically by a bot and **does not guarantee account recovery**. Account recovery" + " requires manual approval processing by the PyPI team._" +) def sanitize_pypi_user(username: str) -> str: @@ -28,29 +58,25 @@ def sanitize_pypi_user(username: str) -> str: return username.strip().replace("`", "") -def Xadd_issue_comment(comment: str, gh_user: str, repo_name: str, issue_number, github_token=None): - print() - print("Comment") - print() - print(comment) - print() +def format_markdown_table(rows: list) -> str: + """Format a list of rows into a markdown table. - -def Xadd_label_to_issue(label: str, gh_user: str, repo_name: str, issue_number, github_token=None): - print() - print("Label") - print() - print(label) - print() - - -def format_markdown_table(header: list, rows: list) -> str: - """Format a list of rows into a markdown table.""" + Parameters + ---------- + rows: list + A list of rows to format into a table. Each row should be [package_link, repo_url, ownership_level] where + ownership_level is an int indicating which column to mark with an "X". 
+ """ + header = ["Package", "Repository", "Owner", "Admin", "Member", "Unknown", "No Repo"] row_strings = [] row_strings.append(" | ".join(header)) - row_strings.append(" | ".join(["---"] * len(header))) + row_strings.append(" | ".join(["---"] * 2 + [":-:"] * (len(header) - 2))) for row in rows: - row_strings.append(" | ".join(row)) + row_fields = [""] * len(header) + row_fields[0] = row[0] + row_fields[1] = row[1] + row_fields[2 + row[2]] = "X" + row_strings.append(" | ".join(row_fields)) return "\n".join(row_strings) @@ -66,8 +92,16 @@ def format_markdown_gh_user_link(gh_user: str) -> str: return f"[{gh_user}](https://github.com/{gh_user}/)" +def X_add_issue_comment( + comment: str, github_issue_owner: str, github_issue_repo: str, issue_number: str, github_token: str = None +): + print() + print(comment) + print() + + if __name__ == "__main__": - issue_number = os.environ.get("ISSUE_NUMBER", "4343") + issue_number = os.environ.get("ISSUE_NUMBER", "4386") github_token = os.environ.get("GITHUB_TOKEN", None) github_issue_owner = os.environ.get("GITHUB_ISSUE_OWNER", "pypi") github_issue_repo = os.environ.get("GITHUB_ISSUE_REPO", "support") @@ -92,7 +126,8 @@ def format_markdown_gh_user_link(gh_user: str) -> str: # If the pypi user is not a maintainer for any packages if not packages: - gh_utils.add_issue_comment( + # gh_utils.add_issue_comment( + X_add_issue_comment( f"User {pypi_user_link} has no packages", github_issue_owner, github_issue_repo, @@ -104,51 +139,59 @@ def format_markdown_gh_user_link(gh_user: str) -> str: # Loop over all packages to see if they belong to the user package_ownership = [] # List of [package_name, repo_url, ownership_status] for package_name in packages: - package_md_link = format_markdown_package_link(package_name) + pypi_package_link = format_markdown_package_link(package_name) package = pypi_utils.get_pypi_project_info(package_name) # Package has source code repo listed at PyPI if "repository_url" not in package: - 
package_ownership.append([package_md_link, "", NO_REPO]) + package_ownership.append([pypi_package_link, "", NO_REPO]) + continue + + package_repo_url = package["repository_url"] + + # Package source repo directly belongs to the gh_user + if gh_utils.does_user_own_repo(package_repo_url, gh_user): + package_ownership.append([pypi_package_link, package_repo_url, BELONGS]) continue - package_repo = package["repository_url"] + # If package source repo belongs to an organization - check if the gh_user is a member or admin + org_status = gh_utils.get_user_role_in_org(package_repo_url, gh_user) + if org_status == "admin": + package_ownership.append([pypi_package_link, package_repo_url, ORG_ADMIN]) + elif org_status == "member": + package_ownership.append([pypi_package_link, package_repo_url, ORG_MEMBER]) - # Package source code may not belong to the user - if not ( - gh_utils.is_github_repo_belonging_to_owner(package_repo, gh_user) - or gh_utils.is_github_pages_belonging_to_owner(package_repo, gh_user) - ): - package_ownership.append([package_md_link, package_repo, UNKNOWN_OWERNSHIP]) + # Otherwise the source repo may not belong to the gh_user else: - package_ownership.append([package_md_link, package_repo, BELONGS]) + package_ownership.append([pypi_package_link, package_repo_url, UNKNOWN_OWERNSHIP]) # Add a comment to the issue with the package ownership information - header = ["Package", "Repository", "Ownership"] - table = format_markdown_table(header, package_ownership) + table = format_markdown_table(package_ownership) - unknown_ownership = [row[1] for row in package_ownership if row[-1] != BELONGS] - label = None + # Count how many packages are not owned or administered by the user + num_unverified = len([row for row in package_ownership if row[2] > ORG_ADMIN]) - if len(unknown_ownership) == 0: - approval_message = f"All projects maintained by {pypi_user_link} belong to the gh user {gh_user_link}" + if num_unverified == 0: label = "fasttrack" else: - approval_message 
= f"{len(unknown_ownership)} projects may not belong to the gh user {gh_user_link}" - - comment = f"""\ -## Package Ownership - -{table} - -{approval_message} + label = "" -## NOTE + comment = "\n\n".join(["### Package Ownership", table, BOT_NOTICE]) -This action was performed by a bot. Account recovery requires manual approval by processing by PyPI.""" - - gh_utils.add_issue_comment(comment, github_issue_owner, github_issue_repo, issue_number, github_token=github_token) - if label: - gh_utils.add_label_to_issue( - label, github_issue_owner, github_issue_repo, issue_number, github_token=github_token - ) + try: + # gh_utils.add_issue_comment( + # comment, github_issue_owner, github_issue_repo, issue_number, github_token=github_token + # ) + X_add_issue_comment(comment, github_issue_owner, github_issue_repo, issue_number, github_token=github_token) + except Exception as e: + print(f"Failed to add comment to issue {issue_number}: {e}") + print("Comment:") + print(comment) + + if label and False: + try: + gh_utils.add_label_to_issue( + label, github_issue_owner, github_issue_repo, issue_number, github_token=github_token + ) + except Exception as e: + print(f"Failed to add label to issue {issue_number}: {e}") diff --git a/.github/workflows/autoreplies/gh_utils.py b/.github/workflows/autoreplies/gh_utils.py index 50bf9f4..06a901c 100644 --- a/.github/workflows/autoreplies/gh_utils.py +++ b/.github/workflows/autoreplies/gh_utils.py @@ -32,6 +32,23 @@ def parse_issue_body(body: str) -> dict: This function works well with the issue templates, though may run into trouble if users include "### " in their own body text. + For example: + + ### Some header + + abcd 123 ab + cdefg + + ### Another header + + Lorem ipsum dolor sit amet, consectetur adipiscing elit. + + will get processed to: + { + "Some header": "abcd 123 ab\ncdefg", + "Another header": "Lorem ipsum dolor sit amet, consectetur adipiscing elit." 
+ } + Parameters ---------- body: str @@ -42,7 +59,7 @@ def parse_issue_body(body: str) -> dict: dict A dictionary with the issue text keyed by the markdown headers (h3) """ - RE_GH_ISSUE_HEADER = re.compile(r"### (?P.+)") + RE_GH_ISSUE_HEADER = re.compile(r"### (?P.+)") # This finds lines beginning with "### " to use as keys body_dict = {} cur_key = None cur_lines = [] @@ -69,7 +86,26 @@ def _sanitize_url(url: str) -> str: return url -def is_github_pages_belonging_to_owner(code_repo_url: str, gh_user: str) -> bool: +def _is_user_in_org(org_name, username, github_token=None): + """Return True if the user is a publically listed member of the organization.""" + url = f"https://api.github.com/orgs/{org_name}/members/{username}" + headers = {"Authorization": f"token {github_token}"} if github_token else {} + response = requests.get(url, headers=headers) + return response.status_code == 204 + + +def _is_user_owner_of_org(org_name, username, github_token=None): + """Return True if the user is an owner of the organization.""" + url = f"https://api.github.com/orgs/{org_name}/memberships/{username}" + headers = {"Authorization": f"token {github_token}"} if github_token else {} + response = requests.get(url, headers=headers) + if response.status_code == 200: + membership_info = response.json() + return membership_info.get("role") == "admin" + return False + + +def _is_github_pages_belonging_to_owner(code_repo_url: str, gh_user: str) -> bool: """Return True if the URL is a GitHub Pages URL for the GitHub user's account.""" parsed_url = urlparse(_sanitize_url(code_repo_url)) @@ -79,7 +115,7 @@ def is_github_pages_belonging_to_owner(code_repo_url: str, gh_user: str) -> bool return hostname == f"{gh_user}.github.io".lower() -def is_github_repo_belonging_to_owner(code_repo_url: str, gh_user: str) -> bool: +def _is_github_repo_belonging_to_owner(code_repo_url: str, gh_user: str) -> bool: """Return True if the URL is a GitHub repo associated to the GitHub user's account.""" 
parsed_url = urlparse(_sanitize_url(code_repo_url)) @@ -98,6 +134,54 @@ def is_github_repo_belonging_to_owner(code_repo_url: str, gh_user: str) -> bool: return path_parts and path_parts[0] == gh_user.lower() +def get_user_role_in_org(code_repo_url: str, gh_user: str, github_token=None) -> str: + """Determines the role of the user in an organization. + + Parameters + ---------- + code_repo_url: str + The URL of the repository. This can be a GitHub Pages URL or a GitHub repository URL. + + gh_user: str + The GitHub username to check for. + + github_token: str + The GitHub token to use for API requests. + + Returns + ------- + str + "member" or "admin", or an empty string if the user is not in the organization. + """ + parsed_url = urlparse(_sanitize_url(code_repo_url)) + + # Normalize domain + hostname = parsed_url.hostname or "" + hostname = hostname.replace("www.", "").lower() + + RE_GH_PAGES = re.compile(r"^(?P<org_name>.+)\.github\.io$") + pages_match = re.match(RE_GH_PAGES, hostname) + if pages_match: + org_name = pages_match.group("org_name") + elif hostname == "github.com": + org_name = parsed_url.path.strip("/").split("/")[0] + else: + return "" + + if _is_user_in_org(org_name, gh_user, github_token=github_token): + if _is_user_owner_of_org(org_name, gh_user, github_token=github_token): + return "admin" + return "member" + return "" + + +def does_user_own_repo(code_repo_url: str, gh_user: str) -> bool: + """Return True if the GitHub user owns the repository.""" + return _is_github_repo_belonging_to_owner(code_repo_url, gh_user) or _is_github_pages_belonging_to_owner( + code_repo_url, gh_user + ) + + def add_issue_comment(comment: str, gh_user: str, repo_name: str, issue_number, github_token=None): """Add a comment to a GitHub issue.""" headers = {"Authorization": f"token {github_token}"} if github_token else {} diff --git a/.github/workflows/autoreplies/pypi_utils.py b/.github/workflows/autoreplies/pypi_utils.py index d900227..b2dd17c 100644 ---
a/.github/workflows/autoreplies/pypi_utils.py +++ b/.github/workflows/autoreplies/pypi_utils.py @@ -6,9 +6,6 @@ from bs4 import BeautifulSoup -RE_GH_ISSUE_HEADER = re.compile(r"###\s*(?P.+)") - - def get_packages_by_user(username: str) -> list: """Parse html to get a list of packages for a given PyPI user. @@ -35,19 +32,20 @@ def get_packages_by_user(username: str) -> list: list A list of package names """ - RE_PROJECT_COUNT = re.compile(r"\s*(?P<num_projects>\d+)\s*project(?:s)?") time.sleep(1) url = f"https://pypi.org/user/{username}/" response = requests.get(url) if response.status_code == 200: soup = BeautifulSoup(response.content, "html.parser") - # Get reported + # Get the reported number of projects maintained by this user, to ensure we later don't miss any num_projects_text = soup.find("h2").text.lower() - num_projects_text = num_projects_text.replace("No projects", "0 projects") + num_projects_text = num_projects_text.replace("no projects", "0 projects") + + RE_PROJECT_COUNT = re.compile(r"\s*(?P<num_projects>\d+)\s*project(?:s)?") re_num_project_match = RE_PROJECT_COUNT.match(num_projects_text) if not re_num_project_match: - raise ValueError(f"Could not determine the bumber of projects for user {username}") + raise ValueError(f"Could not determine the number of projects for user {username}") num_projects = int(re_num_project_match.group("num_projects")) packages = [a.text.strip().split("\n")[0] for a in soup.find_all("a", class_="package-snippet")] From 59c70d207259d3b03ec623a8a0f0a01f85bf4b33 Mon Sep 17 00:00:00 2001 From: David Wooten Date: Sat, 13 Jul 2024 12:57:50 -0500 Subject: [PATCH 7/9] remove local debug code --- .../autoreplies/check_account_recovery.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/.github/workflows/autoreplies/check_account_recovery.py b/.github/workflows/autoreplies/check_account_recovery.py index 3e10590..fe7730d 100644 --- a/.github/workflows/autoreplies/check_account_recovery.py +++
b/.github/workflows/autoreplies/check_account_recovery.py @@ -92,14 +92,6 @@ def format_markdown_gh_user_link(gh_user: str) -> str: return f"[{gh_user}](https://github.com/{gh_user}/)" -def X_add_issue_comment( - comment: str, github_issue_owner: str, github_issue_repo: str, issue_number: str, github_token: str = None -): - print() - print(comment) - print() - - if __name__ == "__main__": issue_number = os.environ.get("ISSUE_NUMBER", "4386") github_token = os.environ.get("GITHUB_TOKEN", None) @@ -126,8 +118,7 @@ def X_add_issue_comment( # If the pypi user is not a maintainer for any packages if not packages: - # gh_utils.add_issue_comment( - X_add_issue_comment( + gh_utils.add_issue_comment( f"User {pypi_user_link} has no packages", github_issue_owner, github_issue_repo, @@ -179,10 +170,9 @@ def X_add_issue_comment( comment = "\n\n".join(["### Package Ownership", table, BOT_NOTICE]) try: - # gh_utils.add_issue_comment( - # comment, github_issue_owner, github_issue_repo, issue_number, github_token=github_token - # ) - X_add_issue_comment(comment, github_issue_owner, github_issue_repo, issue_number, github_token=github_token) + gh_utils.add_issue_comment( + comment, github_issue_owner, github_issue_repo, issue_number, github_token=github_token + ) except Exception as e: print(f"Failed to add comment to issue {issue_number}: {e}") print("Comment:") From 93852da783f8d2c0035bd724c094e0ea95e86096 Mon Sep 17 00:00:00 2001 From: David Wooten Date: Sat, 13 Jul 2024 13:36:26 -0500 Subject: [PATCH 8/9] re-enable labeling --- .github/workflows/autoreplies/check_account_recovery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/autoreplies/check_account_recovery.py b/.github/workflows/autoreplies/check_account_recovery.py index fe7730d..2cbbfb7 100644 --- a/.github/workflows/autoreplies/check_account_recovery.py +++ b/.github/workflows/autoreplies/check_account_recovery.py @@ -178,7 +178,7 @@ def format_markdown_gh_user_link(gh_user: str) -> 
str: print("Comment:") print(comment) - if label and False: + if label: try: gh_utils.add_label_to_issue( label, github_issue_owner, github_issue_repo, issue_number, github_token=github_token From aa93a63118ac042e34a5c73ab23989569d6efc55 Mon Sep 17 00:00:00 2001 From: David Wooten Date: Sat, 13 Jul 2024 15:04:33 -0500 Subject: [PATCH 9/9] cleanup --- .github/workflows/account_recovery.yml | 2 +- .github/workflows/autoreplies/check_account_recovery.py | 2 +- .github/workflows/autoreplies/pypi_utils.py | 4 ++-- .github/workflows/autoreplies/requirements.txt | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/account_recovery.yml b/.github/workflows/account_recovery.yml index 9405852..026557b 100644 --- a/.github/workflows/account_recovery.yml +++ b/.github/workflows/account_recovery.yml @@ -28,4 +28,4 @@ jobs: ISSUE_NUMBER: ${{ github.event.issue.number }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_ISSUE_OWNER: ${{ github.repository_owner }} - GITHUB_ISSUE_REPO: ${{ github.event.repository.name }} \ No newline at end of file + GITHUB_ISSUE_REPO: ${{ github.event.repository.name }} diff --git a/.github/workflows/autoreplies/check_account_recovery.py b/.github/workflows/autoreplies/check_account_recovery.py index 2cbbfb7..255835d 100644 --- a/.github/workflows/autoreplies/check_account_recovery.py +++ b/.github/workflows/autoreplies/check_account_recovery.py @@ -134,7 +134,7 @@ def format_markdown_gh_user_link(gh_user: str) -> str: package = pypi_utils.get_pypi_project_info(package_name) # Package has source code repo listed at PyPI - if "repository_url" not in package: + if "repository_url" not in package or not package["repository_url"]: package_ownership.append([pypi_package_link, "", NO_REPO]) continue diff --git a/.github/workflows/autoreplies/pypi_utils.py b/.github/workflows/autoreplies/pypi_utils.py index b2dd17c..7ccceac 100644 --- a/.github/workflows/autoreplies/pypi_utils.py +++ 
b/.github/workflows/autoreplies/pypi_utils.py @@ -69,7 +69,7 @@ def get_pypi_project_info(package_name: str) -> Dict[str, str]: ------- Dict[str, str] A dictionary containing the following keys: - - repository_url (may be "Not specified" if no repository or homepage is listed) + - repository_url ("" if no repository or homepage is listed) - author - author_email """ @@ -85,7 +85,7 @@ def get_pypi_project_info(package_name: str) -> Dict[str, str]: author = info.get("author") author_email = info.get("author_email") return { - "repository_url": project_urls.get("Source", project_urls.get("Homepage", "Not specified")), + "repository_url": project_urls.get("Source", project_urls.get("Homepage", "")), "author": author, "author_email": author_email, } diff --git a/.github/workflows/autoreplies/requirements.txt b/.github/workflows/autoreplies/requirements.txt index 2c5956f..a4d20e8 100644 --- a/.github/workflows/autoreplies/requirements.txt +++ b/.github/workflows/autoreplies/requirements.txt @@ -1,2 +1,2 @@ beautifulsoup4>=4.9.1 -requests>=2.24.0 \ No newline at end of file +requests>=2.24.0