Skip to content

Commit

Permalink
Optimize index.md processing
Browse files Browse the repository at this point in the history
  • Loading branch information
arkid15r committed Sep 19, 2024
1 parent 3b2c1fd commit d8dddc6
Show file tree
Hide file tree
Showing 7 changed files with 154 additions and 146 deletions.
12 changes: 7 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,18 @@ collect-static:
@CMD="poetry run python manage.py collectstatic --noinput" $(MAKE) exec-backend-command

django-shell:
@CMD="poetry run python manage.py shell" $(MAKE) exec-backend-command-it
@CMD="poetry run python manage.py shell" $(MAKE) exec-backend-command

# Dump the github and owasp app data as indented JSON into data/nest.json.
# The sub-make's stdout is redirected into the JSON file, so
# --no-print-directory is passed to keep GNU make's automatic
# "Entering directory"/"Leaving directory" lines out of the dump.
dump-data:
	@CMD="poetry run python manage.py dumpdata github owasp --indent=2" $(MAKE) --no-print-directory exec-backend-command > data/nest.json

# Aggregate target with no recipe of its own: it only depends on
# github-enrich-issues and owasp-enrich-projects.
enrich-data: github-enrich-issues owasp-enrich-projects

exec-backend-command:
@docker exec -i nest-backend $(CMD) 2>/dev/null

exec-backend-command-it:
@docker exec -it nest-backend $(CMD) 2>/dev/null

# Print a progress message, then run the github_enrich_issues management
# command by handing it (via CMD) to the exec-backend-command target.
github-enrich-issues:
@echo "Enriching GitHub issues"
@CMD="poetry run python manage.py github_enrich_issues" $(MAKE) exec-backend-command

github-update-owasp-organization:
Expand Down Expand Up @@ -50,12 +48,16 @@ owasp-aggregate-projects:
@CMD="poetry run python manage.py owasp_aggregate_projects" $(MAKE) exec-backend-command

# Print a progress message, then run the owasp_enrich_projects management
# command by handing it (via CMD) to the exec-backend-command target.
owasp-enrich-projects:
@echo "Enriching OWASP projects"
@CMD="poetry run python manage.py owasp_enrich_projects" $(MAKE) exec-backend-command

# Print a progress message, then run the owasp_scrape_owasp_org management
# command by handing it (via CMD) to the exec-backend-command target.
owasp-scrape-owasp-org:
@echo "Scraping OWASP site projects data"
@CMD="poetry run python manage.py owasp_scrape_owasp_org" $(MAKE) exec-backend-command

# Run `poetry update` by handing it (via CMD) to the exec-backend-command
# target, which executes it in the nest-backend container.
poetry-update:
@CMD="poetry update" $(MAKE) exec-backend-command

# Invoke `pre-commit run -a` directly; unlike the neighbouring targets this
# one does not go through exec-backend-command.
pre-commit:
@pre-commit run -a

Expand All @@ -70,7 +72,7 @@ setup:
@CMD="poetry run python manage.py createsuperuser" $(MAKE) exec-backend-command

shell:
@CMD="/bin/bash" $(MAKE) exec-backend-command-it
@CMD="/bin/bash" $(MAKE) exec-backend-command

# Aggregate target with no recipe of its own: it only depends on
# update-data, enrich-data and index-data.
sync: update-data enrich-data index-data

Expand Down
7 changes: 7 additions & 0 deletions backend/apps/github/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from urllib.parse import urlparse

import requests

from apps.github.constants import GITHUB_REPOSITORY_RE


Expand Down Expand Up @@ -32,6 +34,11 @@ def check_funding_policy_compliance(platform, target):
return False


def get_repository_file_content(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    ``timeout`` is forwarded unchanged to ``requests.get``. The HTTP status
    code is not inspected, so whatever body the server returns is handed back.
    """
    response = requests.get(url, timeout=timeout)
    return response.text


def get_repository_path(url):
"""Parse repository URL to owner and repository name."""
match = GITHUB_REPOSITORY_RE.search(url.split("#")[0])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

import logging

import requests
from django.core.management.base import BaseCommand

from apps.common.open_ai import OpenAi
from apps.github.utils import get_repository_file_content
from apps.owasp.models.project import Project

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -42,7 +42,7 @@ def handle(self, *args, **options):
prefix = f"{idx + offset + 1} of {active_projects_count - offset}"
print(f"{prefix:<10} {project.owasp_url}")

open_ai.set_input(requests.get(project.raw_index_md_url, timeout=30).text)
open_ai.set_input(get_repository_file_content(project.index_md_raw_url))

# Generate summary
if update_summary:
Expand Down
15 changes: 11 additions & 4 deletions backend/apps/owasp/models/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@

import logging
import re
from base64 import b64decode

import yaml
from github.GithubException import GithubException, UnknownObjectException

from apps.github.constants import GITHUB_REPOSITORY_RE, GITHUB_USER_RE
from apps.github.utils import get_repository_file_content

logger = logging.getLogger(__name__)

Expand All @@ -28,14 +28,21 @@ def owasp_url(self):
"""Get OWASP URL."""
return f"https://owasp.org/{self.key}"

@property
def index_md_raw_url(self):
    """Build the raw.githubusercontent.com URL of the project's index.md.

    The URL is assembled from the key and default branch of
    ``self.owasp_repository`` under the OWASP organization.
    """
    repository = self.owasp_repository
    key, branch = repository.key, repository.default_branch
    return (
        "https://raw.githubusercontent.com/OWASP/"
        f"{key}/{branch}/index.md"
    )

def from_github(self, field_mapping, gh_repository, repository):
"""Update instance based on GitHub repository data."""
# Fetch project metadata from index.md file.
project_metadata = {}
try:
index_md = gh_repository.get_contents("index.md")
md_content = b64decode(index_md.content).decode()
yaml_content = re.search(r"^---\n(.*?)\n---", md_content, re.DOTALL)
index_md_content = get_repository_file_content(self.index_md_raw_url)
yaml_content = re.search(r"^---\n(.*?)\n---", index_md_content, re.DOTALL)
project_metadata = yaml.safe_load(yaml_content.group(1)) or {} if yaml_content else {}

# Direct fields.
Expand Down
8 changes: 0 additions & 8 deletions backend/apps/owasp/models/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,14 +154,6 @@ def is_indexable(self):
"""Projects to index."""
return self.is_active and self.has_active_repositories

@property
def raw_index_md_url(self):
"""Return project's raw index.md GitHub URL."""
return (
"https://raw.githubusercontent.com/OWASP/"
f"{self.owasp_repository.key}/{self.owasp_repository.default_branch}/index.md"
)

def deactivate(self):
"""Deactivate project."""
self.is_active = False
Expand Down
6 changes: 3 additions & 3 deletions backend/apps/slack/commands/contribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def handler(ack, say, command):
markdown(
(
f"\n*Here are top 10 most relevant issues "
f"that I found for*\n `{COMMAND} {search_query_escaped}`:\n"
f"that I found based on *\n `{COMMAND} {search_query_escaped}`:\n"
)
if search_query_escaped
else (
Expand All @@ -50,8 +50,8 @@ def handler(ack, say, command):
)
blocks.append(
markdown(
f"\n*{idx + 1}. {escape(issue['idx_project_name'])}*\n"
f"<{issue['idx_url']}|{escape(issue['idx_title'])}>\n"
f"\n*{idx + 1}.* <{issue['idx_url']}|*{escape(issue['idx_title'])}*>\n"
f"{escape(issue['idx_project_name'])}\n"
f"{escape(summary_truncated)}\n"
),
)
Expand Down
Loading

0 comments on commit d8dddc6

Please sign in to comment.