Making this work again #1

Open
wants to merge 2 commits into master
155 changes: 12 additions & 143 deletions GitHub-issues-to-pdf.py
@@ -1,134 +1,14 @@
### This script scrapes the issues for a github project, and saves each one as a PDF.

import pdfkit
import requests
import re
import os
from bs4 import BeautifulSoup

from datetime import datetime


# OPTIONS:

# Repository to fetch from (e.g. jackjamieson2/GitHub-issues-to-pdf)
repository = 'jackjamieson2/yarns-indie-reader'

# Output directory to save PDFs
output_dir = 'Exported PDFs/' + repository + "/"

# Generate automatic tags (True/False)
generate_auto_tags = True  # If True, adds automatically generated tags to the
# bottom of the PDF in the form ###[tag]. See the autotags() function for details.


print("starting...")
# Autotags
def autotags(soup):
    referenced = False
    commit_found = False
    tags = "<br><h1>Tags</h1>"
    tags += "<br>###status: " + soup.select(".TableObject-item .State")[0].text

    for item in soup.select('.discussion-item'):
        if str(item).find('This was referenced') >= 0:
            referenced = True
        if str(item).find('referenced this issue') >= 0:
            referenced = True
        if str(item).find('id="ref-commit-') >= 0:
            commit_found = True
    if referenced == True:
        tags += "<br>###referenced"
    if commit_found == True:
        tags += "<br>###referenced_in_commit"

    for item in soup.select('.labels a'):
        tags += "<br>###current_label: " + item.text

    for item in soup.select('.IssueLabel a'):
        tags += "<br>###past_or_present_label: " + item.text

    participants_N = len(soup.select('.participant-avatar'))
    if participants_N == 1:
        tags += "<br>###1_participant"
    elif participants_N == 2:
        tags += "<br>###2_participants"
    elif participants_N > 2:
        tags += "<br>###>=3_participants" + " (" + str(participants_N) + ")"

    for item in soup.select('.participant-avatar'):
        participant_name = re.sub("/", "", item.get('href'))
        tags += "<br>###participant: " + participant_name

    for item in soup.select('.assignee'):
        tags += "<br>###assignee: " + item.text

    return tags
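
For a sense of what this produces: a hypothetical closed issue with one label and two participants (the names below are made up for illustration) would yield a tag block roughly like:

<br><h1>Tags</h1>
<br>###status: Closed
<br>###current_label: bug
<br>###2_participants
<br>###participant: jackjamieson2
<br>###participant: octocat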

def log_error(error):
    if not os.path.isfile(output_dir + "error_log.txt"):
        # Log file does not exist, so write an explanatory header
        with open(output_dir + "error_log.txt", "a") as myfile:
            myfile.write("Errors reported for the following URLs, please check to ensure the generated PDFs are correct.")
    with open(output_dir + "error_log.txt", "a") as myfile:
        myfile.write("\n\n" + str(datetime.now()) + "\n" + error)
    # No explicit close() is needed: the with block closes the file
    return
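
The resulting error_log.txt then looks roughly like this (timestamp and URL are illustrative):

Errors reported for the following URLs, please check to ensure the generated PDFs are correct.

2019-03-14 12:00:00.000000
https://github.com/jackjamieson2/yarns-indie-reader/issues/7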

# Options passed through to wkhtmltopdf
options = {
    'dpi': '300'  # This zooms in to make the PDFs more readable (recommended)
}
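
Other wkhtmltopdf flags can be passed the same way. As an optional variation (not part of this patch, assuming standard wkhtmltopdf flags):

options = {
    'dpi': '300',
    'page-size': 'A4',  # corresponds to wkhtmltopdf --page-size
    'quiet': ''         # suppress wkhtmltopdf progress output
}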

# Look up how many issues the repository has
issue_count = 0
r = requests.get('https://github.com/' + repository + '/issues?q=is%3Aissue')
if r.status_code == 200:
    soup = BeautifulSoup(r.content, "lxml")
    issue = soup.find(class_="js-issue-row")
    # The newest issue row has id "issue_<N>"; GitHub numbers issues and pull
    # requests in one sequence, so N may exceed the actual number of issues.
    # Missing numbers are skipped later by the show_issue check.
    issue_count = int(re.sub('issue_', "", issue.get('id')))
    print(str(issue_count) + " issues found")

    # Create the output folder if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    errors = []
    # Iterate through each issue page
    for i in range(1, issue_count + 1):
        url = 'https://github.com/' + repository + '/issues/' + str(i)
        r = requests.get(url)
        if r.status_code == 200:
            print('\nConverting page to PDF: ' + url)
            c = r.text
            # Strip versioning numbers from <link> paths (e.g. example.css?1234 -> example.css)
            # This is needed to avoid an error with wkhtmltopdf
            # see thread at https://github.com/wkhtmltopdf/wkhtmltopdf/issues/2051
            html = re.sub(r'(\.css|\.js)\?[^"]+', r'\1', c)
            soup = BeautifulSoup(html, "lxml")
            html_head = str(soup.head)
            html_body = str(soup.find(class_='repohead'))
            html_body = str(html_body) + str(soup.find(id='show_issue'))
            if generate_auto_tags == True:
                tags = autotags(soup)
            else:
                tags = ""

            full_html = html_head + html_body + tags

            try:
### This script scrapes the issues for a github project, and saves each one as a PDF.

import pdfkit
import requests
import re
import os
from bs4 import BeautifulSoup

from datetime import datetime


# OPTIONS:

# GitHub instance to fetch from (github.com or a self-hosted/Enterprise host)
instance = 'github.com'
# Extra request headers; a logged-in session cookie here gives access to private repositories
headers = {'Cookie': ''}

# Repository to fetch from (e.g. jackjamieson2/GitHub-issues-to-pdf)
repository = 'jackjamieson2/yarns-indie-reader'
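
These two options are the substance of this patch: every request below is now built from instance and sent with headers. A hypothetical configuration for a private repository on a self-hosted instance (host and cookie value are placeholders) might be:

instance = 'github.example.com'
headers = {'Cookie': 'user_session=<value copied from a logged-in browser session>'}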
@@ -202,7 +82,7 @@ def log_error(error):

# Look up how many issues the repository has
issue_count = 0
r = requests.get('https://github.com/' + repository + '/issues?q=is%3Aissue')
r = requests.get('https://' + instance + '/' + repository + '/issues?q=is%3Aissue', headers=headers)
if r.status_code == 200:
    soup = BeautifulSoup(r.content, "lxml")
    issue = soup.find(class_="js-issue-row")
@@ -216,8 +96,8 @@ def log_error(error):
    errors = []
    # Iterate through each issue page
    for i in range(1, issue_count + 1):
        url = 'https://github.com/' + repository + '/issues/' + str(i)
        r = requests.get(url)
        url = 'https://' + instance + '/' + repository + '/issues/' + str(i)
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            print('\nConverting page to PDF: ' + url)
            c = r.text
@@ -229,36 +109,25 @@ def log_error(error):
            html_head = str(soup.head)
            html_body = str(soup.find(class_='repohead'))
            html_body = str(html_body) + str(soup.find(id='show_issue'))
            if generate_auto_tags == True:
                tags = autotags(soup)
            else:
                tags = ""

            full_html = html_head + html_body + tags

            try:
                if generate_auto_tags == True:
                    tags = autotags(soup)
                else:
                    tags = ""

                full_html = html_head + html_body + tags

                if soup.find(id='show_issue'):
                    pdfkit.from_string(full_html, output_dir + str(i) + '.pdf', options=options)
                else:
                    print('\nIssue does not exist: ' + url)
            except:
                log_error(url)

        elif r.status_code == 404:
            print('\n404 not found: ' + url)

    print('\n\nFinished!\nSaved PDFs for ' + str(i) + ' issues.')
    print('Find your exported PDFs in ' + output_dir)
else:
    print("Repository not found: " + repository)

            except:
                log_error(url)

        elif r.status_code == 404:
            print('\n404 not found: ' + url)

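A note for anyone trying the patched script: pdfkit is a wrapper around the wkhtmltopdf binary, so wkhtmltopdf must be installed and on the PATH. A typical run (assuming Python 3 and the dependencies the script imports) would be:

pip install pdfkit requests beautifulsoup4 lxml
python GitHub-issues-to-pdf.py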