Skip to content

Commit

Permalink
Organized indentation
Browse files Browse the repository at this point in the history
  • Loading branch information
wilsonfreitas committed Mar 30, 2022
1 parent 49441d3 commit 4b209b8
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 68 deletions.
14 changes: 8 additions & 6 deletions cranscrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,15 @@
'https://cran.r-project.org/web/packages/bizdays/index.html',
]


def get_data(url, timeout=30):
    """Download a CRAN package page and extract the GitHub link it contains.

    The page text is searched with the module-level pattern ``reu``
    (defined outside this excerpt).

    Args:
        url: Address of the CRAN package page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled server would block the whole
            scrape indefinitely.

    Returns:
        A dict with keys ``cran`` (the requested URL), ``github`` (the full
        text matched by ``reu``) and ``repo`` (its first capture group).
        ``github`` and ``repo`` are empty strings when the page has no match.
    """
    res = requests.get(url, timeout=timeout)
    m = reu.search(res.text)
    if m:
        return dict(cran=url, github=m.group(0), repo=m.group(1))
    return dict(cran=url, github='', repo='')


# Scrape each CRAN page listed in `urls`, one after another, and gather the
# resulting per-package dicts into a DataFrame (one row per URL).
all_data = [get_data(url) for url in urls]
df = pd.DataFrame(all_data)
Expand Down
123 changes: 62 additions & 61 deletions parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,83 +10,84 @@
# using an access token
g = Github(os.environ['GITHUB_ACCESS_TOKEN'])


# Matches a bare GitHub repository URL and captures its "owner/name" part.
# Raw string so the regex escapes are not parsed as (invalid) string escapes;
# the dots of the host name are escaped so they only match a literal ".".
_GITHUB_REPO_RE = re.compile(r'^https://github\.com/([\w-]+/[-\w.]+)$')


def extract_repo(url):
    """Return the ``owner/name`` part of a GitHub repository URL.

    Only URLs of the exact form ``https://github.com/<owner>/<name>`` are
    recognised; anything with extra path segments, a trailing slash, or a
    different host yields an empty string.

    Args:
        url: The URL to inspect.

    Returns:
        The ``owner/name`` string, or ``''`` when ``url`` is not a bare
        GitHub repository URL.
    """
    m = _GITHUB_REPO_RE.match(url)
    return m.group(1) if m else ''


def get_last_commit(repo):
    """Return the date of the first commit listed for a GitHub repository.

    The lookup goes through the module-level GitHub client ``g``.

    Args:
        repo: Repository identifier in ``owner/name`` form, or an empty
            value when the project is not hosted on GitHub.

    Returns:
        The author date of the first commit returned by ``get_commits()``,
        formatted as ``YYYY-MM-DD``; ``''`` when ``repo`` is empty; or the
        string ``'error'`` when the lookup fails for any reason.
    """
    if not repo:
        return ''
    try:
        r = g.get_repo(repo)
        cs = r.get_commits()
        return cs[0].commit.author.date.strftime('%Y-%m-%d')
    except Exception:
        # Best effort: one failing repository must not abort the whole run.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        print('ERROR' + repo)
        return 'error'


class Project(Thread):
    """Thread that gathers the metadata of one project entry.

    The entry is given as a regex match whose groups 1, 2 and 3 hold the
    project name, its URL and its description. After the thread has run,
    the collected record is available in ``regs``.
    """

    def __init__(self, match, section):
        """Store the entry to process.

        Args:
            match: Regex match object for the entry (groups 1-3 are read).
            section: Section label recorded alongside the project.
        """
        super().__init__()
        self._match = match
        # Filled in by run(); stays None until the thread has finished.
        self.regs = None
        self._section = section

    def run(self):
        """Build the project record and store it in ``self.regs``."""
        name, url, description = self._match.group(1, 2, 3)
        repo = extract_repo(url)
        self.regs = dict(
            project=name,
            section=self._section,
            last_commit=get_last_commit(repo),
            url=url,
            description=description,
            # Plain substring checks on the URL.
            github='github.com' in url,
            cran='cran.r-project.org' in url,
            repo=repo,
        )


projects = []

# Raw strings keep the regex escapes (\s, \[, \() from being parsed as
# invalid string escapes.
ret = re.compile(r'^(#+) (.*)$')                      # markdown heading
rex = re.compile(r'^\s*- \[(.*)\]\((.*)\) - (.*)$')   # "- [name](url) - description"

with open('README.md', 'r', encoding='utf8') as f:
    # Stack of (level, title) pairs for the headings enclosing the current
    # line, outermost first.
    headings = []
    for line in f:
        m = rex.match(line)
        if m:
            # The first heading on the stack is left out of the section path.
            section = ' > '.join(title for _, title in headings[1:])
            p = Project(m, section)
            p.start()
            projects.append(p)
            continue
        m = ret.match(line)
        if m:
            level = len(m.group(1))
            # Discard headings at the same or a deeper level before pushing
            # the new one. Checking the stack itself (instead of popping a
            # computed count) cannot pop from an empty list when heading
            # levels are skipped or the file starts below level 1.
            while headings and headings[-1][0] >= level:
                headings.pop()
            headings.append((level, m.group(2)))

# Wait for every worker; join() blocks without burning CPU, unlike polling
# is_alive() in a tight loop.
for p in projects:
    p.join()

projects = [p.regs for p in projects]
df = pd.DataFrame(projects)
Expand Down
3 changes: 2 additions & 1 deletion topic.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@
for repo in repos:
    # Stop at the first repository with fewer than 1000 stars.
    # NOTE(review): this lists every 1000+ star repository only if `repos`
    # is ordered by star count, descending — confirm where `repos` is built.
    if repo.stargazers_count < 1000:
        break
    print(repo.name, repo.stargazers_count, repo.language, repo.html_url,
          repo.description, repo.updated_at, repo.archived)

0 comments on commit 4b209b8

Please sign in to comment.