Skip to content

Commit

Permalink
Add 2024Q2; explore some
Browse files Browse the repository at this point in the history
  • Loading branch information
tjpalmer committed Jul 3, 2024
1 parent af8a513 commit ce65226
Show file tree
Hide file tree
Showing 7 changed files with 38,153 additions and 30,706 deletions.
18 changes: 12 additions & 6 deletions process/ghmerge.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class Row(typ.TypedDict):
repo: str


def extract_data(data: dict[str, Repo | None]) -> pd.DataFrame:
def extract_data(data: dict[str, Repo | None], path: pth.Path) -> pd.DataFrame:
rows = []
for obj in data.values():
if obj is not None:
Expand All @@ -42,6 +42,7 @@ def extract_data(data: dict[str, Repo | None]) -> pd.DataFrame:
"found": True,
"lang": lang["name"] if lang else "",
"repo": obj["nameWithOwner"],
"path": path,
}
rows.append(row)
return pd.DataFrame(rows)
Expand All @@ -60,20 +61,24 @@ def extract_errors(errors: list[Error]) -> pd.DataFrame:

# Load every entry of directory `dir` as a JSON response, turn each response
# into DataFrame rows, and return the concatenated, de-duplicated result.
#
# NOTE(review): this span is a flattened diff hunk (header: -60,20 +61,24).
# The +/- markers and the indentation were lost, so removed and added lines
# sit side by side and the text is not runnable as shown.
def load_projects(dir: str) -> pd.DataFrame:
parts = []
# Captured once so each file path can be expressed relative to it below.
cwd = pth.Path.cwd()
for path in pth.Path(dir).iterdir():
with open(path) as input:
try:
response = json.load(input)
except:
# Name the file that failed to parse, then re-raise the same exception.
print(f"Error reading: {path}")
raise
# NOTE(review): the next four lines look like the pre-commit (removed) side
# of the hunk: data and error frames were both collected and de-duplicated
# on all columns. Confirm against the repository history.
data = extract_data(response.get("data") or {})
errors = extract_errors(response.get("errors", []))
parts += [data, errors]
return pd.concat(parts).drop_duplicates()
# NOTE(review): the remaining lines look like the post-commit (added) side:
# the file path is made relative to cwd and passed to extract_data, the
# error frames are no longer collected (left commented out), and duplicates
# are judged on "found"/"lang"/"repo" only -- presumably so the new "path"
# column does not make otherwise identical rows distinct; confirm.
path = path.resolve().relative_to(cwd)
data = extract_data(response.get("data") or {}, path=path)
# errors = extract_errors(response.get("errors", []))
# parts += [data, errors]
parts.append(data)
return pd.concat(parts).drop_duplicates(subset=["found", "lang", "repo"])


def main():
pd.set_option("display.max_columns", None)
parser = argparse.ArgumentParser()
parser.add_argument("--jsondir", required=True)
parser.add_argument("--output", required=True)
Expand All @@ -91,7 +96,8 @@ def run(*, args: Args):
print("Contradictions")
print(contras)
# Then arbitrarily move on.
# I had 4 repos in this category at the moment.
# I had 4 repos in this category early on. Now many.
contras.to_csv(pth.Path(args["output"]).parent / "contras.csv", index=False)
langs = langs.drop_duplicates(subset="repo")
langs.to_csv(args["output"], index=False)

Expand Down
6 changes: 3 additions & 3 deletions process/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ class Args(typ.TypedDict):
''
) as repo
from (
select * from `githubarchive.month.202401` union all
select * from `githubarchive.month.202402` union all
select * from `githubarchive.month.202403`
select * from `githubarchive.month.202404` union all
select * from `githubarchive.month.202405` union all
select * from `githubarchive.month.202406`
) event
where event.type in (
'IssuesEvent', 'PullRequestEvent', 'WatchEvent'
Expand Down
Loading

0 comments on commit ce65226

Please sign in to comment.