Sourcery Starbot ⭐ refactored thinhnggia/NLP-progress #1

Open · wants to merge 1 commit into base: master
66 changes: 27 additions & 39 deletions structured/export.py
@@ -39,9 +39,7 @@ def sanitize_subdataset_name(name:str):
     """
 
     name = name.replace("**", "")
-    if name.endswith(":"):
-        name = name[:-1]
-
+    name = name.removesuffix(":")
     return name.strip()

Function sanitize_subdataset_name refactored with the following changes:

  • Replace a conditional string slice with a call to str.removesuffix or str.removeprefix, where applicable (use-string-remove-affix)
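Reviewer note: `str.removesuffix` and `str.removeprefix` were added in Python 3.9 (PEP 616), so this change assumes a 3.9+ runtime. A minimal sketch of the equivalence:

```python
# removesuffix() strips the trailing colon only when present,
# matching the original endswith()/slice guard.
name = "Test set:"
assert name.removesuffix(":") == "Test set"
assert "Test set".removesuffix(":") == "Test set"  # no-op when the suffix is absent

# Pre-3.9 equivalent, as in the original code:
if name.endswith(":"):
    name = name[:-1]
```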


@@ -64,9 +62,7 @@ def extract_lines_before_tables(lines:List[str]):
             in_table = True
         elif in_table and not l.startswith("|"):
             in_table = False
-            before = None
-            if l.strip() != "":
-                before = l.strip()
+            before = l.strip() if l.strip() != "" else None
         elif l.strip() != "":
             before = l.strip()

Function extract_lines_before_tables refactored with the following changes:

  • Replace if statement with if expression (assign-if-exp)
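A minimal sketch of the equivalence, using a stand-in line value:

```python
l = "  Some caption  "

# Original form: default, then conditionally overwrite.
before = None
if l.strip() != "":
    before = l.strip()

# Refactored form: one assignment, same result.
before = l.strip() if l.strip() != "" else None

# Since an empty string is falsy, `l.strip() or None` would behave identically here.
assert ("".strip() or None) is None
```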

@@ -99,12 +95,13 @@ def handle_multiple_sota_table_exceptions(section:List[str], sota_tables:List[List[str]]):
         print("ERROR parsing the subdataset SOTA tables", file=sys.stderr)
         print(sota_tables, file=sys.stderr)
     else:
-        for i in range(len(subdatasets)):
-            out.append({
-                "subdataset": subdatasets[i],
-                "sota": extract_sota_table(sota_tables[i])
-            })
-
+        out.extend(
+            {
+                "subdataset": subdatasets[i],
+                "sota": extract_sota_table(sota_tables[i]),
+            }
+            for i in range(len(subdatasets))
+        )
     return out

Function handle_multiple_sota_table_exceptions refactored with the following changes:

  • Replace a for append loop with a call to list.extend (for-append-to-extend)
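For context, `list.extend` accepts any iterable, so the refactoring feeds it a generator expression instead of appending dicts one by one. A self-contained sketch with dummy data in place of the real SOTA tables:

```python
subdatasets = ["dev", "test"]
sota_tables = [["| Model | F1 |"], ["| Model | F1 |"]]

out = []
out.extend(
    {"subdataset": subdatasets[i], "sota": sota_tables[i]}
    for i in range(len(subdatasets))
)
assert out[0]["subdataset"] == "dev"

# zip() would express the same pairing without index bookkeeping:
# out.extend({"subdataset": s, "sota": t} for s, t in zip(subdatasets, sota_tables))
```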


@@ -153,7 +150,9 @@ def extract_paper_title_and_link(paper_md:str) -> Tuple:
     md_links = re.findall("\\[.*\\]\\(.*\\)", paper_md)
 
     if len(md_links) > 1:
-        print("WARNING: Found multiple paper references: `%s`, using only the first..." % paper_md)
+        print(
+            f"WARNING: Found multiple paper references: `{paper_md}`, using only the first..."
+        )
 
     if len(md_links) == 0:
         return None, None

Function extract_paper_title_and_link refactored with the following changes:

  • Replace interpolated string formatting with f-string (replace-interpolation-with-fstring)
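The `%`-to-f-string conversions throughout this diff are behaviour-preserving; a quick sketch of the equivalence:

```python
paper_md = "[Paper A](a.pdf) [Paper B](b.pdf)"

old = "WARNING: Found multiple paper references: `%s`, using only the first..." % paper_md
new = f"WARNING: Found multiple paper references: `{paper_md}`, using only the first..."
assert old == new  # identical output; f-strings (Python 3.6+) are also easier to read
```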

@@ -192,8 +191,6 @@ def extract_sota_table(table_lines:List[str]) -> Dict:
     :return:
     """
 
-    sota = {}
-
     header = table_lines[0]
     header_cols = [h.strip() for h in header.split("|") if h.strip()]
     cols_sanitized = [h.lower() for h in header_cols]

Function extract_sota_table refactored with the following changes:

  • Replace if statement with if expression (assign-if-exp)
  • Replace call to set() with set literal
  • Merge dictionary assignments into a single dictionary literal
  • Convert for loop into dictionary comprehension (dict-comprehension)
  • Replace interpolated string formatting with f-string (replace-interpolation-with-fstring)

This removes the following comments ( why? ):

  # extract all the metrics
@@ -216,19 +213,13 @@ def extract_sota_table(table_lines:List[str]) -> Dict:
         print("".join(table_lines), file=sys.stderr)
         return {}
 
-    if "code" in cols_sanitized:
-        code_inx = cols_sanitized.index("code")
-    else:
-        code_inx = None
-
-    metrics_inx = set(range(len(header_cols))) - set([model_inx, paper_inx, code_inx])
+    code_inx = cols_sanitized.index("code") if "code" in cols_sanitized else None
+    metrics_inx = set(range(len(header_cols))) - {model_inx, paper_inx, code_inx}
     metrics_inx = sorted(list(metrics_inx))
 
     metrics_names = [header_cols[i] for i in metrics_inx]
 
-    sota["metrics"] = metrics_names
-    sota["rows"] = []
-
+    sota = {"metrics": metrics_names, "rows": []}
     min_cols = len(header_cols)
 
     # now parse the table rows
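A sketch of the three micro-refactorings in this hunk, with toy header data standing in for a real SOTA table:

```python
header_cols = ["Model", "F1", "Paper / Source", "Code"]
cols_sanitized = [h.lower() for h in header_cols]
model_inx, paper_inx = 0, 2

# Conditional expression replaces the four-line if/else.
code_inx = cols_sanitized.index("code") if "code" in cols_sanitized else None

# A {…} set literal avoids building a throwaway list just to pass to set().
metrics_inx = set(range(len(header_cols))) - {model_inx, paper_inx, code_inx}
assert sorted(metrics_inx) == [1]

# A dict literal replaces two separate key assignments on an empty dict.
sota = {"metrics": [header_cols[i] for i in sorted(metrics_inx)], "rows": []}
assert sota["metrics"] == ["F1"]
```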
@@ -237,14 +228,16 @@ def extract_sota_table(table_lines:List[str]) -> Dict:
         row_cols = [h.strip() for h in row.split("|")][1:]
 
         if len(row_cols) < min_cols:
-            print("This row doesn't have enough columns, skipping: %s" % row, file=sys.stderr)
+            print(
+                f"This row doesn't have enough columns, skipping: {row}",
+                file=sys.stderr,
+            )
             continue
 
-        # extract all the metrics
-        metrics = {}
-        for i in range(len(metrics_inx)):
-            metrics[metrics_names[i]] = row_cols[metrics_inx[i]]
-
+        metrics = {
+            metrics_names[i]: row_cols[metrics_inx[i]]
+            for i in range(len(metrics_inx))
+        }
         # extract paper references
         paper_title, paper_link = extract_paper_title_and_link(row_cols[paper_inx])
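The loop-to-comprehension change, sketched with toy row data; `zip` over names and indices would be an equally idiomatic alternative:

```python
metrics_names = ["F1", "EM"]
metrics_inx = [1, 2]
row_cols = ["BERT", "93.2", "87.4", "[paper](x)"]

# Dict comprehension builds the metrics mapping in one expression.
metrics = {
    metrics_names[i]: row_cols[metrics_inx[i]]
    for i in range(len(metrics_inx))
}
assert metrics == {"F1": "93.2", "EM": "87.4"}

# Equivalent, without index arithmetic:
assert dict(zip(metrics_names, (row_cols[i] for i in metrics_inx))) == metrics
```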

@@ -343,9 +336,7 @@ def parse_markdown_file(md_file:str) -> List:
         if line.startswith("#"):
             if cur:
                 sections.append(cur)
-                cur = [line]
-            else:
-                cur = [line]
+            cur = [line]
         else:
             cur.append(line)

Function parse_markdown_file refactored with the following changes:

  • Hoist repeated code outside conditional statement (hoist-statement-from-if)
  • Use named expression to simplify assignment and conditional (use-named-expression)

This removes the following comments ( why? ):

  # see if there is an arxiv link in the first paragraph of the description

@@ -402,30 +393,27 @@ def parse_markdown_file(md_file:str) -> List:
             print("ERROR: Unexpected dataset without a parent task at %s:#%d" %
                   (md_file, get_line_no(sections, section_index)), file=sys.stderr)
 
+        # new dataset and add
+        ds = {}
         if st is not None:
             # we are in a subtask, add everything here
             if "datasets" not in st:
                 st["datasets"] = []
 
-            # new dataset and add
-            ds = {}
             st["datasets"].append(ds)
         else:
             # we are in a task, add here
             if "datasets" not in t:
                 t["datasets"] = []
 
-            ds = {}
             t["datasets"].append(ds)
 
         ds["dataset"] = header[3:].strip()
         # dataset description is everything that's not a table
         desc, tables = extract_dataset_desc_and_sota_table(section[1:])
         ds["description"] = "".join(desc).strip()
 
-        # see if there is an arxiv link in the first paragraph of the description
-        dataset_links = extract_dataset_desc_links(desc)
-        if dataset_links:
+        if dataset_links := extract_dataset_desc_links(desc):
             ds["dataset_links"] = dataset_links
 
         if tables:
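The named expression (`:=`, Python 3.8+, PEP 572) folds the assignment into the condition while keeping the name in scope afterwards. A minimal sketch, with a stand-in for the real extract_dataset_desc_links helper:

```python
def extract_dataset_desc_links(desc):
    # Stand-in for the real helper: pretend the description contains one link.
    return ["https://arxiv.org/abs/0000.00000"] if desc else []

ds = {}
desc = ["Some dataset description with a link."]

# Walrus form: assign and test in one step.
if dataset_links := extract_dataset_desc_links(desc):
    ds["dataset_links"] = dataset_links

assert ds["dataset_links"] == ["https://arxiv.org/abs/0000.00000"]
```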
@@ -456,7 +444,7 @@ def parse_markdown_directory(path:str):
 
     out = []
     for md_file in md_files:
-        print("Processing `%s`..." % md_file)
+        print(f"Processing `{md_file}`...")
         out.extend(parse_markdown_file(os.path.join(path, md_file)))
 
     return out

Function parse_markdown_directory refactored with the following changes:

  • Replace interpolated string formatting with f-string (replace-interpolation-with-fstring)