From 67753cb7c0923e44e6f4c0a66b09ba3bae1fd6f9 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Wed, 1 Jan 2025 22:39:53 +0100 Subject: [PATCH 01/16] Move paper+volume BibTeX creation into create_hugo_data.py --- bin/create_hugo_data.py | 75 +++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 22 deletions(-) diff --git a/bin/create_hugo_data.py b/bin/create_hugo_data.py index d46bc7bab8..cd702b9f17 100755 --- a/bin/create_hugo_data.py +++ b/bin/create_hugo_data.py @@ -15,13 +15,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Usage: create_hugo_data.py [--importdir=DIR] [--exportdir=DIR] [-c] [--debug] [--dry-run] +"""Usage: create_hugo_data.py [--importdir=DIR] [--exportdir=DIR] [options] Creates Hugo data files containing all necessary Anthology data for the website generation. +This will write JSON data files to `{exportdir}/data/` as well as volume-level BibTeX files +to `{exportdir}/data-export/volumes/`. + Options: --importdir=DIR Directory to import XML files from. [default: {scriptdir}/../data/] - --exportdir=DIR Directory to write data files to. [default: {scriptdir}/../build/data/] + --exportdir=DIR Directory to write build files to. [default: {scriptdir}/../build/] + --bib-limit=N Only generate bibliographic information for the first N papers per volume. + Setting the environment variable NOBIB=true is equivalent to --bib-limit=3. --debug Output debug-level log messages. -c, --clean Delete existing files in target directory before generation. -n, --dry-run Do not write data files (useful for debugging). @@ -55,6 +60,7 @@ ) +BIBLIMIT = False ENCODER = msgspec.json.Encoder() SCRIPTDIR = os.path.dirname(os.path.realpath(__file__)) @@ -126,6 +132,8 @@ def paper_to_dict(paper): editors = [ person_to_dict(paper.root.resolve(ns).id, ns) for ns in paper.get_editors() ] + if not BIBLIMIT or int(paper.id) <= BIBLIMIT: + data["bibtex"] = paper.to_bibtex(with_abstract=True) if paper.is_frontmatter: # Editors are considered authors for the frontmatter if editors: @@ -273,13 +281,14 @@ def volume_to_dict(volume): return data -def export_papers_and_volumes(anthology, outdir, dryrun): +def export_papers_and_volumes(anthology, builddir, dryrun): all_volumes = {} with make_progress() as progress: paper_count = sum(1 for _ in anthology.papers()) task = progress.add_task("Exporting papers...", total=paper_count) for collection in anthology.collections.values(): collection_papers = {} + volume_bibtex = {} for volume in collection.volumes(): # Compute volume-level information that gets appended to every paper # TODO: Could this be changed in the Hugo templates to @@ -287,6 +296,7 @@ def export_papers_and_volumes(anthology, outdir, dryrun): # this information on every paper? # --- this also applies to some information from paper_to_dict() # which may be fetched from the volume if not set for the paper + volume_bibtex[volume.full_id] = [] volume_data = { "booktitle": volume.title.as_text(), "parent_volume_id": volume.full_id, @@ -307,24 +317,34 @@ def export_papers_and_volumes(anthology, outdir, dryrun): data = paper_to_dict(paper) data.update(volume_data) collection_papers[paper.full_id] = data + if "bibtex" in data: + volume_bibtex[volume.full_id].append( + paper.to_bibtex(with_abstract=False) + ) # We build the volume data separately since it uses slightly # different fields than what gets attached to papers all_volumes[volume.full_id] = volume_to_dict(volume) if not dryrun: - with open(f"{outdir}/papers/{collection.id}.json", "wb") as f: + with open(f"{builddir}/data/papers/{collection.id}.json", "wb") as f: f.write(ENCODER.encode(collection_papers)) + for volume_id, bibtex in volume_bibtex.items(): + with open( + f"{builddir}/data-export/volumes/{volume_id}.bib", "w" + ) as f: + print("\n".join(bibtex), file=f) + progress.update(task, advance=len(collection_papers)) # Export volumes if not dryrun: - with open(f"{outdir}/volumes.json", "wb") as f: + with open(f"{builddir}/data/volumes.json", "wb") as f: f.write(ENCODER.encode(all_volumes)) -def export_people(anthology, outdir, dryrun): +def export_people(anthology, builddir, dryrun): with make_progress() as progress: # Just to make progress bars nicer ppl_count = sum(1 for _ in anthology.people.items()) @@ -379,12 +399,12 @@ def export_people(anthology, outdir, dryrun): if not dryrun: for first_letter, people_list in people.items(): - with open(f"{outdir}/people/{first_letter}.json", "wb") as f: + with open(f"{builddir}/data/people/{first_letter}.json", "wb") as f: f.write(ENCODER.encode(people_list)) progress.update(task, advance=100) -def export_venues(anthology, outdir, dryrun): +def export_venues(anthology, builddir, dryrun): all_venues = {} print("Exporting venues...") for venue_id, venue in anthology.venues.items(): @@ -416,11 +436,11 @@ def export_venues(anthology, outdir, dryrun): all_venues[venue_id] = data if not dryrun: - with open("{}/venues.json".format(outdir), "wb") as f: + with open(f"{builddir}/data/venues.json", "wb") as f: f.write(ENCODER.encode(all_venues)) -def export_events(anthology, outdir, dryrun): +def export_events(anthology, builddir, dryrun): # Export events all_events = {} print("Exporting events...") @@ -473,11 +493,11 @@ def export_events(anthology, outdir, dryrun): all_events[event.id] = data if not dryrun: - with open(f"{outdir}/events.json", "wb") as f: + with open(f"{builddir}/data/events.json", "wb") as f: f.write(ENCODER.encode(all_events)) -def export_sigs(anthology, outdir, dryrun): +def export_sigs(anthology, builddir, dryrun): all_sigs = {} print("Exporting SIGs...") for sig in anthology.sigs.values(): @@ -502,27 +522,32 @@ def export_sigs(anthology, outdir, dryrun): all_sigs[sig.acronym] = data if not dryrun: - with open("{}/sigs.json".format(outdir), "wb") as f: + with open(f"{builddir}/data/sigs.json", "wb") as f: f.write(ENCODER.encode(all_sigs)) -def export_anthology(anthology, outdir, clean=False, dryrun=False): +def export_anthology(anthology, builddir, clean=False, dryrun=False): """ - Dumps files in build/data/*.json. These files are used in conjunction with the hugo - page stubs created by create_hugo_pages.py to instantiate Hugo templates. + Dumps files in build/data/*.json, which are used by Hugo templates + to generate the website, as well as build/data-export/volumes/*.bib, + which are used later as a basis to generate more bibliographic files. """ # Create directories if not dryrun: for subdir in ("", "papers", "people"): - target_dir = "{}/{}".format(outdir, subdir) + target_dir = "{}/data/{}".format(builddir, subdir) + if not check_directory(target_dir, clean=clean): + return + for subdir in ("", "volumes"): + target_dir = "{}/data-export/{}".format(builddir, subdir) if not check_directory(target_dir, clean=clean): return - export_papers_and_volumes(anthology, outdir, dryrun) - export_people(anthology, outdir, dryrun) - export_venues(anthology, outdir, dryrun) - export_events(anthology, outdir, dryrun) - export_sigs(anthology, outdir, dryrun) + export_papers_and_volumes(anthology, builddir, dryrun) + export_people(anthology, builddir, dryrun) + export_venues(anthology, builddir, dryrun) + export_events(anthology, builddir, dryrun) + export_sigs(anthology, builddir, dryrun) if __name__ == "__main__": @@ -540,6 +565,12 @@ def export_anthology(anthology, outdir, clean=False, dryrun=False): log_level = log.DEBUG if args["--debug"] else log.INFO tracker = setup_rich_logging(level=log_level) + if limit := args["--bib-limit"]: + BIBLIMIT = int(limit) + elif os.environ.get("NOBIB", "false") == "true": + BIBLIMIT = 3 + log.info("NOBIB=true, setting --bib-limit=3") + # This "freezes" the config, resulting in a massive speed-up OmegaConf.resolve(config) From be873538f041c2cb1fcc893eb6ce3ad7e85d49d8 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Wed, 1 Jan 2025 22:40:20 +0100 Subject: [PATCH 02/16] Move MODS+Endnote generation into create_bibtex.py, rename to create_bib.py --- Makefile | 58 ++-------- bin/bib2xml_wrapper | 20 ---- bin/create_bib.py | 264 +++++++++++++++++++++++++++++++++++++++++++ bin/create_bibtex.py | 194 ------------------------------- bin/xml2end_wrapper | 17 --- 5 files changed, 274 insertions(+), 279 deletions(-) delete mode 100755 bin/bib2xml_wrapper create mode 100755 bin/create_bib.py delete mode 100755 bin/create_bibtex.py delete mode 100755 bin/xml2end_wrapper diff --git a/Makefile b/Makefile index c1d616d49c..0d0a752ceb 100644 --- a/Makefile +++ b/Makefile @@ -97,10 +97,6 @@ ifeq ($(HUGO_VERSION_TOO_LOW),true) $(error "incorrect hugo version installed! Need hugo 0.$(HUGO_VERSION_MIN), but only found hugo 0.$(HUGO_VERSION)!") endif -# check whether bibtools are installed; used by the endnote and mods targets. -HAS_XML2END=$(shell which xml2end > /dev/null && echo true || echo false) -HAS_BIB2XML=$(shell which bib2xml > /dev/null && echo true || echo false) - VENV := "venv/bin/activate" @@ -167,63 +163,29 @@ build/.data: build/.basedirs $(sourcefiles) venv/bin/activate . $(VENV) && python3 bin/create_hugo_data.py --clean @touch build/.data -.PHONY: bibtex -bibtex: build/.bibtex - -.PHONY: mods -mods: build/.mods - -.PHONY: endnote -endnote: build/.endnote +.PHONY: bib +bib: build/.bib ####################################################### -build/.bibtex: build/.basedirs $(sourcefiles) venv/bin/activate - @echo "INFO Creating BibTeX files..." - . $(VENV) && python3 bin/create_bibtex.py --clean - @touch build/.bibtex - # Disable citation targets (except for 3 bibtex per volume) by setting NOBIB=true ifeq (true, $(NOBIB)) -$(info WARNING: not creating citation materials; this is not suitable for release!) -build/.mods: build/.bibtex - touch build/.mods -build/.endnote: build/.bibtex - touch build/.endnote +$(info WARNING: not creating full citation materials; this is not suitable for release!) +build/.bib: + @touch build/.bib else -build/.mods: build/.bibtex - @if [ $(HAS_BIB2XML) = false ]; then \ - echo "bib2xml not found, please install bibtools"; \ - echo "alternatively, build the site without endnote files by running make hugo"; \ - exit 1; \ - fi - @echo "INFO Converting BibTeX files to MODS XML..." - @find build/data-export -name '*.bib' -print0 | \ - xargs -0 -n 1 -P 8 bin/bib2xml_wrapper >/dev/null - @touch build/.mods - -build/.endnote: build/.mods - @if [ $(HAS_XML2END) = false ]; then \ - echo "xml2end not found, please install bibtools"; \ - echo "alternatively, build the site without endnote files by running make hugo"; \ - exit 1; \ - fi - @echo "INFO Converting MODS XML files to EndNote..." - @find build/data-export -name '*.xml' -print0 | \ - xargs -0 -n 1 -P 8 bin/xml2end_wrapper >/dev/null - @touch build/.endnote +build/.bib: build/.basedirs build/.data venv/bin/activate + @echo "INFO Creating extra bibliographic files..." + . $(VENV) && python3 bin/create_bib.py --clean + @touch build/.bib endif # end if block to conditionally disable bibtex generation ####################################################### - -%.endf: %.xml - xml2end $< 2>&1 > $@ - .PHONY: hugo hugo: build/.hugo -build/.hugo: build/.static build/.data build/.bibtex build/.mods build/.endnote +build/.hugo: build/.static build/.data build/.bib @echo "INFO Running Hugo... this may take a while." @cd build && \ hugo -b $(ANTHOLOGYHOST)/$(ANTHOLOGYDIR) \ diff --git a/bin/bib2xml_wrapper b/bin/bib2xml_wrapper deleted file mode 100755 index 1c39513805..0000000000 --- a/bin/bib2xml_wrapper +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/sh -# -# Copyright 2019 Marcel Bollmann -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e - -bib2xml -nt $1 2>/dev/null > ${1%.bib}.xml -echo 1 diff --git a/bin/create_bib.py b/bin/create_bib.py new file mode 100755 index 0000000000..a9bac0288b --- /dev/null +++ b/bin/create_bib.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright 2019-2024 Marcel Bollmann +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Usage: create_bib.py [--builddir=DIR] [-c] [--max-workers=N] [--debug] + +Creates anthology.bib files and MODS/Endnote formats for all papers in the Hugo directory. + +Options: + --builddir=DIR Directory with build files; used both for reading and writing. [default: {scriptdir}/../build/] + --debug Output debug-level log messages. + -c, --clean Delete existing files in target directory before generation. + -n, --max-workers=N Maximum number of subprocesses that will be spawned. + -h, --help Display this helpful text. +""" + +import concurrent.futures +import datetime +from docopt import docopt +import gzip +import logging as log +import os +import msgspec +from pathlib import Path +import re +from rich.progress import track +import shutil +import subprocess + +from acl_anthology import config +from acl_anthology.utils.ids import infer_year +from acl_anthology.utils.logging import setup_rich_logging +from create_hugo_data import make_progress + + +BIB2XML = None +XML2END = None + + +def create_bibtex(builddir, clean=False) -> None: + """Create full Anthology BibTeX files. + + Requires volume bib files from create_hugo_data.py (for file without abstracts). + Requires data files from create_hugo_data.py (for file with abstracts). + """ + with ( + open( + f"{builddir}/data-export/anthology.bib", "wt", encoding="utf-8" + ) as file_anthology_raw, + gzip.open( + f"{builddir}/data-export/anthology.bib.gz", "wt", encoding="utf-8" + ) as file_anthology, + ): + # Add a header to each consolidated bibfile + for outfh in file_anthology_raw, file_anthology: + print( + f"% {config.url_prefix}/{Path(outfh.name).name} generated on {datetime.date.today().isoformat()}\n", + file=outfh, + ) + + # Add some shortcuts to the uncompressed consolidated bib file + print( + "@string{acl = {Association for Computational Linguistics}}", + file=file_anthology_raw, + ) + print(f"@string{{anth = {{{config.url_prefix}/}}}}", file=file_anthology_raw) + print(file=file_anthology_raw) + + for volume_file in track( + sorted( + Path(f"{builddir}/data-export/volumes").glob("*.bib"), + key=lambda p: (infer_year(p.stem), p.stem), + reverse=True, + ), + description="Create anthology.bib.gz... ", + ): + # reset this each time + abbrev = None + volume_id = volume_file.stem + + with open(volume_file, "r") as f: + bibtex = f.read() + print(bibtex, file=file_anthology) + + # Space saver (https://github.com/acl-org/acl-anthology/issues/3016) for the + # uncompressed consolidated bibfile. + # Replace verbose text with abbreviations to get the file under 50 MB for Overleaf + concise_contents = bibtex.replace( + 'publisher = "Association for Computational Linguistics",', + "publisher = acl,", + ) + concise_contents = re.sub( + rf'url = "{config.url_prefix}/(.*)"', + r"url = anth # {\1}", + concise_contents, + ) + + # Abbreviate the booktitle by extracting it and printing it before + # the first entry in each volume + if concise_contents.startswith("@proceedings"): + # Grab the title string and create the alias + first_bibkey_comp = re.match( + r'@proceedings{([a-z0-9]*)-', concise_contents + ).group(1) + abbrev = f"{first_bibkey_comp.upper()}:{infer_year(volume_id)}:{volume_id.split('-')[-1]}" + try: + booktitle = re.match( + r"@proceedings{[a-z0-9-]*,\n title = \"(.*)\",", + concise_contents, + ).group(1) + print( + f"@string{{{abbrev} = {{{booktitle}}}}}", + file=file_anthology_raw, + ) + except AttributeError: + log.warning(f"Could not find title for {volume_id}") + abbrev = None + + if abbrev is not None and "booktitle" in concise_contents: + # substitute the alias for the booktitle + concise_contents = re.sub( + r" booktitle = (\".*\"),", + f" booktitle = {abbrev},", + concise_contents, + ) + + # Remove whitespace to save space and keep things under 50 MB + concise_contents = re.sub(r",\n +", ",", concise_contents) + concise_contents = re.sub(r" and\n +", " and ", concise_contents) + concise_contents = re.sub(r",\n}", "}", concise_contents) + + print(concise_contents, file=file_anthology_raw) + + with gzip.open( + f"{builddir}/data-export/anthology+abstracts.bib.gz", "wt", encoding="utf-8" + ) as file_anthology_with_abstracts: + for collection_file in track( + sorted( + Path(f"{builddir}/data/papers").glob("*.json"), + key=lambda p: (infer_year(p.stem), p.stem), + reverse=True, + ), + description=" +abstracts.bib.gz... ", + ): + with open(collection_file, "rb") as f: + data = msgspec.json.decode(f.read()) + + # bibtex = "\n".join(entry["bibtex"] for entry in data.values() if "bibtex" in entry) + # print(bibtex, file=file_anthology_with_abstracts) + + for entry in data.values(): + if bibtex := entry.get("bibtex"): + print(bibtex, file=file_anthology_with_abstracts) + + +def convert_bibtex(builddir, max_workers=None): + """Convert BibTeX into other bibliographic formats, and add them to the data files. + + Requires data files from create_hugo_data.py. + """ + files = list(Path(f"{builddir}/data/papers").glob("*.json")) + + with make_progress() as progress: + task = progress.add_task("Convert to MODS & Endnote...", total=len(files)) + + with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = [executor.submit(convert_collection_file, file) for file in files] + for _ in concurrent.futures.as_completed(futures): + progress.update(task, advance=1) + + +def convert_collection_file(collection_file): + """Read a single collection data file, convert its BibTeX entries to MODS and Endnote formats, and save those back into the file. + + Important: + This function should not rely on global objects, as it will be executed concurrently for different files with multiprocessing. + """ + + with open(collection_file, "rb") as f: + data = msgspec.json.decode(f.read()) + + entries = [entry for entry in data.values() if entry.get("bibtex")] + if not entries: + return + + bibtex = "\n".join(entry["bibtex"] for entry in entries) + mods_batch, endf_batch = batch_convert_to_mods_and_endf(bibtex, collection_file.name) + assert len(entries) == len(mods_batch) == len(endf_batch) + for entry, mods, endf in zip(entries, mods_batch, endf_batch): + entry["mods"] = mods + entry["endf"] = endf + + with open(collection_file, "wb") as f: + f.write(msgspec.json.encode(data)) + + +def batch_convert_to_mods_and_endf(bibtex, context): + """Convert a BibTeX string with multiple entries to MODS and Endnote. + + Relies on bibutils to perform the conversion, then returns a list with the individual converted entries. + """ + mods = subprocess.run( + [BIB2XML, "-nt"], + input=bibtex, + capture_output=True, + text=True, + ) + log.debug(f"{context}: {mods.stderr.strip()}") + endf = subprocess.run( + [XML2END], + input=mods.stdout, + capture_output=True, + text=True, + ) + log.debug(f"{context}: {endf.stderr.strip()}") + + mods_header, *mods_entries = re.split(r"= log.ERROR: + exit(1) diff --git a/bin/create_bibtex.py b/bin/create_bibtex.py deleted file mode 100755 index 05bec2b952..0000000000 --- a/bin/create_bibtex.py +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# -# Copyright 2019-2024 Marcel Bollmann -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Usage: create_bibtex.py [--importdir=DIR] [--exportdir=DIR] [-c] [--debug] - -Creates .bib files for all papers in the Hugo directory. - -Options: - --importdir=DIR Directory to import XML files from. [default: {scriptdir}/../data/] - --exportdir=DIR Directory to write exported files to. [default: {scriptdir}/../build/data-export/] - --debug Output debug-level log messages. - -c, --clean Delete existing files in target directory before generation. - -h, --help Display this helpful text. -""" - -import re -import gzip -import logging as log -import os -import datetime - -from docopt import docopt -from omegaconf import OmegaConf -from pathlib import Path -from rich.progress import track - -from acl_anthology import Anthology, config -from acl_anthology.utils.logging import setup_rich_logging -from create_hugo_data import check_directory - - -def create_bibtex(anthology, trgdir, limit=0, clean=False) -> None: - """Creates .bib files for all papers. - - :param anthology: The Anthology object. - :param trgdir: The target directory to write to - :param limit: If nonzero, only generate {limit} entries per volume - :param clean: Clean the directory first - """ - if not check_directory("{}/papers".format(trgdir), clean=clean): - return - if not check_directory("{}/volumes".format(trgdir), clean=clean): - return - - log.debug("Creating BibTeX files for all papers...") - with ( - open( - "{}/anthology.bib".format(trgdir), "wt", encoding="utf-8" - ) as file_anthology_raw, - gzip.open( - "{}/anthology.bib.gz".format(trgdir), "wt", encoding="utf-8" - ) as file_anthology, - gzip.open( - "{}/anthology+abstracts.bib.gz".format(trgdir), "wt", encoding="utf-8" - ) as file_anthology_with_abstracts, - ): - # Add a header to each consolidated bibfile - for outfh in file_anthology_raw, file_anthology, file_anthology_with_abstracts: - print( - f"% https://aclanthology.org/{Path(outfh.name).name} generated on {datetime.date.today().isoformat()}\n", - file=outfh, - ) - - # Add some shortcuts to the uncompressed consolidated bib file - print( - "@string{acl = {Association for Computational Linguistics}}", - file=file_anthology_raw, - ) - print("@string{anth = {https://aclanthology.org/}}", file=file_anthology_raw) - print(file=file_anthology_raw) - - for volume in track( - sorted( - anthology.volumes(), key=lambda vol: (vol.year, vol.full_id), reverse=True - ), - description="Creating BibTeX files...", - ): - # reset this each time - abbrev = None - - volume_dir = trgdir - if not os.path.exists(volume_dir): - os.makedirs(volume_dir) - with open( - "{}/volumes/{}.bib".format(trgdir, volume.full_id), "w" - ) as file_volume: - for i, paper in enumerate(volume.values(), 1): - if limit and i > limit: - break - - with open( - "{}/{}.bib".format(volume_dir, paper.full_id), "w" - ) as file_paper: - contents = paper.to_bibtex(with_abstract=True) - print(contents, file=file_paper) - print(contents, file=file_anthology_with_abstracts) - - concise_contents = paper.to_bibtex() - print(concise_contents, file=file_volume) - print(concise_contents, file=file_anthology) - - # Space saver (https://github.com/acl-org/acl-anthology/issues/3016) for the - # uncompressed consolidated bibfile. - # Replace verbose text with abbreviations to get the file under 50 MB for Overleaf - concise_contents = concise_contents.replace( - 'publisher = "Association for Computational Linguistics",', - "publisher = acl,", - ) - concise_contents = re.sub( - r'url = "https://aclanthology.org/(.*)"', - r"url = anth # {\1}", - concise_contents, - ) - - # Abbreviate the booktitle by extracting it and printing it before - # the first entry in each volume - if concise_contents.startswith("@proceedings"): - # Grab the title string and create the alias - abbrev = ( - f"{volume.venue_ids[0].upper()}:{volume.year}:{volume.id}" - ) - try: - booktitle = re.search( - r" title = \"(.*)\",", concise_contents - ).group(1) - print( - f"@string{{{abbrev} = {{{booktitle}}}}}", - file=file_anthology_raw, - ) - except AttributeError: - - log.warning(f"Could not find title for {volume.full_id}") - abbrev = None - - if abbrev is not None and "booktitle" in concise_contents: - # substitute the alias for the booktitle - concise_contents = re.sub( - r" booktitle = (\".*\"),", - f" booktitle = {abbrev},", - concise_contents, - ) - - # Remove whitespace to save space and keep things under 50 MB - concise_contents = re.sub(r",\n +", ",", concise_contents) - concise_contents = re.sub(r" and\n +", " and ", concise_contents) - concise_contents = re.sub(r",\n}", "}", concise_contents) - - print(concise_contents, file=file_anthology_raw) - - -if __name__ == "__main__": - args = docopt(__doc__) - scriptdir = os.path.dirname(os.path.abspath(__file__)) - if "{scriptdir}" in args["--importdir"]: - args["--importdir"] = os.path.abspath( - args["--importdir"].format(scriptdir=scriptdir) - ) - if "{scriptdir}" in args["--exportdir"]: - args["--exportdir"] = os.path.abspath( - args["--exportdir"].format(scriptdir=scriptdir) - ) - - log_level = log.DEBUG if args["--debug"] else log.INFO - tracker = setup_rich_logging(level=log_level) - - # This "freezes" the config, resulting in a massive speed-up - OmegaConf.resolve(config) - - # If NOBIB is set, generate only three bibs per volume - limit = 0 if os.environ.get("NOBIB", "false") == "false" else 3 - if limit != 0: - log.info(f"NOBIB=true, generating only {limit} BibTEX files per volume") - - anthology = Anthology(datadir=args["--importdir"]).load_all() - if tracker.highest >= log.ERROR: - exit(1) - - create_bibtex(anthology, args["--exportdir"], limit=limit, clean=args["--clean"]) - if tracker.highest >= log.ERROR: - exit(1) diff --git a/bin/xml2end_wrapper b/bin/xml2end_wrapper deleted file mode 100755 index 63f8877768..0000000000 --- a/bin/xml2end_wrapper +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh -# -# Copyright 2019 Martin Villalba -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -xml2end $1 2>&1 > ${1%.xml}.endf From 1d74ed44c696fac634b491f466e78bc40f2ef84f Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Wed, 1 Jan 2025 23:22:14 +0100 Subject: [PATCH 03/16] Change Hugo templates to read bib from data, add FileSaver.js --- hugo/layouts/_default/baseof.html | 2 +- hugo/layouts/papers/list-entry-author.html | 2 + hugo/layouts/papers/list-entry.html | 2 + hugo/layouts/papers/single.html | 58 ++++--- hugo/static/js/FileSaver.js | 188 +++++++++++++++++++++ 5 files changed, 225 insertions(+), 27 deletions(-) create mode 100644 hugo/static/js/FileSaver.js diff --git a/hugo/layouts/_default/baseof.html b/hugo/layouts/_default/baseof.html index 125c02053e..c2f3c118de 100644 --- a/hugo/layouts/_default/baseof.html +++ b/hugo/layouts/_default/baseof.html @@ -48,7 +48,7 @@ // abstract toggling -- used on multiple pages, so defined here if ($("#toggle-all-abstracts")) { - $("#toggle-all-abstracts").click( + $("#toggle-all-abstracts").on("click", function() { var target = $("#toggle-all-abstracts"); target.attr("disabled", true); diff --git a/hugo/layouts/papers/list-entry-author.html b/hugo/layouts/papers/list-entry-author.html index d32424cc27..06a72fc8b1 100644 --- a/hugo/layouts/papers/list-entry-author.html +++ b/hugo/layouts/papers/list-entry-author.html @@ -7,11 +7,13 @@ pdf {{- end -}} + {{- with $paper.abstract_html -}} {{- end -}} diff --git a/hugo/layouts/papers/list-entry.html b/hugo/layouts/papers/list-entry.html index bfe19aa249..98abfab33f 100644 --- a/hugo/layouts/papers/list-entry.html +++ b/hugo/layouts/papers/list-entry.html @@ -8,11 +8,13 @@ pdf {{- end -}} + {{- with $paper.abstract_html -}} {{- end -}} diff --git a/hugo/layouts/papers/single.html b/hugo/layouts/papers/single.html index c52134c560..90dc3fb228 100644 --- a/hugo/layouts/papers/single.html +++ b/hugo/layouts/papers/single.html @@ -40,6 +40,7 @@ {{ $volume_id := index (split .Params.anthology_id "-") 0 }} {{ $paper := index (index .Site.Data.papers $volume_id) .Params.anthology_id }} +