From 1142c8ce9cbe803b8d97a34ca43d528ef2672d72 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 11:31:49 +0100 Subject: [PATCH 1/3] Lighten dependencies and add changelog --- .github/CONTRIBUTING.md | 3 +++ .github/changelog_template.md | 8 ++++++ .github/fetch_version.py | 13 ++++++++++ .github/get-changelog-diff.sh | 2 ++ .github/has-functional-changes.sh | 12 +++++++++ .github/is-version-number-acceptable.sh | 33 +++++++++++++++++++++++++ .github/publish-git-tag.sh | 4 +++ .github/workflows/ci_cd.yaml | 29 ++++++++++++++++++++++ changelog.yaml | 5 ++++ changelog_entry.yaml | 4 +++ pyproject.toml | 6 ++--- 11 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 .github/CONTRIBUTING.md create mode 100644 .github/changelog_template.md create mode 100644 .github/fetch_version.py create mode 100755 .github/get-changelog-diff.sh create mode 100755 .github/has-functional-changes.sh create mode 100755 .github/is-version-number-acceptable.sh create mode 100755 .github/publish-git-tag.sh create mode 100644 changelog.yaml create mode 100644 changelog_entry.yaml diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 0000000..5b5d24e --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,3 @@ +## Updating data + +If your changes present a non-bugfix change to one or more datasets which are cloud-hosted (FRS and EFRS), then please change both the filename and URL (in both the class definition file and in `storage/upload_completed_datasets.py`). This enables us to store historical versions of datasets separately and reproducibly. diff --git a/.github/changelog_template.md b/.github/changelog_template.md new file mode 100644 index 0000000..8a1e679 --- /dev/null +++ b/.github/changelog_template.md @@ -0,0 +1,8 @@ +# Changelog + +All notable changes to this project will be documented in this file. 
"""Print the installed package version for the release helper scripts."""

import sys


def fetch_version():
    """Return the package version string, or None if it cannot be loaded.

    The import happens inside the function so that a missing or broken
    package is actually caught here; with a module-level import the failure
    would occur before this function could ever run.

    Returns:
        The value of ``__version__``, or ``None`` when the import fails.
    """
    try:
        # NOTE(review): the rest of this repository is ``policyengine_uk_data``;
        # confirm that importing ``policyengine_us_data`` here is intentional.
        from policyengine_us_data.__version__ import __version__
    except Exception as e:  # importing executes arbitrary module code
        # Report on stderr: callers capture stdout as the version string.
        print(f"Error fetching version: {e}", file=sys.stderr)
        return None
    return __version__


if __name__ == "__main__":
    version = fetch_version()
    if version is None:
        # Keep stdout empty and signal failure through the exit status.
        sys.exit(1)
    print(version)
#! /usr/bin/env bash
#
# CI guard run on pull requests: verifies that a branch with functional
# changes also bumps the version and touches CHANGELOG.md.
#
# Exit status:
#   0  nothing to check (master, or no functional changes) or all checks pass
#   1  the current version already resolves to an existing git revision
#   2  functional changes were made but CHANGELOG.md was not modified

# Directory holding this script, quoted so paths with spaces keep working.
script_dir=$(dirname "${BASH_SOURCE[0]}")

if [[ ${GITHUB_REF#refs/heads/} == master ]]
then
    echo "No need for a version check on master."
    exit 0
fi

# has-functional-changes.sh exits non-zero when nothing functional changed.
if ! "$script_dir/has-functional-changes.sh"
then
    echo "No need for a version update."
    exit 0
fi

current_version=$(python .github/fetch_version.py)

# If the version already resolves to a revision, it has been published before.
# An empty version (fetch failed) simply does not resolve, as before.
if git rev-parse --verify --quiet "$current_version"
then
    echo "Version $current_version already exists in commit:"
    git --no-pager log -1 "$current_version"
    echo
    echo "Update the version number in setup.py before merging this branch into master."
    echo "Look at the CONTRIBUTING.md file to learn how the version number should be updated."
    exit 1
fi

# The helper prints the changed file names; CHANGELOG.md must be among them.
if ! "$script_dir/has-functional-changes.sh" | grep --quiet CHANGELOG.md
then
    echo "CHANGELOG.md has not been modified, while functional changes were made."
    echo "Explain what you changed before merging this branch into master."
    echo "Look at the CONTRIBUTING.md file to learn how to write the changelog."
    exit 2
fi
/usr/bin/env bash + +git tag `python .github/fetch_version.py` # create a new tag +git push --tags || true # update the repository version diff --git a/.github/workflows/ci_cd.yaml b/.github/workflows/ci_cd.yaml index 3db31f2..4df3e8c 100644 --- a/.github/workflows/ci_cd.yaml +++ b/.github/workflows/ci_cd.yaml @@ -68,6 +68,35 @@ jobs: run: make data - name: Run tests run: pytest + check-version: + name: Check version + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch all history for all tags and branches + repository: ${{ github.event.pull_request.head.repo.full_name }} + ref: ${{ github.event.pull_request.head.ref }} + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Build changelog + run: pip install "yaml-changelog>=0.1.7" && make changelog + - name: Preview changelog update + run: ".github/get-changelog-diff.sh" + - name: Check version number has been properly updated + run: ".github/is-version-number-acceptable.sh" + - name: Update changelog + uses: EndBug/add-and-commit@v9 + with: + add: "." + committer_name: Github Actions[bot] + author_name: Github Actions[bot] + message: Update PolicyEngine US data + github_token: ${{ secrets.POLICYENGINE_GITHUB }} docker: name: Docker diff --git a/changelog.yaml b/changelog.yaml new file mode 100644 index 0000000..8944a4b --- /dev/null +++ b/changelog.yaml @@ -0,0 +1,5 @@ +- changes: + added: + - Initialized changelogging + date: 2024-09-09 17:29:10 + version: 1.0.0 diff --git a/changelog_entry.yaml b/changelog_entry.yaml new file mode 100644 index 0000000..f3b708c --- /dev/null +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + changed: + - Lightened dependency list. 
diff --git a/pyproject.toml b/pyproject.toml index 640810d..2ecfff7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,6 @@ license = {file = "LICENSE"} requires-python = ">=3.6" dependencies = [ "policyengine_core", - "tables", - "survey_enhance", - "torch", "requests", "tqdm", "tabulate", @@ -29,6 +26,9 @@ dev = [ "pytest", "policyengine_uk>=1.8.0", "streamlit", + "survey_enhance", + "torch", + "tables", ] [tool.setuptools] From 84ab3ff508ead2588174a203f86b9e17ffd064cc Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 11:59:00 +0100 Subject: [PATCH 2/3] Add missing uploads --- Makefile | 1 + .../datasets/frs/enhanced_frs.py | 2 ++ .../datasets/frs/extended_frs.py | 1 + .../storage/upload_completed_datasets.py | 17 +++++++++++++++++ 4 files changed, 21 insertions(+) diff --git a/Makefile b/Makefile index b254604..ed38cdd 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ documentation: data: python policyengine_uk_data/datasets/frs/dwp_frs.py python policyengine_uk_data/datasets/frs/frs.py + python policyengine_uk_data/datasets/frs/enhanced_frs.py build: python -m build diff --git a/policyengine_uk_data/datasets/frs/enhanced_frs.py b/policyengine_uk_data/datasets/frs/enhanced_frs.py index 28b873b..6a773e9 100644 --- a/policyengine_uk_data/datasets/frs/enhanced_frs.py +++ b/policyengine_uk_data/datasets/frs/enhanced_frs.py @@ -38,6 +38,7 @@ class ReweightedFRS_2022_23(EnhancedFRS): input_frs = FRS_2022_23 time_period = 2022 end_year = 2022 + url = "release://PolicyEngine/ukda/reweighted_frs_2022_23.h5" class EnhancedFRS_2022_23(EnhancedFRS): @@ -100,4 +101,5 @@ def loss(weights): if __name__ == "__main__": + ReweightedFRS_2022_23().generate() EnhancedFRS_2022_23().generate() diff --git a/policyengine_uk_data/datasets/frs/extended_frs.py b/policyengine_uk_data/datasets/frs/extended_frs.py index a421d0d..085ef38 100644 --- a/policyengine_uk_data/datasets/frs/extended_frs.py +++ b/policyengine_uk_data/datasets/frs/extended_frs.py @@ 
-94,6 +94,7 @@ class ExtendedFRS_2022_23(ExtendedFRS): data_format = Dataset.TIME_PERIOD_ARRAYS input_frs = FRS_2022_23 time_period = 2022 + url = "release://PolicyEngine/ukda/extended_frs_2022_23.h5" if __name__ == "__main__": diff --git a/policyengine_uk_data/storage/upload_completed_datasets.py b/policyengine_uk_data/storage/upload_completed_datasets.py index 8d97ba1..6d6490b 100644 --- a/policyengine_uk_data/storage/upload_completed_datasets.py +++ b/policyengine_uk_data/storage/upload_completed_datasets.py @@ -1,4 +1,21 @@ from policyengine_uk_data.utils.github import upload from pathlib import Path +from tqdm import tqdm FOLDER = Path(__file__).parent + +FILES = [ + "cps_2022_23.h5", + "enhanced_frs_2022_23.h5", + "extended_frs_2022_23.h5", + "reweighted_frs_2022_23.h5", +] + +for file in tqdm(FILES): + upload( + "PolicyEngine", + "ukda", + "release", + file, + FOLDER / file, + ) From 153339bc0d05f46eec90d5955eef9a330d6000cf Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 12:17:18 +0100 Subject: [PATCH 3/3] Fix bug in uploads --- .../storage/upload_completed_datasets.py | 2 +- policyengine_uk_data/utils/github.py | 77 +++++++------------ 2 files changed, 28 insertions(+), 51 deletions(-) diff --git a/policyengine_uk_data/storage/upload_completed_datasets.py b/policyengine_uk_data/storage/upload_completed_datasets.py index 6d6490b..513bbbb 100644 --- a/policyengine_uk_data/storage/upload_completed_datasets.py +++ b/policyengine_uk_data/storage/upload_completed_datasets.py @@ -5,7 +5,7 @@ FOLDER = Path(__file__).parent FILES = [ - "cps_2022_23.h5", + "frs_2022_23.h5", "enhanced_frs_2022_23.h5", "extended_frs_2022_23.h5", "reweighted_frs_2022_23.h5", diff --git a/policyengine_uk_data/utils/github.py b/policyengine_uk_data/utils/github.py index 43a05e2..27c88e1 100644 --- a/policyengine_uk_data/utils/github.py +++ b/policyengine_uk_data/utils/github.py @@ -1,8 +1,6 @@ import os import requests from tqdm import tqdm -from requests.adapters 
def upload(
    org: str,
    repo: str,
    release_tag: str,
    file_name: str,
    file_path: str,
    timeout: float = 300.0,
) -> "dict | None":
    """Upload a local file as a GitHub release asset unless it already exists.

    Args:
        org: GitHub organisation (or user) owning the repository.
        repo: Repository name.
        release_tag: Tag of the release to attach the asset to.
        file_name: Name the asset should have in the release.
        file_path: Path of the local file to upload (anything ``open`` accepts).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The parsed JSON response describing the created asset, or ``None``
        when an asset called ``file_name`` is already present in the release.

    Raises:
        ValueError: If listing the assets does not return 200, or the upload
            does not return 201.
    """
    release_id = get_release_id(org, repo, release_tag)

    # The asset listing is paginated (30 per page by default), so ask for the
    # maximum page size and follow the "next" links; otherwise an existing
    # asset beyond the first page would be missed and the upload rejected.
    page_url = f"https://api.github.com/repos/{org}/{repo}/releases/{release_id}/assets"
    page_params = {"per_page": 100}
    while page_url:
        listing = requests.get(
            page_url,
            headers=auth_headers,
            params=page_params,
            timeout=timeout,
        )
        if listing.status_code != 200:
            # Fail with the API's own message instead of a TypeError while
            # iterating over an error payload.
            raise ValueError(
                f"Invalid response code {listing.status_code} for url {listing.url}. Received: {listing.text}"
            )
        if any(asset["name"] == file_name for asset in listing.json()):
            print(
                f"Asset {file_name} already exists in release {release_tag} of {org}/{repo}, skipping."
            )
            return None
        page_url = listing.links.get("next", {}).get("url")
        page_params = None  # the "next" URL already carries its query string

    url = f"https://uploads.github.com/repos/{org}/{repo}/releases/{release_id}/assets"

    headers = {
        "Accept": "application/vnd.github.v3+json",
        "Content-Type": "application/octet-stream",
        **auth_headers,
    }

    with open(file_path, "rb") as f:
        data = f.read()

    response = requests.post(
        url,
        headers=headers,
        params={"name": file_name},  # let requests URL-encode the asset name
        data=data,
        timeout=timeout,
    )

    if response.status_code != 201:
        raise ValueError(
            f"Invalid response code {response.status_code} for url {response.url}. Received: {response.text}"
        )

    return response.json()