From 0735d3728dc36321f54917da81ca933c50fa5434 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 13 Feb 2025 19:14:57 -0800 Subject: [PATCH 1/3] Add GH workflow for running analysis This is just a basic proof-of-concept for now. Is an actions runner beefy enough for this? What about costs with all the S3 requests from outside AWS? --- .editorconfig | 3 ++ .github/workflows/analyze.yml | 64 +++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 .github/workflows/analyze.yml diff --git a/.editorconfig b/.editorconfig index 6389927..f5ef29b 100644 --- a/.editorconfig +++ b/.editorconfig @@ -6,3 +6,6 @@ trim_trailing_whitespace = true [*.js] indent_size = 2 + +[*.yml] +indent_size = 2 diff --git a/.github/workflows/analyze.yml b/.github/workflows/analyze.yml new file mode 100644 index 0000000..49ffcfe --- /dev/null +++ b/.github/workflows/analyze.yml @@ -0,0 +1,64 @@ +name: Analyze + +env: + TEST_PATTERN: '*justice.gov/*' + +on: + pull_request: {} + workflow_dispatch: + inputs: + threshold: + description: 'Threshold' + required: false + type: string + pattern: + description: 'Pattern' + required: false + type: string + default: '' + from: + description: 'From Time' + required: false + type: string + default: '' + to: + description: 'To Time' + required: false + type: string + default: '' + +jobs: + analyze: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: pip + + - name: Install Dependencies + run: pip install -r requirements.txt + + - name: Analyze! + run: | + # FIXME: set up readability running in a background process. 
+ # We probably need to bring the code over from + # web-monitoring-changed-terms-analysis + python generate_task_sheets.py \ + --output out \ + --after '${{ inputs.from || '240' }}' \ + --before '${{ inputs.to || '0' }}' \ + --threshold '${{ inputs.threshold || '0.25' }}' \ + --pattern '${{ inputs.pattern || env.TEST_PATTERN }}' \ + --skip-readability + + - name: Upload Results + uses: actions/upload-artifact@v4 + with: + name: output + path: out + if-no-files-found: error + # TODO: what's appropriate retention here? + # retention-days: 1 From a66a9d0b753a99711bdfcf674e74f0aceb2be11d Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 13 Feb 2025 19:24:13 -0800 Subject: [PATCH 2/3] Install system deps --- .github/workflows/analyze.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/analyze.yml b/.github/workflows/analyze.yml index 49ffcfe..6d1eed3 100644 --- a/.github/workflows/analyze.yml +++ b/.github/workflows/analyze.yml @@ -38,7 +38,13 @@ jobs: python-version: '3.10' cache: pip - - name: Install Dependencies + - name: Install System Dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + gcc g++ pkg-config libxml2-dev libxslt-dev libz-dev + + - name: Install Python Dependencies run: pip install -r requirements.txt - name: Analyze! From f8cc8fe5e8a031ed5467a6a0860988ba34710561 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 13 Feb 2025 19:32:01 -0800 Subject: [PATCH 3/3] Don't forget NLTK --- .github/workflows/analyze.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/analyze.yml b/.github/workflows/analyze.yml index 6d1eed3..023ca32 100644 --- a/.github/workflows/analyze.yml +++ b/.github/workflows/analyze.yml @@ -47,6 +47,10 @@ jobs: - name: Install Python Dependencies run: pip install -r requirements.txt + - name: Download NLTK Corpora + run: | + python -m nltk.downloader stopwords + - name: Analyze! 
run: | # FIXME: set up readability running in a background process.