From 6a4cec4f789bcfd3611dabd29acaa9d1a2c30236 Mon Sep 17 00:00:00 2001 From: Jelle Teijema Date: Tue, 7 Jan 2025 14:17:40 +0100 Subject: [PATCH] Introduce project folder argument (#68) Key Updates Project Folder and Paths Introduced a -p flag for specifying the project folder, and -d for data. Updated templates to use relative paths for outputs and scripts. Refactored path handling using a new ProjectPaths class for better consistency. Template and Workflow Improvements Removed the output folder flag for templates. Improved prior dataset handling, including renaming and pathing fixes. Documentation and Examples Added detailed examples and extra info to the README. Updated examples to reflect the new -p and -d flag and path handling. Workflow Enhancements Added steps to run scitree before and after jobs execution. Fixed issues with running jobs in workflows. General Refactoring Cleaned up file handlers and folder names for clarity. Standardized platform checks and snake_case naming conventions. Miscellaneous Added .venv to .gitignore. Improved code quality with ruff. 
--- .github/workflows/ci-workflow.yml | 51 ++-- .github/workflows/test_data/labels.csv | 4 + .github/workflows/test_data/priors_labels.csv | 16 ++ .gitignore | 12 +- README.md | 264 ++++++++++++------ asreviewcontrib/makita/entrypoint.py | 244 ++++++++++------ asreviewcontrib/makita/template_arfi.py | 6 +- asreviewcontrib/makita/template_base.py | 40 +-- asreviewcontrib/makita/template_basic.py | 30 +- asreviewcontrib/makita/template_multimodel.py | 6 +- asreviewcontrib/makita/template_prior.py | 12 +- .../makita/templates/doc_README.md.template | 16 +- examples/README.md | 28 +- 13 files changed, 468 insertions(+), 261 deletions(-) create mode 100644 .github/workflows/test_data/priors_labels.csv diff --git a/.github/workflows/ci-workflow.yml b/.github/workflows/ci-workflow.yml index 26ae2293..4fe141f1 100644 --- a/.github/workflows/ci-workflow.yml +++ b/.github/workflows/ci-workflow.yml @@ -27,33 +27,20 @@ jobs: - name: Lint python with ruff run: | ruff check . - - name: Create directories using Python - run: python -c "import os; [os.makedirs(path, exist_ok=True) for path in ['./tmp/basic/data-test', './tmp/arfi/data', './tmp/prior/data', './tmp/multimodel/data', './tmp/scripts', './tmp/synergy/data']]" - - name: set up environment - run: | - cp .github/workflows/test_data/labels.csv ./tmp/basic/data-test/labels.csv - cp .github/workflows/test_data/labels.csv ./tmp/arfi/data/labels.csv - cp .github/workflows/test_data/labels.csv ./tmp/prior/data/labels.csv - cp .github/workflows/test_data/labels.csv ./tmp/prior/data/prior_labels.csv - cp .github/workflows/test_data/labels.csv ./tmp/multimodel/data/labels.csv - name: Render makita templates run: | - cd tmp/basic - asreview makita template basic --classifier nb --feature_extractor tfidf --query_strategy max --n_runs 1 -s data-test -o output-test --init_seed 1 --model_seed 2 --skip_wordclouds --overwrite --instances_per_query 2 --stop_if min --balance_strategy double | tee output.txt - grep -q "ERROR" output.txt && 
exit 1 || true - cd ../arfi - asreview makita template arfi | tee output.txt - grep -q "ERROR" output.txt && exit 1 || true - cd ../prior - asreview makita template prior | tee output.txt - grep -q "ERROR" output.txt && exit 1 || true - cd ../multimodel - asreview makita template multimodel | tee output.txt - grep -q "ERROR" output.txt && exit 1 || true + asreview makita template basic -p basic -d .github/workflows/test_data/ --classifier nb --feature_extractor tfidf --query_strategy max --n_runs 1 --init_seed 1 --model_seed 2 --skip_wordclouds --overwrite --instances_per_query 2 --stop_if min --balance_strategy double | tee output_basic.txt + grep -q "ERROR" output_basic.txt && exit 1 || true + asreview makita template arfi -p arfi -d .github/workflows/test_data/ | tee output_arfi.txt + grep -q "ERROR" output_arfi.txt && exit 1 || true + asreview makita template prior -p prior -d .github/workflows/test_data/ | tee output_prior.txt + grep -q "ERROR" output_prior.txt && exit 1 || true + asreview makita template multimodel -p multimodel -d .github/workflows/test_data/ | tee output_multimodel.txt + grep -q "ERROR" output_multimodel.txt && exit 1 || true - name: Render makita scripts run: | - asreview makita add-script --all -o ./tmp/scripts | tee output.txt - grep -q "ERROR" output.txt && exit 1 || true + asreview makita add-script --all -o ./tmp/scripts | tee output_scripts.txt + grep -q "ERROR" output_scripts.txt && exit 1 || true - name: Run SciTree if: ${{ matrix.os != 'windows-latest' }} run: | @@ -62,8 +49,20 @@ jobs: - name: Execute basic template jobs file if: ${{ matrix.os != 'windows-latest' }} run: | - cd tmp/synergy - synergy_dataset get -d van_de_Schoot_2018 -o ./data -l - asreview makita template basic --instances_per_query 100 --skip_wordclouds --overwrite --n_runs 2 + mkdir basic_run + cd basic_run + asreview makita template basic -d ../.github/workflows/test_data/ --instances_per_query 100 --skip_wordclouds --overwrite --n_runs 1 + scitree sh jobs.sh 
scitree + - name: Upload output files + if: always() + uses: actions/upload-artifact@v3 + with: + name: output-files + path: | + output_basic.txt + output_arfi.txt + output_prior.txt + output_multimodel.txt + output_scripts.txt \ No newline at end of file diff --git a/.github/workflows/test_data/labels.csv b/.github/workflows/test_data/labels.csv index 6b9a991d..c8a42193 100644 --- a/.github/workflows/test_data/labels.csv +++ b/.github/workflows/test_data/labels.csv @@ -1,5 +1,9 @@ Unnamed: 0,abstract,alternate_title2,alternate_title3,file_attachments2,authors,id,issn,keywords,note,number,place_published,title,publication_year,publisher,secondary_authors,start_page,type_of_reference,label_included,url,volume 0,"BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example.com,"['Marx, Karl', 'Lindgren, Astrid']",12345,1932-6208,"['Pippi', 'Nordwind', 'Piraten']",1008150341,3,United States,Title of reference,2014//,Fun Factory,"['Glattauer, Daniel']",e0815,JOUR,1,http://example_url.com,9 +0,"BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. 
Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example.com,"['Marx, Karl', 'Lindgren, Astrid']",12345,1932-6208,"['Pippi', 'Nordwind', 'Piraten']",1008150341,3,United States,Title of reference,2014//,Fun Factory,"['Glattauer, Daniel']",e0815,JOUR,1,http://example_url.com,9 +0,"BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example.com,"['Marx, Karl', 'Lindgren, Astrid']",12345,1932-6208,"['Pippi', 'Nordwind', 'Piraten']",1008150341,3,United States,Title of reference,2014//,Fun Factory,"['Glattauer, Daniel']",e0815,JOUR,1,http://example_url.com,9 +0,"BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example.com,"['Marx, Karl', 'Lindgren, Astrid']",12345,1932-6208,"['Pippi', 'Nordwind', 'Piraten']",1008150341,3,United States,Title of reference,2014//,Fun Factory,"['Glattauer, Daniel']",e0815,JOUR,1,http://example_url.com,9 +0,"BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. 
Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example.com,"['Marx, Karl', 'Lindgren, Astrid']",12345,1932-6208,"['Pippi', 'Nordwind', 'Piraten']",1008150341,3,United States,Title of reference,2014//,Fun Factory,"['Glattauer, Daniel']",e0815,JOUR,1,http://example_url.com,9 1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. 
Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 diff --git a/.github/workflows/test_data/priors_labels.csv b/.github/workflows/test_data/priors_labels.csv new file mode 100644 index 00000000..c8a42193 --- /dev/null +++ b/.github/workflows/test_data/priors_labels.csv @@ -0,0 +1,16 @@ +Unnamed: 0,abstract,alternate_title2,alternate_title3,file_attachments2,authors,id,issn,keywords,note,number,place_published,title,publication_year,publisher,secondary_authors,start_page,type_of_reference,label_included,url,volume +0,"BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. 
In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example.com,"['Marx, Karl', 'Lindgren, Astrid']",12345,1932-6208,"['Pippi', 'Nordwind', 'Piraten']",1008150341,3,United States,Title of reference,2014//,Fun Factory,"['Glattauer, Daniel']",e0815,JOUR,1,http://example_url.com,9 +0,"BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example.com,"['Marx, Karl', 'Lindgren, Astrid']",12345,1932-6208,"['Pippi', 'Nordwind', 'Piraten']",1008150341,3,United States,Title of reference,2014//,Fun Factory,"['Glattauer, Daniel']",e0815,JOUR,1,http://example_url.com,9 +0,"BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example.com,"['Marx, Karl', 'Lindgren, Astrid']",12345,1932-6208,"['Pippi', 'Nordwind', 'Piraten']",1008150341,3,United States,Title of reference,2014//,Fun Factory,"['Glattauer, Daniel']",e0815,JOUR,1,http://example_url.com,9 +0,"BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. 
Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example.com,"['Marx, Karl', 'Lindgren, Astrid']",12345,1932-6208,"['Pippi', 'Nordwind', 'Piraten']",1008150341,3,United States,Title of reference,2014//,Fun Factory,"['Glattauer, Daniel']",e0815,JOUR,1,http://example_url.com,9 +0,"BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example.com,"['Marx, Karl', 'Lindgren, Astrid']",12345,1932-6208,"['Pippi', 'Nordwind', 'Piraten']",1008150341,3,United States,Title of reference,2014//,Fun Factory,"['Glattauer, Daniel']",e0815,JOUR,1,http://example_url.com,9 +1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. 
Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 +1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 +1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 +1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. 
Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 +1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 +1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. 
Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 +1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 +1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 +1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. 
Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 +1,"BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. 
Nullam dictum felis eu pede mollis pretium.",lorem,Lorem,http://example2.com,"['Marxus, Karlus', 'Lindgren, Astrid']",12345,1732-4208,"['Pippi Langstrumpf', 'Nordwind', 'Piraten']",1228150341,3,Germany,The title of the reference,2006//,Dark Factory,"['Glattauer, Daniel']",e0815341,JOUR,0,http://example_url.com,6 diff --git a/.gitignore b/.gitignore index 79f3b131..601298a1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,3 @@ - -tmp/ -examples/*/output/ - # Created by https://www.gitignore.io/api/python # Edit at https://www.gitignore.io/?templates=python @@ -112,4 +108,12 @@ dmypy.json # End of https://www.gitignore.io/api/python +# Makita versioning file asreviewcontrib/makita/_version.py + +# Example data for examples +examples/data/ +examples/prior_example/generated_data/* + +# Python venv +venv/ \ No newline at end of file diff --git a/README.md b/README.md index ad9c1473..6cdfc274 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,53 @@ # ASReview Makita -[![PyPI version](https://badge.fury.io/py/asreview-makita.svg)](https://badge.fury.io/py/asreview-makita) [![Downloads](https://pepy.tech/badge/asreview-makita)](https://pepy.tech/project/asreview-makita) ![PyPI - License](https://img.shields.io/pypi/l/asreview-makita) [![DOI](https://zenodo.org/badge/530642619.svg)](https://zenodo.org/badge/latestdoi/530642619) [![DOI](https://img.shields.io/badge/DOI-10.1016%2Fj.simpa.2024.100663-blue)](https://doi.org/10.1016/j.simpa.2024.100663) - -[ASReviews](https://github.com/asreview/asreview)' Makita (**MAK**e **IT** **A**utomatic) is a workflow generator for simulation studies using the command line interface of [ASReview -LAB](https://asreview.readthedocs.io/en/latest/simulation_cli.html). Makita can be used to effortlessly generate the framework and code for your simulation study. 
- -A simulation involves mimicking the screening process for a systematic review of a human in interaction with an Active learning model (i.e., a combination of a feature extractor, classifier, -balancing method, and a query strategy). The simulation reenacts the screening process as if a researcher were using active learning. The performance of one or multiple model(s) can then be -measured by performance metrics, such as the Work Saved over Sampling, recall at a given point in the screening process, or the average time to discover a relevant record. - -Using Makita templates, different study structures can be generated to fit the needs of your very own study. If your study requires a unique template, you can create a new one and use it -instead. - -With [ASReview LAB](https://github.com/asreview/asreview), you can [simulate](https://asreview.readthedocs.io/en/latest/simulation_overview.html#overview) with the [web -interface](https://asreview.readthedocs.io/en/latest/simulation_overview.html#simulating-with-asreview-lab), the [Python API](https://asreview.readthedocs.io/en/latest/simulation_api_example.html), or -the [Command Line Interface (CLI)](https://asreview.readthedocs.io/en/latest/simulation_cli.html). Makita makes use of the CLI. 
+[![PyPI +version](https://badge.fury.io/py/asreview-makita.svg)](https://badge.fury.io/py/asreview-makita) +[![Downloads](https://static.pepy.tech/badge/asreview-makita)](https://pepy.tech/project/asreview-makita) +![PyPI - License](https://img.shields.io/pypi/l/asreview-makita) +[![DOI](https://zenodo.org/badge/530642619.svg)](https://zenodo.org/badge/latestdoi/530642619) +[![DOI](https://img.shields.io/badge/DOI-10.1016%2Fj.simpa.2024.100663-blue)](https://doi.org/10.1016/j.simpa.2024.100663) + + +[ASReviews](https://github.com/asreview/asreview)' Makita (**MAK**e **IT** +**A**utomatic) is a workflow generator for simulation studies using the command +line interface of [ASReview +LAB](https://asreview.readthedocs.io/en/latest/simulation_cli.html). Makita can +be used to effortlessly generate the framework and code for your simulation +study. + +A simulation involves mimicking the screening process for a systematic review of +a human in interaction with an Active learning model (i.e., a combination of a +feature extractor, classifier, balancing method, and a query strategy). The +simulation reenacts the screening process as if a researcher were using active +learning. The performance of one or multiple model(s) can then be measured by +performance metrics, such as the Work Saved over Sampling, recall at a given +point in the screening process, or the average time to discover a relevant +record. + +Using Makita templates, different study structures can be generated to fit the +needs of your very own study. If your study requires a unique template, you can +create a new one and use it instead. 
+ +With [ASReview LAB](https://github.com/asreview/asreview), you can +[simulate](https://asreview.readthedocs.io/en/latest/simulation_overview.html#overview) +with the [web +interface](https://asreview.readthedocs.io/en/latest/simulation_overview.html#simulating-with-asreview-lab), +the [Python +API](https://asreview.readthedocs.io/en/latest/simulation_api_example.html), or +the [Command Line Interface +(CLI)](https://asreview.readthedocs.io/en/latest/simulation_cli.html). Makita +makes use of the CLI. What Makita does: - Setting up a workflow for running a large-scale simulation study - Preparing a Github repository - Automating the many lines of code needed -- Creating a batch script for running the simulation study with just one line of code +- Creating a batch script for running the simulation study with just one line of + code - Making your research fully reproducible -- Allowing you to add templates to accommodate your own specific research question +- Allowing you to add templates to accommodate your own specific research + question What Makita does not do: @@ -31,7 +55,8 @@ What Makita does not do: - Being a black-box - Writing your paper -For a tutorial on using Makita we refer to the [Exercise on Using the ASReview Simulation Mode](https://asreview.github.io/asreview-academy/simulation.html). +For a tutorial on using Makita we refer to the [Exercise on Using the ASReview +Simulation Mode](https://asreview.github.io/asreview-academy/simulation.html). ## Installation @@ -53,46 +78,79 @@ pip install asreview-makita pip install --upgrade asreview-makita ``` -After installing the extension, ASReview should automatically detect Makita. If installed correctly, `asreview --help` should list Makita as an option. +After installing the extension, ASReview should automatically detect Makita. If +installed correctly, `asreview --help` should list Makita as an option. ## Getting started -You can create the framework and code for your own simulation study in 4 steps. 
+Follow these steps to set up the framework and code for your simulation study: + +1. Prepare Your Datasets + Place all your datasets in a single folder. + +2. Choose a Project Folder + Decide on a path for your project folder. You don't need to create the folder manually; `makita` will handle it for you. -1. Create an project folder on your computer. -2. Create a subfolder named `data` and fill it using one or more datasets. -3. Using your preferred command line tool, `cd` into the project folder. -4. Create a simulation study from a template found in the [list of templates](#templates) via +3. Install ASReview Makita + Use your preferred command-line tool to install Makita: +```console +pip install asreview-makita +``` +4. Generate a Simulation Study + Select a template from the [list of templates](#templates) and create your study using: ```console -asreview makita template NAME_OF_TEMPLATE +asreview makita template NAME_OF_TEMPLATE -d 'path/to/your/datasets' -p 'path/to/your/project-folder' ``` -where `NAME_OF_TEMPLATE` is one of the templates (e.g. `asreview makita template arfi`). +where `NAME_OF_TEMPLATE` is one of the templates (e.g. `asreview makita template +arfi`). -Your simulation study is now properly set up and ready for use. To start the simulations, execute the following shell script in the project folder: +To start the simulations, run the appropriate script in your project folder: ```console +# On Unix-based systems (e.g., macOS, Linux) sh jobs.sh + +# On Windows +jobs.bat ``` -Or on Windows: +The `jobs` script runs all jobs in the project folder. + +--- + +If you run the `asreview makita` command in the directory where you want to generate your +project, you don't need to specify the project folder using the `-p` flag. +Similarly, if your datasets are stored in a folder named `data` within the same +location, you can omit the `-d` flag. + +For example: ```console -jobs.bat +asreview makita template basic ``` -The `jobs` script runs all jobs in the project folder. 
+is equivalent to: + +```console +asreview makita template basic -p 'current/working/directory' -d 'current/working/directory/data' +``` ### Platform support -By default, ASReview Makita renders job files for the platform of rendering. It is possible to render jobs for other platforms as well. Use the argument `--platform` with values "Windows", "Linux", or "Darwin" (MacOS) to change the output. +By default, ASReview Makita renders job files for the platform of rendering. It +is possible to render jobs for other platforms as well. Use the argument +`--platform` with values "Windows", "Linux", or "Darwin" (MacOS) to change the +output. ```console asreview makita template basic --platform Windows ``` -By default, the job file depends on the platform. Windows users will see a `jobs.bat` file, while other users will see `jobs.sh`. You can overwrite this with +By default, the job file depends on the platform. Windows users will see a +`jobs.bat` file, while other users will see `jobs.sh`. You can overwrite this +with ```console asreview makita template basic --job_file my_jobs_file.my_ext @@ -100,8 +158,8 @@ asreview makita template basic --job_file my_jobs_file.my_ext ## Templates -The following table gives an overview of the available templates. -See [Getting started](#getting-started) for instructions on usage. +The following table gives an overview of the available templates. See [Getting +started](#getting-started) for instructions on usage. > Note: If no seed is set with the template command, the default seed is used. > While this is important for the reproducibility of the results, it may lead to @@ -112,18 +170,20 @@ See [Getting started](#getting-started) for instructions on usage. command: `basic` -The basic template prepares a script for conducting a simulation study with one run using the default model settings, and two randomly chosen priors (one relevant and one irrelevant record). 
+The basic template prepares a script for conducting a simulation study with one +run using the default model settings, and two randomly chosen priors (one +relevant and one irrelevant record). optional arguments: ```console - -h, --help show this help message and exit - --job_file JOB_FILE, -f JOB_FILE The name of the file with jobs. Default jobs.bat for Windows, otherwise jobs.sh. - -s DATA_FOLDER Dataset folder - -o OUTPUT_FOLDER Output folder - --init_seed INIT_SEED Seed of the priors. Seed is set to 535 by default. - --model_seed MODEL_SEED Seed of the models. Seed is set to 165 by default. - --template TEMPLATE Overwrite template with template file path. + -h, --help Show this help message and exit + -p, --project_folder PROJECT_FOLDER The folder the project will be rendered to Default: The current working directory + -d, --data_folder DATA_FOLDER The dataset source folder Default: `data` folder in working directory + -j, --job_file JOB_FILE The name of the file with jobs Default: jobs.bat for Windows, otherwise jobs.sh. + --init_seed INIT_SEED Seed of the priors Seed is set to 535 by default. + --model_seed MODEL_SEED Seed of the models Seed is set to 165 by default. + --template TEMPLATE Overwrite template with template file path --platform PLATFORM Platform to run jobs: Windows, Darwin, Linux. Default: the system of rendering templates. --n_runs N_RUNS Number of runs. Default: 1. --skip_wordclouds Disables the generation of wordclouds. @@ -140,15 +200,19 @@ optional arguments: command: `arfi` -The ARFI template (All relevant, fixed irrelevant) prepares a script for running a simulation study in such a way that for every relevant record 1 run will be executed with 10 randomly chosen irrelevant records which are kept constant over runs. When multiple datasets are available the template orders the tasks in the job file per dataset. 
+The ARFI template (All relevant, fixed irrelevant) prepares a script for running +a simulation study in such a way that for every relevant record 1 run will be +executed with 10 randomly chosen irrelevant records which are kept constant over +runs. When multiple datasets are available the template orders the tasks in the +job file per dataset. optional arguments: ```console - -h, --help show this help message and exit - --job_file JOB_FILE, -f JOB_FILE The name of the file with jobs. Default jobs.bat for Windows, otherwise jobs.sh. - -s DATA_FOLDER Dataset folder - -o OUTPUT_FOLDER Output folder + -h, --help Show this help message and exit + -p, --project_folder PROJECT_FOLDER The folder the project will be rendered to Default: The current working directory + -d, --data_folder DATA_FOLDER The dataset source folder Default: `data` folder in working directory + -j, --job_file JOB_FILE The name of the file with jobs Default: jobs.bat for Windows, otherwise jobs.sh. --init_seed INIT_SEED Seed of the priors. Seed is set to 535 by default. --model_seed MODEL_SEED Seed of the models. Seed is set to 165 by default. --template TEMPLATE Overwrite template with template file path. @@ -168,15 +232,17 @@ optional arguments: command: `multimodel` -The multiple model template prepares a script for running a simulation study comparing multiple models for one dataset and a fixed set of priors (one relevant and one irrelevant record; identical across models). +The multiple model template prepares a script for running a simulation study +comparing multiple models for one dataset and a fixed set of priors (one +relevant and one irrelevant record; identical across models). optional arguments: ```console - -h, --help show this help message and exit - --job_file JOB_FILE, -f JOB_FILE The name of the file with jobs. Default jobs.bat for Windows, otherwise jobs.sh. 
- -s DATA_FOLDER Dataset folder - -o OUTPUT_FOLDER Output folder + -h, --help Show this help message and exit + -p, --project_folder PROJECT_FOLDER The folder the project will be rendered to Default: The current working directory + -d, --data_folder DATA_FOLDER The dataset source folder Default: `data` folder in working directory + -j, --job_file JOB_FILE The name of the file with jobs Default: jobs.bat for Windows, otherwise jobs.sh. --init_seed INIT_SEED Seed of the priors. Seed is set to 535 by default. --model_seed MODEL_SEED Seed of the models. Seed is set to 165 by default. --template TEMPLATE Overwrite template with template file path. @@ -197,9 +263,10 @@ optional arguments: If you want to specify certain combinations of classifiers and feature extractors that should and should not be used, you can use the `--classifiers`, -`--feature_extractors`, `--query_strategies`, `--balance_strategies` and `--impossible_models` option. For instance, if you -want to exclude the combinations of `nb` with `doc2vec` and `logistic` with -`tfidf`, use the following command: +`--feature_extractors`, `--query_strategies`, `--balance_strategies` and +`--impossible_models` options. For instance, if you want to exclude the +combinations of `nb` with `doc2vec` and `logistic` with `tfidf`, use the +following command: ```console asreview makita template multimodel --classifiers logistic nb --feature_extractors tfidf doc2vec --query_strategies max max_random max_uncertainty cluster --impossible_models nb,doc2vec logistic,tfidf @@ -209,19 +276,30 @@ asreview makita template multimodel --classifiers logistic nb --feature_extracto command: `prior` -The prior template evaluates how a set of custom prior knowledge might affect simulation performance. It processes two types of data in the data folder: labeled dataset(s) to be simulated and labeled dataset(s) to be used as prior knowledge. 
The filename(s) of the dataset(s) containing the custom prior knowledge should use the naming prefix `prior_[dataset_name]`. +The prior template evaluates how a set of custom prior knowledge might affect +simulation performance. It processes two types of data in the data folder: +labeled dataset(s) to be simulated and labeled dataset(s) to be used as prior +knowledge. The filename(s) of the dataset(s) containing the custom prior +knowledge should use the naming prefix `prior_[dataset_name]`. -The template runs two simulations: the first simulation uses all records from the `prior_` dataset(s) as prior knowledge, and the second uses a 1+1 randomly chosen set of prior knowledge from the non-prior knowledge dataset as a minimal training set. Both runs simulate performance on the combined non-prior dataset(s). +The template runs two simulations: the first simulation uses all records from +the `prior_` dataset(s) as prior knowledge, and the second uses a 1+1 randomly +chosen set of prior knowledge from the non-prior knowledge dataset as a minimal +training set. Both runs simulate performance on the combined non-prior +dataset(s). -Running this template creates a `generated_data` folder. This folder contains two datasets; `dataset_with_priors.csv` and `dataset_without_priors.csv`. The simulations specified in the generated jobs file will use these datasets for their simulations. +Running this template creates a `generated_data` folder. This folder contains +two datasets; `dataset_with_priors.csv` and `dataset_without_priors.csv`. The +simulations specified in the generated jobs file will use these datasets for +their simulations. optional arguments: ```console - -h, --help show this help message and exit - --job_file JOB_FILE, -f JOB_FILE The name of the file with jobs. Default jobs.bat for Windows, otherwise jobs.sh. 
- -s DATA_FOLDER Dataset folder - -o OUTPUT_FOLDER Output folder + -h, --help Show this help message and exit + -p, --project_folder PROJECT_FOLDER The folder the project will be rendered to Default: The current working directory + -d, --data_folder DATA_FOLDER The dataset source folder Default: `data` folder in working directory + -j, --job_file JOB_FILE The name of the file with jobs Default: jobs.bat for Windows, otherwise jobs.sh. --init_seed INIT_SEED Seed of the priors. Seed is set to 535 by default. --model_seed MODEL_SEED Seed of the models. Seed is set to 165 by default. --template TEMPLATE Overwrite template with template file path. @@ -239,7 +317,8 @@ optional arguments: #### Example usage -Put at least 2 datasets in the data folder. One starting with the `prior_` prefix, and one without this prefix. +Put at least 2 datasets in the data folder. One starting with the `prior_` +prefix, and one without this prefix. > note: `priors_` will also work. @@ -251,28 +330,41 @@ asreview makita template prior --classifier logistic --feature_extractor tfidf ### Create and use custom templates -It is possible to overwrite the internal templates. This can be useful for simulation studies with different needs. +It is possible to overwrite the internal templates. This can be useful for +simulation studies with different needs. -1. Select an existing template that looks similar to your needs. For example, you want to run ARFI with a different model, then you pick the [ARFI template](#arfi-template). -2. Download the template you selected in step 1 from the [Github repository](https://github.com/asreview/asreview-makita/tree/main/asreviewcontrib/makita/templates). Template files have the following - structure `template_*.txt.template`. For the ARFI example, the template is +1. Select an existing template that looks similar to your needs. For example, + you want to run ARFI with a different model, then you pick the [ARFI + template](#arfi-template). +2. 
Download the template you selected in step 1 from the [Github + repository](https://github.com/asreview/asreview-makita/tree/main/asreviewcontrib/makita/templates). + Template files have the following structure `template_*.txt.template`. For + the ARFI example, the template is [template_arfi.txt.template](https://github.com/asreview/asreview-makita/blob/main/asreviewcontrib/makita/templates/template_arfi.txt.template). -3. Save the downloaded template somewhere on your computer. The template is a so-called "Jinja" template. The template consists of [ASReview command line - commands](https://asreview.readthedocs.io/en/latest/API/cli.html) combined with jinja syntax. The Jinja syntax is very intuitive. See this +3. Save the downloaded template somewhere on your computer. The template is a + so-called "Jinja" template. The template consists of [ASReview command line + commands](https://asreview.readthedocs.io/en/latest/API/cli.html) combined + with jinja syntax. The Jinja syntax is very intuitive. See this [Cheatsheet](https://cheatography.com/skalavala/cheat-sheets/jinja/). 4. Edit the Jinja template to your needs. -5. Run the custom template with the command line option `--template PATH_TO_MY_TEMPLATE.txt.template`. For the ARFI example, this would be `asreview makita template arfi --template -PATH_TO_MY_TEMPLATE.txt.template`. Please keep in mind that you follow the usual steps for running a template. +5. Run the custom template with the command line option `--template +PATH_TO_MY_TEMPLATE.txt.template`. For the ARFI example, this would be `asreview +makita template arfi --template PATH_TO_MY_TEMPLATE.txt.template`. Please keep +in mind that you follow the usual steps for running a template. 6. A `jobs.sh` file should be in the your folder. -Please contribute your templates back to the project by making a Pull Request. Then, we can integrate it in the core of the makita package. +Please contribute your templates back to the project by making a Pull Request. 
+Then, we can integrate it in the core of the makita package. ### Add and use scripts -Makita can add scripts to your repository. The scripts are mainly pre- and postprocessing scripts. These scripts are not (yet) available in any existing ASReview software. Therefore, they can be added manually -with `asreview makita add-script NAME_OF_SCRIPT`. +Makita can add scripts to your repository. The scripts are mainly pre- and +postprocessing scripts. These scripts are not (yet) available in any existing +ASReview software. Therefore, they can be added manually with `asreview makita +add-script NAME_OF_SCRIPT`. -For example, the results from _ASReview datatools_ are merged via the script `merge_descriptives.py` (or `merge_metrics.py` for _ASReview insights_), using: +For example, the results from _ASReview datatools_ are merged via the script +`merge_descriptives.py` (or `merge_metrics.py` for _ASReview insights_), using: 1. Collect statistics (with template) 2. Run `asreview makita add-script merge_descriptives.py` @@ -284,11 +376,14 @@ Some scripts are added automatically to the folder, as they are part of the template. For example, the `get_plot.py` script is added to the generated folder when using any template, as it is used to generate the plots. -Still, `get_plot.py` can be used on its own, as it is a standalone script. To use it, -use `-s` (source) and `-o` (output) to tweak paths. +Still, `get_plot.py` can be used on its own, as it is a standalone script. To +use it, use `-s` (source) and `-o` (output) to tweak paths. 
Adding a legend to the plot can be done with the `-l` or `--show_legend` flag, -with the labels clustered on any of the following: `'filename', 'model', 'query_strategy', 'balance_strategy', 'feature_extraction', 'n_instances', 'stop_if', 'n_prior_included', 'n_prior_excluded', 'model_param', 'query_param', 'feature_param', 'balance_param'` +with the labels clustered on any of the following: `'filename', 'model', +'query_strategy', 'balance_strategy', 'feature_extraction', 'n_instances', +'stop_if', 'n_prior_included', 'n_prior_excluded', 'model_param', 'query_param', +'feature_param', 'balance_param'` #### Available scripts @@ -303,7 +398,14 @@ The following scripts are available: #### Time to Discovery Tables -The 'merge_tds.py' script creates a table of the time to discovery (TD) values for each dataset, with each row corresponding to each record ID of the relevant records in a dataset, and the columns correspond to each simulation run (e.g, for the Multimodel template each column corresponds to a simualtion run with each active learning model). Additionally, the tables includes the average-record-TD values (the average of the TD values for a record across multiple simulation runs), and the average-simulation-TD values (the average of the TD values across all records for a single simulation run). +The 'merge_tds.py' script creates a table of the time to discovery (TD) values +for each dataset, with each row corresponding to each record ID of the relevant +records in a dataset, and the columns correspond to each simulation run (e.g., +for the Multimodel template each column corresponds to a simulation run with +each active learning model). Additionally, the tables include the +average-record-TD values (the average of the TD values for a record across +multiple simulation runs), and the average-simulation-TD values (the average of +the TD values across all records for a single simulation run). 
#### Run Makita via Docker @@ -319,5 +421,7 @@ This extension is published under the [MIT license](/LICENSE). ## Contact -This extension is part of the ASReview project ([asreview.ai](https://asreview.ai)). It is maintained by the maintainers of ASReview LAB. See [ASReview LAB](https://github.com/asreview/asreview) for +This extension is part of the ASReview project +([asreview.ai](https://asreview.ai)). It is maintained by the maintainers of +ASReview LAB. See [ASReview LAB](https://github.com/asreview/asreview) for contact information and more resources. diff --git a/asreviewcontrib/makita/entrypoint.py b/asreviewcontrib/makita/entrypoint.py index 86880e3f..8acdb93c 100644 --- a/asreviewcontrib/makita/entrypoint.py +++ b/asreviewcontrib/makita/entrypoint.py @@ -1,6 +1,9 @@ import argparse import os +import shutil +from dataclasses import dataclass from pathlib import Path +from typing import Optional from asreview import config as ASREVIEW_CONFIG from asreview.entry_points import BaseEntryPoint @@ -12,7 +15,6 @@ class MakitaEntryPoint(BaseEntryPoint): - # backward compat? description = "Makita functionality for ASReview datasets." extension_name = "asreview-makita" @@ -33,16 +35,24 @@ def execute(self, argv): # noqa: C901 parser_template.add_argument("name", type=str, help="The name of the template.") parser_template.add_argument( "--job_file", - "-f", + "-j", type=str, help="The name of the file with jobs. Default " "jobs.bat for Windows, otherwise jobs.sh.", ) parser_template.add_argument( - "-s", type=str, default="data", help="Dataset folder" + "--data_folder", + "-d", + type=str, + default="data", + help="Dataset source folder. " + "Default will use the `data` folder in the current directory as source.", ) parser_template.add_argument( - "-o", type=str, default="output", help="Output folder" + "--project_folder", + "-p", + type=str, + help="Set project folder path." 
"Default will use current directory.", ) parser_template.add_argument( "--init_seed", @@ -153,6 +163,7 @@ def execute(self, argv): # noqa: C901 "--all", "-a", action="store_true", help="Add all scripts." ) parser_script.add_argument( + "--output", "-o", type=str, default="scripts", @@ -166,61 +177,133 @@ def execute(self, argv): # noqa: C901 def _template_cli(self, args): try: - self._template(args) + template_renderer = TemplateRenderer(args) + template_renderer.render_template() except Exception as err: print(f"\u001b[31mERROR: {err}\u001b[0m") - def _template(self, args): - """Generate a template.""" + def _add_script_cli(self, args): + try: + self._add_script(args) + except Exception as err: + print(f"\u001b[31mERROR: {err}\u001b[0m") - # lowercase name - args.name = args.name.lower() + def _add_script(self, args): + # initialize file handler + self.file_handler = FileHandler() - # check if args.name is in _entry_points - if args.name not in _entry_points(group="asreview.makita.templates").names: - raise ValueError(f"Template {args.name} not found.") + tmp_scripts = [] + if args.all: + tmp_scripts = [ + p.name[7:-9] for p in Path(TEMPLATES_FP).glob("script_*.template") + ] + else: + tmp_scripts = [args.name] - # if a custom template is provided, check if it exists - if args.template: - fp_template = Path(args.template) - if not fp_template.is_file(): - raise ValueError(f"Custom template {args.template} not found") - print( - f"\033[33mRendering custom template {args.template} using {args.name}.\u001b[0m\n" # noqa: E501 + for script in tmp_scripts: + params = {} + new_script = self.file_handler.render_file_from_template( + script, "script", **params ) - else: - fp_template = None - print(f"\033[33mRendering template {args.name}.\u001b[0m\n") - # load datasets + # export script + export_fp = Path(args.output, script) + self.file_handler.add_file(new_script, export_fp) + self.file_handler.print_summary() + + +class TemplateRenderer: + def __init__(self, args): + 
self.args = args + self.paths = self._setup_project_folder() + self.datasets = self._load_datasets() + + def render_template(self): + """Main function to render the template.""" + template_class = self._get_template_class(self.args.name.lower()) + fp_custom_template = self._get_custom_template(self.args.template) + file_handler = FileHandler(self.args.overwrite) + + job = template_class( + datasets=self.datasets, + fp_template=fp_custom_template, + file_handler=file_handler, + paths=self.paths, + **self._get_template_args(), + ).render() + + job = self._convert_job_for_platform(job) + + file_handler.add_file(content=job, export_fp=self.paths.job_file_path) + file_handler.print_summary() + + def _convert_job_for_platform(self, job): + if self.paths.job_file_path.suffix == ".bat": + job = f"@ echo off\nCOLOR E0{job}".replace("#", "::").replace("/", "\\") + return job + + def _get_template_class(self, template_name): + """Validate and load the template.""" + entry_points = _entry_points(group="asreview.makita.templates") + if template_name not in entry_points.names: + raise ValueError(f"Template {template_name} not found.") + return entry_points[template_name].load() + + def _get_custom_template(self, template_path): + """Check for a custom template file.""" + if template_path: + fp_template = Path(template_path) + if not fp_template.is_file(): + raise ValueError(f"Custom template {template_path} not found.") + print(f"Using custom template: {fp_template}") + return fp_template + print("Using default template.") + return None + + def _setup_project_folder(self): + """Set up project folder paths.""" + + paths = ProjectPaths( + project_folder=Path(self.args.project_folder or Path.cwd()), + output_folder="output", + data_folder="data", + scripts_folder="scripts", + job_file=self.args.job_file, + platform=self.args.platform, + ) + + paths.output_folder_path.mkdir(parents=True, exist_ok=True) + paths.data_folder_path.mkdir(parents=True, exist_ok=True) + 
paths.scripts_folder_path.mkdir(parents=True, exist_ok=True) + + return paths + + def _load_datasets(self): + """Load and validate datasets, returning files from the new location.""" + source_path = Path(self.args.data_folder) + datasets = ( - list(Path(args.s).glob("*.csv")) - + list(Path(args.s).glob("*.ris")) - + list(Path(args.s).glob("*.xlsx")) + list(source_path.glob("*.csv")) + + list(source_path.glob("*.ris")) + + list(source_path.glob("*.xlsx")) ) - # throw exception if no datasets are found - if len(datasets) == 0: + if not datasets: raise ValueError("No datasets found in the selected data folder.") - # create output folder - Path(args.o).parent.mkdir(parents=True, exist_ok=True) - - # get job file - if args.job_file is None: - if args.platform == "Windows" or ( - args.platform is None and os.name == "nt" - ): # noqa: E501 - args.job_file = "jobs.bat" - else: - args.job_file = "jobs.sh" + copied_files = [] + for dataset in datasets: + target_path = self.paths.data_folder_path / dataset.name + if source_path != self.paths.data_folder_path: + shutil.copyfile(dataset, target_path) + copied_files.append(target_path) - # load template - template = _entry_points(group="asreview.makita.templates")[args.name].load() + return copied_files - keys_of_interest = [ + def _get_template_args(self): + """Extract relevant arguments for the template.""" + args_to_pass = [ "skip_wordclouds", - "overwrite", "n_runs", "n_priors", "init_seed", @@ -236,53 +319,44 @@ def _template(self, args): "impossible_models", "instances_per_query", "stop_if", - "job_file", ] + return { + key: vars(self.args).get(key) + for key in args_to_pass + if key in vars(self.args) + } - job = template( - datasets=datasets, - fp_template=fp_template, - output_folder=Path(args.o), - scripts_folder=Path("scripts"), - **{key: vars(args)[key] for key in keys_of_interest if key in vars(args)}, - ).render() - - # convert shell to batch if needed - if args.job_file.endswith(".bat"): - job = f"@ echo 
off\nCOLOR E0{job}" - job = job.replace("#", "::") - job = job.replace("/", "\\") - # store result in output folder - with open(args.job_file, "w") as f: - f.write(job) - print(f"Rendered template {args.name} and saved to {args.job_file}") +@dataclass +class ProjectPaths: + project_folder: Path = Path.cwd() + output_folder: str = "output" + data_folder: str = "data" + scripts_folder: str = "scripts" + job_file: Optional[str] = None + platform: Optional[str] = None - def _add_script_cli(self, args): - try: - self._add_script(args) - except Exception as err: - print(f"\u001b[31mERROR: {err}\u001b[0m") + def __post_init__(self): + if self.job_file is None: + if (self.platform and self.platform.lower() == "windows") or ( + self.platform is None and os.name == "nt" + ): + self.job_file = "jobs.bat" + else: + self.job_file = "jobs.sh" - def _add_script(self, args): - # initialize file handler - self.file_handler = FileHandler() + @property + def output_folder_path(self): + return self.project_folder / self.output_folder - tmp_scripts = [] - if args.all: - tmp_scripts = [ - p.name[7:-9] for p in Path(TEMPLATES_FP).glob("script_*.template") - ] - else: - tmp_scripts = [args.name] + @property + def data_folder_path(self): + return self.project_folder / self.data_folder - for script in tmp_scripts: - params = {} - new_script = self.file_handler.render_file_from_template( - script, "script", **params - ) + @property + def scripts_folder_path(self): + return self.project_folder / self.scripts_folder - # export script - export_fp = Path(args.o, script) - self.file_handler.add_file(new_script, export_fp) - self.file_handler.print_summary() + @property + def job_file_path(self): + return self.project_folder / self.job_file diff --git a/asreviewcontrib/makita/template_arfi.py b/asreviewcontrib/makita/template_arfi.py index 800a453d..eb9a0546 100644 --- a/asreviewcontrib/makita/template_arfi.py +++ b/asreviewcontrib/makita/template_arfi.py @@ -34,7 +34,7 @@ def 
get_dataset_specific_params(self, index, fp_dataset): fp_dataset, init_seed=self.init_seed + index, n_priors=n_priors ) return { - "input_file": fp_dataset.as_posix(), + "input_file": f"{fp_dataset.parent.name}/{fp_dataset.name}", "input_file_stem": fp_dataset.stem, "priors": priors, "model_seed": self.model_seed + index, @@ -60,8 +60,8 @@ def get_template_specific_params(self, params): "instances_per_query": self.instances_per_query, "stop_if": self.stop_if, "init_seed": self.init_seed, - "output_folder": self.output_folder, - "scripts_folder": self.scripts_folder, + "output_folder": self.paths.output_folder, + "scripts_folder": self.paths.scripts_folder, "version": self.__version__, } diff --git a/asreviewcontrib/makita/template_base.py b/asreviewcontrib/makita/template_base.py index c0fa83d9..8014d9db 100644 --- a/asreviewcontrib/makita/template_base.py +++ b/asreviewcontrib/makita/template_base.py @@ -6,6 +6,7 @@ from asreviewcontrib.makita import __version__ from asreviewcontrib.makita.config import TEMPLATES_FP +from asreviewcontrib.makita.entrypoint import ProjectPaths from asreviewcontrib.makita.utils import FileHandler @@ -16,29 +17,25 @@ def __init__( self, datasets, fp_template, - output_folder, - scripts_folder, + file_handler: FileHandler, + paths: ProjectPaths, skip_wordclouds, - overwrite, init_seed, model_seed, balance_strategy, instances_per_query, stop_if, - job_file, **kwargs, ): self.datasets = datasets - self.output_folder = output_folder - self.scripts_folder = scripts_folder + self.paths = paths self.skip_wordclouds = skip_wordclouds self.init_seed = init_seed self.model_seed = model_seed self.balance_strategy = balance_strategy self.instances_per_query = instances_per_query self.stop_if = stop_if - self.job_file = job_file - self.file_handler = FileHandler(overwrite) + self.file_handler = file_handler self.__version__ = __version__ self.template = ConfigTemplate( @@ -75,27 +72,31 @@ def render_scripts(self, scripts: list): for s in scripts: 
t_script = self.file_handler.render_file_from_template( - s, "script", output_folder=self.output_folder + s, "script", output_folder=self.paths.output_folder + ) + self.file_handler.add_file( + t_script, Path(self.paths.scripts_folder_path, s) ) - export_fp = Path(self.scripts_folder, s) - self.file_handler.add_file(t_script, export_fp) - def render_docs(self, docs: list): + def render_docs(self, documents: list): """Render docs.""" - for s in docs: + for document in documents: t_docs = self.file_handler.render_file_from_template( - s, + document, "doc", - datasets=self.datasets, + datasets=[ + Path(dataset.parent.name, dataset.name) for dataset in self.datasets + ], template_name=self.template.name, template_name_long=self.template.name_long, template_scripts=self.template.scripts, skip_wordclouds=self.skip_wordclouds, - output_folder=self.output_folder, - job_file=self.job_file, + paths=self.paths, + ) + self.file_handler.add_file( + t_docs, Path(self.paths.project_folder, document) ) - self.file_handler.add_file(t_docs, s) def render(self): """Render template.""" @@ -114,7 +115,7 @@ def render(self): if " " in Path(fp_dataset).stem: raise ValueError( f"Dataset filename '{fp_dataset}' cannot contain whitespace." 
- ) # noqa + ) fp_dataset = Path(fp_dataset) params.append(self.get_dataset_specific_params(i, fp_dataset)) @@ -135,5 +136,4 @@ def render(self): else: raise e - self.file_handler.print_summary() return rendered_output diff --git a/asreviewcontrib/makita/template_basic.py b/asreviewcontrib/makita/template_basic.py index e1edb76d..9029ab84 100644 --- a/asreviewcontrib/makita/template_basic.py +++ b/asreviewcontrib/makita/template_basic.py @@ -27,7 +27,7 @@ def get_dataset_specific_params(self, index, fp_dataset): template once for each dataset.""" return { - "input_file": fp_dataset.as_posix(), + "input_file": f"{fp_dataset.parent.name}/{fp_dataset.name}", "input_file_stem": fp_dataset.stem, "model_seed": self.model_seed + index, "init_seed": self.init_seed, @@ -38,10 +38,26 @@ def get_template_specific_params(self, params): template only once.""" # set default values if not provided - classifier = self.classifier if self.classifier is not None else ASREVIEW_CONFIG.DEFAULT_MODEL # noqa: E501 - feature_extractor = self.feature_extractor if self.feature_extractor is not None else ASREVIEW_CONFIG.DEFAULT_FEATURE_EXTRACTION # noqa: E501 - query_strategy = self.query_strategy if self.query_strategy is not None else ASREVIEW_CONFIG.DEFAULT_QUERY_STRATEGY # noqa: E501 - balance_strategy = self.balance_strategy if self.balance_strategy is not None else ASREVIEW_CONFIG.DEFAULT_BALANCE_STRATEGY # noqa: E501 + classifier = ( + self.classifier + if self.classifier is not None + else ASREVIEW_CONFIG.DEFAULT_MODEL + ) + feature_extractor = ( + self.feature_extractor + if self.feature_extractor is not None + else ASREVIEW_CONFIG.DEFAULT_FEATURE_EXTRACTION + ) + query_strategy = ( + self.query_strategy + if self.query_strategy is not None + else ASREVIEW_CONFIG.DEFAULT_QUERY_STRATEGY + ) + balance_strategy = ( + self.balance_strategy + if self.balance_strategy is not None + else ASREVIEW_CONFIG.DEFAULT_BALANCE_STRATEGY + ) n_runs = self.n_runs if self.n_runs is not None else 1 
return { @@ -54,7 +70,7 @@ def get_template_specific_params(self, params): "skip_wordclouds": self.skip_wordclouds, "instances_per_query": self.instances_per_query, "stop_if": self.stop_if, - "output_folder": self.output_folder, - "scripts_folder": self.scripts_folder, + "output_folder": self.paths.output_folder, + "scripts_folder": self.paths.scripts_folder, "version": self.__version__, } diff --git a/asreviewcontrib/makita/template_multimodel.py b/asreviewcontrib/makita/template_multimodel.py index 60ce53a4..070b6e97 100644 --- a/asreviewcontrib/makita/template_multimodel.py +++ b/asreviewcontrib/makita/template_multimodel.py @@ -32,7 +32,7 @@ def get_dataset_specific_params(self, index, fp_dataset): template once for each dataset.""" return { - "input_file": fp_dataset.as_posix(), + "input_file": f"{fp_dataset.parent.name}/{fp_dataset.name}", "input_file_stem": fp_dataset.stem, "model_seed": self.model_seed + index, "init_seed": self.init_seed, @@ -54,9 +54,9 @@ def get_template_specific_params(self, params): "skip_wordclouds": self.skip_wordclouds, "instances_per_query": self.instances_per_query, "stop_if": self.stop_if, - "output_folder": self.output_folder, + "output_folder": self.paths.output_folder, + "scripts_folder": self.paths.scripts_folder, "n_runs": n_runs, - "scripts_folder": self.scripts_folder, "version": self.__version__, "all_classifiers": all_classifiers, "all_feature_extractors": all_feature_extractors, diff --git a/asreviewcontrib/makita/template_prior.py b/asreviewcontrib/makita/template_prior.py index 783e348b..79a282fa 100644 --- a/asreviewcontrib/makita/template_prior.py +++ b/asreviewcontrib/makita/template_prior.py @@ -137,7 +137,7 @@ def get_template_specific_params(self, params): ) # Create a directory for generated data if it doesn't already exist - generated_folder = Path("generated_data") + generated_folder = Path(self.paths.project_folder, "generated_data") generated_folder.mkdir(parents=True, exist_ok=True) # Set file paths for 
datasets with custom records for prior knowledge @@ -195,14 +195,16 @@ def get_template_specific_params(self, params): "skip_wordclouds": self.skip_wordclouds, "instances_per_query": self.instances_per_query, "stop_if": self.stop_if, - "output_folder": self.output_folder, - "scripts_folder": self.scripts_folder, + "output_folder": self.paths.output_folder, + "scripts_folder": self.paths.scripts_folder, "version": self.__version__, "model_seed": self.model_seed, "init_seed": self.init_seed, - "filepath_with_priors": filepath_with_priors, + "filepath_with_priors": + f"{filepath_with_priors.parent.name}/{filepath_with_priors.name}", "filepath_with_priors_stem": filepath_with_priors.stem, - "filepath_without_priors": filepath_without_priors, + "filepath_without_priors": + f"{filepath_without_priors.parent.name}/{filepath_without_priors.name}", "filepath_without_priors_stem": filepath_without_priors.stem, "prior_idx": prior_idx, } diff --git a/asreviewcontrib/makita/templates/doc_README.md.template b/asreviewcontrib/makita/templates/doc_README.md.template index cd747243..643c5b1b 100644 --- a/asreviewcontrib/makita/templates/doc_README.md.template +++ b/asreviewcontrib/makita/templates/doc_README.md.template @@ -28,11 +28,11 @@ The performance on the following datasets is evaluated: {% endfor %} ## Run simulation -{% if job_file == 'jobs.sh' %}To start the simulation, run the following command in the project directory. +To start the simulation, run the following command in the project directory. 
-```sh -sh jobs.sh -```{% else %}To start the simulation, run the `{{ job_file }}` file.{% endif %} +```{% if paths.job_file.endswith('.sh') %}sh {% endif %} +{% if paths.job_file.endswith('.sh') %}sh {% endif %}{{ paths.job_file }} +``` {% if template_name != 'custom' %}## Structure @@ -40,16 +40,16 @@ The following files are found in this project: 📦Makita ├── 📜README.md - ├── 📜{{ job_file }} - ├── 📂data{% for dataset in datasets %} + ├── 📜{{ paths.job_file }} + ├── 📂{{ paths.data_folder }}{% for dataset in datasets %} │ ├── 📜{{ dataset.name }}{% endfor %}{% if template_name == 'prior' %} ├── 📂generated_data │ ├── 📜dataset_with_priors.csv │ ├── 📜dataset_without_priors.csv{%endif %} - ├── 📂scripts{% for script in template_scripts %} + ├── 📂{{ paths.scripts_folder }}{% for script in template_scripts %} │ ├── 📜{{ script }}{% endfor %} │ └── 📜... - └── 📂{{ output_folder }} + └── 📂{{ paths.output_folder }} ├── 📂simulation{% if template_name == 'prior' %} | ├── 📂descriptives | | ├── 📜data_stats_dataset_with_priors.json diff --git a/examples/README.md b/examples/README.md index f7d36b27..d5082db6 100644 --- a/examples/README.md +++ b/examples/README.md @@ -4,24 +4,12 @@ Examples are generated with the following code (from the root of the repo): ``` mkdir examples -mkdir examples/arfi_example -synergy_dataset get -d van_de_Schoot_2018 Smid_2020 -o examples/arfi_example/data -l -cd examples/arfi_example -asreview makita template arfi --overwrite --platform linux -cd ../.. -mkdir examples/basic_example -synergy_dataset get -d van_de_Schoot_2018 Smid_2020 -o examples/basic_example/data -l -cd examples/basic_example -asreview makita template basic --overwrite --platform linux -cd ../.. 
-mkdir examples/multimodel_example -synergy_dataset get -d van_de_Schoot_2018 Smid_2020 -o examples/multimodel_example/data -l -cd examples/multimodel_example -asreview makita template multimodel --overwrite --platform linux -cd ../.. -synergy_dataset get -d van_de_Schoot_2018 Smid_2020 -o examples/prior_example/data -l -ren ./examples/prior_example/data/Smid_2020.csv prior_Smid_2020.csv -cd examples/prior_example -asreview makita template prior --overwrite --platform linux -cd ../.. + +synergy_dataset get -d van_de_Schoot_2018 Smid_2020 -o examples/data -l + +asreview makita template arfi -d ./examples/data -p ./examples/arfi_example --overwrite --platform linux +asreview makita template basic -d ./examples/data -p ./examples/basic_example --overwrite --platform linux +asreview makita template multimodel -d ./examples/data -p ./examples/multimodel_example --overwrite --platform linux +ren ./examples/data/Smid_2020.csv prior_Smid_2020.csv +asreview makita template prior -d ./examples/data -p ./examples/prior_example --overwrite --platform linux ```