From 522b5198da5abe4dc542f3f436e90a70b0948688 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Tue, 28 May 2024 19:32:47 +0000 Subject: [PATCH] :art: speed up tutorial execution in CI, add some hyperlinks --- .github/workflows/test_tutorial.yaml | 2 +- scripts/0_Tutorial.ipynb | 39 ++++++++++++++++++---------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/.github/workflows/test_tutorial.yaml b/.github/workflows/test_tutorial.yaml index dd3091b..12e1d12 100644 --- a/.github/workflows/test_tutorial.yaml +++ b/.github/workflows/test_tutorial.yaml @@ -23,5 +23,5 @@ jobs: - name: Test Tutorial run: | cd scripts - papermill 0_Tutorial.ipynb 0_Tutorial_out.ipynb + papermill 0_Tutorial.ipynb 0_Tutorial_out.ipynb -p epochs 4 diff --git a/scripts/0_Tutorial.ipynb b/scripts/0_Tutorial.ipynb index b0cb135..c3a52f9 100644 --- a/scripts/0_Tutorial.ipynb +++ b/scripts/0_Tutorial.ipynb @@ -9,13 +9,13 @@ "\n", "*Authors:* \n", "\n", - "Marc Pielies Avelli (marc.pielies-avelli@cpr.ku.dk, mpielies@broadinstitute.org )\n", + "Marc Pielies Avelli ([marc.pielies-avelli@cpr.ku.dk](mailto:marc.pielies-avelli@cpr.ku.dk), [mpielies@broadinstitute.org](mailto:mpielies@broadinstitute.org) )\n", "\n", - "Arnor Sigurdsson (arnor.sigurdsson@sund.ku.dk, asigurds@broadinstitute.org)\n", + "Arnor Sigurdsson ([arnor.sigurdsson@sund.ku.dk](mailto:arnor.sigurdsson@sund.ku.dk), [asigurds@broadinstitute.org](mailto:asigurds@broadinstitute.org))\n", "\n", - "Henry Webel (henry.webel@sund.ku.dk)\n", + "Henry Webel ([henry.webel@sund.ku.dk](mailto:henry.webel@sund.ku.dk))\n", " \n", - "Simon Rasmussen (srasmuss@sund.ku.dk, srasmuss@broadinstitute.org)\n", + "Simon Rasmussen ([srasmuss@sund.ku.dk](mailto:srasmuss@sund.ku.dk), [srasmuss@broadinstitute.org](mailto:srasmuss@broadinstitute.org))\n", "\n", "## Introduction ##\n", "\n", @@ -47,9 +47,7 @@ "- **Fusion module**: The extracted high-level, abstract features are then combined in a number of dense layers.\n", "- **Output module**: A 
final set of dense layers maps the feature vectors to the targets in a regression task per node, which will represent the EU-seq signal at a distance from the TSS of the gene defining the sample (located in the middle).\n", "\n", - "CLASTER was built using the EIR framework, a python package developed by Arnor Sigurdsson (arnor.sigurdsson@sund.ku.dk) which makes it easy to replicate and adapt it to new tasks. Documentation on EIR can be found at https://eir.readthedocs.io/en/latest/. Have a look at the tutorials to get a feel for the config files required and all possibilities that EIR offers. The framework uses hydra (https://hydra.cc/docs/intro/) to manage a set of configuration files, which allow you to streamline the process.\n", - "\n", - "\n" + "CLASTER was built using the EIR framework, a Python package developed by Arnor Sigurdsson ([arnor.sigurdsson@sund.ku.dk](mailto:arnor.sigurdsson@sund.ku.dk)) which makes it easy to replicate and adapt it to new tasks. Documentation on EIR can be found at [eir.readthedocs.io](https://eir.readthedocs.io/en/latest/). Have a look at the tutorials to get a feel for the config files required and all possibilities that EIR offers. The framework uses [hydra](https://hydra.cc/docs/intro/) to manage a set of configuration files, which allow you to streamline the process." ] }, { @@ -812,6 +810,20 @@ "## 2. 
Training and validating CLASTER" ] }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "tags": [ + "Parameters", + "parameters" + ] + }, + "outputs": [], + "source": [ + "epochs: int = 120" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -845,11 +857,11 @@ "output_path.mkdir(parents=True, exist_ok=True)\n", "\n", "# Write config files explicitly:\n", - "training_microc_rotated_yaml_contents = {\"globals.yaml\":\"\"\"\n", + "training_microc_rotated_yaml_contents = {\"globals.yaml\": f\"\"\"\n", "output_folder: ../runs_tutorial/gene_expression_microc_rotated_pure_conv_tutorial/ \n", - "checkpoint_interval: 860\n", - "sample_interval: 860 \n", - "n_epochs: 120\n", + "checkpoint_interval: {int(860 /120 * epochs)}\n", + "sample_interval: {int(860 /120 * epochs)} \n", + "n_epochs: {epochs}\n", "batch_size: 64\n", "optimizer: \"adamw\"\n", "lr: 0.0001 \n", @@ -1570,7 +1582,8 @@ "results_path = Path(\"../runs_tutorial/gene_expression_microc_rotated_pure_conv_tutorial/results/\")\n", "N_BINS= 200\n", "condition_list = [\"_ctrl\"]\n", - "SPLIT = 4300 #860\n", + "SPLIT = int(860 /120 * epochs)\n", + "# SPLIT = 4300 # for 860 samples, 120 epochs\n", "\n", "ids, predicted, actual = _get_predictions(results_path, N_BINS, condition_list, SPLIT)\n", "\n", @@ -1597,7 +1610,7 @@ "\n", "CLASTER was designed to predict nascent transcription, but the EIR framework can be reused and expanded for any other purpose. It can handle plenty of other data modalities as well! You just need a well defined input containing some information or signal that can be used to infer the values of another, as an output. You can then customize your network using EIR, provide the inputs and targets in one of the supported data modalities, and name samples with the same ID for input and output.\n", "\n", - "If in doubt, feel free to reach us!\n" + "If in doubt, feel free to reach out to us!\n" ] } ],