forked from rishabgit/genomic-info-from-papers
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcodon_setup.sh
executable file
·33 lines (26 loc) · 1.27 KB
/
codon_setup.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env bash
#BSUB -J train_var_ext
#BSUB -q short
#BSUB -n 1
#BSUB -M 16000
#BSUB -R "rusage[mem=16GB]"
#BSUB -o /hps/scratch/flicek/wormbase/var_extraction/Output_%J.out
#BSUB -e /hps/scratch/flicek/wormbase/var_extraction/Error_%J.err
# The #BSUB lines above are scheduler directives embedded as comments: job name
# "train_var_ext", queue "short", -n 1, a 16GB memory resource request, and
# separate stdout (-o) / stderr (-e) files.
# NOTE(review): -M 16000 is presumably a memory limit in MB and %J presumably
# expands to the job ID; both depend on the scheduler configuration -- confirm.
#
# Directory that the script changes into before doing any work; every relative
# path used further down (data/, venv/, models/, requirements.txt, the *.py
# scripts) is resolved from here.
VAREXTPATH=/hps/software/users/wormbase/variant-extraction/genomic-info-from-papers
# Same directory as the -o/-e paths in the directives above. No command in this
# script reads this variable.
VAREXTLOGS=/hps/scratch/flicek/wormbase/var_extraction
# Install dependencies and download nltk data
# -------------------------------------------------------------
# Each step below is a prerequisite for the next one and for everything that
# runs after this section, so the script stops at the first failure instead of
# carrying on with a half-built environment.

# Print a message to stderr and abort the script with a non-zero status.
die() {
  printf 'codon_setup: %s\n' "$*" >&2
  exit 1
}

module purge
module load python-3.9.10-gcc-9.3.0-i56je3q \
  || die "could not load module python-3.9.10-gcc-9.3.0-i56je3q"

# Enabled only after the module commands: they run site-provided shell code
# whose exit statuses are outside this script's control. -u is deliberately
# left off because the sourced venv activate script is third-party code too.
set -eo pipefail

cd "${VAREXTPATH}" || die "cannot cd to ${VAREXTPATH}"

# -N only re-downloads when the remote copy is newer than the local one.
# The URL spells '+' as %2B; the unzip step expects the file under its
# decoded name, tagtog_IDP4+_anndoc.zip.
if ! wget -N https://s3.amazonaws.com/net.tagtog.public/resources/corpora/tagtog_IDP4%2B_anndoc.zip; then
  # A failed refresh is tolerable when an earlier run already fetched the
  # archive; without any local copy there is nothing to extract.
  [[ -f 'tagtog_IDP4+_anndoc.zip' ]] \
    || die "corpus download failed and no local tagtog_IDP4+_anndoc.zip exists"
  printf 'codon_setup: warning: download failed, using existing archive\n' >&2
fi
# -o: overwrite existing files without prompting, so a re-run neither waits on
# an interactive question nor keeps stale files after the archive was refreshed.
unzip -qq -o 'tagtog_IDP4+_anndoc.zip' -d data/nala

python -m venv venv
# shellcheck source=/dev/null
source venv/bin/activate
pip install -r requirements.txt

# -p: the directory already exists on every run after the first.
mkdir -p data/nltk
export NLTK_DATA="${VAREXTPATH}/data/nltk"
python nltk-download.py
# Training:
# -------------------------------------------------------------
# Arguments for train_ner.py, one option per line. Paths are relative to the
# current working directory.
ner_args=(
  # Starting checkpoint.
  --model_name_or_path dmis-lab/biobert-base-cased-v1.1
  # Input files and the names of the columns read from them.
  --train_file data/nala/train_dev.json
  --validation_file data/nala/devel.json
  --text_column_name tokens
  --label_column_name tags
  # Sequence length handling.
  --pad_to_max_length
  --max_length 192
  # Optimisation settings.
  --per_device_train_batch_size 8
  --learning_rate 2e-5
  --num_train_epochs 10
  # Output location and random seed.
  --output_dir models/nala
  --seed 1
)
python train_ner.py "${ner_args[@]}"