# Update pretrain.py (#78)
# NOTE(review): GitHub flagged this file as containing bidirectional Unicode
# text; open it in an editor that reveals hidden Unicode characters to verify.
# CI workflow: smoke-tests UER-style preprocessing/pretraining/fine-tuning
# pipelines (BERT, RoBERTa, ALBERT, GPT-2, SpanBERT, classification, MT,
# Pegasus) with tiny step counts, then runs fine-tune + inference checks.
name: Python package

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so the version is always parsed as a string, never a number.
        python-version: ["3.7.13"]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          pip install -r requirements.txt
      - name: Test with pytest
        run: |
          python preprocess.py --corpus_path corpora/CLUECorpusSmall_bert_sampled.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --seq_length 64 --data_processor bert
          python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/bert_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
          mv models/bert_model.bin-10 models/bert_model.bin
          python preprocess.py --corpus_path corpora/CLUECorpusSmall_sampled.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --dynamic_masking --seq_length 64 --data_processor mlm
          python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/roberta_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2 --data_processor mlm --target mlm
          mv models/roberta_model.bin-10 models/roberta_model.bin
          python preprocess.py --corpus_path corpora/CLUECorpusSmall_bert_sampled.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --seq_length 64 --data_processor albert
          python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/albert/base_config.json --output_model_path models/albert_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
          mv models/albert_model.bin-10 models/albert_model.bin
          python preprocess.py --corpus_path corpora/CLUECorpusSmall_sampled.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --seq_length 64 --data_processor lm
          python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/gpt2/config.json --output_model_path models/gpt2_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
          mv models/gpt2_model.bin-10 models/gpt2_model.bin
          python preprocess.py --corpus_path corpora/CLUECorpusSmall_sampled.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --dynamic_masking --span_masking --seq_length 64 --data_processor mlm
          python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/spanbert_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2 --data_processor mlm --target mlm
          mv models/spanbert_model.bin-10 models/spanbert_model.bin
          python preprocess.py --corpus_path corpora/book_review_cls.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --seq_length 64 --data_processor cls
          python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/cls_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2 --labels_num 2 --data_processor cls --target cls
          mv models/cls_model.bin-10 models/cls_model.bin
          python preprocess.py --corpus_path corpora/news-commentary-v13-en-zh_sampled.txt --vocab_path models/google_uncased_en_vocab.txt --tgt_vocab_path models/google_zh_vocab.txt --dataset_path mt_dataset.pt --processes_num 8 --seq_length 64 --tgt_seq_length 64 --data_processor mt
          python pretrain.py --dataset_path mt_dataset.pt --vocab_path models/google_uncased_en_vocab.txt --tgt_vocab_path models/google_zh_vocab.txt --config_path models/transformer/base_config.json --output_model_path models/mt_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
          mv models/mt_model.bin-10 models/mt_model.bin
          python preprocess.py --corpus_path corpora/CLUECorpusSmall_bert_sampled.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --seq_length 128 --tgt_seq_length 128 --dup_factor 1 --sentence_selection_strategy random --data_processor gsg
          python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/pegasus/base_config.json --output_model_path models/pegasus_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
          mv models/pegasus_model.bin-10 models/pegasus_model.bin
          python finetune/run_classifier.py --pretrained_model_path models/bert_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/classifier_model.bin --train_path datasets/test_data/book_review/train.tsv --dev_path datasets/test_data/book_review/dev.tsv --epochs_num 3 --batch_size 2
          python inference/run_classifier_infer.py --load_model_path models/classifier_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --test_path datasets/test_data/book_review/test_nolabel.tsv --prediction_path datasets/test_data/book_review/prediction.tsv --labels_num 2
          python finetune/run_classifier.py --pretrained_model_path models/albert_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/albert/base_config.json --output_model_path models/classifier_model.bin --train_path datasets/test_data/chnsenticorp/train.tsv --dev_path datasets/test_data/chnsenticorp/dev.tsv --learning_rate 4e-5 --epochs_num 3 --batch_size 2
          python finetune/run_classifier_mt.py --pretrained_model_path models/bert_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --dataset_path_list datasets/test_data/book_review/ datasets/test_data/chnsenticorp/ --epochs_num 1 --batch_size 2
          python finetune/run_ner.py --pretrained_model_path models/bert_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/ner_model.bin --train_path datasets/test_data/msra_ner/train.tsv --dev_path datasets/test_data/msra_ner/dev.tsv --label2id_path datasets/msra_ner/label2id.json --epochs_num 2 --batch_size 2
          python inference/run_ner_infer.py --load_model_path models/ner_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --test_path datasets/test_data/msra_ner/test_nolabel.tsv --prediction_path datasets/test_data/msra_ner/prediction.tsv --label2id_path datasets/msra_ner/label2id.json
          python finetune/run_cmrc.py --pretrained_model_path models/bert_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/cmrc_model.bin --train_path datasets/test_data/cmrc/train.json --dev_path datasets/test_data/cmrc/dev.json --epochs_num 2 --batch_size 2 --seq_length 128
          python inference/run_cmrc_infer.py --load_model_path models/cmrc_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --test_path datasets/test_data/cmrc/test.json --prediction_path datasets/test_data/cmrc/prediction.json --seq_length 128