diff --git a/tutorials/nel_emerson/configs/nel.cfg b/tutorials/nel_emerson/configs/nel_entityruler.cfg similarity index 93% rename from tutorials/nel_emerson/configs/nel.cfg rename to tutorials/nel_emerson/configs/nel_entityruler.cfg index 908f4f0f9..eaa24d7d0 100644 --- a/tutorials/nel_emerson/configs/nel.cfg +++ b/tutorials/nel_emerson/configs/nel_entityruler.cfg @@ -13,7 +13,7 @@ gpu_allocator = null [nlp] lang = "en" -pipeline = ["sentencizer","entity_ruler","ner","entity_linker"] +pipeline = ["sentencizer","entity_ruler","entity_linker"] disabled = [] before_creation = null after_creation = null @@ -30,10 +30,6 @@ punct_chars = null source = "${paths.base_nlp}" component = "entity_ruler" -[components.ner] -source = "${paths.base_nlp}" -component = "ner" - [components.entity_linker] factory = "entity_linker" entity_vector_length = 64 @@ -94,7 +90,7 @@ eval_frequency = 200 accumulate_gradient = 2 max_epochs = 0 max_steps = 600 -frozen_components = ["sentencizer","ner"] +frozen_components = [] before_to_disk = null [training.logger] @@ -130,6 +126,12 @@ learn_rate = 0.001 nel_micro_p = 0.0 nel_micro_r = 0.0 nel_micro_f = 1.0 +ents_f = 0.0 +ents_p = 0.0 +ents_r = 0.0 +sents_f = null +sents_p = null +sents_r = null [pretraining] diff --git a/tutorials/nel_emerson/configs/nel_ner.cfg b/tutorials/nel_emerson/configs/nel_ner.cfg new file mode 100644 index 000000000..507a66f8b --- /dev/null +++ b/tutorials/nel_emerson/configs/nel_ner.cfg @@ -0,0 +1,148 @@ +[paths] +train = "" +dev = "" +raw = null +init_tok2vec = null +kb = "" +base_nlp = "" +vectors = "${paths.base_nlp}" + +[system] +seed = 342 +gpu_allocator = null + +[nlp] +lang = "en" +pipeline = ["sentencizer","ner","entity_linker"] +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.sentencizer] +factory = "sentencizer" +punct_chars = null + +[components.ner] +source = "${paths.base_nlp}" +component = "ner" + +[components.entity_linker] +factory = "entity_linker" +entity_vector_length = 64 +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} +incl_context = true +incl_prior = true +labels_discard = [] +use_gold_ents = true + +[components.entity_linker.model] +@architectures = "spacy.EntityLinker.v2" +nO = null + +[components.entity_linker.model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 2 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null + +[initialize.components] + +[initialize.components.entity_linker] + +[initialize.components.entity_linker.kb_loader] +@misc = "spacy.KBFromFile.v1" +kb_path = ${paths.kb} + +[initialize.tokenizer] + + +[corpora] + +[corpora.train] +@readers = "MyCorpus.v1" +file = ${paths.train} + +[corpora.dev] +@readers = "MyCorpus.v1" +file = ${paths.dev} + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.2 +patience = 10000 +eval_frequency = 200 +accumulate_gradient = 2 +max_epochs = 0 +max_steps = 600 +frozen_components = ["ner"] +before_to_disk = null + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] +nel_micro_p = 0.0 +nel_micro_r = 0.0 +nel_micro_f = 1.0 +ents_f = 0.0 +ents_p = 0.0 +ents_r = 0.0 +sents_f = null +sents_p = null +sents_r = null + +[pretraining] + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 +L2 = 0.0 +eps = 0.00000001 +grad_clip = 1.0 +L2_is_weight_decay = true +use_averages = true \ No newline at end of file diff --git a/tutorials/nel_emerson/configs/nel_only.cfg b/tutorials/nel_emerson/configs/nel_only.cfg new file mode 100644 index 000000000..90d49b204 --- /dev/null +++ b/tutorials/nel_emerson/configs/nel_only.cfg @@ -0,0 +1,141 @@ +[paths] +train = "" +dev = "" +raw = null +init_tok2vec = null +kb = "" +base_nlp = "" +vectors = "${paths.base_nlp}" + +[system] +seed = 342 +gpu_allocator = null + +[nlp] +lang = "en" +pipeline = ["sentencizer","entity_linker"] +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.sentencizer] +factory = "sentencizer" +punct_chars = null + +[components.entity_linker] +factory = "entity_linker" +entity_vector_length = 64 +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} +incl_context = true +incl_prior = true +labels_discard = [] +use_gold_ents = true + +[components.entity_linker.model] +@architectures = "spacy.EntityLinker.v2" +nO = null + +[components.entity_linker.model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 2 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null + +[initialize.components] + +[initialize.components.entity_linker] + +[initialize.components.entity_linker.kb_loader] +@misc = "spacy.KBFromFile.v1" +kb_path = ${paths.kb} + +[initialize.tokenizer] + + +[corpora] + +[corpora.train] +@readers = "MyCorpus.v1" +file = ${paths.train} + +[corpora.dev] +@readers = "MyCorpus.v1" +file = ${paths.dev} + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.2 +patience = 10000 +eval_frequency = 200 +accumulate_gradient = 2 +max_epochs = 0 +max_steps = 600 +frozen_components = [] +before_to_disk = null + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] +nel_micro_p = 0.0 +nel_micro_r = 0.0 +nel_micro_f = 1.0 +sents_f = null +sents_p = null +sents_r = null + +[pretraining] + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 +L2 = 0.0 +eps = 0.00000001 +grad_clip = 1.0 +L2_is_weight_decay = true +use_averages = true \ No newline at end of file diff --git a/tutorials/nel_emerson/project.yml b/tutorials/nel_emerson/project.yml index ff097c5e5..d68161e13 100644 --- a/tutorials/nel_emerson/project.yml +++ b/tutorials/nel_emerson/project.yml @@ -3,7 +3,7 @@ description: "**This project was created as part of a [step-by-step video tutori # Variables can be referenced across the project.yml using ${vars.var_name} vars: name: "nel_emerson" - config: "nel.cfg" + config: "nel_entityruler.cfg" vectors_model: "en_core_web_md" annotations: "emerson_annotated_text.jsonl" entities: "entities.csv" @@ -11,7 +11,7 @@ vars: nlp: "my_nlp" train: "train" dev: "dev" - version: "0.0.3" + version: "0.0.4" # These are the directories that the project needs. The project CLI will make # sure that they always exist.