Skip to content

Commit

Permalink
Prepared a package for PyPI + fixed evaluate script to work with a ne…
Browse files Browse the repository at this point in the history
…w package + removed junk
  • Loading branch information
dkalpakchi committed Oct 25, 2021
1 parent e13b473 commit 8114f23
Show file tree
Hide file tree
Showing 13 changed files with 177 additions and 127 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,5 @@ problem_squad.txt
random_indexing.py
run_bpe.sh
run_exp.sh
templates.log
templates.log
src/quinductor.egg-info
42 changes: 30 additions & 12 deletions evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from operator import itemgetter
from collections import defaultdict
from itertools import product
from pathlib import Path

import seaborn as sns
import matplotlib.pyplot as plt
Expand All @@ -19,23 +20,24 @@
import numpy as np
import stanza
from stanza.utils.conll import CoNLL
from spacy.lang.sv import Swedish

from tqdm import tqdm

from rules import *
from core import *
from common import *
from guards import load_guards
import loaders
import repro
from quinductor.rules import *
from quinductor.core import *
from quinductor.common import *
from quinductor.guards import load_guards
from quinductor.loaders import *
from quinductor.repro import *

import udon2
from stanza_ext import TokenizeWithPunctProcessor

np.seterr('raise')


logger = get_logger()


SURVEY_TEMPLATES = {
'sv': "Meningen: {0}<br>Frågan: {1}<br>Det föreslagna svaret: {2}",
'en': "Sentence: {0}<br>Question: {1}<br>Suggested answer: {2}",
Expand Down Expand Up @@ -145,7 +147,20 @@ def translate(qw):
'eval_remove_diacritics': args.remove_diacritics
}, overwrite=True)

eval_folder = os.path.join(args.templates_folder, 'eval', '')
if args.templates_folder:
eval_folder = os.path.join(args.templates_folder, 'eval')
else:
args.templates_folder = get_default_model_path(args.lang)
if not os.path.exists(args.templates_folder):
logger.error(
"""No valid model found. Try downloading by running `quinductor.download({})`
or providing your own by using script arguments""".format(args.lang)
)
eval_folder = 'evaluation'

if not args.ranking_folder:
args.ranking_folder = Path(args.templates_folder).parent

if not os.path.exists(eval_folder):
os.makedirs(eval_folder)

Expand All @@ -156,17 +171,20 @@ def translate(qw):
proc = 'tokenize,mwt,pos' if args.lang in ['fi', 'ar'] else 'tokenize,pos'
stanza_dep_pipe = stanza.Pipeline(lang=args.lang, processors=dep_proc)
stanza_pipe = stanza.Pipeline(lang=args.lang, processors=proc)

if not args.pos_ngrams:
args.pos_ngrams = os.path.join(args.ranking_folder, 'pos_ngrams')
log_prob = load_pos_ngrams(args.pos_ngrams)

qw_stat = dill.load(open(os.path.join(args.ranking_folder, 'qwstats.dill'), 'rb'))
a_tmpl = dill.load(open(os.path.join(args.ranking_folder, 'atmpl.dill'), 'rb'))

if args.format == 'tt':
data_loader = loaders.TextinatorLoader
data_loader = TextinatorLoader
elif args.format == 'squad':
data_loader = loaders.SquadLoader
data_loader = SquadLoader
elif args.format == 'tydiqa':
data_loader = loaders.TyDiQaLoader
data_loader = TyDiQaLoader

data_file = os.path.join(eval_folder, 'data.dill')
if os.path.exists(data_file):
Expand Down
16 changes: 8 additions & 8 deletions evaluate.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
python3 evaluate.py -l en -f data/tydiqa-v1.0-dev.jsonl -t templates/en/16132054753040054/ -r templates/en/ -pg pos_ngrams/en/ -ft tydiqa -cf
python3 evaluate.py -l ar -f data/tydiqa-v1.0-dev.jsonl -t templates/ar/1614104416496133/ -r templates/ar/ -pg pos_ngrams/ar/ -ft tydiqa -cf
python3 evaluate.py -l id -f data/tydiqa-v1.0-dev.jsonl -t templates/id/16140609246000547/ -r templates/id/ -pg pos_ngrams/id/ -ft tydiqa -cf
python3 evaluate.py -l ja -f data/tydiqa-v1.0-dev.jsonl -t templates/ja/16140572221308537/ -r templates/ja/ -pg pos_ngrams/ja/ -ft tydiqa -rp -rtl
python3 evaluate.py -l ko -f data/tydiqa-v1.0-dev.jsonl -t templates/ko/16140582210609627/ -r templates/ko/ -pg pos_ngrams/ko/ -ft tydiqa -rtl
python3 evaluate.py -l te -f data/tydiqa-v1.0-dev.jsonl -t templates/te/16140691545631247/ -r templates/te/ -pg pos_ngrams/te/ -ft tydiqa -rtl
python3 evaluate.py -l fi -f data/tydiqa-v1.0-dev.jsonl -t templates/fi/16132078825085254/ -r templates/fi/ -pg pos_ngrams/fi/ -ft tydiqa -cf
python3 evaluate.py -l ru -f data/tydiqa-v1.0-dev.jsonl -t templates/ru/1613204358381249/ -r templates/ru/ -pg pos_ngrams/ru/ -ft tydiqa -cf -rd
python3 evaluate.py -l en -f data/tydiqa-v1.0-dev.jsonl -ft tydiqa -cf
python3 evaluate.py -l ar -f data/tydiqa-v1.0-dev.jsonl -ft tydiqa -cf
python3 evaluate.py -l id -f data/tydiqa-v1.0-dev.jsonl -ft tydiqa -cf
python3 evaluate.py -l ja -f data/tydiqa-v1.0-dev.jsonl -ft tydiqa -rp -rtl
python3 evaluate.py -l ko -f data/tydiqa-v1.0-dev.jsonl -ft tydiqa -rtl
python3 evaluate.py -l te -f data/tydiqa-v1.0-dev.jsonl -ft tydiqa -rtl
python3 evaluate.py -l fi -f data/tydiqa-v1.0-dev.jsonl -ft tydiqa -cf
python3 evaluate.py -l ru -f data/tydiqa-v1.0-dev.jsonl -ft tydiqa -cf -rd
16 changes: 8 additions & 8 deletions induce_templates.sh
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
# English
python3 -m quinductor.make_templates -l en -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -idf templates/en/idf_en.csv -cf
python3 -m quinductor.make_templates -l en -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -cf

# Finnish
python3 -m quinductor.make_templates -l fi -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -idf templates/fi/idf_fi.csv -cf
python3 -m quinductor.make_templates -l fi -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -cf

# Russian
python3 -m quinductor.make_templates -l ru -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -idf templates/ru/idf_ru.csv -cf -rd
python3 -m quinductor.make_templates -l ru -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -cf -rd

# Indonesian
python3 -m quinductor.make_templates -l id -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -idf templates/id/idf_id.csv -cf
python3 -m quinductor.make_templates -l id -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -cf

# Japanese
python3 -m quinductor.make_templates -l ja -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -idf templates/ja/idf_ja.csv -rp -rtl
python3 -m quinductor.make_templates -l ja -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -rp -rtl

# Telugu
python3 -m quinductor.make_templates -l te -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -idf templates/te/idf_te.csv -rp -rtl
python3 -m quinductor.make_templates -l te -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -rp -rtl

# Arabic
python3 -m quinductor.make_templates -l ar -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -idf templates/ar/idf_ar.csv -rp
python3 -m quinductor.make_templates -l ar -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -rp

# Korean
python3 -m quinductor.make_templates -l ko -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -idf templates/ko/idf_ko.csv -rtl
python3 -m quinductor.make_templates -l ko -d data/tydiqa-v1.0-train.jsonl -ft tydiqa -rtl
8 changes: 4 additions & 4 deletions qword_stat.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import stanza
from tqdm import tqdm

import loaders
from quinductor.loaders import *


if __name__ == '__main__':
Expand All @@ -25,11 +25,11 @@
stanza_lang = stanza.Pipeline(lang=args.lang, processors=stanza_processors)

if args.format == 'tt':
data_loader = loaders.TextinatorLoader
data_loader = TextinatorLoader
elif args.format == 'squad':
data_loader = loaders.SquadLoader
data_loader = SquadLoader
elif args.format == 'tydiqa':
data_loader = loaders.TyDiQaLoader
data_loader = TyDiQaLoader

stats = defaultdict(lambda: defaultdict(int))
answer_tmpl = defaultdict(int)
Expand Down
20 changes: 0 additions & 20 deletions src/quinductor.egg-info/PKG-INFO

This file was deleted.

16 changes: 0 additions & 16 deletions src/quinductor.egg-info/SOURCES.txt

This file was deleted.

1 change: 0 additions & 1 deletion src/quinductor.egg-info/dependency_links.txt

This file was deleted.

7 changes: 0 additions & 7 deletions src/quinductor.egg-info/requires.txt

This file was deleted.

1 change: 0 additions & 1 deletion src/quinductor.egg-info/top_level.txt

This file was deleted.

70 changes: 70 additions & 0 deletions src/quinductor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
import unicodedata
import re
from collections import defaultdict
from pathlib import Path
from pprint import pprint
import logging

import numpy as np

Expand All @@ -17,6 +19,74 @@

PUNCT_TABLE = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))

# The current user's home directory, as a plain string.
HOME_DIR = str(Path.home())

# Root directory for locally stored resources. The QUINDUCTOR_RESOURCES_DIR
# environment variable takes precedence; otherwise fall back to
# ~/quinductor_resources.
DEFAULT_TEMPLATES_DIR = os.environ.get(
    'QUINDUCTOR_RESOURCES_DIR',
    os.path.join(HOME_DIR, 'quinductor_resources'),
)

# Base URL of the raw template files in the project's GitHub repository.
QUINDUCTOR_RESOURCES_GITHUB = 'https://raw.githubusercontent.com/dkalpakchi/quinductor/master/templates'

# Default model description per language code:
#   'templates'  - numeric identifier; get_default_model_path() uses its string
#                  form as the name of the model folder
#   'pos_ngrams' - names of the POS n-gram files belonging to the model
MODELS = {
    'ar': {'templates': 1614104416496133, 'pos_ngrams': ['ar_padt_train.txt']},
    'en': {'templates': 16132054753040054, 'pos_ngrams': ['ewt_train_freq', 'ewt_dev_freq']},
    'fi': {'templates': 16132078825085254, 'pos_ngrams': ['fi_tdt_train.txt']},
    'id': {'templates': 16140609246000547, 'pos_ngrams': ['id_gsd_train.txt']},
    'ja': {'templates': 16140572221308537, 'pos_ngrams': ['ja_gsd_train.txt']},
    'ko': {'templates': 16140582210609627, 'pos_ngrams': ['ko_gsd_train.txt']},
    'ru': {'templates': 1613204358381249, 'pos_ngrams': ['ru_syntagrus_train.txt']},
    'te': {'templates': 16140691545631247, 'pos_ngrams': ['te_mtg_train.txt']},
}


def get_logger():
    """Return the shared ``'quinductor'`` logger with a console handler attached.

    The console handler emits records of level INFO and above, formatted as
    ``<timestamp> - <logger name> - <level> - <message>``.

    ``logging.getLogger`` returns the same logger object for the same name, so
    the handler is attached only when the logger has none yet. Without this
    guard, every call would add another handler and each message would be
    printed once per call made so far.

    Returns:
        logging.Logger: the ``'quinductor'`` logger.
    """
    logger = logging.getLogger('quinductor')
    if not logger.handlers:
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        console.setFormatter(formatter)
        logger.addHandler(console)
    return logger


def get_default_model_path(lang):
    """Return the folder of the default model for the language ``lang``.

    The path is ``DEFAULT_TEMPLATES_DIR/<lang>/<templates id>``, where the
    templates id is taken from ``MODELS[lang]['templates']``. The function only
    builds the path; it does not check that the folder exists.

    Args:
        lang: language code used as a key of ``MODELS`` (e.g. ``'en'``).

    Returns:
        str: path to the default model folder for ``lang``.

    Raises:
        SystemExit: with exit status 1, after logging an error, when ``lang``
            has no entry in ``MODELS``.
    """
    if lang in MODELS:
        return os.path.join(DEFAULT_TEMPLATES_DIR, lang, str(MODELS[lang]['templates']))

    logger = logging.getLogger('quinductor')
    # Report the `lang` argument itself: there is no `args` object in this
    # function, so formatting `args.lang` raised NameError instead of logging.
    logger.error(
        "The language {} currently has no available models.\n"
        "Please create your own model and provide it using script arguments".format(lang)
    )
    sys.exit(1)


class TemplateElement:
def __init__(self, root_chain, node, last_span_id=None, is_subtree=False, is_lemma=False):
self.__chain = root_chain
Expand Down
2 changes: 2 additions & 0 deletions src/quinductor/guards.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import re
from collections import defaultdict

import numpy as np

from .rules import *


Expand Down
Loading

0 comments on commit 8114f23

Please sign in to comment.