From 4308ecc768a295bf6adceb73b1b28375509888cd Mon Sep 17 00:00:00 2001 From: Juliettejns Date: Thu, 13 Jun 2024 16:28:42 +0200 Subject: [PATCH] =?UTF-8?q?ajout=20fonction=20filtres=20tokens=20=C3=A0=20?= =?UTF-8?q?corriger?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/main/views/tokens.py | 33 +++++++++++++++ app/models/corpus.py | 62 +++++++++++++++++++++------- app/templates/macros/nav_macros.html | 19 +++++++++ 3 files changed, 98 insertions(+), 16 deletions(-) diff --git a/app/main/views/tokens.py b/app/main/views/tokens.py index ff4d448d..d9be6f5f 100644 --- a/app/main/views/tokens.py +++ b/app/main/views/tokens.py @@ -67,6 +67,39 @@ def tokens_correct_unallowed(corpus_id, allowed_type): ) +@main.route('/corpus//tokens/unallowed//correct/filter', methods=['POST']) +@login_required +@requires_corpus_access("corpus_id") +def tokens_correct_unallowed_filter(corpus_id, allowed_type): + corpus = Corpus.query.filter_by(**{"id": corpus_id}).first() + list_filter=[] + if request.method=="POST": + list_filter.append(request.form.get("punct")) + list_filter.append(request.form.get("numeral")) + list_filter.append(request.form.get("ignore")) + list_filter.append(request.form.get("metadata")) + filtered_filter=[] + for el in list_filter: + if el != None: + filtered_filter.append(el) + filter = " ".join(filtered_filter) + print(filter) + + + tokens = corpus \ + .get_unallowed(allowed_type, ignore=filter) \ + .paginate( + page=int_or(request.args.get("page"), 1), + per_page=int_or(request.args.get("limit"), current_app.config["PAGINATION_DEFAULT_TOKENS"]) + ) + return render_template_with_nav_info( + 'main/tokens_correct_unallowed.html', + corpus=corpus, + tokens=tokens, + allowed_type=allowed_type, + changed=corpus.changed(tokens.items)) + + @main.route('/corpus//tokens/changes/similar/') @login_required @requires_corpus_access("corpus_id") diff --git a/app/models/corpus.py b/app/models/corpus.py index 57fd2bdb..1c9a5443 100644 --- a/app/models/corpus.py +++ b/app/models/corpus.py @@ -297,7 +297,7 @@ def get_allowed_values(self, allowed_type="lemma", label=None, order_by="label", ).order_by(order_by) return db.session.query(cls).filter(cls.control_list == self.control_lists_id).order_by(order_by) - def get_unallowed(self, allowed_type="lemma"): + def get_unallowed(self, allowed_type="lemma", ignore=False): """ Search for WordToken that would not comply with Allowed Values (in AllowedLemma, AllowedPOS, AllowedMorph) nor with a corpus custom dictionary @@ -317,22 +317,52 @@ def get_unallowed(self, allowed_type="lemma"): else: raise ValueError("Get Allowed value had %s and it's not from the lemma, POS, morph set" % allowed_type) - allowed = db.session.query(cls).filter( - cls.control_list == self.control_lists_id, - cls.label == prop - ) - custom_dict = db.session.query(CorpusCustomDictionary).filter( - CorpusCustomDictionary.corpus == self.id, - CorpusCustomDictionary.category == allowed_type, - CorpusCustomDictionary.label == prop - ) - return db.session.query(WordToken).filter( - db.and_( - WordToken.corpus == self.id, - not_(allowed.exists()), - not_(custom_dict.exists()) + if ignore: + regex_liste = [] + if "metadata" in ignore: + regex_liste.append(r'^(?!\[[^\]]+:[^\]]*\]$).*') + if "ignore" in ignore: + regex_liste.append(r'^(?!^\[IGNORE\]$)') + if "punct" in ignore: + regex_liste.append(r"(?!^[^\w\s]$).") + if "numeral" in ignore: + regex_liste.append(r'(?!^\d+$).+') + regex = "".join(regex_liste) + allowed = db.session.query(cls).filter( + cls.control_list == self.control_lists_id, + cls.label == prop + ) + custom_dict = db.session.query(CorpusCustomDictionary).filter( + CorpusCustomDictionary.corpus == self.id, + CorpusCustomDictionary.category == allowed_type, + CorpusCustomDictionary.label == prop + ) + return db.session.query(WordToken).filter( + db.and_( + WordToken.corpus == self.id, + not_(allowed.exists()), + not_(custom_dict.exists()), + WordToken.form.op('~')(regex) + ) + ).order_by(WordToken.order_id) + + else: + allowed = db.session.query(cls).filter( + cls.control_list == self.control_lists_id, + cls.label == prop ) - ).order_by(WordToken.order_id) + custom_dict = db.session.query(CorpusCustomDictionary).filter( + CorpusCustomDictionary.corpus == self.id, + CorpusCustomDictionary.category == allowed_type, + CorpusCustomDictionary.label == prop + ) + return db.session.query(WordToken).filter( + db.and_( + WordToken.corpus == self.id, + not_(allowed.exists()), + not_(custom_dict.exists()) + ) + ).order_by(WordToken.order_id) @property def tokens_count(self): diff --git a/app/templates/macros/nav_macros.html b/app/templates/macros/nav_macros.html index 83e2bcbe..ebf4c3ed 100644 --- a/app/templates/macros/nav_macros.html +++ b/app/templates/macros/nav_macros.html @@ -261,6 +261,25 @@ {% endif %} +
+
+ {{ _('Correction: Ignore Invalide Lemmas: ') }} + {% set url = request.path %} + {% if 'lemma' in request.path %} +
+ {% endif %} + {% if 'morph' in request.path %} + + {% endif %} + {% if 'POS' in request.path %} + + {% endif %} + Punctuation + Numeral + Ignore + Metadata + +