diff --git a/classifiers/__init__.py b/classifiers/__init__.py index bae5e6a6..3aca4e59 100644 --- a/classifiers/__init__.py +++ b/classifiers/__init__.py @@ -22,6 +22,7 @@ from .reference_quality import ( word_count_classifier, special_character_classifier, + sentence_complete_classifier, ) from .dates_and_times import ( @@ -74,6 +75,7 @@ maximum_sentence_complexity, question_type_classifier, communication_style_classifier, + sentence_complete_classifier, ]: module_name = module.__name__.split(".")[-1] model_name = ( diff --git a/classifiers/reference_quality/sentence_complete_classifier/README.md b/classifiers/reference_quality/sentence_complete_classifier/README.md new file mode 100644 index 00000000..d527e078 --- /dev/null +++ b/classifiers/reference_quality/sentence_complete_classifier/README.md @@ -0,0 +1 @@ +Languages can be very dynamic and complicated. This brick does not try to classify all sentences accurately, which would be quite complex. Instead, it checks whether a text shows some characteristics that a lot of complete sentences have: the sentence starts with an uppercase character, ends with a punctuation mark, and contains at least two nouns and a verb. The name `starts_with_uppercase_ends_with_punctuation_and_contains_two_nouns_and_a_verb` would be a bit long for a brick, though. 
\ No newline at end of file
diff --git a/classifiers/reference_quality/sentence_complete_classifier/__init__.py b/classifiers/reference_quality/sentence_complete_classifier/__init__.py
new file mode 100644
index 00000000..81ed677c
--- /dev/null
+++ b/classifiers/reference_quality/sentence_complete_classifier/__init__.py
@@ -0,0 +1,51 @@
+from pydantic import BaseModel
+from extractors.util.spacy import SpacySingleton
+
+INPUT_EXAMPLE = {
+    "text": "it would be sad if",
+    "spacy_model": "en_core_web_sm"
+}
+
+class SentenceCompleteClassifierModel(BaseModel):
+    text: str
+    spacy_model: str
+
+    class Config:
+        schema_extra = {"example": INPUT_EXAMPLE}
+
+def sentence_complete_classifier(req: SentenceCompleteClassifierModel):
+    """Classify whether the sentences of a text look complete.
+
+    A sentence counts as complete if it starts with an uppercase character,
+    ends with punctuation and contains at least two nouns and one verb.
+
+    Returns a dict with the key "text_completeness": "complete" if every
+    sentence passes, "incomplete" if none passes (or the text contains no
+    sentence at all) and "partly complete" otherwise.
+    """
+    nlp = SpacySingleton.get_nlp(req.spacy_model)
+    doc = nlp(req.text)
+
+    sentence_results = []
+    for sent in doc.sents:
+        # is_title would reject acronyms and mixed-case words such as "NASA"
+        # or "McDonald", so only the first character is checked.
+        starts_uppercase = sent[0].text[:1].isupper()
+        # NOTE(review): only VERB is counted; tokens tagged AUX (e.g. a copula)
+        # are not -- confirm that this is intended.
+        noun_count = sum(token.pos_ in ("NOUN", "PROPN", "PRON") for token in sent)
+        verb_count = sum(token.pos_ == "VERB" for token in sent)
+        sentence_results.append(
+            starts_uppercase
+            and sent[-1].is_punct
+            and noun_count >= 2
+            and verb_count >= 1
+        )
+
+    # Aggregation logic. any() is checked first because all([]) is True, which
+    # would label a text without sentences as "complete".
+    if not any(sentence_results):
+        return {"text_completeness": "incomplete"}
+    if all(sentence_results):
+        return {"text_completeness": "complete"}
+    return {"text_completeness": "partly complete"}
\ No newline at end of file
diff --git a/classifiers/reference_quality/sentence_complete_classifier/code_snippet_common.md b/classifiers/reference_quality/sentence_complete_classifier/code_snippet_common.md
new file mode 100644
index 00000000..acaebfeb
--- /dev/null
+++ 
b/classifiers/reference_quality/sentence_complete_classifier/code_snippet_common.md
@@ -0,0 +1,63 @@
+```python
+import spacy
+
+loaded_models = {}
+def load_spacy(spacy_model):
+    """Load a spaCy model once and reuse it on later calls."""
+    if spacy_model not in loaded_models:
+        loaded_models[spacy_model] = spacy.load(spacy_model)
+    return loaded_models[spacy_model]
+
+def sentence_complete_classifier(text: str, spacy_model: str = "en_core_web_sm") -> str:
+    """
+    A sentence counts as complete if it starts with an uppercase character,
+    ends with punctuation and contains at least two nouns and one verb.
+
+    @param text: The text to classify
+    @param spacy_model: A spaCy language model
+    @returns: "complete" if every sentence passes, "incomplete" if none passes
+        (or the text contains no sentence at all), otherwise "partly complete"
+    """
+    nlp = load_spacy(spacy_model)
+    doc = nlp(text)
+
+    sentence_results = []
+    for sent in doc.sents:
+        # is_title would reject acronyms and mixed-case words such as "NASA"
+        # or "McDonald", so only the first character is checked.
+        starts_uppercase = sent[0].text[:1].isupper()
+        # NOTE(review): only VERB is counted; tokens tagged AUX (e.g. a copula)
+        # are not -- confirm that this is intended.
+        noun_count = sum(token.pos_ in ("NOUN", "PROPN", "PRON") for token in sent)
+        verb_count = sum(token.pos_ == "VERB" for token in sent)
+        sentence_results.append(
+            starts_uppercase
+            and sent[-1].is_punct
+            and noun_count >= 2
+            and verb_count >= 1
+        )
+
+    # Aggregation logic. any() is checked first because all([]) is True, which
+    # would label a text without sentences as "complete".
+    if not any(sentence_results):
+        return "incomplete"
+    if all(sentence_results):
+        return "complete"
+    return "partly complete"
+
+
+# ↑ necessary bricks function
+# -----------------------------------------------------------------------------------------
+# ↓ example implementation
+
+def example_integration():
+    texts = [
+        "This is a complete sentence written by me!",
+        "The first sentence I have written is complete! 
However, the second one...",
+        "and they rand over here and then"
+    ]
+    for text in texts:
+        print(f"The text '{text}' is -> {sentence_complete_classifier(text)}")
+
+example_integration()
+```
\ No newline at end of file
diff --git a/classifiers/reference_quality/sentence_complete_classifier/code_snippet_refinery.md b/classifiers/reference_quality/sentence_complete_classifier/code_snippet_refinery.md
new file mode 100644
index 00000000..92017a0c
--- /dev/null
+++ b/classifiers/reference_quality/sentence_complete_classifier/code_snippet_refinery.md
@@ -0,0 +1,33 @@
+```python
+ATTRIBUTE: str = "text"
+
+def sentence_complete_classifier(record):
+    """Label a record's text as "complete", "incomplete" or "partly complete".
+
+    A sentence counts as complete if it starts with an uppercase character,
+    ends with punctuation and contains at least two nouns and one verb.
+    """
+    sentence_results = []
+    for sent in record[ATTRIBUTE].sents:
+        # is_title would reject acronyms and mixed-case words such as "NASA"
+        # or "McDonald", so only the first character is checked.
+        starts_uppercase = sent[0].text[:1].isupper()
+        # NOTE(review): only VERB is counted; tokens tagged AUX (e.g. a copula)
+        # are not -- confirm that this is intended.
+        noun_count = sum(token.pos_ in ("NOUN", "PROPN", "PRON") for token in sent)
+        verb_count = sum(token.pos_ == "VERB" for token in sent)
+        sentence_results.append(
+            starts_uppercase
+            and sent[-1].is_punct
+            and noun_count >= 2
+            and verb_count >= 1
+        )
+
+    # Aggregation logic. any() is checked first because all([]) is True, which
+    # would label a text without sentences as "complete".
+    if not any(sentence_results):
+        return "incomplete"
+    if all(sentence_results):
+        return "complete"
+    return "partly complete"
+```
\ No newline at end of file
diff --git a/classifiers/reference_quality/sentence_complete_classifier/config.py b/classifiers/reference_quality/sentence_complete_classifier/config.py
new file mode 100644
index 00000000..7b8b78c7
--- /dev/null
+++ b/classifiers/reference_quality/sentence_complete_classifier/config.py
@@ -0,0 +1,38 @@
+from util.configs import build_classifier_function_config
+from util.enums import State, BricksVariableType, RefineryDataType, SelectionType
+from . 
import sentence_complete_classifier, INPUT_EXAMPLE
+
+
+def get_config():
+    """Return the config that build_classifier_function_config creates for this brick."""
+    attribute_selection = {
+        "selectionType": SelectionType.CHOICE.value,
+        "addInfo": [
+            BricksVariableType.ATTRIBUTE.value,
+            BricksVariableType.GENERIC_STRING.value,
+        ],
+    }
+    integrator_inputs = {
+        "name": "sentence_complete_classifier",
+        "refineryDataType": RefineryDataType.TEXT.value,
+        "variables": {"ATTRIBUTE": attribute_selection},
+    }
+    # NOTE(review): "partly complete" is also a possible output but has no
+    # entry in this mapping -- confirm that this is intended.
+    label_mapping = {"incomplete": "Needs fix", "complete": "null"}
+    return build_classifier_function_config(
+        # strapi information
+        function=sentence_complete_classifier,
+        input_example=INPUT_EXAMPLE,
+        issue_id=349,
+        tabler_icon="LanguageKatakana",
+        min_refinery_version="1.7.0",
+        state=State.PUBLIC.value,
+        type="python_function",
+        available_for=["refinery", "common"],
+        # first entry should be parent directory
+        part_of_group=["reference_quality"],
+        # bricks integrator information
+        cognition_init_mapping=label_mapping,
+        integrator_inputs=integrator_inputs,
+    )