-
Notifications
You must be signed in to change notification settings - Fork 24
Sentence complete classifier #389
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Languages can be very dynamic and complicated. This brick does not actually try to accurately classify all sentences, which would be quite complex. Instead, this brick checks whether some characteristics apply that many complete sentences share: does the sentence start with an uppercase character, does it end with punctuation, and does it contain at least two nouns and a verb. The name `starts_with_uppercase_ends_with_punctuation_and_contains_two_nouns_and_a_verb` would be a bit long for a brick, though.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from pydantic import BaseModel | ||
from extractors.util.spacy import SpacySingleton | ||
|
||
# Example request payload: a sentence fragment that should classify as incomplete.
INPUT_EXAMPLE = {
    "text": "it would be sad if",
    "spacy_model": "en_core_web_sm",
}
|
||
class SentenceCompleteClassifierModel(BaseModel):
    """Request model for the sentence-complete classifier."""

    # Input text; may contain one or more sentences.
    text: str
    # Name of the spaCy language model to load, e.g. "en_core_web_sm".
    spacy_model: str

    class Config:
        # Example payload shown in the generated API docs.
        schema_extra = {"example": INPUT_EXAMPLE}
|
||
def sentence_complete_classifier(req: SentenceCompleteClassifierModel):
    """Classify whether the sentences of a text look complete.

    Heuristic, not a full grammatical analysis: a sentence counts as
    "complete" when it starts with a title-cased token, ends with
    punctuation, and contains at least two nouns (NOUN/PROPN/PRON)
    and one verb.

    Returns a dict with key "text_completeness" whose value is
    "complete", "incomplete", or "partly complete".
    """
    nlp = SpacySingleton.get_nlp(req.spacy_model)
    doc = nlp(req.text)

    classifications = []
    for sent in doc.sents:
        if sent[0].is_title and sent[-1].is_punct:
            # Counters start at the required amounts and are decremented
            # per matching token; <1 means the requirement is satisfied.
            has_noun = 2
            has_verb = 1
            for token in sent:
                if token.pos_ in ["NOUN", "PROPN", "PRON"]:
                    has_noun -= 1
                elif token.pos_ == "VERB":
                    has_verb -= 1
            if has_noun < 1 and has_verb < 1:
                classifications.append("complete")
            else:
                classifications.append("incomplete")
        else:
            classifications.append("incomplete")

    # Bug fix: an empty text yields no sentences, and all() over an empty
    # list is True, so the original returned "complete" for empty input.
    if not classifications:
        return {"text_completeness": "incomplete"}

    # Aggregation logic
    if all(classification == "complete" for classification in classifications):
        return {"text_completeness": "complete"}
    if all(classification == "incomplete" for classification in classifications):
        return {"text_completeness": "incomplete"}
    # Mixed results: some sentences complete, some not.
    return {"text_completeness": "partly complete"}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
```python | ||
import spacy | ||
|
||
# Cache of already-loaded spaCy models, keyed by model name.
loaded_models = {}


def load_spacy(spacy_model):
    """Load a spaCy model once and reuse it on subsequent calls."""
    model = loaded_models.get(spacy_model)
    if model is None:
        model = spacy.load(spacy_model)
        loaded_models[spacy_model] = model
    return model
|
||
def sentence_complete_classifier(text: str, spacy_model: str = "en_core_web_sm") -> str:
    """
    @param text: The text to classify
    @param spacy_model: A spaCy language model
    @returns: Classification for the text based on all sentences:
        "complete", "incomplete", or "partly complete"
    """
    nlp = load_spacy(spacy_model)
    doc = nlp(text)

    classifications = []
    for sent in doc.sents:
        if sent[0].is_title and sent[-1].is_punct:
            # Counters start at the required amounts and are decremented
            # per matching token; <1 means the requirement is satisfied.
            has_noun = 2
            has_verb = 1
            for token in sent:
                if token.pos_ in ["NOUN", "PROPN", "PRON"]:
                    has_noun -= 1
                elif token.pos_ == "VERB":
                    has_verb -= 1
            if has_noun < 1 and has_verb < 1:
                classifications.append("complete")
            else:
                classifications.append("incomplete")
        else:
            classifications.append("incomplete")

    # Bug fix: with no sentences (e.g. empty text), all() over the empty
    # list is True and the original returned "complete"; treat the case
    # as incomplete instead.
    if not classifications:
        return "incomplete"

    # Aggregation logic
    if all(classification == "complete" for classification in classifications):
        return "complete"
    if all(classification == "incomplete" for classification in classifications):
        return "incomplete"
    # Mixed results: some sentences complete, some not.
    return "partly complete"
|
||
|
||
# ↑ necessary bricks function | ||
# ----------------------------------------------------------------------------------------- | ||
# ↓ example implementation | ||
|
||
def example_integration():
    """Run the classifier over a few sample texts and print each result."""
    samples = [
        "This is a complete sentence written by me!",
        "The first sentence I have written is complete! However, the second one...",
        "and they rand over here and then"
    ]
    for sample in samples:
        classification = sentence_complete_classifier(sample)
        print(f"The text '{sample}' is -> {classification}")


example_integration()
``` |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
```python | ||
# Name of the record attribute holding the (spaCy-processed) text.
ATTRIBUTE: str = "text"


def sentence_complete_classifier(record):
    """Classify whether the sentences of a record's text look complete.

    Expects record[ATTRIBUTE] to expose spaCy-style `.sents` whose tokens
    carry `is_title`, `is_punct` and `pos_`. Returns "complete",
    "incomplete", or "partly complete".
    """
    classifications = []
    for sent in record[ATTRIBUTE].sents:
        if sent[0].is_title and sent[-1].is_punct:
            # Counters start at the required amounts and are decremented
            # per matching token; <1 means the requirement is satisfied.
            has_noun = 2
            has_verb = 1
            for token in sent:
                if token.pos_ in ["NOUN", "PROPN", "PRON"]:
                    has_noun -= 1
                elif token.pos_ == "VERB":
                    has_verb -= 1
            if has_noun < 1 and has_verb < 1:
                classifications.append("complete")
            else:
                classifications.append("incomplete")
        else:
            classifications.append("incomplete")

    # Bug fix: with no sentences, all() over the empty list is True and the
    # original returned "complete"; treat the case as incomplete instead.
    if not classifications:
        return "incomplete"

    # Aggregation logic
    if all(classification == "complete" for classification in classifications):
        return "complete"
    if all(classification == "incomplete" for classification in classifications):
        return "incomplete"
    # Mixed results: some sentences complete, some not.
    return "partly complete"
``` |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
from util.configs import build_classifier_function_config | ||
from util.enums import State, BricksVariableType, RefineryDataType, SelectionType | ||
from . import sentence_complete_classifier, INPUT_EXAMPLE | ||
|
||
|
||
def get_config():
    """Build the bricks classifier config for the sentence-complete classifier."""
    return build_classifier_function_config(
        # strapi information
        function=sentence_complete_classifier,
        input_example=INPUT_EXAMPLE,
        # NOTE(review): the PR title references #389 — confirm 349 is the
        # intended issue id.
        issue_id=349,
        tabler_icon="LanguageKatakana",
        min_refinery_version="1.7.0",
        state=State.PUBLIC.value,
        type="python_function",
        available_for=["refinery", "common"],
        part_of_group=[
            "reference_quality",
        ],  # first entry should be parent directory
        # bricks integrator information
        cognition_init_mapping={
            "incomplete": "Needs fix",
            # NOTE(review): the literal string "null" looks suspicious here —
            # confirm it should not be None or omitted entirely.
            "complete": "null",
        },
        integrator_inputs={
            "name": "sentence_complete_classifier",
            "refineryDataType": RefineryDataType.TEXT.value,
            "variables": {
                "ATTRIBUTE": {
                    "selectionType": SelectionType.CHOICE.value,
                    "addInfo": [
                        BricksVariableType.ATTRIBUTE.value,
                        BricksVariableType.GENERIC_STRING.value,
                    ],
                },
            },
        },
    )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
From the name and description of the brick, I would not expect the aggregation logic.
I suggest mentioning it in the README