From e094831eb6025376bca639d3f43d04639c9872f7 Mon Sep 17 00:00:00 2001 From: Lena Date: Tue, 10 Sep 2019 16:33:34 +0200 Subject: [PATCH 1/4] Named Entity count transformer based on Spacy. TODO works only with pre-installed Spacy and language model. --- .../nlp/text_named_entities_transformer.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 transformers/nlp/text_named_entities_transformer.py diff --git a/transformers/nlp/text_named_entities_transformer.py b/transformers/nlp/text_named_entities_transformer.py new file mode 100644 index 00000000..8ceac324 --- /dev/null +++ b/transformers/nlp/text_named_entities_transformer.py @@ -0,0 +1,71 @@ +"""Extract the counts of different named entities in the text (e.g. Person, Organization, Location)""" + +import datatable as dt +import numpy as np + +from h2oaicore.transformer_utils import CustomTransformer + + +class NamedEntityTransformer: + """Transformer to extract the count of Named Entities""" + _method = NotImplemented + _modules_needed_by_name = ["spacy==2.1.8"] + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + import spacy + self.nlp = spacy.load('en_core_web_sm') + + @staticmethod + def get_default_properties(): + return dict(col_type="text", min_cols=1, max_cols=1, relative_importance=1) + + def get_ne_count(self, text): + ne_type = self.__class__._method + entities = self.nlp(text).ents + if entities: + return len([entity for entity in entities if entity.label_ == ne_type]) + else: + return 0 + + def fit_transform(self, X: dt.Frame, y: np.array = None): + return self.transform(X) + + def transform(self, X: dt.Frame): + return X.to_pandas().astype(str).fillna("NA").iloc[:, 0].apply(lambda x: self.get_ne_count(x)) + + +class PersonCountTransformer(NamedEntityTransformer, CustomTransformer): + """Get the count of Persons in the text column""" + _method = "PERSON" + + +class OrgCountTransformer(NamedEntityTransformer, CustomTransformer): + """Get the count of organizations in the text column""" + _method = "ORG" + + +class GeoCountTransformer(NamedEntityTransformer, CustomTransformer): + """Get the count of countries, cities, states in the text column""" + _method = "GPE" + + +class LocCountTransformer(NamedEntityTransformer, CustomTransformer): + """Get the count of non-GPE locations in the text column""" + _method = "LOC" + + +class ProductCountTransformer(NamedEntityTransformer, CustomTransformer): + """Get the count of products (objects, vehicles, foods, etc.) in the text column""" + _method = "PRODUCT" + + +class EventCountTransformer(NamedEntityTransformer, CustomTransformer): + """Get the count of events (named hurricanes, battles, wars, sports events, etc.) in the text column""" + _method = "EVENT" + + +class DateCountTransformer(NamedEntityTransformer, CustomTransformer): + """Get the count of dates and periods in the text column""" + _method = "DATE" From 0ef542f31eee3e28ff2a96f5225ea5bb34f90017 Mon Sep 17 00:00:00 2001 From: Lena Date: Wed, 11 Sep 2019 10:17:11 +0200 Subject: [PATCH 2/4] Module version specified with '>' gives an error on upload --- transformers/nlp/text_named_entities_transformer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/transformers/nlp/text_named_entities_transformer.py b/transformers/nlp/text_named_entities_transformer.py index 8ceac324..bc5c505c 100644 --- a/transformers/nlp/text_named_entities_transformer.py +++ b/transformers/nlp/text_named_entities_transformer.py @@ -9,13 +9,18 @@ class NamedEntityTransformer: """Transformer to extract the count of Named Entities""" _method = NotImplemented - _modules_needed_by_name = ["spacy==2.1.8"] + _modules_needed_by_name = ["spacy>2.1.0"] def __init__(self, **kwargs): super().__init__(**kwargs) import spacy - self.nlp = spacy.load('en_core_web_sm') + try: + self.nlp = spacy.load('en_core_web_sm') + except IOError: + from spacy.cli import download + download('en_core_web_sm') + self.nlp = spacy.load('en_core_web_sm') @staticmethod def get_default_properties(): From 7df156eb675ccf86a63ef74d30021f864d7bfcc0 Mon Sep 17 00:00:00 2001 From: Lena Date: Thu, 12 Sep 2019 16:35:54 +0200 Subject: [PATCH 3/4] A transformer for replacing named entities with their tag. --- ...ext_named_entity_conversion_transformer.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 transformers/nlp/text_named_entity_conversion_transformer.py diff --git a/transformers/nlp/text_named_entity_conversion_transformer.py b/transformers/nlp/text_named_entity_conversion_transformer.py new file mode 100644 index 00000000..3112bc27 --- /dev/null +++ b/transformers/nlp/text_named_entity_conversion_transformer.py @@ -0,0 +1,61 @@ +"""Preprocess the text column by replacing named entities with a standard tag +For example: 'Mary lives in London from 2018' -> '[PERSON] lives in [GPE] from [DATE]' """ +import datatable as dt +import numpy as np +from h2oaicore.transformer_utils import CustomTransformer + + +class NamedEntityConverterTransformer(CustomTransformer): + """Transformer to replace mentions of named entities with standard tags the text""" + _numeric_output = False + _modules_needed_by_name = ["spacy==2.1.8"] + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.replace_person = True # turn off as needed + self.replace_location = True # turn off as needed + self.replace_date = True # turn off as needed + + import spacy + try: + self.nlp = spacy.load('en_core_web_sm') + except IOError: + from spacy.cli import download + download('en_core_web_sm') + self.nlp = spacy.load('en_core_web_sm') + + @staticmethod + def get_default_properties(): + return dict(col_type="text", min_cols=1, max_cols=1, relative_importance=1) + + @property + def display_name(self): + return "NamedEntityConvertedText" + + def convert_named_entities(self, text, entity_type): + tokens = self.nlp(text) + new_text = [] + for token in tokens: + if token.ent_type_ == entity_type: + word = "[{0}]".format(entity_type) + else: + word = token.text + new_text.append(word) + return " ".join(new_text) + + def convert_text(self, text): + if self.replace_person: + text = self.convert_named_entities(text, "PERSON") + if self.replace_date: + text = self.convert_named_entities(text, "DATE") + if self.replace_location: + text = self.convert_named_entities(text, "LOC") + text = self.convert_named_entities(text, "GPE") + + return text + + def fit_transform(self, X: dt.Frame, y: np.array = None): + return self.transform(X) + + def transform(self, X: dt.Frame): + return X.to_pandas().astype(str).fillna("NA").iloc[:, 0].apply(lambda x: self.convert_text(x)) From 09afd71a5ea62d5288bcb657947a1ddd089ba135 Mon Sep 17 00:00:00 2001 From: Lena Date: Thu, 12 Sep 2019 16:37:51 +0200 Subject: [PATCH 4/4] A transformer for replacing named entities with their tag. --- .../nlp/text_named_entities_transformer.py | 76 ------------------- 1 file changed, 76 deletions(-) delete mode 100644 transformers/nlp/text_named_entities_transformer.py diff --git a/transformers/nlp/text_named_entities_transformer.py b/transformers/nlp/text_named_entities_transformer.py deleted file mode 100644 index bc5c505c..00000000 --- a/transformers/nlp/text_named_entities_transformer.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Extract the counts of different named entities in the text (e.g. Person, Organization, Location)""" - -import datatable as dt -import numpy as np - -from h2oaicore.transformer_utils import CustomTransformer - - -class NamedEntityTransformer: - """Transformer to extract the count of Named Entities""" - _method = NotImplemented - _modules_needed_by_name = ["spacy>2.1.0"] - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - import spacy - try: - self.nlp = spacy.load('en_core_web_sm') - except IOError: - from spacy.cli import download - download('en_core_web_sm') - self.nlp = spacy.load('en_core_web_sm') - - @staticmethod - def get_default_properties(): - return dict(col_type="text", min_cols=1, max_cols=1, relative_importance=1) - - def get_ne_count(self, text): - ne_type = self.__class__._method - entities = self.nlp(text).ents - if entities: - return len([entity for entity in entities if entity.label_ == ne_type]) - else: - return 0 - - def fit_transform(self, X: dt.Frame, y: np.array = None): - return self.transform(X) - - def transform(self, X: dt.Frame): - return X.to_pandas().astype(str).fillna("NA").iloc[:, 0].apply(lambda x: self.get_ne_count(x)) - - -class PersonCountTransformer(NamedEntityTransformer, CustomTransformer): - """Get the count of Persons in the text column""" - _method = "PERSON" - - -class OrgCountTransformer(NamedEntityTransformer, CustomTransformer): - """Get the count of organizations in the text column""" - _method = "ORG" - - -class GeoCountTransformer(NamedEntityTransformer, CustomTransformer): - """Get the count of countries, cities, states in the text column""" - _method = "GPE" - - -class LocCountTransformer(NamedEntityTransformer, CustomTransformer): - """Get the count of non-GPE locations in the text column""" - _method = "LOC" - - -class ProductCountTransformer(NamedEntityTransformer, CustomTransformer): - """Get the count of products (objects, vehicles, foods, etc.) in the text column""" - _method = "PRODUCT" - - -class EventCountTransformer(NamedEntityTransformer, CustomTransformer): - """Get the count of events (named hurricanes, battles, wars, sports events, etc.) in the text column""" - _method = "EVENT" - - -class DateCountTransformer(NamedEntityTransformer, CustomTransformer): - """Get the count of dates and periods in the text column""" - _method = "DATE"