From e094831eb6025376bca639d3f43d04639c9872f7 Mon Sep 17 00:00:00 2001
From: Lena <lena.jer@gmail.com>
Date: Tue, 10 Sep 2019 16:33:34 +0200
Subject: [PATCH 1/4] Named Entity count transformer based on spaCy. TODO:
 currently works only with pre-installed spaCy and language model.

---
 .../nlp/text_named_entities_transformer.py    | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 transformers/nlp/text_named_entities_transformer.py

diff --git a/transformers/nlp/text_named_entities_transformer.py b/transformers/nlp/text_named_entities_transformer.py
new file mode 100644
index 00000000..8ceac324
--- /dev/null
+++ b/transformers/nlp/text_named_entities_transformer.py
@@ -0,0 +1,71 @@
+"""Extract the counts of different named entities in the text (e.g. Person, Organization, Location)"""
+
+import datatable as dt
+import numpy as np
+
+from h2oaicore.transformer_utils import CustomTransformer
+
+
+class NamedEntityTransformer:
+    """Transformer to extract the count of Named Entities"""
+    _method = NotImplemented
+    _modules_needed_by_name = ["spacy==2.1.8"]
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        import spacy
+        self.nlp = spacy.load('en_core_web_sm')
+
+    @staticmethod
+    def get_default_properties():
+        return dict(col_type="text", min_cols=1, max_cols=1, relative_importance=1)
+
+    def get_ne_count(self, text):
+        ne_type = self.__class__._method
+        entities = self.nlp(text).ents
+        if entities:
+            return len([entity for entity in entities if entity.label_ == ne_type])
+        else:
+            return 0
+
+    def fit_transform(self, X: dt.Frame, y: np.array = None):
+        return self.transform(X)
+
+    def transform(self, X: dt.Frame):
+        return X.to_pandas().astype(str).fillna("NA").iloc[:, 0].apply(lambda x: self.get_ne_count(x))
+
+
+class PersonCountTransformer(NamedEntityTransformer, CustomTransformer):
+    """Get the count of Persons in the text column"""
+    _method = "PERSON"
+
+
+class OrgCountTransformer(NamedEntityTransformer, CustomTransformer):
+    """Get the count of organizations in the text column"""
+    _method = "ORG"
+
+
+class GeoCountTransformer(NamedEntityTransformer, CustomTransformer):
+    """Get the count of countries, cities, states in the text column"""
+    _method = "GPE"
+
+
+class LocCountTransformer(NamedEntityTransformer, CustomTransformer):
+    """Get the count of non-GPE locations in the text column"""
+    _method = "LOC"
+
+
+class ProductCountTransformer(NamedEntityTransformer, CustomTransformer):
+    """Get the count of products (objects, vehicles, foods, etc.) in the text column"""
+    _method = "PRODUCT"
+
+
+class EventCountTransformer(NamedEntityTransformer, CustomTransformer):
+    """Get the count of events (named hurricanes, battles, wars, sports events, etc.) in the text column"""
+    _method = "EVENT"
+
+
+class DateCountTransformer(NamedEntityTransformer, CustomTransformer):
+    """Get the count of dates and periods in the text column"""
+    _method = "DATE"

From 0ef542f31eee3e28ff2a96f5225ea5bb34f90017 Mon Sep 17 00:00:00 2001
From: Lena <lena.jer@gmail.com>
Date: Wed, 11 Sep 2019 10:17:11 +0200
Subject: [PATCH 2/4] Module version specified with '>' gives an error on
 upload

---
 transformers/nlp/text_named_entities_transformer.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/transformers/nlp/text_named_entities_transformer.py b/transformers/nlp/text_named_entities_transformer.py
index 8ceac324..bc5c505c 100644
--- a/transformers/nlp/text_named_entities_transformer.py
+++ b/transformers/nlp/text_named_entities_transformer.py
@@ -9,13 +9,18 @@
 class NamedEntityTransformer:
     """Transformer to extract the count of Named Entities"""
     _method = NotImplemented
-    _modules_needed_by_name = ["spacy==2.1.8"]
+    _modules_needed_by_name = ["spacy>2.1.0"]
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
         import spacy
-        self.nlp = spacy.load('en_core_web_sm')
+        try:
+            self.nlp = spacy.load('en_core_web_sm')
+        except IOError:
+            from spacy.cli import download
+            download('en_core_web_sm')
+            self.nlp = spacy.load('en_core_web_sm')
 
     @staticmethod
     def get_default_properties():

From 7df156eb675ccf86a63ef74d30021f864d7bfcc0 Mon Sep 17 00:00:00 2001
From: Lena <lena.jer@gmail.com>
Date: Thu, 12 Sep 2019 16:35:54 +0200
Subject: [PATCH 3/4] A transformer for replacing named entities with their
 tag.

---
 ...ext_named_entity_conversion_transformer.py | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 transformers/nlp/text_named_entity_conversion_transformer.py

diff --git a/transformers/nlp/text_named_entity_conversion_transformer.py b/transformers/nlp/text_named_entity_conversion_transformer.py
new file mode 100644
index 00000000..3112bc27
--- /dev/null
+++ b/transformers/nlp/text_named_entity_conversion_transformer.py
@@ -0,0 +1,61 @@
+"""Preprocess the text column by replacing named entities with a standard tag.
+For example: 'Mary lives in London from 2018' -> '[PERSON] lives in [GPE] from [DATE]' """
+import datatable as dt
+import numpy as np
+from h2oaicore.transformer_utils import CustomTransformer
+
+
+class NamedEntityConverterTransformer(CustomTransformer):
+    """Transformer to replace mentions of named entities with standard tags in the text"""
+    _numeric_output = False
+    _modules_needed_by_name = ["spacy==2.1.8"]
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.replace_person = True  # turn off as needed
+        self.replace_location = True  # turn off as needed
+        self.replace_date = True  # turn off as needed
+
+        import spacy
+        try:
+            self.nlp = spacy.load('en_core_web_sm')
+        except IOError:
+            from spacy.cli import download
+            download('en_core_web_sm')
+            self.nlp = spacy.load('en_core_web_sm')
+
+    @staticmethod
+    def get_default_properties():
+        return dict(col_type="text", min_cols=1, max_cols=1, relative_importance=1)
+
+    @property
+    def display_name(self):
+        return "NamedEntityConvertedText"
+
+    def convert_named_entities(self, text, entity_type):
+        tokens = self.nlp(text)
+        new_text = []
+        for token in tokens:
+            if token.ent_type_ == entity_type:
+                word = "[{0}]".format(entity_type)
+            else:
+                word = token.text
+            new_text.append(word)
+        return " ".join(new_text)
+
+    def convert_text(self, text):
+        if self.replace_person:
+            text = self.convert_named_entities(text, "PERSON")
+        if self.replace_date:
+            text = self.convert_named_entities(text, "DATE")
+        if self.replace_location:
+            text = self.convert_named_entities(text, "LOC")
+            text = self.convert_named_entities(text, "GPE")
+
+        return text
+
+    def fit_transform(self, X: dt.Frame, y: np.array = None):
+        return self.transform(X)
+
+    def transform(self, X: dt.Frame):
+        return X.to_pandas().astype(str).fillna("NA").iloc[:, 0].apply(lambda x: self.convert_text(x))

From 09afd71a5ea62d5288bcb657947a1ddd089ba135 Mon Sep 17 00:00:00 2001
From: Lena <lena.jer@gmail.com>
Date: Thu, 12 Sep 2019 16:37:51 +0200
Subject: [PATCH 4/4] Remove the named entity count transformer
 (text_named_entities_transformer.py).

---
 .../nlp/text_named_entities_transformer.py    | 76 -------------------
 1 file changed, 76 deletions(-)
 delete mode 100644 transformers/nlp/text_named_entities_transformer.py

diff --git a/transformers/nlp/text_named_entities_transformer.py b/transformers/nlp/text_named_entities_transformer.py
deleted file mode 100644
index bc5c505c..00000000
--- a/transformers/nlp/text_named_entities_transformer.py
+++ /dev/null
@@ -1,76 +0,0 @@
-"""Extract the counts of different named entities in the text (e.g. Person, Organization, Location)"""
-
-import datatable as dt
-import numpy as np
-
-from h2oaicore.transformer_utils import CustomTransformer
-
-
-class NamedEntityTransformer:
-    """Transformer to extract the count of Named Entities"""
-    _method = NotImplemented
-    _modules_needed_by_name = ["spacy>2.1.0"]
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-        import spacy
-        try:
-            self.nlp = spacy.load('en_core_web_sm')
-        except IOError:
-            from spacy.cli import download
-            download('en_core_web_sm')
-            self.nlp = spacy.load('en_core_web_sm')
-
-    @staticmethod
-    def get_default_properties():
-        return dict(col_type="text", min_cols=1, max_cols=1, relative_importance=1)
-
-    def get_ne_count(self, text):
-        ne_type = self.__class__._method
-        entities = self.nlp(text).ents
-        if entities:
-            return len([entity for entity in entities if entity.label_ == ne_type])
-        else:
-            return 0
-
-    def fit_transform(self, X: dt.Frame, y: np.array = None):
-        return self.transform(X)
-
-    def transform(self, X: dt.Frame):
-        return X.to_pandas().astype(str).fillna("NA").iloc[:, 0].apply(lambda x: self.get_ne_count(x))
-
-
-class PersonCountTransformer(NamedEntityTransformer, CustomTransformer):
-    """Get the count of Persons in the text column"""
-    _method = "PERSON"
-
-
-class OrgCountTransformer(NamedEntityTransformer, CustomTransformer):
-    """Get the count of organizations in the text column"""
-    _method = "ORG"
-
-
-class GeoCountTransformer(NamedEntityTransformer, CustomTransformer):
-    """Get the count of countries, cities, states in the text column"""
-    _method = "GPE"
-
-
-class LocCountTransformer(NamedEntityTransformer, CustomTransformer):
-    """Get the count of non-GPE locations in the text column"""
-    _method = "LOC"
-
-
-class ProductCountTransformer(NamedEntityTransformer, CustomTransformer):
-    """Get the count of products (objects, vehicles, foods, etc.) in the text column"""
-    _method = "PRODUCT"
-
-
-class EventCountTransformer(NamedEntityTransformer, CustomTransformer):
-    """Get the count of events (named hurricanes, battles, wars, sports events, etc.) in the text column"""
-    _method = "EVENT"
-
-
-class DateCountTransformer(NamedEntityTransformer, CustomTransformer):
-    """Get the count of dates and periods in the text column"""
-    _method = "DATE"