Purpose widget #129

Merged

merged 9 commits into from Jan 21, 2025
Binary file added doc/widgets/icons/purpose_analysis_adverbs.png
Binary file added doc/widgets/icons/purpose_analysis_output.png
Binary file added doc/widgets/icons/purpose_analysis_pipeline.png
Binary file added doc/widgets/icons/purpose_analysis_widget.png
64 changes: 64 additions & 0 deletions doc/widgets/purpose.md
@@ -0,0 +1,64 @@
![](../../orangecontrib/storynavigation/widgets/icons/purpose_analysis_icon.png)

Purpose Analysis widget
=======================

**Introduction**

The Purpose Analysis widget can be used for finding the *purpose* expressed in sentences of a text. This corresponds to the
concept of the same name in [Burke's dramatistic pentad](https://en.wikipedia.org/wiki/Dramatistic_pentad#Agency) and
to the *[final clause](https://en.wikipedia.org/wiki/Final_clause)* in linguistics. Here are two example sentences:

English: <span style="color:pink">John walks every day.</span> <span style="color:lightblue">Thus</span>
<span style="color:lightgreen">he improves his health</span>
<br>Dutch: <span style="color:pink">Jan wandelt elke dag.</span> <span style="color:lightblue">Daardoor</span>
<span style="color:lightgreen">verbetert hij zijn gezondheid</span>

In the English sentence, the purpose phrase is *he improves his health*. The sentence also contains an intentionally
performed action as context: *John walks every day*. The action is linked to the purpose phrase by
an adverb (*thus*). In the widget, the purpose phrase is labeled PURPOSE, the action phrase
CONTEXT, and the linking adverb ADVERB.

**Methods**

The identification of purpose phrases in text requires a tool that can recognize intention. As far as we know, no
such tool exists. Instead, we use an approximation based on the syntactic analysis tool [Spacy](https://spacy.io/).
The Purpose Analysis widget offers two strategies for finding purpose phrases:

1. verbs: search for verbs like *want* to identify the purpose of the author of the text
2. adverbs: search for adverbs like *because* to find cause-effect relations in the text which could be context-purpose relations

Neither of these strategies works perfectly. The Purpose Analysis widget therefore allows users to provide their own
lists of verbs and adverbs, and to select the strategy that corresponds best with their application.
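
For illustration, here is a minimal sketch of driving the underlying `PurposeAnalyzer` class directly from Python,
outside the widget. It assumes `story_elements` holds the Orange table produced by the Elements widget; the other
names come from this pull request:

```python
import csv

import storynavigation.modules.constants as constants
from storynavigation.modules.purposeanalysis import PurposeAnalyzer

# Read the (verb, adverb, reversed) rows from the bundled adverbs file.
with open("orangecontrib/storynavigation/resources/dutch_purpose_adverbs.csv") as csv_file:
    reader = csv.reader(csv_file)
    next(reader)  # skip the header row
    verb_frames = list(reader)

# `story_elements` is assumed to be the Orange table produced by the
# Elements widget for the stories under analysis.
analyzer = PurposeAnalyzer(
    language=constants.NL,
    story_elements=story_elements,
    verb_frames=verb_frames,
    purpose_strategy=constants.PURPOSE_STRATEGY_ADVERBS,
)
print(analyzer.purpose_analysis)  # columns: text, label, text_id, sentence_id, character_id
```

The `purpose_analysis` attribute holds the output table shown in Figure 4.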

**Languages**

The Purpose Analysis widget has been tested for Dutch. Note that the verb file and adverb file need to be chosen
separately from the language selection. Languages other than Dutch and English are not supported.

**Relations with other widgets**

The Purpose Analysis widget expects input from the *Elements* widget. The output table can be inspected in a *Data Table*
widget.

**Screenshots**

![](icons/purpose_analysis_pipeline.png)

Figure 1: The Purpose Analysis widget expects input from the Elements widget. The output table can be inspected in a Data
Table widget

![](icons/purpose_analysis_widget.png)

Figure 2: Purpose Analysis widget analysis of the sentence: "Jan wandelt elke dag. Daardoor verbetert hij zijn
gezondheid". The *Browse* button can be used to select a different adverbs file

![](icons/purpose_analysis_adverbs.png)

Figure 3: Contents of the file dutch_purpose_adverbs.csv

![](icons/purpose_analysis_output.png)

Figure 4: Output table of the Purpose Analysis widget for the sentence "Jan wandelt elke dag. Daardoor verbetert hij
zijn gezondheid"

8 changes: 8 additions & 0 deletions orangecontrib/storynavigation/modules/constants.py
@@ -204,3 +204,11 @@
MEANS_STRATEGY_VERB_FRAMES = "use verb frames"
MEANS_STRATEGY_VERB_FRAME_PREPS = "use prepositions from verb frames "
MEANS_STRATEGY_SPACY_PREPS = "use prepositions from Spacy"

# purpose widget strategies
PURPOSE_STRATEGY_ADVERBS = "use adverbs"
PURPOSE_STRATEGY_VERBS = "use verbs"

# first person words for purpose widget
NL_FIRST_PERSON_WORDS = ['ik', 'we', 'wij']
EN_FIRST_PERSON_WORDS = ['i', 'we']
276 changes: 276 additions & 0 deletions orangecontrib/storynavigation/modules/purposeanalysis.py
@@ -0,0 +1,276 @@
import pandas as pd
import storynavigation.modules.constants as constants
import storynavigation.modules.util as util


class PurposeAnalyzer:
"""Class for extracting purpose from texts

Args:
language (str): ISO string of the language of the input text
story_elements (list of lists): tokens with their Spacy analysis
verb_frames: verb frames indicating purpose
purpose_strategy: strategy to identify purpose
callback: function in widget to show the progress of this process
"""


PURPOSE_LABELS = ['PURPOSE', 'ADVERB', 'CONTEXT']


def __init__(self, language, story_elements, verb_frames, purpose_strategy, callback=None) -> None:
self.language = language
self.verb_frames = verb_frames
self.purpose_strategy = purpose_strategy
if self.language == constants.NL:
self.first_person_words = constants.NL_FIRST_PERSON_WORDS
else:
self.first_person_words = constants.EN_FIRST_PERSON_WORDS
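        # pipeline: convert the Orange table to a dataframe, collect purpose entities per sentence,
        # map sentence-local onsets to story-wide character offsets, fill in missing relation parts,
        # then sort and filter the results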
story_elements_df = util.convert_orangetable_to_dataframe(story_elements)
self.__convert_str_columns_to_ints(story_elements_df)
entities = self.__process_texts(story_elements_df, callback=callback)
sentence_offsets = self.__compute_sentence_offsets(story_elements_df)
entities_from_onsets = self.__convert_entities(entities, sentence_offsets)
entities_from_onsets = self.__add_missing_relation_parts(story_elements_df, entities_from_onsets, sentence_offsets)
self.purpose_analysis = self.__sort_and_filter_results(entities_from_onsets)


def __convert_str_columns_to_ints(self, story_elements_df) -> None:
columns_to_convert = ["storyid", "sentence_id", "token_start_idx", "spacy_head_idx"]
story_elements_df[columns_to_convert] = story_elements_df[columns_to_convert].astype(int)


def __compute_sentence_offsets(self, story_elements_df) -> pd.DataFrame:
sentences_df = story_elements_df.groupby(["storyid", "sentence_id"]).first().reset_index()[["storyid", "sentence_id", "sentence"]]
char_offsets = []
last_sentence = ""
for sentence_id, sentence in zip(sentences_df["sentence_id"],
sentences_df["sentence"]):
if sentence_id == sentences_df.iloc[0]["sentence_id"]:
char_offset = 0
else:
char_offset += len(last_sentence) + 1
char_offsets.append(char_offset)
last_sentence = sentence
sentences_df["char_offset"] = char_offsets
return sentences_df[["storyid", "sentence_id", "char_offset"]].set_index(["storyid", "sentence_id"])


def __get_missing_label(self, entities_from_onsets, storyid, sentence_id) -> list:
labels_found = [entity['label_'] for entity in entities_from_onsets[storyid].values() if entity['sentence_id'] == sentence_id]
return [x for x in self.PURPOSE_LABELS if x not in labels_found]


def __add_missing_relation_part(self, entities_from_onsets, sentence_offsets, storyid, sentence_id, previous_sentence) -> None:
missing_labels = self.__get_missing_label(entities_from_onsets, storyid, sentence_id)
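        # when exactly one of PURPOSE/ADVERB/CONTEXT is missing for this sentence,
        # assign it to the entire previous sentence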
if len(missing_labels) == 1:
char_id_start = sentence_offsets.loc[(storyid, sentence_id - 1)]["char_offset"]
char_id_end = sentence_offsets.loc[(storyid, sentence_id)]["char_offset"] - 1
entities_from_onsets[storyid][char_id_start] = {
'label_': missing_labels[0],
'sentence_id': sentence_id,
'text': previous_sentence
}


def __add_missing_relation_parts(self, story_elements_df, entities_from_onsets, sentence_offsets) -> dict:
sentences_df = story_elements_df.groupby(['storyid', 'sentence_id'])['sentence'].first()
for storyid in entities_from_onsets:
sentence_ids = {}
for char_id in entities_from_onsets[storyid]:
sentence_id = entities_from_onsets[storyid][char_id]['sentence_id']
label = entities_from_onsets[storyid][char_id]['label_']
if sentence_id in sentence_ids:
sentence_ids[sentence_id].append(label)
else:
sentence_ids[sentence_id] = [label]
for sentence_id in sentence_ids:
if len(sentence_ids[sentence_id]) == 2 and sentence_id > 0:
self.__add_missing_relation_part(entities_from_onsets,
sentence_offsets,
storyid,
sentence_id,
sentences_df.loc[storyid, sentence_id - 1])
return entities_from_onsets


def __convert_entities(self, entities, sentence_offsets) -> dict:
entities_from_onsets = {}
for storyid, sentence_id, sentence_data in entities:
story_entities = entities_from_onsets.setdefault(storyid, {})
char_offset_sentence = sentence_offsets.loc[(storyid, sentence_id)]["char_offset"]
for token_start_id, token_data in sentence_data.items():
story_entities[token_start_id + char_offset_sentence] = token_data
return entities_from_onsets


    def __convert_stories_to_sentences(self, story_elements_df) -> dict:
return { index: group.to_dict(orient="index") for index, group in story_elements_df.groupby(["storyid", "sentence_id"])}


def __process_texts(self, story_elements_df, callback=None) -> list:
sentence_dict = self.__convert_stories_to_sentences(story_elements_df)
entities = []
for index, (sentence_dict_index, row_sentence_dict) in enumerate(sentence_dict.items()):
row_sentence_dict = { token["token_start_idx"]: token
for token_idx, token in row_sentence_dict.items() }
sentence_entities = self.__process_sentence(row_sentence_dict)
if sentence_entities:
entities.append([
sentence_dict_index[0],
sentence_dict_index[1],
sentence_entities])
if callback:
callback((100*(index + 1))/len(sentence_dict))
return entities


def __find_matching_dependencies(self, sentence_dict, entity_start_id, head_start_id, head_of_head_start_id) -> bool:
try:
if sentence_dict[head_of_head_start_id]["spacy_tag"] not in {"VERB", "AUX"}:
return False
        except KeyError:
return False
verb_frame_adverbs = [x[1] for x in self.verb_frames]
verb_frame_verbs = [x[0] for x in self.verb_frames]
entity = sentence_dict[entity_start_id]
head = sentence_dict[head_start_id]
return ((self.purpose_strategy == constants.PURPOSE_STRATEGY_ADVERBS and
entity['spacy_lemma'].lower() in verb_frame_adverbs) or
(self.purpose_strategy == constants.PURPOSE_STRATEGY_VERBS and
entity['spacy_lemma'].lower() in self.first_person_words and
head['spacy_lemma'].lower() in verb_frame_verbs))


def __expand_phrase(self, sentence_dict, sentence_entities, entity_start_id, head_start_id, processed_ids) -> None:
child_entity_ids = self.__get_head_dependencies(sentence_dict, entity_start_id, head_start_id)
head_start_id = self.__prepend_tokens_to_purpose_phrase(sentence_dict, sentence_entities, head_start_id, child_entity_ids, processed_ids)
self.__append_tokens_to_purpose_phrase(sentence_dict, sentence_entities, head_start_id, child_entity_ids, processed_ids)
for child_entity_id in sorted(set(child_entity_ids).difference(processed_ids)):
print(sentence_dict[entity_start_id]["token_text"], sentence_dict[head_start_id]["token_text"],
"skipping purpose word", sentence_dict[child_entity_id]["spacy_lemma"], sentence_dict[child_entity_id]["sentence"])


    def __prepend_tokens_to_purpose_phrase(self, sentence_dict, sentence_entities, head_start_id, child_entity_ids, processed_ids) -> int:
for child_entity_id in sorted(child_entity_ids, reverse=True):
if child_entity_id in processed_ids:
continue
child_entity_text = sentence_dict[child_entity_id]["token_text"]
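            # character gap between the end of the child token and the start of the phrase: 1 = " ", 2 = ", ", 3 = " , "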
entity_gap_size = head_start_id - len(child_entity_text) - child_entity_id
if entity_gap_size in [1, 2, 3]:
if entity_gap_size == 1:
in_between_text = " "
elif entity_gap_size == 2:
in_between_text = ", "
elif entity_gap_size == 3:
in_between_text = " , "
sentence_entities[child_entity_id] = {
"text": child_entity_text + in_between_text + sentence_entities[head_start_id]["text"],
"sentence_id": sentence_dict[child_entity_id]["sentence_id"],
"label_": sentence_entities[head_start_id]['label_'] }
del sentence_entities[head_start_id]
head_start_id = child_entity_id
processed_ids.add(child_entity_id)
return head_start_id


def __append_tokens_to_purpose_phrase(self, sentence_dict, sentence_entities, head_start_id, child_entity_ids, processed_ids) -> None:
for child_entity_id in sorted(child_entity_ids):
if child_entity_id in processed_ids:
continue
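            # character gap between the end of the phrase collected so far and the start of the child token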
entity_gap_size = child_entity_id - head_start_id - len(sentence_entities[head_start_id]["text"])
if entity_gap_size in [1, 2, 3]:
if entity_gap_size == 1:
in_between_text = " "
elif entity_gap_size == 2:
in_between_text = ", "
elif entity_gap_size == 3:
in_between_text = " , "
sentence_entities[head_start_id]["text"] += in_between_text + sentence_dict[child_entity_id]["token_text"]
processed_ids.add(child_entity_id)


def __process_sentence(self, sentence_dict) -> dict:
sentence_entities = {}
for entity_start_id, token_data in sorted(sentence_dict.items()):
try:
head_start_id = token_data.get("spacy_head_idx")
head_of_head_start_id = sentence_dict.get(head_start_id, {}).get("spacy_head_idx")
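                # for English input, the entity and head roles are swapped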
if self.language == constants.EN:
entity_start_id, head_start_id = head_start_id, entity_start_id
if self.__find_matching_dependencies(sentence_dict, entity_start_id, head_start_id, head_of_head_start_id):
if self.purpose_strategy == constants.PURPOSE_STRATEGY_VERBS:
self.__add_sentence_entity_verb(sentence_dict, sentence_entities, head_start_id)
elif self.purpose_strategy == constants.PURPOSE_STRATEGY_ADVERBS:
if head_start_id == head_of_head_start_id:
print("overlapping relation parts!", sentence_dict[head_start_id]["token_text"])
self.__add_sentence_entity_adverb(sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id)
except AttributeError as e:
self.__log_error("attribute error", e, token_data)
except KeyError as e:
self.__log_error("key error", e, token_data)
return sentence_entities


def __log_error(self, error_phrase, e, token_data) -> None:
print(f"{error_phrase}: missing {e} in {token_data['storyid']} {token_data['token_text']} {token_data['sentence']}")


def __add_sentence_entity_adverb(self, sentence_dict, sentence_entities, entity_start_id, head_start_id, head_of_head_start_id) -> None:
entity = sentence_dict[entity_start_id]
sentence_id = entity["sentence_id"]
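        # adverbs marked "yes" in the reversed column make the head phrase the PURPOSE and its head's phrase the CONTEXT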
reversed_order = ([x[2] for x in self.verb_frames if x[1] == entity["token_text"].lower()] == ['yes'])
head_label, head_of_head_label = ['PURPOSE', 'CONTEXT'] if reversed_order else ['CONTEXT', 'PURPOSE']
sentence_entities[entity_start_id] = {
"label_": "ADVERB",
"sentence_id": sentence_id,
"text": entity["token_text"]}
sentence_entities[head_start_id] = {
"label_": head_label,
"sentence_id": sentence_id,
"text": sentence_dict[head_start_id]["token_text"]}
processed_ids = {entity_start_id, head_start_id, head_of_head_start_id}
self.__expand_phrase(sentence_dict,
sentence_entities,
entity_start_id,
head_start_id,
processed_ids=processed_ids)
if head_of_head_start_id != head_start_id:
sentence_entities[head_of_head_start_id] = {
"label_": head_of_head_label,
"sentence_id": sentence_id,
"text": sentence_dict[head_of_head_start_id]["token_text"]}
self.__expand_phrase(sentence_dict,
sentence_entities,
entity_start_id,
head_of_head_start_id,
processed_ids=processed_ids)


def __add_sentence_entity_verb(self, sentence_dict, sentence_entities, entity_start_id) -> None:
entity = sentence_dict[entity_start_id]
sentence_id = entity["sentence_id"]
sentence_entities[entity_start_id] = {
"label_": "PURPOSE",
"sentence_id": sentence_id,
"text": entity["token_text"]}
self.__expand_phrase(sentence_dict, sentence_entities, entity_start_id, entity_start_id, processed_ids=set())


def __get_head_dependencies(self, sentence_dict, entity_start_id, head_start_id) -> list:
entity_ids = []
for start_id, token in sorted(sentence_dict.items()):
if token["spacy_head_idx"] == head_start_id and start_id not in {entity_start_id, head_start_id}:
entity_ids.append(start_id)
entity_ids.extend(self.__get_head_dependencies(sentence_dict, entity_start_id, start_id))
return entity_ids


def __sort_and_filter_results(self, entities) -> pd.DataFrame:
results = [(entity["text"], entity["label_"], storyid, entity["sentence_id"], char_id)
for storyid, story_entities in entities.items()
for char_id, entity in story_entities.items()]
results_df = pd.DataFrame(results, columns=["text", "label", "storyid", "sentence_id", "character_id"])
results_df.sort_values(by=["storyid", "character_id"], inplace=True)
results_df["text_id"] = "ST" + results_df["storyid"].astype(str)
return results_df[["text", "label", "text_id", "sentence_id", "character_id"]].reset_index(drop=True)
2 changes: 1 addition & 1 deletion orangecontrib/storynavigation/modules/tagging.py
@@ -499,7 +499,7 @@ def __is_valid_token(self, token):
"""
word = util.get_normalized_token(token)

-        return (word not in self.stopwords) and len(word) > 1 and util.is_only_punctuation(word) != '-'
+        return (word not in self.stopwords) and len(word) > 0 and util.is_only_punctuation(word) != '-'

def __is_subject(self, tag):
"""Checks whether a given pos-tagged token is a subject of its sentence or not
11 changes: 11 additions & 0 deletions orangecontrib/storynavigation/resources/dutch_purpose_adverbs.csv
@@ -0,0 +1,11 @@
verb,adverb,reversed
ignore,omdat,no
ignore,doordat,no
ignore,zodat,yes
ignore,opdat,no
ignore,daardoor,yes
ignore,want,no
ignore,waardoor,yes
ignore,dus,yes
ignore,daarom,yes
ignore,hierdoor,yes
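
The third column, `reversed`, controls the clause order: for adverbs marked `yes` (e.g. *dus*, *daardoor*), the
phrase headed by the adverb's head is labeled PURPOSE instead of CONTEXT. A minimal sketch of the convention as
implemented in `purposeanalysis.py` (the helper function is illustrative, not part of the module):

```python
# Illustrative helper (not part of the module): how the `reversed` flag in
# a (verb, adverb, reversed) row selects the labels for the two phrases.
def labels_for_adverb(verb_frames, adverb_token):
    reversed_order = [row[2] for row in verb_frames
                      if row[1] == adverb_token.lower()] == ["yes"]
    # returns (label of the head phrase, label of the head-of-head phrase)
    return ("PURPOSE", "CONTEXT") if reversed_order else ("CONTEXT", "PURPOSE")
```
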
@@ -0,0 +1,5 @@
verb,adverb
willen,ignore
hopen,ignore
verlangen,ignore
wensen,ignore