Skip to content

Commit

Permalink
Add bullet points to markdown output (long description)
Browse files Browse the repository at this point in the history
  • Loading branch information
JakeWags committed Dec 2, 2024
1 parent 060d043 commit 2a1af1f
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 88 deletions.
19 changes: 15 additions & 4 deletions src/alttxt/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,25 @@ def text(self) -> str:


if self.structured:
# Helper function to replace periods with newlines and bullet points
def add_bullet_points(text: str) -> str:
    """Format a block of prose as a markdown bullet list, one sentence per bullet.

    Sentences are delimited by a period followed by a space ('. ') rather
    than every '.', so decimal numbers and percentages such as "33.3%" are
    not split into spurious bullets.

    Args:
        text: The prose to reformat; surrounding whitespace is ignored.

    Returns:
        The text with "* " prefixed to each sentence and sentences separated
        by newlines. An empty (or whitespace-only) input yields "".
    """
    text = text.strip()
    if not text:
        # Nothing to bullet; avoid emitting a dangling "* " marker.
        return text

    # Split on '. ' so periods inside numbers (e.g. "33.3%") stay intact;
    # the join re-inserts the period stripped from each sentence.
    sentences = text.split('. ')
    return '* ' + '.\n* '.join(sentences)

# Construct the dictionary for markdown content
data_to_write_as_md = {
"UpSet Introduction": self.replaceTokens(introduction),
"Dataset Properties": self.replaceTokens(dataset_properties),
"Set Properties": self.replaceTokens(set_description),
"Intersection Properties": self.replaceTokens(intersection_description),
"Statistical Information": self.replaceTokens(statistical_information),
"Trend Analysis": self.replaceTokens(trend_analysis)
"Set Properties": add_bullet_points(self.replaceTokens(set_description)),
"Intersection Properties": add_bullet_points(self.replaceTokens(intersection_description)),
"Statistical Information": add_bullet_points(self.replaceTokens(statistical_information)),
"Trend Analysis": add_bullet_points(self.replaceTokens(trend_analysis)),
}

markdown_content = ""
Expand Down
165 changes: 81 additions & 84 deletions src/alttxt/tokenmap.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Any, Callable, Tuple, Union, Optional
from alttxt.models import DataModel, GrammarModel, Subset
from alttxt.enums import SubsetField, IndividualSetSize, IntersectionTrend
import statistics
from alttxt.regionclass import *
import math
from collections import Counter
Expand Down Expand Up @@ -672,7 +673,7 @@ def calculate_max_min_set_presence(self, maxmin_sized_set_name) -> str:
(maxmin_set_count / total_non_empty) * 100 if total_non_empty else 0
)

return f"{maxmin_set_percentage:.1f}%"
return f"{round(maxmin_set_percentage)}%"

def calculate_max_intersection(self) -> dict[str, int]:
"""
Expand Down Expand Up @@ -737,16 +738,16 @@ def calculate_set_divergence(self):
min_set_size = self.sort_visible_sets()[-1][1]

divergence_percentage = (min_set_size / max_set_size) * 100
divergence_percentage = math.ceil(divergence_percentage)

# Determine the divergence category
if divergence_percentage < 30:
if divergence_percentage < 26.67:
return IndividualSetSize.DIVERGINGALOT.value
elif 30 <= divergence_percentage <= 90:
elif 26.68 <= divergence_percentage <= 53.34:
return IndividualSetSize.DIVERGING.value
elif divergence_percentage > 90:
elif 53.35 <= divergence_percentage <= 79.99:
return IndividualSetSize.DIVERGINGABIT.value

else:
return IndividualSetSize.IDENTICAL.value

def calculate_change_trend(self):
"""
Expand Down Expand Up @@ -822,120 +823,108 @@ def calculate_largest_factor(self):

def categorize_subsets(self):
"""
Categorize the subsets into small, medium, large and largest regions based on their coefficient of variance.
Based on the variances, we set two thresholds: one for the large region of intersections and one for the small region.
Then we assign the intersection types (individual, low-degree, medium-degree, high-order sets) based on their percentage presence (>=50%).
Returns a dictionary of intersection types being on the region(s)
Categorize the subsets into small, medium, large and largest regions based on their size.
"""
results = {}
sorted_subsets = sorted(self.data.subsets, key=lambda subset: subset.size, reverse=True)

largest_subset = sorted_subsets.pop(0)
largest_subset = sorted_subsets.pop(0) # Remove the largest

median_size = statistics.median([subset.size for subset in sorted_subsets])

region_classification = RegionClassification()

region_classification.set_largest(largest_subset)

if not sorted_subsets:
return {'largest_data_region': [largest_subset]}

sizes = [subset.size for subset in sorted_subsets]
unique_sizes = sorted(set(sizes), reverse=True)

q1, median, q3 = np.percentile(sizes, [25, 50, 75])
iqr = q3 - q1
multiplier = 1.5

large_threshold = median + multiplier * iqr

size_categories = {}

for size in unique_sizes:
if size > large_threshold:
size_categories[size] = 'large'
elif size < median:
size_categories[size] = 'small'
else:
size_categories[size] = 'medium'



close_to_zero_threshold = median_size * 1.2 # Define what 'close to zero' means

for subset in sorted_subsets:
category = size_categories[subset.size]
if category == 'large':
region_classification.add_to_large_region(subset)
elif category == 'medium':
region_classification.add_to_medium_region(subset)
else:
deviation = subset.size - median_size

if deviation < 0 and abs(deviation) > abs(median_size-close_to_zero_threshold):
region_classification.add_to_small_region(subset)
elif deviation == 0 or abs(deviation) <= abs(median_size-close_to_zero_threshold):
region_classification.add_to_medium_region(subset)
else: # deviation > 0 and not close to zero
region_classification.add_to_large_region(subset)


# the small threshold may not always perform well, especially when the smallest intersection is much smaller than the largest
if not region_classification.large_data_region and region_classification.medium_data_region:
largest_medium = max(region_classification.medium_data_region, key=lambda x: x.size)
region_classification.medium_data_region = [x for x in region_classification.medium_data_region if x.size != largest_medium.size]
region_classification.large_data_region = [x for x in sorted_subsets if x.size == largest_medium.size]

if not region_classification.small_data_region and region_classification.medium_data_region:
smallest_medium = min(region_classification.medium_data_region, key=lambda x: x.size)
region_classification.medium_data_region = [x for x in region_classification.medium_data_region if x.size != smallest_medium.size]
region_classification.small_data_region = [x for x in sorted_subsets if x.size == smallest_medium.size]

regions = {
'largest_data_region': [region_classification.largest_data_region],
'large_data_region': region_classification.large_data_region,
'medium_data_region': region_classification.medium_data_region,
'small_data_region': region_classification.small_data_region,
}

regions = {
'largest_data_region': [region_classification.largest_data_region],
'large_data_region': region_classification.large_data_region,
'medium_data_region': region_classification.medium_data_region,
'small_data_region': region_classification.small_data_region,
'small_data_region': region_classification.small_data_region,
# Assuming largest_data_region is a single subset, not a list

}


total_sizes = {region: sum(subset.size for subset in subsets) for region, subsets in regions.items()}

for region_name, subsets in regions.items():
# Initialize dictionary to store sizes for special classifications
special_sizes = {}
# Initialize Counter for other classifications
classification_sizes = Counter()

for subset in subsets:
# Handle 'the empty set' and 'all set' by directly logging their sizes
if subset.classification in ['the empty set']:
special_sizes[subset.classification] = subset.size
else:
classification_sizes[subset.classification] += subset.size

percentages = {cls: (size / total_sizes[region_name] * 100) for cls, size in classification_sizes.items()}



# Calculate percentages based on sizes for other classifications
percentages = {cls: (size / total_sizes[region_name] * 100) for cls, size in classification_sizes.items()}

# Update results with percentages and direct sizes for special cases
results[region_name] = {**percentages, **special_sizes}

classification_to_regions = {}

threshold_percentage = 50.0 # the percentage threshold for inclusion
threshold_percentage = 35.0 # Define the threshold for inclusion

for region, classifications in results.items():
above_threshold = {classification: value for classification, value in classifications.items() if value >= threshold_percentage}

if above_threshold:
for classification in above_threshold.keys():
classification_to_regions.setdefault(classification, set()).add(region)
for classification, value in classifications.items():
# Direct inclusion for single-region classifications or exceeding threshold
if classification.value not in classification_to_regions or value >= threshold_percentage:
classification_to_regions.setdefault(classification.value, {}).update({region: value})
else:
# Include only if the percentage exceeds the threshold
existing_region, existing_value = next(iter(classification_to_regions[classification.value].items()))
if value > existing_value:
classification_to_regions[classification.value].update({region: value})
# classification_to_regions[classification.value].update({region: value})


# Refine mapping based on the new rules
for classification, regions_percentages in classification_to_regions.items():
if len(regions_percentages) > 1:
# Filter regions by threshold and select the highest if none exceed the threshold
above_threshold_regions = {region: pct for region, pct in regions_percentages.items() if pct >= threshold_percentage}

# If no regions meet the threshold, choose the one with the highest percentage
if not above_threshold_regions:
highest_region = max(regions_percentages, key=regions_percentages.get)
classification_to_regions[classification] = {highest_region}
else:
# Include all regions that meet the threshold
classification_to_regions[classification] = set(above_threshold_regions.keys())
else:
if classifications:
max_value = max(classifications.values())
max_classifications = [classification for classification, value in classifications.items() if value == max_value]
for classification in max_classifications:
classification_to_regions.setdefault(classification, set()).add(region)


# If the classification is present in only one region, include it regardless of the percentage
classification_to_regions[classification] = set(regions_percentages.keys())

# Handle special cases such as 'the empty set' and 'all set'
# They should be directly mapped without considering the threshold
for special_case in ['the empty set', 'all set']:
if special_case in results:
for region in results[special_case]:
classification_to_regions[special_case] = {region}


final_output = {cls.value: {regions} if isinstance(regions, str) else set(regions) for cls, regions in classification_to_regions.items()}
final_output = {cls: {regions} if isinstance(regions, str) else set(regions) for cls, regions in classification_to_regions.items()}

return final_output

Expand All @@ -959,20 +948,22 @@ def individual_set_presence(self) -> str:
individual_set_regions = categorization.get('individual set')

if individual_set_regions:
# Convert set to list to index
regions_list = list(individual_set_regions)
if len(regions_list) == 1:
return f" The individual set intersections are seen in {regions_list[0].replace('_data_region', '')} sized intersections."
return f" The individual set intersections are {regions_list[0].replace('_data_region', '')} in size."
else:
regions_formatted = [region.replace('_data_region', '') for region in regions_list]
return f" The individual set intersections are significantly present in {' and '.join(regions_formatted)} intersections."
return f" The individual set intersections are in {' and '.join(regions_formatted)} intersections."
else:
return ""
return " No individual set intersections are present."

def medium_set_presence(self) -> str:
categorization = self.categorize_subsets()
medium_set_regions = categorization.get('medium set')

if medium_set_regions:
# Convert set to list to index
regions_list = list(medium_set_regions)
if len(regions_list) == 1:
return f" The medium degree set intersections can be seen among {regions_list[0].replace('_data_region', '')} sized intersections."
Expand All @@ -990,10 +981,16 @@ def low_set_presence(self) -> str:
# Convert set to list to index
regions_list = list(low_set_regions)
if len(regions_list) == 1:
return f" The low degree set intersections lie in {regions_list[0].replace('_data_region', '')} sized intersections."
ret_str = f" The low degree set intersections are in {regions_list[0].replace('_data_region', '')} sized intersections."

# Replace 'largest' and 'smallest' with 'the largest' and 'the smallest'
ret_str = ret_str.replace('largest', 'the largest')
ret_str = ret_str.replace('smallest', 'the smallest')

return ret_str
else:
regions_formatted = [region.replace('_data_region', '') for region in regions_list]
return f" The low degree set intersections lie in {' and '.join(regions_formatted)} sized intersections."
return f" The low degree set intersections are in {' and '.join(regions_formatted)} sized intersections."
else:
return ""

Expand All @@ -1003,6 +1000,7 @@ def high_set_presence(self) -> str:
high_set_regions = categorization.get('high order set')

if high_set_regions:
# Convert set to list to index
regions_list = list(high_set_regions)
if len(regions_list) == 1:
if regions_list[0] == 'largest_data_region':
Expand All @@ -1012,8 +1010,7 @@ def high_set_presence(self) -> str:
regions_formatted = [region.replace('_data_region', '') for region in regions_list]
return f" In {' and '.join(regions_formatted)} sized intersections, the high order set intersections are significantly present."
else:
return ""

return " No high order intersections are present."

def calculate_intersection_trend(self) -> str:
intersection_trend = self.calculate_change_trend()
Expand Down

0 comments on commit 2a1af1f

Please sign in to comment.