Skip to content

Commit

Permalink
Add bullet points to markdown output (long description)
Browse files Browse the repository at this point in the history
  • Loading branch information
JakeWags committed Dec 2, 2024
1 parent 060d043 commit 2a1af1f
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 88 deletions.
19 changes: 15 additions & 4 deletions src/alttxt/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,25 @@ def text(self) -> str:


if self.structured:
# Helper function to replace periods with newlines and bullet points
def add_bullet_points(text: str) -> str:
    """Format a block of prose as a markdown bullet list, one sentence per bullet.

    Sentences are delimited by a period followed by a space ('. ') rather
    than every '.', so decimal numbers and percentages such as "33.3%" are
    not split into spurious bullets.

    Args:
        text: The prose to reformat; surrounding whitespace is ignored.

    Returns:
        The text with "* " prefixed to each sentence and sentences separated
        by newlines. An empty (or whitespace-only) input yields "".
    """
    text = text.strip()
    if not text:
        # Nothing to bullet; avoid emitting a dangling "* " marker.
        return text

    # Split on '. ' so periods inside numbers (e.g. "33.3%") stay intact;
    # the join re-inserts the period stripped from each sentence.
    sentences = text.split('. ')
    return '* ' + '.\n* '.join(sentences)

# Construct the dictionary for markdown content
data_to_write_as_md = {
"UpSet Introduction": self.replaceTokens(introduction),
"Dataset Properties": self.replaceTokens(dataset_properties),
"Set Properties": self.replaceTokens(set_description),
"Intersection Properties": self.replaceTokens(intersection_description),
"Statistical Information": self.replaceTokens(statistical_information),
"Trend Analysis": self.replaceTokens(trend_analysis)
"Set Properties": add_bullet_points(self.replaceTokens(set_description)),
"Intersection Properties": add_bullet_points(self.replaceTokens(intersection_description)),
"Statistical Information": add_bullet_points(self.replaceTokens(statistical_information)),
"Trend Analysis": add_bullet_points(self.replaceTokens(trend_analysis)),
}

markdown_content = ""
Expand Down
165 changes: 81 additions & 84 deletions src/alttxt/tokenmap.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Any, Callable, Tuple, Union, Optional
from alttxt.models import DataModel, GrammarModel, Subset
from alttxt.enums import SubsetField, IndividualSetSize, IntersectionTrend
import statistics
from alttxt.regionclass import *
import math
from collections import Counter
Expand Down Expand Up @@ -672,7 +673,7 @@ def calculate_max_min_set_presence(self, maxmin_sized_set_name) -> str:
(maxmin_set_count / total_non_empty) * 100 if total_non_empty else 0
)

return f"{maxmin_set_percentage:.1f}%"
return f"{round(maxmin_set_percentage)}%"

def calculate_max_intersection(self) -> dict[str, int]:
"""
Expand Down Expand Up @@ -737,16 +738,16 @@ def calculate_set_divergence(self):
min_set_size = self.sort_visible_sets()[-1][1]

divergence_percentage = (min_set_size / max_set_size) * 100
divergence_percentage = math.ceil(divergence_percentage)

# Determine the divergence category
if divergence_percentage < 30:
if divergence_percentage < 26.67:
return IndividualSetSize.DIVERGINGALOT.value
elif 30 <= divergence_percentage <= 90:
elif 26.68 <= divergence_percentage <= 53.34:
return IndividualSetSize.DIVERGING.value
elif divergence_percentage > 90:
elif 53.35 <= divergence_percentage <= 79.99:
return IndividualSetSize.DIVERGINGABIT.value

else:
return IndividualSetSize.IDENTICAL.value

def calculate_change_trend(self):
"""
Expand Down Expand Up @@ -822,120 +823,108 @@ def calculate_largest_factor(self):

def categorize_subsets(self):
"""
Categorize the subsets into small, medium, large and largest regions based on their coefficient of variance.
Based on the variances, we set two thresholds: one for the large region of intersections and one for the small region.
Then we assign the intersection types (individual, low-degree, medium-degree, high-order sets) based on their percentage presence (>=50%).
Returns a dictionary of intersection types being on the region(s)
Categorize the subsets into small, medium, large and largest regions based on their size.
"""
results = {}
sorted_subsets = sorted(self.data.subsets, key=lambda subset: subset.size, reverse=True)

largest_subset = sorted_subsets.pop(0)
largest_subset = sorted_subsets.pop(0) # Remove the largest

median_size = statistics.median([subset.size for subset in sorted_subsets])

region_classification = RegionClassification()

region_classification.set_largest(largest_subset)

if not sorted_subsets:
return {'largest_data_region': [largest_subset]}

sizes = [subset.size for subset in sorted_subsets]
unique_sizes = sorted(set(sizes), reverse=True)

q1, median, q3 = np.percentile(sizes, [25, 50, 75])
iqr = q3 - q1
multiplier = 1.5

large_threshold = median + multiplier * iqr

size_categories = {}

for size in unique_sizes:
if size > large_threshold:
size_categories[size] = 'large'
elif size < median:
size_categories[size] = 'small'
else:
size_categories[size] = 'medium'



close_to_zero_threshold = median_size * 1.2 # Define what 'close to zero' means

for subset in sorted_subsets:
category = size_categories[subset.size]
if category == 'large':
region_classification.add_to_large_region(subset)
elif category == 'medium':
region_classification.add_to_medium_region(subset)
else:
deviation = subset.size - median_size

if deviation < 0 and abs(deviation) > abs(median_size-close_to_zero_threshold):
region_classification.add_to_small_region(subset)
elif deviation == 0 or abs(deviation) <= abs(median_size-close_to_zero_threshold):
region_classification.add_to_medium_region(subset)
else: # deviation > 0 and not close to zero
region_classification.add_to_large_region(subset)


# the small threshold may not always perform well, especially when the smallest intersection is much smaller than the largest
if not region_classification.large_data_region and region_classification.medium_data_region:
largest_medium = max(region_classification.medium_data_region, key=lambda x: x.size)
region_classification.medium_data_region = [x for x in region_classification.medium_data_region if x.size != largest_medium.size]
region_classification.large_data_region = [x for x in sorted_subsets if x.size == largest_medium.size]

if not region_classification.small_data_region and region_classification.medium_data_region:
smallest_medium = min(region_classification.medium_data_region, key=lambda x: x.size)
region_classification.medium_data_region = [x for x in region_classification.medium_data_region if x.size != smallest_medium.size]
region_classification.small_data_region = [x for x in sorted_subsets if x.size == smallest_medium.size]

regions = {
'largest_data_region': [region_classification.largest_data_region],
'large_data_region': region_classification.large_data_region,
'medium_data_region': region_classification.medium_data_region,
'small_data_region': region_classification.small_data_region,
}

regions = {
'largest_data_region': [region_classification.largest_data_region],
'large_data_region': region_classification.large_data_region,
'medium_data_region': region_classification.medium_data_region,
'small_data_region': region_classification.small_data_region,
'small_data_region': region_classification.small_data_region,
# Assuming largest_data_region is a single subset, not a list

}


total_sizes = {region: sum(subset.size for subset in subsets) for region, subsets in regions.items()}

for region_name, subsets in regions.items():
# Initialize dictionary to store sizes for special classifications
special_sizes = {}
# Initialize Counter for other classifications
classification_sizes = Counter()

for subset in subsets:
# Handle 'the empty set' and 'all set' by directly logging their sizes
if subset.classification in ['the empty set']:
special_sizes[subset.classification] = subset.size
else:
classification_sizes[subset.classification] += subset.size

percentages = {cls: (size / total_sizes[region_name] * 100) for cls, size in classification_sizes.items()}



# Calculate percentages based on sizes for other classifications
percentages = {cls: (size / total_sizes[region_name] * 100) for cls, size in classification_sizes.items()}

# Update results with percentages and direct sizes for special cases
results[region_name] = {**percentages, **special_sizes}

classification_to_regions = {}

threshold_percentage = 50.0 # the percentage threshold for inclusion
threshold_percentage = 35.0 # Define the threshold for inclusion

for region, classifications in results.items():
above_threshold = {classification: value for classification, value in classifications.items() if value >= threshold_percentage}

if above_threshold:
for classification in above_threshold.keys():
classification_to_regions.setdefault(classification, set()).add(region)
for classification, value in classifications.items():
# Direct inclusion for single-region classifications or exceeding threshold
if classification.value not in classification_to_regions or value >= threshold_percentage:
classification_to_regions.setdefault(classification.value, {}).update({region: value})
else:
# Include only if the percentage exceeds the threshold
existing_region, existing_value = next(iter(classification_to_regions[classification.value].items()))
if value > existing_value:
classification_to_regions[classification.value].update({region: value})
# classification_to_regions[classification.value].update({region: value})


# Refine mapping based on the new rules
for classification, regions_percentages in classification_to_regions.items():
if len(regions_percentages) > 1:
# Filter regions by threshold and select the highest if none exceed the threshold
above_threshold_regions = {region: pct for region, pct in regions_percentages.items() if pct >= threshold_percentage}

# If no regions meet the threshold, choose the one with the highest percentage
if not above_threshold_regions:
highest_region = max(regions_percentages, key=regions_percentages.get)
classification_to_regions[classification] = {highest_region}
else:
# Include all regions that meet the threshold
classification_to_regions[classification] = set(above_threshold_regions.keys())
else:
if classifications:
max_value = max(classifications.values())
max_classifications = [classification for classification, value in classifications.items() if value == max_value]
for classification in max_classifications:
classification_to_regions.setdefault(classification, set()).add(region)


# If the classification is present in only one region, include it regardless of the percentage
classification_to_regions[classification] = set(regions_percentages.keys())

# Handle special cases such as 'the empty set' and 'all set'
# They should be directly mapped without considering the threshold
for special_case in ['the empty set', 'all set']:
if special_case in results:
for region in results[special_case]:
classification_to_regions[special_case] = {region}


final_output = {cls.value: {regions} if isinstance(regions, str) else set(regions) for cls, regions in classification_to_regions.items()}
final_output = {cls: {regions} if isinstance(regions, str) else set(regions) for cls, regions in classification_to_regions.items()}

return final_output

Expand All @@ -959,20 +948,22 @@ def individual_set_presence(self) -> str:
individual_set_regions = categorization.get('individual set')

if individual_set_regions:
# Convert set to list to index
regions_list = list(individual_set_regions)
if len(regions_list) == 1:
return f" The individual set intersections are seen in {regions_list[0].replace('_data_region', '')} sized intersections."
return f" The individual set intersections are {regions_list[0].replace('_data_region', '')} in size."
else:
regions_formatted = [region.replace('_data_region', '') for region in regions_list]
return f" The individual set intersections are significantly present in {' and '.join(regions_formatted)} intersections."
return f" The individual set intersections are in {' and '.join(regions_formatted)} intersections."
else:
return ""
return " No individual set intersections are present."

def medium_set_presence(self) -> str:
categorization = self.categorize_subsets()
medium_set_regions = categorization.get('medium set')

if medium_set_regions:
# Convert set to list to index
regions_list = list(medium_set_regions)
if len(regions_list) == 1:
return f" The medium degree set intersections can be seen among {regions_list[0].replace('_data_region', '')} sized intersections."
Expand All @@ -990,10 +981,16 @@ def low_set_presence(self) -> str:
# Convert set to list to index
regions_list = list(low_set_regions)
if len(regions_list) == 1:
return f" The low degree set intersections lie in {regions_list[0].replace('_data_region', '')} sized intersections."
ret_str = f" The low degree set intersections are in {regions_list[0].replace('_data_region', '')} sized intersections."

# Replace 'largest' and 'smallest' with 'the largest' and 'the smallest'
ret_str = ret_str.replace('largest', 'the largest')
ret_str = ret_str.replace('smallest', 'the smallest')

return ret_str
else:
regions_formatted = [region.replace('_data_region', '') for region in regions_list]
return f" The low degree set intersections lie in {' and '.join(regions_formatted)} sized intersections."
return f" The low degree set intersections are in {' and '.join(regions_formatted)} sized intersections."
else:
return ""

Expand All @@ -1003,6 +1000,7 @@ def high_set_presence(self) -> str:
high_set_regions = categorization.get('high order set')

if high_set_regions:
# Convert set to list to index
regions_list = list(high_set_regions)
if len(regions_list) == 1:
if regions_list[0] == 'largest_data_region':
Expand All @@ -1012,8 +1010,7 @@ def high_set_presence(self) -> str:
regions_formatted = [region.replace('_data_region', '') for region in regions_list]
return f" In {' and '.join(regions_formatted)} sized intersections, the high order set intersections are significantly present."
else:
return ""

return " No high order intersections are present."

def calculate_intersection_trend(self) -> str:
intersection_trend = self.calculate_change_trend()
Expand Down

0 comments on commit 2a1af1f

Please sign in to comment.