Skip to content

Commit

Permalink
Refactor code for removing formatting content under about page
Browse files Browse the repository at this point in the history
  • Loading branch information
Nahomtes committed Apr 24, 2024
1 parent b287407 commit ccc9aeb
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 16 deletions.
62 changes: 62 additions & 0 deletions about_page_content_cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import re
from bento.common.utils import get_logger

logger = get_logger('AboutPageContentCleaner')


class AboutPageContentCleaner:
    """Strip ``$$...$$`` inline markup from About-page content items.

    Three kinds of markup are unwrapped, keeping only the inner text:
    ``$$[label](target)$$``, ``$$#text#$$`` and ``$$*text*$$``. Runs of
    whitespace are then collapsed to single spaces.

    All methods are static; the class is used purely as a namespace.
    """

    # Compiled once at class creation instead of being re-specified per call.
    _INLINE_LINK = re.compile(r'\$\$\[(.*?)\]\(.*?\)\$\$')
    _HASH_TAG = re.compile(r'\$\$#(.*?)#\$\$')
    _ASTERISK = re.compile(r'\$\$\*(.*?)\*\$\$')

    # Order matters: when an item carries several of these keys, the first
    # one listed here wins.
    _LIST_KEYS = ('listWithDots', 'listWithAlphabets', 'listWithNumbers')
    _NESTED_LIST_KEY = 'listWithAlphabets'

    @staticmethod
    def clean_text(text):
        """Return *text* with ``$$...$$`` markup removed and spaces collapsed.

        Args:
            text: The string to clean. Non-string values (for example a
                numeric table cell or ``None``) are returned unchanged
                instead of raising ``TypeError`` inside ``re``.

        Returns:
            The cleaned string, or *text* itself when it is not a ``str``.
        """
        if not isinstance(text, str):
            return text
        cleaner = AboutPageContentCleaner
        # Unwrap inline links: $$[label](target)$$ -> label
        cleaned_text = cleaner._INLINE_LINK.sub(r'\1', text)
        # Unwrap hash markers: $$#text#$$ -> text
        cleaned_text = cleaner._HASH_TAG.sub(r'\1', cleaned_text)
        # Unwrap asterisk markers: $$*text*$$ -> text
        cleaned_text = cleaner._ASTERISK.sub(r'\1', cleaned_text)
        # Collapse whitespace runs and trim both ends
        return ' '.join(cleaned_text.split())

    @staticmethod
    def _clean_list(entries):
        """Clean every entry of a list, recursing into nested alphabet lists."""
        cleaner = AboutPageContentCleaner
        nested_key = cleaner._NESTED_LIST_KEY
        cleaned_list = []
        for entry in entries:
            # The isinstance check is required: on a plain string, `in` is a
            # substring test, so a bullet whose text merely mentions
            # "listWithAlphabets" must not be indexed like a dict.
            if isinstance(entry, dict) and nested_key in entry:
                cleaned_list.append(
                    {nested_key: cleaner._clean_list(entry[nested_key])}
                )
            else:
                cleaned_list.append(cleaner.clean_text(entry))
        return cleaned_list

    @staticmethod
    def _clean_item(item):
        """Return a cleaned copy of one content item; never modifies *item*."""
        cleaner = AboutPageContentCleaner
        if not isinstance(item, dict):
            return item
        if 'paragraph' in item:
            # Build a new dict rather than assigning into `item`, so the
            # caller's data is left untouched; any other keys are carried over.
            return {**item, 'paragraph': cleaner.clean_text(item['paragraph'])}
        # Unordered, alphabetic and numbered lists share one code path.
        for key in cleaner._LIST_KEYS:
            if key in item:
                return {key: cleaner._clean_list(item[key])}
        if 'table' in item:
            table = item['table']
            return {
                'table': {
                    'head': [cleaner.clean_text(cell) for cell in table['head']],
                    'body': [
                        [cleaner.clean_text(cell) for cell in row]
                        for row in table['body']
                    ],
                }
            }
        # Unrecognised dict shapes pass through unchanged.
        return item

    @staticmethod
    def remove_formatting_content(page_name, content):
        """Return a new list with markup stripped from every content item.

        Args:
            page_name: Page identifier, used only in the log message.
            content: Iterable of content items. Dict items keyed by
                ``paragraph``, ``listWithDots``, ``listWithAlphabets``,
                ``listWithNumbers`` or ``table`` are cleaned; anything else
                is passed through as-is.

        Returns:
            A list of cleaned items in the same order as *content*. The
            input items are not modified.

        Raises:
            KeyError: If a ``table`` item lacks ``head`` or ``body``.
        """
        cleaned_content = [
            AboutPageContentCleaner._clean_item(item) for item in content
        ]
        logger.info(f'Cleaned content for "{page_name}"')
        return cleaned_content
20 changes: 4 additions & 16 deletions es_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from bento.common.utils import get_logger, print_config
from icdc_schema import ICDC_Schema, PROPERTIES, ENUM, PROP_ENUM, PROP_TYPE, REQUIRED, DESCRIPTION
from props import Props
from about_page_content_cleaner import AboutPageContentCleaner


logger = get_logger('ESLoader')

Expand Down Expand Up @@ -111,22 +113,8 @@ def load_about_page(self, index_name, mapping, file_name, clean_about_page_forma
self.index_data(index_name, page, f'page{page["page"]}')

def remove_formatting_content(self, page_name, content):
# NOTE(review): this method is shown as a rendered diff, so two bodies appear
# back to back and the original indentation is not visible. The summary for
# es_loader.py reports 4 additions and 16 deletions; presumably the inline
# loop below (through `return cleaned_content`) is the deleted code and the
# final delegating `return` is the body that remains. Confirm against the
# repository before relying on either reading.
#
# Inline body: for each dict item that has a 'paragraph' key, strip the
# $$...$$ markup and append a new {'paragraph': ...} dict; append every other
# item unchanged. Logs once with the page name and returns the new list.
cleaned_content = []
for item in content:
if isinstance(item, dict) and 'paragraph' in item:
# Remove inline links: $$[label](target)$$ -> label
cleaned_text = re.sub(r'\$\$\[(.*?)\]\(.*?\)\$\$', r'\1', item['paragraph'])
# Remove hash markers: $$#text#$$ -> text
cleaned_text = re.sub(r'\$\$#(.*?)#\$\$', r'\1', cleaned_text)
# Remove asterisk markers: $$*text*$$ -> text
cleaned_text = re.sub(r'\$\$\*(.*?)\*\$\$', r'\1', cleaned_text)
# Collapse runs of whitespace into single spaces and trim both ends
cleaned_text = ' '.join(cleaned_text.split())
cleaned_content.append({'paragraph': cleaned_text})
else:
cleaned_content.append(item)
logger.info(f'Cleaned content for "{page_name}"')
return cleaned_content
# Delegating body: pass page_name and content straight through to
# AboutPageContentCleaner.remove_formatting_content and return its result.
return AboutPageContentCleaner.remove_formatting_content(page_name, content)

def read_model(self, model_files, prop_file):
for file_name in model_files:
Expand Down

0 comments on commit ccc9aeb

Please sign in to comment.