class AboutPageContentCleaner:
    """Strip ``$$...$$`` inline markup from about-page content.

    Three markup forms are recognised and replaced by their inner text:

    * ``$$[label](target)$$`` -> ``label``
    * ``$$#text#$$``          -> ``text``
    * ``$$*text*$$``          -> ``text``

    All methods are static; the class only serves as a namespace.
    """

    # Compiled once at class creation instead of on every clean_text() call.
    # The order is significant: links are unwrapped first, then hash tags,
    # then asterisks, matching the original substitution sequence.
    _MARKUP_PATTERNS = (
        re.compile(r'\$\$\[(.*?)\]\(.*?\)\$\$'),  # inline links
        re.compile(r'\$\$#(.*?)#\$\$'),           # hash tags
        re.compile(r'\$\$\*(.*?)\*\$\$'),         # asterisk symbol
    )

    @staticmethod
    def clean_text(text):
        """Return *text* with inline markup removed and whitespace collapsed.

        :param text: the string to clean. Values that are not ``str`` (for
            example a numeric table cell or ``None``) are returned unchanged
            rather than raising ``TypeError`` inside ``re``.
        :return: the cleaned string, with every run of whitespace reduced to
            a single space and leading/trailing whitespace removed.
        """
        if not isinstance(text, str):
            return text
        for pattern in AboutPageContentCleaner._MARKUP_PATTERNS:
            text = pattern.sub(r'\1', text)
        # split() with no argument drops leading/trailing whitespace and
        # treats any whitespace run as one separator.
        return ' '.join(text.split())

    @staticmethod
    def _clean_list(entries):
        """Clean the entries of a list that may hold ``listWithAlphabets`` sub-lists.

        An entry that is a dict with a ``listWithAlphabets`` key is rebuilt as
        ``{'listWithAlphabets': [...]}`` with each inner entry cleaned; every
        other entry goes through :meth:`clean_text`.
        """
        clean = AboutPageContentCleaner.clean_text
        cleaned = []
        for entry in entries:
            # The isinstance guard matters: on a plain string, ``in`` would be
            # a substring test and the subsequent key lookup would raise.
            if isinstance(entry, dict) and 'listWithAlphabets' in entry:
                inner = [clean(inner_entry) for inner_entry in entry['listWithAlphabets']]
                cleaned.append({'listWithAlphabets': inner})
            else:
                cleaned.append(clean(entry))
        return cleaned

    @staticmethod
    def remove_formatting_content(page_name, content):
        """Return a cleaned copy of an about page's content list.

        Dict items are dispatched on the first matching key, in this order:
        ``paragraph``, ``listWithDots``, ``listWithAlphabets``,
        ``listWithNumbers``, ``table``. Items that are not dicts, or dicts
        with none of those keys, are appended as-is.

        The input items are not modified: a ``paragraph`` item is copied
        (keeping its other keys) before the cleaned text is stored.

        :param page_name: page identifier, used only in the log message.
        :param content: iterable of content items.
        :return: a new list holding the cleaned items.
        :raises KeyError: if a ``table`` item lacks ``head`` or ``body``.
        """
        clean = AboutPageContentCleaner.clean_text
        clean_list = AboutPageContentCleaner._clean_list

        cleaned_content = []
        for item in content:
            if not isinstance(item, dict):
                cleaned_content.append(item)
            elif 'paragraph' in item:
                # Build a new dict so the caller's item is left untouched.
                cleaned_content.append({**item, 'paragraph': clean(item['paragraph'])})
            elif 'listWithDots' in item:
                # Unordered list, possibly with alphabetic sub-lists.
                cleaned_content.append({'listWithDots': clean_list(item['listWithDots'])})
            elif 'listWithAlphabets' in item:
                # A top-level alphabetic list is flat: every entry is text.
                cleaned_content.append(
                    {'listWithAlphabets': [clean(entry) for entry in item['listWithAlphabets']]}
                )
            elif 'listWithNumbers' in item:
                # Ordered list, possibly with alphabetic sub-lists.
                cleaned_content.append({'listWithNumbers': clean_list(item['listWithNumbers'])})
            elif 'table' in item:
                table = item['table']
                cleaned_content.append({
                    'table': {
                        'head': [clean(cell) for cell in table['head']],
                        'body': [[clean(cell) for cell in row] for row in table['body']],
                    }
                })
            else:
                cleaned_content.append(item)
        logger.info(f'Cleaned content for "{page_name}"')
        return cleaned_content
cleaned_content.append(item) - logger.info(f'Cleaned content for "{page_name}"') - return cleaned_content + # Call remove_formatting_content from AboutPageContentCleaner + return AboutPageContentCleaner.remove_formatting_content(page_name, content) def read_model(self, model_files, prop_file): for file_name in model_files: