forked from CBIIT/icdc-dataloader
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathabout_page_content_cleaner.py
62 lines (58 loc) · 3.21 KB
/
about_page_content_cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import re
from bento.common.utils import get_logger
# Module-level logger; AboutPageContentCleaner logs one info line per cleaned page through it.
logger = get_logger('AboutPageContentCleaner')
class AboutPageContentCleaner:
    """Strip ``$$...$$`` inline formatting markers from about-page content.

    Both entry points are static methods; the class holds no instance state.
    """

    # Marker patterns, applied in this order. Each one keeps only the text
    # captured by group 1 and drops the surrounding markup. '.' does not match
    # a newline here, so a marker that spans several lines is left as-is.
    _MARKER_PATTERNS = (
        re.compile(r'\$\$\[(.*?)\]\(.*?\)\$\$'),  # $$[label](target)$$ -> label
        re.compile(r'\$\$#(.*?)#\$\$'),           # $$#text#$$ -> text
        re.compile(r'\$\$\*(.*?)\*\$\$'),         # $$*text*$$ -> text
    )

    @staticmethod
    def clean_text(text):
        """Return ``text`` with formatting markers removed and whitespace normalised.

        Inline links, hash-tag markers and asterisk markers (all wrapped in
        ``$$``) are replaced by the text they enclose. Afterwards every run of
        whitespace is collapsed to a single space and leading/trailing
        whitespace is dropped.

        :param text: the string to clean.
        :return: the cleaned string.
        :raises TypeError: if ``text`` is not a string.
        """
        cleaned_text = text
        for pattern in AboutPageContentCleaner._MARKER_PATTERNS:
            cleaned_text = pattern.sub(r'\1', cleaned_text)
        # split() with no argument splits on any whitespace run and ignores
        # leading/trailing whitespace, so the join yields single spaces only.
        return ' '.join(cleaned_text.split())

    @staticmethod
    def _clean_list(entries):
        """Clean the entries of a 'listWithDots' / 'listWithNumbers' list.

        An entry is either plain text or a dict holding a nested
        'listWithAlphabets' sub-list; a nested sub-list is rebuilt as a new
        dict containing only that key.
        """
        clean = AboutPageContentCleaner.clean_text
        cleaned_entries = []
        for entry in entries:
            # The isinstance check matters: on a plain string, `in` would be a
            # substring test, so text that merely mentions 'listWithAlphabets'
            # would be mistaken for a nested list and fail on the key lookup.
            if isinstance(entry, dict) and 'listWithAlphabets' in entry:
                cleaned_entries.append(
                    {'listWithAlphabets': [clean(sub_entry) for sub_entry in entry['listWithAlphabets']]}
                )
            else:
                cleaned_entries.append(clean(entry))
        return cleaned_entries

    @staticmethod
    def remove_formatting_content(page_name, content):
        """Return a cleaned copy of a page's content list.

        Each dict item is handled by the first key it matches, checked in this
        order:

        * ``paragraph`` - the text is cleaned; any other keys are kept.
        * ``listWithDots`` / ``listWithNumbers`` - each entry is cleaned,
          including entries that are nested ``listWithAlphabets`` dicts.
        * ``listWithAlphabets`` - each entry is cleaned as plain text.
        * ``table`` - every cell of ``head`` and of each ``body`` row is cleaned.

        For the list and table kinds the result item is a new dict holding only
        that one key. Items that are not dicts, or that match none of the keys,
        are passed through unchanged. The items of ``content`` are never
        modified in place.

        :param page_name: page identifier, used only in the log message.
        :param content: iterable of content items.
        :return: a new list of cleaned content items.
        """
        clean = AboutPageContentCleaner.clean_text
        clean_list = AboutPageContentCleaner._clean_list

        cleaned_content = []
        for item in content:
            if not isinstance(item, dict):
                cleaned_content.append(item)
            elif 'paragraph' in item:
                # Build a copy instead of assigning into the caller's dict, so
                # the input is left untouched like in every other branch.
                cleaned_content.append({**item, 'paragraph': clean(item['paragraph'])})
            # Unordered list
            elif 'listWithDots' in item:
                cleaned_content.append({'listWithDots': clean_list(item['listWithDots'])})
            # Top-level alphabetic list: entries are treated as plain text
            elif 'listWithAlphabets' in item:
                cleaned_content.append(
                    {'listWithAlphabets': [clean(entry) for entry in item['listWithAlphabets']]}
                )
            # Ordered list with numbers
            elif 'listWithNumbers' in item:
                cleaned_content.append({'listWithNumbers': clean_list(item['listWithNumbers'])})
            # Table: clean the header cells and every body cell
            elif 'table' in item:
                table = item['table']
                cleaned_content.append({
                    'table': {
                        'head': [clean(cell) for cell in table['head']],
                        'body': [[clean(cell) for cell in row] for row in table['body']],
                    }
                })
            else:
                cleaned_content.append(item)
        logger.info(f'Cleaned content for "{page_name}"')
        return cleaned_content