Skip to content

Commit

Permalink
feature: Remove About page formatting prior to loading into ES
Browse files Browse the repository at this point in the history
  • Loading branch information
Nahomtes committed Apr 22, 2024
1 parent d38c5ce commit b287407
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 5 deletions.
6 changes: 4 additions & 2 deletions config/es_loader.example.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Config:
# Neo4j URL with port number
neo4j_uri: "bolt://127.0.0.1:7687"
neo4j_uri: 'bolt://127.0.0.1:7687'
# Neo4j user name, default is neo4j
neo4j_user: neo4j
# Password for Neo4j user
Expand All @@ -9,9 +9,11 @@ Config:
es_host: localhost
# Path to about file
about_file: path_to_about_yaml_file
# Boolean flag: when True, formatting markup is removed from the about/static page content before it is loaded.
clean_about_page_format: True

model_files:
- bento-model/model-desc/bento_tailorx_model_file.yaml
- bento-model/model-desc/bento_tailorx_model_properties.yaml

prop_file: config/props-bento-ext.yml
prop_file: config/props-bento-ext.yml
30 changes: 27 additions & 3 deletions es_loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import argparse

import re
import os
import yaml
from elasticsearch import Elasticsearch, RequestsHttpConnection
Expand Down Expand Up @@ -93,7 +94,7 @@ def bulk_load(self, index_name, data):
successes += 1 if ok else 0
logger.info(f"Indexed {successes}/{total} documents")

def load_about_page(self, index_name, mapping, file_name):
def load_about_page(self, index_name, mapping, file_name, clean_about_page_format):
logger.info('Indexing content from about page')
if not os.path.isfile(file_name):
raise Exception(f'"{file_name} is not a file!')
Expand All @@ -103,8 +104,30 @@ def load_about_page(self, index_name, mapping, file_name):
about_file = yaml.safe_load(file_obj)
for page in about_file:
logger.info(f'Indexing about page "{page["page"]}"')
cleaned_content = page['content']
if clean_about_page_format:
cleaned_content = self.remove_formatting_content(page["page"], cleaned_content)
page['content'] = cleaned_content
self.index_data(index_name, page, f'page{page["page"]}')


def remove_formatting_content(self, page_name, content):
    """Strip inline ``$$...$$`` formatting markup from a page's paragraphs.

    Every dict item whose ``'paragraph'`` value is a string is replaced by a
    cleaned copy: ``$$[text](target)$$`` links, ``$$#text#$$`` and
    ``$$*text*$$`` wrappers are reduced to ``text``, then runs of whitespace
    are collapsed to single spaces. Other keys of such an item are kept.
    All remaining items are passed through unchanged. The input is not
    mutated.

    :param page_name: name of the page, used only in log messages
    :param content: the page's ``content`` value, expected to be a list
    :return: a new list with cleaned paragraphs, or ``content`` itself when
        it is not a list or tuple
    """
    if not isinstance(content, (list, tuple)):
        # Iterating a plain string would explode it into single characters
        # and None would raise, so non-list content is returned untouched.
        logger.warning(f'Content of "{page_name}" is not a list, formatting was not removed')
        return content

    # Patterns are applied in this order; each keeps only the wrapped text.
    markup_patterns = (
        r'\$\$\[(.*?)\]\(.*?\)\$\$',  # inline links
        r'\$\$#(.*?)#\$\$',           # hash tags
        r'\$\$\*(.*?)\*\$\$',         # asterisk symbols
    )

    cleaned_content = []
    for item in content:
        paragraph = item.get('paragraph') if isinstance(item, dict) else None
        if not isinstance(paragraph, str):
            # Not a text paragraph (other item type, or empty/non-string
            # paragraph): nothing to clean, keep the item as it is.
            cleaned_content.append(item)
            continue
        cleaned_text = paragraph
        for pattern in markup_patterns:
            cleaned_text = re.sub(pattern, r'\1', cleaned_text)
        # Remove extra spaces
        cleaned_text = ' '.join(cleaned_text.split())
        # Copy the item so sibling keys survive and the input stays unmodified.
        cleaned_content.append({**item, 'paragraph': cleaned_text})
    logger.info(f'Cleaned content for "{page_name}"')
    return cleaned_content

def read_model(self, model_files, prop_file):
for file_name in model_files:
if not os.path.isfile(file_name):
Expand Down Expand Up @@ -207,7 +230,8 @@ def main():
loader.load(index['index_name'], index['mapping'], index['cypher_query'])
elif index['type'] == 'about_file':
if 'about_file' in config:
loader.load_about_page(index['index_name'], index['mapping'], config['about_file'])
clean_about_page_format = config.get('clean_about_page_format', False)
loader.load_about_page(index['index_name'], index['mapping'], config['about_file'], clean_about_page_format)
else:
logger.warning(f'"about_file" not set in configuration file, {index["index_name"]} will not be loaded!')
elif index['type'] == 'model':
Expand Down

0 comments on commit b287407

Please sign in to comment.