Skip to content

Commit

Permalink
feature: add titles to Document nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
ClemDoum committed Jan 3, 2024
1 parent 6030312 commit 2d8a2cc
Show file tree
Hide file tree
Showing 7 changed files with 375 additions and 22 deletions.
5 changes: 4 additions & 1 deletion neo4j-app/neo4j_app/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,24 @@
# Field names of a document; the same names are used for the row fields and
# for the properties set on the document node by the import query.
DOC_ID = "id"
# Header of the id column in the CSV export, parametrized by the node label.
DOC_ID_CSV = f"ID({DOC_NODE})"
DOC_EXTRACTION_DATE = "extractionDate"
DOC_EXTRACTION_LEVEL = "extractionLevel"
DOC_METADATA = "metadata"
DOC_MODIFIED_AT = "modifiedAt"
DOC_PATH = "path"
DOC_URL_SUFFIX = "urlSuffix"
DOC_ROOT_ID = "rootDocument"
# NOTE(review): presumably the type of the relationship linking a document to
# its root document - confirm against the import query.
DOC_ROOT_TYPE = "HAS_PARENT"
DOC_TITLE = "title"
# Columns exported for each document, mapped to their per-column options.
# NOTE(review): NEO4J_CSV_COL looks like the Neo4j CSV header/type override of
# the column ("LONG", "DATETIME", ...), an empty dict meaning "no override" -
# TODO confirm against the CSV writer.
DOC_COLUMNS = {
DOC_ID: {NEO4J_CSV_COL: DOC_ID_CSV},
DOC_DIRNAME: {},
DOC_CONTENT_TYPE: {},
DOC_CONTENT_LENGTH: {NEO4J_CSV_COL: "LONG"},
DOC_EXTRACTION_DATE: {NEO4J_CSV_COL: "DATETIME"},
DOC_EXTRACTION_LEVEL: {NEO4J_CSV_COL: "LONG"},
DOC_METADATA: {},
DOC_PATH: {},
DOC_TITLE: {},
DOC_URL_SUFFIX: {},
}

Expand Down Expand Up @@ -68,7 +72,6 @@
EMAIL_REL_HEADER_FIELDS: {NEO4J_CSV_COL: "STRING[]"},
}


# TODO: complete this list, it is known not to be exhaustive
SENT_EMAIL_HEADERS = {"tika_metadata_message_from", "tika_metadata_dc_creator"}
# TODO: complete this list, it is known not to be exhaustive
Expand Down
89 changes: 88 additions & 1 deletion neo4j-app/neo4j_app/core/elasticsearch/to_neo4j.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
import hashlib
from typing import Any, Dict, List, Optional, TextIO
from urllib.parse import unquote_plus

from neo4j_app.constants import (
DOC_COLUMNS,
DOC_CONTENT_TYPE,
DOC_CREATED_AT,
DOC_CREATED_AT_META,
DOC_EXTRACTION_LEVEL,
DOC_ID,
DOC_METADATA,
DOC_MODIFIED_AT,
DOC_MODIFIED_AT_META,
DOC_NODE,
DOC_PATH,
DOC_ROOT_ID,
DOC_TITLE,
DOC_URL_SUFFIX,
EMAIL_HEADER,
EMAIL_RECEIVED_TYPE,
Expand Down Expand Up @@ -39,7 +44,7 @@
RECEIVED_EMAIL_HEADERS,
SENT_EMAIL_HEADERS,
)
from neo4j_app.core.elasticsearch.utils import INDEX_, JOIN, PARENT, SOURCE
from neo4j_app.core.elasticsearch.utils import ID_, INDEX_, JOIN, PARENT, SOURCE
from neo4j_app.core.neo4j import write_neo4j_csv

_DS_DOC_URL = "ds/"
Expand All @@ -57,9 +62,91 @@ def es_to_neo4j_doc_row(document_hit: Dict) -> List[Dict[str, Any]]:
f"{_DS_DOC_URL}{document_hit[INDEX_]}/{doc_id}/{doc.get(DOC_ROOT_ID, doc_id)}"
)
doc[DOC_URL_SUFFIX] = doc_url
doc_title = _parse_doc_title(document_hit)
doc[DOC_TITLE] = doc_title
return [doc]


def _is_email(doc_hit: Dict) -> bool:
    """Tell whether the content type of the hit is an email content type.

    A hit is an email when its content type starts with ``message/`` or is
    exactly ``application/vnd.ms-outlook``. A missing content type is read as
    an empty string, hence never matches.
    """
    mime_type = doc_hit[SOURCE].get(DOC_CONTENT_TYPE, "")
    if mime_type.startswith("message/"):
        return True
    return mime_type == "application/vnd.ms-outlook"


def _email_titles(doc_hit: Dict) -> List[str]:
    """Collect the title candidates found in the metadata of an email hit.

    Returns a two-item list: the stripped ``tika_metadata_dc_title`` first,
    then the stripped subject. The subject is read from
    ``tika_metadata_subject`` and, only when that key is absent, from
    ``tika_metadata_dc_subject``. Absent values are returned as empty strings.
    """
    meta = doc_hit[SOURCE].get(DOC_METADATA, {})
    dc_title = meta.get("tika_metadata_dc_title", "")
    fallback_subject = meta.get("tika_metadata_dc_subject", "")
    subject = meta.get("tika_metadata_subject", fallback_subject)
    return [dc_title.strip(), subject.strip()]


def _is_tweet(doc_hit: Dict) -> bool:
    """Tell whether the hit's content type is ``application/json; twint``."""
    source = doc_hit[SOURCE]
    return source.get(DOC_CONTENT_TYPE, "") == "application/json; twint"


def _tweet_title(doc_hit: Dict) -> str:
    """Return the stripped ``tika_metadata_dc_title`` of the hit.

    An empty string is returned when the metadata or the key is missing.
    """
    meta = doc_hit[SOURCE].get(DOC_METADATA, {})
    raw_title = meta.get("tika_metadata_dc_title", "")
    return raw_title.strip()


def _short_doc_id(doc_hit: Dict) -> str:
    """Return the first 10 characters of the hit id."""
    full_id = doc_hit[ID_]
    return full_id[:10]


def _doc_base_name(doc_hit: Dict) -> str:
    """Return what follows the last ``/`` of the hit path.

    The whole path is returned when it holds no ``/``, and an empty string
    when the path is missing or ends with ``/``.
    """
    full_path = doc_hit[SOURCE].get(DOC_PATH, "")
    _, _, base_name = full_path.rpartition("/")
    return base_name


def _doc_resource_name(doc_hit: Dict) -> str:
    """Return the resource name of a hit which has a non-zero extraction level.

    The name is read from the ``tika_metadata_resourcename`` metadata and
    stripped. When it is wrapped as ``=?...?=``, only the part between the last
    two ``?`` is kept, its ``=`` are turned into ``%`` and the result is
    percent-decoded with ``unquote_plus``.

    An empty string is returned when the extraction level is missing or falsy,
    or when the hit has no resource name.
    """
    src = doc_hit[SOURCE]
    # Hits without a (non-zero) extraction level never get a resource name
    if not src.get(DOC_EXTRACTION_LEVEL, 0):
        return ""
    meta = src.get(DOC_METADATA, dict())
    name = meta.get("tika_metadata_resourcename", "").strip()
    is_wrapped = name.startswith("=?") and name.endswith("?=")
    if not is_wrapped:
        return name
    # "=?<...>?<...>?<text>?=" -> the text is the second-to-last "?" chunk
    encoded_text = name.split("?")[-2]
    # "=XX" escapes become "%XX" so that they can be percent-decoded
    return unquote_plus(encoded_text.replace("=", "%"))


def _doc_title(doc_hit: Dict) -> str:
    """Return the stripped title stored in the hit source, ``""`` if missing."""
    raw_title = doc_hit[SOURCE].get(DOC_TITLE, "")
    return raw_title.strip()


def _default_title(doc_hit: Dict) -> str:
    """Return the first non-empty default title candidate of the hit.

    Candidates are tried in this order: the source title, the resource name,
    the base name of the path and finally the short document id.

    Raises:
        ValueError: when all the candidates are empty.
    """
    # All candidates are computed upfront, from the lowest priority to the
    # highest one, and only then inspected
    short_id = _short_doc_id(doc_hit)
    base_name = _doc_base_name(doc_hit)
    resource_name = _doc_resource_name(doc_hit)
    source_title = _doc_title(doc_hit)
    for candidate in (source_title, resource_name, base_name, short_id):
        if candidate:
            return candidate
    raise ValueError("couldn't find any valid default title")


def _parse_doc_title(doc_hit: Dict) -> str:
    """Pick the title of a document hit.

    The default title is always a candidate. Emails add their metadata titles
    and tweets add their tweet title; the candidate added last has the highest
    priority and the first non-empty one, starting from the end, is returned.

    Raises:
        ValueError: when all the candidates are empty.
    """
    candidates = [_default_title(doc_hit)]
    if _is_email(doc_hit):
        candidates += _email_titles(doc_hit)
    elif _is_tweet(doc_hit):
        candidates += [_tweet_title(doc_hit)]
    chosen = next((c for c in reversed(candidates) if c), None)
    if chosen is None:
        raise ValueError("couldn't find any valid title")
    return chosen


def _coalesce(item: Dict[str, Any], columns: List[str]) -> Optional[Any]:
for c in columns:
value = item.get(c)
Expand Down
22 changes: 15 additions & 7 deletions neo4j-app/neo4j_app/core/neo4j/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,33 @@
DOC_CREATED_AT_META,
DOC_DIRNAME,
DOC_EXTRACTION_DATE,
DOC_EXTRACTION_LEVEL,
DOC_ID,
DOC_METADATA,
DOC_MODIFIED_AT,
DOC_MODIFIED_AT_META,
DOC_NODE,
DOC_PATH,
DOC_ROOT_ID,
DOC_ROOT_TYPE,
DOC_TITLE,
DOC_URL_SUFFIX,
)
from neo4j_app.typing_ import LightCounters

logger = logging.getLogger(__name__)


# Dotted "metadata.<key>" paths of the metadata keys listed in
# DOC_CREATED_AT_META and DOC_MODIFIED_AT_META
_DOC_CREATED_AT_META = ["metadata." + c for c in DOC_CREATED_AT_META]
_DOC_MODIFIED_AT_META = ["metadata." + c for c in DOC_MODIFIED_AT_META]
def _access_attributes(*, variable: str, attributes: List[str]) -> List[str]:
    """Build the ``<variable>.<attribute>`` accessor of each attribute.

    Accessors are returned in the order of ``attributes``.
    """
    accessors = []
    for attribute in attributes:
        accessors.append("{}.{}".format(variable, attribute))
    return accessors


def _coalesce(*, variable: str, attributes: List[str]) -> str:
    """Build a ``coalesce(...)`` expression over attributes of a variable.

    Each attribute is accessed as ``<variable>.<attribute>`` and the accessors
    are passed to ``coalesce`` in the order of ``attributes``.
    """
    accessors = [f"{variable}.{attribute}" for attribute in attributes]
    arguments = ", ".join(accessors)
    return "coalesce(" + arguments + ")"
def _coalesce(values: List[str]) -> str:
    """Wrap the comma-separated ``values`` into a ``coalesce(...)`` expression."""
    arguments = ", ".join(values)
    return "coalesce(" + arguments + ")"


# Dotted "<DOC_METADATA>.<key>" paths of the metadata keys listed in
# DOC_CREATED_AT_META and DOC_MODIFIED_AT_META; the import query accesses them
# on the row and coalesces them to fill the creation and modification dates
_DOC_CREATED_AT_META = [f"{DOC_METADATA}." + c for c in DOC_CREATED_AT_META]
_DOC_MODIFIED_AT_META = [f"{DOC_METADATA}." + c for c in DOC_MODIFIED_AT_META]


async def import_document_rows(
Expand All @@ -48,13 +54,15 @@ async def import_document_rows(
doc.{DOC_CONTENT_TYPE} = row.{DOC_CONTENT_TYPE},
doc.{DOC_CONTENT_LENGTH} = toInteger(row.{DOC_CONTENT_LENGTH}),
doc.{DOC_EXTRACTION_DATE} = datetime(row.{DOC_EXTRACTION_DATE}),
doc.{DOC_EXTRACTION_LEVEL} = toInteger(row.{DOC_EXTRACTION_LEVEL}),
doc.{DOC_DIRNAME} = row.{DOC_DIRNAME},
doc.{DOC_PATH} = row.{DOC_PATH},
doc.{DOC_URL_SUFFIX} = row.{DOC_URL_SUFFIX},
doc.{DOC_CREATED_AT} = datetime({
_coalesce(variable="row", attributes=_DOC_CREATED_AT_META)}),
_coalesce(_access_attributes(variable="row", attributes=_DOC_CREATED_AT_META))}),
doc.{DOC_MODIFIED_AT} = datetime({
_coalesce(variable="row", attributes=_DOC_MODIFIED_AT_META)})
_coalesce(_access_attributes(variable="row", attributes=_DOC_MODIFIED_AT_META))}),
doc.{DOC_TITLE} = row.{DOC_TITLE}
WITH doc, row
WHERE doc.{DOC_ID} = row.{DOC_ID} and row.{DOC_ROOT_ID} IS NOT NULL
MERGE (root:{DOC_NODE} {{{DOC_ID}: row.{DOC_ROOT_ID}}})
Expand Down
4 changes: 3 additions & 1 deletion neo4j-app/neo4j_app/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,15 +354,17 @@ async def neo4j_test_session(
def make_docs(n: int, add_dates: bool = False) -> Generator[Dict, None, None]:
random.seed(a=777)
for i in random.sample(list(range(n)), k=n):
root = f"doc-{i - 1}" if i else None
doc = {
"_index": TEST_PROJECT,
"_id": f"doc-{i}",
"_source": {
"rootDocument": f"doc-{i - 1}" if i else None,
"rootDocument": root,
"dirname": f"dirname-{i}",
"contentType": f"content-type-{i}",
"contentLength": i**2,
"extractionDate": "2023-02-06T13:48:22.3866",
"extractionLevel": int(bool(root)),
"path": f"dirname-{i}",
"type": "Document",
"join": {"name": "Document"},
Expand Down
Loading

0 comments on commit 2d8a2cc

Please sign in to comment.