Skip to content

Commit

Permalink
⚗️(backend) function to extract text from base64 yjs document
Browse files Browse the repository at this point in the history
Function to extract text from base64 yjs document.
Can be useful if we need to index the content
of the documents.
  • Loading branch information
AntoLC committed Sep 20, 2024
1 parent ac86a4e commit 1ee8e5f
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ and this project adheres to

## [Unreleased]

## Added

- ⚗️(backend) Extract text from base64 yjs document #270


## [1.4.0] - 2024-09-17

Expand Down
28 changes: 27 additions & 1 deletion src/backend/core/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import pytest

from core.utils import email_invitation
from core.utils import email_invitation, yjs_base64_to_text

pytestmark = pytest.mark.django_db

Expand Down Expand Up @@ -85,3 +85,29 @@ def test_utils__email_invitation_failed(mock_logger, _mock_send_mail):

assert email == "[email protected]"
assert isinstance(exception, smtplib.SMTPException)


def test_yjs_base64_to_text():
    """
    yjs_base64_to_text should return the plain text of a stored document.

    The payload below is an example of what is saved in the database. It was
    generated from the blocknote editor and holds a heading "*Hello*"
    followed by a bullet list item "w**or**ld".
    """
    stored_content = (
        "AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
        "aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
        "ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
        "1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
        "YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
        "AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
        "BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
        "bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
        "ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
        "ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
        "bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
        "9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
        "dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
    )

    extracted_text = yjs_base64_to_text(stored_content)

    # Formatting marks (italic, bold) and block structure must be dropped.
    assert extracted_text == "Hello world"
18 changes: 18 additions & 0 deletions src/backend/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Utilities for the core app.
"""

import base64
import smtplib
from logging import getLogger

Expand All @@ -12,6 +13,9 @@
from django.utils.translation import gettext_lazy as _
from django.utils.translation import override

import y_py as Y
from bs4 import BeautifulSoup

logger = getLogger(__name__)


Expand All @@ -38,3 +42,17 @@ def email_invitation(language, email, document_id):

except smtplib.SMTPException as exception:
logger.error("invitation to %s was not sent: %s", email, exception)


def yjs_base64_to_text(base64_string):
    """
    Extract the plain text from a base64-encoded Yjs document.

    The string is base64-decoded, applied as an update to a fresh Yjs
    document, and the "document-store" XML element of that document is
    serialized and stripped of its markup.

    Args:
        base64_string: Base64 representation of a Yjs update. ``None`` or an
            empty value is treated as a document without content.

    Returns:
        The text nodes of the document joined by single spaces, with leading
        and trailing whitespace removed; an empty string when there is
        nothing to decode.
    """
    # Nothing to decode: avoid handing None to base64 (TypeError) or a
    # zero-length update to y_py, neither of which is a readable document.
    if not base64_string:
        return ""

    decoded_bytes = base64.b64decode(base64_string)
    uint8_array = bytearray(decoded_bytes)

    doc = Y.YDoc()  # pylint: disable=E1101
    Y.apply_update(doc, uint8_array)  # pylint: disable=E1101
    blocknote_structure = str(doc.get_xml_element("document-store"))

    # The serialized element is XML-like markup; BeautifulSoup is only used
    # to drop the tags and keep the text nodes.
    soup = BeautifulSoup(blocknote_structure, "html.parser")
    return soup.get_text(separator=" ").strip()
2 changes: 2 additions & 0 deletions src/backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ license = { file = "LICENSE" }
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"beautifulsoup4==4.12.3",
"boto3==1.35.10",
"Brotli==1.1.0",
"celery[redis]==5.4.0",
Expand Down Expand Up @@ -57,6 +58,7 @@ dependencies = [
"WeasyPrint>=60.2",
"whitenoise==6.7.0",
"mozilla-django-oidc==4.0.1",
"y-py==0.5.5",
]

[project.urls]
Expand Down

0 comments on commit 1ee8e5f

Please sign in to comment.