Skip to content

WhatsApp Markdown Sanity Check #592

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 86 additions & 0 deletions daras_ai/mdit_wa_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from mdformat.renderer._context import (
RenderContext,
make_render_children,
_render_inline_as_text,
WRAP_POINT,
)
from mdformat.renderer._tree import RenderTreeNode
from mdformat.plugins import ParserExtensionInterface
from mdformat.renderer._util import maybe_add_link_brackets


def wa_heading_renderer(node: RenderTreeNode, context: RenderContext) -> str:
    """Render a heading as a single bold (``*...*``) span.

    The heading's inline children are rendered first; asterisks at either
    edge of the result are removed so a heading that already starts or ends
    with bold text does not end up with doubled ``*`` markers.

    Args:
        node: The ``heading`` node being rendered.
        context: The active mdformat render context.

    Returns:
        The heading text wrapped in ``*``, or ``""`` when the heading has no
        text at all.
    """
    text = make_render_children(separator="")(node, context)
    text = text.strip("*")

    if not text.strip():
        # Without this guard an empty heading is emitted as a literal "**".
        return ""

    return f"*{text}*"


def wa_em_renderer(node: RenderTreeNode, context: RenderContext) -> str:
    """Render an ``em`` node: its rendered children wrapped in underscores."""
    render_inner = make_render_children(separator="")
    return f"_{render_inner(node, context)}_"


def wa_strong_renderer(node: RenderTreeNode, context: RenderContext) -> str:
    """Render a ``strong`` node: its rendered children wrapped in single asterisks."""
    marker = "*"
    inner = make_render_children(separator="")(node, context)
    return "".join((marker, inner, marker))


def wa_link_renderer(node: RenderTreeNode, context: RenderContext) -> str:
    """Render a link as plain text: ``text (url)``, or just the URL.

    Args:
        node: The ``link`` node being rendered.
        context: The active mdformat render context.

    Returns:
        * For autolinks (``node.info == "auto"``): the bare target, with a
          ``mailto:`` prefix dropped when the visible text did not carry it.
        * For links whose text is empty or identical to the target: the
          target alone, so the URL is neither orphaned in parentheses nor
          printed twice.
        * Otherwise: ``"<text> (<url>)"``.
    """
    uri = node.attrs["href"]

    if node.info == "auto":
        children = node.children
        # Tolerate an autolink without a text child instead of raising IndexError.
        shown = children[0].content if children else ""
        # Remove 'mailto:' if the URL is a mailto link and the content doesn't start with 'mailto:'
        if uri.startswith("mailto:") and not shown.startswith("mailto:"):
            uri = uri[len("mailto:"):]
        return f"{uri}"

    # Get the display text for the link
    text = "".join(child.render(context) for child in node.children)

    if not text.strip() or text == uri:
        # "[](url)" used to render as " (url)" and "[url](url)" as "url (url)".
        return f"{uri}"

    return f"{text} ({uri})"


def wa_image_renderer(node: RenderTreeNode, context: RenderContext) -> str:
    """Render an image as text: its description followed by where it points.

    Args:
        node: The ``image`` node being rendered.
        context: The active mdformat render context; ``context.env["used_refs"]``
            is updated when the image uses a reference label.

    Returns:
        * Reference-style image: ``"[description]"`` when the description
          equals the label (case-insensitively), else ``"description [label]"``.
        * Inline image: ``'description (uri "title")'`` when a title attribute
          is present, else ``"description (uri)"``.
    """
    description = _render_inline_as_text(node, context)

    ref_label = node.meta.get("label")
    if ref_label:
        # Record the label as used.
        # NOTE(review): presumably mdformat uses this set to decide which
        # reference definitions to write out — confirm.
        context.env["used_refs"].add(ref_label)
        ref_label_repr = ref_label.lower()
        if description.lower() == ref_label_repr:
            return f"[{description}]"
        # No leading space here: every other branch starts directly with the
        # description, and a leading space produced doubled/leading blanks.
        return f"{description} [{ref_label_repr}]"

    uri = node.attrs["src"]
    assert isinstance(uri, str)
    uri = maybe_add_link_brackets(uri)
    title = node.attrs.get("title")
    if title is not None:
        return f'{description} ({uri} "{title}")'
    return f"{description} ({uri})"


def wa_hr_renderer(node: RenderTreeNode, context: RenderContext) -> str:
    """Render a thematic break (``hr``) as an empty string; both arguments are unused."""
    rendered = ""
    return rendered


def wa_strikethrough_renderer(node: RenderTreeNode, context: RenderContext) -> str:
    """Render an ``s`` node: its rendered children wrapped in single tildes."""
    # Children are concatenated with no separator before being wrapped.
    struck = make_render_children(separator="")(node, context)
    return "~" + struck + "~"


class WhatsappParser(ParserExtensionInterface):
    """mdformat parser extension that supplies this module's WhatsApp renderers."""

    # Syntax name -> renderer function defined in this module.
    RENDERERS = dict(
        heading=wa_heading_renderer,
        em=wa_em_renderer,
        strong=wa_strong_renderer,
        link=wa_link_renderer,
        hr=wa_hr_renderer,
        image=wa_image_renderer,
        s=wa_strikethrough_renderer,
    )
117 changes: 116 additions & 1 deletion daras_ai/text_format.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,37 @@
import ast

import re
import parse
import requests
from typing import Mapping, Any

from furl import furl
from markdown_it import MarkdownIt
from mdformat.renderer import MDRenderer

from daras_ai.image_input import upload_file_from_bytes
from daras_ai.mdit_wa_plugin import WhatsappParser
from daras_ai_v2.exceptions import raise_for_status
from daras_ai_v2.tts_markdown_renderer import RendererPlain
from daras_ai_v2.text_splitter import new_para
from loguru import logger


# Evaluates to the literal string "{{{{{}}}}}".
# NOTE(review): presumably a `parse`-library pattern for a double-braced
# placeholder — confirm at the usage site.
input_spec_parse_pattern = "{" * 5 + "}" * 5

# Options passed to `MDRenderer().render(...)` by `wa_markdown`; the
# "parser_extension" entry plugs in the WhatsApp renderers.
# NOTE(review): "number" looks like mdformat's consecutive-list-numbering
# option — confirm against the mdformat docs.
WA_FORMATTING_OPTIONS: Mapping[str, Any] = {
"mdformat": {"number": True},
"parser_extension": [WhatsappParser],
}

# Content types that `process_wa_image_urls` checks a URL's reported MIME type
# against before downloading the image.
WHATSAPP_VALID_IMAGE_FORMATS = [
"image/jpeg",
"image/png",
"image/gif",
"image/tiff",
"image/webp",
"image/bmp",
]


def daras_ai_format_str(format_str, variables):
from glom import glom
Expand Down Expand Up @@ -48,3 +72,94 @@ def format_number_with_suffix(num: int) -> str:
def unmarkdown(text: str) -> str:
    """Convert markdown *text* to plain text using the ``RendererPlain`` renderer."""
    plain_md = MarkdownIt(renderer_cls=RendererPlain)
    return plain_md.render(text)


def extract_image_urls(tokens) -> list[str]:
    """Collect the ``src`` of every image child of the ``inline`` tokens.

    Only tokens whose ``type`` is ``"inline"`` and that have children are
    inspected; image children lacking a ``src`` attribute are ignored. URLs
    are returned in document order, duplicates included.
    """
    return [
        child.attrs["src"]
        for token in tokens
        if token.type == "inline" and token.children
        for child in token.children
        if child.type == "image" and "src" in child.attrs
    ]


def get_mimetype_from_url(url: str, *, timeout: float = 10) -> str:
    """Return the MIME type a server reports for *url*, via a HEAD request.

    Args:
        url: The URL to probe.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The bare, lower-cased media type from the ``Content-Type`` header
        (parameters such as ``; charset=...`` are removed), or
        ``"application/octet-stream"`` when the header is missing or the
        request fails with a ``requests.RequestException``.
    """
    fallback = "application/octet-stream"
    try:
        # requests.head() does not follow redirects unless asked to, and has
        # no timeout by default; without these a redirecting URL reports the
        # redirect's own content type and a stalled server hangs the caller.
        r = requests.head(url, allow_redirects=True, timeout=timeout)
        raise_for_status(r)
    except requests.RequestException as e:
        logger.warning(f"Error fetching mimetype for {url}: {e}")
        return fallback

    content_type = r.headers.get("content-type") or fallback
    # "image/PNG; charset=binary" -> "image/png", so callers can compare exactly.
    return content_type.split(";", 1)[0].strip().lower() or fallback


def _wa_png_filename(content_disposition: str, url_path: str) -> str:
    """Pick an upload name ending in ``.png`` from a Content-Disposition header or URL path."""
    match = re.search(r'filename="?([^";]+)"?', content_disposition, re.IGNORECASE)
    name = match.group(1).strip() if match else ""
    if not name:
        # No usable filename parameter: fall back to the last URL path segment.
        name = url_path.rstrip("/").rsplit("/", 1)[-1]
    stem = name.rsplit(".", 1)[0] if "." in name else name
    return f"{stem or 'image'}.png"


def process_wa_image_urls(image_urls: list[str]) -> list[str]:
    """Return image links to send, converting non-PNG/JPEG images to PNG.

    For each http(s) URL whose reported MIME type is in
    ``WHATSAPP_VALID_IMAGE_FORMATS`` the image is downloaded and decoded:
    PNG/JPEG images keep their original URL, anything else is re-encoded as
    PNG, uploaded with ``upload_file_from_bytes`` and replaced by the value
    that call returns. URLs with another scheme or MIME type are left out.

    Args:
        image_urls: Candidate image URLs.

    Returns:
        The list of usable image links, in input order.

    Raises:
        Whatever ``raise_for_status`` raises for a failed download; this is
        not caught here.
    """
    if not image_urls:
        # Nothing to process; also avoids importing wand for image-free input.
        return []

    from wand.image import Image

    processed_images = []
    for image_url in image_urls:
        parsed_url = furl(image_url)
        if parsed_url.scheme not in ("http", "https"):
            continue

        # Strip Content-Type parameters/casing so the whitelist check is exact.
        mime_type = get_mimetype_from_url(image_url).split(";", 1)[0].strip().lower()
        if mime_type not in WHATSAPP_VALID_IMAGE_FORMATS:
            continue

        # An explicit timeout keeps a stalled server from blocking the caller.
        r = requests.get(image_url, timeout=30)
        raise_for_status(r)

        with Image(blob=r.content) as img:
            if img.format.lower() in ("png", "jpeg"):
                processed_images.append(image_url)
                continue
            png_blob = img.make_blob(format="png")

        # The previous header parsing produced "" (no header), the whole
        # header value (no plain `filename=`), or trailing junk (extra
        # parameters); it also kept the pre-conversion extension.
        filename = _wa_png_filename(
            r.headers.get("content-disposition", ""), str(parsed_url.path)
        )
        processed_images.append(upload_file_from_bytes(filename, png_blob, "image/png"))

    return processed_images


def wa_markdown(text: str) -> str | tuple[list[str | Any], str]:
"""commonmark to WA compatible Markdown"""

if text is None:
return ""

md = MarkdownIt("commonmark").enable("strikethrough")
tokens = md.parse(text)
image_urls = extract_image_urls(tokens)
processed_images = process_wa_image_urls(image_urls)
whatsapp_msg_text = MDRenderer().render(
tokens, options=WA_FORMATTING_OPTIONS, env={}
)
return processed_images, whatsapp_msg_text


def is_list_item_complete(text: str) -> bool:
    """Report whether the last paragraph of *text* contains a list item line.

    *text* is stripped and split on ``new_para``; the final piece is scanned
    line by line for a bullet (``*``, ``+``, ``-``) or ``<digits>.`` marker
    followed by whitespace. ``None`` input gives ``False``.
    """
    if text is None:
        return False

    # re.split always yields at least one piece, so [-1] is safe.
    final_block = re.split(new_para, text.strip())[-1].strip()
    marker = re.compile(r"^\s*(?:[*+\-]|\d+\.)\s+")

    for line in final_block.split("\n"):
        if line.strip() and marker.match(line):
            return True
    return False
29 changes: 29 additions & 0 deletions daras_ai_v2/facebook_bots.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from daras_ai_v2.bots import BotInterface, ReplyButton, ButtonPressed
from daras_ai_v2.exceptions import raise_for_status
from daras_ai_v2.text_splitter import text_splitter
from daras_ai.text_format import wa_markdown

WA_MSG_MAX_SIZE = 1024

Expand Down Expand Up @@ -138,6 +139,8 @@ def send_msg_to(
) -> str | None:
# see https://developers.facebook.com/docs/whatsapp/api/messages/media/

images, text = wa_markdown(text)

# split text into chunks if too long
if text and len(text) > WA_MSG_MAX_SIZE:
splits = text_splitter(
Expand Down Expand Up @@ -190,6 +193,32 @@ def send_msg_to(
},
},
]

elif images:
if buttons:
messages = _build_msg_buttons(
buttons,
{
"body": {
"text": text or "\u200b",
},
"header": {
"type": "image",
"image": {"link": images[0]},
},
},
)
else:
messages = [
{
"type": "image",
"image": {
"link": images[0],
"caption": text,
},
},
]

elif buttons:
# interactive text msg
messages = _build_msg_buttons(
Expand Down
5 changes: 5 additions & 0 deletions daras_ai_v2/language_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
)

from daras_ai.image_input import gs_url_to_uri, bytes_to_cv2_img, cv2_img_to_bytes
from daras_ai.text_format import is_list_item_complete
from daras_ai_v2.asr import get_google_auth_session
from daras_ai_v2.exceptions import raise_for_status, UserError
from daras_ai_v2.gpu_server import call_celery_task
Expand Down Expand Up @@ -1242,6 +1243,10 @@ def _stream_openai_chunked(
if not (isinstance(last_part, str) and last_part.strip()):
continue

# add regex to handle not breaking in list items
if is_list_item_complete(chunk):
continue

# iterate through the separators and find the best one that matches
for sep in default_separators[:-1]:
# find the last occurrence of the separator
Expand Down
2 changes: 1 addition & 1 deletion daras_ai_v2/vector_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ def do_check_document_updates(
metadatas = yield from apply_parallel(
doc_or_yt_url_to_file_metas,
lookups.keys(),
message="Fetching latest knowlege docs...",
message="Fetching latest knowledge docs...",
max_workers=100,
)

Expand Down
38 changes: 37 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ python-pptx = "^1.0.2"
azure-identity = "^1.19.0"
azure-keyvault-secrets = "^4.9.0"
xlrd = "^2.0.1"
mdformat = "^0.7.21"

[tool.poetry.group.dev.dependencies]
watchdog = "^2.1.9"
Expand Down
Loading
Loading