From 3400e1ddac3005498b1c5c61bef997fc964dddf3 Mon Sep 17 00:00:00 2001 From: Mark Ericksen Date: Thu, 2 May 2024 17:36:21 -0600 Subject: [PATCH] updated OpenAI to use ContentPart's media option for setting image media data when base64 encoded --- MODEL_BEHAVIORS.md | 10 +++++++++- lib/chat_models/chat_open_ai.ex | 16 +++++++++++++++- lib/message/content_part.ex | 8 ++++++++ test/chat_models/chat_open_ai_test.exs | 18 ++++++++++++++++++ 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/MODEL_BEHAVIORS.md b/MODEL_BEHAVIORS.md index 84f92b9d..dced858f 100644 --- a/MODEL_BEHAVIORS.md +++ b/MODEL_BEHAVIORS.md @@ -8,4 +8,12 @@ This is to document some of the differences between model behaviors. This is spe - 2024-04-17 - Anthropic's Claude 3 does respond well to pre-filled assistant responses and it is officially encouraged. - - 2024-04-17 \ No newline at end of file + - 2024-04-17 + +## Image content parts + +- Anthropic will not work if the media type is included with the base64 data. i.e. `"data:image/jpeg;base64," <> "..."` + - Requires providing the media type as a separate option. +- ChatGPT - + - When `base64` data is given, requires the image data to be prefixed with its media type in data-URL form. 
+ - Ex: `"data:image/jpeg;base64," <> "..."` diff --git a/lib/chat_models/chat_open_ai.ex b/lib/chat_models/chat_open_ai.ex index 210755dd..6d67d1b3 100644 --- a/lib/chat_models/chat_open_ai.ex +++ b/lib/chat_models/chat_open_ai.ex @@ -239,7 +239,21 @@ defmodule LangChain.ChatModels.ChatOpenAI do end def for_api(%ContentPart{type: image} = part) when image in [:image, :image_url] do - %{"type" => "image_url", "image_url" => %{"url" => part.content}} + media_prefix = + case Keyword.get(part.options || [], :media, nil) do + nil -> + "" + + type when is_binary(type) -> + "data:#{type};base64," + + other -> + message = "Received unsupported media type for ContentPart: #{inspect(other)}" + Logger.error(message) + raise LangChainError, message + end + + %{"type" => "image_url", "image_url" => %{"url" => media_prefix <> part.content}} end # ToolCall support diff --git a/lib/message/content_part.ex b/lib/message/content_part.ex index a9c1897a..849de936 100644 --- a/lib/message/content_part.ex +++ b/lib/message/content_part.ex @@ -84,6 +84,14 @@ defmodule LangChain.Message.ContentPart do - `:media` - Provide the "media type" for the image. Examples: "image/jpeg", "image/png", etc. ChatGPT does not require this but other LLMs may. + + ChatGPT requires media type information to prefix the base64 content. Setting + the `media: "image/jpeg"` type will do that. Otherwise the data must be + provided with the required prefix. + + Anthropic requires the media type information to be submitted as separate + information with the JSON request. This media option provides an abstraction + to normalize the behavior. 
""" @spec image!(String.t(), Keyword.t()) :: t() | no_return() def image!(content, opts \\ []) do diff --git a/test/chat_models/chat_open_ai_test.exs b/test/chat_models/chat_open_ai_test.exs index 590afd7a..e76ea0c8 100644 --- a/test/chat_models/chat_open_ai_test.exs +++ b/test/chat_models/chat_open_ai_test.exs @@ -177,6 +177,24 @@ defmodule LangChain.ChatModels.ChatOpenAITest do assert result == expected end + test "turns an image ContentPart with base64 media into the expected JSON format" do + expected = %{ + "type" => "image_url", + "image_url" => %{"url" => "data:image/jpeg;base64,image_base64_data"} + } + + result = ChatOpenAI.for_api(ContentPart.image!("image_base64_data", media: "image/jpeg")) + assert result == expected + + expected = %{ + "type" => "image_url", + "image_url" => %{"url" => "data:image/png;base64,image_base64_data"} + } + + result = ChatOpenAI.for_api(ContentPart.image!("image_base64_data", media: "image/png")) + assert result == expected + end + test "turns an image_url ContentPart into the expected JSON format" do expected = %{"type" => "image_url", "image_url" => %{"url" => "url-to-image"}} result = ChatOpenAI.for_api(ContentPart.image_url!("url-to-image"))