Skip to content

Commit

Permalink
Merge pull request #333 from cdcd72/feature/support-gpt-4o
Browse files Browse the repository at this point in the history
Support GPT-4o model
  • Loading branch information
memochou1993 authored Jul 9, 2024
2 parents fa947ca + 34adcc9 commit 4793599
Show file tree
Hide file tree
Showing 13 changed files with 132 additions and 20 deletions.
2 changes: 1 addition & 1 deletion app/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ const handleEvents = async (events = []) => (
events
.map((event) => new Event(event))
.filter((event) => event.isMessage)
.filter((event) => event.isText || event.isAudio)
.filter((event) => event.isText || event.isAudio || event.isImage)
.map((event) => new Context(event))
.map((context) => context.initialize()),
))
Expand Down
24 changes: 22 additions & 2 deletions app/context.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import {
addMark,
convertText,
fetchAudio,
fetchImage,
fetchGroup,
fetchUser,
generateTranscription,
Expand Down Expand Up @@ -87,6 +88,9 @@ class Context {
const text = this.transcription.replace(config.BOT_NAME, '').trim();
return addMark(text);
}
if (this.event.isImage) {
return this.transcription.trim();
}
return '?';
}

Expand All @@ -99,6 +103,10 @@ class Context {
const text = this.transcription.toLowerCase();
return text.startsWith(config.BOT_NAME.toLowerCase());
}
if (this.event.isImage) {
const text = this.transcription.toLowerCase();
return text.startsWith(config.BOT_NAME.toLowerCase());
}
return false;
}

Expand All @@ -111,7 +119,14 @@ class Context {
}
if (this.event.isAudio) {
try {
await this.transcribe();
await this.transcribeAudio();
} catch (err) {
return this.pushError(err);
}
}
if (this.event.isImage) {
try {
await this.transcribeImage();
} catch (err) {
return this.pushError(err);
}
Expand Down Expand Up @@ -163,14 +178,19 @@ class Context {
this.source = new Source(sources[this.id]);
}

async transcribe() {
async transcribeAudio() {
const buffer = await fetchAudio(this.event.messageId);
const file = `/tmp/${this.event.messageId}.m4a`;
fs.writeFileSync(file, buffer);
const { text } = await generateTranscription({ file, buffer });
this.transcription = convertText(text);
}

async transcribeImage() {
const base64String = await fetchImage(this.event.messageId);
this.transcription = base64String;
}

/**
* @param {Object} param
* @param {string} param.text
Expand Down
25 changes: 17 additions & 8 deletions app/handlers/talk.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import config from '../../config/index.js';
import { t } from '../../locales/index.js';
import { ROLE_AI, ROLE_HUMAN } from '../../services/openai.js';
import { generateCompletion } from '../../utils/index.js';
import { COMMAND_BOT_CONTINUE, COMMAND_BOT_TALK } from '../commands/index.js';
import { COMMAND_BOT_CONTINUE, COMMAND_BOT_TALK, COMMAND_BOT_FORGET } from '../commands/index.js';
import Context from '../context.js';
import { updateHistory } from '../history/index.js';
import { getPrompt, setPrompt } from '../prompt/index.js';
Expand All @@ -24,14 +24,23 @@ const check = (context) => (
const exec = (context) => check(context) && (
async () => {
const prompt = getPrompt(context.userId);
prompt.write(ROLE_HUMAN, `${t('__COMPLETION_DEFAULT_AI_TONE')(config.BOT_TONE)}${context.trimmedText}`).write(ROLE_AI);
try {
const { text, isFinishReasonStop } = await generateCompletion({ prompt });
prompt.patch(text);
setPrompt(context.userId, prompt);
updateHistory(context.id, (history) => history.write(config.BOT_NAME, text));
const actions = isFinishReasonStop ? [] : [COMMAND_BOT_CONTINUE];
context.pushText(text, actions);
if (context.event.isImage) {
const text = context.trimmedText;
prompt.writeImage(ROLE_HUMAN, text).write(ROLE_AI);
prompt.patch('Get Image');
setPrompt(context.userId, prompt);
updateHistory(context.id, (history) => history.writeImage(ROLE_HUMAN, text));
context.pushText(t('__COMPLETION_GOT_IMAGE_REPLY'), [COMMAND_BOT_FORGET]);
} else {
prompt.write(ROLE_HUMAN, `${t('__COMPLETION_DEFAULT_AI_TONE')(config.BOT_TONE)}${context.trimmedText}`).write(ROLE_AI);
const { text, isFinishReasonStop } = await generateCompletion({ prompt });
prompt.patch(text);
setPrompt(context.userId, prompt);
updateHistory(context.id, (history) => history.write(config.BOT_NAME, text));
const actions = isFinishReasonStop ? [COMMAND_BOT_FORGET] : [COMMAND_BOT_CONTINUE];
context.pushText(text, actions);
}
} catch (err) {
context.pushError(err);
}
Expand Down
21 changes: 21 additions & 0 deletions app/history/history.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,27 @@ class History {
return this;
}

/**
* @param {string} role
* @param {string} content
*/
writeImage(role, content = '') {
const imageContent = [
{
type: 'text',
text: '這是一張圖片',
},
{
type: 'image',
image_url: {
url: content,
},
},
];
this.messages.push(new Message({ role, content: imageContent }));
return this;
}

/**
* @param {string} content
*/
Expand Down
8 changes: 8 additions & 0 deletions app/models/event.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
MESSAGE_TYPE_AUDIO,
MESSAGE_TYPE_STICKER,
MESSAGE_TYPE_TEXT,
MESSAGE_TYPE_IMAGE,
SOURCE_TYPE_GROUP,
} from '../../services/line.js';

Expand Down Expand Up @@ -62,6 +63,13 @@ class Event {
return this.message.type === MESSAGE_TYPE_AUDIO;
}

/**
* @returns {boolean}
*/
get isImage() {
return this.message.type === MESSAGE_TYPE_IMAGE;
}

/**
* @returns {string}
*/
Expand Down
3 changes: 3 additions & 0 deletions app/prompt/message.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ class Message {
}

toString() {
if (Array.isArray(this.content)) {
return `\n${this.role}: ${this.content[0].text}`;
}
return this.role ? `\n${this.role}: ${this.content}` : this.content;
}
}
Expand Down
21 changes: 21 additions & 0 deletions app/prompt/prompt.js
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,27 @@ class Prompt {
return this;
}

/**
* @param {string} role
* @param {string} content
*/
writeImage(role, content = '') {
const imageContent = [
{
type: 'text',
text: '這是一張圖片',
},
{
type: 'image_url',
image_url: {
url: content,
},
},
];
this.messages.push(new Message({ role, content: imageContent }));
return this;
}

/**
* @param {string} content
*/
Expand Down
1 change: 1 addition & 0 deletions locales/en.js
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ const en = {
__COMPLETION_SEARCH_NOT_FOUND: '查無資料', // TODO
__COMPLETION_QUOTATION_MARK_OPENING: '"',
__COMPLETION_QUOTATION_MARK_CLOSING: '"',
__COMPLETION_GOT_IMAGE_REPLY: 'The image has been obtained, please explain the intention.',
__ERROR_ECONNABORTED: 'Timed out',
__ERROR_UNKNOWN: 'Something went wrong',
__ERROR_MAX_GROUPS_REACHED: 'Maximum groups reached',
Expand Down
1 change: 1 addition & 0 deletions locales/ja.js
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ const ja = {
__COMPLETION_SEARCH_NOT_FOUND: '查無資料', // TODO
__COMPLETION_QUOTATION_MARK_OPENING: '「',
__COMPLETION_QUOTATION_MARK_CLOSING: '」',
__COMPLETION_GOT_IMAGE_REPLY: '画像を取得しました、意図を説明してください。',
__ERROR_ECONNABORTED: '接続がタイムアウトしました。',
__ERROR_UNKNOWN: '技術的な問題が発生しています。',
__ERROR_MAX_GROUPS_REACHED: '最大ユーザー数に達しています。',
Expand Down
1 change: 1 addition & 0 deletions locales/zh.js
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ const zh = {
__COMPLETION_SEARCH_NOT_FOUND: '查無資料',
__COMPLETION_QUOTATION_MARK_OPENING: '「',
__COMPLETION_QUOTATION_MARK_CLOSING: '」',
__COMPLETION_GOT_IMAGE_REPLY: '已取得圖片,請說明意圖。',
__ERROR_ECONNABORTED: '這個問題太複雜了',
__ERROR_UNKNOWN: '系統出了點狀況',
__ERROR_MAX_GROUPS_REACHED: '群組數量到達上限了',
Expand Down
31 changes: 22 additions & 9 deletions services/openai.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ export const IMAGE_SIZE_512 = '512x512';
export const IMAGE_SIZE_1024 = '1024x1024';

export const MODEL_GPT_3_5_TURBO = 'gpt-3.5-turbo';
export const MODEL_GPT_4 = 'gpt-4';
export const MODEL_GPT_4_OMNI = 'gpt-4o';
export const MODEL_WHISPER_1 = 'whisper-1';

const client = axios.create({
Expand All @@ -38,21 +38,34 @@ client.interceptors.response.use(handleFulfilled, (err) => {
return handleRejected(err);
});

/**
 * Whether this message list belongs to a vision exchange: after receiving an
 * image, the bot records a sentinel AI message whose content is 'Get Image'.
 * Uses Array#some (short-circuits) instead of a forEach with a mutable flag.
 * @param {Object} param
 * @param {Array<{ role: string, content: * }>} param.messages
 * @returns {boolean}
 */
const isAboutImageCompletion = ({ messages }) => (
  messages.some(({ role, content }) => role === ROLE_AI && content === 'Get Image')
);

/**
 * Call the OpenAI chat completions endpoint.
 * Vision conversations (detected via the 'Get Image' sentinel) are forced to
 * the image-capable gpt-4o model regardless of the configured default.
 * NOTE: the diff rendering had concatenated the deleted and added function
 * bodies, leaving a dangling `}) => {`; this is the reconstructed merged form.
 * @param {Object} param - completion options; unspecified ones fall back to config
 * @returns {Promise} axios response promise for POST /v1/chat/completions
 */
const createChatCompletion = ({
  model = config.OPENAI_COMPLETION_MODEL,
  messages,
  temperature = config.OPENAI_COMPLETION_TEMPERATURE,
  maxTokens = config.OPENAI_COMPLETION_MAX_TOKENS,
  frequencyPenalty = config.OPENAI_COMPLETION_FREQUENCY_PENALTY,
  presencePenalty = config.OPENAI_COMPLETION_PRESENCE_PENALTY,
}) => {
  const body = {
    model: isAboutImageCompletion({ messages }) ? MODEL_GPT_4_OMNI : model,
    messages,
    temperature,
    max_tokens: maxTokens,
    frequency_penalty: frequencyPenalty,
    presence_penalty: presencePenalty,
  };
  return client.post('/v1/chat/completions', body);
};

const createTextCompletion = ({
model = config.OPENAI_COMPLETION_MODEL,
Expand Down
12 changes: 12 additions & 0 deletions utils/fetch-image.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import { fetchContent } from '../services/line.js';

/**
 * Download a LINE image message and encode it as a JPEG data URL.
 * @param {string} messageId - LINE message identifier
 * @returns {Promise<string>} `data:image/jpeg;base64,...` string
 */
const fetchImage = async (messageId) => {
  const { data } = await fetchContent({ messageId });
  const encoded = Buffer.from(data, 'binary').toString('base64');
  return `data:image/jpeg;base64,${encoded}`;
};

export default fetchImage;
2 changes: 2 additions & 0 deletions utils/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import addMark from './add-mark.js';
import convertText from './convert-text.js';
import fetchAnswer from './fetch-answer.js';
import fetchAudio from './fetch-audio.js';
import fetchImage from './fetch-image.js';
import fetchEnvironment from './fetch-environment.js';
import fetchGroup from './fetch-group.js';
import fetchUser from './fetch-user.js';
Expand All @@ -19,6 +20,7 @@ export {
convertText,
fetchAnswer,
fetchAudio,
fetchImage,
fetchEnvironment,
fetchGroup,
fetchUser,
Expand Down

0 comments on commit 4793599

Please sign in to comment.