From 5e5d56ad9f90abc75b0f7d4a2f6487488d9a2b97 Mon Sep 17 00:00:00 2001 From: seohyunjun Date: Fri, 29 Mar 2024 20:07:34 +0900 Subject: [PATCH] add compare korean token count each embedding api model --- .gitignore | 1 + llm_embedding.ipynb | 608 ++++++++++++++++++++++++++++++++++++++ openaiAPI_embedding.ipynb | 2 +- 3 files changed, 610 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 llm_embedding.ipynb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2eea525 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.env \ No newline at end of file diff --git a/llm_embedding.ipynb b/llm_embedding.ipynb new file mode 100644 index 0000000..ecc620c --- /dev/null +++ b/llm_embedding.ipynb @@ -0,0 +1,608 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compare LLM Embedding Korean Token" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Load Env file " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Text\n", + "\n", + "``` text\n", + "대한민국 법률 전문 \n", + "\n", + "유구한 역사와 전통에 빛나는 우리들 대한국민은 기미 삼일운동으로 대한민국을 건립하여 세계에 선포한 위대한 독립정신을 계승하여 이제 민주독립국가를 재건함에 있어서 정의인도와 동포애로써 민족의 단결을 공고히 하며 모든 사회적 폐습을 타파하고 민주주의제제도를 수립하여 정치, 경제, 사회, 문화의 모든 영역에 있어서 각인의 기회를 균등히 하고 능력을 최고도로 발휘케 하며 각인의 책임과 의무를 완수케하여 안으로는 국민생활의 균등한 향상을 기하고 밖으로는 항구적인 국제평화의 유지에 노력하여 우리들과 우리들의 자손의 안전과 자유와 행복을 영원히 확보할 것을 결의하고 우리들의 정당 또 자유로히 선거된 대표로써 구성된 국회에서 단기 4281년 7월 12일 이 헌법을 제정한다.\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "text = \"\"\"유구한 역사와 전통에 빛나는 우리들 대한국민은 기미 
삼일운동으로 대한민국을 건립하여 세계에 선포한 위대한 독립정신을 계승하여 이제 민주독립국가를 재건함에 있어서 정의인도와 동포애로써 민족의 단결을 공고히 하며 모든 사회적 폐습을 타파하고 민주주의제제도를 수립하여 정치, 경제, 사회, 문화의 모든 영역에 있어서 각인의 기회를 균등히 하고 능력을 최고도로 발휘케 하며 각인의 책임과 의무를 완수케하여 안으로는 국민생활의 균등한 향상을 기하고 밖으로는 항구적인 국제평화의 유지에 노력하여 우리들과 우리들의 자손의 안전과 자유와 행복을 영원히 확보할 것을 결의하고 우리들의 정당 또 자유로히 선거된 대표로써 구성된 국회에서 단기 4281년 7월 12일 이 헌법을 제정한다.\"\"\"\n", + "\n", + "def normalize_text(s, sep_token = \" \\n \"):\n", + " s = re.sub(r'\\s+', ' ', s).strip()\n", + " s = re.sub(r\". ,\",\"\",s)\n", + " # remove all instances of multiple spaces\n", + " s = s.replace(\"..\",\".\")\n", + " s = s.replace(\". .\",\".\")\n", + " s = s.replace(\"\\n\", \"\")\n", + " s = s.replace(\"#\",\"\")\n", + " s = s.strip()\n", + " if s ==\"\":\n", + " s = \"\"\n", + " return s\n", + "\n", + "normalize_text = normalize_text(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Text length before and after normalization\n", + "373 -> 373\n" + ] + } + ], + "source": [ + "print(\"Text length before and after normalization\")\n", + "print(f\"{len(text)} -> {len(normalize_text)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1) OpenAI embedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "|MODEL|~ PAGES PER DOLLAR|PERFORMANCE ON MTEB EVAL|MAX INPUT|\n", + "|---|---|---|---|\n", + "|text-embedding-3-small|62,500|62.3%|8191|\n", + "|text-embedding-3-large|9,615|64.6%|8191|\n", + "|text-embedding-ada-002|12,500|61.0%|8191|" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tokenizer Information\n", + "|Model|Tokenizer|\n", + "|---|---|\n", + "|gpt-4|cl100k_base|\n", + "|gpt-3.5-turbo|cl100k_base|\n", + "|gpt-3.5|cl100k_base|\n", + "|gpt-35-turbo|cl100k_base|\n", + "|davinci-002|cl100k_base|\n", + "|babbage-002|cl100k_base|\n", + "|text-embedding-ada-002|cl100k_base|\n", + 
"|text-embedding-3-small|cl100k_base|\n", + "|text-embedding-3-large|cl100k_base|\n", + "|text-davinci-003|p50k_base|\n", + "|text-davinci-002|p50k_base|\n", + "|text-davinci-001|r50k_base|\n", + "|text-curie-001|r50k_base|\n", + "|text-babbage-001|r50k_base|\n", + "|text-ada-001|r50k_base|\n", + "|davinci|r50k_base|\n", + "|curie|r50k_base|\n", + "|babbage|r50k_base|\n", + "|ada|r50k_base|\n", + "|code-davinci-002|p50k_base|\n", + "|code-davinci-001|p50k_base|\n", + "|code-cushman-002|p50k_base|\n", + "|code-cushman-001|p50k_base|\n", + "|davinci-codex|p50k_base|\n", + "|cushman-codex|p50k_base|\n", + "|text-davinci-edit-001|p50k_edit|\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Price (1$=1300₩)\n", + "|Model|Usage|\n", + "|---|---|\n", + "|text-embedding-3-small|$0.02 / 1M tokens|\n", + "|text-embedding-3-large|$0.13 / 1M tokens|\n", + "|ada v2|$0.10 / 1M tokens|" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.000130\n" + ] + } + ], + "source": [ + "print(f\"{0.10 * 1300 / 1000000:.6f}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import tiktoken\n", + "\n", + "enc = tiktoken.get_encoding(encoding_name=\"cl100k_base\")\n", + "result_cl100k_base = enc.encode(normalize_text)\n", + "\n", + "enc = tiktoken.get_encoding(encoding_name=\"r50k_base\")\n", + "result_r50k_base = enc.encode(normalize_text)\n", + "\n", + "enc = tiktoken.get_encoding(encoding_name=\"p50k_base\")\n", + "result_p50k_base = enc.encode(normalize_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OpenAI text-embedding-ada-002 korean encoding\n", + "Origin Text : 373\n", + "Embed Text(cl100k_base) : 396\n", + "Embed Text(r50k_base) : 815\n", + "Embed Text(p50k_base) 
: 815\n" + ] + } + ], + "source": [ + "print(\"OpenAI text-embedding-ada-002 korean encoding\")\n", + "print(f\"Origin Text : {len(normalize_text)}\")\n", + "print(f\"Embed Text(cl100k_base) : {len(result_cl100k_base)}\")\n", + "print(f\"Embed Text(r50k_base) : {len(result_r50k_base)}\")\n", + "print(f\"Embed Text(p50k_base) : {len(result_p50k_base)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Result (ada-002 / cl100k_base)\n", + "- text-embedding-3-small - 0.02 * 1300/1000000*(396)=0.010296₩\n", + "- text-embedding-3-large - 0.13 * 1300/1000000*(396)=0.066924₩\n", + "- ada v2 - 0.10 * 1300/1000000*(396)=0.05148₩" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2) Gemini-PRO" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install google-generativeai" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Set Environment" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# price https://ai.google.dev/pricing\n", + "# count token https://cloud.google.com/vertex-ai/docs/generative-ai/multimodal/get-token-count?hl=ko" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/seohyunjun/anaconda3/envs/python311/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import google.generativeai as genai\n", + "\n", + "genai.configure(api_key=os.environ[\"GOOGLE_API_KEY\"])\n", + "\n", + "model = genai.GenerativeModel('gemini-pro')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "gemini_pro_token = model.count_tokens(text)\n", + "gemini_pro_normalize_token = model.count_tokens(normalize_text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GeminiPro text-embedding-ada-002 korean encoding\n", + "Origin Text : 373\n", + "Embed Text(normal) : 262\n", + "Embed Text(normalize) : 262\n" + ] + } + ], + "source": [ + "print(\"GeminiPro text-embedding-ada-002 korean encoding\")\n", + "print(f\"Origin Text : {len(text)}\")\n", + "print(f\"Embed Text(normal) : {gemini_pro_token.total_tokens}\")\n", + "print(f\"Embed Text(normalize) : {gemini_pro_normalize_token.total_tokens}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Price\n", + "- $0.000125 / 1K characters\n", + "- $0.0025 / image" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.000162\n" + ] + } + ], + "source": [ + "print(f\"{0.000125 * 1300 / 1000:.6f}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Result (gemini-pro)\n", + "- 0.000125 * 1300/1000*(373 characters)=0.0606125₩\n", + "- 0.000125 * 1300/1000*(262 tokens)=0.042575₩" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3) Claude 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Set Environment" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + 
"outputs": [], + "source": [ + "# !pip install anthropic\n", + "# !pip install -U voyageai" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### [Embedding model](https://docs.voyageai.com/docs/pricing?ref=anthropic)\n", + "\n", + "\n", + "|Model|Price per thousand tokens|Price per million tokens|Number of free tokens|\n", + "|---|---|---|---|\n", + "|voyage-2|$0.0001|$0.1|50 million|\n", + "|voyage-large-2|$0.00012|$0.12|50 million|\n", + "|voyage-code-2|$0.00012|$0.12|50 million|\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import voyageai\n", + "\n", + "vo = voyageai.Client(os.getenv(\"VOYAGE_API_KEY\"))\n", + "# This will automatically use the environment variable VOYAGE_API_KEY.\n", + "# Alternatively, you can use \n", + "# \n", + "voyage2_token = vo.embed(text, model=\"voyage-2\", input_type=\"document\")\n", + "voyage2_normalize_token = vo.embed(normalize_text, model=\"voyage-2\", input_type=\"document\")\n", + "\n", + "voyage2_large_token = vo.embed(text, model=\"voyage-large-2\", input_type=\"document\")\n", + "voyage2_large_normalize_token = vo.embed(text, model=\"voyage-large-2\", input_type=\"document\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Anthropic Voyage-2 korean embedding\n", + "Origin Text : 373\n", + "Embed Text(voyage-2) : 543\n", + "Embed Normalize Text(voyage-2) : 543\n", + "Embed Text(voyage-large-2) : 543\n", + "Embed Normalize Text(voyage-large-2): 543\n" + ] + } + ], + "source": [ + "print(\"Anthropic Voyage-2 korean embedding\")\n", + "print(f\"Origin Text : {len(normalize_text)}\")\n", + "print(f\"Embed Text(voyage-2) : {voyage2_token.total_tokens}\")\n", + "print(f\"Embed Normalize Text(voyage-2) : {voyage2_normalize_token.total_tokens}\")\n", + "print(f\"Embed Text(voyage-large-2) : 
{voyage2_large_token.total_tokens}\")\n", + "print(f\"Embed Normalize Text(voyage-large-2): {voyage2_large_normalize_token.total_tokens}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Anthropic Voyage-2 korean embedding Price\n", + "Origin Text : 0.0000009698₩\n", + "Embed Text(voyage-2) : 0.0000014118₩\n", + "Embed Normalize Text(voyage-2) : 0.0000014118₩\n", + "Embed Text(voyage-large-2) : 0.0000016942₩\n", + "Embed Normalize Text(voyage-large-2): 0.0000016942₩\n" + ] + } + ], + "source": [ + "print(\"Anthropic Voyage-2 korean embedding Price\")\n", + "print(f\"Origin Text : {len(normalize_text)* 0.0001 * 1300 / 50000000:.10f}₩\")\n", + "print(f\"Embed Text(voyage-2) : {voyage2_token.total_tokens * 0.0001 * 1300 / 50000000:.10f}₩\")\n", + "print(f\"Embed Normalize Text(voyage-2) : {voyage2_normalize_token.total_tokens* 0.0001 * 1300 / 50000000:.10f}₩\")\n", + "print(f\"Embed Text(voyage-large-2) : {voyage2_large_token.total_tokens* 0.00012 * 1300 / 50000000:.10f}₩\")\n", + "print(f\"Embed Normalize Text(voyage-large-2): {voyage2_large_normalize_token.total_tokens* 0.00012 * 1300 / 50000000:.10f}₩\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install -U openai==1.2.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4) Solar" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI # openai==1.2.0\n", + "\n", + "client = OpenAI(\n", + " api_key=os.getenv(\"SOLAR_API_KEY\"),\n", + " base_url=\"https://api.upstage.ai/v1/solar\"\n", + ")\n", + "\n", + "# solar-1-mini-embedding-query and solar-1-mini-embedding-passage\n", + "from openai import OpenAI # openai==1.2.0\n", + "\n", + "solar_query_text = client.embeddings.create(\n", + " input=text,\n", + " 
model=\"solar-1-mini-embedding-query\",\n", + " encoding_format=\"float\" #encoding_format should be float, not be base64\n", + ")\n", + "\n", + "solar_query_normalize_text = client.embeddings.create(\n", + " input=normalize_text,\n", + " model=\"solar-1-mini-embedding-query\",\n", + " encoding_format=\"float\" #encoding_format should be float, not be base64\n", + ")\n", + "\n", + "\n", + "solar_passage_text = client.embeddings.create(\n", + " input=text,\n", + " model=\"solar-1-mini-embedding-passage\",\n", + " encoding_format=\"float\" #encoding_format should be float, not be base64\n", + ")\n", + "\n", + "solar_passage_normalize_text = client.embeddings.create(\n", + " input=normalize_text,\n", + " model=\"solar-1-mini-embedding-passage\",\n", + " encoding_format=\"float\" #encoding_format should be float, not be base64\n", + ")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OpenAI Solar-1-mini korean embedding\n", + "Origin Text : 373\n", + "Embed Text(solar-1-mini-query) : 190\n", + "Embed Normalize Text(solar-1-mini-query) : 190\n", + "Embed Text(solar-1-mini-passage) : 172\n", + "Embed Normalize Text(solar-1-mini-passage) : 172\n" + ] + } + ], + "source": [ + "\n", + "print(\"OpenAI Solar-1-mini korean embedding\")\n", + "print(f\"Origin Text : {len(text)}\")\n", + "print(f\"Embed Text(solar-1-mini-query) : {solar_query_text.usage.total_tokens}\")\n", + "print(f\"Embed Normalize Text(solar-1-mini-query) : {solar_query_normalize_text.usage.total_tokens}\")\n", + "print(f\"Embed Text(solar-1-mini-passage) : {solar_passage_text.usage.total_tokens}\")\n", + "print(f\"Embed Normalize Text(solar-1-mini-passage) : {solar_passage_normalize_text.usage.total_tokens}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + 
"|Company|Embedding Model|Price|Token Count|Official Price|\n", + "|---|---|---|---|---|\n", + "|OpenAI|text-embedding-3-small|0.010296₩|396|$0.02 / 1M tokens|\n", + "|OpenAI|text-embedding-3-large|0.066924₩|396|$0.13 / 1M tokens|\n", + "|OpenAI|ada-v2|0.05148₩|396|$0.10 / 1M tokens|\n", + "|Gemini-PRO|gemini-pro|0.042575₩|262|$0.000125 / 1K characters|\n", + "|Claude 3|voyage-2|0.07059₩|543|$0.0001 / 1K tokens (first 50M tokens free)|\n", + "|Claude 3|voyage-large-2|0.084708₩|543|$0.00012 / 1K tokens (first 50M tokens free)|\n", + "|Solar|solar-1-mini-query|-|190|Free|\n", + "|Solar|solar-1-mini-passage|-|172|Free|\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python311", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/openaiAPI_embedding.ipynb b/openaiAPI_embedding.ipynb index b2f5524..96774e1 100644 --- a/openaiAPI_embedding.ipynb +++ b/openaiAPI_embedding.ipynb @@ -1377,7 +1377,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.11.7" }, "orig_nbformat": 4 },