From 10d36f150bf4ac53b9bc33723409cab3c8c5d82b Mon Sep 17 00:00:00 2001 From: Yevheniy Oliynyk Date: Mon, 27 May 2024 08:54:43 +0200 Subject: [PATCH] Code refactoring (#5) --- cli.js | 1 + src/configure.js | 6 +- src/harvest.js | 283 +++++++++++++++++++++++++++++++---------------- src/reset.js | 55 ++++----- src/upload.js | 13 ++- src/utils.js | 114 +++++++++++++++---- 6 files changed, 318 insertions(+), 154 deletions(-) diff --git a/cli.js b/cli.js index 5d3fc41..d92b769 100755 --- a/cli.js +++ b/cli.js @@ -9,6 +9,7 @@ import chalk from 'chalk'; import fs from 'fs'; import path from 'path'; import { fileURLToPath } from 'url'; + const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); diff --git a/src/configure.js b/src/configure.js index 0ae1b97..b3eaa98 100644 --- a/src/configure.js +++ b/src/configure.js @@ -3,7 +3,7 @@ import { getCrowdin } from './utils.js'; import chalk from 'chalk'; import axios from 'axios'; -async function configureCli(name, commandOptions, command) { +async function configureCli(_name, commandOptions, _command) { const options = commandOptions.opts(); const questions = [{ @@ -170,7 +170,9 @@ async function configureCli(name, commandOptions, command) { const answers = await inquirer.prompt(questions); console.log(chalk.hex('#FFA500').bold('\nYou can now execute the harvest command by running:\n')); - console.log(chalk.green(`crowdin-context-harvester `) + + + console.log( + chalk.green(`crowdin-context-harvester `) + chalk.blue('harvest ') + (answers.org ? chalk.yellow('--org=') + chalk.white(`"${answers.org}" `) : '') + (answers.token ? 
chalk.yellow('--token=') + chalk.white(`"${answers.token}" `) : '') + diff --git a/src/harvest.js b/src/harvest.js index 1df7f32..4979b0c 100644 --- a/src/harvest.js +++ b/src/harvest.js @@ -1,24 +1,79 @@ +//@ts-check +import axios from 'axios'; +import chalk from 'chalk'; +import cliWidth from 'cli-width'; import fs from 'fs'; import { globSync } from 'glob'; +import { encode } from 'gpt-tokenizer'; +import { Parser } from 'json2csv'; import ora from 'ora'; -import { getCrowdin, getCrowdinFiles, fetchCrowdinStrings, uploadAiStringsToCrowdin } from './utils.js'; -import chalk from 'chalk'; -import { encode } from 'gpt-tokenizer' -import axios from 'axios'; import { table } from 'table'; -import { Parser } from 'json2csv'; -import cliWidth from 'cli-width'; +import { fetchCrowdinStrings, getCrowdin, getCrowdinFiles, uploadAiStringsToCrowdin } from './utils.js'; const AI_MODEL_CONTEXT_WINDOW = 128000; // the context window size of the recommended AI model +// tools that are used in the AI model. this way we get more predictable results from the model +const AI_TOOLS = [{ + type: "function", + function: { + name: "setContext", + description: "Always use this function to return the context.", + parameters: { + type: "object", + properties: { + contexts: { + type: "array", + items: { + type: "object", + properties: { + id: { + type: "number", + description: "Key ID of the string. This is the ID of the string that you are providing context for." + }, + context: { + type: "string", + description: "Context of the string. This is the context that you are providing for the string." + } + }, + required: ["id", "context"] + }, + } + } + } + } +}]; + +const DEFAULT_PROMPT = `Extract the context for the following UI labels. + +- Context is useful information for linguists or an AI translating these texts about how the text is used in the project they are localizing or when it appears in the UI. 
+- Only provide context if exact matches of the strings or keys are found in the code. +- If no matches are found, do not provide context. +- Only return context if you find a key or text usage in the code. +- Any context provided should start with 'Used as...' or 'Appears as...'. +- Always call the setContext tool to return the context. + +Strings: +%strings% + +Code: +%code%`; + const spinner = ora(); -// stringifies chat messages and encodes them into tokens to measure the length +/** + * Stringifies chat messages and encodes them into tokens to measure the length + * + * @param {Array} messages + */ function encodeChat(messages) { return encode(messages.map(message => message.content).join('\n\n')); } -// prints the strings that would be updated in a dry run +/** + * Prints the strings that would be updated in a dry run + * + * @param {Array} strings + */ function dryRunPrint(strings) { const stringsWithAiContext = strings.filter((string) => string.aiContext); @@ -34,20 +89,20 @@ function dryRunPrint(strings) { alignment: 'center', content: 'Strings with AI Context' }, - columns: { - 0: { + columns: [ + { width: idColumnWidth, wrapWord: true }, - 1: { + { width: textColumnWidth, wrapWord: true }, - 2: { + { width: contextColumnWidth, wrapWord: true } - } + ] }; let data = []; @@ -61,13 +116,19 @@ function dryRunPrint(strings) { return; } - console.log('\n') + console.log('\n'); + //@ts-ignore console.log(table(data, config)); console.log(`\n${stringsWithAiContext.length} strings would be updated. 
Please be aware that an LLM model may return different results for the same input next time you run the tool.\n`); } -// writes the strings with AI context to a CSV file +/** + * Writes the strings with AI context to a CSV file + * + * @param {object} options + * @param {Array} strings + */ function writeCsv(options, strings) { const csvFile = options.csvFile; @@ -99,9 +160,14 @@ function writeCsv(options, strings) { } } -// this function runs at the end of the context extraction process -// it goes through all extracted contexts, compile an array of contexts for every string -// if user wanted to confirm the context, it will ask for confirmation +/** + * This function runs at the end of the context extraction process + * it goes through all extracted contexts, compile an array of contexts for every string + * if user wanted to confirm the context, it will ask for confirmation + * + * @param {Array} strings + * @param {object} [stringsContext] + */ async function appendContext(strings, stringsContext) { for (const context of stringsContext?.contexts || []) { const string = strings.find((s) => s.id === context.id); @@ -116,7 +182,13 @@ async function appendContext(strings, stringsContext) { } } -// used to split strings into smaller chunks if user has many strings in their Crowdin project +/** + * Used to split strings into smaller chunks if user has many strings in their Crowdin project + * + * @param {Array} array + * @param {number} maxSize + * @returns + */ function splitArray(array, maxSize) { let result = []; for (let i = 0; i < array.length; i += maxSize) { @@ -125,8 +197,14 @@ function splitArray(array, maxSize) { return result; } -// screens the code file and filters out strings that are not present in the code -// this is to do not send unnecessary strings to the AI model and reduce chunking +/** + * Screens the code file and filters out strings that are not present in the code + * this is to do not send unnecessary strings to the AI model and reduce 
chunking + * + * @param {Array} crowdinStrings + * @param {string} content + * @param {string} screen + */ function filterStrings(crowdinStrings, content, screen) { return crowdinStrings.filter((crowdinString) => { if (screen === 'keys') { @@ -143,8 +221,17 @@ function filterStrings(crowdinStrings, content, screen) { }); } -// chunks the strings and code into smaller parts if needed and sends them to the AI model -async function chunkAndExtract(apiClient, options, content, crowdinStrings, fileName) { +/** + * Chunks the strings and code into smaller parts if needed and sends them to the AI model + * + * @param {object} param0 + * @param {object} param0.apiClient + * @param {object} param0.options + * @param {string} param0.content + * @param {Array} param0.crowdinStrings + * @param {string} param0.fileName + */ +async function chunkAndExtract({ apiClient, options, content, crowdinStrings, fileName }) { spinner.start(`Extracting context from ${chalk.green(fileName)}...`); // filter out strings that are not present in the code if the user wants to screen them @@ -163,21 +250,30 @@ async function chunkAndExtract(apiClient, options, content, crowdinStrings, file let chunks = [crowdinStrings]; let splitCount = 0; + let fullMessage = buildMessages({ options, crowdinStrings: chunks.flat(), content }); + // we first try to split the strings into smaller chunks to fit into the AI model context window. 
// splitting the code is less desirable - while (encodeChat(buildMessages(options, chunks.flat(), content)).length > AI_MODEL_CONTEXT_WINDOW && splitCount < 10) { + while (encodeChat(fullMessage).length > AI_MODEL_CONTEXT_WINDOW && splitCount < 10) { chunks = chunks.flatMap(chunk => splitArray(chunk, Math.ceil(chunk.length / 2))); splitCount++; } + fullMessage = buildMessages({ options, crowdinStrings: chunks.flat(), content }); + // if the strings + code are still too long, we split the code into smaller chunks - if (encodeChat(buildMessages(options, chunks.flat(), content)).length > AI_MODEL_CONTEXT_WINDOW) { - let contentChunks = content.match(new RegExp('.{1,' + Math.ceil(content.length / 2) + '}', 'g')); + if (encodeChat(fullMessage).length > AI_MODEL_CONTEXT_WINDOW) { + const contentChunks = content.match(new RegExp('.{1,' + Math.ceil(content.length / 2) + '}', 'g')) || []; for (let i = 0; i < chunks.length; i++) { for (let j = 0; j < contentChunks.length; j++) { spinner.start(`Chunk ${i + 1}/${chunks.length} and content chunk ${j + 1}/${contentChunks.length}...`); - result.push((await executePrompt(apiClient, options, buildMessages(options, chunks[i], contentChunks[j]))).contexts); + const promptResult = await executePrompt({ + apiClient, + options, + messages: buildMessages({ options, crowdinStrings: chunks[i], content: contentChunks[j] }), + }); + result.push(promptResult.contexts); + spinner.succeed(); } } } else { // if chunked strings fit into the AI model with full code, we send every strings chunk with the full code for (let chunk of chunks) { chunks.length > 1 && spinner.start(`Chunk ${chunks.indexOf(chunk) + 1}/${chunks.length}...`); - result.push((await executePrompt(apiClient, options, buildMessages(options, chunk, content))).contexts); + const promptResult = await executePrompt({ + apiClient, + options, + messages: buildMessages({ options, crowdinStrings: chunk, content }), 
+ }); + result.push(promptResult.contexts); + chunks.length > 1 && spinner.succeed(); } } @@ -199,36 +300,36 @@ async function chunkAndExtract(apiClient, options, content, crowdinStrings, file }; } -// builds the chat messages for the AI model -function buildMessages(options, crowdinStrings, content) { +/** + * Builds the chat messages for the AI model + * + * @param {object} param0 + * @param {object} param0.options + * @param {Array} param0.crowdinStrings + * @param {string} param0.content + */ +function buildMessages({ options, crowdinStrings, content }) { + const strings = JSON.stringify(crowdinStrings, null, 2); return [{ role: 'system', content: 'You are a helpful assistant who extracts context from code for UI labels.', }, { role: 'user', - content: getPrompt(options, JSON.stringify(crowdinStrings, null, 2), content), + content: getPrompt({ options, strings, content }), }]; } -// returns the prompt for the AI model, either default or provided by the user -function getPrompt(options, strings, content) { - const defaultPrompt = `Extract the context for the following UI labels. - - - Context is useful information for linguists or an AI translating these texts about how the text is used in the project they are localizing or when it appears in the UI. - - Only provide context if exact matches of the strings or keys are found in the code. - - If no matches are found, do not provide context. - - Only return context if you find a key or text usage in the code. - - Any context provided should start with 'Used as...' or 'Appears as...'. - - Always call the setContext tool to return the context. 
- -Strings: -%strings% - -Code: -%code%`; - - let prompt = defaultPrompt; +/** + * Returns the prompt for the AI model, either default or provided by the user + * + * @param {object} param0 + * @param {object} param0.options + * @param {string} param0.strings + * @param {string} param0.content + */ +function getPrompt({ options, strings, content }) { + let prompt = DEFAULT_PROMPT; if (options.promptFile) { try { @@ -246,22 +347,29 @@ Code: return prompt.replace('%strings%', strings).replace('%code%', content); } -// picks a preferred AI provider and executes the prompt -// returns an array of objects, every object is a string id and extracted context -async function executePrompt(apiClient, options, messages) { +/** + * Picks a preferred AI provider and executes the prompt + * Returns an array of objects, every object is a string id and extracted context + * + * @param {object} param0 + * @param {object} param0.apiClient + * @param {object} param0.options + * @param {Array} param0.messages + */ +async function executePrompt({ apiClient, options, messages }) { if (options.ai === 'crowdin') { let aiResponse; if (apiClient.isEnterprise) { aiResponse = (await apiClient.aiApi.createAiOrganizationProxyChatCompletion(options.crowdinAiId, { model: options.model, messages, - tools: getTools() + tools: AI_TOOLS })); } else { aiResponse = (await apiClient.aiApi.createAiUserProxyChatCompletion(apiClient.userId, options.crowdinAiId, { model: options.model, messages, - tools: getTools() + tools: AI_TOOLS })); } @@ -270,7 +378,7 @@ async function executePrompt(apiClient, options, messages) { } else { const openAiResponse = (await axios.post('https://api.openai.com/v1/chat/completions', { model: options.model, - tools: getTools(), + tools: AI_TOOLS, messages, }, { headers: { @@ -284,41 +392,8 @@ async function executePrompt(apiClient, options, messages) { } } -// returns the tools that are used in the AI model. 
this way we get more predictable results from the model -function getTools() { - return [{ - type: "function", - function: { - name: "setContext", - description: "Always use this function to return the context.", - parameters: { - type: "object", - properties: { - contexts: { - type: "array", - items: { - type: "object", - properties: { - id: { - type: "number", - description: "Key ID of the string. This is the ID of the string that you are providing context for." - }, - context: { - type: "string", - description: "Context of the string. This is the context that you are providing for the string." - } - }, - required: ["id", "context"] - }, - } - } - } - } - }]; -} - // main function that orchestrates the context extraction process -async function harvest(name, commandOptions, command) { +async function harvest(_name, commandOptions, _command) { try { const options = commandOptions.opts(); @@ -359,7 +434,11 @@ async function harvest(name, commandOptions, command) { path: 'croql' }] } else { - containers = await getCrowdinFiles(apiClient, options.project, options.crowdinFiles); + containers = await getCrowdinFiles({ + apiClient, + project: options.project, + filesPattern: options.crowdinFiles + }); } } } catch (error) { @@ -379,7 +458,15 @@ async function harvest(name, commandOptions, command) { let stringsBatch = []; try { spinner.start(`Loading strings from ${chalk.green(container.path || container.name)}`); - stringsBatch = await fetchCrowdinStrings(apiClient, options.project, isStringsProject, container, strings, options.croql); + const result = await fetchCrowdinStrings({ + apiClient, + project: options.project, + isStringsProject, + container, + croql: options.croql + }); + strings.push(...result.crowdinStrings); + stringsBatch = result.strings; spinner.succeed(); } catch (error) { spinner.fail(); @@ -401,7 +488,13 @@ async function harvest(name, commandOptions, command) { // extract the context from the code file try { - context = await 
chunkAndExtract(apiClient, options, content, stringsBatch, localFile); + context = await chunkAndExtract({ + apiClient, + options, + content, + crowdinStrings: stringsBatch, + fileName: localFile, + }); } catch (error) { console.error(`\nError extracting context from ${chalk.green(localFile)}: ${error}. Proceeding with other files...`); continue; @@ -424,7 +517,11 @@ async function harvest(name, commandOptions, command) { writeCsv(options, strings); } else if (options.output === 'crowdin') { spinner.start(`Updating Crowdin strings...`); - uploadAiStringsToCrowdin(apiClient, options.project, strings); + await uploadAiStringsToCrowdin({ + apiClient, + project: options.project, + strings + }); spinner.succeed(); } } catch (error) { diff --git a/src/reset.js b/src/reset.js index 15cddb3..cd5e1b5 100644 --- a/src/reset.js +++ b/src/reset.js @@ -1,9 +1,10 @@ +//@ts-check import ora from 'ora'; -import { getCrowdin, getCrowdinFiles, fetchCrowdinStrings } from './utils.js'; +import { getCrowdin, getCrowdinFiles, fetchCrowdinStrings, uploadWithoutAiStringsToCrowdin } from './utils.js'; const spinner = ora(); -async function reset(name, commandOptions, command) { +async function reset(_name, commandOptions, _command) { const options = commandOptions.opts(); const apiClient = await getCrowdin(options); @@ -27,7 +28,11 @@ async function reset(name, commandOptions, command) { if (isStringsProject) { containers = (await apiClient.sourceFilesApi.withFetchAll().listProjectBranches(options.project)).data.map(branch => branch.data); } else { - containers = await getCrowdinFiles(apiClient, options.project, options.crowdinFiles); + containers = await getCrowdinFiles({ + apiClient, + project: options.project, + filesPattern: options.crowdinFiles + }); } } catch (error) { spinner.fail(); @@ -39,7 +44,13 @@ async function reset(name, commandOptions, command) { for (const container of containers) { spinner.start(`Removing AI context from ${container.path || container.name}...`); try { - 
await fetchCrowdinStrings(apiClient, options.project, isStringsProject, container, strings); + const result = await fetchCrowdinStrings({ + apiClient, + project: options.project, + isStringsProject, + container, + }); + strings.push(...result.crowdinStrings); } catch (error) { spinner.fail(); console.error(`Error loading strings from ${container.path || container.name}: ${error}. Proceeding with other files...`); @@ -51,7 +62,11 @@ async function reset(name, commandOptions, command) { }); try { - updateStrings(apiClient, options.project, strings); + await uploadWithoutAiStringsToCrowdin({ + apiClient, + project: options.project, + strings, + }); } catch (error) { spinner.fail(); console.error(`Error updating strings: ${error}`); @@ -62,34 +77,4 @@ async function reset(name, commandOptions, command) { } } -async function updateStrings(apiClient, project, strings) { - const contextUpdateBatchRequest = []; - for (const string of strings) { - contextUpdateBatchRequest.push({ - op: 'replace', - path: `/${string.id}/context`, - value: removeAIContext(string.context), - }); - } - - await apiClient.sourceStringsApi.stringBatchOperations(project, contextUpdateBatchRequest); -} - -// Remove AI context from the string context -function removeAIContext(context) { - if(!context) return context; - - const aiContextSection = '\n\n✨ AI Context\n'; - const endAiContextSection = '\n✨ 🔚'; - - const aiContextIndex = context?.indexOf(aiContextSection); - const endAiContextIndex = context?.indexOf(endAiContextSection); - - if (aiContextIndex !== -1 && endAiContextIndex !== -1) { - return context.substring(0, aiContextIndex) + context.substring(endAiContextIndex + endAiContextSection.length); - } - - return context; -} - export default reset; \ No newline at end of file diff --git a/src/upload.js b/src/upload.js index 8d161bd..bc70be5 100644 --- a/src/upload.js +++ b/src/upload.js @@ -1,10 +1,11 @@ +//@ts-check import ora from 'ora'; import { getCrowdin, uploadAiStringsToCrowdin } from 
'./utils.js'; import csv from 'csvtojson'; const spinner = ora(); -async function upload(name, commandOptions, command) { +async function upload(_name, commandOptions, _command) { const options = commandOptions.opts(); spinner.start(`Connecting to Crowdin...`); @@ -13,10 +14,10 @@ async function upload(name, commandOptions, command) { try { spinner.start(`Reading the CSV file...`); - let data = await csv().fromFile(options.csvFile); + let strings = await csv().fromFile(options.csvFile); spinner.succeed(); - data = data.map((row) => { + strings = strings.map((row) => { return { id: row.id, context: row.context, @@ -25,7 +26,11 @@ async function upload(name, commandOptions, command) { }); spinner.start(`Uploading the reviewed context to Crowdin...`); - await uploadAiStringsToCrowdin(apiClient, options.project, data); + await uploadAiStringsToCrowdin({ + apiClient, + project: options.project, + strings, + }); spinner.succeed(); console.log(`✨ The reviewed context has been uploaded to Crowdin project.`); diff --git a/src/utils.js b/src/utils.js index f6de188..17ac0dd 100644 --- a/src/utils.js +++ b/src/utils.js @@ -1,9 +1,11 @@ +//@ts-check import crowdin from '@crowdin/crowdin-api-client'; import { minimatch } from 'minimatch'; // returns a Crowdin API client // this function looks for the .org property to determine if the client is for crowdin.com or CrowdIn Enterprise async function getCrowdin(options) { + //@ts-ignore const apiClient = new crowdin.default({ token: options.token, ...(options.org && { organization: options.org }), @@ -24,24 +26,40 @@ async function getCrowdin(options) { return apiClient; } -async function getCrowdinFiles(apiClient, project, filesPattern) { +/** + * @param {object} param0 + * @param {object} param0.apiClient + * @param {number} param0.project + * @param {string} param0.filesPattern + */ +async function getCrowdinFiles({ apiClient, project, filesPattern }) { let files = (await 
apiClient.sourceFilesApi.withFetchAll().listProjectFiles(project)).data.map(file => file.data); // filter out files from the list taht match the glob pattern in files variable - return files.filter((file) => { - return minimatch(file.path, filesPattern || '*', { - matchBase: true - }); - }).map((file) => { - return { - id: file.id, - path: file.path, - }; - }); + return files + .filter((file) => + minimatch(file.path, filesPattern || '*', { + matchBase: true + }) + ) + .map((file) => ( + { + id: file.id, + path: file.path, + } + )); } -async function fetchCrowdinStrings(apiClient, project, isStringsProject, container, strings, croql) { - let filter = {}; +/** + * @param {object} param0 + * @param {object} param0.apiClient + * @param {number} param0.project + * @param {boolean} param0.isStringsProject + * @param {object} param0.container + * @param {string} [param0.croql] + */ +async function fetchCrowdinStrings({ apiClient, project, isStringsProject, container, croql }) { + const filter = {}; if (isStringsProject) { filter.branchId = container.id; @@ -55,19 +73,23 @@ async function fetchCrowdinStrings(apiClient, project, isStringsProject, contain const crowdinStrings = (await apiClient.sourceStringsApi.withFetchAll().listProjectStrings(project, filter)).data.map((string) => string.data); - // merge the strings from the file or branch with the global strings array - strings.push(...crowdinStrings); - - return crowdinStrings.map((string) => { + const strings = crowdinStrings.map((string) => { return { id: string.id, text: string.text, key: string.identifier, }; }); + + return { crowdinStrings, strings }; } -// appends the AI extracted context to the existing context +/** + * Appends the AI extracted context to the existing context + * + * @param {string} context + * @param {string[]} aiContext + */ function appendAiContext(context, aiContext) { const aiContextSection = '\n\n✨ AI Context\n'; const endAiContextSection = '\n✨ 🔚'; @@ -82,8 +104,38 @@ function 
appendAiContext(context, aiContext) { return context + aiContextSection + aiContext.join('\n') + endAiContextSection; } -// updates strings in Crowdin with the AI extracted context -async function uploadAiStringsToCrowdin(apiClient, project, strings) { +/** + * Remove AI context from the string context + * + * @param {string} context + */ +function removeAIContext(context) { + if (!context) { + return context; + }; + + const aiContextSection = '\n\n✨ AI Context\n'; + const endAiContextSection = '\n✨ 🔚'; + + const aiContextIndex = context?.indexOf(aiContextSection); + const endAiContextIndex = context?.indexOf(endAiContextSection); + + if (aiContextIndex !== -1 && endAiContextIndex !== -1) { + return context.substring(0, aiContextIndex) + context.substring(endAiContextIndex + endAiContextSection.length); + } + + return context; +} + +/** + * Updates strings in Crowdin with the AI extracted context + * + * @param {object} param0 + * @param {object} param0.apiClient + * @param {number} param0.project + * @param {Array} param0.strings + */ +async function uploadAiStringsToCrowdin({ apiClient, project, strings }) { const stringsWithAiContext = strings.filter((string) => string?.aiContext?.length > 0); const contextUpdateBatchRequest = []; @@ -98,9 +150,31 @@ async function uploadAiStringsToCrowdin(apiClient, project, strings) { await apiClient.sourceStringsApi.stringBatchOperations(project, contextUpdateBatchRequest); } +/** + * Updates strings in Crowdin without the AI extracted context + * + * @param {object} param0 + * @param {object} param0.apiClient + * @param {number} param0.project + * @param {Array} param0.strings + */ +async function uploadWithoutAiStringsToCrowdin({ apiClient, project, strings }) { + const contextUpdateBatchRequest = []; + for (const string of strings) { + contextUpdateBatchRequest.push({ + op: 'replace', + path: `/${string.id}/context`, + value: removeAIContext(string.context), + }); + } + + await 
apiClient.sourceStringsApi.stringBatchOperations(project, contextUpdateBatchRequest); +} + export { getCrowdin, getCrowdinFiles, fetchCrowdinStrings, uploadAiStringsToCrowdin, + uploadWithoutAiStringsToCrowdin, }; \ No newline at end of file