From 76ea9d2e541b7f08ed8d6ad0865cbac398789479 Mon Sep 17 00:00:00 2001 From: Synthetic Goop Date: Thu, 25 Jan 2024 12:13:03 +0800 Subject: [PATCH 1/3] fix(chunker): Fix chunk length calculation for unicode characters --- packages/ssr/src/utils/chunker.ts | 62 ++++++++++++++++------- packages/ssr/tests/chunker.spec.ts | 79 ++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 17 deletions(-) diff --git a/packages/ssr/src/utils/chunker.ts b/packages/ssr/src/utils/chunker.ts index 2e63d49a..5fde738c 100644 --- a/packages/ssr/src/utils/chunker.ts +++ b/packages/ssr/src/utils/chunker.ts @@ -3,34 +3,62 @@ interface Chunk { value: string; } -function createChunkRegExp(chunkSize: number) { - return new RegExp('.{1,' + chunkSize + '}', 'g'); -} - const MAX_CHUNK_SIZE = 3180; -const MAX_CHUNK_REGEXP = createChunkRegExp(MAX_CHUNK_SIZE); /** * create chunks from a string and return an array of object */ export function createChunks(key: string, value: string, chunkSize?: number): Chunk[] { - const re = chunkSize !== undefined ? createChunkRegExp(chunkSize) : MAX_CHUNK_REGEXP; - // check the length of the string to work out if it should be returned or chunked - const chunkCount = Math.ceil(value.length / (chunkSize ?? MAX_CHUNK_SIZE)); + const resolvedChunkSize = chunkSize ?? MAX_CHUNK_SIZE; + + let encodedValue = encodeURIComponent(value); - if (chunkCount === 1) { + if (encodedValue.length <= resolvedChunkSize) { return [{ name: key, value }]; } - const chunks: Chunk[] = []; - // split string into a array based on the regex - const values = value.match(re); - values?.forEach((value, i) => { - const name = `${key}.${i}`; - chunks.push({ name, value }); - }); + const chunks = []; + + while (encodedValue.length > 0) { + let encodedChunkHead = encodedValue.slice(0, resolvedChunkSize); + + const lastEscapePos = encodedChunkHead.lastIndexOf('%'); + + // Check if the last escaped character is truncated. + if (lastEscapePos > resolvedChunkSize - 3) { + // If so, reslice the string to exclude the whole escape sequence. + // We only reduce the size of the string as the chunk must + // be smaller than the chunk size. + encodedChunkHead = encodedChunkHead.slice(0, lastEscapePos); + } + + let valueHead; + + // Check if the chunk was split along a valid unicode boundary. + while (encodedChunkHead.length > 0) { + try { + // Try to decode the chunk back and see if it is valid. + // Stop when the chunk is valid. + valueHead = decodeURIComponent(encodedChunkHead); + break; + } catch (error) { + if ( + error instanceof URIError && + encodedChunkHead.at(-3) === '%' && + encodedChunkHead.length > 3 + ) { + encodedChunkHead = encodedChunkHead.slice(0, encodedChunkHead.length - 3); + } else { + throw error; + } + } + } + + chunks.push(valueHead); + encodedValue = encodedValue.slice(encodedChunkHead.length); + } - return chunks; + return chunks.map((value, i) => ({ name: `${key}.${i}`, value })); } // Get fully constructed chunks diff --git a/packages/ssr/tests/chunker.spec.ts b/packages/ssr/tests/chunker.spec.ts index a9038fd0..f8b28c50 100644 --- a/packages/ssr/tests/chunker.spec.ts +++ b/packages/ssr/tests/chunker.spec.ts @@ -64,4 +64,83 @@ describe('chunker', () => { expect(len(`${key}=${DOUBLE_CHUNK_STRING}`)).toBe(7257); expect(combined).toBe(DOUBLE_CHUNK_STRING); }); + + it('should correctly break between unicode boundaries in escaped characters', () => { + const test = ' '; + const chunks = createChunks('key', test, 4); + expect(chunks).toEqual([ + { + name: 'key.0', + value: ' ' + }, + { + name: 'key.1', + value: ' ' + }, + { + name: 'key.2', + value: ' ' + } + ]); + + expect(chunks.map((char) => char.value).join('')).toEqual(test); + }); + + describe('should correctly break between unicode boundaries in long unicode', () => { + it('should correctly break between unicode boundaries in long unicode (4 bytes)', () => { + const test = '🤦🏻‍♂️'; + const chunksAtStartBorder = createChunks('key', test, 12); + const chunksAtEndBorder = createChunks('key', test, 17); + expect(chunksAtStartBorder).toEqual(chunksAtEndBorder); + expect(chunksAtStartBorder).toEqual([ + { + name: 'key.0', + value: '🤦' + }, + { + name: 'key.1', + value: '🏻' + }, + { + name: 'key.2', + value: '‍' + }, + { + name: 'key.3', + value: '♂' + }, + { + name: 'key.4', + value: '️' + } + ]); + expect(chunksAtStartBorder.map((char) => char.value).join('')).toEqual(test); + }); + + it('should correctly break between unicode boundaries in long unicode (5 bytes)', () => { + const test = '🤦🏻‍♂️'; + const chunksAtStartBorder = createChunks('key', test, 18); + const chunksAtEndBorder = createChunks('key', test, 20); + expect(chunksAtStartBorder).toEqual(chunksAtEndBorder); + expect(chunksAtStartBorder).toEqual([ + { + name: 'key.0', + value: '🤦' + }, + { + name: 'key.1', + value: '🏻' + }, + { + name: 'key.2', + value: '‍♂' + }, + { + name: 'key.3', + value: '️' + } + ]); + expect(chunksAtStartBorder.map((char) => char.value).join('')).toEqual(test); + }); + }); }); From f5f66fc38268db64641af0b61b16be7ef5f31cf3 Mon Sep 17 00:00:00 2001 From: Synthetic Goop Date: Sat, 27 Jan 2024 12:06:52 +0800 Subject: [PATCH 2/3] fix: Update types --- packages/ssr/src/utils/chunker.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/ssr/src/utils/chunker.ts b/packages/ssr/src/utils/chunker.ts index 5fde738c..8561de00 100644 --- a/packages/ssr/src/utils/chunker.ts +++ b/packages/ssr/src/utils/chunker.ts @@ -17,7 +17,7 @@ export function createChunks(key: string, value: string, chunkSize?: number): Ch return [{ name: key, value }]; } - const chunks = []; + const chunks: string[] = []; while (encodedValue.length > 0) { let encodedChunkHead = encodedValue.slice(0, resolvedChunkSize); @@ -32,7 +32,7 @@ export function createChunks(key: string, value: string, chunkSize?: number): Ch encodedChunkHead = encodedChunkHead.slice(0, lastEscapePos); } - let valueHead; + let valueHead: string; // Check if the chunk was split along a valid unicode boundary. while (encodedChunkHead.length > 0) { From 5251364c08a65c47f0f7bec7b8ba3f9c24c71066 Mon Sep 17 00:00:00 2001 From: Synthetic Goop Date: Sat, 27 Jan 2024 12:13:09 +0800 Subject: [PATCH 3/3] fix: Update types --- packages/ssr/src/utils/chunker.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ssr/src/utils/chunker.ts b/packages/ssr/src/utils/chunker.ts index 8561de00..9efcefe0 100644 --- a/packages/ssr/src/utils/chunker.ts +++ b/packages/ssr/src/utils/chunker.ts @@ -32,7 +32,7 @@ export function createChunks(key: string, value: string, chunkSize?: number): Ch encodedChunkHead = encodedChunkHead.slice(0, lastEscapePos); } - let valueHead: string; + let valueHead: string = ''; // Check if the chunk was split along a valid unicode boundary. while (encodedChunkHead.length > 0) {