Skip to content

Commit

Permalink
fix(chunker): Fix chunk length calculation for unicode characters
Browse files Browse the repository at this point in the history
  • Loading branch information
SyntheticGoop committed Jan 25, 2024
1 parent 39e301a commit 76ea9d2
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 17 deletions.
62 changes: 45 additions & 17 deletions packages/ssr/src/utils/chunker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,62 @@ interface Chunk {
value: string;
}

/**
 * Build a global regular expression that cuts a string into consecutive
 * pieces of at most `chunkSize` UTF-16 code units.
 *
 * The pattern uses `[\s\S]` instead of `.` on purpose: `.` does not match
 * line terminators, so `value.match(re)` would silently skip every `\n`,
 * `\r`, U+2028 and U+2029 and the joined pieces would no longer equal the
 * original value.
 *
 * @param chunkSize - Upper bound of a piece's length; interpolated into the
 *   `{1,n}` quantifier of the pattern.
 * @returns A new `RegExp` carrying the `g` flag.
 */
function createChunkRegExp(chunkSize: number): RegExp {
  return new RegExp(`[\\s\\S]{1,${chunkSize}}`, 'g');
}

// Default chunk size, applied when createChunks is called without an explicit chunkSize.
const MAX_CHUNK_SIZE = 3180;
// Pre-built chunking pattern for the default chunk size.
const MAX_CHUNK_REGEXP = createChunkRegExp(MAX_CHUNK_SIZE);

/**
 * Split a value into named chunks whose URI-encoded length fits a size limit.
 *
 * The limit is measured on `encodeURIComponent(value)`, not on the raw string,
 * so characters that expand into percent-escapes are accounted for. A chunk
 * boundary is never placed inside a `%XX` escape or inside a multi-byte UTF-8
 * sequence, which means every chunk value is a valid string on its own and
 * concatenating the chunk values in order reproduces `value`.
 *
 * @param key - Name of the single returned chunk, or the prefix of the
 *   `${key}.${index}` names when the value has to be split.
 * @param value - The raw (unencoded) string to split.
 * @param chunkSize - Maximum encoded length of one chunk. Defaults to
 *   `MAX_CHUNK_SIZE`.
 * @returns One `{ name: key, value }` entry when the encoded value fits in a
 *   single chunk, otherwise the ordered list of `{ name: key.i, value }` chunks.
 * @throws RangeError when `chunkSize` is too small to hold even one
 *   percent-escape, so no progress could be made.
 * @throws URIError when the encoded form of a single character is longer than
 *   `chunkSize`, or when `value` itself cannot be URI-encoded.
 */
export function createChunks(key: string, value: string, chunkSize?: number): Chunk[] {
  const resolvedChunkSize = chunkSize ?? MAX_CHUNK_SIZE;

  let encodedValue = encodeURIComponent(value);

  // The whole value fits: return it untouched under the plain key.
  if (encodedValue.length <= resolvedChunkSize) {
    return [{ name: key, value }];
  }

  const chunks: string[] = [];

  while (encodedValue.length > 0) {
    let encodedChunkHead = encodedValue.slice(0, resolvedChunkSize);

    const lastEscapePos = encodedChunkHead.lastIndexOf('%');

    // A `%` in the last two positions means the slice cut a `%XX` escape in
    // half. Drop the partial escape; the chunk may only shrink, never grow
    // past the size limit.
    if (lastEscapePos > resolvedChunkSize - 3) {
      encodedChunkHead = encodedChunkHead.slice(0, lastEscapePos);
    }

    let valueHead = '';

    // Back off one escape at a time until the head ends on a complete UTF-8
    // sequence, i.e. until it decodes cleanly.
    while (encodedChunkHead.length > 0) {
      try {
        valueHead = decodeURIComponent(encodedChunkHead);
        break;
      } catch (error) {
        if (
          error instanceof URIError &&
          encodedChunkHead.charAt(encodedChunkHead.length - 3) === '%' &&
          encodedChunkHead.length > 3
        ) {
          encodedChunkHead = encodedChunkHead.slice(0, -3);
        } else {
          throw error;
        }
      }
    }

    // An empty head consumes nothing from encodedValue, so continuing would
    // loop forever. Fail loudly instead.
    if (encodedChunkHead.length === 0) {
      throw new RangeError(
        `chunkSize ${resolvedChunkSize} is too small to hold the next encoded character`
      );
    }

    chunks.push(valueHead);
    encodedValue = encodedValue.slice(encodedChunkHead.length);
  }

  return chunks.map((chunkValue, i) => ({ name: `${key}.${i}`, value: chunkValue }));
}

// Get fully constructed chunks
Expand Down
79 changes: 79 additions & 0 deletions packages/ssr/tests/chunker.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,83 @@ describe('chunker', () => {
expect(len(`${key}=${DOUBLE_CHUNK_STRING}`)).toBe(7257);
expect(combined).toBe(DOUBLE_CHUNK_STRING);
});

it('should correctly break between unicode boundaries in escaped characters', () => {
  // Three spaces. Each one encodes to the 3-character escape `%20`, so with a
  // chunk size of 4 every chunk can hold exactly one escape and the slice
  // always lands in the middle of the next one.
  const test = '   ';
  const chunks = createChunks('key', test, 4);
  expect(chunks).toEqual([
    {
      name: 'key.0',
      value: ' '
    },
    {
      name: 'key.1',
      value: ' '
    },
    {
      name: 'key.2',
      value: ' '
    }
  ]);

  // Joining the chunk values must give back the original input.
  expect(chunks.map((char) => char.value).join('')).toEqual(test);
});

describe('should correctly break between unicode boundaries in long unicode', () => {
  // The input is a single emoji ZWJ sequence, written as code-point escapes so
  // that it survives any re-encoding of this file:
  //   U+1F926 face palm          -> 4 UTF-8 bytes -> 12 encoded characters
  //   U+1F3FB light skin tone    -> 4 UTF-8 bytes -> 12 encoded characters
  //   U+200D  zero width joiner  -> 3 UTF-8 bytes ->  9 encoded characters
  //   U+2642  male sign          -> 3 UTF-8 bytes ->  9 encoded characters
  //   U+FE0F  variation selector -> 3 UTF-8 bytes ->  9 encoded characters
  it('should correctly break between unicode boundaries in long unicode (4 bytes)', () => {
    const test = '\u{1F926}\u{1F3FB}\u200D\u2642\uFE0F';
    // 12 fits one 4-byte character exactly; 17 is the largest size that
    // still cannot hold two 3-byte characters, so both must chunk identically.
    const chunksAtStartBorder = createChunks('key', test, 12);
    const chunksAtEndBorder = createChunks('key', test, 17);
    expect(chunksAtStartBorder).toEqual(chunksAtEndBorder);
    expect(chunksAtStartBorder).toEqual([
      {
        name: 'key.0',
        value: '\u{1F926}'
      },
      {
        name: 'key.1',
        value: '\u{1F3FB}'
      },
      {
        name: 'key.2',
        value: '\u200D'
      },
      {
        name: 'key.3',
        value: '\u2642'
      },
      {
        name: 'key.4',
        value: '\uFE0F'
      }
    ]);
    expect(chunksAtStartBorder.map((char) => char.value).join('')).toEqual(test);
  });

  it('should correctly break between unicode boundaries in long unicode (5 bytes)', () => {
    const test = '\u{1F926}\u{1F3FB}\u200D\u2642\uFE0F';
    // From 18 up to 20 a chunk can hold two 3-byte characters but still not
    // two 4-byte characters, so the joiner and the male sign share a chunk.
    const chunksAtStartBorder = createChunks('key', test, 18);
    const chunksAtEndBorder = createChunks('key', test, 20);
    expect(chunksAtStartBorder).toEqual(chunksAtEndBorder);
    expect(chunksAtStartBorder).toEqual([
      {
        name: 'key.0',
        value: '\u{1F926}'
      },
      {
        name: 'key.1',
        value: '\u{1F3FB}'
      },
      {
        name: 'key.2',
        value: '\u200D\u2642'
      },
      {
        name: 'key.3',
        value: '\uFE0F'
      }
    ]);
    expect(chunksAtStartBorder.map((char) => char.value).join('')).toEqual(test);
  });
});
});

0 comments on commit 76ea9d2

Please sign in to comment.