Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(marticle): updates a use case where ul nodes could contain spans and be within a div #934

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 254 additions & 0 deletions servers/parser-graphql-wrapper/src/marticle/marticleParser.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,260 @@ describe('MarticleParser', () => {
expect(res).toEqual(expected);
});

it('should parse unorded lists within a div, that has invalid data in the ul and return unmarseable', () => {
  // A stray <span> sits between the first and second <li> of a flat list.
  const input = `<div lang="en">
<ul>
<li>1-2.</li>
<span>test</span>
<li>2-2.</li>
<li>3-2.</li>
<li>4-2.</li>
<li>5-2.</li>
<li>6-2.</li>
</ul>
</div>`;
  // Every list item in this fixture is expected at the top level (level 0).
  const topLevelRow = (content: string) => ({ level: 0, content });

  const parsed = marticleParser.parse(input);

  // Expectation: all six items stay together in one bulleted list, and the
  // span is reported as a separate UnMarseable component after the list.
  expect(parsed).toEqual([
    {
      __typename: 'MarticleBulletedList',
      rows: ['1-2.', '2-2.', '3-2.', '4-2.', '5-2.', '6-2.'].map((text) =>
        topLevelRow(text),
      ),
    },
    {
      __typename: 'UnMarseable',
      html: '<span>test</span>',
    },
  ]);
});

it('should parse unorded lists within a div, that has invalid data in a nested ul and span after li and return unmarseable', () => {
const input = `<div lang="en">
<ul>
<li>1-2.</li>
<ul>
<li>1-2-a</li>
<span>test</span>
</ul>
<li>2-2.</li>
<li>3-2.</li>
<li>4-2.</li>
<li>5-2.</li>
<li>6-2.</li>
</ul>
</div>`;
const res = marticleParser.parse(input);
const expected = [
{
__typename: 'MarticleBulletedList',
rows: [
{
level: 0,
content: '1-2.',
},
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The schema doesn't allow returning `UnMarseable` inside list rows, so we'll have to update the caller function to either remove this entry or pop it out of the list.

__typename: 'UnMarseable',
html: `<li></li>`,
},
{
level: 1,
content: '1-2-a',
},
{
level: 0,
content: '2-2.',
},
{
level: 0,
content: '3-2.',
},
{
level: 0,
content: '4-2.',
},
{
level: 0,
content: '5-2.',
},
{
level: 0,
content: '6-2.',
},
],
},
];
expect(res).toEqual(expected);
});

it('should parse unorded lists within a div, that has invalid data in a nested ul and span before li and return unmarseable', () => {
  // The nested <ul> is a direct child of the outer <ul>, and a stray <span>
  // precedes its only <li>.
  const input = `<div lang="en">
<ul>
<li>1-2.</li>
<ul>
<span>test</span>
<li>1-2-a</li>
</ul>
<li>2-2.</li>
<li>3-2.</li>
<li>4-2.</li>
<li>5-2.</li>
<li>6-2.</li>
</ul>
</div>`;
  const row = (level: number, content: string) => ({ level, content });
  const trailingTopLevelRows = ['2-2.', '3-2.', '4-2.', '5-2.', '6-2.'].map(
    (text) => row(0, text),
  );

  const parsed = marticleParser.parse(input);

  // Expectation: a single bulleted list whose rows contain an UnMarseable
  // `<li></li>` entry between the first item and the nested (level 1) item.
  expect(parsed).toEqual([
    {
      __typename: 'MarticleBulletedList',
      rows: [
        row(0, '1-2.'),
        { __typename: 'UnMarseable', html: `<li></li>` },
        row(1, '1-2-a'),
        ...trailingTopLevelRows,
      ],
    },
  ]);
});

it('should parse unorded lists within a div, that has invalid data in a ul and return unmarseable', () => {
  // The stray <span> is the first child of the <ul>, ahead of every <li>.
  const input = `<div lang="en">
<ul>
<span>test</span>
<li>1-2.</li>
<li>2-2.</li>
<li>3-2.</li>
<li>4-2.</li>
<li>5-2.</li>
<li>6-2.</li>
</ul>
</div>`;
  const itemTexts = ['1-2.', '2-2.', '3-2.', '4-2.', '5-2.', '6-2.'];

  const parsed = marticleParser.parse(input);

  // Expectation: the six items form one bulleted list (all at level 0) and
  // the span follows it as a separate UnMarseable component.
  const bulletedList = {
    __typename: 'MarticleBulletedList',
    rows: itemTexts.map((content) => ({ level: 0, content })),
  };
  const unparsedSpan = {
    __typename: 'UnMarseable',
    html: '<span>test</span>',
  };
  expect(parsed).toEqual([bulletedList, unparsedSpan]);
});

it('should parse unorded lists within a div, that has invalid data in a nested ul and return unmarseable', () => {
  // The nested <ul> holds two <li> items with a stray <span> between them.
  const input = `<div lang="en">
<ul>
<li>1-2.</li>
<ul>
<li>1-2-a-i</li>
<span>test</span>
<li>1-2-a</li>
</ul>
<li>2-2.</li>
<li>3-2.</li>
<li>4-2.</li>
<li>5-2.</li>
<li>6-2.</li>
</ul>
</div>`;
  const rowsAt = (level: number, contents: string[]) =>
    contents.map((content) => ({ level, content }));

  const parsed = marticleParser.parse(input);

  // Expectation: one bulleted list; an UnMarseable `<li></li>` row sits
  // between the first top-level item and the two nested (level 1) items.
  expect(parsed).toEqual([
    {
      __typename: 'MarticleBulletedList',
      rows: [
        ...rowsAt(0, ['1-2.']),
        { __typename: 'UnMarseable', html: '<li></li>' },
        ...rowsAt(1, ['1-2-a-i', '1-2-a']),
        ...rowsAt(0, ['2-2.', '3-2.', '4-2.', '5-2.', '6-2.']),
      ],
    },
  ]);
});

it('should parse rogue <li>s', () => {
const input =
'<div>' +
Expand Down
94 changes: 60 additions & 34 deletions servers/parser-graphql-wrapper/src/marticle/marticleParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import turndownService from './turndown';
import TurndownService from 'turndown';
import { config } from './config';
import { serverLogger } from '@pocket-tools/ts-logger';
import { isArray } from 'node:util';

export const videoTypeMap = {
1: VideoType.Youtube,
Expand Down Expand Up @@ -94,10 +95,10 @@ const immediateComponents = [
// to represent it in a flat list ('splitting' the 'p' node into 2).
const eventualComponents = ['P', 'BLOCKQUOTE', 'LI', 'DIV'];

/**
 * Wraps markup that could not be converted into a Marticle component.
 *
 * @param html - the raw HTML string to carry through. A DOM node is also
 *   accepted, in which case its `outerHTML` is used.
 *   NOTE(review): the per-tag transformers built from this function appear to
 *   be invoked with a DOM node rather than a string — confirm against the
 *   call sites; accepting both keeps either calling convention working.
 * @returns an `UnMarseable` component whose `html` field is always a string.
 */
function unMarseableTransformer(html: string | Node): UnMarseable {
  const markup = typeof html === 'string' ? html : (html as Element).outerHTML;
  return {
    __typename: 'UnMarseable',
    html: markup,
  };
}
// Assign UnMarseable transformer to all UnMarseable tags
Expand All @@ -109,6 +110,21 @@ const unMarseableTransformers = unMarseableComponents.reduce(
{},
);

/**
 * Transformer for when there is an error processing a list element.
 *
 * Concatenates the `outerHTML` of the given node(s), detaches each of them
 * from its parent, and wraps the trimmed markup in an `UnMarseable` component.
 *
 * @param root - a single node, or an array of nodes, that failed to parse.
 * @returns an `UnMarseable` component carrying the collected markup.
 */
function listErrorTransformer(root: Node | Node[]): UnMarseable {
  // `Array.isArray` is the supported replacement for the deprecated
  // `util.isArray`, and it narrows the union type for the compiler.
  const nodes = Array.isArray(root) ? root : [root];
  let html = '';
  for (const node of nodes) {
    // Nodes without `outerHTML` contribute nothing, rather than the literal
    // text "undefined".
    html += (node as Element).outerHTML ?? '';
    // A node that is already detached has a null parent; nothing to remove.
    node.parentNode?.removeChild(node);
  }
  return unMarseableTransformer(html.trim());
}

// Methods for transforming a subtree of the DOM that represents
// an article into one or more MarticleComponents.
// To avoid many if/else statements, create a map of root tag
Expand Down Expand Up @@ -187,41 +203,51 @@ const transformers = {
// Lists can be broken up, so the transformer can return any kind
// of Marticle* component ( + lists).
// Kind of cheating on types for documentation purposes
// Transforms a <ul> subtree via `listTransformer`. If that throws, the error
// is logged and the list is returned as an UnMarseable component instead.
UL: (root: Node, article: Item): MarticleElement[] | UnMarseable => {
  try {
    const { output, aggFrom } = listTransformer(
      root,
      [],
      'UL',
      undefined,
      article,
    );
    // Result might contain rows that need to be aggregated into a single
    // MarticleBulletedList: everything from index `aggFrom` onwards is moved
    // out of `output` and appended back as one list component.
    if (aggFrom != null) {
      const aggOutput = output.splice(aggFrom) as ListElement[];
      output.push({
        __typename: 'MarticleBulletedList',
        rows: aggOutput,
      });
    }
    return output as MarticleElement[];
  } catch (err) {
    serverLogger.error('Error processing UL list', { item: article, err });
    // Falls back to the raw markup; this also detaches `root` from the DOM.
    return listErrorTransformer(root);
  }
},
// Transforms an <ol> subtree via `listTransformer`. If that throws, the error
// is logged and the list is returned as an UnMarseable component instead.
OL: (root: Node, article: Item): MarticleElement[] | UnMarseable => {
  try {
    const { output, aggFrom } = listTransformer(
      root,
      [],
      'OL',
      undefined,
      article,
    );
    // Rows from index `aggFrom` onwards are moved out of `output` and
    // appended back as a single MarticleNumberedList component.
    if (aggFrom != null) {
      const aggOutput = output.splice(aggFrom) as NumberedListElement[];
      output.push({
        __typename: 'MarticleNumberedList',
        rows: aggOutput,
      });
    }
    return output as MarticleElement[];
  } catch (err) {
    serverLogger.error('Error processing OL list', { item: article, err });
    // Falls back to the raw markup; this also detaches `root` from the DOM.
    return listErrorTransformer(root);
  }
},
LI: (
children: Node[],
Expand Down
Loading