diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js index 6799eb5dac..58d5cccd04 100644 --- a/packages/readabilityjs/Readability.js +++ b/packages/readabilityjs/Readability.js @@ -303,7 +303,7 @@ Readability.prototype = { if (!this._keepClasses) { // Remove classes. - this._cleanClasses(articleContent); + this._cleanElement(articleContent); } }, @@ -456,7 +456,7 @@ Readability.prototype = { * @param Element * @return void */ - _cleanClasses: function (node) { + _cleanElement: function (node) { if (node.className && node.className.startsWith && node.className.startsWith('_omnivore')) { return; } @@ -483,8 +483,10 @@ Readability.prototype = { node.removeAttribute("class"); } + this._removeAllEventHandlers(node) + for (node = node.firstElementChild; node; node = node.nextElementSibling) { - this._cleanClasses(node); + this._cleanElement(node); } }, @@ -546,7 +548,6 @@ Readability.prototype = { this._forEachNode(medias, function (media) { var src = media.getAttribute("src"); var poster = media.getAttribute("poster"); - var srcset = media.getAttribute("srcset"); if (src) { media.setAttribute("src", this.toAbsoluteURI(src)); @@ -558,6 +559,20 @@ Readability.prototype = { }); }, + // removes all the javascript event handlers from the supplied element + _removeAllEventHandlers(element) { + const attributes = element.attributes; + + // Iterate in reverse because removing attributes changes the length + for (let i = attributes.length - 1; i >= 0; i--) { + const attribute = attributes[i]; + // Check if the attribute starts with "on" (like "onload", "onerror", etc.) + if (attribute.name.startsWith('on')) { + element.removeAttribute(attribute.name); + } + } + }, + /** Creates imageproxy links for all article images with href source */ _createImageProxyLinks: function (articleContent) { if (this.createImageProxyUrl !== undefined) { diff --git a/packages/readabilityjs/test/test-pages/caixin/expected.html b/packages/readabilityjs/test/test-pages/caixin/expected.html index 1d85d1b14e..a2455e52a8 100644 --- a/packages/readabilityjs/test/test-pages/caixin/expected.html +++ b/packages/readabilityjs/test/test-pages/caixin/expected.html @@ -19,7 +19,7 @@
【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。
+【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。
途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。
diff --git a/packages/readabilityjs/test/test-readability.js b/packages/readabilityjs/test/test-readability.js index 05460f8b32..abcba77604 100644 --- a/packages/readabilityjs/test/test-readability.js +++ b/packages/readabilityjs/test/test-readability.js @@ -1,353 +1,417 @@ -var chai = require("chai"); -var sinon = require("sinon"); -var chaiAsPromised = require("chai-as-promised"); -const { parseHTML } = require("linkedom"); -const nock = require("nock"); +var chai = require('chai') +var sinon = require('sinon') +var chaiAsPromised = require('chai-as-promised') +const { parseHTML } = require('linkedom') +const nock = require('nock') -chai.use(chaiAsPromised); -chai.config.includeStack = true; -var expect = chai.expect; +chai.use(chaiAsPromised) +chai.config.includeStack = true +var expect = chai.expect -var Readability = require("../index").Readability; -var JSDOMParser = require("../JSDOMParser"); -var prettyPrint = require("./utils").prettyPrint; +var Readability = require('../index').Readability +var JSDOMParser = require('../JSDOMParser') +var prettyPrint = require('./utils').prettyPrint -const isOmnivore = process.env.IS_OMNIVORE; -var testPages = require("./utils").getTestPages(isOmnivore); +const isOmnivore = process.env.IS_OMNIVORE +var testPages = require('./utils').getTestPages(isOmnivore) function reformatError(err) { - var formattedError = new Error(err.message); - formattedError.stack = err.stack; - return formattedError; + var formattedError = new Error(err.message) + formattedError.stack = err.stack + return formattedError } function inOrderTraverse(fromNode) { if (fromNode.firstChild) { - return fromNode.firstChild; + return fromNode.firstChild } while (fromNode && !fromNode.nextSibling) { - fromNode = fromNode.parentNode; + fromNode = fromNode.parentNode } - return fromNode ? fromNode.nextSibling : null; + return fromNode ? fromNode.nextSibling : null } function inOrderIgnoreEmptyTextNodes(fromNode) { do { - fromNode = inOrderTraverse(fromNode); - } while (fromNode && fromNode.nodeType == 3 && !fromNode.textContent.trim()); - return fromNode; + fromNode = inOrderTraverse(fromNode) + } while (fromNode && fromNode.nodeType == 3 && !fromNode.textContent.trim()) + return fromNode } function traverseDOM(callback, expectedDOM, actualDOM) { - var actualNode = actualDOM.documentElement || actualDOM.childNodes[0]; - var expectedNode = expectedDOM.documentElement || expectedDOM.childNodes[0]; + var actualNode = actualDOM.documentElement || actualDOM.childNodes[0] + var expectedNode = expectedDOM.documentElement || expectedDOM.childNodes[0] while (actualNode || expectedNode) { // We'll stop if we don't have both actualNode and expectedNode if (!callback(actualNode, expectedNode)) { - break; + break } - actualNode = inOrderIgnoreEmptyTextNodes(actualNode); - expectedNode = inOrderIgnoreEmptyTextNodes(expectedNode); + actualNode = inOrderIgnoreEmptyTextNodes(actualNode) + expectedNode = inOrderIgnoreEmptyTextNodes(expectedNode) } } // Collapse subsequent whitespace like HTML: function htmlTransform(str) { - return str.replace(/\s+/g, " "); + return str.replace(/\s+/g, ' ') } -function runTestsWithItems(label, domGenerationFn, source, expectedContent, expectedMetadata, uri) { - describe(label, function() { - this.timeout(30000); +function runTestsWithItems( + label, + domGenerationFn, + source, + expectedContent, + expectedMetadata, + uri +) { + describe(label, function () { + this.timeout(30000) - var result; + var result - before(async function() { + before(async function () { try { - var doc = domGenerationFn(source); + var doc = domGenerationFn(source) // Provide one class name to preserve, which we know appears in a few // of the test documents. - var myReader = new Readability(doc, { classesToPreserve: ["caption"], url: uri }); - result = await myReader.parse(); + var myReader = new Readability(doc, { + classesToPreserve: ['caption'], + url: uri, + }) + result = await myReader.parse() } catch (err) { - throw reformatError(err); + throw reformatError(err) } - }); + }) - it("should return a result object", function() { - expect(result).to.include.keys("content", "title", "excerpt", "byline"); - }); + it('should return a result object', function () { + expect(result).to.include.keys('content', 'title', 'excerpt', 'byline') + }) - it("should extract expected content", function() { + it('should extract expected content', function () { function nodeStr(n) { if (!n) { - return "(no node)"; + return '(no node)' } if (n.nodeType == 3) { - return "#text(" + htmlTransform(n.textContent) + ")"; + return '#text(' + htmlTransform(n.textContent) + ')' } if (n.nodeType != 1) { - return "some other node type: " + n.nodeType + " with data " + n.data; + return 'some other node type: ' + n.nodeType + ' with data ' + n.data } - var rv = n.localName; + var rv = n.localName if (n.id) { - rv += "#" + n.id; + rv += '#' + n.id } if (n.className) { - rv += ".(" + n.className + ")"; + rv += '.(' + n.className + ')' } - return rv; + return rv } function genPath(node) { if (node.id) { - return "#" + node.id; + return '#' + node.id } - if (node.tagName == "BODY") { - return "body"; + if (node.tagName == 'BODY') { + return 'body' } - var parent = node.parentNode; - var parentPath = genPath(parent); - var index = Array.prototype.indexOf.call(parent.childNodes, node) + 1; - return parentPath + " > " + nodeStr(node) + ":nth-child(" + index + ")"; + var parent = node.parentNode + var parentPath = genPath(parent) + var index = Array.prototype.indexOf.call(parent.childNodes, node) + 1 + return parentPath + ' > ' + nodeStr(node) + ':nth-child(' + index + ')' } function findableNodeDesc(node) { - return genPath(node) + "(in: ``" + node.parentNode.innerHTML + "``)"; + return genPath(node) + '(in: ``' + node.parentNode.innerHTML + '``)' } function attributesForNode(node) { - return Array.from(node.attributes).map(function(attr) { - return attr.name + "=" + attr.value; - }).join(","); + return Array.from(node.attributes) + .map(function (attr) { + return attr.name + '=' + attr.value + }) + .join(',') } - - var actualDOM = domGenerationFn(prettyPrint(result.content)); - var expectedDOM = domGenerationFn(prettyPrint(expectedContent)); - traverseDOM(function(actualNode, expectedNode) { - if (actualNode && expectedNode) { - var actualDesc = nodeStr(actualNode); - var expectedDesc = nodeStr(expectedNode); - if (actualDesc != expectedDesc) { - expect(actualDesc, findableNodeDesc(actualNode)).eql(expectedDesc); - return false; - } - // Compare text for text nodes: - if (actualNode.nodeType == 3) { - var actualText = htmlTransform(actualNode.textContent); - var expectedText = htmlTransform(expectedNode.textContent); - expect(actualText, findableNodeDesc(actualNode)).eql(expectedText); - if (actualText != expectedText) { - return false; + var actualDOM = domGenerationFn(prettyPrint(result.content)) + var expectedDOM = domGenerationFn(prettyPrint(expectedContent)) + traverseDOM( + function (actualNode, expectedNode) { + if (actualNode && expectedNode) { + var actualDesc = nodeStr(actualNode) + var expectedDesc = nodeStr(expectedNode) + if (actualDesc != expectedDesc) { + expect(actualDesc, findableNodeDesc(actualNode)).eql(expectedDesc) + return false } - // Compare attributes for element nodes: - } else if (actualNode.nodeType == 1) { - var actualNodeDesc = attributesForNode(actualNode); - var expectedNodeDesc = attributesForNode(expectedNode); - var desc = "node " + nodeStr(actualNode) + " attributes (" + actualNodeDesc + ") should match (" + expectedNodeDesc + ") "; - expect(actualNode.attributes.length, desc).eql(expectedNode.attributes.length); - for (var i = 0; i < actualNode.attributes.length; i++) { - var attr = actualNode.attributes[i].name; - var actualValue = actualNode.getAttribute(attr); - var expectedValue = expectedNode.getAttribute(attr); - expect(expectedValue, "node (" + findableNodeDesc(actualNode) + ") attribute " + attr + " should match").eql(actualValue); + // Compare text for text nodes: + if (actualNode.nodeType == 3) { + var actualText = htmlTransform(actualNode.textContent) + var expectedText = htmlTransform(expectedNode.textContent) + expect(actualText, findableNodeDesc(actualNode)).eql(expectedText) + if (actualText != expectedText) { + return false + } + // Compare attributes for element nodes: + } else if (actualNode.nodeType == 1) { + var actualNodeDesc = attributesForNode(actualNode) + var expectedNodeDesc = attributesForNode(expectedNode) + var desc = + 'node ' + + nodeStr(actualNode) + + ' attributes (' + + actualNodeDesc + + ') should match (' + + expectedNodeDesc + + ') ' + expect(actualNode.attributes.length, desc).eql( + expectedNode.attributes.length + ) + for (var i = 0; i < actualNode.attributes.length; i++) { + var attr = actualNode.attributes[i].name + var actualValue = actualNode.getAttribute(attr) + var expectedValue = expectedNode.getAttribute(attr) + expect( + expectedValue, + 'node (' + + findableNodeDesc(actualNode) + + ') attribute ' + + attr + + ' should match' + ).eql(actualValue) + } } + } else { + expect( + nodeStr(actualNode), + 'Should have a node from both DOMs' + ).eql(nodeStr(expectedNode)) + return false } - } else { - expect(nodeStr(actualNode), "Should have a node from both DOMs").eql(nodeStr(expectedNode)); - return false; - } - return true; - }, actualDOM, expectedDOM); - }); - - it("should extract expected title", function() { - expect(result.title).eql(expectedMetadata.title); - }); - - it("should extract expected byline", function() { - expect(result.byline).eql(expectedMetadata.byline); - }); - - it("should extract expected excerpt", function() { - expect(result.excerpt).eql(expectedMetadata.excerpt); - }); - - it("should extract expected site name", function() { - expect(result.siteName).eql(expectedMetadata.siteName); - }); - - expectedMetadata.dir && it("should extract expected direction", function() { - expect(result.dir).eql(expectedMetadata.dir); - }); - }); + return true + }, + actualDOM, + expectedDOM + ) + }) + + it('should extract expected title', function () { + expect(result.title).eql(expectedMetadata.title) + }) + + it('should extract expected byline', function () { + expect(result.byline).eql(expectedMetadata.byline) + }) + + it('should extract expected excerpt', function () { + expect(result.excerpt).eql(expectedMetadata.excerpt) + }) + + it('should extract expected site name', function () { + expect(result.siteName).eql(expectedMetadata.siteName) + }) + + expectedMetadata.dir && + it('should extract expected direction', function () { + expect(result.dir).eql(expectedMetadata.dir) + }) + }) } function removeCommentNodesRecursively(node) { for (var i = node.childNodes.length - 1; i >= 0; i--) { - var child = node.childNodes[i]; + var child = node.childNodes[i] if (child.nodeType === child.COMMENT_NODE) { - node.removeChild(child); + node.removeChild(child) } else if (child.nodeType === child.ELEMENT_NODE) { - removeCommentNodesRecursively(child); + removeCommentNodesRecursively(child) } } } -describe("Readability API", function() { - describe("#constructor", function() { - var doc = new JSDOMParser().parse("