From 9e4231fafd2691fc64443d164bbf26bdd5432249 Mon Sep 17 00:00:00 2001 From: Nosov Date: Thu, 15 Jun 2017 20:44:03 +0300 Subject: [PATCH 01/16] added wbr selfClosing tag --- index.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index.js b/index.js index 38d7e04..a39919b 100644 --- a/index.js +++ b/index.js @@ -476,7 +476,8 @@ var kSelfClosingElements = { input: true, area: true, br: true, - hr: true + hr: true, + wbr: true }; var kElementsClosedByOpening = { li: {li: true}, From 771fb7133faea81b82b32240b02f8c18b732b19b Mon Sep 17 00:00:00 2001 From: Nosov Date: Thu, 15 Jun 2017 20:46:31 +0300 Subject: [PATCH 02/16] updage version upto 1.0.2-wbr --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 5b4fb67..459debb 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "fast-html-parser", - "version": "1.0.1", + "version": "1.0.2-wbr", "description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.", "main": "index.js", "scripts": { From 860d4ad308b84cb67b145a6aafc67ea7524092c8 Mon Sep 17 00:00:00 2001 From: Nosov Date: Thu, 15 Jun 2017 21:02:53 +0300 Subject: [PATCH 03/16] added col selfClosing tag --- index.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index.js b/index.js index a39919b..6a02180 100644 --- a/index.js +++ b/index.js @@ -477,7 +477,8 @@ var kSelfClosingElements = { area: true, br: true, hr: true, - wbr: true + wbr: true, + col: true }; var kElementsClosedByOpening = { li: {li: true}, From 68e6a66c337880df024bd94bd486ed9800163e11 Mon Sep 17 00:00:00 2001 From: Nosov Date: Thu, 15 Jun 2017 21:05:21 +0300 Subject: [PATCH 04/16] update verion upto 1.0.3-col --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 459debb..c95eb4b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "fast-html-parser", - "version": "1.0.2-wbr", + "version": "1.0.3-col", "description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.", "main": "index.js", "scripts": { From 0f1c6a67756e14080ea7f2f7041285411d8b0e91 Mon Sep 17 00:00:00 2001 From: markitosha Date: Sat, 12 Aug 2017 12:15:00 +0300 Subject: [PATCH 05/16] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D0=BE=20=D1=80=D0=B0=D1=81=D0=BF=D0=BE=D0=B7=D0=BD?= =?UTF-8?q?=D0=B0=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20=D0=B0=D1=82=D1=80=D0=B8?= =?UTF-8?q?=D0=B1=D1=83=D1=82=D0=BE=D0=B2,=20=D0=BD=D0=B0=D1=87=D0=B8?= =?UTF-8?q?=D0=BD=D0=B0=D1=8E=D1=89=D0=B8=D1=85=D1=81=D1=8F=20=D0=BD=D0=B0?= =?UTF-8?q?=20=5F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.js b/index.js index 6a02180..0b2dbd9 100644 --- a/index.js +++ b/index.js @@ -371,7 +371,7 @@ $inherit(HTMLElement, Node, { return this._rawAttrs; var attrs = {}; if (this.rawAttrs) { - var re = /\b([a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; + var re = /\b([_a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; for (var match; match = re.exec(this.rawAttrs); ) attrs[match[1]] = match[3] || match[4] || match[5]; } From 9682fca964a63ba4deeff14a849e097ed32bb8f7 Mon Sep 17 00:00:00 2001 From: markitosha Date: Sat, 12 Aug 2017 12:55:50 +0300 Subject: [PATCH 06/16] =?UTF-8?q?=D0=9F=D0=B5=D1=80=D0=B5=D0=BF=D0=B8?= =?UTF-8?q?=D1=81=D0=B0=D0=BD=20=D0=BF=D0=B0=D1=80=D1=81=D0=B5=D1=80=20htm?= =?UTF-8?q?l=20=D1=82=D0=B5=D0=B3=D0=BE=D0=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- index.js | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/index.js b/index.js index 0b2dbd9..7e4cd1c 100644 --- a/index.js +++ b/index.js @@ -467,7 +467,61 @@ $define(Matcher, { } }); -var kMarkupPattern = /)-->|<(\/?)([a-z][a-z0-9]*)\s*([^>]*?)(\/?)>/ig; +var kMarkupPattern = (function () { + var lastIndex = 0; + + return { + lastIndex: lastIndex, + exec: function (str) { + var bracketStack = 0; + var readTagName = true; + var match = ['', '', '', '']; + match['input'] = str; + + for (var i = lastIndex; i < str.length; ++i) { + ++lastIndex; + switch (str[i]) { + case '<': + if (!bracketStack) { + match['index'] = i; + } + bracketStack++; + break; + case '/': + if (i > 0 && str[i - 1] === '<') { + match[1] = '/'; + } else if (i < str.length - 1 && str[i + 1] === '>') { + match[4] = '/'; + } + break; + case ' ': + if (!readTagName) { + match[3] += str[i]; + } else { + readTagName = false; + } + break; + case '>': + if (!(--bracketStack)) { + match[0] = str.slice(match['index'], i + 1); + return match; + } + break; + default: + if (readTagName) { + match[2] += str[i]; + } else { + match[3] += str[i]; + } + break; + } + } + lastIndex = 0; + return null; + } + } +})(); + var kAttributePattern = /\b(id|class)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; var kSelfClosingElements = { meta: true, @@ -527,6 +581,7 @@ module.exports = { options = options || {}; for (var match, text; match = kMarkupPattern.exec(data); ) { + console.log(match); if (lastTextPos > -1) { if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) { // if has content From 5c53cc052642a203107eda44612fa5d1b4fdfb58 Mon Sep 17 00:00:00 2001 From: markitosha Date: Sat, 12 Aug 2017 14:50:24 +0300 Subject: [PATCH 07/16] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=BE=20=D0=BF=D1=80=D0=BE=D1=85=D0=BE=D0=B6?= =?UTF-8?q?=D0=B4=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B2=D1=81=D0=B5=D1=85=20?= =?UTF-8?q?=D1=82=D0=B5=D1=81=D1=82=D0=BE=D0=B2=20=D0=BF=D0=B0=D1=80=D1=81?= =?UTF-8?q?=D0=B5=D1=80=D0=BE=D0=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- index.js | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/index.js b/index.js index 7e4cd1c..dde9ba6 100644 --- a/index.js +++ b/index.js @@ -1,4 +1,5 @@ require('apollojs'); +var equal = require('ramda').equals; var entities = require('entities'); @@ -471,19 +472,24 @@ var kMarkupPattern = (function () { var lastIndex = 0; return { - lastIndex: lastIndex, exec: function (str) { var bracketStack = 0; var readTagName = true; - var match = ['', '', '', '']; + var readAttributes = false; + var inTag = false; + var match = ['', '', '', '', '']; match['input'] = str; for (var i = lastIndex; i < str.length; ++i) { ++lastIndex; switch (str[i]) { case '<': + if (i < str.length - 1 && str[i + 1] === '!') { + break; + } if (!bracketStack) { match['index'] = i; + inTag = true; } bracketStack++; break; @@ -492,25 +498,30 @@ var kMarkupPattern = (function () { match[1] = '/'; } else if (i < str.length - 1 && str[i + 1] === '>') { match[4] = '/'; + } else if (inTag && readTagName) { + match[2] += str[i]; + } else if (inTag && readAttributes) { + match[3] += str[i]; } break; case ' ': - if (!readTagName) { + if (inTag && readAttributes) { match[3] += str[i]; - } else { + } else if (inTag) { readTagName = false; + readAttributes = true; } break; case '>': - if (!(--bracketStack)) { + if (bracketStack > 0 && !(--bracketStack)) { match[0] = str.slice(match['index'], i + 1); return match; } break; default: - if (readTagName) { + if (inTag && readTagName) { match[2] += str[i]; - } else { + } else if (inTag && readAttributes) { match[3] += str[i]; } break; @@ -518,6 +529,12 @@ var kMarkupPattern = (function () { } lastIndex = 0; return null; + }, + get lastIndex() { + return lastIndex; + }, + set lastIndex(newLastIndex) { + // lastIndex = newLastIndex; } } })(); @@ -581,7 +598,6 @@ module.exports = { options = options || {}; for (var match, text; match = kMarkupPattern.exec(data); ) { - console.log(match); if (lastTextPos > -1) { if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) { // if has content From 4a2b56ca6d4bc25e262d9dda32493321db7b10ee Mon Sep 17 00:00:00 2001 From: markitosha Date: Sat, 12 Aug 2017 14:58:36 +0300 Subject: [PATCH 08/16] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B0=20=D1=80=D0=B0=D0=B1=D0=BE=D1=82=D0=B0?= =?UTF-8?q?=20=D0=BF=D0=B0=D1=80=D1=81=D0=B5=D1=80=D0=B0=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D1=81=D0=BB=D1=83=D1=87=D0=B0=D1=8F=20=D1=81=20=D0=B2?= =?UTF-8?q?=D0=BB=D0=BE=D0=B6=D0=B5=D0=BD=D1=8B=D0=BC=D0=B8=20=D0=B2=20?= =?UTF-8?q?=D0=B0=D1=82=D1=80=D0=B8=D0=B1=D1=83=D1=82=D1=8B=20=D1=82=D0=B5?= =?UTF-8?q?=D0=B3=D0=B0=D0=BC=D0=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- index.js | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/index.js b/index.js index dde9ba6..6edce7b 100644 --- a/index.js +++ b/index.js @@ -490,13 +490,17 @@ var kMarkupPattern = (function () { if (!bracketStack) { match['index'] = i; inTag = true; + } else if (inTag && readTagName) { + match[2] += str[i]; + } else if (inTag && readAttributes) { + match[3] += str[i]; } bracketStack++; break; case '/': - if (i > 0 && str[i - 1] === '<') { + if (i - 1 === match['index']) { match[1] = '/'; - } else if (i < str.length - 1 && str[i + 1] === '>') { + } else if (bracketStack === 1 && i < str.length - 1 && str[i + 1] === '>') { match[4] = '/'; } else if (inTag && readTagName) { match[2] += str[i]; @@ -516,6 +520,10 @@ var kMarkupPattern = (function () { if (bracketStack > 0 && !(--bracketStack)) { match[0] = str.slice(match['index'], i + 1); return match; + } else if (inTag && readTagName) { + match[2] += str[i]; + } else if (inTag && readAttributes) { + match[3] += str[i]; } break; default: From 51ce7958bc1106f6aecb108256f556ecf7146586 Mon Sep 17 00:00:00 2001 From: markitosha Date: Sat, 12 Aug 2017 16:47:02 +0300 Subject: [PATCH 09/16] =?UTF-8?q?=20=D0=9F=D0=B0=D1=80=D1=81=D0=B5=D1=80?= =?UTF-8?q?=20=D0=BF=D0=B5=D1=80=D0=B5=D0=BF=D0=B8=D1=81=D0=B0=D0=BD=20?= =?UTF-8?q?=D0=B2=20=D1=82=D0=B5=D1=80=D0=BC=D0=B8=D0=BD=D0=B0=D1=85=20?= =?UTF-8?q?=D1=81=D0=BE=D1=81=D1=82=D0=BE=D1=8F=D0=BD=D0=B8=D0=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- index.js | 177 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 121 insertions(+), 56 deletions(-) diff --git a/index.js b/index.js index 6edce7b..d17cbbc 100644 --- a/index.js +++ b/index.js @@ -468,82 +468,147 @@ $define(Matcher, { } }); +// parser states +var INITIAL = 'INITIAL'; +var TAG_OPENED = 'TAG_OPENED'; +var READ_TAG_NAME = 'READ_TAG_NAME'; +var READ_ATTRIBUTES = 'READ_ATTRIBUTES'; +var IS_SELF_CLOSING = 'IS_SELF_CLOSING'; +var TAG_CLOSE = 'TAG_CLOSE'; + var kMarkupPattern = (function () { var lastIndex = 0; + function makeState(state, match, bracketStack ) { + return { + state: state, + match: match, + bracketStack: bracketStack + }; + } + + function initial(match, sym, index) { + if (sym === '<' ) { + match['index'] = index; + return makeState(TAG_OPENED, match, 1); + } + + return makeState(INITIAL, match, 0); + } + + function tagOpened(match, sym) { + if (sym === '!') { + return makeState(INITIAL, match, 0); + } + + if (sym === '/') { + match[1] = '/'; + } else { + match[2] += sym; + } + + return makeState(READ_TAG_NAME, match, 1); + } + + function readTagName(match, sym) { + switch (sym) { + case ' ': + return makeState(READ_ATTRIBUTES, match, 1); + case '/': + return makeState(IS_SELF_CLOSING, match, 1); + case '>': + return makeState(TAG_CLOSE, match, 1); + default: + match[2] += sym; + return makeState(READ_TAG_NAME, match, 1); + } + } + + function readAttributes(match, sym, bracketStack) { + switch (sym) { + case '/': + if (bracketStack === 1) { + return makeState(IS_SELF_CLOSING, match, bracketStack); + } + + return makeState(READ_ATTRIBUTES, match, bracketStack); + case '>': + if (--bracketStack) { + match[3] += sym; + return makeState(READ_ATTRIBUTES, match, bracketStack); + } + + return makeState(TAG_CLOSE, match, bracketStack); + case '<': + ++bracketStack; + default: + match[3] += sym; + return makeState(READ_ATTRIBUTES, match, bracketStack); + } + } + + function isSelfClosing(match, sym) { + if (sym === '>') { + match[4] = '/'; + return makeState(TAG_CLOSE, match, 0); + } + + match[3] += '/' + sym; + return makeState(READ_ATTRIBUTES, match, 1); + } + + function tagClose(match, str, index) { + lastIndex = index; + match[0] = str.slice(match['index'], index); + + return makeState(INITIAL, match, 0); + } + return { exec: function (str) { - var bracketStack = 0; - var readTagName = true; - var readAttributes = false; - var inTag = false; - var match = ['', '', '', '', '']; - match['input'] = str; + var state = { + state: INITIAL, + match: ['', '', '', '', ''], + bracketStack: 0 + }; + state.match['input'] = str; for (var i = lastIndex; i < str.length; ++i) { - ++lastIndex; - switch (str[i]) { - case '<': - if (i < str.length - 1 && str[i + 1] === '!') { - break; - } - if (!bracketStack) { - match['index'] = i; - inTag = true; - } else if (inTag && readTagName) { - match[2] += str[i]; - } else if (inTag && readAttributes) { - match[3] += str[i]; - } - bracketStack++; + switch (state.state) { + case INITIAL: + state = initial(state.match, str[i], i); break; - case '/': - if (i - 1 === match['index']) { - match[1] = '/'; - } else if (bracketStack === 1 && i < str.length - 1 && str[i + 1] === '>') { - match[4] = '/'; - } else if (inTag && readTagName) { - match[2] += str[i]; - } else if (inTag && readAttributes) { - match[3] += str[i]; - } + case TAG_OPENED: + state = tagOpened(state.match, str[i]); break; - case ' ': - if (inTag && readAttributes) { - match[3] += str[i]; - } else if (inTag) { - readTagName = false; - readAttributes = true; - } + case READ_TAG_NAME: + state = readTagName(state.match, str[i]); break; - case '>': - if (bracketStack > 0 && !(--bracketStack)) { - match[0] = str.slice(match['index'], i + 1); - return match; - } else if (inTag && readTagName) { - match[2] += str[i]; - } else if (inTag && readAttributes) { - match[3] += str[i]; - } + case READ_ATTRIBUTES: + state = readAttributes(state.match, str[i], state.bracketClose); break; - default: - if (inTag && readTagName) { - match[2] += str[i]; - } else if (inTag && readAttributes) { - match[3] += str[i]; - } + case IS_SELF_CLOSING: + state = isSelfClosing(state.match, str[i]); break; + case TAG_CLOSE: + state = tagClose(state.match, str, i); + return state.match; + default: + break; } } + if (state.state === TAG_CLOSE) { + state = tagClose(state.match, str, str.length); + return state.match; + } + lastIndex = 0; return null; }, get lastIndex() { return lastIndex; }, - set lastIndex(newLastIndex) { - // lastIndex = newLastIndex; - } + set lastIndex(newLastIndex) {} } })(); From a083180fc9b5cc32e6e86d453d65fb9c955712a2 Mon Sep 17 00:00:00 2001 From: markitosha Date: Sat, 12 Aug 2017 18:45:03 +0300 Subject: [PATCH 10/16] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B0=20=D0=BE=D1=88=D0=B8=D0=B1=D0=BA=D0=B8?= =?UTF-8?q?=20=D0=BF=D1=80=D0=B8=20=D0=BF=D0=B0=D1=80=D1=81=D0=B8=D0=BD?= =?UTF-8?q?=D0=B3=D0=B5=20=D1=82=D0=B5=D0=B3=D0=BE=D0=B2=20=D0=B2=20=D0=B0?= =?UTF-8?q?=D1=82=D1=82=D1=80=D0=B8=D0=B1=D1=83=D1=82=D0=B0=D1=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- index.js | 371 +------------------------------------------------------ 1 file changed, 6 insertions(+), 365 deletions(-) diff --git a/index.js b/index.js index d17cbbc..5b7f239 100644 --- a/index.js +++ b/index.js @@ -1,17 +1,12 @@ require('apollojs'); -var equal = require('ramda').equals; - var entities = require('entities'); /** * Node Class as base class for TextNode and HTMLElement. */ -function Node() { - -} -$declare(Node, { +function Node() {} -}); +$declare(Node, {}); $defenum(Node, { ELEMENT_NODE: 1, TEXT_NODE: 3 @@ -25,44 +20,13 @@ function TextNode(value) { this.rawText = value; } $inherit(TextNode, Node, { - /** * Node Type declaration. * @type {Number} */ - nodeType: Node.TEXT_NODE, - - /** - * Get unescaped text value of current node and its children. - * @return {string} text content - */ - get text() { - return entities.decodeHTML5(this.rawText); - }, - - /** - * Detect if the node contains only white space. - * @return {bool} - */ - get isWhitespace() { - return /^(\s| )*$/.test(this.rawText); - } - + nodeType: Node.TEXT_NODE }); -var kBlockElements = { - div: true, - p: true, - // ul: true, - // ol: true, - li: true, - // table: true, - // tr: true, - td: true, - section: true, - br: true -}; - /** * HTMLElement, which contains a set of children. * Note: this is a minimalist implementation, no complete tree @@ -75,30 +39,20 @@ var kBlockElements = { function HTMLElement(name, keyAttrs, rawAttrs) { this.tagName = name; this.rawAttrs = rawAttrs || ''; - // this.parentNode = null; this.childNodes = []; if (keyAttrs.id) this.id = keyAttrs.id; - if (keyAttrs.class) - this.classNames = keyAttrs.class.split(/\s+/); - else - this.classNames = []; } $inherit(HTMLElement, Node, { - /** * Node Type declaration. * @type {Number} */ nodeType: Node.ELEMENT_NODE, - /** * Get unescaped text value of current node and its children. * @return {string} text content */ - get text() { - return entities.decodeHTML5(this.rawText); - }, /** * Get escpaed (as-it) text value of current node and its children. @@ -111,216 +65,6 @@ $inherit(HTMLElement, Node, { return res; }, - /** - * Get structured Text (with '\n' etc.) - * @return {string} structured text - */ - get structuredText() { - var currentBlock = []; - var blocks = [currentBlock]; - function dfs(node) { - if (node.nodeType === Node.ELEMENT_NODE) { - if (kBlockElements[node.tagName]) { - if (currentBlock.length > 0) - blocks.push(currentBlock = []); - node.childNodes.forEach(dfs); - if (currentBlock.length > 0) - blocks.push(currentBlock = []); - } else { - node.childNodes.forEach(dfs); - } - } else if (node.nodeType === Node.TEXT_NODE) { - if (node.isWhitespace) { - // Whitespace node, postponed output - currentBlock.prependWhitespace = true; - } else { - var text = node.text; - if (currentBlock.prependWhitespace) { - text = ' ' + text; - currentBlock.prependWhitespace = false; - } - currentBlock.push(text); - } - } - } - dfs(this); - return blocks - .map(function(block) { - // Normalize each line's whitespace - return block.join('').trim().replace(/\s{2,}/g, ' '); - }) - .join('\n').trimRight(); - }, - - /** - * Trim element from right (in block) after seeing pattern in a TextNode. - * @param {RegExp} pattern pattern to find - * @return {HTMLElement} reference to current node - */ - trimRight: function(pattern) { - function dfs(node) { - for (var i = 0; i < node.childNodes.length; i++) { - var childNode = node.childNodes[i]; - if (childNode.nodeType === Node.ELEMENT_NODE) { - dfs(childNode); - } else { - var index = childNode.rawText.search(pattern); - if (index > -1) { - childNode.rawText = childNode.rawText.substr(0, index); - // trim all following nodes. - node.childNodes.length = i+1; - } - } - } - } - dfs(this); - return this; - }, - - /** - * Get DOM structure - * @return {string} strucutre - */ - get structure() { - var res = []; - var indention = 0; - function write(str) { - res.push(' '.repeat(indention) + str); - } - function dfs(node) { - var idStr = node.id ? ('#' + node.id) : ''; - var classStr = node.classNames.length ? ('.' + node.classNames.join('.')) : ''; - write(node.tagName + idStr + classStr); - indention++; - for (var i = 0; i < node.childNodes.length; i++) { - var childNode = node.childNodes[i]; - if (childNode.nodeType === Node.ELEMENT_NODE) { - dfs(childNode); - } else if (childNode.nodeType === Node.TEXT_NODE) { - if (!childNode.isWhitespace) - write('#text'); - } - } - indention--; - } - dfs(this); - return res.join('\n'); - }, - - /** - * Remove whitespaces in this sub tree. - * @return {HTMLElement} pointer to this - */ - removeWhitespace: function() { - var i = 0, o = 0; - for (; i < this.childNodes.length; i++) { - var node = this.childNodes[i]; - if (node.nodeType === Node.TEXT_NODE) { - if (node.isWhitespace) - continue; - node.rawText = node.rawText.trim(); - } else if (node.nodeType === Node.ELEMENT_NODE) { - node.removeWhitespace(); - } - this.childNodes[o++] = node; - } - this.childNodes.length = o; - return this; - }, - - /** - * Query CSS selector to find matching nodes. - * @param {string} selector Simplified CSS selector - * @param {Matcher} selector A Matcher instance - * @return {HTMLElement[]} matching elements - */ - querySelectorAll: function(selector) { - var matcher; - if (selector instanceof Matcher) { - matcher = selector; - matcher.reset(); - } else { - matcher = new Matcher(selector); - } - var res = []; - var stack = []; - for (var i = 0; i < this.childNodes.length; i++) { - stack.push([this.childNodes[i], 0, false]); - while (stack.length) { - var state = stack.back; - var el = state[0]; - if (state[1] === 0) { - // Seen for first time. - if (el.nodeType !== Node.ELEMENT_NODE) { - stack.pop(); - continue; - } - if (state[2] = matcher.advance(el)) { - if (matcher.matched) { - res.push(el); - // no need to go further. - matcher.rewind(); - stack.pop(); - continue; - } - } - } - if (state[1] < el.childNodes.length) { - stack.push([el.childNodes[state[1]++], 0, false]); - } else { - if (state[2]) - matcher.rewind(); - stack.pop(); - } - } - } - return res; - }, - - /** - * Query CSS Selector to find matching node. - * @param {string} selector Simplified CSS selector - * @param {Matcher} selector A Matcher instance - * @return {HTMLElement} matching node - */ - querySelector: function(selector) { - var matcher; - if (selector instanceof Matcher) { - matcher = selector; - matcher.reset(); - } else { - matcher = new Matcher(selector); - } - var stack = []; - for (var i = 0; i < this.childNodes.length; i++) { - stack.push([this.childNodes[i], 0, false]); - while (stack.length) { - var state = stack.back; - var el = state[0]; - if (state[1] === 0) { - // Seen for first time. - if (el.nodeType !== Node.ELEMENT_NODE) { - stack.pop(); - continue; - } - if (state[2] = matcher.advance(el)) { - if (matcher.matched) { - return el; - } - } - } - if (state[1] < el.childNodes.length) { - stack.push([el.childNodes[state[1]++], 0, false]); - } else { - if (state[2]) - matcher.rewind(); - stack.pop(); - } - } - } - return null; - }, - /** * Append a child node to childNodes * @param {Node} node node to append @@ -331,23 +75,6 @@ $inherit(HTMLElement, Node, { this.childNodes.push(node); return node; }, - - /** - * Get first child node - * @return {Node} first child node - */ - get firstChild() { - return this.childNodes.front; - }, - - /** - * Get last child node - * @return {Node} last child node - */ - get lastChild() { - return this.childNodes.back; - }, - /** * Get attributes * @return {Object} parsed and unescaped attributes @@ -379,94 +106,8 @@ $inherit(HTMLElement, Node, { this._rawAttrs = attrs; return attrs; } - -}); -$define(HTMLElement, { - __wrap: function(el) { - el.childNodes.forEach(function(node) { - if (node.rawText) { - $wrap(node, TextNode); - } else { - $wrap(node, HTMLElement); - } - }); - } -}); - -/** - * Cache to store generated match functions - * @type {Object} - */ -var pMatchFunctionCache = {}; - -/** - * Matcher class to make CSS match - * @param {string} selector Selector - */ -function Matcher(selector) { - this.matchers = selector.split(' ').map(function(matcher) { - if (pMatchFunctionCache[matcher]) - return pMatchFunctionCache[matcher]; - var parts = matcher.split('.'); - var tagName = parts[0]; - var classes = parts.slice(1).sort(); - var source = ''; - if (tagName && tagName != '*') { - if (tagName[0] == '#') - source += 'if (el.id != ' + JSON.stringify(tagName.substr(1)) + ') return false;'; - else - source += 'if (el.tagName != ' + JSON.stringify(tagName) + ') return false;'; - } - if (classes.length > 0) - source += 'for (var cls = ' + JSON.stringify(classes) + ', i = 0; i < cls.length; i++) if (el.classNames.indexOf(cls[i]) === -1) return false;'; - source += 'return true;'; - return pMatchFunctionCache[matcher] = new Function('el', source); - }); - this.nextMatch = 0; -} -$declare(Matcher, { - /** - * Trying to advance match pointer - * @param {HTMLElement} el element to make the match - * @return {bool} true when pointer advanced. - */ - advance: function(el) { - if (this.nextMatch < this.matchers.length && - this.matchers[this.nextMatch](el)) { - this.nextMatch++; - return true; - } - return false; - }, - /** - * Rewind the match pointer - */ - rewind: function() { - this.nextMatch--; - }, - /** - * Trying to determine if match made. - * @return {bool} true when the match is made - */ - get matched() { - return this.nextMatch == this.matchers.length; - }, - /** - * Rest match pointer. - * @return {[type]} [description] - */ - reset: function() { - this.nextMatch = 0; - } -}); -$define(Matcher, { - /** - * flush cache to free memory - */ - flushCache: function() { - pMatchFunctionCache = {}; - } }); +$define(HTMLElement, {}); // parser states var INITIAL = 'INITIAL'; @@ -531,6 +172,7 @@ var kMarkupPattern = (function () { return makeState(IS_SELF_CLOSING, match, bracketStack); } + match[3] += sym; return makeState(READ_ATTRIBUTES, match, bracketStack); case '>': if (--bracketStack) { @@ -585,7 +227,7 @@ var kMarkupPattern = (function () { state = readTagName(state.match, str[i]); break; case READ_ATTRIBUTES: - state = readAttributes(state.match, str[i], state.bracketClose); + state = readAttributes(state.match, str[i], state.bracketStack); break; case IS_SELF_CLOSING: state = isSelfClosing(state.match, str[i]); @@ -651,7 +293,6 @@ var kBlockTextElements = { */ module.exports = { - Matcher: Matcher, Node: Node, HTMLElement: HTMLElement, TextNode: TextNode, From 388133b292331b018127ca708226da3848030ba8 Mon Sep 17 00:00:00 2001 From: markitosha Date: Sat, 12 Aug 2017 19:21:22 +0300 Subject: [PATCH 11/16] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B0=20=D0=BE=D1=88=D0=B8=D0=B1=D0=BA=D0=B0?= =?UTF-8?q?=20=D1=81=D0=BE=20=D0=B7=D0=BD=D0=B0=D0=BA=D0=BE=D0=BC=20=D0=BC?= =?UTF-8?q?=D0=B5=D0=BD=D1=8C=D1=88=D0=B5=20=D0=B2=20=D1=81=D0=BA=D1=80?= =?UTF-8?q?=D0=B8=D0=BF=D1=82=D0=B0=D1=85=20=D0=B8=20=D1=81=D1=82=D0=B8?= =?UTF-8?q?=D0=BB=D1=8F=D1=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/html.js | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/html.js b/test/html.js index eaa22df..27c095f 100644 --- a/test/html.js +++ b/test/html.js @@ -143,18 +143,22 @@ describe('HTML Parser', function() { it('should parse "

.." very fast', function() { - for (var i = 0; i < 100; i++) - parseHTML('

'); + console.time('timer1'); + for (var i = 0; i < 100; i++) + parseHTML('

'); + console.timeEnd('timer1'); + console.log('\n'); }); it('should parse "

.." fast', function() { + console.time('timer'); for (var i = 0; i < 100; i++) parseHTML('

', { lowerCaseTagName: true }); - + console.timeEnd('timer'); }); }); From 2eb5755c6e7200720b2d5c0a9e751b048c7b3a3c Mon Sep 17 00:00:00 2001 From: markitosha Date: Sat, 12 Aug 2017 19:34:58 +0300 Subject: [PATCH 12/16] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B0=20=D0=BE=D1=88=D0=B8=D0=B1=D0=BA=D0=B0?= =?UTF-8?q?=20=D1=81=D0=BE=20=D0=B7=D0=BD=D0=B0=D0=BA=D0=BE=D0=BC=20=D0=BC?= =?UTF-8?q?=D0=B5=D0=BD=D1=8C=D1=88=D0=B5=20=D0=B2=20=D1=81=D0=BA=D1=80?= =?UTF-8?q?=D0=B8=D0=BF=D1=82=D0=B0=D1=85=20=D0=B8=20=D1=81=D1=82=D0=B8?= =?UTF-8?q?=D0=BB=D1=8F=D1=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- index.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/index.js b/index.js index 5b7f239..e09568b 100644 --- a/index.js +++ b/index.js @@ -250,7 +250,9 @@ var kMarkupPattern = (function () { get lastIndex() { return lastIndex; }, - set lastIndex(newLastIndex) {} + set lastIndex(newLastIndex) { + lastIndex = newLastIndex; + } } })(); From 0a311bc232872b36b9ae1dca41f8c84bbdd31902 Mon Sep 17 00:00:00 2001 From: markitosha Date: Sat, 12 Aug 2017 19:42:15 +0300 Subject: [PATCH 13/16] =?UTF-8?q?=D0=92=D0=BE=D0=B7=D0=B2=D1=80=D0=B0?= =?UTF-8?q?=D1=89=D0=B5=D0=BD=D1=8B=20=D1=84=D1=83=D0=BD=D0=BA=D1=86=D0=B8?= =?UTF-8?q?=D0=B8,=20=D0=BD=D0=B5=D0=BE=D0=B1=D1=85=D0=BE=D0=B4=D0=B8?= =?UTF-8?q?=D0=BC=D1=8B=D0=B5=20=D0=B4=D0=BB=D1=8F=20=D1=82=D0=B5=D1=81?= =?UTF-8?q?=D1=82=D0=BE=D0=B2.=20=D0=A3=D0=B4=D0=B0=D0=BB=D0=B5=D0=BD?= =?UTF-8?q?=D1=8B=20=D1=82=D0=B5=D1=81=D1=82=D1=8B=20=D0=B4=D0=BB=D1=8F=20?= =?UTF-8?q?=D1=83=D0=B4=D0=B0=D0=BB=D0=B5=D0=BD=D0=BD=D0=BE=D0=B3=D0=BE=20?= =?UTF-8?q?=D1=84=D1=83=D0=BD=D0=BA=D1=86=D0=B8=D0=BE=D0=BD=D0=B0=D0=BB?= =?UTF-8?q?=D0=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- index.js | 19 ++++++++- test/html.js | 108 --------------------------------------------------- 2 files changed, 18 insertions(+), 109 deletions(-) diff --git a/index.js b/index.js index e09568b..4ec2376 100644 --- a/index.js +++ b/index.js @@ -24,7 +24,10 @@ $inherit(TextNode, Node, { * Node Type declaration. * @type {Number} */ - nodeType: Node.TEXT_NODE + nodeType: Node.TEXT_NODE, + get text() { + return entities.decodeHTML5(this.rawText); + } }); /** @@ -53,6 +56,9 @@ $inherit(HTMLElement, Node, { * Get unescaped text value of current node and its children. * @return {string} text content */ + get text() { + return entities.decodeHTML5(this.rawText); + }, /** * Get escpaed (as-it) text value of current node and its children. @@ -75,6 +81,17 @@ $inherit(HTMLElement, Node, { this.childNodes.push(node); return node; }, + get firstChild() { + return this.childNodes.front; + }, + +/** + * Get last child node + * @return {Node} last child node + */ + get lastChild() { + return this.childNodes.back; + }, /** * Get attributes * @return {Object} parsed and unescaped attributes diff --git a/test/html.js b/test/html.js index 27c095f..88f2c65 100644 --- a/test/html.js +++ b/test/html.js @@ -6,60 +6,9 @@ var HTMLParser = require('../'); describe('HTML Parser', function() { - var Matcher = HTMLParser.Matcher; var HTMLElement = HTMLParser.HTMLElement; var TextNode = HTMLParser.TextNode; - describe('Matcher', function() { - - it('should match corrent elements', function() { - - var matcher = new Matcher('#id .a a.b *.a.b .a.b * a'); - var MatchesNothingButStarEl = new HTMLElement('_', {}); - var withIdEl = new HTMLElement('p', { id: 'id' }); - var withClassNameEl = new HTMLElement('a', { class: 'a b' }); - - // console.log(util.inspect([withIdEl, withClassNameEl], { - // showHidden: true, - // depth: null - // })); - - matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // #id - matcher.advance(withClassNameEl).should.not.be.ok; // #id - matcher.advance(withIdEl).should.be.ok; // #id - - matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a - matcher.advance(withIdEl).should.not.be.ok; // .a - matcher.advance(withClassNameEl).should.be.ok; // .a - - matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a.b - matcher.advance(withIdEl).should.not.be.ok; // a.b - matcher.advance(withClassNameEl).should.be.ok; // a.b - - matcher.advance(withIdEl).should.not.be.ok; // *.a.b - matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // *.a.b - matcher.advance(withClassNameEl).should.be.ok; // *.a.b - - matcher.advance(withIdEl).should.not.be.ok; // .a.b - matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a.b - matcher.advance(withClassNameEl).should.be.ok; // .a.b - - matcher.advance(withIdEl).should.be.ok; // * - matcher.rewind(); - matcher.advance(MatchesNothingButStarEl).should.be.ok; // * - matcher.rewind(); - matcher.advance(withClassNameEl).should.be.ok; // * - - matcher.advance(withIdEl).should.not.be.ok; // a - matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a - matcher.advance(withClassNameEl).should.be.ok; // a - - matcher.matched.should.be.ok; - - }); - - }); - var parseHTML = HTMLParser.parse; describe('parse()', function() { @@ -163,37 +112,8 @@ describe('HTML Parser', function() { }); - describe('TextNode', function() { - - describe('#isWhitespace', function() { - var node = new TextNode(''); - node.isWhitespace.should.be.ok; - node = new TextNode(' \t'); - node.isWhitespace.should.be.ok; - node = new TextNode(' \t  \t'); - node.isWhitespace.should.be.ok; - }); - - }); - describe('HTMLElement', function() { - describe('#removeWhitespace()', function() { - - it('should remove whitespaces while preserving nodes with content', function() { - - var root = parseHTML('

\r \n \t

123

'); - - var p = new HTMLElement('p', {}, ''); - p.appendChild(new HTMLElement('h5', {}, '')) - .appendChild(new TextNode('123')); - - root.firstChild.removeWhitespace().should.eql(p); - - }); - - }); - describe('#rawAttributes', function() { it('should return escaped attributes of the element', function() { @@ -226,34 +146,6 @@ describe('HTML Parser', function() { }); - describe('#querySelectorAll()', function() { - - it('should return correct elements in DOM tree', function() { - - var root = parseHTML('
'); - - root.querySelectorAll('#id').should.eql([root.firstChild]); - root.querySelectorAll('span.a').should.eql([root.firstChild.firstChild.firstChild]); - root.querySelectorAll('span.b').should.eql([root.firstChild.firstChild.firstChild]); - root.querySelectorAll('span.a.b').should.eql([root.firstChild.firstChild.firstChild]); - root.querySelectorAll('#id .b').should.eql([root.firstChild.firstChild.firstChild]); - root.querySelectorAll('#id span').should.eql(root.firstChild.firstChild.childNodes); - - }); - - }); - - describe('#structuredText', function() { - - it('should return correct structured text', function() { - - var root = parseHTML('o

a

b

c
'); - root.structuredText.should.eql('o\na\nb\nc'); - - }); - - }); - }); }); From 523e1f7af0a5fac1a544e5eea20db0170cd8c74c Mon Sep 17 00:00:00 2001 From: markitosha Date: Thu, 17 Aug 2017 17:59:10 +0300 Subject: [PATCH 14/16] =?UTF-8?q?=D0=A3=D0=B4=D0=B0=D0=BB=D0=B5=D0=BD?= =?UTF-8?q?=D1=8B=20=D1=82=D0=B0=D0=B9=D0=BC=D0=B5=D1=80=D1=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/html.js | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/html.js b/test/html.js index 88f2c65..b61e0c9 100644 --- a/test/html.js +++ b/test/html.js @@ -92,22 +92,17 @@ describe('HTML Parser', function() { it('should parse "

.." very fast', function() { - console.time('timer1'); for (var i = 0; i < 100; i++) parseHTML('

'); - console.timeEnd('timer1'); - console.log('\n'); }); it('should parse "

.." fast', function() { - console.time('timer'); for (var i = 0; i < 100; i++) parseHTML('

', { lowerCaseTagName: true }); - console.timeEnd('timer'); }); }); From d9fa9788388646ee51da3274c99844326a9d38f6 Mon Sep 17 00:00:00 2001 From: markitosha Date: Sat, 19 Aug 2017 11:55:16 +0300 Subject: [PATCH 15/16] =?UTF-8?q?=D0=A0=D0=B5=D1=84=D0=B0=D0=BA=D1=82?= =?UTF-8?q?=D0=BE=D1=80=D0=B8=D0=BD=D0=B3=20=D0=B8=20=D0=BA=D0=BE=D0=BC?= =?UTF-8?q?=D0=BC=D0=B5=D0=BD=D1=82=D0=B0=D1=80=D0=B8=D0=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- index.js | 542 ++++++++++++++++++++++++++++----------------------- test/html.js | 8 +- 2 files changed, 298 insertions(+), 252 deletions(-) diff --git a/index.js b/index.js index 4ec2376..836a8f1 100644 --- a/index.js +++ b/index.js @@ -4,12 +4,13 @@ var entities = require('entities'); /** * Node Class as base class for TextNode and HTMLElement. */ -function Node() {} +function Node() { +} $declare(Node, {}); $defenum(Node, { - ELEMENT_NODE: 1, - TEXT_NODE: 3 + ELEMENT_NODE: 1, + TEXT_NODE: 3 }); /** @@ -17,17 +18,17 @@ $defenum(Node, { * @param {string} value [description] */ function TextNode(value) { - this.rawText = value; + this.rawText = value; } $inherit(TextNode, Node, { - /** - * Node Type declaration. - * @type {Number} - */ - nodeType: Node.TEXT_NODE, - get text() { - return entities.decodeHTML5(this.rawText); - } + /** + * Node Type declaration. + * @type {Number} + */ + nodeType: Node.TEXT_NODE, + get text() { + return entities.decodeHTML5(this.rawText); + } }); /** @@ -40,93 +41,99 @@ $inherit(TextNode, Node, { * @param {Object} rawAttrs attributes in string */ function HTMLElement(name, keyAttrs, rawAttrs) { - this.tagName = name; - this.rawAttrs = rawAttrs || ''; - this.childNodes = []; - if (keyAttrs.id) - this.id = keyAttrs.id; + this.tagName = name; + this.rawAttrs = rawAttrs || ''; + this.childNodes = []; + if (keyAttrs.id) { + this.id = keyAttrs.id; + } } $inherit(HTMLElement, Node, { - /** - * Node Type declaration. - * @type {Number} - */ - nodeType: Node.ELEMENT_NODE, - /** - * Get unescaped text value of current node and its children. - * @return {string} text content - */ - get text() { - return entities.decodeHTML5(this.rawText); - }, - - /** - * Get escpaed (as-it) text value of current node and its children. - * @return {string} text content - */ - get rawText() { - var res = ''; - for (var i = 0; i < this.childNodes.length; i++) - res += this.childNodes[i].rawText; - return res; - }, - - /** - * Append a child node to childNodes - * @param {Node} node node to append - * @return {Node} node appended - */ - appendChild: function(node) { - // node.parentNode = this; - this.childNodes.push(node); - return node; - }, - get firstChild() { - return this.childNodes.front; - }, - -/** - * Get last child node - * @return {Node} last child node - */ - get lastChild() { - return this.childNodes.back; - }, - /** - * Get attributes - * @return {Object} parsed and unescaped attributes - */ - get attributes() { - if (this._attrs) - return this._attrs; - this._attrs = {}; - var attrs = this.rawAttributes; - for (var key in attrs) { - this._attrs[key] = entities.decodeHTML5(attrs[key]); - } - return this._attrs; - }, - - /** - * Get escaped (as-it) attributes - * @return {Object} parsed attributes - */ - get rawAttributes() { - if (this._rawAttrs) - return this._rawAttrs; - var attrs = {}; - if (this.rawAttrs) { - var re = /\b([_a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; - for (var match; match = re.exec(this.rawAttrs); ) - attrs[match[1]] = match[3] || match[4] || match[5]; + /** + * Node Type declaration. + * @type {Number} + */ + nodeType: Node.ELEMENT_NODE, + /** + * Get unescaped text value of current node and its children. + * @return {string} text content + */ + get text() { + return entities.decodeHTML5(this.rawText); + }, + + /** + * Get escpaed (as-it) text value of current node and its children. + * @return {string} text content + */ + get rawText() { + var res = ''; + for (var i = 0; i < this.childNodes.length; i++) { + res += this.childNodes[i].rawText; + } + return res; + }, + + /** + * Append a child node to childNodes + * @param {Node} node node to append + * @return {Node} node appended + */ + appendChild: function (node) { + // node.parentNode = this; + this.childNodes.push(node); + return node; + }, + + get firstChild() { + return this.childNodes.front; + }, + + /** + * Get last child node + * @return {Node} last child node + */ + get lastChild() { + return this.childNodes.back; + }, + /** + * Get attributes + * @return {Object} parsed and unescaped attributes + */ + get attributes() { + if (this._attrs) { + return this._attrs; + } + this._attrs = {}; + var attrs = this.rawAttributes; + for (var key in attrs) { + this._attrs[key] = entities.decodeHTML5(attrs[key]); + } + return this._attrs; + }, + + /** + * Get escaped (as-it) attributes + * @return {Object} parsed attributes + */ + get rawAttributes() { + if (this._rawAttrs) { + return this._rawAttrs; + } + var attrs = {}; + if (this.rawAttrs) { + var re = /\b([_a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; + for (var match; match = re.exec(this.rawAttrs);) { + attrs[match[1]] = match[3] || match[4] || match[5]; + } + } + this._rawAttrs = attrs; + return attrs; } - this._rawAttrs = attrs; - return attrs; - } }); $define(HTMLElement, {}); -// parser states +// parser statuses var INITIAL = 'INITIAL'; var TAG_OPENED = 'TAG_OPENED'; var READ_TAG_NAME = 'READ_TAG_NAME'; @@ -134,19 +141,43 @@ var READ_ATTRIBUTES = 'READ_ATTRIBUTES'; var IS_SELF_CLOSING = 'IS_SELF_CLOSING'; var TAG_CLOSE = 'TAG_CLOSE'; +// indexes for match +var TAG_FULL = 0; +var TAG_CLOSING_CHAR = 1; +var TAG_NAME = 2; +var ATTRIBUTES = 3; +var SELF_CLOSING_CHAR = 4; + +/** + * Function, which works like RegExp (contains exec function and lastIndex param) + */ var kMarkupPattern = (function () { var lastIndex = 0; - function makeState(state, match, bracketStack ) { - return { - state: state, - match: match, - bracketStack: bracketStack - }; + /** + * Make state object from params + * @param {string} status next parcer status + * @param {Array} match array which contains parsed values + * @param {Number} openedBracketCounter counter of unclosed tag brackets + */ + function makeState(status, match, openedBracketCounter) { + return { + status: status, + match: match, + openedBracketCounter: openedBracketCounter + }; } + // Status functions that determine the processing of the next character + + /** + * Initial status function + * @param {Array} match array which contains parsed values + * @param {string} sym processed character + * @param {Number} index index of processed character + */ function initial(match, sym, index) { - if (sym === '<' ) { + if (sym === '<') { match['index'] = index; return makeState(TAG_OPENED, match, 1); } @@ -154,20 +185,26 @@ var kMarkupPattern = (function () { return makeState(INITIAL, match, 0); } + /** + * Processing of the character immediately following the opening bracket + * @param {Array} match array which contains parsed values + * @param {string} sym processed character + */ function tagOpened(match, sym) { - if (sym === '!') { - return makeState(INITIAL, match, 0); - } - if (sym === '/') { - match[1] = '/'; + match[TAG_CLOSING_CHAR] = '/'; } else { - match[2] += sym; + match[TAG_NAME] += sym; } return makeState(READ_TAG_NAME, match, 1); } + /** + * Processing of the characters in tag name + * @param {Array} match array which contains parsed values + * @param {string} sym processed character + */ function readTagName(match, sym) { switch (sym) { case ' ': @@ -177,63 +214,82 @@ var kMarkupPattern = (function () { case '>': return makeState(TAG_CLOSE, match, 1); default: - match[2] += sym; + match[TAG_NAME] += sym; return makeState(READ_TAG_NAME, match, 1); } } - function readAttributes(match, sym, bracketStack) { + /** + * Processing of the characters in attributes + * @param {Array} match array which contains parsed values + * @param {string} sym processed character + * @param {Number} openedBracketCounter counter of unclosed tag brackets + */ + function readAttributes(match, sym, openedBracketCounter) { switch (sym) { case '/': - if (bracketStack === 1) { - return makeState(IS_SELF_CLOSING, match, bracketStack); + if (openedBracketCounter === 1) { + return makeState(IS_SELF_CLOSING, match, openedBracketCounter); } - match[3] += sym; - return makeState(READ_ATTRIBUTES, match, bracketStack); + match[ATTRIBUTES] += sym; + return makeState(READ_ATTRIBUTES, match, openedBracketCounter); case '>': - if (--bracketStack) { - match[3] += sym; - return makeState(READ_ATTRIBUTES, match, bracketStack); + if (--openedBracketCounter) { + match[ATTRIBUTES] += sym; + return makeState(READ_ATTRIBUTES, match, openedBracketCounter); } - return makeState(TAG_CLOSE, match, bracketStack); + return makeState(TAG_CLOSE, match, openedBracketCounter); case '<': - ++bracketStack; + ++openedBracketCounter; + // without break, it's not a mistake default: - match[3] += sym; - return makeState(READ_ATTRIBUTES, match, bracketStack); + match[ATTRIBUTES] += sym; + return makeState(READ_ATTRIBUTES, match, openedBracketCounter); } } + /** + * Processing of the character immediately following the character '/' + * @param {Array} match array which contains parsed values + * @param {string} sym processed character + */ function isSelfClosing(match, sym) { if (sym === '>') { - match[4] = '/'; + match[SELF_CLOSING_CHAR] = '/'; return makeState(TAG_CLOSE, match, 0); } - match[3] += '/' + sym; + match[ATTRIBUTES] += '/' + sym; return makeState(READ_ATTRIBUTES, match, 1); } + /** + * Final processing of the string + * @param {Array} match array which contains parsed values + * @param {string} str processed string + * @param {Number} index index of processed character + */ function tagClose(match, str, index) { lastIndex = index; - match[0] = str.slice(match['index'], index); + match[TAG_FULL] = str.slice(match['index'], index); return makeState(INITIAL, match, 0); } return { exec: function (str) { + // state.match :: [TAG_FULL, TAG_CLOSING_CHAR, TAG_NAME, ATTRIBUTES, SELF_CLOSING_CHAR] var state = { - state: INITIAL, + status: INITIAL, match: ['', '', '', '', ''], - bracketStack: 0 + openedBracketCounter: 0 }; state.match['input'] = str; for (var i = lastIndex; i < str.length; ++i) { - switch (state.state) { + switch (state.status) { case INITIAL: state = initial(state.match, str[i], i); break; @@ -244,7 +300,7 @@ var kMarkupPattern = (function () { state = readTagName(state.match, str[i]); break; case READ_ATTRIBUTES: - state = readAttributes(state.match, str[i], state.bracketStack); + state = readAttributes(state.match, str[i], state.openedBracketCounter); break; case IS_SELF_CLOSING: state = isSelfClosing(state.match, str[i]); @@ -253,10 +309,10 @@ var kMarkupPattern = (function () { state = tagClose(state.match, str, i); return state.match; default: - break; + break; } } - if (state.state === TAG_CLOSE) { + if (state.status === TAG_CLOSE) { state = tagClose(state.match, str, str.length); return state.match; } @@ -265,7 +321,7 @@ var kMarkupPattern = (function () { return null; }, get lastIndex() { - return lastIndex; + return lastIndex; }, set lastIndex(newLastIndex) { lastIndex = newLastIndex; @@ -275,36 +331,36 @@ var kMarkupPattern = (function () { var kAttributePattern = /\b(id|class)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; var kSelfClosingElements = { - meta: true, - img: true, - link: true, - input: true, - area: true, - br: true, - hr: true, - wbr: true, - col: true + meta: true, + img: true, + link: true, + input: true, + area: true, + br: true, + hr: true, + wbr: true, + col: true }; var kElementsClosedByOpening = { - li: {li: true}, - p: {p: true, div: true}, - td: {td: true, th: true}, - th: {td: true, th: true} + li: {li: true}, + p: {p: true, div: true}, + td: {td: true, th: true}, + th: {td: true, th: true} }; var kElementsClosedByClosing = { - li: {ul: true, ol: true}, - a: {div: true}, - b: {div: true}, - i: {div: true}, - p: {div: true}, - td: {tr: true, table: true}, - th: {tr: true, table: true} + li: {ul: true, ol: true}, + a: {div: true}, + b: {div: true}, + i: {div: true}, + p: {div: true}, + td: {tr: true, table: true}, + th: {tr: true, table: true} }; var kBlockTextElements = { - script: true, - noscript: true, - style: true, - pre: true + script: true, + noscript: true, + style: true, + pre: true }; /** @@ -312,102 +368,96 @@ var kBlockTextElements = { */ module.exports = { - Node: Node, - HTMLElement: HTMLElement, - TextNode: TextNode, - - /** - * Parse a chuck of HTML source. - * @param {string} data html - * @return {HTMLElement} root element - */ - parse: function(data, options) { - - var root = new HTMLElement(null, {}); - var currentParent = root; - var stack = [root]; - var lastTextPos = -1; - - options = options || {}; - - for (var match, text; match = kMarkupPattern.exec(data); ) { - if (lastTextPos > -1) { - if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) { - // if has content - text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length); - currentParent.appendChild(new TextNode(text)); - } - } - lastTextPos = kMarkupPattern.lastIndex; - if (match[0][1] == '!') { - // this is a comment - continue; - } - if (options.lowerCaseTagName) - match[2] = match[2].toLowerCase(); - if (!match[1]) { - // not or ... - var closeMarkup = ''; - var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex); - if (options[match[2]]) { - if (index == -1) { - // there is no matching ending for the text element. - text = data.substr(kMarkupPattern.lastIndex); - } else { - text = data.substring(kMarkupPattern.lastIndex, index); + Node: Node, + HTMLElement: HTMLElement, + TextNode: TextNode, + + /** + * Parse a chuck of HTML source. + * @param {string} data html + * @return {HTMLElement} root element + */ + parse: function (data, options) { + var root = new HTMLElement(null, {}); + var currentParent = root; + var stack = [root]; + var lastTextPos = -1; + + options = options || {}; + + for (var match, text; match = kMarkupPattern.exec(data);) { + if (lastTextPos > -1 && (lastTextPos + match[TAG_FULL].length < kMarkupPattern.lastIndex)) { + // if has content + text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length); + currentParent.appendChild(new TextNode(text)); } - if (text.length > 0) - currentParent.appendChild(new TextNode(text)); - } - if (index == -1) { - lastTextPos = kMarkupPattern.lastIndex = data.length + 1; - } else { - lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length; - match[1] = true; - } - } - } - if (match[1] || match[4] || - kSelfClosingElements[match[2]]) { - // or
etc. - while (true) { - if (currentParent.tagName == match[2]) { - stack.pop(); - currentParent = stack.back; - break; - } else { - // Trying to close current tag, and move on - if (kElementsClosedByClosing[currentParent.tagName]) { - if (kElementsClosedByClosing[currentParent.tagName][match[2]]) { - stack.pop(); - currentParent = stack.back; + lastTextPos = kMarkupPattern.lastIndex; + if (match[TAG_FULL][1] == '!') { + // this is a comment continue; - } } - // Use aggressive strategy to handle unmatching markups. - break; - } + if (!match[TAG_CLOSING_CHAR]) { + // not or ... + var closeMarkup = ''; + var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex); + if (options[match[TAG_NAME]]) { + if (index == -1) { + // there is no matching ending for the text element. + text = data.substr(kMarkupPattern.lastIndex); + } else { + text = data.substring(kMarkupPattern.lastIndex, index); + } + if (text.length > 0) { + currentParent.appendChild(new TextNode(text)); + } + } + if (index == -1) { + lastTextPos = kMarkupPattern.lastIndex = data.length + 1; + } else { + lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length; + match[TAG_CLOSING_CHAR] = true; + } + } + } + if (match[TAG_CLOSING_CHAR] || match[SELF_CLOSING_CHAR] || kSelfClosingElements[match[TAG_NAME]]) { + // or
etc. + while (true) { + if (currentParent.tagName == match[TAG_NAME]) { + stack.pop(); + currentParent = stack.back; + break; + } else { + // Trying to close current tag, and move on + if (kElementsClosedByClosing[currentParent.tagName]) { + if (kElementsClosedByClosing[currentParent.tagName][match[TAG_NAME]]) { + stack.pop(); + currentParent = stack.back; + continue; + } + } + // Use aggressive strategy to handle unmatching markups. + break; + } + } + } } - } - } - - return root; - - } + return root; + } }; diff --git a/test/html.js b/test/html.js index b61e0c9..fa29174 100644 --- a/test/html.js +++ b/test/html.js @@ -31,9 +31,7 @@ describe('HTML Parser', function() { it('should parse "

" and return root element', function() { - var root = parseHTML('

', { - lowerCaseTagName: true - }); + var root = parseHTML('

'); var div = new HTMLElement('div', {}, ''); var a = div.appendChild(new HTMLElement('a', {}, '')); @@ -100,9 +98,7 @@ describe('HTML Parser', function() { it('should parse "

.." fast', function() { for (var i = 0; i < 100; i++) - parseHTML('

', { - lowerCaseTagName: true - }); + parseHTML('

'); }); }); From f102642c4a7bca0b70acca3508b145a2b5a4af61 Mon Sep 17 00:00:00 2001 From: markitosha Date: Sat, 19 Aug 2017 16:15:09 +0300 Subject: [PATCH 16/16] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D1=8B=20=D0=BE=D0=BF=D0=B5=D1=87=D0=B0=D1=82?= =?UTF-8?q?=D0=BA=D0=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- index.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index.js b/index.js index 836a8f1..76cf79a 100644 --- a/index.js +++ b/index.js @@ -156,7 +156,7 @@ var kMarkupPattern = (function () { /** * Make state object from params - * @param {string} status next parcer status + * @param {string} status next parser status * @param {Array} match array which contains parsed values * @param {Number} openedBracketCounter counter of unclosed tag brackets */ @@ -168,7 +168,7 @@ var kMarkupPattern = (function () { }; } - // Status functions that determine the processing of the next character + // Status functions those determine the processing of the next character /** * Initial status function