From 9e4231fafd2691fc64443d164bbf26bdd5432249 Mon Sep 17 00:00:00 2001
From: Nosov
Date: Thu, 15 Jun 2017 20:44:03 +0300
Subject: [PATCH 01/16] added wbr selfClosing tag
---
index.js | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/index.js b/index.js
index 38d7e04..a39919b 100644
--- a/index.js
+++ b/index.js
@@ -476,7 +476,8 @@ var kSelfClosingElements = {
input: true,
area: true,
br: true,
- hr: true
+ hr: true,
+ wbr: true
};
var kElementsClosedByOpening = {
li: {li: true},
From 771fb7133faea81b82b32240b02f8c18b732b19b Mon Sep 17 00:00:00 2001
From: Nosov
Date: Thu, 15 Jun 2017 20:46:31 +0300
Subject: [PATCH 02/16] update version up to 1.0.2-wbr
---
package.json | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/package.json b/package.json
index 5b4fb67..459debb 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "fast-html-parser",
- "version": "1.0.1",
+ "version": "1.0.2-wbr",
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.",
"main": "index.js",
"scripts": {
From 860d4ad308b84cb67b145a6aafc67ea7524092c8 Mon Sep 17 00:00:00 2001
From: Nosov
Date: Thu, 15 Jun 2017 21:02:53 +0300
Subject: [PATCH 03/16] added col selfClosing tag
---
index.js | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/index.js b/index.js
index a39919b..6a02180 100644
--- a/index.js
+++ b/index.js
@@ -477,7 +477,8 @@ var kSelfClosingElements = {
area: true,
br: true,
hr: true,
- wbr: true
+ wbr: true,
+ col: true
};
var kElementsClosedByOpening = {
li: {li: true},
From 68e6a66c337880df024bd94bd486ed9800163e11 Mon Sep 17 00:00:00 2001
From: Nosov
Date: Thu, 15 Jun 2017 21:05:21 +0300
Subject: [PATCH 04/16] update version up to 1.0.3-col
---
package.json | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/package.json b/package.json
index 459debb..c95eb4b 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "fast-html-parser",
- "version": "1.0.2-wbr",
+ "version": "1.0.3-col",
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.",
"main": "index.js",
"scripts": {
From 0f1c6a67756e14080ea7f2f7041285411d8b0e91 Mon Sep 17 00:00:00 2001
From: markitosha
Date: Sat, 12 Aug 2017 12:15:00 +0300
Subject: [PATCH 05/16] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB?=
=?UTF-8?q?=D0=B5=D0=BD=D0=BE=20=D1=80=D0=B0=D1=81=D0=BF=D0=BE=D0=B7=D0=BD?=
=?UTF-8?q?=D0=B0=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20=D0=B0=D1=82=D1=80=D0=B8?=
=?UTF-8?q?=D0=B1=D1=83=D1=82=D0=BE=D0=B2,=20=D0=BD=D0=B0=D1=87=D0=B8?=
=?UTF-8?q?=D0=BD=D0=B0=D1=8E=D1=89=D0=B8=D1=85=D1=81=D1=8F=20=D0=BD=D0=B0?=
=?UTF-8?q?=20=5F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
index.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/index.js b/index.js
index 6a02180..0b2dbd9 100644
--- a/index.js
+++ b/index.js
@@ -371,7 +371,7 @@ $inherit(HTMLElement, Node, {
return this._rawAttrs;
var attrs = {};
if (this.rawAttrs) {
- var re = /\b([a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
+ var re = /\b([_a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
for (var match; match = re.exec(this.rawAttrs); )
attrs[match[1]] = match[3] || match[4] || match[5];
}
From 9682fca964a63ba4deeff14a849e097ed32bb8f7 Mon Sep 17 00:00:00 2001
From: markitosha
Date: Sat, 12 Aug 2017 12:55:50 +0300
Subject: [PATCH 06/16] =?UTF-8?q?=D0=9F=D0=B5=D1=80=D0=B5=D0=BF=D0=B8?=
=?UTF-8?q?=D1=81=D0=B0=D0=BD=20=D0=BF=D0=B0=D1=80=D1=81=D0=B5=D1=80=20htm?=
=?UTF-8?q?l=20=D1=82=D0=B5=D0=B3=D0=BE=D0=B2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
index.js | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 56 insertions(+), 1 deletion(-)
diff --git a/index.js b/index.js
index 0b2dbd9..7e4cd1c 100644
--- a/index.js
+++ b/index.js
@@ -467,7 +467,61 @@ $define(Matcher, {
}
});
-var kMarkupPattern = /)-->|<(\/?)([a-z][a-z0-9]*)\s*([^>]*?)(\/?)>/ig;
+var kMarkupPattern = (function () {
+ var lastIndex = 0;
+
+ return {
+ lastIndex: lastIndex,
+ exec: function (str) {
+ var bracketStack = 0;
+ var readTagName = true;
+ var match = ['', '', '', ''];
+ match['input'] = str;
+
+ for (var i = lastIndex; i < str.length; ++i) {
+ ++lastIndex;
+ switch (str[i]) {
+ case '<':
+ if (!bracketStack) {
+ match['index'] = i;
+ }
+ bracketStack++;
+ break;
+ case '/':
+ if (i > 0 && str[i - 1] === '<') {
+ match[1] = '/';
+ } else if (i < str.length - 1 && str[i + 1] === '>') {
+ match[4] = '/';
+ }
+ break;
+ case ' ':
+ if (!readTagName) {
+ match[3] += str[i];
+ } else {
+ readTagName = false;
+ }
+ break;
+ case '>':
+ if (!(--bracketStack)) {
+ match[0] = str.slice(match['index'], i + 1);
+ return match;
+ }
+ break;
+ default:
+ if (readTagName) {
+ match[2] += str[i];
+ } else {
+ match[3] += str[i];
+ }
+ break;
+ }
+ }
+ lastIndex = 0;
+ return null;
+ }
+ }
+})();
+
var kAttributePattern = /\b(id|class)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
var kSelfClosingElements = {
meta: true,
@@ -527,6 +581,7 @@ module.exports = {
options = options || {};
for (var match, text; match = kMarkupPattern.exec(data); ) {
+ console.log(match);
if (lastTextPos > -1) {
if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) {
// if has content
From 5c53cc052642a203107eda44612fa5d1b4fdfb58 Mon Sep 17 00:00:00 2001
From: markitosha
Date: Sat, 12 Aug 2017 14:50:24 +0300
Subject: [PATCH 07/16] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2?=
=?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=BE=20=D0=BF=D1=80=D0=BE=D1=85=D0=BE=D0=B6?=
=?UTF-8?q?=D0=B4=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B2=D1=81=D0=B5=D1=85=20?=
=?UTF-8?q?=D1=82=D0=B5=D1=81=D1=82=D0=BE=D0=B2=20=D0=BF=D0=B0=D1=80=D1=81?=
=?UTF-8?q?=D0=B5=D1=80=D0=BE=D0=BC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
index.js | 32 ++++++++++++++++++++++++--------
1 file changed, 24 insertions(+), 8 deletions(-)
diff --git a/index.js b/index.js
index 7e4cd1c..dde9ba6 100644
--- a/index.js
+++ b/index.js
@@ -1,4 +1,5 @@
require('apollojs');
+var equal = require('ramda').equals;
var entities = require('entities');
@@ -471,19 +472,24 @@ var kMarkupPattern = (function () {
var lastIndex = 0;
return {
- lastIndex: lastIndex,
exec: function (str) {
var bracketStack = 0;
var readTagName = true;
- var match = ['', '', '', ''];
+ var readAttributes = false;
+ var inTag = false;
+ var match = ['', '', '', '', ''];
match['input'] = str;
for (var i = lastIndex; i < str.length; ++i) {
++lastIndex;
switch (str[i]) {
case '<':
+ if (i < str.length - 1 && str[i + 1] === '!') {
+ break;
+ }
if (!bracketStack) {
match['index'] = i;
+ inTag = true;
}
bracketStack++;
break;
@@ -492,25 +498,30 @@ var kMarkupPattern = (function () {
match[1] = '/';
} else if (i < str.length - 1 && str[i + 1] === '>') {
match[4] = '/';
+ } else if (inTag && readTagName) {
+ match[2] += str[i];
+ } else if (inTag && readAttributes) {
+ match[3] += str[i];
}
break;
case ' ':
- if (!readTagName) {
+ if (inTag && readAttributes) {
match[3] += str[i];
- } else {
+ } else if (inTag) {
readTagName = false;
+ readAttributes = true;
}
break;
case '>':
- if (!(--bracketStack)) {
+ if (bracketStack > 0 && !(--bracketStack)) {
match[0] = str.slice(match['index'], i + 1);
return match;
}
break;
default:
- if (readTagName) {
+ if (inTag && readTagName) {
match[2] += str[i];
- } else {
+ } else if (inTag && readAttributes) {
match[3] += str[i];
}
break;
@@ -518,6 +529,12 @@ var kMarkupPattern = (function () {
}
lastIndex = 0;
return null;
+ },
+ get lastIndex() {
+ return lastIndex;
+ },
+ set lastIndex(newLastIndex) {
+ // lastIndex = newLastIndex;
}
}
})();
@@ -581,7 +598,6 @@ module.exports = {
options = options || {};
for (var match, text; match = kMarkupPattern.exec(data); ) {
- console.log(match);
if (lastTextPos > -1) {
if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) {
// if has content
From 4a2b56ca6d4bc25e262d9dda32493321db7b10ee Mon Sep 17 00:00:00 2001
From: markitosha
Date: Sat, 12 Aug 2017 14:58:36 +0300
Subject: [PATCH 08/16] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2?=
=?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B0=20=D1=80=D0=B0=D0=B1=D0=BE=D1=82=D0=B0?=
=?UTF-8?q?=20=D0=BF=D0=B0=D1=80=D1=81=D0=B5=D1=80=D0=B0=20=D0=B4=D0=BB?=
=?UTF-8?q?=D1=8F=20=D1=81=D0=BB=D1=83=D1=87=D0=B0=D1=8F=20=D1=81=20=D0=B2?=
=?UTF-8?q?=D0=BB=D0=BE=D0=B6=D0=B5=D0=BD=D1=8B=D0=BC=D0=B8=20=D0=B2=20?=
=?UTF-8?q?=D0=B0=D1=82=D1=80=D0=B8=D0=B1=D1=83=D1=82=D1=8B=20=D1=82=D0=B5?=
=?UTF-8?q?=D0=B3=D0=B0=D0=BC=D0=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
index.js | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/index.js b/index.js
index dde9ba6..6edce7b 100644
--- a/index.js
+++ b/index.js
@@ -490,13 +490,17 @@ var kMarkupPattern = (function () {
if (!bracketStack) {
match['index'] = i;
inTag = true;
+ } else if (inTag && readTagName) {
+ match[2] += str[i];
+ } else if (inTag && readAttributes) {
+ match[3] += str[i];
}
bracketStack++;
break;
case '/':
- if (i > 0 && str[i - 1] === '<') {
+ if (i - 1 === match['index']) {
match[1] = '/';
- } else if (i < str.length - 1 && str[i + 1] === '>') {
+ } else if (bracketStack === 1 && i < str.length - 1 && str[i + 1] === '>') {
match[4] = '/';
} else if (inTag && readTagName) {
match[2] += str[i];
@@ -516,6 +520,10 @@ var kMarkupPattern = (function () {
if (bracketStack > 0 && !(--bracketStack)) {
match[0] = str.slice(match['index'], i + 1);
return match;
+ } else if (inTag && readTagName) {
+ match[2] += str[i];
+ } else if (inTag && readAttributes) {
+ match[3] += str[i];
}
break;
default:
From 51ce7958bc1106f6aecb108256f556ecf7146586 Mon Sep 17 00:00:00 2001
From: markitosha
Date: Sat, 12 Aug 2017 16:47:02 +0300
Subject: [PATCH 09/16] =?UTF-8?q?=20=D0=9F=D0=B0=D1=80=D1=81=D0=B5=D1=80?=
=?UTF-8?q?=20=D0=BF=D0=B5=D1=80=D0=B5=D0=BF=D0=B8=D1=81=D0=B0=D0=BD=20?=
=?UTF-8?q?=D0=B2=20=D1=82=D0=B5=D1=80=D0=BC=D0=B8=D0=BD=D0=B0=D1=85=20?=
=?UTF-8?q?=D1=81=D0=BE=D1=81=D1=82=D0=BE=D1=8F=D0=BD=D0=B8=D0=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
index.js | 177 +++++++++++++++++++++++++++++++++++++------------------
1 file changed, 121 insertions(+), 56 deletions(-)
diff --git a/index.js b/index.js
index 6edce7b..d17cbbc 100644
--- a/index.js
+++ b/index.js
@@ -468,82 +468,147 @@ $define(Matcher, {
}
});
+// parser states
+var INITIAL = 'INITIAL';
+var TAG_OPENED = 'TAG_OPENED';
+var READ_TAG_NAME = 'READ_TAG_NAME';
+var READ_ATTRIBUTES = 'READ_ATTRIBUTES';
+var IS_SELF_CLOSING = 'IS_SELF_CLOSING';
+var TAG_CLOSE = 'TAG_CLOSE';
+
var kMarkupPattern = (function () {
var lastIndex = 0;
+ function makeState(state, match, bracketStack ) {
+ return {
+ state: state,
+ match: match,
+ bracketStack: bracketStack
+ };
+ }
+
+ function initial(match, sym, index) {
+ if (sym === '<' ) {
+ match['index'] = index;
+ return makeState(TAG_OPENED, match, 1);
+ }
+
+ return makeState(INITIAL, match, 0);
+ }
+
+ function tagOpened(match, sym) {
+ if (sym === '!') {
+ return makeState(INITIAL, match, 0);
+ }
+
+ if (sym === '/') {
+ match[1] = '/';
+ } else {
+ match[2] += sym;
+ }
+
+ return makeState(READ_TAG_NAME, match, 1);
+ }
+
+ function readTagName(match, sym) {
+ switch (sym) {
+ case ' ':
+ return makeState(READ_ATTRIBUTES, match, 1);
+ case '/':
+ return makeState(IS_SELF_CLOSING, match, 1);
+ case '>':
+ return makeState(TAG_CLOSE, match, 1);
+ default:
+ match[2] += sym;
+ return makeState(READ_TAG_NAME, match, 1);
+ }
+ }
+
+ function readAttributes(match, sym, bracketStack) {
+ switch (sym) {
+ case '/':
+ if (bracketStack === 1) {
+ return makeState(IS_SELF_CLOSING, match, bracketStack);
+ }
+
+ return makeState(READ_ATTRIBUTES, match, bracketStack);
+ case '>':
+ if (--bracketStack) {
+ match[3] += sym;
+ return makeState(READ_ATTRIBUTES, match, bracketStack);
+ }
+
+ return makeState(TAG_CLOSE, match, bracketStack);
+ case '<':
+ ++bracketStack;
+ default:
+ match[3] += sym;
+ return makeState(READ_ATTRIBUTES, match, bracketStack);
+ }
+ }
+
+ function isSelfClosing(match, sym) {
+ if (sym === '>') {
+ match[4] = '/';
+ return makeState(TAG_CLOSE, match, 0);
+ }
+
+ match[3] += '/' + sym;
+ return makeState(READ_ATTRIBUTES, match, 1);
+ }
+
+ function tagClose(match, str, index) {
+ lastIndex = index;
+ match[0] = str.slice(match['index'], index);
+
+ return makeState(INITIAL, match, 0);
+ }
+
return {
exec: function (str) {
- var bracketStack = 0;
- var readTagName = true;
- var readAttributes = false;
- var inTag = false;
- var match = ['', '', '', '', ''];
- match['input'] = str;
+ var state = {
+ state: INITIAL,
+ match: ['', '', '', '', ''],
+ bracketStack: 0
+ };
+ state.match['input'] = str;
for (var i = lastIndex; i < str.length; ++i) {
- ++lastIndex;
- switch (str[i]) {
- case '<':
- if (i < str.length - 1 && str[i + 1] === '!') {
- break;
- }
- if (!bracketStack) {
- match['index'] = i;
- inTag = true;
- } else if (inTag && readTagName) {
- match[2] += str[i];
- } else if (inTag && readAttributes) {
- match[3] += str[i];
- }
- bracketStack++;
+ switch (state.state) {
+ case INITIAL:
+ state = initial(state.match, str[i], i);
break;
- case '/':
- if (i - 1 === match['index']) {
- match[1] = '/';
- } else if (bracketStack === 1 && i < str.length - 1 && str[i + 1] === '>') {
- match[4] = '/';
- } else if (inTag && readTagName) {
- match[2] += str[i];
- } else if (inTag && readAttributes) {
- match[3] += str[i];
- }
+ case TAG_OPENED:
+ state = tagOpened(state.match, str[i]);
break;
- case ' ':
- if (inTag && readAttributes) {
- match[3] += str[i];
- } else if (inTag) {
- readTagName = false;
- readAttributes = true;
- }
+ case READ_TAG_NAME:
+ state = readTagName(state.match, str[i]);
break;
- case '>':
- if (bracketStack > 0 && !(--bracketStack)) {
- match[0] = str.slice(match['index'], i + 1);
- return match;
- } else if (inTag && readTagName) {
- match[2] += str[i];
- } else if (inTag && readAttributes) {
- match[3] += str[i];
- }
+ case READ_ATTRIBUTES:
+ state = readAttributes(state.match, str[i], state.bracketClose);
break;
- default:
- if (inTag && readTagName) {
- match[2] += str[i];
- } else if (inTag && readAttributes) {
- match[3] += str[i];
- }
+ case IS_SELF_CLOSING:
+ state = isSelfClosing(state.match, str[i]);
break;
+ case TAG_CLOSE:
+ state = tagClose(state.match, str, i);
+ return state.match;
+ default:
+ break;
}
}
+ if (state.state === TAG_CLOSE) {
+ state = tagClose(state.match, str, str.length);
+ return state.match;
+ }
+
lastIndex = 0;
return null;
},
get lastIndex() {
return lastIndex;
},
- set lastIndex(newLastIndex) {
- // lastIndex = newLastIndex;
- }
+ set lastIndex(newLastIndex) {}
}
})();
From a083180fc9b5cc32e6e86d453d65fb9c955712a2 Mon Sep 17 00:00:00 2001
From: markitosha
Date: Sat, 12 Aug 2017 18:45:03 +0300
Subject: [PATCH 10/16] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2?=
=?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B0=20=D0=BE=D1=88=D0=B8=D0=B1=D0=BA=D0=B8?=
=?UTF-8?q?=20=D0=BF=D1=80=D0=B8=20=D0=BF=D0=B0=D1=80=D1=81=D0=B8=D0=BD?=
=?UTF-8?q?=D0=B3=D0=B5=20=D1=82=D0=B5=D0=B3=D0=BE=D0=B2=20=D0=B2=20=D0=B0?=
=?UTF-8?q?=D1=82=D1=82=D1=80=D0=B8=D0=B1=D1=83=D1=82=D0=B0=D1=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
index.js | 371 +------------------------------------------------------
1 file changed, 6 insertions(+), 365 deletions(-)
diff --git a/index.js b/index.js
index d17cbbc..5b7f239 100644
--- a/index.js
+++ b/index.js
@@ -1,17 +1,12 @@
require('apollojs');
-var equal = require('ramda').equals;
-
var entities = require('entities');
/**
* Node Class as base class for TextNode and HTMLElement.
*/
-function Node() {
-
-}
-$declare(Node, {
+function Node() {}
-});
+$declare(Node, {});
$defenum(Node, {
ELEMENT_NODE: 1,
TEXT_NODE: 3
@@ -25,44 +20,13 @@ function TextNode(value) {
this.rawText = value;
}
$inherit(TextNode, Node, {
-
/**
* Node Type declaration.
* @type {Number}
*/
- nodeType: Node.TEXT_NODE,
-
- /**
- * Get unescaped text value of current node and its children.
- * @return {string} text content
- */
- get text() {
- return entities.decodeHTML5(this.rawText);
- },
-
- /**
- * Detect if the node contains only white space.
- * @return {bool}
- */
- get isWhitespace() {
- return /^(\s| )*$/.test(this.rawText);
- }
-
+ nodeType: Node.TEXT_NODE
});
-var kBlockElements = {
- div: true,
- p: true,
- // ul: true,
- // ol: true,
- li: true,
- // table: true,
- // tr: true,
- td: true,
- section: true,
- br: true
-};
-
/**
* HTMLElement, which contains a set of children.
* Note: this is a minimalist implementation, no complete tree
@@ -75,30 +39,20 @@ var kBlockElements = {
function HTMLElement(name, keyAttrs, rawAttrs) {
this.tagName = name;
this.rawAttrs = rawAttrs || '';
- // this.parentNode = null;
this.childNodes = [];
if (keyAttrs.id)
this.id = keyAttrs.id;
- if (keyAttrs.class)
- this.classNames = keyAttrs.class.split(/\s+/);
- else
- this.classNames = [];
}
$inherit(HTMLElement, Node, {
-
/**
* Node Type declaration.
* @type {Number}
*/
nodeType: Node.ELEMENT_NODE,
-
/**
* Get unescaped text value of current node and its children.
* @return {string} text content
*/
- get text() {
- return entities.decodeHTML5(this.rawText);
- },
/**
* Get escpaed (as-it) text value of current node and its children.
@@ -111,216 +65,6 @@ $inherit(HTMLElement, Node, {
return res;
},
- /**
- * Get structured Text (with '\n' etc.)
- * @return {string} structured text
- */
- get structuredText() {
- var currentBlock = [];
- var blocks = [currentBlock];
- function dfs(node) {
- if (node.nodeType === Node.ELEMENT_NODE) {
- if (kBlockElements[node.tagName]) {
- if (currentBlock.length > 0)
- blocks.push(currentBlock = []);
- node.childNodes.forEach(dfs);
- if (currentBlock.length > 0)
- blocks.push(currentBlock = []);
- } else {
- node.childNodes.forEach(dfs);
- }
- } else if (node.nodeType === Node.TEXT_NODE) {
- if (node.isWhitespace) {
- // Whitespace node, postponed output
- currentBlock.prependWhitespace = true;
- } else {
- var text = node.text;
- if (currentBlock.prependWhitespace) {
- text = ' ' + text;
- currentBlock.prependWhitespace = false;
- }
- currentBlock.push(text);
- }
- }
- }
- dfs(this);
- return blocks
- .map(function(block) {
- // Normalize each line's whitespace
- return block.join('').trim().replace(/\s{2,}/g, ' ');
- })
- .join('\n').trimRight();
- },
-
- /**
- * Trim element from right (in block) after seeing pattern in a TextNode.
- * @param {RegExp} pattern pattern to find
- * @return {HTMLElement} reference to current node
- */
- trimRight: function(pattern) {
- function dfs(node) {
- for (var i = 0; i < node.childNodes.length; i++) {
- var childNode = node.childNodes[i];
- if (childNode.nodeType === Node.ELEMENT_NODE) {
- dfs(childNode);
- } else {
- var index = childNode.rawText.search(pattern);
- if (index > -1) {
- childNode.rawText = childNode.rawText.substr(0, index);
- // trim all following nodes.
- node.childNodes.length = i+1;
- }
- }
- }
- }
- dfs(this);
- return this;
- },
-
- /**
- * Get DOM structure
- * @return {string} strucutre
- */
- get structure() {
- var res = [];
- var indention = 0;
- function write(str) {
- res.push(' '.repeat(indention) + str);
- }
- function dfs(node) {
- var idStr = node.id ? ('#' + node.id) : '';
- var classStr = node.classNames.length ? ('.' + node.classNames.join('.')) : '';
- write(node.tagName + idStr + classStr);
- indention++;
- for (var i = 0; i < node.childNodes.length; i++) {
- var childNode = node.childNodes[i];
- if (childNode.nodeType === Node.ELEMENT_NODE) {
- dfs(childNode);
- } else if (childNode.nodeType === Node.TEXT_NODE) {
- if (!childNode.isWhitespace)
- write('#text');
- }
- }
- indention--;
- }
- dfs(this);
- return res.join('\n');
- },
-
- /**
- * Remove whitespaces in this sub tree.
- * @return {HTMLElement} pointer to this
- */
- removeWhitespace: function() {
- var i = 0, o = 0;
- for (; i < this.childNodes.length; i++) {
- var node = this.childNodes[i];
- if (node.nodeType === Node.TEXT_NODE) {
- if (node.isWhitespace)
- continue;
- node.rawText = node.rawText.trim();
- } else if (node.nodeType === Node.ELEMENT_NODE) {
- node.removeWhitespace();
- }
- this.childNodes[o++] = node;
- }
- this.childNodes.length = o;
- return this;
- },
-
- /**
- * Query CSS selector to find matching nodes.
- * @param {string} selector Simplified CSS selector
- * @param {Matcher} selector A Matcher instance
- * @return {HTMLElement[]} matching elements
- */
- querySelectorAll: function(selector) {
- var matcher;
- if (selector instanceof Matcher) {
- matcher = selector;
- matcher.reset();
- } else {
- matcher = new Matcher(selector);
- }
- var res = [];
- var stack = [];
- for (var i = 0; i < this.childNodes.length; i++) {
- stack.push([this.childNodes[i], 0, false]);
- while (stack.length) {
- var state = stack.back;
- var el = state[0];
- if (state[1] === 0) {
- // Seen for first time.
- if (el.nodeType !== Node.ELEMENT_NODE) {
- stack.pop();
- continue;
- }
- if (state[2] = matcher.advance(el)) {
- if (matcher.matched) {
- res.push(el);
- // no need to go further.
- matcher.rewind();
- stack.pop();
- continue;
- }
- }
- }
- if (state[1] < el.childNodes.length) {
- stack.push([el.childNodes[state[1]++], 0, false]);
- } else {
- if (state[2])
- matcher.rewind();
- stack.pop();
- }
- }
- }
- return res;
- },
-
- /**
- * Query CSS Selector to find matching node.
- * @param {string} selector Simplified CSS selector
- * @param {Matcher} selector A Matcher instance
- * @return {HTMLElement} matching node
- */
- querySelector: function(selector) {
- var matcher;
- if (selector instanceof Matcher) {
- matcher = selector;
- matcher.reset();
- } else {
- matcher = new Matcher(selector);
- }
- var stack = [];
- for (var i = 0; i < this.childNodes.length; i++) {
- stack.push([this.childNodes[i], 0, false]);
- while (stack.length) {
- var state = stack.back;
- var el = state[0];
- if (state[1] === 0) {
- // Seen for first time.
- if (el.nodeType !== Node.ELEMENT_NODE) {
- stack.pop();
- continue;
- }
- if (state[2] = matcher.advance(el)) {
- if (matcher.matched) {
- return el;
- }
- }
- }
- if (state[1] < el.childNodes.length) {
- stack.push([el.childNodes[state[1]++], 0, false]);
- } else {
- if (state[2])
- matcher.rewind();
- stack.pop();
- }
- }
- }
- return null;
- },
-
/**
* Append a child node to childNodes
* @param {Node} node node to append
@@ -331,23 +75,6 @@ $inherit(HTMLElement, Node, {
this.childNodes.push(node);
return node;
},
-
- /**
- * Get first child node
- * @return {Node} first child node
- */
- get firstChild() {
- return this.childNodes.front;
- },
-
- /**
- * Get last child node
- * @return {Node} last child node
- */
- get lastChild() {
- return this.childNodes.back;
- },
-
/**
* Get attributes
* @return {Object} parsed and unescaped attributes
@@ -379,94 +106,8 @@ $inherit(HTMLElement, Node, {
this._rawAttrs = attrs;
return attrs;
}
-
-});
-$define(HTMLElement, {
- __wrap: function(el) {
- el.childNodes.forEach(function(node) {
- if (node.rawText) {
- $wrap(node, TextNode);
- } else {
- $wrap(node, HTMLElement);
- }
- });
- }
-});
-
-/**
- * Cache to store generated match functions
- * @type {Object}
- */
-var pMatchFunctionCache = {};
-
-/**
- * Matcher class to make CSS match
- * @param {string} selector Selector
- */
-function Matcher(selector) {
- this.matchers = selector.split(' ').map(function(matcher) {
- if (pMatchFunctionCache[matcher])
- return pMatchFunctionCache[matcher];
- var parts = matcher.split('.');
- var tagName = parts[0];
- var classes = parts.slice(1).sort();
- var source = '';
- if (tagName && tagName != '*') {
- if (tagName[0] == '#')
- source += 'if (el.id != ' + JSON.stringify(tagName.substr(1)) + ') return false;';
- else
- source += 'if (el.tagName != ' + JSON.stringify(tagName) + ') return false;';
- }
- if (classes.length > 0)
- source += 'for (var cls = ' + JSON.stringify(classes) + ', i = 0; i < cls.length; i++) if (el.classNames.indexOf(cls[i]) === -1) return false;';
- source += 'return true;';
- return pMatchFunctionCache[matcher] = new Function('el', source);
- });
- this.nextMatch = 0;
-}
-$declare(Matcher, {
- /**
- * Trying to advance match pointer
- * @param {HTMLElement} el element to make the match
- * @return {bool} true when pointer advanced.
- */
- advance: function(el) {
- if (this.nextMatch < this.matchers.length &&
- this.matchers[this.nextMatch](el)) {
- this.nextMatch++;
- return true;
- }
- return false;
- },
- /**
- * Rewind the match pointer
- */
- rewind: function() {
- this.nextMatch--;
- },
- /**
- * Trying to determine if match made.
- * @return {bool} true when the match is made
- */
- get matched() {
- return this.nextMatch == this.matchers.length;
- },
- /**
- * Rest match pointer.
- * @return {[type]} [description]
- */
- reset: function() {
- this.nextMatch = 0;
- }
-});
-$define(Matcher, {
- /**
- * flush cache to free memory
- */
- flushCache: function() {
- pMatchFunctionCache = {};
- }
});
+$define(HTMLElement, {});
// parser states
var INITIAL = 'INITIAL';
@@ -531,6 +172,7 @@ var kMarkupPattern = (function () {
return makeState(IS_SELF_CLOSING, match, bracketStack);
}
+ match[3] += sym;
return makeState(READ_ATTRIBUTES, match, bracketStack);
case '>':
if (--bracketStack) {
@@ -585,7 +227,7 @@ var kMarkupPattern = (function () {
state = readTagName(state.match, str[i]);
break;
case READ_ATTRIBUTES:
- state = readAttributes(state.match, str[i], state.bracketClose);
+ state = readAttributes(state.match, str[i], state.bracketStack);
break;
case IS_SELF_CLOSING:
state = isSelfClosing(state.match, str[i]);
@@ -651,7 +293,6 @@ var kBlockTextElements = {
*/
module.exports = {
- Matcher: Matcher,
Node: Node,
HTMLElement: HTMLElement,
TextNode: TextNode,
From 388133b292331b018127ca708226da3848030ba8 Mon Sep 17 00:00:00 2001
From: markitosha
Date: Sat, 12 Aug 2017 19:21:22 +0300
Subject: [PATCH 11/16] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2?=
=?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B0=20=D0=BE=D1=88=D0=B8=D0=B1=D0=BA=D0=B0?=
=?UTF-8?q?=20=D1=81=D0=BE=20=D0=B7=D0=BD=D0=B0=D0=BA=D0=BE=D0=BC=20=D0=BC?=
=?UTF-8?q?=D0=B5=D0=BD=D1=8C=D1=88=D0=B5=20=D0=B2=20=D1=81=D0=BA=D1=80?=
=?UTF-8?q?=D0=B8=D0=BF=D1=82=D0=B0=D1=85=20=D0=B8=20=D1=81=D1=82=D0=B8?=
=?UTF-8?q?=D0=BB=D1=8F=D1=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
test/html.js | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/test/html.js b/test/html.js
index eaa22df..27c095f 100644
--- a/test/html.js
+++ b/test/html.js
@@ -143,18 +143,22 @@ describe('HTML Parser', function() {
it('should parse "![]()
.." very fast', function() {
- for (var i = 0; i < 100; i++)
- parseHTML('![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
');
+ console.time('timer1');
+ for (var i = 0; i < 100; i++)
+ parseHTML('![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
');
+ console.timeEnd('timer1');
+ console.log('\n');
});
it('should parse "![]()
.." fast', function() {
+ console.time('timer');
for (var i = 0; i < 100; i++)
parseHTML('![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
', {
lowerCaseTagName: true
});
-
+ console.timeEnd('timer');
});
});
From 2eb5755c6e7200720b2d5c0a9e751b048c7b3a3c Mon Sep 17 00:00:00 2001
From: markitosha
Date: Sat, 12 Aug 2017 19:34:58 +0300
Subject: [PATCH 12/16] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2?=
=?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B0=20=D0=BE=D1=88=D0=B8=D0=B1=D0=BA=D0=B0?=
=?UTF-8?q?=20=D1=81=D0=BE=20=D0=B7=D0=BD=D0=B0=D0=BA=D0=BE=D0=BC=20=D0=BC?=
=?UTF-8?q?=D0=B5=D0=BD=D1=8C=D1=88=D0=B5=20=D0=B2=20=D1=81=D0=BA=D1=80?=
=?UTF-8?q?=D0=B8=D0=BF=D1=82=D0=B0=D1=85=20=D0=B8=20=D1=81=D1=82=D0=B8?=
=?UTF-8?q?=D0=BB=D1=8F=D1=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
index.js | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/index.js b/index.js
index 5b7f239..e09568b 100644
--- a/index.js
+++ b/index.js
@@ -250,7 +250,9 @@ var kMarkupPattern = (function () {
get lastIndex() {
return lastIndex;
},
- set lastIndex(newLastIndex) {}
+ set lastIndex(newLastIndex) {
+ lastIndex = newLastIndex;
+ }
}
})();
From 0a311bc232872b36b9ae1dca41f8c84bbdd31902 Mon Sep 17 00:00:00 2001
From: markitosha
Date: Sat, 12 Aug 2017 19:42:15 +0300
Subject: [PATCH 13/16] =?UTF-8?q?=D0=92=D0=BE=D0=B7=D0=B2=D1=80=D0=B0?=
=?UTF-8?q?=D1=89=D0=B5=D0=BD=D1=8B=20=D1=84=D1=83=D0=BD=D0=BA=D1=86=D0=B8?=
=?UTF-8?q?=D0=B8,=20=D0=BD=D0=B5=D0=BE=D0=B1=D1=85=D0=BE=D0=B4=D0=B8?=
=?UTF-8?q?=D0=BC=D1=8B=D0=B5=20=D0=B4=D0=BB=D1=8F=20=D1=82=D0=B5=D1=81?=
=?UTF-8?q?=D1=82=D0=BE=D0=B2.=20=D0=A3=D0=B4=D0=B0=D0=BB=D0=B5=D0=BD?=
=?UTF-8?q?=D1=8B=20=D1=82=D0=B5=D1=81=D1=82=D1=8B=20=D0=B4=D0=BB=D1=8F=20?=
=?UTF-8?q?=D1=83=D0=B4=D0=B0=D0=BB=D0=B5=D0=BD=D0=BD=D0=BE=D0=B3=D0=BE=20?=
=?UTF-8?q?=D1=84=D1=83=D0=BD=D0=BA=D1=86=D0=B8=D0=BE=D0=BD=D0=B0=D0=BB?=
=?UTF-8?q?=D0=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
index.js | 19 ++++++++-
test/html.js | 108 ---------------------------------------------------
2 files changed, 18 insertions(+), 109 deletions(-)
diff --git a/index.js b/index.js
index e09568b..4ec2376 100644
--- a/index.js
+++ b/index.js
@@ -24,7 +24,10 @@ $inherit(TextNode, Node, {
* Node Type declaration.
* @type {Number}
*/
- nodeType: Node.TEXT_NODE
+ nodeType: Node.TEXT_NODE,
+ get text() {
+ return entities.decodeHTML5(this.rawText);
+ }
});
/**
@@ -53,6 +56,9 @@ $inherit(HTMLElement, Node, {
* Get unescaped text value of current node and its children.
* @return {string} text content
*/
+ get text() {
+ return entities.decodeHTML5(this.rawText);
+ },
/**
* Get escpaed (as-it) text value of current node and its children.
@@ -75,6 +81,17 @@ $inherit(HTMLElement, Node, {
this.childNodes.push(node);
return node;
},
+ get firstChild() {
+ return this.childNodes.front;
+ },
+
+/**
+ * Get last child node
+ * @return {Node} last child node
+ */
+ get lastChild() {
+ return this.childNodes.back;
+ },
/**
* Get attributes
* @return {Object} parsed and unescaped attributes
diff --git a/test/html.js b/test/html.js
index 27c095f..88f2c65 100644
--- a/test/html.js
+++ b/test/html.js
@@ -6,60 +6,9 @@ var HTMLParser = require('../');
describe('HTML Parser', function() {
- var Matcher = HTMLParser.Matcher;
var HTMLElement = HTMLParser.HTMLElement;
var TextNode = HTMLParser.TextNode;
- describe('Matcher', function() {
-
- it('should match corrent elements', function() {
-
- var matcher = new Matcher('#id .a a.b *.a.b .a.b * a');
- var MatchesNothingButStarEl = new HTMLElement('_', {});
- var withIdEl = new HTMLElement('p', { id: 'id' });
- var withClassNameEl = new HTMLElement('a', { class: 'a b' });
-
- // console.log(util.inspect([withIdEl, withClassNameEl], {
- // showHidden: true,
- // depth: null
- // }));
-
- matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // #id
- matcher.advance(withClassNameEl).should.not.be.ok; // #id
- matcher.advance(withIdEl).should.be.ok; // #id
-
- matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a
- matcher.advance(withIdEl).should.not.be.ok; // .a
- matcher.advance(withClassNameEl).should.be.ok; // .a
-
- matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a.b
- matcher.advance(withIdEl).should.not.be.ok; // a.b
- matcher.advance(withClassNameEl).should.be.ok; // a.b
-
- matcher.advance(withIdEl).should.not.be.ok; // *.a.b
- matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // *.a.b
- matcher.advance(withClassNameEl).should.be.ok; // *.a.b
-
- matcher.advance(withIdEl).should.not.be.ok; // .a.b
- matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a.b
- matcher.advance(withClassNameEl).should.be.ok; // .a.b
-
- matcher.advance(withIdEl).should.be.ok; // *
- matcher.rewind();
- matcher.advance(MatchesNothingButStarEl).should.be.ok; // *
- matcher.rewind();
- matcher.advance(withClassNameEl).should.be.ok; // *
-
- matcher.advance(withIdEl).should.not.be.ok; // a
- matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a
- matcher.advance(withClassNameEl).should.be.ok; // a
-
- matcher.matched.should.be.ok;
-
- });
-
- });
-
var parseHTML = HTMLParser.parse;
describe('parse()', function() {
@@ -163,37 +112,8 @@ describe('HTML Parser', function() {
});
- describe('TextNode', function() {
-
- describe('#isWhitespace', function() {
- var node = new TextNode('');
- node.isWhitespace.should.be.ok;
- node = new TextNode(' \t');
- node.isWhitespace.should.be.ok;
- node = new TextNode(' \t \t');
- node.isWhitespace.should.be.ok;
- });
-
- });
-
describe('HTMLElement', function() {
- describe('#removeWhitespace()', function() {
-
- it('should remove whitespaces while preserving nodes with content', function() {
-
- var root = parseHTML(' \r \n \t
123
');
-
- var p = new HTMLElement('p', {}, '');
- p.appendChild(new HTMLElement('h5', {}, ''))
- .appendChild(new TextNode('123'));
-
- root.firstChild.removeWhitespace().should.eql(p);
-
- });
-
- });
-
describe('#rawAttributes', function() {
it('should return escaped attributes of the element', function() {
@@ -226,34 +146,6 @@ describe('HTML Parser', function() {
});
- describe('#querySelectorAll()', function() {
-
- it('should return correct elements in DOM tree', function() {
-
- var root = parseHTML('
');
-
- root.querySelectorAll('#id').should.eql([root.firstChild]);
- root.querySelectorAll('span.a').should.eql([root.firstChild.firstChild.firstChild]);
- root.querySelectorAll('span.b').should.eql([root.firstChild.firstChild.firstChild]);
- root.querySelectorAll('span.a.b').should.eql([root.firstChild.firstChild.firstChild]);
- root.querySelectorAll('#id .b').should.eql([root.firstChild.firstChild.firstChild]);
- root.querySelectorAll('#id span').should.eql(root.firstChild.firstChild.childNodes);
-
- });
-
- });
-
- describe('#structuredText', function() {
-
- it('should return correct structured text', function() {
-
- var root = parseHTML('oa
b
c');
- root.structuredText.should.eql('o\na\nb\nc');
-
- });
-
- });
-
});
});
From 523e1f7af0a5fac1a544e5eea20db0170cd8c74c Mon Sep 17 00:00:00 2001
From: markitosha
Date: Thu, 17 Aug 2017 17:59:10 +0300
Subject: [PATCH 14/16] =?UTF-8?q?=D0=A3=D0=B4=D0=B0=D0=BB=D0=B5=D0=BD?=
=?UTF-8?q?=D1=8B=20=D1=82=D0=B0=D0=B9=D0=BC=D0=B5=D1=80=D1=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
test/html.js | 5 -----
1 file changed, 5 deletions(-)
diff --git a/test/html.js b/test/html.js
index 88f2c65..b61e0c9 100644
--- a/test/html.js
+++ b/test/html.js
@@ -92,22 +92,17 @@ describe('HTML Parser', function() {
it('should parse "![]()
.." very fast', function() {
- console.time('timer1');
for (var i = 0; i < 100; i++)
parseHTML('![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
');
- console.timeEnd('timer1');
- console.log('\n');
});
it('should parse "![]()
.." fast', function() {
- console.time('timer');
for (var i = 0; i < 100; i++)
parseHTML('![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
', {
lowerCaseTagName: true
});
- console.timeEnd('timer');
});
});
From d9fa9788388646ee51da3274c99844326a9d38f6 Mon Sep 17 00:00:00 2001
From: markitosha
Date: Sat, 19 Aug 2017 11:55:16 +0300
Subject: [PATCH 15/16] =?UTF-8?q?=D0=A0=D0=B5=D1=84=D0=B0=D0=BA=D1=82?=
=?UTF-8?q?=D0=BE=D1=80=D0=B8=D0=BD=D0=B3=20=D0=B8=20=D0=BA=D0=BE=D0=BC?=
=?UTF-8?q?=D0=BC=D0=B5=D0=BD=D1=82=D0=B0=D1=80=D0=B8=D0=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
index.js | 542 ++++++++++++++++++++++++++++-----------------------
test/html.js | 8 +-
2 files changed, 298 insertions(+), 252 deletions(-)
diff --git a/index.js b/index.js
index 4ec2376..836a8f1 100644
--- a/index.js
+++ b/index.js
@@ -4,12 +4,13 @@ var entities = require('entities');
/**
* Node Class as base class for TextNode and HTMLElement.
*/
-function Node() {}
+function Node() {
+}
$declare(Node, {});
$defenum(Node, {
- ELEMENT_NODE: 1,
- TEXT_NODE: 3
+ ELEMENT_NODE: 1,
+ TEXT_NODE: 3
});
/**
@@ -17,17 +18,17 @@ $defenum(Node, {
* @param {string} value [description]
*/
function TextNode(value) {
- this.rawText = value;
+ this.rawText = value;
}
$inherit(TextNode, Node, {
- /**
- * Node Type declaration.
- * @type {Number}
- */
- nodeType: Node.TEXT_NODE,
- get text() {
- return entities.decodeHTML5(this.rawText);
- }
+ /**
+ * Node Type declaration.
+ * @type {Number}
+ */
+ nodeType: Node.TEXT_NODE,
+ get text() {
+ return entities.decodeHTML5(this.rawText);
+ }
});
/**
@@ -40,93 +41,99 @@ $inherit(TextNode, Node, {
* @param {Object} rawAttrs attributes in string
*/
function HTMLElement(name, keyAttrs, rawAttrs) {
- this.tagName = name;
- this.rawAttrs = rawAttrs || '';
- this.childNodes = [];
- if (keyAttrs.id)
- this.id = keyAttrs.id;
+ this.tagName = name;
+ this.rawAttrs = rawAttrs || '';
+ this.childNodes = [];
+ if (keyAttrs.id) {
+ this.id = keyAttrs.id;
+ }
}
$inherit(HTMLElement, Node, {
- /**
- * Node Type declaration.
- * @type {Number}
- */
- nodeType: Node.ELEMENT_NODE,
- /**
- * Get unescaped text value of current node and its children.
- * @return {string} text content
- */
- get text() {
- return entities.decodeHTML5(this.rawText);
- },
-
- /**
- * Get escpaed (as-it) text value of current node and its children.
- * @return {string} text content
- */
- get rawText() {
- var res = '';
- for (var i = 0; i < this.childNodes.length; i++)
- res += this.childNodes[i].rawText;
- return res;
- },
-
- /**
- * Append a child node to childNodes
- * @param {Node} node node to append
- * @return {Node} node appended
- */
- appendChild: function(node) {
- // node.parentNode = this;
- this.childNodes.push(node);
- return node;
- },
- get firstChild() {
- return this.childNodes.front;
- },
-
-/**
- * Get last child node
- * @return {Node} last child node
- */
- get lastChild() {
- return this.childNodes.back;
- },
- /**
- * Get attributes
- * @return {Object} parsed and unescaped attributes
- */
- get attributes() {
- if (this._attrs)
- return this._attrs;
- this._attrs = {};
- var attrs = this.rawAttributes;
- for (var key in attrs) {
- this._attrs[key] = entities.decodeHTML5(attrs[key]);
- }
- return this._attrs;
- },
-
- /**
- * Get escaped (as-it) attributes
- * @return {Object} parsed attributes
- */
- get rawAttributes() {
- if (this._rawAttrs)
- return this._rawAttrs;
- var attrs = {};
- if (this.rawAttrs) {
- var re = /\b([_a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
- for (var match; match = re.exec(this.rawAttrs); )
- attrs[match[1]] = match[3] || match[4] || match[5];
+ /**
+ * Node Type declaration.
+ * @type {Number}
+ */
+ nodeType: Node.ELEMENT_NODE,
+ /**
+ * Get unescaped text value of current node and its children.
+ * @return {string} text content
+ */
+ get text() {
+ return entities.decodeHTML5(this.rawText);
+ },
+
+ /**
+ * Get escaped (as-is) text value of current node and its children.
+ * @return {string} text content
+ */
+ get rawText() {
+ var res = '';
+ for (var i = 0; i < this.childNodes.length; i++) {
+ res += this.childNodes[i].rawText;
+ }
+ return res;
+ },
+
+ /**
+ * Append a child node to childNodes
+ * @param {Node} node node to append
+ * @return {Node} node appended
+ */
+ appendChild: function (node) {
+ // node.parentNode = this;
+ this.childNodes.push(node);
+ return node;
+ },
+
+ get firstChild() {
+ return this.childNodes.front;
+ },
+
+ /**
+ * Get last child node
+ * @return {Node} last child node
+ */
+ get lastChild() {
+ return this.childNodes.back;
+ },
+ /**
+ * Get attributes
+ * @return {Object} parsed and unescaped attributes
+ */
+ get attributes() {
+ if (this._attrs) {
+ return this._attrs;
+ }
+ this._attrs = {};
+ var attrs = this.rawAttributes;
+ for (var key in attrs) {
+ this._attrs[key] = entities.decodeHTML5(attrs[key]);
+ }
+ return this._attrs;
+ },
+
+ /**
+ * Get escaped (as-is) attributes
+ * @return {Object} parsed attributes
+ */
+ get rawAttributes() {
+ if (this._rawAttrs) {
+ return this._rawAttrs;
+ }
+ var attrs = {};
+ if (this.rawAttrs) {
+ var re = /\b([_a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
+ for (var match; match = re.exec(this.rawAttrs);) {
+ attrs[match[1]] = match[3] || match[4] || match[5];
+ }
+ }
+ this._rawAttrs = attrs;
+ return attrs;
}
- this._rawAttrs = attrs;
- return attrs;
- }
});
$define(HTMLElement, {});
-// parser states
+// parser statuses
var INITIAL = 'INITIAL';
var TAG_OPENED = 'TAG_OPENED';
var READ_TAG_NAME = 'READ_TAG_NAME';
@@ -134,19 +141,43 @@ var READ_ATTRIBUTES = 'READ_ATTRIBUTES';
var IS_SELF_CLOSING = 'IS_SELF_CLOSING';
var TAG_CLOSE = 'TAG_CLOSE';
+// indexes for match
+var TAG_FULL = 0;
+var TAG_CLOSING_CHAR = 1;
+var TAG_NAME = 2;
+var ATTRIBUTES = 3;
+var SELF_CLOSING_CHAR = 4;
+
+/**
+ * Object that mimics the RegExp interface (provides an exec method and a lastIndex property)
+ */
var kMarkupPattern = (function () {
var lastIndex = 0;
- function makeState(state, match, bracketStack ) {
- return {
- state: state,
- match: match,
- bracketStack: bracketStack
- };
+ /**
+ * Make state object from params
+ * @param {string} status next parcer status
+ * @param {Array} match array which contains parsed values
+ * @param {Number} openedBracketCounter counter of unclosed tag brackets
+ */
+ function makeState(status, match, openedBracketCounter) {
+ return {
+ status: status,
+ match: match,
+ openedBracketCounter: openedBracketCounter
+ };
}
+ // Status functions that determine the processing of the next character
+
+ /**
+ * Initial status function
+ * @param {Array} match array which contains parsed values
+ * @param {string} sym processed character
+ * @param {Number} index index of processed character
+ */
function initial(match, sym, index) {
- if (sym === '<' ) {
+ if (sym === '<') {
match['index'] = index;
return makeState(TAG_OPENED, match, 1);
}
@@ -154,20 +185,26 @@ var kMarkupPattern = (function () {
return makeState(INITIAL, match, 0);
}
+ /**
+ * Processing of the character immediately following the opening bracket
+ * @param {Array} match array which contains parsed values
+ * @param {string} sym processed character
+ */
function tagOpened(match, sym) {
- if (sym === '!') {
- return makeState(INITIAL, match, 0);
- }
-
if (sym === '/') {
- match[1] = '/';
+ match[TAG_CLOSING_CHAR] = '/';
} else {
- match[2] += sym;
+ match[TAG_NAME] += sym;
}
return makeState(READ_TAG_NAME, match, 1);
}
+ /**
+ * Processing of the characters in tag name
+ * @param {Array} match array which contains parsed values
+ * @param {string} sym processed character
+ */
function readTagName(match, sym) {
switch (sym) {
case ' ':
@@ -177,63 +214,82 @@ var kMarkupPattern = (function () {
case '>':
return makeState(TAG_CLOSE, match, 1);
default:
- match[2] += sym;
+ match[TAG_NAME] += sym;
return makeState(READ_TAG_NAME, match, 1);
}
}
- function readAttributes(match, sym, bracketStack) {
+ /**
+ * Processing of the characters in attributes
+ * @param {Array} match array which contains parsed values
+ * @param {string} sym processed character
+ * @param {Number} openedBracketCounter counter of unclosed tag brackets
+ */
+ function readAttributes(match, sym, openedBracketCounter) {
switch (sym) {
case '/':
- if (bracketStack === 1) {
- return makeState(IS_SELF_CLOSING, match, bracketStack);
+ if (openedBracketCounter === 1) {
+ return makeState(IS_SELF_CLOSING, match, openedBracketCounter);
}
- match[3] += sym;
- return makeState(READ_ATTRIBUTES, match, bracketStack);
+ match[ATTRIBUTES] += sym;
+ return makeState(READ_ATTRIBUTES, match, openedBracketCounter);
case '>':
- if (--bracketStack) {
- match[3] += sym;
- return makeState(READ_ATTRIBUTES, match, bracketStack);
+ if (--openedBracketCounter) {
+ match[ATTRIBUTES] += sym;
+ return makeState(READ_ATTRIBUTES, match, openedBracketCounter);
}
- return makeState(TAG_CLOSE, match, bracketStack);
+ return makeState(TAG_CLOSE, match, openedBracketCounter);
case '<':
- ++bracketStack;
+ ++openedBracketCounter;
+ // intentional fall-through: '<' is also appended to the attributes by the default case
default:
- match[3] += sym;
- return makeState(READ_ATTRIBUTES, match, bracketStack);
+ match[ATTRIBUTES] += sym;
+ return makeState(READ_ATTRIBUTES, match, openedBracketCounter);
}
}
+ /**
+ * Processing of the character immediately following the character '/'
+ * @param {Array} match array which contains parsed values
+ * @param {string} sym processed character
+ */
function isSelfClosing(match, sym) {
if (sym === '>') {
- match[4] = '/';
+ match[SELF_CLOSING_CHAR] = '/';
return makeState(TAG_CLOSE, match, 0);
}
- match[3] += '/' + sym;
+ match[ATTRIBUTES] += '/' + sym;
return makeState(READ_ATTRIBUTES, match, 1);
}
+ /**
+ * Final processing of the string
+ * @param {Array} match array which contains parsed values
+ * @param {string} str processed string
+ * @param {Number} index index of processed character
+ */
function tagClose(match, str, index) {
lastIndex = index;
- match[0] = str.slice(match['index'], index);
+ match[TAG_FULL] = str.slice(match['index'], index);
return makeState(INITIAL, match, 0);
}
return {
exec: function (str) {
+ // state.match :: [TAG_FULL, TAG_CLOSING_CHAR, TAG_NAME, ATTRIBUTES, SELF_CLOSING_CHAR]
var state = {
- state: INITIAL,
+ status: INITIAL,
match: ['', '', '', '', ''],
- bracketStack: 0
+ openedBracketCounter: 0
};
state.match['input'] = str;
for (var i = lastIndex; i < str.length; ++i) {
- switch (state.state) {
+ switch (state.status) {
case INITIAL:
state = initial(state.match, str[i], i);
break;
@@ -244,7 +300,7 @@ var kMarkupPattern = (function () {
state = readTagName(state.match, str[i]);
break;
case READ_ATTRIBUTES:
- state = readAttributes(state.match, str[i], state.bracketStack);
+ state = readAttributes(state.match, str[i], state.openedBracketCounter);
break;
case IS_SELF_CLOSING:
state = isSelfClosing(state.match, str[i]);
@@ -253,10 +309,10 @@ var kMarkupPattern = (function () {
state = tagClose(state.match, str, i);
return state.match;
default:
- break;
+ break;
}
}
- if (state.state === TAG_CLOSE) {
+ if (state.status === TAG_CLOSE) {
state = tagClose(state.match, str, str.length);
return state.match;
}
@@ -265,7 +321,7 @@ var kMarkupPattern = (function () {
return null;
},
get lastIndex() {
- return lastIndex;
+ return lastIndex;
},
set lastIndex(newLastIndex) {
lastIndex = newLastIndex;
@@ -275,36 +331,36 @@ var kMarkupPattern = (function () {
var kAttributePattern = /\b(id|class)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
var kSelfClosingElements = {
- meta: true,
- img: true,
- link: true,
- input: true,
- area: true,
- br: true,
- hr: true,
- wbr: true,
- col: true
+ meta: true,
+ img: true,
+ link: true,
+ input: true,
+ area: true,
+ br: true,
+ hr: true,
+ wbr: true,
+ col: true
};
var kElementsClosedByOpening = {
- li: {li: true},
- p: {p: true, div: true},
- td: {td: true, th: true},
- th: {td: true, th: true}
+ li: {li: true},
+ p: {p: true, div: true},
+ td: {td: true, th: true},
+ th: {td: true, th: true}
};
var kElementsClosedByClosing = {
- li: {ul: true, ol: true},
- a: {div: true},
- b: {div: true},
- i: {div: true},
- p: {div: true},
- td: {tr: true, table: true},
- th: {tr: true, table: true}
+ li: {ul: true, ol: true},
+ a: {div: true},
+ b: {div: true},
+ i: {div: true},
+ p: {div: true},
+ td: {tr: true, table: true},
+ th: {tr: true, table: true}
};
var kBlockTextElements = {
- script: true,
- noscript: true,
- style: true,
- pre: true
+ script: true,
+ noscript: true,
+ style: true,
+ pre: true
};
/**
@@ -312,102 +368,96 @@ var kBlockTextElements = {
*/
module.exports = {
- Node: Node,
- HTMLElement: HTMLElement,
- TextNode: TextNode,
-
- /**
- * Parse a chuck of HTML source.
- * @param {string} data html
- * @return {HTMLElement} root element
- */
- parse: function(data, options) {
-
- var root = new HTMLElement(null, {});
- var currentParent = root;
- var stack = [root];
- var lastTextPos = -1;
-
- options = options || {};
-
- for (var match, text; match = kMarkupPattern.exec(data); ) {
- if (lastTextPos > -1) {
- if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) {
- // if has content
- text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length);
- currentParent.appendChild(new TextNode(text));
- }
- }
- lastTextPos = kMarkupPattern.lastIndex;
- if (match[0][1] == '!') {
- // this is a comment
- continue;
- }
- if (options.lowerCaseTagName)
- match[2] = match[2].toLowerCase();
- if (!match[1]) {
- // not </ tags
- var attrs = {};
- for (var attMatch; attMatch = kAttributePattern.exec(match[3]); )
- attrs[attMatch[1]] = attMatch[3] || attMatch[4] || attMatch[5];
- // console.log(attrs);
- if (!match[4] && kElementsClosedByOpening[currentParent.tagName]) {
- if (kElementsClosedByOpening[currentParent.tagName][match[2]]) {
- stack.pop();
- currentParent = stack.back;
- }
- }
- currentParent = currentParent.appendChild(
- new HTMLElement(match[2], attrs, match[3]));
- stack.push(currentParent);
- if (kBlockTextElements[match[2]]) {
- // a little test to find next </script> or </style> ...
- var closeMarkup = '' + match[2] + '>';
- var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
- if (options[match[2]]) {
- if (index == -1) {
- // there is no matching ending for the text element.
- text = data.substr(kMarkupPattern.lastIndex);
- } else {
- text = data.substring(kMarkupPattern.lastIndex, index);
+ Node: Node,
+ HTMLElement: HTMLElement,
+ TextNode: TextNode,
+
+ /**
+ * Parse a chunk of HTML source.
+ * @param {string} data html
+ * @return {HTMLElement} root element
+ */
+ parse: function (data, options) {
+ var root = new HTMLElement(null, {});
+ var currentParent = root;
+ var stack = [root];
+ var lastTextPos = -1;
+
+ options = options || {};
+
+ for (var match, text; match = kMarkupPattern.exec(data);) {
+ if (lastTextPos > -1 && (lastTextPos + match[TAG_FULL].length < kMarkupPattern.lastIndex)) {
+ // if has content
+ text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length);
+ currentParent.appendChild(new TextNode(text));
}
- if (text.length > 0)
- currentParent.appendChild(new TextNode(text));
- }
- if (index == -1) {
- lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
- } else {
- lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length;
- match[1] = true;
- }
- }
- }
- if (match[1] || match[4] ||
- kSelfClosingElements[match[2]]) {
- // </ or /> or <br> etc.
- while (true) {
- if (currentParent.tagName == match[2]) {
- stack.pop();
- currentParent = stack.back;
- break;
- } else {
- // Trying to close current tag, and move on
- if (kElementsClosedByClosing[currentParent.tagName]) {
- if (kElementsClosedByClosing[currentParent.tagName][match[2]]) {
- stack.pop();
- currentParent = stack.back;
+ lastTextPos = kMarkupPattern.lastIndex;
+ if (match[TAG_FULL][1] == '!') {
+ // this is a comment
continue;
- }
}
- // Use aggressive strategy to handle unmatching markups.
- break;
- }
+ if (!match[TAG_CLOSING_CHAR]) {
+ // not </ tags
+ var attrs = {};
+ for (var attMatch; attMatch = kAttributePattern.exec(match[ATTRIBUTES]);) {
+ attrs[attMatch[1]] = attMatch[3] || attMatch[4] || attMatch[5];
+ }
+ if (
+ !match[SELF_CLOSING_CHAR]
+ && kElementsClosedByOpening[currentParent.tagName]
+ && kElementsClosedByOpening[currentParent.tagName][match[TAG_NAME]]
+ ) {
+ stack.pop();
+ currentParent = stack.back;
+ }
+ currentParent = currentParent.appendChild(new HTMLElement(match[TAG_NAME], attrs, match[ATTRIBUTES]));
+ stack.push(currentParent);
+ if (kBlockTextElements[match[TAG_NAME]]) {
+ // a little test to find next </script> or </style> ...
+ var closeMarkup = '' + match[TAG_NAME] + '>';
+ var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
+ if (options[match[TAG_NAME]]) {
+ if (index == -1) {
+ // there is no matching ending for the text element.
+ text = data.substr(kMarkupPattern.lastIndex);
+ } else {
+ text = data.substring(kMarkupPattern.lastIndex, index);
+ }
+ if (text.length > 0) {
+ currentParent.appendChild(new TextNode(text));
+ }
+ }
+ if (index == -1) {
+ lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
+ } else {
+ lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length;
+ match[TAG_CLOSING_CHAR] = true;
+ }
+ }
+ }
+ if (match[TAG_CLOSING_CHAR] || match[SELF_CLOSING_CHAR] || kSelfClosingElements[match[TAG_NAME]]) {
+ // </ or /> or <br> etc.
+ while (true) {
+ if (currentParent.tagName == match[TAG_NAME]) {
+ stack.pop();
+ currentParent = stack.back;
+ break;
+ } else {
+ // Trying to close current tag, and move on
+ if (kElementsClosedByClosing[currentParent.tagName]) {
+ if (kElementsClosedByClosing[currentParent.tagName][match[TAG_NAME]]) {
+ stack.pop();
+ currentParent = stack.back;
+ continue;
+ }
+ }
+ // Use aggressive strategy to handle unmatching markups.
+ break;
+ }
+ }
+ }
}
- }
- }
-
- return root;
-
- }
+ return root;
+ }
};
diff --git a/test/html.js b/test/html.js
index b61e0c9..fa29174 100644
--- a/test/html.js
+++ b/test/html.js
@@ -31,9 +31,7 @@ describe('HTML Parser', function() {
it('should parse "![]()
" and return root element', function() {
- var root = parseHTML('![]()
', {
- lowerCaseTagName: true
- });
+ var root = parseHTML('![]()
');
var div = new HTMLElement('div', {}, '');
var a = div.appendChild(new HTMLElement('a', {}, ''));
@@ -100,9 +98,7 @@ describe('HTML Parser', function() {
it('should parse "![]()
.." fast', function() {
for (var i = 0; i < 100; i++)
- parseHTML('![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
', {
- lowerCaseTagName: true
- });
+ parseHTML('![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
![]()
');
});
});
From f102642c4a7bca0b70acca3508b145a2b5a4af61 Mon Sep 17 00:00:00 2001
From: markitosha
Date: Sat, 19 Aug 2017 16:15:09 +0300
Subject: [PATCH 16/16] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2?=
=?UTF-8?q?=D0=BB=D0=B5=D0=BD=D1=8B=20=D0=BE=D0=BF=D0=B5=D1=87=D0=B0=D1=82?=
=?UTF-8?q?=D0=BA=D0=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
index.js | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/index.js b/index.js
index 836a8f1..76cf79a 100644
--- a/index.js
+++ b/index.js
@@ -156,7 +156,7 @@ var kMarkupPattern = (function () {
/**
* Make state object from params
- * @param {string} status next parcer status
+ * @param {string} status next parser status
* @param {Array} match array which contains parsed values
* @param {Number} openedBracketCounter counter of unclosed tag brackets
*/
@@ -168,7 +168,7 @@ var kMarkupPattern = (function () {
};
}
- // Status functions that determine the processing of the next character
+ // Status functions that determine how the next character is processed
/**
* Initial status function