ES HTML Parser is an HTML parser that generates an abstract syntax tree similar to the ESTree specification.
This project began as a fork of hyntax and is developed to follow an ESTree-like AST specification.
See online demo.
npm install es-html-parser
import { parse } from "es-html-parser";
const input = `
<html>
<body>
<button type="button"> press here </button>
</body>
</html>
`;
const { ast, tokens } = parse(input);
parse(html: string, options?: Options): ParseResult;
Arguments
html
: HTML string to parse.
options (optional)
tokenAdapter
: The adapter option for changing token information.
Returns
ParseResult
: Result of parsing
interface ParseResult {
ast: DocumentNode;
tokens: AnyToken[];
}
ast
: The root node of the AST.
tokens
: An array of resulting tokens.
The AnyNode
is a union type of all nodes.
type AnyNode =
| DocumentNode
| TextNode
| TagNode
| OpenTagStartNode
| OpenTagEndNode
| CloseTagNode
| AttributeNode
| AttributeKeyNode
| AttributeValueNode
| AttributeValueWrapperStartNode
| AttributeValueWrapperEndNode
| ScriptTagNode
| OpenScriptTagStartNode
| CloseScriptTagNode
| OpenScriptTagEndNode
| ScriptTagContentNode
| StyleTagNode
| OpenStyleTagStartNode
| OpenStyleTagEndNode
| StyleTagContentNode
| CloseStyleTagNode
| CommentNode
| CommentOpenNode
| CommentCloseNode
| CommentContentNode
| DoctypeNode
| DoctypeOpenNode
| DoctypeCloseNode
| DoctypeAttributeNode
| DoctypeAttributeValueNode
| DoctypeAttributeWrapperStartNode
| DoctypeAttributeWrapperEndNode;
The AnyToken
is a union type of all tokens.
type AnyToken =
| Token<TokenTypes.Text>
| Token<TokenTypes.OpenTagStart>
| Token<TokenTypes.OpenTagEnd>
| Token<TokenTypes.CloseTag>
| Token<TokenTypes.AttributeKey>
| Token<TokenTypes.AttributeAssignment>
| Token<TokenTypes.AttributeValueWrapperStart>
| Token<TokenTypes.AttributeValue>
| Token<TokenTypes.AttributeValueWrapperEnd>
| Token<TokenTypes.DoctypeOpen>
| Token<TokenTypes.DoctypeAttributeValue>
| Token<TokenTypes.DoctypeAttributeWrapperStart>
| Token<TokenTypes.DoctypeAttributeWrapperEnd>
| Token<TokenTypes.DoctypeClose>
| Token<TokenTypes.CommentOpen>
| Token<TokenTypes.CommentContent>
| Token<TokenTypes.CommentClose>
| Token<TokenTypes.OpenScriptTagStart>
| Token<TokenTypes.OpenScriptTagEnd>
| Token<TokenTypes.ScriptTagContent>
| Token<TokenTypes.CloseScriptTag>
| Token<TokenTypes.OpenStyleTagStart>
| Token<TokenTypes.OpenStyleTagEnd>
| Token<TokenTypes.StyleTagContent>
| Token<TokenTypes.CloseStyleTag>;
enum TokenTypes {
Text = "Text",
OpenTagStart = "OpenTagStart",
OpenTagEnd = "OpenTagEnd",
CloseTag = "CloseTag",
AttributeKey = "AttributeKey",
AttributeAssignment = "AttributeAssignment",
AttributeValueWrapperStart = "AttributeValueWrapperStart",
AttributeValue = "AttributeValue",
AttributeValueWrapperEnd = "AttributeValueWrapperEnd",
DoctypeOpen = "DoctypeOpen",
DoctypeAttributeValue = "DoctypeAttributeValue",
DoctypeAttributeWrapperStart = "DoctypeAttributeWrapperStart",
DoctypeAttributeWrapperEnd = "DoctypeAttributeWrapperEnd",
DoctypeClose = "DoctypeClose",
CommentOpen = "CommentOpen",
CommentContent = "CommentContent",
CommentClose = "CommentClose",
OpenScriptTagStart = "OpenScriptTagStart",
OpenScriptTagEnd = "OpenScriptTagEnd",
ScriptTagContent = "ScriptTagContent",
CloseScriptTag = "CloseScriptTag",
OpenStyleTagStart = "OpenStyleTagStart",
OpenStyleTagEnd = "OpenStyleTagEnd",
StyleTagContent = "StyleTagContent",
CloseStyleTag = "CloseStyleTag",
}
enum NodeTypes {
Document = "Document",
Tag = "Tag",
Text = "Text",
Doctype = "Doctype",
Comment = "Comment",
CommentOpen = "CommentOpen",
CommentClose = "CommentClose",
CommentContent = "CommentContent",
Attribute = "Attribute",
AttributeKey = "AttributeKey",
AttributeValue = "AttributeValue",
AttributeValueWrapperStart = "AttributeValueWrapperStart",
AttributeValueWrapperEnd = "AttributeValueWrapperEnd",
CloseTag = "CloseTag",
OpenTagEnd = "OpenTagEnd",
OpenTagStart = "OpenTagStart",
DoctypeOpen = "DoctypeOpen",
DoctypeAttribute = "DoctypeAttribute",
DoctypeClose = "DoctypeClose",
ScriptTag = "ScriptTag",
OpenScriptTagStart = "OpenScriptTagStart",
OpenScriptTagEnd = "OpenScriptTagEnd",
ScriptTagContent = "ScriptTagContent",
StyleTag = "StyleTag",
OpenStyleTagStart = "OpenStyleTagStart",
OpenStyleTagEnd = "OpenStyleTagEnd",
StyleTagContent = "StyleTagContent",
CloseStyleTag = "CloseStyleTag",
CloseScriptTag = "CloseScriptTag",
DoctypeAttributeValue = "DoctypeAttributeValue",
DoctypeAttributeWrapperStart = "DoctypeAttributeWrapperStart",
DoctypeAttributeWrapperEnd = "DoctypeAttributeWrapperEnd",
}
Every AST node and token implements the BaseNode
interface.
interface BaseNode {
type: string;
loc: SourceLocation;
range: [number, number];
}
The type
field represents the AST type. Its value is one of the NodeTypes
or TokenTypes
.
The loc
and range
fields represent the source location of the node.
interface SourceLocation {
start: Position;
end: Position;
}
The start
field represents the start location of the node.
The end
field represents the end location of the node.
interface Position {
line: number; // >= 1
column: number; // >= 0
}
The line
field is a number representing the line number where the node is positioned (1-based index).
The column
field is a number representing the offset in the line. (0-based index).
All tokens implement the Token
interface.
interface Token<T extends TokenTypes> extends BaseNode {
type: T;
value: string;
}
DocumentNode
represents a whole parsed document. It is the root node of the AST.
interface DocumentNode extends BaseNode {
type: "Document";
children: Array<TextNode | TagNode | ScriptTagNode | StyleTagNode | CommentNode>;
}
TextNode
represents any plain text in HTML.
interface TextNode extends BaseNode {
type: "Text";
value: string;
}
TagNode
represents all kinds of tag nodes in HTML except for doctype, script, style, and comment. (e.g. <div></div>
, <span></span>
...)
interface TagNode extends BaseNode {
type: "Tag";
selfClosing: boolean;
name: string;
openStart: OpenTagStartNode;
openEnd: OpenTagEndNode;
close?: CloseTagNode;
children: Array<TextNode | TagNode | ScriptTagNode | StyleTagNode | CommentNode>;
attributes: Array<AttributeNode>;
}
OpenTagStartNode
represents the opening part of a start tag. (e.g. <div
)
interface OpenTagStartNode extends BaseNode {
type: "OpenTagStart";
value: string;
}
OpenTagEndNode
represents the closing part of a start tag. (e.g. >
, />
)
interface OpenTagEndNode extends BaseNode {
type: "OpenTagEnd";
value: string;
}
CloseTagNode
represents an end tag. (e.g. </div>
)
interface CloseTagNode extends BaseNode {
type: "CloseTag";
value: string;
}
AttributeNode
represents an attribute. (e.g. id="foo"
)
interface AttributeNode extends BaseNode {
type: "Attribute";
key: AttributeKeyNode;
value?: AttributeValueNode;
startWrapper?: AttributeValueWrapperStartNode;
endWrapper?: AttributeValueWrapperEndNode;
}
AttributeKeyNode
represents a key part of an attribute. (e.g. id
)
interface AttributeKeyNode extends BaseNode {
type: "AttributeKey";
value: string;
}
AttributeValueWrapperStartNode
represents the left side character that wraps the value of the attribute. (e.g. "
, '
)
interface AttributeValueWrapperStartNode extends BaseNode {
type: "AttributeValueWrapperStart";
value: string;
}
AttributeValueWrapperEndNode
represents the right side character that wraps the value of the attribute. (e.g. "
, '
)
interface AttributeValueWrapperEndNode extends BaseNode {
type: "AttributeValueWrapperEnd";
value: string;
}
AttributeValueNode
represents the value part of the attribute. It does not include wrapper characters. (e.g. foo
)
interface AttributeValueNode extends BaseNode {
type: "AttributeValue";
value: string;
}
The ScriptTagNode
represents a script tag in HTML. (e.g. <script> console.log('hello'); </script>
).
interface ScriptTagNode extends BaseNode {
type: "ScriptTag";
attributes: Array<AttributeNode>;
openStart: OpenScriptTagStartNode;
openEnd: OpenScriptTagEndNode;
close: CloseScriptTagNode;
value?: ScriptTagContentNode;
}
OpenScriptTagStartNode
represents an opening part of a start script tag. (e.g. <script
)
interface OpenScriptTagStartNode extends BaseNode {
type: "OpenScriptTagStart";
value: string;
}
OpenScriptTagEndNode
represents a closing part of a start script tag. (e.g. >
)
interface OpenScriptTagEndNode extends BaseNode {
type: "OpenScriptTagEnd";
value: string;
}
CloseScriptTagNode
represents a close script tag. (e.g. </script>
)
interface CloseScriptTagNode extends BaseNode {
type: "CloseScriptTag";
value: string;
}
ScriptTagContentNode
represents the script content inside a script tag. (e.g. console.log('hello');
)
interface ScriptTagContentNode extends BaseNode {
type: "ScriptTagContent";
value: string;
}
StyleTagNode
represents style tags. (e.g. <style> .foo {} </style>
)
interface StyleTagNode extends BaseNode {
type: "StyleTag";
attributes: Array<AttributeNode>;
openStart: OpenStyleTagStartNode;
openEnd: OpenStyleTagEndNode;
close: CloseStyleTagNode;
value?: StyleTagContentNode;
}
OpenStyleTagStartNode
represents an opening part of a start style tag. (e.g. <style
)
interface OpenStyleTagStartNode extends BaseNode {
type: "OpenStyleTagStart";
value: string;
}
OpenStyleTagEndNode
represents a closing part of a start style tag. (e.g. >
)
interface OpenStyleTagEndNode extends BaseNode {
type: "OpenStyleTagEnd";
value: string;
}
CloseStyleTagNode
represents a close style tag. (e.g. </style>
)
interface CloseStyleTagNode extends BaseNode {
type: "CloseStyleTag";
value: string;
}
StyleTagContentNode
represents the style content inside a style tag.
interface StyleTagContentNode extends BaseNode {
type: "StyleTagContent";
value: string;
}
CommentNode
represents a comment in HTML. (e.g. <!-- content -->
)
interface CommentNode extends BaseNode {
type: "Comment";
open: CommentOpenNode;
close: CommentCloseNode;
value: CommentContentNode;
}
CommentOpenNode
represents the comment start character sequence. (e.g. <!--
)
interface CommentOpenNode extends BaseNode {
type: "CommentOpen";
value: string;
}
CommentCloseNode
represents the comment end character sequence. (e.g. -->
)
interface CommentCloseNode extends BaseNode {
type: "CommentClose";
value: string;
}
The CommentContentNode
represents text in the comment.
interface CommentContentNode extends BaseNode {
type: "CommentContent";
value: string;
}
DoctypeNode
represents the DOCTYPE in HTML.
interface DoctypeNode extends BaseNode {
type: "Doctype";
attributes: Array<DoctypeAttributeNode>;
open: DoctypeOpenNode;
close: DoctypeCloseNode;
}
DoctypeOpenNode
represents the doctype start character sequence. (e.g. <!DOCTYPE
)
interface DoctypeOpenNode extends BaseNode {
type: "DoctypeOpen";
value: string;
}
DoctypeCloseNode
represents the doctype end character sequence (e.g. >
)
interface DoctypeCloseNode extends BaseNode {
type: "DoctypeClose";
value: string;
}
DoctypeAttributeNode
represents an attribute of a doctype node. (e.g. html
, "-//W3C//DTD HTML 4.01 Transitional//EN"
)
interface DoctypeAttributeNode extends BaseNode {
type: "DoctypeAttribute";
key: DoctypeAttributeKey;
}
DoctypeAttributeValueNode
represents the value of a doctype node's attribute. (e.g. html
, -//W3C//DTD HTML 4.01 Transitional//EN
)
. It does not include wrapper characters ('
, "
)
interface DoctypeAttributeValueNode extends BaseNode {
type: "DoctypeAttributeValue";
value: string;
}
DoctypeAttributeWrapperStartNode
represents a left side character that wraps the value of the attribute. (e.g. "
, '
)
interface DoctypeAttributeWrapperStartNode extends BaseNode {
type: "DoctypeAttributeWrapperStart";
value: string;
}
DoctypeAttributeWrapperEndNode
represents a right side character that wraps the value of the attribute. (e.g. "
, '
)
interface DoctypeAttributeWrapperEndNode extends BaseNode {
type: "DoctypeAttributeWrapperEnd";
value: string;
}