Rewrite in TypeScript; add toString method and innerHTML / outerHTML attributes

This commit is contained in:
taoqiufeng 2017-06-14 15:36:24 +08:00
parent 787f00e359
commit e73b44c7dc
10 changed files with 2516 additions and 814 deletions

1
.gitignore vendored
View file

@ -19,3 +19,4 @@ bower_components
!.gitignore
*.sublime-*
dist/

View file

@ -1,4 +1,3 @@
language: node_js
node_js:
- "0.11"
- "0.10"
- node

46
gulpfile.js Normal file
View file

@ -0,0 +1,46 @@
// Build script: compiles the TypeScript project described by ./tsconfig.json into its
// configured `outDir` and copies package metadata into ./dist/.
const gulp = require('gulp');
// clean: delete the ./dist/ output directory.
gulp.task('clean', () => {
const del = require('del');
return del('./dist/');
});
// compile-ts: compile every source file listed by tsconfig.json and write the result to `outDir`.
gulp.task('compile-ts', () => {
const ts = require('gulp-typescript');
const tsProject = ts.createProject('./tsconfig.json');
const dest = tsProject.options.outDir;
return tsProject.src()
.pipe(tsProject())
.pipe(gulp.dest(dest));
});
// copy-files: copy package.json and readme.md next to the compiled output in ./dist/.
gulp.task('copy-files', () => {
return gulp.src(['./package.json', 'readme.md'])
.pipe(gulp.dest('./dist/'));
});
// watch-ts: run one full compile, then recompile individual files under ./src/ as they change.
gulp.task('watch-ts', async () => {
const ts = require('gulp-typescript');
const tsProject = ts.createProject('./tsconfig.json');
const path = require('path');
const dest = tsProject.options.outDir;
// NOTE(review): the value awaited here is the stream returned by pipe(); a stream is presumably
// not a thenable, so this `await` likely does not wait for the initial compile to finish — confirm.
await tsProject.src()
.pipe(tsProject())
.pipe(gulp.dest(dest));
// Assumes the watch callback receives an event object exposing the changed file as `file.path`
// — TODO confirm against the installed gulp version.
return gulp.watch(['./src/**/*.ts'], (file) => {
// A fresh project is created for every change event.
const tsProject = ts.createProject('./tsconfig.json');
// Directory of the changed file relative to the project root, re-created under outDir.
const relative = path.relative('./', path.dirname(file.path));
const outDir = tsProject.options.outDir;
const dest = path.join(outDir, relative);
return gulp.src(file.path)
.pipe(tsProject())
.pipe(gulp.dest(dest));
});
});
// default: clean, copy-files and compile-ts handed to gulp-sequence, with `cb` as completion callback.
gulp.task('default', (cb) => {
const sequence = require('gulp-sequence');
sequence('clean', 'copy-files', 'compile-ts', cb);
});
// dev: task whose only dependency is watch-ts.
gulp.task('dev', ['watch-ts']);

607
index.js
View file

@ -1,607 +0,0 @@
require('apollojs');
var entities = require('entities');
/**
* Node Class as base class for TextNode and HTMLElement.
*/
function Node() {
// No instance state is initialised here; subclasses set their own fields.
}
// Registers Node with an empty member set via the global `$declare` helper
// (provided by the `apollojs` require above — exact semantics not visible here).
$declare(Node, {
});
// Node kind constants; `nodeType` is compared against Node.ELEMENT_NODE / Node.TEXT_NODE
// throughout this file.
$defenum(Node, {
ELEMENT_NODE: 1,
TEXT_NODE: 3
});
/**
* TextNode to contain a text element in DOM tree.
* @param {string} value [description]
*/
function TextNode(value) {
  // Stored exactly as it appeared in the source, i.e. with entities still escaped.
  this.rawText = value;
}
$inherit(TextNode, Node, {
  /**
   * Node Type declaration.
   * @type {Number}
   */
  nodeType: Node.TEXT_NODE,
  /**
   * Get unescaped text value of current node.
   * @return {string} text content with HTML5 entities decoded
   */
  get text() {
    return entities.decodeHTML5(this.rawText);
  },
  /**
   * Detect if the node contains only white space.
   * `rawText` is still entity-escaped, so a non-breaking space shows up as the
   * literal `&nbsp;` entity and has to be matched explicitly next to `\s`.
   * @return {bool} true when the raw text is empty or whitespace / `&nbsp;` only
   */
  get isWhitespace() {
    return /^(\s|&nbsp;)*$/.test(this.rawText);
  }
});
// Tag names that `structuredText` treats as block boundaries: entering or leaving one of
// these elements starts a new output line (when the current line already has content).
var kBlockElements = {
div: true,
p: true,
// ul: true,
// ol: true,
li: true,
// table: true,
// tr: true,
td: true,
section: true,
br: true
};
/**
* HTMLElement, which contains a set of children.
* Note: this is a minimalist implementation, no complete tree
* structure provided (no parentNode, nextSibling,
* previousSibling etc).
* @param {string} name tagName
* @param {Object} keyAttrs id and class attribute
* @param {Object} rawAttrs attributes in string
*/
function HTMLElement(name, keyAttrs, rawAttrs) {
  // Identity of the element as written in the markup.
  this.tagName = name;
  this.rawAttrs = rawAttrs || '';
  // this.parentNode = null;
  this.childNodes = [];
  // `id` is only defined on the instance when the markup supplied one.
  if (keyAttrs.id) {
    this.id = keyAttrs.id;
  }
  // Class list: whitespace separated tokens, or an empty list when absent.
  this.classNames = keyAttrs.class ? keyAttrs.class.split(/\s+/) : [];
}
// Instance members of HTMLElement, installed through the global `$inherit` helper
// (from apollojs — presumably sets up Node as the parent type; confirm).
$inherit(HTMLElement, Node, {
/**
* Node Type declaration.
* @type {Number}
*/
nodeType: Node.ELEMENT_NODE,
/**
* Get unescaped text value of current node and its children.
* @return {string} text content
*/
get text() {
return entities.decodeHTML5(this.rawText);
},
/**
* Get escaped (as-is) text value of current node and its children.
* @return {string} text content
*/
get rawText() {
var res = '';
// Concatenate the raw text of every child; element children recurse through this getter.
for (var i = 0; i < this.childNodes.length; i++)
res += this.childNodes[i].rawText;
return res;
},
/**
* Get structured Text (with '\n' etc.)
* @return {string} structured text
*/
get structuredText() {
// Each block is an array of text pieces that ends up as one output line.
var currentBlock = [];
var blocks = [currentBlock];
function dfs(node) {
if (node.nodeType === Node.ELEMENT_NODE) {
if (kBlockElements[node.tagName]) {
// Block element: close the line in progress before and after its content,
// but only if that line already holds text (avoids empty lines).
if (currentBlock.length > 0)
blocks.push(currentBlock = []);
node.childNodes.forEach(dfs);
if (currentBlock.length > 0)
blocks.push(currentBlock = []);
} else {
node.childNodes.forEach(dfs);
}
} else if (node.nodeType === Node.TEXT_NODE) {
if (node.isWhitespace) {
// Whitespace node, postponed output
// The flag is stored on the block array itself, so it is dropped when a new block starts.
currentBlock.prependWhitespace = true;
} else {
var text = node.text;
if (currentBlock.prependWhitespace) {
text = ' ' + text;
currentBlock.prependWhitespace = false;
}
currentBlock.push(text);
}
}
}
dfs(this);
return blocks
.map(function(block) {
// Normalize each line's whitespace
return block.join('').trim().replace(/\s{2,}/g, ' ');
})
.join('\n').trimRight();
},
/**
* Trim element from right (in block) after seeing pattern in a TextNode.
* @param {RegExp} pattern pattern to find
* @return {HTMLElement} reference to current node
*/
trimRight: function(pattern) {
function dfs(node) {
for (var i = 0; i < node.childNodes.length; i++) {
var childNode = node.childNodes[i];
if (childNode.nodeType === Node.ELEMENT_NODE) {
dfs(childNode);
} else {
var index = childNode.rawText.search(pattern);
if (index > -1) {
// Keep only the text before the match.
childNode.rawText = childNode.rawText.substr(0, index);
// trim all following nodes.
// Shrinking the array also ends this loop, since i + 1 is no longer < length.
node.childNodes.length = i+1;
}
}
}
}
dfs(this);
return this;
},
/**
* Get DOM structure
* @return {string} structure
*/
get structure() {
var res = [];
var indention = 0;
// Emit one line, indented according to the current depth.
function write(str) {
res.push(' '.repeat(indention) + str);
}
function dfs(node) {
// Elements are rendered as tagName#id.class1.class2
var idStr = node.id ? ('#' + node.id) : '';
var classStr = node.classNames.length ? ('.' + node.classNames.join('.')) : '';
write(node.tagName + idStr + classStr);
indention++;
for (var i = 0; i < node.childNodes.length; i++) {
var childNode = node.childNodes[i];
if (childNode.nodeType === Node.ELEMENT_NODE) {
dfs(childNode);
} else if (childNode.nodeType === Node.TEXT_NODE) {
// Whitespace-only text nodes are left out of the outline.
if (!childNode.isWhitespace)
write('#text');
}
}
indention--;
}
dfs(this);
return res.join('\n');
},
/**
* Remove whitespaces in this sub tree.
* @return {HTMLElement} pointer to this
*/
removeWhitespace: function() {
// In-place compaction: `i` reads, `o` writes the nodes that are kept.
var i = 0, o = 0;
for (; i < this.childNodes.length; i++) {
var node = this.childNodes[i];
if (node.nodeType === Node.TEXT_NODE) {
if (node.isWhitespace)
continue;
node.rawText = node.rawText.trim();
} else if (node.nodeType === Node.ELEMENT_NODE) {
node.removeWhitespace();
}
this.childNodes[o++] = node;
}
// Drop the leftover tail after compaction.
this.childNodes.length = o;
return this;
},
/**
* Query CSS selector to find matching nodes.
* @param {string} selector Simplified CSS selector
* @param {Matcher} selector A Matcher instance
* @return {HTMLElement[]} matching elements
*/
querySelectorAll: function(selector) {
var matcher;
if (selector instanceof Matcher) {
matcher = selector;
matcher.reset();
} else {
matcher = new Matcher(selector);
}
var res = [];
// Explicit DFS stack. Each entry is [node, index of next child to visit,
// whether the matcher pointer was advanced for this node].
var stack = [];
for (var i = 0; i < this.childNodes.length; i++) {
stack.push([this.childNodes[i], 0, false]);
while (stack.length) {
// `back` is not a standard Array property — presumably an accessor for the last
// element added by apollojs; confirm.
var state = stack.back;
var el = state[0];
if (state[1] === 0) {
// Seen for first time.
if (el.nodeType !== Node.ELEMENT_NODE) {
stack.pop();
continue;
}
if (state[2] = matcher.advance(el)) {
if (matcher.matched) {
res.push(el);
// no need to go further.
// Descendants of a matched element are not searched.
matcher.rewind();
stack.pop();
continue;
}
}
}
if (state[1] < el.childNodes.length) {
stack.push([el.childNodes[state[1]++], 0, false]);
} else {
// Leaving the node: undo the advance it caused, if any.
if (state[2])
matcher.rewind();
stack.pop();
}
}
}
return res;
},
/**
* Query CSS Selector to find matching node.
* @param {string} selector Simplified CSS selector
* @param {Matcher} selector A Matcher instance
* @return {HTMLElement} matching node, or null when nothing matches
*/
querySelector: function(selector) {
var matcher;
if (selector instanceof Matcher) {
matcher = selector;
matcher.reset();
} else {
matcher = new Matcher(selector);
}
// Same traversal as querySelectorAll, but returns on the first full match.
var stack = [];
for (var i = 0; i < this.childNodes.length; i++) {
stack.push([this.childNodes[i], 0, false]);
while (stack.length) {
var state = stack.back;
var el = state[0];
if (state[1] === 0) {
// Seen for first time.
if (el.nodeType !== Node.ELEMENT_NODE) {
stack.pop();
continue;
}
if (state[2] = matcher.advance(el)) {
if (matcher.matched) {
return el;
}
}
}
if (state[1] < el.childNodes.length) {
stack.push([el.childNodes[state[1]++], 0, false]);
} else {
if (state[2])
matcher.rewind();
stack.pop();
}
}
}
return null;
},
/**
* Append a child node to childNodes
* @param {Node} node node to append
* @return {Node} node appended
*/
appendChild: function(node) {
// node.parentNode = this;
this.childNodes.push(node);
return node;
},
/**
* Get first child node
* @return {Node} first child node
*/
get firstChild() {
// `front` is not a standard Array property — presumably supplied by apollojs; confirm.
return this.childNodes.front;
},
/**
* Get last child node
* @return {Node} last child node
*/
get lastChild() {
return this.childNodes.back;
},
/**
* Get attributes
* @return {Object} parsed and unescaped attributes
*/
get attributes() {
// Computed once and cached on the instance.
if (this._attrs)
return this._attrs;
this._attrs = {};
var attrs = this.rawAttributes;
for (var key in attrs) {
this._attrs[key] = entities.decodeHTML5(attrs[key]);
}
return this._attrs;
},
/**
* Get escaped (as-is) attributes
* @return {Object} parsed attributes
*/
get rawAttributes() {
// Computed once and cached on the instance.
if (this._rawAttrs)
return this._rawAttrs;
var attrs = {};
if (this.rawAttrs) {
// name = "double quoted" | 'single quoted' | unquoted
// NOTE(review): the quoted alternatives require at least one character, so an empty
// value such as a="" appears to fall through to the unquoted branch and keep its
// quote characters — confirm.
var re = /\b([a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
for (var match; match = re.exec(this.rawAttrs); )
attrs[match[1]] = match[3] || match[4] || match[5];
}
this._rawAttrs = attrs;
return attrs;
}
});
// Member attached to HTMLElement via the global `$define` helper (apollojs).
$define(HTMLElement, {
// Visits the direct children of `el` and hands each one to `$wrap` together with a type:
// TextNode when the child has a truthy `rawText`, HTMLElement otherwise.
// NOTE(review): a text child whose rawText is the empty string would take the
// HTMLElement branch — confirm this is intended.
__wrap: function(el) {
el.childNodes.forEach(function(node) {
if (node.rawText) {
$wrap(node, TextNode);
} else {
$wrap(node, HTMLElement);
}
});
}
});
/**
* Cache to store generated match functions
* @type {Object}
*/
var pMatchFunctionCache = {};
/**
 * Matcher class to make CSS match.
 *
 * The selector is a whitespace separated list of compound selectors, each of the
 * form `tag`, `#id`, `*` or an empty head, optionally followed by `.class` parts
 * (e.g. `div.a.b`, `#main`, `.item`, `*.a`). One match function is built per
 * compound selector and shared through `pMatchFunctionCache`.
 *
 * Runs of whitespace (including leading and trailing whitespace) separate
 * selectors without producing empty ones; a selector that is empty or all
 * whitespace keeps its historical meaning of a single match-anything level.
 *
 * @param {string} selector Selector
 */
function Matcher(selector) {
  var tokens = selector.split(/\s+/).filter(Boolean);
  if (tokens.length === 0)
    tokens = ['*'];
  this.matchers = tokens.map(function(token) {
    // Own-property check: a plain `cache[token]` lookup would pick up inherited
    // members such as `constructor` or `toString` and return them as matchers.
    if (Object.prototype.hasOwnProperty.call(pMatchFunctionCache, token))
      return pMatchFunctionCache[token];
    var parts = token.split('.');
    var head = parts[0];
    var classes = parts.slice(1).sort();
    var id = null;
    var tagName = null;
    if (head && head !== '*') {
      if (head[0] === '#')
        id = head.substr(1);
      else
        tagName = head;
    }
    // A closure replaces the former `new Function` source generation; the checks
    // and their order are the same (loose comparison kept from the generated code).
    return pMatchFunctionCache[token] = function(el) {
      if (id !== null && el.id != id)
        return false;
      if (tagName !== null && el.tagName != tagName)
        return false;
      for (var i = 0; i < classes.length; i++)
        if (el.classNames.indexOf(classes[i]) === -1)
          return false;
      return true;
    };
  });
  // Index of the next compound selector that has to match.
  this.nextMatch = 0;
}
// Members of Matcher, installed through the global `$declare` helper (apollojs).
// The matcher keeps a pointer (`nextMatch`) into its list of per-level match functions.
$declare(Matcher, {
/**
* Trying to advance match pointer
* @param {HTMLElement} el element to make the match
* @return {bool} true when pointer advanced.
*/
advance: function(el) {
// Only the match function at the current pointer position is tried.
if (this.nextMatch < this.matchers.length &&
this.matchers[this.nextMatch](el)) {
this.nextMatch++;
return true;
}
return false;
},
/**
* Rewind the match pointer
*/
rewind: function() {
// No lower bound is enforced; callers pair each rewind with an earlier successful advance.
this.nextMatch--;
},
/**
* Trying to determine if match made.
* @return {bool} true when the match is made
*/
get matched() {
return this.nextMatch == this.matchers.length;
},
/**
* Reset match pointer to the first match function.
*/
reset: function() {
this.nextMatch = 0;
}
});
// Member attached to Matcher via the global `$define` helper (apollojs).
$define(Matcher, {
/**
* flush cache to free memory
* Replaces the shared match-function cache with a new empty object.
*/
flushCache: function() {
pMatchFunctionCache = {};
}
});
// Matches either an HTML comment or a tag. For tags the groups are:
// 1 = '/' on a closing tag, 2 = tag name, 3 = raw attribute text, 4 = '/' of a self-closing tag.
// Global flag: `parse` drives it with exec() and reads/writes its lastIndex.
var kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][a-z0-9]*)\s*([^>]*?)(\/?)>/ig;
// Extracts only the id and class attributes; the value is in group 3 (double quoted),
// 4 (single quoted) or 5 (unquoted).
var kAttributePattern = /\b(id|class)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
// Tags that `parse` closes immediately after opening them.
var kSelfClosingElements = {
meta: true,
img: true,
link: true,
input: true,
area: true,
br: true,
hr: true
};
// key = currently open tag, value = opening tags that close it first.
var kElementsClosedByOpening = {
li: {li: true},
p: {p: true, div: true},
td: {td: true, th: true},
th: {td: true, th: true}
};
// key = currently open tag, value = closing tags that also close it.
var kElementsClosedByClosing = {
li: {ul: true, ol: true},
a: {div: true},
b: {div: true},
i: {div: true},
p: {div: true},
td: {tr: true, table: true},
th: {tr: true, table: true}
};
// Tags whose content `parse` does not scan for markup; it skips to the matching close tag.
var kBlockTextElements = {
script: true,
noscript: true,
style: true,
pre: true
};
/**
* Parses HTML and returns a root element
*/
// Public API: the node classes, the Matcher and the `parse` entry point.
module.exports = {
Matcher: Matcher,
Node: Node,
HTMLElement: HTMLElement,
TextNode: TextNode,
/**
* Parse a chunk of HTML source.
* @param {string} data html
* @param {Object} [options] flags read below: `lowerCaseTagName`, plus one flag per
* block text tag name (looked up as options[tagName]) to keep that tag's text content
* @return {HTMLElement} root element
*/
parse: function(data, options) {
// Synthetic root with a null tag name; it is never popped because no tag name equals null.
var root = new HTMLElement(null, {});
var currentParent = root;
var stack = [root];
// End offset of the previous markup match; -1 until the first match.
// NOTE(review): with -1 as the start value, text before the first tag is not emitted,
// and nothing after the loop emits text following the last tag — confirm intent.
var lastTextPos = -1;
options = options || {};
for (var match, text; match = kMarkupPattern.exec(data); ) {
if (lastTextPos > -1) {
if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) {
// if has content
text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length);
currentParent.appendChild(new TextNode(text));
}
}
lastTextPos = kMarkupPattern.lastIndex;
if (match[0][1] == '!') {
// this is a comment
continue;
}
if (options.lowerCaseTagName)
match[2] = match[2].toLowerCase();
if (!match[1]) {
// not </ tags
var attrs = {};
for (var attMatch; attMatch = kAttributePattern.exec(match[3]); )
attrs[attMatch[1]] = attMatch[3] || attMatch[4] || attMatch[5];
// console.log(attrs);
// Implicit close: e.g. an open <li> is closed by the next <li>.
if (!match[4] && kElementsClosedByOpening[currentParent.tagName]) {
if (kElementsClosedByOpening[currentParent.tagName][match[2]]) {
stack.pop();
currentParent = stack.back;
}
}
currentParent = currentParent.appendChild(
new HTMLElement(match[2], attrs, match[3]));
stack.push(currentParent);
if (kBlockTextElements[match[2]]) {
// a little test to find next </script> or </style> ...
var closeMarkup = '</' + match[2] + '>';
var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
// The content is only kept as a text node when the matching option flag is set.
if (options[match[2]]) {
if (index == -1) {
// there is no matching ending for the text element.
text = data.substr(kMarkupPattern.lastIndex);
} else {
text = data.substring(kMarkupPattern.lastIndex, index);
}
if (text.length > 0)
currentParent.appendChild(new TextNode(text));
}
if (index == -1) {
// No close tag: move the scan position past the end so the next exec() fails.
lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
} else {
// Skip over the content and the close tag, then fall into the closing logic below
// by marking this match as a closing tag.
lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length;
match[1] = true;
}
}
}
if (match[1] || match[4] ||
kSelfClosingElements[match[2]]) {
// </ or /> or <br> etc.
while (true) {
if (currentParent.tagName == match[2]) {
stack.pop();
currentParent = stack.back;
break;
} else {
// Trying to close current tag, and move on
if (kElementsClosedByClosing[currentParent.tagName]) {
if (kElementsClosedByClosing[currentParent.tagName][match[2]]) {
stack.pop();
currentParent = stack.back;
continue;
}
}
// Use aggressive strategy to handle unmatching markups.
// i.e. an unmatched closing tag is ignored.
break;
}
}
}
}
return root;
}
};

View file

@ -1,43 +1,53 @@
{
"name": "fast-html-parser",
"version": "1.0.1",
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.",
"main": "index.js",
"scripts": {
"test": "mocha",
"posttest": "mocha -R travis-cov",
"coverage": "mocha -R html-cov > coverage.html"
},
"author": "Xiaoyi Shi <ashi009@gmail.com>",
"license": "MIT",
"dependencies": {
"apollojs": "^1.3.0",
"entities": "^1.1.1"
},
"devDependencies": {
"mocha": "^1",
"should": "*",
"blanket": "*",
"travis-cov": "*"
},
"config": {
"blanket": {
"pattern": "index.js",
"data-cover-never": ["node_modules"]
},
"travis-cov": {
"threshold": 70
}
},
"directories": {
"test": "test"
},
"repository": {
"type": "git",
"url": "https://github.com/ashi009/node-fast-html-parser.git"
},
"bugs": {
"url": "https://github.com/ashi009/node-fast-html-parser/issues"
},
"homepage": "https://github.com/ashi009/node-fast-html-parser"
"name": "fast-html-parser",
"version": "1.0.1",
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.",
"main": "index.js",
"scripts": {
"test": "gulp && mocha",
"posttest": "mocha -R travis-cov",
"coverage": "mocha -R html-cov > coverage.html",
"build": "gulp"
},
"author": "Xiaoyi Shi <ashi009@gmail.com>",
"license": "MIT",
"dependencies": {
"entities": "latest"
},
"devDependencies": {
"@types/entities": "latest",
"@types/node": "latest",
"blanket": "latest",
"del": "latest",
"gulp": "latest",
"gulp-sequence": "latest",
"gulp-typescript": "latest",
"mocha": "latest",
"should": "latest",
"spec": "latest",
"travis-cov": "latest",
"typescript": "next"
},
"config": {
"blanket": {
"pattern": "./dist/index.js",
"data-cover-never": [
"node_modules"
]
},
"travis-cov": {
"threshold": 70
}
},
"directories": {
"test": "test"
},
"repository": {
"type": "git",
"url": "https://github.com/ashi009/node-fast-html-parser.git"
},
"bugs": {
"url": "https://github.com/ashi009/node-fast-html-parser/issues"
},
"homepage": "https://github.com/ashi009/node-fast-html-parser"
}

655
src/index.ts Normal file
View file

@ -0,0 +1,655 @@
import * as entities from 'entities';
/**
 * Kinds of node produced by this parser; each node's `nodeType` field holds one of these.
 */
export enum NodeType {
ELEMENT_NODE = 1,
TEXT_NODE = 3
}
/**
* Node Class as base class for TextNode and HTMLElement.
*/
export abstract class Node {
  /** Discriminates element nodes from text nodes. */
  nodeType: NodeType;
  /** Child nodes in document order; starts out empty. */
  childNodes = [] as Node[];
  /** Text content with entities decoded. */
  text: string;
  /** Text content exactly as written in the source (entities still escaped). */
  rawText: string;
  /**
   * Serialize this node.
   * Declared with the primitive `string` type (not the boxed `String` wrapper
   * object type), which is what every implementation actually returns.
   * @return {string} serialized node
   */
  abstract toString(): string;
}
/**
* TextNode to contain a text element in DOM tree.
* @param {string} value [description]
*/
export class TextNode extends Node {
  /**
   * @param {string} value text exactly as it appears in the source (entities still escaped)
   */
  constructor(value: string) {
    super();
    this.rawText = value;
  }
  /**
   * Node Type declaration.
   * @type {Number}
   */
  nodeType = NodeType.TEXT_NODE;
  /**
   * Get unescaped text value of current node.
   * @return {string} text content with HTML5 entities decoded
   */
  get text() {
    return entities.decodeHTML5(this.rawText);
  }
  /**
   * Detect if the node contains only white space.
   * `rawText` is entity-escaped, hence the explicit `&nbsp;` alternative.
   * @return {bool}
   */
  get isWhitespace() {
    return /^(\s|&nbsp;)*$/.test(this.rawText);
  }
  /**
   * Serialize this node as markup.
   * Returns the raw (still escaped) text: HTMLElement builds innerHTML / outerHTML
   * from the children's toString(), and decoding entities here would turn e.g.
   * `&lt;b&gt;` into a real `<b>` tag in the serialized output.
   * @return {string} escaped text as found in the source
   */
  toString() {
    return this.rawText;
  }
}
// Tag names that `structuredText` treats as block boundaries: entering or leaving one of
// these elements starts a new output line (when the current line already has content).
const kBlockElements = {
div: true,
p: true,
// ul: true,
// ol: true,
li: true,
// table: true,
// tr: true,
td: true,
section: true,
br: true
};
/**
 * The two attributes the parser extracts eagerly while building the tree;
 * passed to the HTMLElement constructor.
 */
export interface KeyAttributes {
id?: string;
class?: string;
}
/**
 * Attribute name to value map with entities decoded (see HTMLElement#attributes).
 */
export interface Attributes {
[key: string]: string;
}
/**
 * Attribute name to value map exactly as written in the source (see HTMLElement#rawAttributes).
 */
export interface RawAttributes {
[key: string]: string;
}
/**
 * Last element of an array.
 * @param {T[]} arr array to inspect
 * @return {T} the final element, or undefined when the array is empty
 */
function arr_back<T>(arr: T[]) {
  const lastIndex = arr.length - 1;
  return arr[lastIndex];
}
/**
* HTMLElement, which contains a set of children.
*
* Note: this is a minimalist implementation, no complete tree
* structure provided (no parentNode, nextSibling,
* previousSibling etc).
* @class HTMLElement
* @extends {Node}
*/
export class HTMLElement extends Node {
  /** Cache for the `attributes` getter. */
  private _attrs: Attributes;
  /** Cache for the `rawAttributes` getter. */
  private _rawAttrs: RawAttributes;
  /** Value of the id attribute; left unset when the element has none. */
  id: string;
  /** Tokens of the class attribute; empty when the element has none. */
  classNames = [] as string[];
  /** Tag name as given to the constructor (null for the synthetic root created by parse). */
  tagName: string;
  /** Attribute text exactly as it appeared inside the opening tag. */
  rawAttrs: string;
  /**
   * Node Type declaration.
   * @type {Number}
   */
  nodeType = NodeType.ELEMENT_NODE;
  /**
   * Creates an instance of HTMLElement.
   * @param {string} name tagName
   * @param {KeyAttributes} keyAttrs id and class attribute
   * @param {string} [rawAttrs] attributes in string
   *
   * @memberof HTMLElement
   */
  constructor(name: string, keyAttrs: KeyAttributes, rawAttrs?: string) {
    super();
    this.tagName = name;
    this.rawAttrs = rawAttrs || '';
    // this.parentNode = null;
    this.childNodes = [];
    if (keyAttrs.id) {
      this.id = keyAttrs.id;
    }
    if (keyAttrs.class) {
      // Drop the empty tokens produced by leading/trailing whitespace (class=" a ").
      this.classNames = keyAttrs.class.split(/\s+/).filter((name) => name.length > 0);
    }
  }
  /**
   * Get escaped (as-is) text value of current node and its children.
   * @return {string} text content
   */
  get rawText() {
    let res = '';
    for (let i = 0; i < this.childNodes.length; i++) {
      res += this.childNodes[i].rawText;
    }
    return res;
  }
  /**
   * Get unescaped text value of current node and its children.
   * @return {string} text content
   */
  get text() {
    return entities.decodeHTML5(this.rawText);
  }
  /**
   * Get structured Text (with '\n' etc.)
   *
   * Text is collected into blocks, one per output line. Entering or leaving a
   * block element (see kBlockElements) starts a new block when the current one
   * already holds text. Whitespace-only text nodes are not emitted directly;
   * they cause a single space to be put in front of the next text of the same block.
   * @return {string} structured text
   */
  get structuredText() {
    let currentBlock = [] as string[];
    const blocks = [currentBlock];
    // True when a whitespace-only node was seen and the next text of the current
    // block must be prefixed with one space. Belongs to the current block only.
    let pendingSpace = false;
    const startNewBlock = () => {
      if (currentBlock.length > 0) {
        blocks.push(currentBlock = []);
        pendingSpace = false;
      }
    };
    const visit = (node: Node) => {
      if (node.nodeType === NodeType.ELEMENT_NODE) {
        // Own-property check so tag names like "constructor" are not taken for block elements.
        const isBlock = Object.prototype.hasOwnProperty.call(
          kBlockElements, String((node as HTMLElement).tagName));
        if (isBlock) {
          startNewBlock();
        }
        node.childNodes.forEach((child) => visit(child));
        if (isBlock) {
          startNewBlock();
        }
      } else if (node.nodeType === NodeType.TEXT_NODE) {
        if ((node as TextNode).isWhitespace) {
          // Whitespace node, postponed output
          pendingSpace = true;
        } else {
          let text = node.text;
          if (pendingSpace) {
            text = ' ' + text;
            pendingSpace = false;
          }
          currentBlock.push(text);
        }
      }
    };
    visit(this);
    return blocks
      .map((block) => {
        // Normalize each line's whitespace
        return block.join('').trim().replace(/\s{2,}/g, ' ');
      })
      .join('\n').replace(/\s+$/, ''); // trimRight
  }
  /**
   * Serialize this element and its subtree as markup.
   * `meta` is written as an unclosed tag, the other void tags listed below as
   * `<tag />`, and everything else as `<tag>…</tag>`. An element without a tag
   * name (the parse root) serializes to its children only.
   * @return {string} markup
   */
  toString() {
    const tag = this.tagName;
    if (!tag) {
      return this.innerHTML;
    }
    const attrs = this.rawAttrs ? ' ' + this.rawAttrs : '';
    if (/^meta$/i.test(tag)) {
      return `<${tag}${attrs}>`;
    }
    if (/^(img|br|hr|area|base|input|doctype|link)$/i.test(tag)) {
      return `<${tag}${attrs} />`;
    }
    return `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
  }
  /**
   * Markup of all children, concatenated in order.
   * @return {string} markup
   */
  get innerHTML() {
    return this.childNodes.map((child) => {
      return child.toString();
    }).join('');
  }
  /**
   * Markup of this element including its own tag; same as toString().
   * @return {string} markup
   */
  get outerHTML() {
    return this.toString();
  }
  /**
   * Trim element from right (in block) after seeing pattern in a TextNode.
   * The matching text node keeps only the text before the match and all of its
   * following siblings are removed.
   * @param {RegExp} pattern pattern to find
   * @return {HTMLElement} reference to current node
   */
  trimRight(pattern: RegExp) {
    function dfs(node: Node) {
      for (let i = 0; i < node.childNodes.length; i++) {
        const childNode = node.childNodes[i];
        if (childNode.nodeType === NodeType.ELEMENT_NODE) {
          dfs(childNode);
        } else {
          const index = childNode.rawText.search(pattern);
          if (index > -1) {
            childNode.rawText = childNode.rawText.substr(0, index);
            // trim all following nodes (this also ends the loop).
            node.childNodes.length = i + 1;
          }
        }
      }
    }
    dfs(this);
    return this;
  }
  /**
   * Get DOM structure: one line per element (`tag#id.class1.class2`) or
   * non-whitespace text node (`#text`), indented by depth.
   * @return {string} structure
   */
  get structure() {
    const res = [] as string[];
    let indention = 0;
    function write(str: string) {
      res.push(' '.repeat(indention) + str);
    }
    function dfs(node: HTMLElement) {
      const idStr = node.id ? ('#' + node.id) : '';
      const classStr = node.classNames.length ? ('.' + node.classNames.join('.')) : '';
      write(node.tagName + idStr + classStr);
      indention++;
      for (let i = 0; i < node.childNodes.length; i++) {
        const childNode = node.childNodes[i];
        if (childNode.nodeType === NodeType.ELEMENT_NODE) {
          dfs(childNode as HTMLElement);
        } else if (childNode.nodeType === NodeType.TEXT_NODE) {
          if (!(childNode as TextNode).isWhitespace) {
            write('#text');
          }
        }
      }
      indention--;
    }
    dfs(this);
    return res.join('\n');
  }
  /**
   * Remove whitespaces in this sub tree: whitespace-only text nodes are dropped,
   * remaining text nodes are trimmed, element children are processed recursively.
   * @return {HTMLElement} pointer to this
   */
  removeWhitespace() {
    // In-place compaction: `o` is the write position for kept nodes.
    let o = 0;
    for (let i = 0; i < this.childNodes.length; i++) {
      const node = this.childNodes[i];
      if (node.nodeType === NodeType.TEXT_NODE) {
        if ((node as TextNode).isWhitespace) {
          continue;
        }
        node.rawText = node.rawText.trim();
      } else if (node.nodeType === NodeType.ELEMENT_NODE) {
        (node as HTMLElement).removeWhitespace();
      }
      this.childNodes[o++] = node;
    }
    this.childNodes.length = o;
    return this;
  }
  /**
   * Query CSS selector to find matching nodes.
   * Descendants of an element that already matched are not searched.
   * @param {string} selector Simplified CSS selector
   * @param {Matcher} selector A Matcher instance
   * @return {HTMLElement[]} matching elements
   */
  querySelectorAll(selector: string | Matcher): HTMLElement[] {
    let matcher: Matcher;
    if (selector instanceof Matcher) {
      matcher = selector;
      matcher.reset();
    } else {
      matcher = new Matcher(selector);
    }
    const res = [] as HTMLElement[];
    // Explicit DFS stack; each entry is
    // [node, index of the next child to visit, matcher advanced for this node].
    const stack = [] as [Node, number, boolean][];
    for (let i = 0; i < this.childNodes.length; i++) {
      stack.push([this.childNodes[i], 0, false]);
      while (stack.length) {
        const state = arr_back(stack);
        const el = state[0];
        if (state[1] === 0) {
          // Seen for first time.
          if (el.nodeType !== NodeType.ELEMENT_NODE) {
            stack.pop();
            continue;
          }
          state[2] = matcher.advance(el);
          if (state[2] && matcher.matched) {
            res.push(el as HTMLElement);
            // no need to go further.
            matcher.rewind();
            stack.pop();
            continue;
          }
        }
        if (state[1] < el.childNodes.length) {
          stack.push([el.childNodes[state[1]++], 0, false]);
        } else {
          // Leaving the node: undo the advance it caused, if any.
          if (state[2]) {
            matcher.rewind();
          }
          stack.pop();
        }
      }
    }
    return res;
  }
  /**
   * Query CSS Selector to find matching node.
   * @param {string} selector Simplified CSS selector
   * @param {Matcher} selector A Matcher instance
   * @return {HTMLElement} first matching node in document order, or null
   */
  querySelector(selector: string | Matcher): HTMLElement | null {
    let matcher: Matcher;
    if (selector instanceof Matcher) {
      matcher = selector;
      matcher.reset();
    } else {
      matcher = new Matcher(selector);
    }
    // Same traversal as querySelectorAll, returning on the first full match.
    const stack = [] as [Node, number, boolean][];
    for (let i = 0; i < this.childNodes.length; i++) {
      stack.push([this.childNodes[i], 0, false]);
      while (stack.length) {
        const state = arr_back(stack);
        const el = state[0];
        if (state[1] === 0) {
          // Seen for first time.
          if (el.nodeType !== NodeType.ELEMENT_NODE) {
            stack.pop();
            continue;
          }
          state[2] = matcher.advance(el);
          if (state[2] && matcher.matched) {
            return el as HTMLElement;
          }
        }
        if (state[1] < el.childNodes.length) {
          stack.push([el.childNodes[state[1]++], 0, false]);
        } else {
          if (state[2]) {
            matcher.rewind();
          }
          stack.pop();
        }
      }
    }
    return null;
  }
  /**
   * Append a child node to childNodes
   * @param {Node} node node to append
   * @return {Node} node appended
   */
  appendChild(node: Node) {
    // node.parentNode = this;
    this.childNodes.push(node);
    return node;
  }
  /**
   * Get first child node
   * @return {Node} first child node (undefined when there are no children)
   */
  get firstChild() {
    return this.childNodes[0];
  }
  /**
   * Get last child node
   * @return {Node} last child node (undefined when there are no children)
   */
  get lastChild() {
    return arr_back(this.childNodes);
  }
  /**
   * Get attributes. Computed on first access and cached.
   * @return {Object} parsed and unescaped attributes
   */
  get attributes() {
    if (this._attrs) {
      return this._attrs;
    }
    this._attrs = {};
    const attrs = this.rawAttributes;
    for (const key in attrs) {
      this._attrs[key] = entities.decodeHTML5(attrs[key]);
    }
    return this._attrs;
  }
  /**
   * Get escaped (as-is) attributes. Computed on first access and cached.
   * Values may be double quoted, single quoted or unquoted; quoted values may be
   * empty (`alt=""` yields an empty string rather than the two quote characters).
   * @return {Object} parsed attributes
   */
  get rawAttributes() {
    if (this._rawAttrs) {
      return this._rawAttrs;
    }
    const attrs = {} as RawAttributes;
    if (this.rawAttrs) {
      const re = /\b([a-z][a-z0-9\-]*)\s*=\s*("([^"]*)"|'([^']*)'|(\S+))/ig;
      let match: RegExpExecArray;
      while (match = re.exec(this.rawAttrs)) {
        // Exactly one of groups 3/4/5 took part in the match; an empty quoted value
        // is a defined empty string, so test for undefined instead of truthiness.
        if (match[3] !== undefined) {
          attrs[match[1]] = match[3];
        } else if (match[4] !== undefined) {
          attrs[match[1]] = match[4];
        } else {
          attrs[match[1]] = match[5];
        }
      }
    }
    this._rawAttrs = attrs;
    return attrs;
  }
}
/**
 * Predicate testing a single node against one level of a selector.
 * NOTE(review): the name looks like a typo of "MatcherFunction"; it is referenced
 * by the cache below and by the Matcher class, so a rename has to cover all of them.
 */
interface MatherFunction {
(el: Node): boolean;
}
/**
* Cache to store generated match functions, keyed by the selector fragment
* they were generated from.
* @type {Object}
*/
let pMatchFunctionCache = {} as { [name: string]: MatherFunction };
/**
* Matcher class to make CSS match
*
* @class Matcher
*/
export class Matcher {
  /** One match function per whitespace separated selector level, in order. */
  private matchers: MatherFunction[];
  /** Index of the next level that has to match. */
  private nextMatch = 0;
  /**
   * Creates an instance of Matcher.
   *
   * The selector is a whitespace separated list of compound selectors, each of
   * the form `tag`, `#id`, `*` or an empty head, optionally followed by `.class`
   * parts (e.g. `div.a.b`, `#main`, `.item`). Runs of whitespace, including
   * leading and trailing whitespace, separate levels without producing empty
   * ones; an empty or all-whitespace selector keeps its historical meaning of a
   * single match-anything level.
   * @param {string} selector
   *
   * @memberof Matcher
   */
  constructor(selector: string) {
    let tokens = selector.split(/\s+/).filter((token) => token.length > 0);
    if (tokens.length === 0) {
      tokens = ['*'];
    }
    this.matchers = tokens.map((token) => {
      // Own-property check: a plain `cache[token]` lookup would return inherited
      // members such as `constructor` or `toString` as if they were matchers.
      if (Object.prototype.hasOwnProperty.call(pMatchFunctionCache, token)) {
        return pMatchFunctionCache[token];
      }
      const parts = token.split('.');
      const head = parts[0];
      const classes = parts.slice(1).sort();
      let id: string = null;
      let tagName: string = null;
      if (head && head !== '*') {
        if (head[0] === '#') {
          id = head.substr(1);
        } else {
          tagName = head;
        }
      }
      // A closure replaces the former `new Function` source generation; the
      // checks and their order are unchanged.
      const fn: MatherFunction = (node) => {
        const el = node as HTMLElement;
        if (id !== null && el.id !== id) {
          return false;
        }
        if (tagName !== null && el.tagName !== tagName) {
          return false;
        }
        for (let i = 0; i < classes.length; i++) {
          if (el.classNames.indexOf(classes[i]) === -1) {
            return false;
          }
        }
        return true;
      };
      return pMatchFunctionCache[token] = fn;
    });
  }
  /**
   * Trying to advance match pointer
   * @param {HTMLElement} el element to make the match
   * @return {bool} true when pointer advanced.
   */
  advance(el: Node) {
    if (this.nextMatch < this.matchers.length &&
      this.matchers[this.nextMatch](el)) {
      this.nextMatch++;
      return true;
    }
    return false;
  }
  /**
   * Rewind the match pointer by one level. Callers pair each rewind with an
   * earlier successful advance; no lower bound is enforced here.
   */
  rewind() {
    this.nextMatch--;
  }
  /**
   * Trying to determine if match made.
   * @return {bool} true when the match is made
   */
  get matched() {
    return this.nextMatch === this.matchers.length;
  }
  /**
   * Reset match pointer to the first level.
   */
  reset() {
    this.nextMatch = 0;
  }
  /**
   * flush cache to free memory
   */
  flushCache() {
    pMatchFunctionCache = {};
  }
  /**
   * Static form of flushCache. The legacy JavaScript implementation registered
   * flushCache on `Matcher` itself (apparently as a static member), so
   * `Matcher.flushCache()` is offered alongside the instance method.
   */
  static flushCache() {
    pMatchFunctionCache = {};
  }
}
// Matches either an HTML comment or a tag. For tags the groups are:
// 1 = '/' on a closing tag, 2 = tag name, 3 = raw attribute text, 4 = '/' of a self-closing tag.
// Global flag: `parse` drives it with exec() and reads/writes its lastIndex.
const kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][a-z0-9]*)\s*([^>]*?)(\/?)>/ig;
// Extracts only the id and class attributes; the value is in group 3 (double quoted),
// 4 (single quoted) or 5 (unquoted).
const kAttributePattern = /\b(id|class)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
// Tags that `parse` closes immediately after opening them.
const kSelfClosingElements = {
meta: true,
img: true,
link: true,
input: true,
area: true,
br: true,
hr: true
};
// key = currently open tag, value = opening tags that close it first.
const kElementsClosedByOpening = {
li: { li: true },
p: { p: true, div: true },
td: { td: true, th: true },
th: { td: true, th: true }
};
// key = currently open tag, value = closing tags that also close it.
const kElementsClosedByClosing = {
li: { ul: true, ol: true },
a: { div: true },
b: { div: true },
i: { div: true },
p: { div: true },
td: { tr: true, table: true },
th: { tr: true, table: true }
};
// Tags whose content `parse` does not scan for markup; it skips to the matching close tag.
const kBlockTextElements = {
script: true,
noscript: true,
style: true,
pre: true
};
/**
* Parses HTML and returns a root element
* Parse a chuck of HTML source.
* @param {string} data html
* @return {HTMLElement} root element
*/
export function parse(data: string, options?: {
  /** Convert every tag name to lower case before it is stored and compared. */
  lowerCaseTagName?: boolean;
  /** Keep the text content of <script> elements as a text node. */
  script?: boolean;
  /** Keep the text content of <noscript> elements as a text node. */
  noscript?: boolean;
  /** Keep the text content of <style> elements as a text node. */
  style?: boolean;
  /** Keep the text content of <pre> elements as a text node. */
  pre?: boolean;
}) {
  // Own-property lookup: tag names come from the document, and a plain `table[tag]`
  // would find inherited members for tags such as <constructor> or <toString>.
  const lookup = (table: any, key: string): any => {
    if (table != null && key != null && Object.prototype.hasOwnProperty.call(table, key)) {
      return table[key];
    }
    return undefined;
  };
  const opts = (options || {}) as any;
  // Synthetic root with a null tag name; it is never popped because no tag name equals null.
  const root = new HTMLElement(null, {});
  let currentParent = root;
  const stack = [root];
  const closeCurrent = () => {
    stack.pop();
    currentParent = arr_back(stack);
  };
  // End offset of the previous markup match; -1 until the first match, so text in
  // front of the first tag is not emitted (unchanged historical behaviour).
  let lastTextPos = -1;
  // The pattern is a shared global regex; make sure scanning starts at the beginning.
  kMarkupPattern.lastIndex = 0;
  let match: RegExpExecArray;
  while (match = kMarkupPattern.exec(data)) {
    const markupStart = kMarkupPattern.lastIndex - match[0].length;
    if (lastTextPos > -1 && lastTextPos < markupStart) {
      // Text between the previous markup and this one.
      currentParent.appendChild(new TextNode(data.substring(lastTextPos, markupStart)));
    }
    lastTextPos = kMarkupPattern.lastIndex;
    if (match[0][1] === '!') {
      // this is a comment
      continue;
    }
    const tagName = opts.lowerCaseTagName ? match[2].toLowerCase() : match[2];
    const isClosingTag = !!match[1];
    const isSelfClosed = !!match[4];
    // </ or /> or <br> etc.
    let closeNow = isClosingTag || isSelfClosed || !!lookup(kSelfClosingElements, tagName);
    if (!isClosingTag) {
      // Collect id / class. The pattern is case-insensitive, so normalise the key:
      // HTMLElement only reads the lower case `id` and `class` properties.
      const keyAttrs = {} as KeyAttributes;
      kAttributePattern.lastIndex = 0;
      let attMatch: RegExpExecArray;
      while (attMatch = kAttributePattern.exec(match[3])) {
        (keyAttrs as any)[attMatch[1].toLowerCase()] = attMatch[3] || attMatch[4] || attMatch[5];
      }
      // Implicit close: e.g. an open <li> is closed by the next <li>.
      if (!isSelfClosed &&
        lookup(lookup(kElementsClosedByOpening, currentParent.tagName), tagName)) {
        closeCurrent();
      }
      currentParent = currentParent.appendChild(
        new HTMLElement(tagName, keyAttrs, match[3])) as HTMLElement;
      stack.push(currentParent);
      if (lookup(kBlockTextElements, tagName)) {
        // a little test to find next </script> or </style> ...
        const closeMarkup = '</' + tagName + '>';
        const contentStart = kMarkupPattern.lastIndex;
        const index = data.indexOf(closeMarkup, contentStart);
        // The content is only kept when the option named after the tag is set.
        if (opts[tagName]) {
          // Without a matching close tag the rest of the input is the content.
          const text = index === -1
            ? data.substr(contentStart)
            : data.substring(contentStart, index);
          if (text.length > 0) {
            currentParent.appendChild(new TextNode(text));
          }
        }
        if (index === -1) {
          // No close tag: move the scan position past the end so the next exec() fails.
          lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
        } else {
          // Skip the content and the close tag, then close the element below.
          lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length;
          closeNow = true;
        }
      }
    }
    if (closeNow) {
      while (true) {
        if (currentParent.tagName === tagName) {
          closeCurrent();
          break;
        }
        // Trying to close current tag, and move on
        if (lookup(lookup(kElementsClosedByClosing, currentParent.tagName), tagName)) {
          closeCurrent();
          continue;
        }
        // Use aggressive strategy to handle unmatching markups:
        // an unmatched closing tag is ignored.
        break;
      }
    }
  }
  return root;
}

11
t.html Normal file
View file

@ -0,0 +1,11 @@
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title>Document</title>
</head>
<body>
</body>
</html>

View file

@ -2,254 +2,261 @@ var should = require('should');
var fs = require('fs');
var util = require('util');
var HTMLParser = require('../');
var HTMLParser = require('../dist');
describe('HTML Parser', function() {
describe('HTML Parser', function () {
var Matcher = HTMLParser.Matcher;
var HTMLElement = HTMLParser.HTMLElement;
var TextNode = HTMLParser.TextNode;
var Matcher = HTMLParser.Matcher;
var HTMLElement = HTMLParser.HTMLElement;
var TextNode = HTMLParser.TextNode;
describe('Matcher', function() {
describe('Matcher', function () {
it('should match corrent elements', function() {
it('should match corrent elements', function () {
var matcher = new Matcher('#id .a a.b *.a.b .a.b * a');
var MatchesNothingButStarEl = new HTMLElement('_', {});
var withIdEl = new HTMLElement('p', { id: 'id' });
var withClassNameEl = new HTMLElement('a', { class: 'a b' });
var matcher = new Matcher('#id .a a.b *.a.b .a.b * a');
var MatchesNothingButStarEl = new HTMLElement('_', {});
var withIdEl = new HTMLElement('p', { id: 'id' });
var withClassNameEl = new HTMLElement('a', { class: 'a b' });
// console.log(util.inspect([withIdEl, withClassNameEl], {
// showHidden: true,
// depth: null
// }));
// console.log(util.inspect([withIdEl, withClassNameEl], {
// showHidden: true,
// depth: null
// }));
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // #id
matcher.advance(withClassNameEl).should.not.be.ok; // #id
matcher.advance(withIdEl).should.be.ok; // #id
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // #id
matcher.advance(withClassNameEl).should.not.be.ok; // #id
matcher.advance(withIdEl).should.be.ok; // #id
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a
matcher.advance(withIdEl).should.not.be.ok; // .a
matcher.advance(withClassNameEl).should.be.ok; // .a
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a
matcher.advance(withIdEl).should.not.be.ok; // .a
matcher.advance(withClassNameEl).should.be.ok; // .a
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a.b
matcher.advance(withIdEl).should.not.be.ok; // a.b
matcher.advance(withClassNameEl).should.be.ok; // a.b
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a.b
matcher.advance(withIdEl).should.not.be.ok; // a.b
matcher.advance(withClassNameEl).should.be.ok; // a.b
matcher.advance(withIdEl).should.not.be.ok; // *.a.b
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // *.a.b
matcher.advance(withClassNameEl).should.be.ok; // *.a.b
matcher.advance(withIdEl).should.not.be.ok; // *.a.b
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // *.a.b
matcher.advance(withClassNameEl).should.be.ok; // *.a.b
matcher.advance(withIdEl).should.not.be.ok; // .a.b
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a.b
matcher.advance(withClassNameEl).should.be.ok; // .a.b
matcher.advance(withIdEl).should.not.be.ok; // .a.b
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a.b
matcher.advance(withClassNameEl).should.be.ok; // .a.b
matcher.advance(withIdEl).should.be.ok; // *
matcher.rewind();
matcher.advance(MatchesNothingButStarEl).should.be.ok; // *
matcher.rewind();
matcher.advance(withClassNameEl).should.be.ok; // *
matcher.advance(withIdEl).should.be.ok; // *
matcher.rewind();
matcher.advance(MatchesNothingButStarEl).should.be.ok; // *
matcher.rewind();
matcher.advance(withClassNameEl).should.be.ok; // *
matcher.advance(withIdEl).should.not.be.ok; // a
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a
matcher.advance(withClassNameEl).should.be.ok; // a
matcher.advance(withIdEl).should.not.be.ok; // a
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a
matcher.advance(withClassNameEl).should.be.ok; // a
matcher.matched.should.be.ok;
matcher.matched.should.be.ok;
});
});
});
});
var parseHTML = HTMLParser.parse;
var parseHTML = HTMLParser.parse;
describe('parse()', function() {
describe('parse()', function () {
it('should parse "<p id=\\"id\\"><a class=\'cls\'>Hello</a><ul><li><li></ul><span></span></p>" and return root element', function() {
it('should parse "<p id=\\"id\\"><a class=\'cls\'>Hello</a><ul><li><li></ul><span></span></p>" and return root element', function () {
var root = parseHTML('<p id="id"><a class=\'cls\'>Hello</a><ul><li><li></ul><span></span></p>');
var root = parseHTML('<p id="id"><a class=\'cls\'>Hello</a><ul><li><li></ul><span></span></p>');
var p = new HTMLElement('p', { id: 'id' }, 'id="id"');
p.appendChild(new HTMLElement('a', { class: 'cls' }, 'class=\'cls\''))
.appendChild(new TextNode('Hello'));
var ul = p.appendChild(new HTMLElement('ul', {}, ''));
ul.appendChild(new HTMLElement('li', {}, ''));
ul.appendChild(new HTMLElement('li', {}, ''));
p.appendChild(new HTMLElement('span', {}, ''));
var p = new HTMLElement('p', { id: 'id' }, 'id="id"');
p.appendChild(new HTMLElement('a', { class: 'cls' }, 'class=\'cls\''))
.appendChild(new TextNode('Hello'));
var ul = p.appendChild(new HTMLElement('ul', {}, ''));
ul.appendChild(new HTMLElement('li', {}, ''));
ul.appendChild(new HTMLElement('li', {}, ''));
p.appendChild(new HTMLElement('span', {}, ''));
root.firstChild.should.eql(p);
root.firstChild.should.eql(p);
});
});
it('should parse "<DIV><a><img/></A><p></P></div>" and return root element', function() {
it('should parse "<DIV><a><img/></A><p></P></div>" and return root element', function () {
var root = parseHTML('<DIV><a><img/></A><p></P></div>', {
lowerCaseTagName: true
});
var root = parseHTML('<DIV><a><img/></A><p></P></div>', {
lowerCaseTagName: true
});
var div = new HTMLElement('div', {}, '');
var a = div.appendChild(new HTMLElement('a', {}, ''));
var img = a.appendChild(new HTMLElement('img', {}, ''));
var p = div.appendChild(new HTMLElement('p', {}, ''));
var div = new HTMLElement('div', {}, '');
var a = div.appendChild(new HTMLElement('a', {}, ''));
var img = a.appendChild(new HTMLElement('img', {}, ''));
var p = div.appendChild(new HTMLElement('p', {}, ''));
root.firstChild.should.eql(div);
root.firstChild.should.eql(div);
});
});
it('should parse "<div><a><img/></a><p></p></div>" and return root element', function() {
it('should parse "<div><a><img/></a><p></p></div>" and return root element', function () {
var root = parseHTML('<div><a><img/></a><p></p></div>');
var root = parseHTML('<div><a><img/></a><p></p></div>');
var div = new HTMLElement('div', {}, '');
var a = div.appendChild(new HTMLElement('a', {}, ''));
var img = a.appendChild(new HTMLElement('img', {}, ''));
var p = div.appendChild(new HTMLElement('p', {}, ''));
var div = new HTMLElement('div', {}, '');
var a = div.appendChild(new HTMLElement('a', {}, ''));
var img = a.appendChild(new HTMLElement('img', {}, ''));
var p = div.appendChild(new HTMLElement('p', {}, ''));
root.firstChild.should.eql(div);
root.firstChild.should.eql(div);
});
});
it('should not extract text in script and style by default', function() {
it('should not extract text in script and style by default', function () {
var root = parseHTML('<script>1</script><style>2</style>');
var root = parseHTML('<script>1</script><style>2</style>');
root.firstChild.childNodes.should.be.empty;
root.lastChild.childNodes.should.be.empty;
root.firstChild.childNodes.should.be.empty;
root.lastChild.childNodes.should.be.empty;
});
});
it('should extract text in script and style when ask so', function() {
it('should extract text in script and style when ask so', function () {
var root = parseHTML('<script>1</script><style>2&amp;</style>', {
script: true,
style: true
});
var root = parseHTML('<script>1</script><style>2&amp;</style>', {
script: true,
style: true
});
root.firstChild.childNodes.should.not.be.empty;
root.firstChild.childNodes.should.eql([new TextNode('1')]);
root.firstChild.text.should.eql('1');
root.lastChild.childNodes.should.not.be.empty;
root.lastChild.childNodes.should.eql([new TextNode('2&amp;')]);
root.lastChild.text.should.eql('2&');
root.lastChild.rawText.should.eql('2&amp;');
});
root.firstChild.childNodes.should.not.be.empty;
root.firstChild.childNodes.should.eql([new TextNode('1')]);
root.firstChild.text.should.eql('1');
root.lastChild.childNodes.should.not.be.empty;
root.lastChild.childNodes.should.eql([new TextNode('2&amp;')]);
root.lastChild.text.should.eql('2&');
root.lastChild.rawText.should.eql('2&amp;');
});
it('should be able to parse "html/incomplete-script" file', function() {
it('should be able to parse "html/incomplete-script" file', function () {
var root = parseHTML(fs.readFileSync(__dirname + '/html/incomplete-script').toString(), {
script: true
});
var root = parseHTML(fs.readFileSync(__dirname + '/html/incomplete-script').toString(), {
script: true
});
});
});
it('should parse "<div><a><img/></a><p></p></div>.." very fast', function() {
it('should parse "<div><a><img/></a><p></p></div>.." very fast', function () {
for (var i = 0; i < 100; i++)
parseHTML('<div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div>');
for (var i = 0; i < 100; i++)
parseHTML('<div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div>');
});
});
it('should parse "<DIV><a><img/></A><p></P></div>.." fast', function() {
it('should parse "<DIV><a><img/></A><p></P></div>.." fast', function () {
for (var i = 0; i < 100; i++)
parseHTML('<DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div>', {
lowerCaseTagName: true
});
for (var i = 0; i < 100; i++)
parseHTML('<DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div>', {
lowerCaseTagName: true
});
});
});
});
});
describe('TextNode', function() {
describe('TextNode', function () {
describe('#isWhitespace', function() {
var node = new TextNode('');
node.isWhitespace.should.be.ok;
node = new TextNode(' \t');
node.isWhitespace.should.be.ok;
node = new TextNode(' \t&nbsp; \t');
node.isWhitespace.should.be.ok;
});
describe('#isWhitespace', function () {
  // The assertions live inside an `it` so that mocha registers them as a
  // test case and reports a failure as a failed test; code placed directly
  // in a `describe` body runs while the suite is being collected.
  it('should be truthy for empty, blank and &nbsp;-only text', function () {
    // Empty string.
    var node = new TextNode('');
    node.isWhitespace.should.be.ok;
    // Plain spaces and tabs.
    node = new TextNode(' \t');
    node.isWhitespace.should.be.ok;
    // Whitespace mixed with the &nbsp; entity.
    node = new TextNode(' \t&nbsp; \t');
    node.isWhitespace.should.be.ok;
  });
});
});
});
describe('HTMLElement', function() {
describe('HTMLElement', function () {
describe('#removeWhitespace()', function() {
describe('#removeWhitespace()', function () {
it('should remove whitespaces while preserving nodes with content', function() {
it('should remove whitespaces while preserving nodes with content', function () {
var root = parseHTML('<p> \r \n \t <h5> 123 </h5></p>');
var root = parseHTML('<p> \r \n \t <h5> 123 </h5></p>');
var p = new HTMLElement('p', {}, '');
p.appendChild(new HTMLElement('h5', {}, ''))
.appendChild(new TextNode('123'));
var p = new HTMLElement('p', {}, '');
p.appendChild(new HTMLElement('h5', {}, ''))
.appendChild(new TextNode('123'));
root.firstChild.removeWhitespace().should.eql(p);
root.firstChild.removeWhitespace().should.eql(p);
});
});
});
});
describe('#rawAttributes', function() {
describe('#rawAttributes', function () {
it('should return escaped attributes of the element', function() {
it('should return escaped attributes of the element', function () {
var root = parseHTML('<p a=12 data-id="!$$&amp;" yAz=\'1\'></p>');
var root = parseHTML('<p a=12 data-id="!$$&amp;" yAz=\'1\'></p>');
root.firstChild.rawAttributes.should.eql({
'a': '12',
'data-id': '!$$&amp;',
'yAz': '1'
});
root.firstChild.rawAttributes.should.eql({
'a': '12',
'data-id': '!$$&amp;',
'yAz': '1'
});
});
});
});
});
describe('#attributes', function() {
describe('#attributes', function () {
it('should return attributes of the element', function() {
it('should return attributes of the element', function () {
var root = parseHTML('<p a=12 data-id="!$$&amp;" yAz=\'1\'></p>');
var root = parseHTML('<p a=12 data-id="!$$&amp;" yAz=\'1\'></p>');
root.firstChild.attributes.should.eql({
'a': '12',
'data-id': '!$$&',
'yAz': '1'
});
root.firstChild.attributes.should.eql({
'a': '12',
'data-id': '!$$&',
'yAz': '1'
});
});
});
});
});
describe('#querySelectorAll()', function() {
describe('#querySelectorAll()', function () {
it('should return correct elements in DOM tree', function() {
it('should return correct elements in DOM tree', function () {
var root = parseHTML('<a id="id"><div><span class="a b"></span><span></span><span></span></div></a>');
var root = parseHTML('<a id="id"><div><span class="a b"></span><span></span><span></span></div></a>');
root.querySelectorAll('#id').should.eql([root.firstChild]);
root.querySelectorAll('span.a').should.eql([root.firstChild.firstChild.firstChild]);
root.querySelectorAll('span.b').should.eql([root.firstChild.firstChild.firstChild]);
root.querySelectorAll('span.a.b').should.eql([root.firstChild.firstChild.firstChild]);
root.querySelectorAll('#id .b').should.eql([root.firstChild.firstChild.firstChild]);
root.querySelectorAll('#id span').should.eql(root.firstChild.firstChild.childNodes);
root.querySelectorAll('#id').should.eql([root.firstChild]);
root.querySelectorAll('span.a').should.eql([root.firstChild.firstChild.firstChild]);
root.querySelectorAll('span.b').should.eql([root.firstChild.firstChild.firstChild]);
root.querySelectorAll('span.a.b').should.eql([root.firstChild.firstChild.firstChild]);
root.querySelectorAll('#id .b').should.eql([root.firstChild.firstChild.firstChild]);
root.querySelectorAll('#id span').should.eql(root.firstChild.firstChild.childNodes);
});
});
});
});
describe('#structuredText', function() {
describe('#structuredText', function () {
it('should return correct structured text', function() {
it('should return correct structured text', function () {
var root = parseHTML('<span>o<p>a</p><p>b</p>c</span>');
root.structuredText.should.eql('o\na\nb\nc');
var root = parseHTML('<span>o<p>a</p><p>b</p>c</span>');
root.structuredText.should.eql('o\na\nb\nc');
});
});
});
});
});
});
describe('stringify', function () {
  describe('toString', function () {
    // Wrapped in `it` so the round-trip check is registered and reported as
    // a test case; previously it executed during suite collection, where a
    // failure would abort loading of the whole test file.
    it('should serialize the parsed tree back to the original markup', function () {
      const html = '<p id="id" data-feidao-actions="ssss"><a class=\'cls\'>Hello</a><ul><li>aaaaa</li></ul><span>bbb</span></p>';
      const root = parseHTML(html);
      root.toString().should.eql(html);
    });
  });
});
});

78
tsconfig.json Normal file
View file

@ -0,0 +1,78 @@
{
// TypeScript project configuration: compiles the sources matched by
// "include" (./src/**/*.ts) to CommonJS modules written to ./dist.
// Previously emitted output is excluded from compilation.
"exclude": [
"./dist/"
],
"include": [
"./src/**/*.ts"
],
"compilerOptions": {
"module": "commonjs",
"target": "esnext",
"noImplicitAny": true,
"sourceMap": false,
"emitDecoratorMetadata": true,
"experimentalDecorators": true,
"strictNullChecks": false,
"noImplicitThis": true,
"rootDir": "./src/",
// NOTE(review): ./tests/ is listed here but is not matched by "include"
// above — confirm whether this entry is still needed.
"rootDirs": [
"./src/",
"./tests/"
],
"allowJs": false,
"allowUnreachableCode": false,
"allowUnusedLabels": false,
"alwaysStrict": true,
"baseUrl": "",
"charset": "utf8",
// Emit .d.ts declaration files together with the generated JavaScript.
"declaration": true,
// "declarationDir": "./dts/",
"inlineSourceMap": false,
"allowSyntheticDefaultImports": false,
"diagnostics": false,
"emitBOM": false,
"forceConsistentCasingInFileNames": false,
"importHelpers": false,
"inlineSources": false,
"isolatedModules": false,
"lib": [
// "es6",
"esnext"
],
// Verbose build logging: both file listings are switched on (non-default).
"listFiles": true, // default false
"listEmittedFiles": true, // default false
// Language used for compiler diagnostic messages.
"locale": "zh_CN",
// Line ending written to emitted files.
"newLine": "CRLF",
"noEmit": false,
"moduleResolution": "node",
"noEmitHelpers": false,
// Output is still written when the compiler reports errors.
"noEmitOnError": false,
"noImplicitReturns": false,
"noImplicitUseStrict": false,
"maxNodeModuleJsDepth": 0,
"noLib": false,
// Output directory; gulpfile.js reads this value via tsProject.options.outDir.
"outDir": "./dist",
// "outFile": "./dist/tqf",
"noFallthroughCasesInSwitch": false,
"noResolve": false,
"noUnusedLocals": false,
"noUnusedParameters": false,
"paths": {},
"preserveConstEnums": false,
"pretty": true,
// "mapRoot": "",
"removeComments": false,
"skipDefaultLibCheck": true, // default false
"skipLibCheck": true, // default false
"stripInternal": false,
"suppressExcessPropertyErrors": false,
"suppressImplicitAnyIndexErrors": true, // default false
"traceResolution": true, // default false
// typeRoots is left empty and only the "node" typings are listed below —
// presumably to keep other @types packages out of the build; TODO confirm.
"typeRoots": [
],
"types": [
"node"
],
"watch": false
}
}

1502
yarn.lock Normal file

File diff suppressed because it is too large Load diff