mirror of
https://github.com/danbulant/node-html-parser
synced 2026-06-15 04:31:20 +00:00
Rewrite in TypeScript, Add method toString, Attribute innerHTML\ outerHTML
This commit is contained in:
parent
787f00e359
commit
e73b44c7dc
10 changed files with 2516 additions and 814 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -19,3 +19,4 @@ bower_components
|
|||
!.gitignore
|
||||
|
||||
*.sublime-*
|
||||
dist/
|
||||
|
|
@ -1,4 +1,3 @@
|
|||
language: node_js
|
||||
node_js:
|
||||
- "0.11"
|
||||
- "0.10"
|
||||
- node
|
||||
|
|
|
|||
46
gulpfile.js
Normal file
46
gulpfile.js
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
const gulp = require('gulp');
|
||||
|
||||
// Remove the ./dist/ build output; whatever `del` returns is handed back to gulp
// so it can tell when the task is done.
gulp.task('clean', () => require('del')('./dist/'));
|
||||
|
||||
// Compile the project described by ./tsconfig.json and write the result to the
// `outDir` configured there.
gulp.task('compile-ts', () => {
  const project = require('gulp-typescript').createProject('./tsconfig.json');
  const outputDir = project.options.outDir;
  const compiled = project.src().pipe(project());
  return compiled.pipe(gulp.dest(outputDir));
});
|
||||
|
||||
// Copy the non-compiled files that ship with the package into ./dist/.
gulp.task('copy-files', () => {
  const filesToShip = ['./package.json', 'readme.md'];
  return gulp.src(filesToShip).pipe(gulp.dest('./dist/'));
});
|
||||
|
||||
/**
 * Compile the whole project once, then recompile individual .ts files under
 * ./src/ whenever they change.
 *
 * The task resolves with the watcher returned by `gulp.watch`.
 */
gulp.task('watch-ts', async () => {
  const ts = require('gulp-typescript');
  const path = require('path');
  const tsProject = ts.createProject('./tsconfig.json');
  const dest = tsProject.options.outDir;

  // A stream is not a thenable, so `await stream` continues immediately without
  // waiting for the compile. Wait explicitly for the destination stream to
  // finish (or for either stage to fail) before starting the watcher.
  await new Promise((resolve, reject) => {
    tsProject.src()
      .pipe(tsProject())
      .on('error', reject)
      .pipe(gulp.dest(dest))
      .on('error', reject)
      .on('finish', resolve);
  });

  return gulp.watch(['./src/**/*.ts'], (file) => {
    // A fresh project is created for every change, as in the original task.
    const changedProject = ts.createProject('./tsconfig.json');
    // Mirror the changed file's directory (relative to the repo root) under outDir.
    const relative = path.relative('./', path.dirname(file.path));
    const changedDest = path.join(changedProject.options.outDir, relative);
    return gulp.src(file.path)
      .pipe(changedProject())
      .pipe(gulp.dest(changedDest));
  });
});
|
||||
|
||||
// Full build: clean, copy the static files, then compile, one after another.
// `done` is passed to gulp-sequence as the completion callback.
gulp.task('default', (done) => {
  const runInOrder = require('gulp-sequence');
  runInOrder('clean', 'copy-files', 'compile-ts', done);
});

// `dev` has no body of its own; it only depends on the watch task.
gulp.task('dev', ['watch-ts']);
|
||||
607
index.js
607
index.js
|
|
@ -1,607 +0,0 @@
|
|||
require('apollojs');
|
||||
|
||||
var entities = require('entities');
|
||||
|
||||
/**
 * Base type for TextNode and HTMLElement. It holds no state of its own.
 */
function Node() {}

// No members are declared on the base type.
$declare(Node, {});

// Node-type constants, read elsewhere in this file as Node.ELEMENT_NODE and
// Node.TEXT_NODE.
$defenum(Node, {
  ELEMENT_NODE: 1,
  TEXT_NODE: 3
});
|
||||
|
||||
/**
 * TextNode holds one run of text in the DOM tree.
 * @param {string} value raw text as it appears in the source (entities not decoded)
 */
function TextNode(value) {
  this.rawText = value;
}
$inherit(TextNode, Node, {

  /**
   * Node type marker.
   * @type {Number}
   */
  nodeType: Node.TEXT_NODE,

  /**
   * Text of this node with HTML entities decoded.
   * @return {string} text content
   */
  get text() {
    return entities.decodeHTML5(this.rawText);
  },

  /**
   * Detect whether the node contains only white space.
   *
   * `rawText` is still escaped, so a non-breaking space written as the
   * `&nbsp;` entity has to be matched literally. The previous pattern had a
   * second alternative that only duplicated `\s`; that missed the entity and
   * made the alternation ambiguous, which backtracks exponentially on a long
   * whitespace run followed by a non-space character.
   * @return {bool} true when the node is empty or whitespace-only
   */
  get isWhitespace() {
    return /^(\s|&nbsp;)*$/.test(this.rawText);
  }

});
|
||||
|
||||
// Tags that `structuredText` treats as block-level: each one starts and ends a
// line of output. ul, ol, table and tr are not listed (they were commented out
// in the original table), so they do not break lines by themselves.
var kBlockElements = {
  div: true,
  p: true,
  li: true,
  td: true,
  section: true,
  br: true
};
|
||||
|
||||
/**
 * HTMLElement, which contains a set of children.
 * Note: this is a minimalist implementation, no complete tree
 *       structure provided (no parentNode, nextSibling,
 *       previousSibling etc).
 * @param {string} name     tagName
 * @param {Object} keyAttrs id and class attribute values
 * @param {string} [rawAttrs] the attribute text of the tag, kept verbatim
 */
function HTMLElement(name, keyAttrs, rawAttrs) {
  this.tagName = name;
  this.rawAttrs = rawAttrs || '';
  // this.parentNode = null;
  this.childNodes = [];
  // `id` is only assigned when the attribute is present and non-empty.
  if (keyAttrs.id)
    this.id = keyAttrs.id;
  // Split on whitespace runs and drop empty pieces, so that a value such as
  // " a  b " yields ['a', 'b'] rather than ['', 'a', 'b', ''].
  this.classNames = keyAttrs.class
    ? keyAttrs.class.split(/\s+/).filter(Boolean)
    : [];
}
|
||||
$inherit(HTMLElement, Node, {
|
||||
|
||||
/**
|
||||
* Node Type declaration.
|
||||
* @type {Number}
|
||||
*/
|
||||
nodeType: Node.ELEMENT_NODE,
|
||||
|
||||
/**
|
||||
* Get unescaped text value of current node and its children.
|
||||
* @return {string} text content
|
||||
*/
|
||||
get text() {
|
||||
return entities.decodeHTML5(this.rawText);
|
||||
},
|
||||
|
||||
/**
|
||||
* Get escpaed (as-it) text value of current node and its children.
|
||||
* @return {string} text content
|
||||
*/
|
||||
get rawText() {
|
||||
var res = '';
|
||||
for (var i = 0; i < this.childNodes.length; i++)
|
||||
res += this.childNodes[i].rawText;
|
||||
return res;
|
||||
},
|
||||
|
||||
/**
|
||||
* Get structured Text (with '\n' etc.)
|
||||
* @return {string} structured text
|
||||
*/
|
||||
get structuredText() {
|
||||
var currentBlock = [];
|
||||
var blocks = [currentBlock];
|
||||
function dfs(node) {
|
||||
if (node.nodeType === Node.ELEMENT_NODE) {
|
||||
if (kBlockElements[node.tagName]) {
|
||||
if (currentBlock.length > 0)
|
||||
blocks.push(currentBlock = []);
|
||||
node.childNodes.forEach(dfs);
|
||||
if (currentBlock.length > 0)
|
||||
blocks.push(currentBlock = []);
|
||||
} else {
|
||||
node.childNodes.forEach(dfs);
|
||||
}
|
||||
} else if (node.nodeType === Node.TEXT_NODE) {
|
||||
if (node.isWhitespace) {
|
||||
// Whitespace node, postponed output
|
||||
currentBlock.prependWhitespace = true;
|
||||
} else {
|
||||
var text = node.text;
|
||||
if (currentBlock.prependWhitespace) {
|
||||
text = ' ' + text;
|
||||
currentBlock.prependWhitespace = false;
|
||||
}
|
||||
currentBlock.push(text);
|
||||
}
|
||||
}
|
||||
}
|
||||
dfs(this);
|
||||
return blocks
|
||||
.map(function(block) {
|
||||
// Normalize each line's whitespace
|
||||
return block.join('').trim().replace(/\s{2,}/g, ' ');
|
||||
})
|
||||
.join('\n').trimRight();
|
||||
},
|
||||
|
||||
/**
|
||||
* Trim element from right (in block) after seeing pattern in a TextNode.
|
||||
* @param {RegExp} pattern pattern to find
|
||||
* @return {HTMLElement} reference to current node
|
||||
*/
|
||||
trimRight: function(pattern) {
|
||||
function dfs(node) {
|
||||
for (var i = 0; i < node.childNodes.length; i++) {
|
||||
var childNode = node.childNodes[i];
|
||||
if (childNode.nodeType === Node.ELEMENT_NODE) {
|
||||
dfs(childNode);
|
||||
} else {
|
||||
var index = childNode.rawText.search(pattern);
|
||||
if (index > -1) {
|
||||
childNode.rawText = childNode.rawText.substr(0, index);
|
||||
// trim all following nodes.
|
||||
node.childNodes.length = i+1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dfs(this);
|
||||
return this;
|
||||
},
|
||||
|
||||
/**
|
||||
* Get DOM structure
|
||||
* @return {string} strucutre
|
||||
*/
|
||||
get structure() {
|
||||
var res = [];
|
||||
var indention = 0;
|
||||
function write(str) {
|
||||
res.push(' '.repeat(indention) + str);
|
||||
}
|
||||
function dfs(node) {
|
||||
var idStr = node.id ? ('#' + node.id) : '';
|
||||
var classStr = node.classNames.length ? ('.' + node.classNames.join('.')) : '';
|
||||
write(node.tagName + idStr + classStr);
|
||||
indention++;
|
||||
for (var i = 0; i < node.childNodes.length; i++) {
|
||||
var childNode = node.childNodes[i];
|
||||
if (childNode.nodeType === Node.ELEMENT_NODE) {
|
||||
dfs(childNode);
|
||||
} else if (childNode.nodeType === Node.TEXT_NODE) {
|
||||
if (!childNode.isWhitespace)
|
||||
write('#text');
|
||||
}
|
||||
}
|
||||
indention--;
|
||||
}
|
||||
dfs(this);
|
||||
return res.join('\n');
|
||||
},
|
||||
|
||||
/**
|
||||
* Remove whitespaces in this sub tree.
|
||||
* @return {HTMLElement} pointer to this
|
||||
*/
|
||||
removeWhitespace: function() {
|
||||
var i = 0, o = 0;
|
||||
for (; i < this.childNodes.length; i++) {
|
||||
var node = this.childNodes[i];
|
||||
if (node.nodeType === Node.TEXT_NODE) {
|
||||
if (node.isWhitespace)
|
||||
continue;
|
||||
node.rawText = node.rawText.trim();
|
||||
} else if (node.nodeType === Node.ELEMENT_NODE) {
|
||||
node.removeWhitespace();
|
||||
}
|
||||
this.childNodes[o++] = node;
|
||||
}
|
||||
this.childNodes.length = o;
|
||||
return this;
|
||||
},
|
||||
|
||||
/**
|
||||
* Query CSS selector to find matching nodes.
|
||||
* @param {string} selector Simplified CSS selector
|
||||
* @param {Matcher} selector A Matcher instance
|
||||
* @return {HTMLElement[]} matching elements
|
||||
*/
|
||||
querySelectorAll: function(selector) {
|
||||
var matcher;
|
||||
if (selector instanceof Matcher) {
|
||||
matcher = selector;
|
||||
matcher.reset();
|
||||
} else {
|
||||
matcher = new Matcher(selector);
|
||||
}
|
||||
var res = [];
|
||||
var stack = [];
|
||||
for (var i = 0; i < this.childNodes.length; i++) {
|
||||
stack.push([this.childNodes[i], 0, false]);
|
||||
while (stack.length) {
|
||||
var state = stack.back;
|
||||
var el = state[0];
|
||||
if (state[1] === 0) {
|
||||
// Seen for first time.
|
||||
if (el.nodeType !== Node.ELEMENT_NODE) {
|
||||
stack.pop();
|
||||
continue;
|
||||
}
|
||||
if (state[2] = matcher.advance(el)) {
|
||||
if (matcher.matched) {
|
||||
res.push(el);
|
||||
// no need to go further.
|
||||
matcher.rewind();
|
||||
stack.pop();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (state[1] < el.childNodes.length) {
|
||||
stack.push([el.childNodes[state[1]++], 0, false]);
|
||||
} else {
|
||||
if (state[2])
|
||||
matcher.rewind();
|
||||
stack.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
return res;
|
||||
},
|
||||
|
||||
/**
|
||||
* Query CSS Selector to find matching node.
|
||||
* @param {string} selector Simplified CSS selector
|
||||
* @param {Matcher} selector A Matcher instance
|
||||
* @return {HTMLElement} matching node
|
||||
*/
|
||||
querySelector: function(selector) {
|
||||
var matcher;
|
||||
if (selector instanceof Matcher) {
|
||||
matcher = selector;
|
||||
matcher.reset();
|
||||
} else {
|
||||
matcher = new Matcher(selector);
|
||||
}
|
||||
var stack = [];
|
||||
for (var i = 0; i < this.childNodes.length; i++) {
|
||||
stack.push([this.childNodes[i], 0, false]);
|
||||
while (stack.length) {
|
||||
var state = stack.back;
|
||||
var el = state[0];
|
||||
if (state[1] === 0) {
|
||||
// Seen for first time.
|
||||
if (el.nodeType !== Node.ELEMENT_NODE) {
|
||||
stack.pop();
|
||||
continue;
|
||||
}
|
||||
if (state[2] = matcher.advance(el)) {
|
||||
if (matcher.matched) {
|
||||
return el;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (state[1] < el.childNodes.length) {
|
||||
stack.push([el.childNodes[state[1]++], 0, false]);
|
||||
} else {
|
||||
if (state[2])
|
||||
matcher.rewind();
|
||||
stack.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
},
|
||||
|
||||
/**
|
||||
* Append a child node to childNodes
|
||||
* @param {Node} node node to append
|
||||
* @return {Node} node appended
|
||||
*/
|
||||
appendChild: function(node) {
|
||||
// node.parentNode = this;
|
||||
this.childNodes.push(node);
|
||||
return node;
|
||||
},
|
||||
|
||||
/**
|
||||
* Get first child node
|
||||
* @return {Node} first child node
|
||||
*/
|
||||
get firstChild() {
|
||||
return this.childNodes.front;
|
||||
},
|
||||
|
||||
/**
|
||||
* Get last child node
|
||||
* @return {Node} last child node
|
||||
*/
|
||||
get lastChild() {
|
||||
return this.childNodes.back;
|
||||
},
|
||||
|
||||
/**
|
||||
* Get attributes
|
||||
* @return {Object} parsed and unescaped attributes
|
||||
*/
|
||||
get attributes() {
|
||||
if (this._attrs)
|
||||
return this._attrs;
|
||||
this._attrs = {};
|
||||
var attrs = this.rawAttributes;
|
||||
for (var key in attrs) {
|
||||
this._attrs[key] = entities.decodeHTML5(attrs[key]);
|
||||
}
|
||||
return this._attrs;
|
||||
},
|
||||
|
||||
/**
|
||||
* Get escaped (as-it) attributes
|
||||
* @return {Object} parsed attributes
|
||||
*/
|
||||
get rawAttributes() {
|
||||
if (this._rawAttrs)
|
||||
return this._rawAttrs;
|
||||
var attrs = {};
|
||||
if (this.rawAttrs) {
|
||||
var re = /\b([a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
|
||||
for (var match; match = re.exec(this.rawAttrs); )
|
||||
attrs[match[1]] = match[3] || match[4] || match[5];
|
||||
}
|
||||
this._rawAttrs = attrs;
|
||||
return attrs;
|
||||
}
|
||||
|
||||
});
|
||||
$define(HTMLElement, {
  // Passes every child of `el` to $wrap: children with a truthy `rawText` are
  // wrapped as TextNode, all others as HTMLElement.
  __wrap: function(el) {
    el.childNodes.forEach(function(child) {
      $wrap(child, child.rawText ? TextNode : HTMLElement);
    });
  }
});
|
||||
|
||||
/**
 * Cache of generated match functions, keyed by the text of one selector step.
 * @type {Object}
 */
var pMatchFunctionCache = {};

/**
 * Matcher class to make CSS match.
 *
 * The selector is split on whitespace into steps. Each step is `tag`, `#id` or
 * `*`, optionally followed by `.class` parts, and is compiled to a predicate
 * stored in `matchers`. `nextMatch` is the index of the step to try next.
 * @param {string} selector Selector
 */
function Matcher(selector) {
  // Trim and split on whitespace runs: a plain split(' ') turned every extra
  // space into an empty step, and an empty step matches any element.
  this.matchers = selector.trim().split(/\s+/).map(function(matcher) {
    // Own-property check: a bare `cache[key]` lookup also finds inherited
    // members such as `constructor` and would return them as the predicate.
    if (Object.prototype.hasOwnProperty.call(pMatchFunctionCache, matcher))
      return pMatchFunctionCache[matcher];
    var parts = matcher.split('.');
    var head = parts[0];
    var classes = parts.slice(1);
    // null means "no constraint"; '' is a valid (if unusual) id to require.
    var wantedId = null;
    var wantedTag = null;
    if (head && head !== '*') {
      if (head[0] === '#')
        wantedId = head.substr(1);
      else
        wantedTag = head;
    }
    // A closure replaces the former `new Function` source-string generation.
    return pMatchFunctionCache[matcher] = function(el) {
      if (wantedId !== null && el.id !== wantedId)
        return false;
      if (wantedTag !== null && el.tagName !== wantedTag)
        return false;
      for (var i = 0; i < classes.length; i++)
        if (el.classNames.indexOf(classes[i]) === -1)
          return false;
      return true;
    };
  });
  this.nextMatch = 0;
}
|
||||
$declare(Matcher, {
  /**
   * Try the next selector step against `el` and advance the match pointer
   * when it fits.
   * @param  {HTMLElement} el element to make the match
   * @return {bool}           true when pointer advanced.
   */
  advance: function(el) {
    if (this.nextMatch >= this.matchers.length)
      return false;
    if (!this.matchers[this.nextMatch](el))
      return false;
    this.nextMatch++;
    return true;
  },
  /**
   * Rewind the match pointer by one step.
   */
  rewind: function() {
    this.nextMatch--;
  },
  /**
   * Tell whether every selector step has been matched.
   * @return {bool} true when the match is made
   */
  get matched() {
    return this.nextMatch == this.matchers.length;
  },
  /**
   * Reset the match pointer to the first selector step.
   */
  reset: function() {
    this.nextMatch = 0;
  }
});
$define(Matcher, {
  /**
   * Flush the cache of generated match functions to free memory.
   */
  flushCache: function() {
    pMatchFunctionCache = {};
  }
});
|
||||
|
||||
// Matches one comment, or one tag. For a tag the groups are:
// 1 = '/' on a closing tag, 2 = tag name, 3 = attribute text, 4 = '/' before '>'.
var kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][a-z0-9]*)\s*([^>]*?)(\/?)>/ig;

// Picks the id and class attributes out of a tag's attribute text.
// Groups: 1 = name, 3 = double-quoted value, 4 = single-quoted value,
// 5 = unquoted value. The quoted branches use `*` so an empty value (id="")
// is captured as '' instead of falling through to the unquoted branch, which
// captured the quote characters themselves.
var kAttributePattern = /\b(id|class)\s*=\s*("([^"]*)"|'([^']*)'|(\S+))/ig;

// Tags that `parse` closes immediately, even without a trailing '/'.
var kSelfClosingElements = {
  meta: true,
  img: true,
  link: true,
  input: true,
  area: true,
  br: true,
  hr: true
};

// open element -> tags whose opening closes that element first.
var kElementsClosedByOpening = {
  li: {li: true},
  p: {p: true, div: true},
  td: {td: true, th: true},
  th: {td: true, th: true}
};

// open element -> closing tags that also close that element.
var kElementsClosedByClosing = {
  li: {ul: true, ol: true},
  a: {div: true},
  b: {div: true},
  i: {div: true},
  p: {div: true},
  td: {tr: true, table: true},
  th: {tr: true, table: true}
};

// Elements whose content `parse` does not scan for markup: it skips to the
// matching close tag and keeps the content only when the option named after
// the tag is truthy.
var kBlockTextElements = {
  script: true,
  noscript: true,
  style: true,
  pre: true
};
|
||||
|
||||
/**
|
||||
* Parses HTML and returns a root element
|
||||
*/
|
||||
module.exports = {
|
||||
|
||||
Matcher: Matcher,
|
||||
Node: Node,
|
||||
HTMLElement: HTMLElement,
|
||||
TextNode: TextNode,
|
||||
|
||||
/**
|
||||
* Parse a chuck of HTML source.
|
||||
* @param {string} data html
|
||||
* @return {HTMLElement} root element
|
||||
*/
|
||||
parse: function(data, options) {
|
||||
|
||||
var root = new HTMLElement(null, {});
|
||||
var currentParent = root;
|
||||
var stack = [root];
|
||||
var lastTextPos = -1;
|
||||
|
||||
options = options || {};
|
||||
|
||||
for (var match, text; match = kMarkupPattern.exec(data); ) {
|
||||
if (lastTextPos > -1) {
|
||||
if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) {
|
||||
// if has content
|
||||
text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length);
|
||||
currentParent.appendChild(new TextNode(text));
|
||||
}
|
||||
}
|
||||
lastTextPos = kMarkupPattern.lastIndex;
|
||||
if (match[0][1] == '!') {
|
||||
// this is a comment
|
||||
continue;
|
||||
}
|
||||
if (options.lowerCaseTagName)
|
||||
match[2] = match[2].toLowerCase();
|
||||
if (!match[1]) {
|
||||
// not </ tags
|
||||
var attrs = {};
|
||||
for (var attMatch; attMatch = kAttributePattern.exec(match[3]); )
|
||||
attrs[attMatch[1]] = attMatch[3] || attMatch[4] || attMatch[5];
|
||||
// console.log(attrs);
|
||||
if (!match[4] && kElementsClosedByOpening[currentParent.tagName]) {
|
||||
if (kElementsClosedByOpening[currentParent.tagName][match[2]]) {
|
||||
stack.pop();
|
||||
currentParent = stack.back;
|
||||
}
|
||||
}
|
||||
currentParent = currentParent.appendChild(
|
||||
new HTMLElement(match[2], attrs, match[3]));
|
||||
stack.push(currentParent);
|
||||
if (kBlockTextElements[match[2]]) {
|
||||
// a little test to find next </script> or </style> ...
|
||||
var closeMarkup = '</' + match[2] + '>';
|
||||
var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
|
||||
if (options[match[2]]) {
|
||||
if (index == -1) {
|
||||
// there is no matching ending for the text element.
|
||||
text = data.substr(kMarkupPattern.lastIndex);
|
||||
} else {
|
||||
text = data.substring(kMarkupPattern.lastIndex, index);
|
||||
}
|
||||
if (text.length > 0)
|
||||
currentParent.appendChild(new TextNode(text));
|
||||
}
|
||||
if (index == -1) {
|
||||
lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
|
||||
} else {
|
||||
lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length;
|
||||
match[1] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (match[1] || match[4] ||
|
||||
kSelfClosingElements[match[2]]) {
|
||||
// </ or /> or <br> etc.
|
||||
while (true) {
|
||||
if (currentParent.tagName == match[2]) {
|
||||
stack.pop();
|
||||
currentParent = stack.back;
|
||||
break;
|
||||
} else {
|
||||
// Trying to close current tag, and move on
|
||||
if (kElementsClosedByClosing[currentParent.tagName]) {
|
||||
if (kElementsClosedByClosing[currentParent.tagName][match[2]]) {
|
||||
stack.pop();
|
||||
currentParent = stack.back;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Use aggressive strategy to handle unmatching markups.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return root;
|
||||
|
||||
}
|
||||
|
||||
};
|
||||
92
package.json
92
package.json
|
|
@ -1,43 +1,53 @@
|
|||
{
|
||||
"name": "fast-html-parser",
|
||||
"version": "1.0.1",
|
||||
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "mocha",
|
||||
"posttest": "mocha -R travis-cov",
|
||||
"coverage": "mocha -R html-cov > coverage.html"
|
||||
},
|
||||
"author": "Xiaoyi Shi <ashi009@gmail.com>",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"apollojs": "^1.3.0",
|
||||
"entities": "^1.1.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"mocha": "^1",
|
||||
"should": "*",
|
||||
"blanket": "*",
|
||||
"travis-cov": "*"
|
||||
},
|
||||
"config": {
|
||||
"blanket": {
|
||||
"pattern": "index.js",
|
||||
"data-cover-never": ["node_modules"]
|
||||
},
|
||||
"travis-cov": {
|
||||
"threshold": 70
|
||||
}
|
||||
},
|
||||
"directories": {
|
||||
"test": "test"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/ashi009/node-fast-html-parser.git"
|
||||
},
|
||||
"bugs": {
|
||||
"url": "https://github.com/ashi009/node-fast-html-parser/issues"
|
||||
},
|
||||
"homepage": "https://github.com/ashi009/node-fast-html-parser"
|
||||
"name": "fast-html-parser",
|
||||
"version": "1.0.1",
|
||||
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "gulp && mocha",
|
||||
"posttest": "mocha -R travis-cov",
|
||||
"coverage": "mocha -R html-cov > coverage.html",
|
||||
"build": "gulp"
|
||||
},
|
||||
"author": "Xiaoyi Shi <ashi009@gmail.com>",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"entities": "latest"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/entities": "latest",
|
||||
"@types/node": "latest",
|
||||
"blanket": "latest",
|
||||
"del": "latest",
|
||||
"gulp": "latest",
|
||||
"gulp-sequence": "latest",
|
||||
"gulp-typescript": "latest",
|
||||
"mocha": "latest",
|
||||
"should": "latest",
|
||||
"spec": "latest",
|
||||
"travis-cov": "latest",
|
||||
"typescript": "next"
|
||||
},
|
||||
"config": {
|
||||
"blanket": {
|
||||
"pattern": "./dist/index.js",
|
||||
"data-cover-never": [
|
||||
"node_modules"
|
||||
]
|
||||
},
|
||||
"travis-cov": {
|
||||
"threshold": 70
|
||||
}
|
||||
},
|
||||
"directories": {
|
||||
"test": "test"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/ashi009/node-fast-html-parser.git"
|
||||
},
|
||||
"bugs": {
|
||||
"url": "https://github.com/ashi009/node-fast-html-parser/issues"
|
||||
},
|
||||
"homepage": "https://github.com/ashi009/node-fast-html-parser"
|
||||
}
|
||||
|
|
|
|||
655
src/index.ts
Normal file
655
src/index.ts
Normal file
|
|
@ -0,0 +1,655 @@
|
|||
import * as entities from 'entities';
|
||||
|
||||
/**
 * Numeric node kinds stored in `Node.nodeType`.
 */
export enum NodeType {
	/** An HTMLElement. */
	ELEMENT_NODE = 1,
	/** A TextNode. */
	TEXT_NODE = 3
}
|
||||
|
||||
/**
 * Node Class as base class for TextNode and HTMLElement.
 */
export abstract class Node {
	/** Kind of node; each subclass assigns its own NodeType value. */
	nodeType: NodeType;
	/** Child nodes in document order; starts out empty. */
	childNodes = [] as Node[];
	/** Text content with HTML entities decoded. */
	text: string;
	/** Text content as written in the source (entities not decoded). */
	rawText: string;
	/**
	 * Serialize the node. Declared with the primitive `string` type rather
	 * than the boxed `String` wrapper, which is what implementations return.
	 */
	abstract toString(): string;
}
|
||||
/**
 * TextNode to contain a text element in DOM tree.
 */
export class TextNode extends Node {
	/**
	 * @param {string} value raw text as it appears in the source (entities not decoded)
	 */
	constructor(value: string) {
		super();
		this.rawText = value;
	}

	/**
	 * Node Type declaration.
	 * @type {Number}
	 */
	nodeType = NodeType.TEXT_NODE;

	/**
	 * Get unescaped text value of current node.
	 * @return {string} text content
	 */
	get text() {
		return entities.decodeHTML5(this.rawText);
	}

	/**
	 * Detect if the node contains only white space.
	 *
	 * `rawText` is still escaped, so a non-breaking space written as the
	 * `&nbsp;` entity has to be matched literally. The previous pattern had a
	 * second alternative that only duplicated `\s`; that missed the entity and
	 * made the alternation ambiguous, which backtracks exponentially on a long
	 * whitespace run followed by a non-space character.
	 * @return {bool}
	 */
	get isWhitespace() {
		return /^(\s|&nbsp;)*$/.test(this.rawText);
	}

	/**
	 * Serialize the node as HTML source.
	 *
	 * Returns the raw (escaped) text: HTMLElement builds innerHTML/outerHTML
	 * from its children's toString(), and returning the decoded text there
	 * would turn `&lt;b&gt;` in the source into live `<b>` markup.
	 * @return {string} escaped text
	 */
	toString() {
		return this.rawText;
	}
}
|
||||
|
||||
// Tags that `structuredText` treats as block-level: each one starts and ends a
// line of output. ul, ol, table and tr are not listed (they were commented out
// in the original table), so they do not break lines by themselves.
const kBlockElements = {
	div: true,
	p: true,
	li: true,
	td: true,
	section: true,
	br: true
};
|
||||
|
||||
/** The two attributes HTMLElement's constructor reads directly. */
export interface KeyAttributes {
	/** Value of the `id` attribute, when present. */
	id?: string;
	/** Value of the `class` attribute, when present (whitespace-separated names). */
	class?: string;
}

/** Attribute name to value, with HTML entities decoded. */
export interface Attributes {
	[key: string]: string;
}

/** Attribute name to value, exactly as written in the source. */
export interface RawAttributes {
	[key: string]: string;
}
|
||||
|
||||
/**
 * Return the last element of `arr`, or undefined when the array is empty.
 */
function arr_back<T>(arr: T[]) {
	const lastIndex = arr.length - 1;
	return arr[lastIndex];
}
|
||||
|
||||
/**
|
||||
* HTMLElement, which contains a set of children.
|
||||
*
|
||||
* Note: this is a minimalist implementation, no complete tree
|
||||
* structure provided (no parentNode, nextSibling,
|
||||
* previousSibling etc).
|
||||
* @class HTMLElement
|
||||
* @extends {Node}
|
||||
*/
|
||||
export class HTMLElement extends Node {
|
||||
/** Cache for the `attributes` getter. */
private _attrs: Attributes;
/** Cache for the raw attribute map. */
private _rawAttrs: RawAttributes;
/** Value of the `id` attribute; only assigned when present and non-empty. */
id: string;
/** Class names from the `class` attribute; empty when there is none. */
classNames = [] as string[];
tagName: string;
/** Attribute text of the tag, kept verbatim ('' when there is none). */
rawAttrs: string;
/**
 * Node Type declaration.
 * @type {Number}
 */
nodeType = NodeType.ELEMENT_NODE;
/**
 * Creates an instance of HTMLElement.
 * @param {string} name tagName
 * @param {KeyAttributes} keyAttrs id and class attribute
 * @param {string} [rawAttrs] attributes in string
 *
 * @memberof HTMLElement
 */
constructor(name: string, keyAttrs: KeyAttributes, rawAttrs?: string) {
	super();
	this.tagName = name;
	this.rawAttrs = rawAttrs || '';
	// this.parentNode = null;
	this.childNodes = [];
	if (keyAttrs.id) {
		this.id = keyAttrs.id;
	}
	if (keyAttrs.class) {
		// Split on whitespace runs and drop empty pieces, so that " a  b "
		// yields ['a', 'b'] rather than ['', 'a', 'b', ''].
		this.classNames = keyAttrs.class.split(/\s+/).filter(Boolean);
	}
}
|
||||
/**
 * Get escaped (as-is) text value of current node and its children: the
 * `rawText` of every child, concatenated in document order.
 * @return {string} text content
 */
get rawText() {
	return this.childNodes.reduce((soFar, child) => soFar + child.rawText, '');
}
|
||||
/**
 * Get unescaped text value of current node and its children.
 * @return {string} `rawText` with HTML entities decoded
 */
get text() {
	const escaped = this.rawText;
	return entities.decodeHTML5(escaped);
}
|
||||
/**
 * Get structured text: one output line per block-level region.
 *
 * Elements listed in kBlockElements start a new line before and after their
 * content. A whitespace-only text node is not emitted; it becomes a single
 * leading space on the next text in the same line. Each line is trimmed and
 * its whitespace runs are collapsed to one space.
 * @return {string} structured text
 */
get structuredText() {
	let line = [] as string[];
	const lines = [line];

	// Open a new line, unless the current one is still empty.
	function breakLine() {
		if (line.length > 0) {
			line = [];
			lines.push(line);
		}
	}

	function visit(node: Node) {
		if (node.nodeType === NodeType.ELEMENT_NODE) {
			const isBlock = kBlockElements[(node as HTMLElement).tagName];
			if (isBlock) {
				breakLine();
			}
			node.childNodes.forEach(visit);
			if (isBlock) {
				breakLine();
			}
			return;
		}
		if (node.nodeType !== NodeType.TEXT_NODE) {
			return;
		}
		if ((node as TextNode).isWhitespace) {
			// Whitespace node: remember it on the line, output is postponed.
			(line as any).prependWhitespace = true;
			return;
		}
		let text = node.text;
		if ((line as any).prependWhitespace) {
			text = ' ' + text;
			(line as any).prependWhitespace = false;
		}
		line.push(text);
	}

	visit(this);

	const normalized = lines.map((parts) => parts.join('').trim().replace(/\s{2,}/g, ' '));
	// Strip trailing whitespace from the joined result (trimRight).
	return normalized.join('\n').replace(/\s+$/, '');
}
|
||||
|
||||
/**
 * Serialize this element and its subtree as HTML.
 *
 * An element without a tag name (such as the parse root) serializes as its
 * children only. `meta` is written as an open tag with no closing form; the
 * tags in the self-closing list are written as `<tag ... />`; everything else
 * as `<tag ...>children</tag>`. Attributes are emitted from `rawAttrs` verbatim.
 * @return {string} HTML text
 */
toString() {
	const tag = this.tagName;
	if (!tag) {
		return this.innerHTML;
	}
	const attrs = this.rawAttrs ? ' ' + this.rawAttrs : '';
	if (/^meta$/i.test(tag)) {
		return `<${tag}${attrs}>`;
	}
	if (/^(img|br|hr|area|base|input|doctype|link)$/i.test(tag)) {
		return `<${tag}${attrs} />`;
	}
	return `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
}
|
||||
|
||||
/**
 * HTML of the children: each child's toString(), concatenated in order.
 * @return {string} serialized children
 */
get innerHTML() {
	let html = '';
	for (let i = 0; i < this.childNodes.length; i++) {
		html += this.childNodes[i].toString();
	}
	return html;
}

/**
 * HTML of this element including its own tag; same as toString().
 * @return {string} serialized element
 */
get outerHTML() {
	const html = this.toString();
	return html;
}
|
||||
|
||||
/**
 * Cut the tree after the first place `pattern` is found in a text node.
 *
 * Within each parent, the matching text node is truncated at the match and all
 * of that parent's following siblings are dropped. Element children are
 * searched recursively.
 * @param  {RegExp} pattern pattern to find
 * @return {HTMLElement}    reference to current node
 */
trimRight(pattern: RegExp) {
	function prune(parent: Node) {
		const kids = parent.childNodes;
		for (let pos = 0; pos < kids.length; pos++) {
			const kid = kids[pos];
			if (kid.nodeType === NodeType.ELEMENT_NODE) {
				prune(kid);
				continue;
			}
			const cut = kid.rawText.search(pattern);
			if (cut > -1) {
				kid.rawText = kid.rawText.substr(0, cut);
				// Shortening the array drops every following sibling and ends the loop.
				kids.length = pos + 1;
			}
		}
	}
	prune(this);
	return this;
}
|
||||
/**
 * Get an outline of the DOM structure: one line per element, written as
 * `tag#id.class1.class2`, plus a `#text` line for each text node that is not
 * whitespace-only. Children are indented one level deeper than their parent.
 * @return {string} structure
 */
get structure() {
	const lines = [] as string[];

	function emit(depth: number, label: string) {
		lines.push(' '.repeat(depth) + label);
	}

	function describe(el: HTMLElement, depth: number) {
		const idPart = el.id ? ('#' + el.id) : '';
		const classPart = el.classNames.length ? ('.' + el.classNames.join('.')) : '';
		emit(depth, el.tagName + idPart + classPart);
		el.childNodes.forEach((child) => {
			if (child.nodeType === NodeType.ELEMENT_NODE) {
				describe(child as HTMLElement, depth + 1);
			} else if (child.nodeType === NodeType.TEXT_NODE && !(child as TextNode).isWhitespace) {
				emit(depth + 1, '#text');
			}
		});
	}

	describe(this, 0);
	return lines.join('\n');
}
|
||||
|
||||
/**
 * Remove whitespace in this sub tree: whitespace-only text nodes are dropped,
 * the remaining text nodes are trimmed, and element children are processed
 * recursively. `childNodes` is compacted in place.
 * @return {HTMLElement} pointer to this
 */
removeWhitespace() {
	const kids = this.childNodes;
	let kept = 0;
	for (let pos = 0; pos < kids.length; pos++) {
		const kid = kids[pos];
		const isText = kid.nodeType === NodeType.TEXT_NODE;
		if (isText && (kid as TextNode).isWhitespace) {
			continue;
		}
		if (isText) {
			kid.rawText = kid.rawText.trim();
		} else if (kid.nodeType === NodeType.ELEMENT_NODE) {
			(kid as HTMLElement).removeWhitespace();
		}
		kids[kept++] = kid;
	}
	kids.length = kept;
	return this;
}
|
||||
|
||||
/**
 * Query CSS selector to find matching nodes.
 *
 * The subtree is walked depth-first with an explicit stack. Each stack frame is
 * [node, index of the next child to visit, whether this node advanced the
 * matcher]. A fully matched element is collected and its children are not
 * searched.
 * @param  {string}         selector Simplified CSS selector
 * @param  {Matcher}        selector A Matcher instance (it is reset first)
 * @return {HTMLElement[]}  matching elements
 */
querySelectorAll(selector: string | Matcher) {
	let matcher: Matcher;
	if (selector instanceof Matcher) {
		selector.reset();
		matcher = selector;
	} else {
		matcher = new Matcher(selector);
	}
	const found = [] as Node[];
	const frames = [] as { 0: Node; 1: 0 | 1; 2: boolean; }[];
	for (let i = 0; i < this.childNodes.length; i++) {
		frames.push([this.childNodes[i], 0, false]);
		while (frames.length) {
			const frame = arr_back(frames);
			const el = frame[0];
			if (frame[1] === 0) {
				// Seen for the first time.
				if (el.nodeType !== NodeType.ELEMENT_NODE) {
					frames.pop();
					continue;
				}
				frame[2] = matcher.advance(el);
				if (frame[2] && matcher.matched) {
					found.push(el);
					// No need to go further below a match.
					matcher.rewind();
					frames.pop();
					continue;
				}
			}
			if (frame[1] < el.childNodes.length) {
				frames.push([el.childNodes[frame[1]++], 0, false]);
			} else {
				// Leaving this element: undo the step it contributed, if any.
				if (frame[2]) {
					matcher.rewind();
				}
				frames.pop();
			}
		}
	}
	return found;
}
|
||||
|
||||
/**
 * Query CSS Selector to find the first matching node.
 *
 * Same depth-first walk as querySelectorAll, but it returns as soon as the
 * matcher reports a full match.
 * @param  {string}         selector Simplified CSS selector
 * @param  {Matcher}        selector A Matcher instance (it is reset first)
 * @return {HTMLElement}    matching node, or null when nothing matches
 */
querySelector(selector: string | Matcher) {
	let matcher: Matcher;
	if (selector instanceof Matcher) {
		selector.reset();
		matcher = selector;
	} else {
		matcher = new Matcher(selector);
	}
	// Frame layout: [node, index of next child to visit, node advanced the matcher].
	const frames = [] as { 0: Node; 1: 0 | 1; 2: boolean; }[];
	for (let i = 0; i < this.childNodes.length; i++) {
		frames.push([this.childNodes[i], 0, false]);
		while (frames.length) {
			const frame = arr_back(frames);
			const el = frame[0];
			if (frame[1] === 0) {
				// Seen for the first time.
				if (el.nodeType !== NodeType.ELEMENT_NODE) {
					frames.pop();
					continue;
				}
				frame[2] = matcher.advance(el);
				if (frame[2] && matcher.matched) {
					return el;
				}
			}
			if (frame[1] < el.childNodes.length) {
				frames.push([el.childNodes[frame[1]++], 0, false]);
			} else {
				if (frame[2]) {
					matcher.rewind();
				}
				frames.pop();
			}
		}
	}
	return null;
}
|
||||
|
||||
/**
 * Append a child node to the end of childNodes.
 *
 * Only the childNodes array is updated; no parent reference is stored on
 * the appended node.
 *
 * @param {Node} node node to append
 * @return {Node} the node that was appended (same object, same static type)
 */
appendChild<T extends Node = Node>(node: T) {
	this.childNodes.push(node);
	return node;
}
|
||||
|
||||
/**
 * First entry of childNodes.
 * @return {Node} first child node, or undefined when there are no children
 */
get firstChild() {
	const [head] = this.childNodes;
	return head;
}
|
||||
|
||||
/**
 * Last entry of childNodes, as returned by arr_back.
 * @return {Node} last child node
 */
get lastChild() {
	const children = this.childNodes;
	return arr_back(children);
}
|
||||
|
||||
/**
 * Attributes with HTML entities decoded.
 *
 * The decoded map is built from rawAttributes on first access and cached in
 * _attrs; later accesses return the cached object.
 *
 * @return {Object} parsed and unescaped attributes
 */
get attributes() {
	if (!this._attrs) {
		this._attrs = {};
		const raw = this.rawAttributes;
		for (const name in raw) {
			this._attrs[name] = entities.decodeHTML5(raw[name]);
		}
	}
	return this._attrs;
}
|
||||
|
||||
/**
 * Attributes exactly as written in the markup (entities left as-is).
 *
 * The map is parsed from the rawAttrs string on first access and cached in
 * _rawAttrs. Values may be double-quoted, single-quoted or bare; an empty
 * quoted value (a="") yields the empty string.
 *
 * @return {Object} parsed attributes
 */
get rawAttributes() {
	if (this._rawAttrs)
		return this._rawAttrs;
	const attrs = {} as RawAttributes;
	if (this.rawAttrs) {
		// Groups: 1 = name, 2 = double-quoted value, 3 = single-quoted value,
		// 4 = bare value. The quoted forms use `*` so empty values are accepted
		// instead of falling through to the bare form with the quotes included.
		const re = /\b([a-z][a-z0-9\-]*)\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))/ig;
		let match: RegExpExecArray;
		while (match = re.exec(this.rawAttrs)) {
			const doubleQuoted = match[2];
			const singleQuoted = match[3];
			// Compare against undefined rather than using `||`, so that an
			// empty quoted value is kept as ''.
			if (doubleQuoted !== undefined)
				attrs[match[1]] = doubleQuoted;
			else if (singleQuoted !== undefined)
				attrs[match[1]] = singleQuoted;
			else
				attrs[match[1]] = match[4];
		}
	}
	this._rawAttrs = attrs;
	return attrs;
}
|
||||
}
|
||||
|
||||
/**
 * Predicate compiled from one simple selector (a single space-separated
 * part such as `a.b` or `#id`).
 */
interface MatherFunction {
	(el: Node): boolean;
}

/**
 * Cache of compiled match functions, keyed by simple-selector text.
 *
 * Created without a prototype so that selector parts such as "constructor"
 * or "toString" cannot resolve to inherited Object members.
 * @type {Object}
 */
let pMatchFunctionCache = Object.create(null) as { [name: string]: MatherFunction };

/**
 * Compile one simple selector into a predicate.
 *
 * Supported forms: `tag`, `#id`, `*` or nothing, each optionally followed by
 * `.class` parts. The predicate reads `id`, `tagName` and `classNames` from
 * the element it is given.
 *
 * @param {string} token simple selector text
 * @return {MatherFunction} predicate for that selector
 */
function compileSimpleSelector(token: string): MatherFunction {
	const parts = token.split('.');
	const head = parts[0];
	const classes = parts.slice(1);
	const anyTag = !head || head == '*';
	const byId = !anyTag && head[0] == '#';
	const expected = byId ? head.substr(1) : head;
	return (el: any) => {
		// Loose comparison is deliberate: it is the comparison the previously
		// generated function source used.
		if (!anyTag && (byId ? el.id : el.tagName) != expected)
			return false;
		for (let i = 0; i < classes.length; i++) {
			if (el.classNames.indexOf(classes[i]) === -1)
				return false;
		}
		return true;
	};
}

/**
 * Matcher class to make CSS match
 *
 * Holds one predicate per space-separated selector part plus a pointer to
 * the next part that still has to be matched.
 *
 * @class Matcher
 */
export class Matcher {
	private matchers: MatherFunction[];
	private nextMatch = 0;
	/**
	 * Creates an instance of Matcher.
	 * @param {string} selector simple selectors separated by whitespace;
	 *                          runs of whitespace count as one separator
	 *
	 * @memberof Matcher
	 */
	constructor(selector: string) {
		this.matchers = selector.trim().split(/\s+/).map((token) => {
			let fn = pMatchFunctionCache[token];
			if (!fn)
				fn = pMatchFunctionCache[token] = compileSimpleSelector(token);
			return fn;
		});
	}
	/**
	 * Trying to advance match pointer
	 * @param  {HTMLElement} el element to make the match
	 * @return {bool}           true when pointer advanced.
	 */
	advance(el: Node) {
		if (this.nextMatch < this.matchers.length &&
			this.matchers[this.nextMatch](el)) {
			this.nextMatch++;
			return true;
		}
		return false;
	}
	/**
	 * Rewind the match pointer by one selector part.
	 */
	rewind() {
		this.nextMatch--;
	}
	/**
	 * Trying to determine if match made.
	 * @return {bool} true when every selector part has been matched
	 */
	get matched() {
		return this.nextMatch == this.matchers.length;
	}
	/**
	 * Reset match pointer to the first selector part.
	 */
	reset() {
		this.nextMatch = 0;
	}
	/**
	 * flush cache to free memory
	 */
	flushCache() {
		pMatchFunctionCache = Object.create(null);
	}
}
|
||||
|
||||
// Matches a comment, or an opening/closing tag.
// Groups: 1 = "/" for a closing tag, 2 = tag name, 3 = attribute text,
// 4 = "/" for a self-closed tag. Global, so `lastIndex` carries state between
// `exec` calls.
const kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][a-z0-9]*)\s*([^>]*?)(\/?)>/ig;
// Extracts only the `id` and `class` attributes from a tag's attribute text.
// Groups: 1 = name, 3 = double-quoted value, 4 = single-quoted value,
// 5 = bare value.
const kAttributePattern = /\b(id|class)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig;
// Void elements: they never have content or an end tag, so the parser closes
// them as soon as they are opened.
const kSelfClosingElements = {
	area: true,
	base: true,
	br: true,
	col: true,
	embed: true,
	hr: true,
	img: true,
	input: true,
	link: true,
	meta: true,
	source: true,
	track: true,
	wbr: true
};
// open element -> tags whose opening implicitly closes it.
const kElementsClosedByOpening = {
	li: { li: true },
	p: { p: true, div: true },
	td: { td: true, th: true },
	th: { td: true, th: true }
};
// open element -> closing tags that implicitly close it.
const kElementsClosedByClosing = {
	li: { ul: true, ol: true },
	a: { div: true },
	b: { div: true },
	i: { div: true },
	p: { div: true },
	td: { tr: true, table: true },
	th: { tr: true, table: true }
};
// Elements whose content is taken verbatim up to the matching end tag instead
// of being parsed as markup.
const kBlockTextElements = {
	script: true,
	noscript: true,
	style: true,
	pre: true
};
|
||||
|
||||
/**
 * Parse a chunk of HTML source and return a root element holding the result.
 *
 * Comments are skipped. Text between tags becomes TextNode children of the
 * element that is open at that point, including text before the first tag
 * and after the last one. Only the `id` and `class` attributes are extracted
 * eagerly; the raw attribute text is handed to each HTMLElement as well.
 *
 * @param  {string} data    html
 * @param  {Object} options optional flags:
 *                          `lowerCaseTagName` lower-cases every tag name;
 *                          `script`, `noscript`, `style`, `pre` keep the
 *                          verbatim content of that element as a TextNode
 *                          child (otherwise the content is discarded)
 * @return {HTMLElement}    root element
 */
export function parse(data: string, options?: {
	lowerCaseTagName?: boolean;
	script?: boolean;
	noscript?: boolean;
	style?: boolean;
	pre?: boolean;
}) {
	const root = new HTMLElement(null, {});
	let currentParent = root;
	const stack = [root];
	// Offset just past the last consumed markup; text between this offset and
	// the next tag belongs to the currently open element.
	let lastTextPos = 0;

	options = options || {} as any;
	// The pattern is a shared global regex: start from a known position even
	// if an earlier call was interrupted.
	kMarkupPattern.lastIndex = 0;
	let match: RegExpExecArray;
	while (match = kMarkupPattern.exec(data)) {
		if (lastTextPos < match.index) {
			// if has content
			const text = data.substring(lastTextPos, match.index);
			currentParent.appendChild(new TextNode(text));
		}
		lastTextPos = kMarkupPattern.lastIndex;
		if (match[0][1] == '!') {
			// this is a comment
			continue;
		}
		const tagName = options.lowerCaseTagName ? match[2].toLowerCase() : match[2];
		const selfClosed = !!match[4];
		// True for a closing tag, or once a block-text element has been
		// consumed together with its end tag.
		let closeNow = !!match[1];
		if (!closeNow) {
			// not </ tags
			const attrs = {} as any;
			let attMatch: RegExpExecArray;
			while (attMatch = kAttributePattern.exec(match[3]))
				attrs[attMatch[1]] = attMatch[3] || attMatch[4] || attMatch[5];
			const closedByOpening = kElementsClosedByOpening[currentParent.tagName];
			if (!selfClosed && closedByOpening && closedByOpening[tagName]) {
				// e.g. a new <li> implicitly ends the previous <li>.
				stack.pop();
				currentParent = arr_back(stack);
			}
			currentParent = currentParent.appendChild(
				new HTMLElement(tagName, attrs, match[3])) as HTMLElement;
			stack.push(currentParent);
			if (kBlockTextElements[tagName]) {
				// a little test to find next </script> or </style> ...
				const closeMarkup = `</${tagName}>`;
				const contentStart = kMarkupPattern.lastIndex;
				const index = data.indexOf(closeMarkup, contentStart);
				if ((options as any)[tagName]) {
					// When there is no matching end tag, the rest of the
					// input is the content.
					const text = index == -1
						? data.substr(contentStart)
						: data.substring(contentStart, index);
					if (text.length > 0)
						currentParent.appendChild(new TextNode(text));
				}
				if (index == -1) {
					// Past the end of input: the next exec() fails and ends the loop.
					lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
				} else {
					lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length;
					closeNow = true;
				}
			}
		}
		if (closeNow || selfClosed || kSelfClosingElements[tagName]) {
			// </ or /> or <br> etc.
			while (true) {
				if (currentParent.tagName == tagName) {
					stack.pop();
					currentParent = arr_back(stack);
					break;
				}
				// Trying to close current tag, and move on
				const closedByClosing = kElementsClosedByClosing[currentParent.tagName];
				if (closedByClosing && closedByClosing[tagName]) {
					stack.pop();
					currentParent = arr_back(stack);
					continue;
				}
				// Use aggressive strategy to handle unmatching markups.
				break;
			}
		}
	}
	if (lastTextPos < data.length) {
		// Text after the last tag (or the whole input when it has no tags).
		currentParent.appendChild(new TextNode(data.substring(lastTextPos)));
	}
	return root;
}
|
||||
11
t.html
Normal file
11
t.html
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<meta http-equiv="X-UA-Compatible" content="ie=edge">
|
||||
<title>Document</title>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
335
test/html.js
335
test/html.js
|
|
@ -2,254 +2,261 @@ var should = require('should');
|
|||
var fs = require('fs');
|
||||
var util = require('util');
|
||||
|
||||
var HTMLParser = require('../');
|
||||
var HTMLParser = require('../dist');
|
||||
|
||||
describe('HTML Parser', function() {
|
||||
describe('HTML Parser', function () {
|
||||
|
||||
var Matcher = HTMLParser.Matcher;
|
||||
var HTMLElement = HTMLParser.HTMLElement;
|
||||
var TextNode = HTMLParser.TextNode;
|
||||
var Matcher = HTMLParser.Matcher;
|
||||
var HTMLElement = HTMLParser.HTMLElement;
|
||||
var TextNode = HTMLParser.TextNode;
|
||||
|
||||
describe('Matcher', function() {
|
||||
describe('Matcher', function () {
|
||||
|
||||
it('should match corrent elements', function() {
|
||||
it('should match corrent elements', function () {
|
||||
|
||||
var matcher = new Matcher('#id .a a.b *.a.b .a.b * a');
|
||||
var MatchesNothingButStarEl = new HTMLElement('_', {});
|
||||
var withIdEl = new HTMLElement('p', { id: 'id' });
|
||||
var withClassNameEl = new HTMLElement('a', { class: 'a b' });
|
||||
var matcher = new Matcher('#id .a a.b *.a.b .a.b * a');
|
||||
var MatchesNothingButStarEl = new HTMLElement('_', {});
|
||||
var withIdEl = new HTMLElement('p', { id: 'id' });
|
||||
var withClassNameEl = new HTMLElement('a', { class: 'a b' });
|
||||
|
||||
// console.log(util.inspect([withIdEl, withClassNameEl], {
|
||||
// showHidden: true,
|
||||
// depth: null
|
||||
// }));
|
||||
// console.log(util.inspect([withIdEl, withClassNameEl], {
|
||||
// showHidden: true,
|
||||
// depth: null
|
||||
// }));
|
||||
|
||||
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // #id
|
||||
matcher.advance(withClassNameEl).should.not.be.ok; // #id
|
||||
matcher.advance(withIdEl).should.be.ok; // #id
|
||||
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // #id
|
||||
matcher.advance(withClassNameEl).should.not.be.ok; // #id
|
||||
matcher.advance(withIdEl).should.be.ok; // #id
|
||||
|
||||
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a
|
||||
matcher.advance(withIdEl).should.not.be.ok; // .a
|
||||
matcher.advance(withClassNameEl).should.be.ok; // .a
|
||||
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a
|
||||
matcher.advance(withIdEl).should.not.be.ok; // .a
|
||||
matcher.advance(withClassNameEl).should.be.ok; // .a
|
||||
|
||||
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a.b
|
||||
matcher.advance(withIdEl).should.not.be.ok; // a.b
|
||||
matcher.advance(withClassNameEl).should.be.ok; // a.b
|
||||
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a.b
|
||||
matcher.advance(withIdEl).should.not.be.ok; // a.b
|
||||
matcher.advance(withClassNameEl).should.be.ok; // a.b
|
||||
|
||||
matcher.advance(withIdEl).should.not.be.ok; // *.a.b
|
||||
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // *.a.b
|
||||
matcher.advance(withClassNameEl).should.be.ok; // *.a.b
|
||||
matcher.advance(withIdEl).should.not.be.ok; // *.a.b
|
||||
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // *.a.b
|
||||
matcher.advance(withClassNameEl).should.be.ok; // *.a.b
|
||||
|
||||
matcher.advance(withIdEl).should.not.be.ok; // .a.b
|
||||
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a.b
|
||||
matcher.advance(withClassNameEl).should.be.ok; // .a.b
|
||||
matcher.advance(withIdEl).should.not.be.ok; // .a.b
|
||||
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a.b
|
||||
matcher.advance(withClassNameEl).should.be.ok; // .a.b
|
||||
|
||||
matcher.advance(withIdEl).should.be.ok; // *
|
||||
matcher.rewind();
|
||||
matcher.advance(MatchesNothingButStarEl).should.be.ok; // *
|
||||
matcher.rewind();
|
||||
matcher.advance(withClassNameEl).should.be.ok; // *
|
||||
matcher.advance(withIdEl).should.be.ok; // *
|
||||
matcher.rewind();
|
||||
matcher.advance(MatchesNothingButStarEl).should.be.ok; // *
|
||||
matcher.rewind();
|
||||
matcher.advance(withClassNameEl).should.be.ok; // *
|
||||
|
||||
matcher.advance(withIdEl).should.not.be.ok; // a
|
||||
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a
|
||||
matcher.advance(withClassNameEl).should.be.ok; // a
|
||||
matcher.advance(withIdEl).should.not.be.ok; // a
|
||||
matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a
|
||||
matcher.advance(withClassNameEl).should.be.ok; // a
|
||||
|
||||
matcher.matched.should.be.ok;
|
||||
matcher.matched.should.be.ok;
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
var parseHTML = HTMLParser.parse;
|
||||
var parseHTML = HTMLParser.parse;
|
||||
|
||||
describe('parse()', function() {
|
||||
describe('parse()', function () {
|
||||
|
||||
it('should parse "<p id=\\"id\\"><a class=\'cls\'>Hello</a><ul><li><li></ul><span></span></p>" and return root element', function() {
|
||||
it('should parse "<p id=\\"id\\"><a class=\'cls\'>Hello</a><ul><li><li></ul><span></span></p>" and return root element', function () {
|
||||
|
||||
var root = parseHTML('<p id="id"><a class=\'cls\'>Hello</a><ul><li><li></ul><span></span></p>');
|
||||
var root = parseHTML('<p id="id"><a class=\'cls\'>Hello</a><ul><li><li></ul><span></span></p>');
|
||||
|
||||
var p = new HTMLElement('p', { id: 'id' }, 'id="id"');
|
||||
p.appendChild(new HTMLElement('a', { class: 'cls' }, 'class=\'cls\''))
|
||||
.appendChild(new TextNode('Hello'));
|
||||
var ul = p.appendChild(new HTMLElement('ul', {}, ''));
|
||||
ul.appendChild(new HTMLElement('li', {}, ''));
|
||||
ul.appendChild(new HTMLElement('li', {}, ''));
|
||||
p.appendChild(new HTMLElement('span', {}, ''));
|
||||
var p = new HTMLElement('p', { id: 'id' }, 'id="id"');
|
||||
p.appendChild(new HTMLElement('a', { class: 'cls' }, 'class=\'cls\''))
|
||||
.appendChild(new TextNode('Hello'));
|
||||
var ul = p.appendChild(new HTMLElement('ul', {}, ''));
|
||||
ul.appendChild(new HTMLElement('li', {}, ''));
|
||||
ul.appendChild(new HTMLElement('li', {}, ''));
|
||||
p.appendChild(new HTMLElement('span', {}, ''));
|
||||
|
||||
root.firstChild.should.eql(p);
|
||||
root.firstChild.should.eql(p);
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
it('should parse "<DIV><a><img/></A><p></P></div>" and return root element', function() {
|
||||
it('should parse "<DIV><a><img/></A><p></P></div>" and return root element', function () {
|
||||
|
||||
var root = parseHTML('<DIV><a><img/></A><p></P></div>', {
|
||||
lowerCaseTagName: true
|
||||
});
|
||||
var root = parseHTML('<DIV><a><img/></A><p></P></div>', {
|
||||
lowerCaseTagName: true
|
||||
});
|
||||
|
||||
var div = new HTMLElement('div', {}, '');
|
||||
var a = div.appendChild(new HTMLElement('a', {}, ''));
|
||||
var img = a.appendChild(new HTMLElement('img', {}, ''));
|
||||
var p = div.appendChild(new HTMLElement('p', {}, ''));
|
||||
var div = new HTMLElement('div', {}, '');
|
||||
var a = div.appendChild(new HTMLElement('a', {}, ''));
|
||||
var img = a.appendChild(new HTMLElement('img', {}, ''));
|
||||
var p = div.appendChild(new HTMLElement('p', {}, ''));
|
||||
|
||||
root.firstChild.should.eql(div);
|
||||
root.firstChild.should.eql(div);
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
it('should parse "<div><a><img/></a><p></p></div>" and return root element', function() {
|
||||
it('should parse "<div><a><img/></a><p></p></div>" and return root element', function () {
|
||||
|
||||
var root = parseHTML('<div><a><img/></a><p></p></div>');
|
||||
var root = parseHTML('<div><a><img/></a><p></p></div>');
|
||||
|
||||
var div = new HTMLElement('div', {}, '');
|
||||
var a = div.appendChild(new HTMLElement('a', {}, ''));
|
||||
var img = a.appendChild(new HTMLElement('img', {}, ''));
|
||||
var p = div.appendChild(new HTMLElement('p', {}, ''));
|
||||
var div = new HTMLElement('div', {}, '');
|
||||
var a = div.appendChild(new HTMLElement('a', {}, ''));
|
||||
var img = a.appendChild(new HTMLElement('img', {}, ''));
|
||||
var p = div.appendChild(new HTMLElement('p', {}, ''));
|
||||
|
||||
root.firstChild.should.eql(div);
|
||||
root.firstChild.should.eql(div);
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
it('should not extract text in script and style by default', function() {
|
||||
it('should not extract text in script and style by default', function () {
|
||||
|
||||
var root = parseHTML('<script>1</script><style>2</style>');
|
||||
var root = parseHTML('<script>1</script><style>2</style>');
|
||||
|
||||
root.firstChild.childNodes.should.be.empty;
|
||||
root.lastChild.childNodes.should.be.empty;
|
||||
root.firstChild.childNodes.should.be.empty;
|
||||
root.lastChild.childNodes.should.be.empty;
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
it('should extract text in script and style when ask so', function() {
|
||||
it('should extract text in script and style when ask so', function () {
|
||||
|
||||
var root = parseHTML('<script>1</script><style>2&</style>', {
|
||||
script: true,
|
||||
style: true
|
||||
});
|
||||
var root = parseHTML('<script>1</script><style>2&</style>', {
|
||||
script: true,
|
||||
style: true
|
||||
});
|
||||
|
||||
root.firstChild.childNodes.should.not.be.empty;
|
||||
root.firstChild.childNodes.should.eql([new TextNode('1')]);
|
||||
root.firstChild.text.should.eql('1');
|
||||
root.lastChild.childNodes.should.not.be.empty;
|
||||
root.lastChild.childNodes.should.eql([new TextNode('2&')]);
|
||||
root.lastChild.text.should.eql('2&');
|
||||
root.lastChild.rawText.should.eql('2&');
|
||||
});
|
||||
root.firstChild.childNodes.should.not.be.empty;
|
||||
root.firstChild.childNodes.should.eql([new TextNode('1')]);
|
||||
root.firstChild.text.should.eql('1');
|
||||
root.lastChild.childNodes.should.not.be.empty;
|
||||
root.lastChild.childNodes.should.eql([new TextNode('2&')]);
|
||||
root.lastChild.text.should.eql('2&');
|
||||
root.lastChild.rawText.should.eql('2&');
|
||||
});
|
||||
|
||||
it('should be able to parse "html/incomplete-script" file', function() {
|
||||
it('should be able to parse "html/incomplete-script" file', function () {
|
||||
|
||||
var root = parseHTML(fs.readFileSync(__dirname + '/html/incomplete-script').toString(), {
|
||||
script: true
|
||||
});
|
||||
var root = parseHTML(fs.readFileSync(__dirname + '/html/incomplete-script').toString(), {
|
||||
script: true
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
it('should parse "<div><a><img/></a><p></p></div>.." very fast', function() {
|
||||
it('should parse "<div><a><img/></a><p></p></div>.." very fast', function () {
|
||||
|
||||
for (var i = 0; i < 100; i++)
|
||||
parseHTML('<div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div>');
|
||||
for (var i = 0; i < 100; i++)
|
||||
parseHTML('<div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div><div><a><img/></a><p></p></div>');
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
it('should parse "<DIV><a><img/></A><p></P></div>.." fast', function() {
|
||||
it('should parse "<DIV><a><img/></A><p></P></div>.." fast', function () {
|
||||
|
||||
for (var i = 0; i < 100; i++)
|
||||
parseHTML('<DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div>', {
|
||||
lowerCaseTagName: true
|
||||
});
|
||||
for (var i = 0; i < 100; i++)
|
||||
parseHTML('<DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div><DIV><a><img/></A><p></P></div>', {
|
||||
lowerCaseTagName: true
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
describe('TextNode', function() {
|
||||
describe('TextNode', function () {
|
||||
|
||||
describe('#isWhitespace', function() {
|
||||
var node = new TextNode('');
|
||||
node.isWhitespace.should.be.ok;
|
||||
node = new TextNode(' \t');
|
||||
node.isWhitespace.should.be.ok;
|
||||
node = new TextNode(' \t \t');
|
||||
node.isWhitespace.should.be.ok;
|
||||
});
|
||||
describe('#isWhitespace', function () {
|
||||
var node = new TextNode('');
|
||||
node.isWhitespace.should.be.ok;
|
||||
node = new TextNode(' \t');
|
||||
node.isWhitespace.should.be.ok;
|
||||
node = new TextNode(' \t \t');
|
||||
node.isWhitespace.should.be.ok;
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
describe('HTMLElement', function() {
|
||||
describe('HTMLElement', function () {
|
||||
|
||||
describe('#removeWhitespace()', function() {
|
||||
describe('#removeWhitespace()', function () {
|
||||
|
||||
it('should remove whitespaces while preserving nodes with content', function() {
|
||||
it('should remove whitespaces while preserving nodes with content', function () {
|
||||
|
||||
var root = parseHTML('<p> \r \n \t <h5> 123 </h5></p>');
|
||||
var root = parseHTML('<p> \r \n \t <h5> 123 </h5></p>');
|
||||
|
||||
var p = new HTMLElement('p', {}, '');
|
||||
p.appendChild(new HTMLElement('h5', {}, ''))
|
||||
.appendChild(new TextNode('123'));
|
||||
var p = new HTMLElement('p', {}, '');
|
||||
p.appendChild(new HTMLElement('h5', {}, ''))
|
||||
.appendChild(new TextNode('123'));
|
||||
|
||||
root.firstChild.removeWhitespace().should.eql(p);
|
||||
root.firstChild.removeWhitespace().should.eql(p);
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
describe('#rawAttributes', function() {
|
||||
describe('#rawAttributes', function () {
|
||||
|
||||
it('should return escaped attributes of the element', function() {
|
||||
it('should return escaped attributes of the element', function () {
|
||||
|
||||
var root = parseHTML('<p a=12 data-id="!$$&" yAz=\'1\'></p>');
|
||||
var root = parseHTML('<p a=12 data-id="!$$&" yAz=\'1\'></p>');
|
||||
|
||||
root.firstChild.rawAttributes.should.eql({
|
||||
'a': '12',
|
||||
'data-id': '!$$&',
|
||||
'yAz': '1'
|
||||
});
|
||||
root.firstChild.rawAttributes.should.eql({
|
||||
'a': '12',
|
||||
'data-id': '!$$&',
|
||||
'yAz': '1'
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
describe('#attributes', function() {
|
||||
describe('#attributes', function () {
|
||||
|
||||
it('should return attributes of the element', function() {
|
||||
it('should return attributes of the element', function () {
|
||||
|
||||
var root = parseHTML('<p a=12 data-id="!$$&" yAz=\'1\'></p>');
|
||||
var root = parseHTML('<p a=12 data-id="!$$&" yAz=\'1\'></p>');
|
||||
|
||||
root.firstChild.attributes.should.eql({
|
||||
'a': '12',
|
||||
'data-id': '!$$&',
|
||||
'yAz': '1'
|
||||
});
|
||||
root.firstChild.attributes.should.eql({
|
||||
'a': '12',
|
||||
'data-id': '!$$&',
|
||||
'yAz': '1'
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
describe('#querySelectorAll()', function() {
|
||||
describe('#querySelectorAll()', function () {
|
||||
|
||||
it('should return correct elements in DOM tree', function() {
|
||||
it('should return correct elements in DOM tree', function () {
|
||||
|
||||
var root = parseHTML('<a id="id"><div><span class="a b"></span><span></span><span></span></div></a>');
|
||||
var root = parseHTML('<a id="id"><div><span class="a b"></span><span></span><span></span></div></a>');
|
||||
|
||||
root.querySelectorAll('#id').should.eql([root.firstChild]);
|
||||
root.querySelectorAll('span.a').should.eql([root.firstChild.firstChild.firstChild]);
|
||||
root.querySelectorAll('span.b').should.eql([root.firstChild.firstChild.firstChild]);
|
||||
root.querySelectorAll('span.a.b').should.eql([root.firstChild.firstChild.firstChild]);
|
||||
root.querySelectorAll('#id .b').should.eql([root.firstChild.firstChild.firstChild]);
|
||||
root.querySelectorAll('#id span').should.eql(root.firstChild.firstChild.childNodes);
|
||||
root.querySelectorAll('#id').should.eql([root.firstChild]);
|
||||
root.querySelectorAll('span.a').should.eql([root.firstChild.firstChild.firstChild]);
|
||||
root.querySelectorAll('span.b').should.eql([root.firstChild.firstChild.firstChild]);
|
||||
root.querySelectorAll('span.a.b').should.eql([root.firstChild.firstChild.firstChild]);
|
||||
root.querySelectorAll('#id .b').should.eql([root.firstChild.firstChild.firstChild]);
|
||||
root.querySelectorAll('#id span').should.eql(root.firstChild.firstChild.childNodes);
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
describe('#structuredText', function() {
|
||||
describe('#structuredText', function () {
|
||||
|
||||
it('should return correct structured text', function() {
|
||||
it('should return correct structured text', function () {
|
||||
|
||||
var root = parseHTML('<span>o<p>a</p><p>b</p>c</span>');
|
||||
root.structuredText.should.eql('o\na\nb\nc');
|
||||
var root = parseHTML('<span>o<p>a</p><p>b</p>c</span>');
|
||||
root.structuredText.should.eql('o\na\nb\nc');
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
describe('stringify', function () {
|
||||
describe('toString', function () {
|
||||
const html = '<p id="id" data-feidao-actions="ssss"><a class=\'cls\'>Hello</a><ul><li>aaaaa</li></ul><span>bbb</span></p>';
|
||||
const root = parseHTML(html);
|
||||
root.toString().should.eql(html)
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
78
tsconfig.json
Normal file
78
tsconfig.json
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
{
|
||||
"exclude": [
|
||||
"./dist/"
|
||||
],
|
||||
"include": [
|
||||
"./src/**/*.ts"
|
||||
],
|
||||
"compilerOptions": {
|
||||
"module": "commonjs",
|
||||
"target": "esnext",
|
||||
"noImplicitAny": true,
|
||||
"sourceMap": false,
|
||||
"emitDecoratorMetadata": true,
|
||||
"experimentalDecorators": true,
|
||||
"strictNullChecks": false,
|
||||
"noImplicitThis": true,
|
||||
"rootDir": "./src/",
|
||||
"rootDirs": [
|
||||
"./src/",
|
||||
"./tests/"
|
||||
],
|
||||
"allowJs": false,
|
||||
"allowUnreachableCode": false,
|
||||
"allowUnusedLabels": false,
|
||||
"alwaysStrict": true,
|
||||
"baseUrl": "",
|
||||
"charset": "utf8",
|
||||
"declaration": true,
|
||||
// "declarationDir": "./dts/",
|
||||
"inlineSourceMap": false,
|
||||
"allowSyntheticDefaultImports": false,
|
||||
"diagnostics": false,
|
||||
"emitBOM": false,
|
||||
"forceConsistentCasingInFileNames": false,
|
||||
"importHelpers": false,
|
||||
"inlineSources": false,
|
||||
"isolatedModules": false,
|
||||
"lib": [
|
||||
// "es6",
|
||||
"esnext"
|
||||
],
|
||||
"listFiles": true, // default false
|
||||
"listEmittedFiles": true, // default false
|
||||
"locale": "zh_CN",
|
||||
"newLine": "CRLF",
|
||||
"noEmit": false,
|
||||
"moduleResolution": "node",
|
||||
"noEmitHelpers": false,
|
||||
"noEmitOnError": false,
|
||||
"noImplicitReturns": false,
|
||||
"noImplicitUseStrict": false,
|
||||
"maxNodeModuleJsDepth": 0,
|
||||
"noLib": false,
|
||||
"outDir": "./dist",
|
||||
// "outFile": "./dist/tqf",
|
||||
"noFallthroughCasesInSwitch": false,
|
||||
"noResolve": false,
|
||||
"noUnusedLocals": false,
|
||||
"noUnusedParameters": false,
|
||||
"paths": {},
|
||||
"preserveConstEnums": false,
|
||||
"pretty": true,
|
||||
// "mapRoot": "",
|
||||
"removeComments": false,
|
||||
"skipDefaultLibCheck": true, // default false
|
||||
"skipLibCheck": true, // default false
|
||||
"stripInternal": false,
|
||||
"suppressExcessPropertyErrors": false,
|
||||
"suppressImplicitAnyIndexErrors": true, // default false
|
||||
"traceResolution": true, // default false
|
||||
"typeRoots": [
|
||||
],
|
||||
"types": [
|
||||
"node"
|
||||
],
|
||||
"watch": false
|
||||
}
|
||||
}
|
||||
Loading…
Reference in a new issue