commit e401c8aadf7fe2b0a03852067e86027e84ab0ef9 Author: Xiaoyi Date: Fri Jul 11 18:21:41 2014 +0800 Initialize diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..815f58d --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +lib-cov +*.seed +*.log +*.csv +*.dat +*.out +*.pid +*.gz + +pids +logs +results + +npm-debug.log +node_modules +bower_components + +.* +!.gitignore + +*.sublime-* diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..4a83e22 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,5 @@ +language: node_js +node_js: + - "0.11" + - "0.10" + - "0.8" diff --git a/README.md b/README.md new file mode 100644 index 0000000..66d3da5 --- /dev/null +++ b/README.md @@ -0,0 +1,132 @@ +# Fast HTML Parser + +Fast HTML Parser is a _very fast_ HTML parser that generates a simplified +DOM tree with basic element query support. + +By design, it is meant to parse massive HTML files at the lowest possible cost, so +performance is the top priority. For this reason, some malformed HTML may not +parse correctly, but the most common errors are covered (e.g. HTML4-style unclosed `
  • `, `` etc). + +## Install + +```shell +npm install --save fast-html-parser +``` + +## Performance + +Faster than htmlparser2! + +```shell +fast-html-parser: 2.18409 ms/file ± 1.37431 +high5 : 4.55435 ms/file ± 2.51132 +htmlparser : 27.6920 ms/file ± 171.588 +htmlparser2-dom : 6.22320 ms/file ± 3.48772 +htmlparser2 : 3.58360 ms/file ± 2.23658 +hubbub : 16.1774 ms/file ± 8.95079 +libxmljs : 7.19406 ms/file ± 7.04495 +parse5 : 10.7590 ms/file ± 8.09687 +``` + +Tested with [htmlparser-benchmark](https://github.com/AndreasMadsen/htmlparser-benchmark). + +## Usage + +```js +var HTMLParser = require('fast-html-parser'); + +var root = HTMLParser.parse(''); + +console.log(root.firstChild.structure); +// ul#list +// li +// #text + +console.log(root.querySelector('#list')); +// { tagName: 'ul', +// rawAttrs: 'id="list"', +// childNodes: +// [ { tagName: 'li', +// rawAttrs: '', +// childNodes: [Object], +// classNames: [] } ], +// id: 'list', +// classNames: [] } +``` + +## API + +### parse(data[, options]) + +Parse given data, and return root of the generated DOM. + +- **data**, data to parse +- **options**, parse options + + ```js + { + lowerCaseTagName: false, // convert tag name to lower case (hurt performance heavily) + script: false, // retrieve content in or ... + var closeMarkup = ''; + var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex); + if (options[match[2]]) { + if (index == -1) { + // there is no matching ending for the text element. + text = data.substr(kMarkupPattern.lastIndex); + } else { + text = data.substring(kMarkupPattern.lastIndex, index); + } + if (text.length > 0) + currentParent.appendChild(new TextNode(text)); + } + if (index == -1) { + lastTextPos = kMarkupPattern.lastIndex = data.length + 1; + } else { + lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length; + match[1] = true; + } + } + } + if (match[1] || match[4] || + kSelfClosingElements[match[2]]) { + // or
    etc. + while (true) { + if (currentParent.tagName == match[2]) { + stack.pop(); + currentParent = stack.back; + break; + } else { + // Trying to close current tag, and move on + if (kElementsClosedByClosing[currentParent.tagName]) { + if (kElementsClosedByClosing[currentParent.tagName][match[2]]) { + stack.pop(); + currentParent = stack.back; + continue; + } + } + // Use aggressive strategy to handle unmatching markups. + break; + } + } + } + } + + return root; + + } + +}; diff --git a/package.json b/package.json new file mode 100644 index 0000000..778bf21 --- /dev/null +++ b/package.json @@ -0,0 +1,19 @@ +{ + "name": "fast-html-parser", + "version": "1.0.0", + "description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.", + "main": "index.js", + "scripts": { + "test": "mocha" + }, + "author": "Xiaoyi Shi ", + "license": "MIT", + "dependencies": { + "apollojs": "^1.3.0", + "entities": "^1.1.1" + }, + "devDependencies": { + "mocha": "*", + "should": "*" + } +} diff --git a/test/html.js b/test/html.js new file mode 100644 index 0000000..1bf1a69 --- /dev/null +++ b/test/html.js @@ -0,0 +1,253 @@ +var should = require('should'); +var fs = require('fs'); +var util = require('util'); + +var HTMLParser = require('../'); + +describe('HTML Parser', function() { + + var Matcher = HTMLParser.Matcher; + var HTMLElement = HTMLParser.HTMLElement; + var TextNode = HTMLParser.TextNode; + + describe('Matcher', function() { + + it('should match corrent elements', function() { + + var matcher = new Matcher('#id .a a.b *.a.b .a.b * a'); + var MatchesNothingButStarEl = new HTMLElement('_', {}); + var withIdEl = new HTMLElement('p', { id: 'id' }); + var withClassNameEl = new HTMLElement('a', { class: 'a b' }); + + // console.log(util.inspect([withIdEl, withClassNameEl], { + // showHidden: true, + // depth: null + // })); + + matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // #id + 
matcher.advance(withClassNameEl).should.not.be.ok; // #id + matcher.advance(withIdEl).should.be.ok; // #id + + matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a + matcher.advance(withIdEl).should.not.be.ok; // .a + matcher.advance(withClassNameEl).should.be.ok; // .a + + matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a.b + matcher.advance(withIdEl).should.not.be.ok; // a.b + matcher.advance(withClassNameEl).should.be.ok; // a.b + + matcher.advance(withIdEl).should.not.be.ok; // *.a.b + matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // *.a.b + matcher.advance(withClassNameEl).should.be.ok; // *.a.b + + matcher.advance(withIdEl).should.not.be.ok; // .a.b + matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // .a.b + matcher.advance(withClassNameEl).should.be.ok; // .a.b + + matcher.advance(withIdEl).should.be.ok; // * + matcher.rewind(); + matcher.advance(MatchesNothingButStarEl).should.be.ok; // * + matcher.rewind(); + matcher.advance(withClassNameEl).should.be.ok; // * + + matcher.advance(withIdEl).should.not.be.ok; // a + matcher.advance(MatchesNothingButStarEl).should.not.be.ok; // a + matcher.advance(withClassNameEl).should.be.ok; // a + + matcher.matched.should.be.ok; + + }); + + }); + + var parseHTML = HTMLParser.parse; + + describe('parse', function() { + + it('should parse "

    Hello

    " and return root element', function() { + + var root = parseHTML('

    Hello

    '); + + var p = new HTMLElement('p', { id: 'id' }, 'id="id"'); + p.appendChild(new HTMLElement('a', { class: 'cls' }, 'class=\'cls\'')) + .appendChild(new TextNode('Hello')); + var ul = p.appendChild(new HTMLElement('ul', {}, '')); + ul.appendChild(new HTMLElement('li', {}, '')); + ul.appendChild(new HTMLElement('li', {}, '')); + p.appendChild(new HTMLElement('span', {}, '')); + + root.firstChild.should.eql(p); + + }); + + it('should parse "

    " and return root element', function() { + + var root = parseHTML('

    ', { + lowerCaseTagName: true + }); + + var div = new HTMLElement('div', {}, ''); + var a = div.appendChild(new HTMLElement('a', {}, '')); + var img = a.appendChild(new HTMLElement('img', {}, '')); + var p = div.appendChild(new HTMLElement('p', {}, '')); + + root.firstChild.should.eql(div); + + }); + + it('should parse "

    " and return root element', function() { + + var root = parseHTML('

    '); + + var div = new HTMLElement('div', {}, ''); + var a = div.appendChild(new HTMLElement('a', {}, '')); + var img = a.appendChild(new HTMLElement('img', {}, '')); + var p = div.appendChild(new HTMLElement('p', {}, '')); + + root.firstChild.should.eql(div); + + }); + + it('should not extract text in script and style by default', function() { + + var root = parseHTML(''); + + root.firstChild.childNodes.should.be.empty; + root.lastChild.childNodes.should.be.empty; + + }); + + it('should extract text in script and style when ask so', function() { + + var root = parseHTML('', { + script: true, + style: true + }); + + root.firstChild.childNodes.should.not.be.empty; + root.firstChild.childNodes.should.eql([new TextNode('1')]); + root.lastChild.childNodes.should.not.be.empty; + root.lastChild.childNodes.should.eql([new TextNode('2')]); + + }); + + it('should be able to parse "html/incomplete-script" file', function() { + + var root = parseHTML(fs.readFileSync(__dirname + '/html/incomplete-script').toString(), { + script: true + }); + + }); + + it('should parse "

    .." very fast', function() { + + for (var i = 0; i < 100; i++) + parseHTML('

    '); + + }); + + it('should parse "

    .." fast', function() { + + for (var i = 0; i < 100; i++) + parseHTML('

    ', { + lowerCaseTagName: true + }); + + }); + + }); + + describe('TextNode', function() { + + describe('isWhitespace', function() { + var node = new TextNode(''); + node.isWhitespace.should.be.ok; + node = new TextNode(' \t'); + node.isWhitespace.should.be.ok; + node = new TextNode(' \t  \t'); + node.isWhitespace.should.be.ok; + }); + + }); + + describe('HTMLElement', function() { + + describe('removeWhitespace', function() { + + it('should remove whitespaces while preserving nodes with content', function() { + + var root = parseHTML('

    \r \n \t

    123

    '); + + var p = new HTMLElement('p', {}, ''); + p.appendChild(new HTMLElement('h5', {}, '')) + .appendChild(new TextNode('123')); + + root.firstChild.removeWhitespace().should.eql(p); + + }); + + }); + + describe('rawAttributes', function() { + + it('should return escaped attributes of the element', function() { + + var root = parseHTML('

    '); + + root.firstChild.rawAttributes.should.eql({ + 'a': '12', + 'data-id': '!$$&', + 'yAz': '1' + }); + + }); + + }); + + describe('attributes', function() { + + it('should return attributes of the element', function() { + + var root = parseHTML('

    '); + + root.firstChild.attributes.should.eql({ + 'a': '12', + 'data-id': '!$$&', + 'yAz': '1' + }); + + }); + + }); + + describe('querySelectorAll', function() { + + it('should return correct elements in DOM tree', function() { + + var root = parseHTML('
    '); + + root.querySelectorAll('#id').should.eql([root.firstChild]); + root.querySelectorAll('span.a').should.eql([root.firstChild.firstChild.firstChild]); + root.querySelectorAll('span.b').should.eql([root.firstChild.firstChild.firstChild]); + root.querySelectorAll('span.a.b').should.eql([root.firstChild.firstChild.firstChild]); + root.querySelectorAll('#id .b').should.eql([root.firstChild.firstChild.firstChild]); + root.querySelectorAll('#id span').should.eql(root.firstChild.firstChild.childNodes); + + }); + + }); + + describe('structuredText', function() { + + it('should return correct structured text', function() { + + var root = parseHTML('o

    a

    b

    c
    '); + root.structuredText.should.eql('o\na\nb\nc'); + + }); + + }); + + }); + +}); diff --git a/test/html/incomplete-script b/test/html/incomplete-script new file mode 100644 index 0000000..26be52d --- /dev/null +++ b/test/html/incomplete-script @@ -0,0 +1,598 @@ + + + + + + + + + + + +Designer Backpacks for women | Leather & Textile Backpacks | SSENSE + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + + +
    +
    + +
    + + + +
    + +
    +
    +
    + +
    + + +