Merge pull request #13 from blex41/parse-comments

feat: add support for comment parsing
This commit is contained in:
taoqf 2020-01-14 12:14:52 +08:00 committed by GitHub
commit c145442179
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 90 additions and 3 deletions

View file

@ -79,7 +79,8 @@ Parse given data, and return root of the generated DOM.
lowerCaseTagName: false, // convert tag name to lower case (hurt performance heavily)
script: false, // retrieve content in <script> (hurt performance slightly)
style: false, // retrieve content in <style> (hurt performance slightly)
pre: false // retrieve content in <pre> (hurt performance slightly)
pre: false, // retrieve content in <pre> (hurt performance slightly)
comment: false // retrieve comments (hurt performance slightly)
}
```

View file

@ -2,7 +2,8 @@ import { decode } from 'he';
export enum NodeType {
ELEMENT_NODE = 1,
TEXT_NODE = 3
TEXT_NODE = 3,
COMMENT_NODE = 8
}
/**
@ -52,6 +53,31 @@ export class TextNode extends Node {
}
}
/**
 * DOM node representing an HTML comment (`<!-- ... -->`).
 */
export class CommentNode extends Node {
	/**
	 * Node type marker; always `NodeType.COMMENT_NODE`.
	 */
	nodeType = NodeType.COMMENT_NODE;

	/**
	 * @param value text found between the comment delimiters; stored unmodified as `rawText`
	 */
	constructor(value: string) {
		super();
		this.rawText = value;
	}

	/**
	 * The comment's raw text passed through `decode` (imported from `he`).
	 * @returns the decoded comment text
	 */
	get text() {
		const decoded = decode(this.rawText);
		return decoded;
	}

	/**
	 * Serializes the node back to markup by wrapping the raw text in comment delimiters.
	 */
	toString() {
		return '<!--' + this.rawText + '-->';
	}
}
const kBlockElements = {
div: true,
p: true,
@ -193,7 +219,7 @@ export class HTMLElement extends Node {
currentBlock.push(text);
}
}
}
}
dfs(this);
return blocks
.map(function (block) {
@ -754,6 +780,7 @@ export function parse(data: string, options?: {
script?: boolean;
style?: boolean;
pre?: boolean;
comment?: boolean;
}) {
const root = new HTMLElement(null, {});
let currentParent = root;
@ -772,6 +799,11 @@ export function parse(data: string, options?: {
lastTextPos = kMarkupPattern.lastIndex;
if (match[0][1] == '!') {
// this is a comment
if (options.comment) {
// Only keep what is in between <!-- and -->
const text = data.substring(lastTextPos - 3 , lastTextPos - match[0].length + 4);
currentParent.appendChild(new CommentNode(text));
}
continue;
}
if (options.lowerCaseTagName)

View file

@ -9,6 +9,7 @@ describe('HTML Parser', function () {
var Matcher = HTMLParser.Matcher;
var HTMLElement = HTMLParser.HTMLElement;
var TextNode = HTMLParser.TextNode;
var CommentNode = HTMLParser.CommentNode;
describe('Matcher', function () {
it('should match corrent elements', function () {
@ -97,6 +98,34 @@ describe('HTML Parser', function () {
});
// With the `comment` option left off, the comment markup is expected to be absent from the parsed tree.
it('should parse "<div><a><!-- my comment --></a></div>" and return root element without comments', function () {
	var root = parseHTML('<div><a><!-- my comment --></a></div>');

	// Expected tree: <div><a></a></div>
	var div = new HTMLElement('div', {}, '');
	div.appendChild(new HTMLElement('a', {}, ''));

	root.firstChild.should.eql(div);
});
// With `comment: true`, the comment is expected to show up as a CommentNode child of <a>.
it('should parse "<div><a><!-- my comment --></a></div>" and return root element with comments', function () {
	var root = parseHTML('<div><a><!-- my comment --></a></div>', { comment: true });

	// Expected tree: <div><a><!-- my comment --></a></div>
	var div = new HTMLElement('div', {}, '');
	var a = div.appendChild(new HTMLElement('a', {}, ''));
	// The expected comment text keeps the surrounding spaces.
	a.appendChild(new CommentNode(' my comment '));

	root.firstChild.should.eql(div);
});
// Markup inside a comment is expected to be kept as plain comment text, not parsed into elements.
it('should not parse HTML inside comments', function () {
	var root = parseHTML('<div><!--<a></a>--></div>', { comment: true });

	// Expected tree: <div> holding a single comment whose text is the literal "<a></a>".
	var div = new HTMLElement('div', {}, '');
	div.appendChild(new CommentNode('<a></a>'));

	root.firstChild.should.eql(div);
});
it('should parse picture element', function () {
var root = parseHTML('<picture><source srcset="/images/example-1.jpg 1200w, /images/example-2.jpg 1600w" sizes="100vw"><img src="/images/example.jpg" alt="Example"/></picture>');
@ -319,6 +348,11 @@ describe('HTML Parser', function () {
var root = parseHTML('<span>o<p>a</p><p>b</p>c</span>');
root.structuredText.should.eql('o\na\nb\nc');
});
it('should not return comments in structured text', function () {
	// Comments are parsed here (comment: true) but are expected to be left out of structuredText.
	var parsed = parseHTML('<span>o<p>a</p><!-- my comment --></span>', { comment: true });
	var structured = parsed.structuredText;
	structured.should.eql('o\na');
});
});
describe('#set_content', function () {
it('set content string', function () {
@ -350,6 +384,26 @@ describe('HTML Parser', function () {
const root = parseHTML(html);
root.toString().should.eql(html)
});
it('#toString() should not return comments by default', function () {
	// No options are passed, so the comment is expected to be missing from the serialized output.
	const serialized = parseHTML('<p><!-- my comment --></p>').toString();
	serialized.should.eql('<p></p>');
});
it('#toString() should return comments when specified', function () {
	// Round trip: the input holds an empty comment (<!---->) and a comment nested in <p>.
	const markup = '<!----><p><!-- my comment --></p>';
	const serialized = parseHTML(markup, { comment: true }).toString();
	serialized.should.eql(markup);
});
});
describe('Comment Element', function () {
	it('comment nodeType should be 8', function () {
		// The only top-level node of the input is the comment itself.
		var commentNode = parseHTML('<!-- my comment -->', { comment: true }).firstChild;
		commentNode.nodeType.should.eql(8);
	});
});
describe('Custom Element', function () {