mirror of
https://github.com/danbulant/node-html-parser
synced 2026-05-19 04:18:52 +00:00
feat: allow comment parsing
This commit is contained in:
parent
1083ffd59e
commit
1d03ef06ac
3 changed files with 90 additions and 3 deletions
|
|
@ -79,7 +79,8 @@ Parse given data, and return root of the generated DOM.
|
|||
lowerCaseTagName: false, // convert tag name to lower case (hurts performance heavily)
|
||||
script: false, // retrieve content in <script> (hurts performance slightly)
|
||||
style: false, // retrieve content in <style> (hurts performance slightly)
|
||||
pre: false // retrieve content in <pre> (hurts performance slightly)
|
||||
pre: false, // retrieve content in <pre> (hurts performance slightly)
|
||||
comment: false // retrieve comments (hurts performance slightly)
|
||||
}
|
||||
```
|
||||
|
||||
|
|
|
|||
36
src/index.ts
36
src/index.ts
|
|
@ -2,7 +2,8 @@ import { decode } from 'he';
|
|||
|
||||
export enum NodeType {
|
||||
ELEMENT_NODE = 1,
|
||||
TEXT_NODE = 3
|
||||
TEXT_NODE = 3,
|
||||
COMMENT_NODE = 8
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -52,6 +53,31 @@ export class TextNode extends Node {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * DOM node representing an HTML comment (`<!-- ... -->`).
 *
 * `rawText` holds the characters found between the `<!--` and `-->`
 * delimiters; the delimiters themselves are not stored.
 */
export class CommentNode extends Node {
	/**
	 * @param value Comment content without the surrounding `<!--` / `-->`.
	 */
	constructor(value: string) {
		super();
		this.rawText = value;
	}

	/**
	 * Node type declaration (`NodeType.COMMENT_NODE`).
	 */
	nodeType = NodeType.COMMENT_NODE;

	/**
	 * Text content of the comment.
	 *
	 * Character references are not interpreted inside HTML comments, so the
	 * stored text is returned verbatim instead of being entity-decoded
	 * (`<!-- &amp; -->` yields ` &amp; `, not ` & `).
	 * @returns the comment content
	 */
	get text() {
		return this.rawText;
	}

	/**
	 * Serialize the node back to markup by re-wrapping the stored content in
	 * the comment delimiters.
	 * @returns the comment as an HTML string
	 */
	toString() {
		return `<!--${this.rawText}-->`;
	}
}
|
||||
|
||||
const kBlockElements = {
|
||||
div: true,
|
||||
p: true,
|
||||
|
|
@ -193,7 +219,7 @@ export class HTMLElement extends Node {
|
|||
currentBlock.push(text);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dfs(this);
|
||||
return blocks
|
||||
.map(function (block) {
|
||||
|
|
@ -754,6 +780,7 @@ export function parse(data: string, options?: {
|
|||
script?: boolean;
|
||||
style?: boolean;
|
||||
pre?: boolean;
|
||||
comment?: boolean;
|
||||
}) {
|
||||
const root = new HTMLElement(null, {});
|
||||
let currentParent = root;
|
||||
|
|
@ -772,6 +799,11 @@ export function parse(data: string, options?: {
|
|||
lastTextPos = kMarkupPattern.lastIndex;
|
||||
if (match[0][1] == '!') {
|
||||
// this is a comment
|
||||
if (options.comment) {
|
||||
// Only keep what is in between <!-- and -->
|
||||
const text = data.substring(lastTextPos - 3 , lastTextPos - match[0].length + 4);
|
||||
currentParent.appendChild(new CommentNode(text));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (options.lowerCaseTagName)
|
||||
|
|
|
|||
54
test/html.js
54
test/html.js
|
|
@ -9,6 +9,7 @@ describe('HTML Parser', function () {
|
|||
var Matcher = HTMLParser.Matcher;
|
||||
var HTMLElement = HTMLParser.HTMLElement;
|
||||
var TextNode = HTMLParser.TextNode;
|
||||
var CommentNode = HTMLParser.CommentNode;
|
||||
|
||||
describe('Matcher', function () {
|
||||
it('should match corrent elements', function () {
|
||||
|
|
@ -97,6 +98,34 @@ describe('HTML Parser', function () {
|
|||
|
||||
});
|
||||
|
||||
it('should parse "<div><a><!-- my comment --></a></div>" and return root element without comments', function () {
	// Expected tree contains no comment node: the `comment` option is not passed.
	const expectedDiv = new HTMLElement('div', {}, '');
	expectedDiv.appendChild(new HTMLElement('a', {}, ''));

	const parsed = parseHTML('<div><a><!-- my comment --></a></div>');

	parsed.firstChild.should.eql(expectedDiv);
});
|
||||
|
||||
it('should parse "<div><a><!-- my comment --></a></div>" and return root element with comments', function () {
	// Expected tree: <div> -> <a> -> comment node holding the text between the delimiters.
	const expectedDiv = new HTMLElement('div', {}, '');
	const expectedAnchor = expectedDiv.appendChild(new HTMLElement('a', {}, ''));
	expectedAnchor.appendChild(new CommentNode(' my comment '));

	const parsed = parseHTML('<div><a><!-- my comment --></a></div>', { comment: true });

	parsed.firstChild.should.eql(expectedDiv);
});
|
||||
|
||||
it('should not parse HTML inside comments', function () {
	// Markup inside the comment must stay as raw comment text, not become child elements.
	const expectedDiv = new HTMLElement('div', {}, '');
	expectedDiv.appendChild(new CommentNode('<a></a>'));

	const parsed = parseHTML('<div><!--<a></a>--></div>', { comment: true });

	parsed.firstChild.should.eql(expectedDiv);
});
|
||||
|
||||
it('should parse picture element', function () {
|
||||
|
||||
var root = parseHTML('<picture><source srcset="/images/example-1.jpg 1200w, /images/example-2.jpg 1600w" sizes="100vw"><img src="/images/example.jpg" alt="Example"/></picture>');
|
||||
|
|
@ -319,6 +348,11 @@ describe('HTML Parser', function () {
|
|||
var root = parseHTML('<span>o<p>a</p><p>b</p>c</span>');
|
||||
root.structuredText.should.eql('o\na\nb\nc');
|
||||
});
|
||||
|
||||
it('should not return comments in structured text', function () {
	// Comments are parsed (comment: true) but must not show up in structuredText.
	const markup = '<span>o<p>a</p><!-- my comment --></span>';
	const structured = parseHTML(markup, { comment: true }).structuredText;

	structured.should.eql('o\na');
});
|
||||
});
|
||||
describe('#set_content', function () {
|
||||
it('set content string', function () {
|
||||
|
|
@ -350,6 +384,26 @@ describe('HTML Parser', function () {
|
|||
const root = parseHTML(html);
|
||||
root.toString().should.eql(html)
|
||||
});
|
||||
|
||||
it('#toString() should not return comments by default', function () {
	// Without the `comment` option the comment is absent from the serialized output.
	const serialized = parseHTML('<p><!-- my comment --></p>').toString();

	serialized.should.eql('<p></p>');
});
|
||||
|
||||
it('#toString() should return comments when specified', function () {
	// Round trip: with `comment: true`, serializing must reproduce the input,
	// including the empty comment `<!---->`.
	const markup = '<!----><p><!-- my comment --></p>';
	const serialized = parseHTML(markup, { comment: true }).toString();

	serialized.should.eql(markup);
});
|
||||
});
|
||||
|
||||
describe('Comment Element', function () {
	it('comment nodeType should be 8', function () {
		// A top-level comment becomes the first child of the parsed root.
		const firstNode = parseHTML('<!-- my comment -->', { comment: true }).firstChild;

		firstNode.nodeType.should.eql(8);
	});
});
|
||||
|
||||
describe('Custom Element', function () {
|
||||
|
|
|
|||
Loading…
Reference in a new issue