mirror of
https://github.com/danbulant/node-html-parser
synced 2026-05-24 12:35:10 +00:00
Merge pull request #13 from blex41/parse-comments
feat: add support for comment parsing
This commit is contained in:
commit
c145442179
3 changed files with 90 additions and 3 deletions
|
|
@ -79,7 +79,8 @@ Parse given data, and return root of the generated DOM.
|
||||||
lowerCaseTagName: false, // convert tag name to lower case (hurt performance heavily)
|
lowerCaseTagName: false, // convert tag name to lower case (hurt performance heavily)
|
||||||
script: false, // retrieve content in <script> (hurt performance slightly)
|
script: false, // retrieve content in <script> (hurt performance slightly)
|
||||||
style: false, // retrieve content in <style> (hurt performance slightly)
|
style: false, // retrieve content in <style> (hurt performance slightly)
|
||||||
pre: false // retrieve content in <pre> (hurt performance slightly)
|
pre: false, // retrieve content in <pre> (hurt performance slightly)
|
||||||
|
comment: false // retrieve comments (hurt performance slightly)
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
||||||
34
src/index.ts
34
src/index.ts
|
|
@ -2,7 +2,8 @@ import { decode } from 'he';
|
||||||
|
|
||||||
export enum NodeType {
|
export enum NodeType {
|
||||||
ELEMENT_NODE = 1,
|
ELEMENT_NODE = 1,
|
||||||
TEXT_NODE = 3
|
TEXT_NODE = 3,
|
||||||
|
COMMENT_NODE = 8
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -52,6 +53,31 @@ export class TextNode extends Node {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Node representing an HTML comment (`<!-- ... -->`).
 *
 * `rawText` holds the comment content without the surrounding `<!--` / `-->`
 * delimiters; `toString()` adds them back when serialising.
 */
export class CommentNode extends Node {
  /**
   * @param value Comment content, excluding the `<!--` and `-->` delimiters.
   */
  constructor(value: string) {
    super();
    this.rawText = value;
  }

  /**
   * Node Type declaration.
   * @type {Number}
   */
  nodeType = NodeType.COMMENT_NODE;

  /**
   * Get the text content of the comment.
   *
   * The content is returned verbatim. HTML does not recognise character
   * references inside comments, so passing it through an entity decoder
   * would alter the content (e.g. `&lt;b&gt;` would become `<b>`).
   * @return {string} comment content
   */
  get text() {
    return this.rawText;
  }

  /**
   * Serialise the comment back to markup, restoring the delimiters.
   * @return {string} `<!--` + content + `-->`
   */
  toString() {
    return `<!--${this.rawText}-->`;
  }
}
|
||||||
|
|
||||||
const kBlockElements = {
|
const kBlockElements = {
|
||||||
div: true,
|
div: true,
|
||||||
p: true,
|
p: true,
|
||||||
|
|
@ -754,6 +780,7 @@ export function parse(data: string, options?: {
|
||||||
script?: boolean;
|
script?: boolean;
|
||||||
style?: boolean;
|
style?: boolean;
|
||||||
pre?: boolean;
|
pre?: boolean;
|
||||||
|
comment?: boolean;
|
||||||
}) {
|
}) {
|
||||||
const root = new HTMLElement(null, {});
|
const root = new HTMLElement(null, {});
|
||||||
let currentParent = root;
|
let currentParent = root;
|
||||||
|
|
@ -772,6 +799,11 @@ export function parse(data: string, options?: {
|
||||||
lastTextPos = kMarkupPattern.lastIndex;
|
lastTextPos = kMarkupPattern.lastIndex;
|
||||||
if (match[0][1] == '!') {
|
if (match[0][1] == '!') {
|
||||||
// this is a comment
|
// this is a comment
|
||||||
|
if (options.comment) {
|
||||||
|
// Only keep what is in between <!-- and -->
|
||||||
|
const text = data.substring(lastTextPos - 3 , lastTextPos - match[0].length + 4);
|
||||||
|
currentParent.appendChild(new CommentNode(text));
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (options.lowerCaseTagName)
|
if (options.lowerCaseTagName)
|
||||||
|
|
|
||||||
54
test/html.js
54
test/html.js
|
|
@ -9,6 +9,7 @@ describe('HTML Parser', function () {
|
||||||
var Matcher = HTMLParser.Matcher;
|
var Matcher = HTMLParser.Matcher;
|
||||||
var HTMLElement = HTMLParser.HTMLElement;
|
var HTMLElement = HTMLParser.HTMLElement;
|
||||||
var TextNode = HTMLParser.TextNode;
|
var TextNode = HTMLParser.TextNode;
|
||||||
|
var CommentNode = HTMLParser.CommentNode;
|
||||||
|
|
||||||
describe('Matcher', function () {
|
describe('Matcher', function () {
|
||||||
it('should match corrent elements', function () {
|
it('should match corrent elements', function () {
|
||||||
|
|
@ -97,6 +98,34 @@ describe('HTML Parser', function () {
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('should parse "<div><a><!-- my comment --></a></div>" and return root element without comments', function () {
|
||||||
|
var root = parseHTML('<div><a><!-- my comment --></a></div>');
|
||||||
|
|
||||||
|
var div = new HTMLElement('div', {}, '');
|
||||||
|
var a = div.appendChild(new HTMLElement('a', {}, ''));
|
||||||
|
|
||||||
|
root.firstChild.should.eql(div);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should parse "<div><a><!-- my comment --></a></div>" and return root element with comments', function () {
|
||||||
|
var root = parseHTML('<div><a><!-- my comment --></a></div>', { comment: true });
|
||||||
|
|
||||||
|
var div = new HTMLElement('div', {}, '');
|
||||||
|
var a = div.appendChild(new HTMLElement('a', {}, ''));
|
||||||
|
var comment = a.appendChild(new CommentNode(' my comment '));
|
||||||
|
|
||||||
|
root.firstChild.should.eql(div);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should not parse HTML inside comments', function () {
|
||||||
|
var root = parseHTML('<div><!--<a></a>--></div>', { comment: true });
|
||||||
|
|
||||||
|
var div = new HTMLElement('div', {}, '');
|
||||||
|
var comment = div.appendChild(new CommentNode('<a></a>'));
|
||||||
|
|
||||||
|
root.firstChild.should.eql(div);
|
||||||
|
});
|
||||||
|
|
||||||
it('should parse picture element', function () {
|
it('should parse picture element', function () {
|
||||||
|
|
||||||
var root = parseHTML('<picture><source srcset="/images/example-1.jpg 1200w, /images/example-2.jpg 1600w" sizes="100vw"><img src="/images/example.jpg" alt="Example"/></picture>');
|
var root = parseHTML('<picture><source srcset="/images/example-1.jpg 1200w, /images/example-2.jpg 1600w" sizes="100vw"><img src="/images/example.jpg" alt="Example"/></picture>');
|
||||||
|
|
@ -319,6 +348,11 @@ describe('HTML Parser', function () {
|
||||||
var root = parseHTML('<span>o<p>a</p><p>b</p>c</span>');
|
var root = parseHTML('<span>o<p>a</p><p>b</p>c</span>');
|
||||||
root.structuredText.should.eql('o\na\nb\nc');
|
root.structuredText.should.eql('o\na\nb\nc');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('should not return comments in structured text', function () {
|
||||||
|
var root = parseHTML('<span>o<p>a</p><!-- my comment --></span>', { comment: true });
|
||||||
|
root.structuredText.should.eql('o\na');
|
||||||
|
});
|
||||||
});
|
});
|
||||||
describe('#set_content', function () {
|
describe('#set_content', function () {
|
||||||
it('set content string', function () {
|
it('set content string', function () {
|
||||||
|
|
@ -350,6 +384,26 @@ describe('HTML Parser', function () {
|
||||||
const root = parseHTML(html);
|
const root = parseHTML(html);
|
||||||
root.toString().should.eql(html)
|
root.toString().should.eql(html)
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('#toString() should not return comments by default', function () {
|
||||||
|
const html = '<p><!-- my comment --></p>';
|
||||||
|
const result = '<p></p>';
|
||||||
|
const root = parseHTML(html);
|
||||||
|
root.toString().should.eql(result);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('#toString() should return comments when specified', function () {
|
||||||
|
const html = '<!----><p><!-- my comment --></p>';
|
||||||
|
const root = parseHTML(html, { comment: true });
|
||||||
|
root.toString().should.eql(html);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Comment Element', function () {
|
||||||
|
it('comment nodeType should be 8', function () {
|
||||||
|
var root = parseHTML('<!-- my comment -->', { comment: true });
|
||||||
|
root.firstChild.nodeType.should.eql(8);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('Custom Element', function () {
|
describe('Custom Element', function () {
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue