import { decode } from 'he';

/**
 * Numeric node kinds used to tell elements and text nodes apart.
 */
export enum NodeType {
  ELEMENT_NODE = 1,
  TEXT_NODE = 3
}

/**
 * Base class for TextNode and HTMLElement.
 *
 * `nodeType`, `text` and `rawText` are abstract because each subclass
 * supplies them differently (plain field vs. computed accessor).
 */
export abstract class Node {
  abstract nodeType: NodeType;
  childNodes = [] as Node[];
  abstract text: string;
  abstract rawText: string;
  abstract toString(): string;
}

/**
 * TextNode holds one run of text in the tree.
 */
export class TextNode extends Node {
  /**
   * Node type declaration.
   */
  nodeType = NodeType.TEXT_NODE;

  /**
   * Text exactly as it appeared in the source (entities not decoded).
   */
  rawText: string;

  /**
   * @param value raw (still escaped) text of the node
   */
  constructor(value: string) {
    super();
    this.rawText = value;
  }

  /**
   * Get the unescaped text value of this node.
   * @returns text with HTML entities decoded
   */
  get text() {
    return decode(this.rawText);
  }

  /**
   * Detect whether the node contains only white space.
   *
   * `rawText` is not entity-decoded, so a non-breaking space written as the
   * `&nbsp;` entity has to be matched literally in addition to `\s`.
   * @returns true when the raw text is empty or whitespace only
   */
  get isWhitespace() {
    return /^(\s|&nbsp;)*$/.test(this.rawText);
  }

  toString() {
    return this.text;
  }
}

/**
 * Tags that start and end a line in `HTMLElement.structuredText`.
 */
const kBlockElements: { [tag: string]: boolean } = {
  div: true,
  p: true,
  // ul: true,
  // ol: true,
  li: true,
  // table: true,
  // tr: true,
  td: true,
  section: true,
  br: true
};

export interface KeyAttributes {
  id?: string;
  class?: string;
}

export interface Attributes {
  [key: string]: string;
}

export interface RawAttributes {
  [key: string]: string;
}

/**
 * Return the last element of an array (undefined for an empty array).
 */
function arr_back<T>(arr: T[]) {
  return arr[arr.length - 1];
}

/**
 * HTMLElement, which contains a set of children.
 *
 * Note: this is a minimalist implementation, no complete tree
 * structure is provided (no nextSibling, previousSibling etc).
 */
export class HTMLElement extends Node {
  private _attrs: Attributes;
  private _rawAttrs: RawAttributes;
  public id: string;
  public classNames = [] as string[];

  /**
   * Node type declaration.
   */
  public nodeType = NodeType.ELEMENT_NODE;

  /**
   * Creates an instance of HTMLElement.
   * @param tagName tag name of the element (the parser's root uses null)
   * @param keyAttrs id and class attribute
   * @param rawAttrs attributes in string form, as written in the source
   * @param parentNode parent of this element, if any
   */
  constructor(public tagName: string, keyAttrs: KeyAttributes, private rawAttrs = '', public parentNode = null as Node) {
    super();
    // Normalise explicit null arguments, which parameter defaults do not cover.
    this.rawAttrs = rawAttrs || '';
    this.parentNode = parentNode || null;
    this.childNodes = [];
    if (keyAttrs.id) {
      this.id = keyAttrs.id;
    }
    if (keyAttrs.class) {
      this.classNames = keyAttrs.class.split(/\s+/);
    }
  }

  /**
   * Remove a child from the childNodes array.
   * @param node node to remove
   */
  public removeChild(node: Node) {
    this.childNodes = this.childNodes.filter((child) => child !== node);
  }

  /**
   * Exchange the given child with a new child.
   * Does nothing when `oldNode` is not a child of this element.
   * @param oldNode node to exchange
   * @param newNode new node
   */
  public exchangeChild(oldNode: Node, newNode: Node) {
    const idx = this.childNodes.indexOf(oldNode);
    // Guard against writing to index -1, which would add a stray "-1" property.
    if (idx !== -1) {
      this.childNodes[idx] = newNode;
    }
  }

  /**
   * Get the escaped (as-is) text value of this node and its children.
   * @returns concatenated raw text of all children
   */
  get rawText() {
    let res = '';
    for (let i = 0; i < this.childNodes.length; i++) {
      res += this.childNodes[i].rawText;
    }
    return res;
  }

  /**
   * Get the unescaped text value of this node and its children.
   * @returns text content with entities decoded
   */
  get text() {
    return decode(this.rawText);
  }

  /**
   * Get structured text: block-level elements are separated by '\n' and
   * whitespace inside each line is collapsed.
   * @returns structured text
   */
  get structuredText() {
    const newBlock = () => ({ parts: [] as string[], prependWhitespace: false });
    let currentBlock = newBlock();
    const blocks = [currentBlock];

    // Open a fresh line only when the current one already has content.
    const startBlockIfNeeded = () => {
      if (currentBlock.parts.length > 0) {
        currentBlock = newBlock();
        blocks.push(currentBlock);
      }
    };

    const dfs = (node: Node) => {
      if (node.nodeType === NodeType.ELEMENT_NODE) {
        if (kBlockElements[(node as HTMLElement).tagName]) {
          startBlockIfNeeded();
          node.childNodes.forEach((child) => dfs(child));
          startBlockIfNeeded();
        } else {
          node.childNodes.forEach((child) => dfs(child));
        }
      } else if (node.nodeType === NodeType.TEXT_NODE) {
        if ((node as TextNode).isWhitespace) {
          // Whitespace node, postponed output.
          currentBlock.prependWhitespace = true;
        } else {
          let text = node.text;
          if (currentBlock.prependWhitespace) {
            text = ' ' + text;
            currentBlock.prependWhitespace = false;
          }
          currentBlock.parts.push(text);
        }
      }
    };

    dfs(this);
    return blocks
      // Normalize each line's whitespace.
      .map((block) => block.parts.join('').trim().replace(/\s{2,}/g, ' '))
      .join('\n')
      .replace(/\s+$/, ''); // trimRight
  }

  /**
   * Serialise this element and its subtree to HTML.
   * An element without a tag name (the parser's root) serialises to its
   * children only.
   */
  public toString(): string {
    const tag = this.tagName;
    if (!tag) {
      return this.innerHTML;
    }
    const attrs = this.rawAttrs ? ' ' + this.rawAttrs : '';
    if (/^meta$/i.test(tag)) {
      // Emitted without a closing slash.
      return `<${tag}${attrs}>`;
    }
    if (/^(img|br|hr|area|base|input|doctype|link)$/i.test(tag)) {
      return `<${tag}${attrs} />`;
    }
    return `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
  }

  get innerHTML() {
    return this.childNodes.map((child) => child.toString()).join('');
  }

  /**
   * Replace the children of this element.
   * @param content a node, a list of nodes, or an HTML string to parse; a
   *   string that yields no nodes is stored as a single text node
   */
  public set_content(content: string | Node | Node[]) {
    if (content instanceof Node) {
      content = [content];
    } else if (typeof content === 'string') {
      const r = parse(content);
      content = r.childNodes.length ? r.childNodes : [new TextNode(content)];
    }
    this.childNodes = content as Node[];
  }

  get outerHTML() {
    return this.toString();
  }

  /**
   * Trim this element from the right after seeing a pattern in a TextNode:
   * the matching text node is cut at the match and all following siblings
   * are dropped.
   * @param pattern pattern to find
   * @returns reference to the current node
   */
  public trimRight(pattern: RegExp) {
    for (let i = 0; i < this.childNodes.length; i++) {
      const childNode = this.childNodes[i];
      if (childNode.nodeType === NodeType.ELEMENT_NODE) {
        (childNode as HTMLElement).trimRight(pattern);
      } else {
        const index = childNode.rawText.search(pattern);
        if (index > -1) {
          childNode.rawText = childNode.rawText.substring(0, index);
          // Trim all following nodes; this also ends the loop.
          this.childNodes.length = i + 1;
        }
      }
    }
    return this;
  }

  /**
   * Get the DOM structure as an indented outline of `tag#id.class` lines.
   * Non-whitespace text nodes are shown as `#text`.
   * @returns structure
   */
  get structure() {
    const res = [] as string[];
    let indention = 0;
    const write = (str: string) => {
      res.push(' '.repeat(indention) + str);
    };
    const dfs = (node: HTMLElement) => {
      const idStr = node.id ? ('#' + node.id) : '';
      const classStr = node.classNames.length ? ('.' + node.classNames.join('.')) : '';
      write(node.tagName + idStr + classStr);
      indention++;
      for (let i = 0; i < node.childNodes.length; i++) {
        const childNode = node.childNodes[i];
        if (childNode.nodeType === NodeType.ELEMENT_NODE) {
          dfs(childNode as HTMLElement);
        } else if (childNode.nodeType === NodeType.TEXT_NODE) {
          if (!(childNode as TextNode).isWhitespace) {
            write('#text');
          }
        }
      }
      indention--;
    };
    dfs(this);
    return res.join('\n');
  }

  /**
   * Remove whitespace-only text nodes in this subtree and trim the
   * remaining text nodes.
   * @returns this element
   */
  public removeWhitespace() {
    let o = 0;
    for (let i = 0; i < this.childNodes.length; i++) {
      const node = this.childNodes[i];
      if (node.nodeType === NodeType.TEXT_NODE) {
        if ((node as TextNode).isWhitespace) {
          continue;
        }
        node.rawText = node.rawText.trim();
      } else if (node.nodeType === NodeType.ELEMENT_NODE) {
        (node as HTMLElement).removeWhitespace();
      }
      // Compact kept nodes towards the front of the array.
      this.childNodes[o++] = node;
    }
    this.childNodes.length = o;
    return this;
  }

  /**
   * Query a CSS selector to find matching nodes.
   * Descendants of a matched element are not searched further.
   * @param selector simplified CSS selector, or a Matcher instance (reset before use)
   * @returns matching elements
   */
  public querySelectorAll(selector: string | Matcher) {
    let matcher: Matcher;
    if (selector instanceof Matcher) {
      matcher = selector;
      matcher.reset();
    } else {
      matcher = new Matcher(selector);
    }
    const res = [] as HTMLElement[];
    // Frame: [node, index of next child to visit, whether this node advanced the matcher].
    const stack = [] as [Node, number, boolean][];
    for (let i = 0; i < this.childNodes.length; i++) {
      stack.push([this.childNodes[i], 0, false]);
      while (stack.length) {
        const state = arr_back(stack);
        const el = state[0];
        if (state[1] === 0) {
          // Seen for first time.
          if (el.nodeType !== NodeType.ELEMENT_NODE) {
            stack.pop();
            continue;
          }
          state[2] = matcher.advance(el);
          if (state[2] && matcher.matched) {
            res.push(el as HTMLElement);
            // No need to go further.
            matcher.rewind();
            stack.pop();
            continue;
          }
        }
        if (state[1] < el.childNodes.length) {
          stack.push([el.childNodes[state[1]++], 0, false]);
        } else {
          if (state[2]) {
            matcher.rewind();
          }
          stack.pop();
        }
      }
    }
    return res;
  }

  /**
   * Query a CSS selector to find the first matching node.
   * @param selector simplified CSS selector, or a Matcher instance (reset before use)
   * @returns first matching element in document order, or null
   */
  public querySelector(selector: string | Matcher) {
    let matcher: Matcher;
    if (selector instanceof Matcher) {
      matcher = selector;
      matcher.reset();
    } else {
      matcher = new Matcher(selector);
    }
    // Frame: [node, index of next child to visit, whether this node advanced the matcher].
    const stack = [] as [Node, number, boolean][];
    for (let i = 0; i < this.childNodes.length; i++) {
      stack.push([this.childNodes[i], 0, false]);
      while (stack.length) {
        const state = arr_back(stack);
        const el = state[0];
        if (state[1] === 0) {
          // Seen for first time.
          if (el.nodeType !== NodeType.ELEMENT_NODE) {
            stack.pop();
            continue;
          }
          state[2] = matcher.advance(el);
          if (state[2] && matcher.matched) {
            return el as HTMLElement;
          }
        }
        if (state[1] < el.childNodes.length) {
          stack.push([el.childNodes[state[1]++], 0, false]);
        } else {
          if (state[2]) {
            matcher.rewind();
          }
          stack.pop();
        }
      }
    }
    return null;
  }

  /**
   * Append a child node to childNodes. Elements get their parentNode set
   * to this element.
   * @param node node to append
   * @returns the node appended
   */
  public appendChild<T extends Node = Node>(node: T) {
    this.childNodes.push(node);
    if (node instanceof HTMLElement) {
      node.parentNode = this;
    }
    return node;
  }

  /**
   * Get first child node.
   * @returns first child node
   */
  get firstChild() {
    return this.childNodes[0];
  }

  /**
   * Get last child node.
   * @returns last child node
   */
  get lastChild() {
    return arr_back(this.childNodes);
  }

  /**
   * Get attributes (computed once, then cached).
   * @returns parsed and unescaped attributes
   */
  get attributes() {
    if (this._attrs) {
      return this._attrs;
    }
    this._attrs = {};
    const attrs = this.rawAttributes;
    for (const key in attrs) {
      this._attrs[key] = decode(attrs[key]);
    }
    return this._attrs;
  }

  /**
   * Get escaped (as-is) attributes (computed once, then cached).
   * A quoted empty value such as `alt=""` yields an empty string.
   * @returns parsed attributes
   */
  get rawAttributes() {
    if (this._rawAttrs) {
      return this._rawAttrs;
    }
    const attrs = {} as RawAttributes;
    if (this.rawAttrs) {
      // `*` (not `+`) inside the quotes so that empty quoted values are
      // recognised instead of being captured as the literal text `""`.
      const re = /\b([a-z][a-z0-9\-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/ig;
      let match: RegExpExecArray | null;
      while ((match = re.exec(this.rawAttrs)) !== null) {
        attrs[match[1]] = match[2] || match[3] || match[4] || '';
      }
    }
    this._rawAttrs = attrs;
    return attrs;
  }
}

/**
 * Signature shared by every predicate in `functionCache`.
 */
type MatchFunction = (el: HTMLElement, tagName: string, classes: string[], attr_key: string, value: string) => boolean;

interface MatherFunction {
  func: MatchFunction;
  tagName: string;
  classes: string[];
  attr_key: string;
  value: string;
}

/**
 * Cache of compiled selector parts, keyed by the selector part text.
 * A Map is used so that parts such as "constructor" cannot collide with
 * Object.prototype members.
 */
const pMatchFunctionCache = new Map<string, MatherFunction>();

/** True when the element id equals the `#id` selector part. */
function matchesId(el: HTMLElement, idSelector: string) {
  return el.id === idSelector.slice(1);
}

/** True when the element carries every class in `classes`. */
function hasAllClasses(el: HTMLElement, classes: string[]) {
  return classes.every((cls) => el.classNames.indexOf(cls) !== -1);
}

/** True when the element has attribute `key` with exactly `value`. */
function attributeEquals(el: HTMLElement, key: string, value: string) {
  const attrs = el.attributes;
  return Object.prototype.hasOwnProperty.call(attrs, key) && attrs[key] === value;
}

/**
 * Predicates indexed by the name built in the Matcher constructor:
 * 1 = id, 2 = attribute equals, 6 = attribute not equal, 3 = tag name,
 * 4 = classes, 5 = terminator (always appended).
 */
const functionCache: { [name: string]: MatchFunction } = {
  f145: (el, tagName, classes) => matchesId(el, tagName) && hasAllClasses(el, classes),
  f45: (el, _tagName, classes) => hasAllClasses(el, classes),
  f15: (el, tagName) => matchesId(el, tagName),
  f1: (el, tagName) => matchesId(el, tagName),
  f5: () => true,
  f245: (el, _tagName, classes, attr_key, value) => attributeEquals(el, attr_key, value) && hasAllClasses(el, classes),
  f25: (el, _tagName, _classes, attr_key, value) => attributeEquals(el, attr_key, value),
  f2: (el, _tagName, _classes, attr_key, value) => attributeEquals(el, attr_key, value),
  f645: (el, _tagName, classes, attr_key, value) => !attributeEquals(el, attr_key, value) && hasAllClasses(el, classes),
  f65: (el, _tagName, _classes, attr_key, value) => !attributeEquals(el, attr_key, value),
  f345: (el, tagName, classes) => el.tagName === tagName && hasAllClasses(el, classes),
  f35: (el, tagName) => el.tagName === tagName,
  f3: (el, tagName) => el.tagName === tagName
};

/**
 * Matches a leading `[key=value]` or `[key!=value]` selector part.
 * Groups: 1 key, 2 operator, 3 quote character, 4 quoted value, 5 bare value.
 * The key is lazy so that the `!` of `!=` is not swallowed into the key.
 */
const kAttributeSelectorPattern = /^\[\s*(\S+?)\s*(!=|=)\s*(?:(["'])([\s\S]*)\3|(\S*?))\]\s*/;

/**
 * Matcher class to make CSS match.
 *
 * Supports whitespace-separated parts, each being `tag`, `*`, `#id`,
 * `[key=value]` or `[key!=value]`, optionally followed by `.class` suffixes.
 * `[key!=value]` matches elements that lack the attribute or hold a
 * different value.
 */
export class Matcher {
  private matchers: MatherFunction[];
  private nextMatch = 0;

  /**
   * Creates an instance of Matcher.
   * @param selector simplified CSS selector
   */
  constructor(selector: string) {
    // Split on runs of whitespace so repeated spaces do not create empty
    // parts (an empty part matches any element).
    this.matchers = selector.trim().split(/\s+/).map((part) => {
      const cached = pMatchFunctionCache.get(part);
      if (cached) {
        return cached;
      }
      let function_name = 'f';
      let tagName: string;
      let classes: string[];
      let attr_key = '';
      let value = '';
      const attrMatch = part.match(kAttributeSelectorPattern);
      if (attrMatch) {
        // Handle the bracket expression before splitting on '.', so that
        // attribute values may themselves contain dots.
        tagName = attrMatch[0];
        attr_key = attrMatch[1];
        value = attrMatch[4] || attrMatch[5] || '';
        classes = part.slice(attrMatch[0].length).split('.').slice(1).sort();
        function_name += attrMatch[2] === '!=' ? '6' : '2';
      } else {
        const parts = part.split('.');
        tagName = parts[0];
        classes = parts.slice(1).sort();
        if (tagName && tagName !== '*') {
          function_name += tagName[0] === '#' ? '1' : '3';
        }
      }
      if (classes.length > 0) {
        function_name += '4';
      }
      function_name += '5';
      const compiled: MatherFunction = {
        func: functionCache[function_name],
        tagName: tagName || '',
        classes,
        attr_key,
        value
      };
      pMatchFunctionCache.set(part, compiled);
      return compiled;
    });
  }

  /**
   * Try to advance the match pointer.
   * @param el element to make the match
   * @returns true when the pointer advanced
   */
  advance(el: Node) {
    const current = this.matchers[this.nextMatch];
    if (this.nextMatch < this.matchers.length && current &&
      current.func(el as HTMLElement, current.tagName, current.classes, current.attr_key, current.value)) {
      this.nextMatch++;
      return true;
    }
    return false;
  }

  /**
   * Rewind the match pointer by one part.
   */
  rewind() {
    this.nextMatch--;
  }

  /**
   * Determine whether the match is made.
   * @returns true when every selector part has been matched
   */
  get matched() {
    return this.nextMatch === this.matchers.length;
  }

  /**
   * Reset the match pointer.
   */
  reset() {
    this.nextMatch = 0;
  }

  /**
   * Flush the cache to free memory.
   */
  flushCache() {
    pMatchFunctionCache.clear();
  }
}

// https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name
// First alternative: an HTML comment. Second: an opening or closing tag with
// groups 1 = leading '/', 2 = tag name, 3 = raw attributes, 4 = trailing '/'.
const kMarkupPattern = /<!--[\s\S]*?(?=-->)-->|<(\/?)([a-z][-.0-9_a-z]*)\s*([^>]*?)(\/?)>/ig;
// `*` inside the quotes so that `id=""` / `class=""` are read as empty rather
// than as the literal text `""`.
const kAttributePattern = /(^|\s)(id|class)\s*=\s*("([^"]*)"|'([^']*)'|(\S+))/ig;
const kSelfClosingElements: { [tag: string]: boolean } = {
  area: true,
  base: true,
  br: true,
  col: true,
  hr: true,
  img: true,
  input: true,
  link: true,
  meta: true
};
// currentTag -> opening tags that implicitly close it.
const kElementsClosedByOpening: { [tag: string]: { [tag: string]: boolean } } = {
  li: { li: true },
  p: { p: true, div: true },
  b: { div: true },
  td: { td: true, th: true },
  th: { td: true, th: true },
  h1: { h1: true },
  h2: { h2: true },
  h3: { h3: true },
  h4: { h4: true },
  h5: { h5: true },
  h6: { h6: true }
};
// currentTag -> closing tags that implicitly close it.
const kElementsClosedByClosing: { [tag: string]: { [tag: string]: boolean } } = {
  li: { ul: true, ol: true },
  a: { div: true },
  b: { div: true },
  i: { div: true },
  p: { div: true },
  td: { tr: true, table: true },
  th: { tr: true, table: true }
};
// Elements whose content is taken verbatim up to the matching close tag.
const kBlockTextElements: { [tag: string]: boolean } = {
  script: true,
  noscript: true,
  style: true,
  pre: true
};

/**
 * Parse a chunk of HTML source and return a root element.
 *
 * The result carries a `valid` flag that is true when every opened element
 * was closed. Unless `noFix` is set, unclosed elements are repaired by
 * re-homing their children; with `noFix` the input is returned wrapped in a
 * single TextNode instead of a tree.
 *
 * NOTE(review): text before the first tag and after the last tag is not
 * added to the tree, because text is only flushed when a later tag is found.
 *
 * @param data html
 * @param options `lowerCaseTagName` lower-cases tag names; `script`,
 *   `style` and `pre` keep the text content of those elements (it is
 *   discarded otherwise); `noFix` disables the repair pass
 * @returns root element (or a TextNode when `noFix` is set)
 */
export function parse(data: string, options?: {
  lowerCaseTagName?: boolean;
  noFix?: boolean;
  script?: boolean;
  style?: boolean;
  pre?: boolean;
}) {
  const root = new HTMLElement(null, {});
  let currentParent = root;
  const stack = [root];
  let lastTextPos = -1;
  const opts = (options || {}) as { [key: string]: boolean | undefined };
  // The pattern is a shared global regex; make sure a previous aborted
  // parse cannot leave a stale position behind.
  kMarkupPattern.lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = kMarkupPattern.exec(data)) !== null) {
    if (lastTextPos > -1) {
      if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) {
        // There is text between the previous markup and this one.
        const text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length);
        currentParent.appendChild(new TextNode(text));
      }
    }
    lastTextPos = kMarkupPattern.lastIndex;
    if (match[0][1] === '!') {
      // This is a comment.
      continue;
    }
    if (opts.lowerCaseTagName) {
      match[2] = match[2].toLowerCase();
    }
    if (!match[1]) {
      // Not a closing tag: collect id/class and create the element.
      const attrs: KeyAttributes = {};
      kAttributePattern.lastIndex = 0;
      let attMatch: RegExpExecArray | null;
      while ((attMatch = kAttributePattern.exec(match[3])) !== null) {
        // The pattern is case-insensitive; normalise so `ID=` is honoured.
        const key = attMatch[2].toLowerCase() as keyof KeyAttributes;
        attrs[key] = attMatch[4] || attMatch[5] || attMatch[6];
      }
      if (!match[4] && kElementsClosedByOpening[currentParent.tagName]) {
        if (kElementsClosedByOpening[currentParent.tagName][match[2]]) {
          // The new tag implicitly closes the current one.
          stack.pop();
          currentParent = arr_back(stack);
        }
      }
      currentParent = currentParent.appendChild(new HTMLElement(match[2], attrs, match[3]));
      stack.push(currentParent);
      if (kBlockTextElements[match[2]]) {
        // Skip ahead to the matching close tag; content is not parsed.
        const closeMarkup = '</' + match[2] + '>';
        const index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
        if (opts[match[2]]) {
          let text: string;
          if (index === -1) {
            // There is no matching ending for the text element.
            text = data.substring(kMarkupPattern.lastIndex);
          } else {
            text = data.substring(kMarkupPattern.lastIndex, index);
          }
          if (text.length > 0) {
            currentParent.appendChild(new TextNode(text));
          }
        }
        if (index === -1) {
          // Past the end of input: the next exec() fails and ends the loop.
          lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
        } else {
          lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length;
          // Fall through to the closing logic below.
          match[1] = 'true';
        }
      }
    }
    if (match[1] || match[4] || kSelfClosingElements[match[2]]) {
      // Closing tag, self-closed tag, or a void element such as br.
      while (true) {
        if (currentParent.tagName === match[2]) {
          stack.pop();
          currentParent = arr_back(stack);
          break;
        } else {
          // Trying to close current tag, and move on.
          if (kElementsClosedByClosing[currentParent.tagName]) {
            if (kElementsClosedByClosing[currentParent.tagName][match[2]]) {
              stack.pop();
              currentParent = arr_back(stack);
              continue;
            }
          }
          // Use aggressive strategy to handle unmatching markups.
          break;
        }
      }
    }
  }
  type Response = (HTMLElement | TextNode) & { valid: boolean; };
  const valid = stack.length === 1;
  if (!opts.noFix) {
    const response = root as Response;
    response.valid = valid;
    while (stack.length > 1) {
      // Handle each unclosed element.
      const last = stack.pop();
      const oneBefore = arr_back(stack);
      if (last.parentNode && (last.parentNode as HTMLElement).parentNode) {
        if (last.parentNode === oneBefore && last.tagName === oneBefore.tagName) {
          // Pair error case: an unclosed element sits directly inside an
          // unclosed element of the same tag. Drop the inner one, hand its
          // children to the outer element's parent, and treat the outer
          // element as handled too.
          oneBefore.removeChild(last);
          last.childNodes.forEach((child) => {
            (oneBefore.parentNode as HTMLElement).appendChild(child);
          });
          stack.pop();
        } else {
          // Single error case: remove the unclosed element and re-home its
          // children on the element below it on the stack.
          oneBefore.removeChild(last);
          last.childNodes.forEach((child) => {
            oneBefore.appendChild(child);
          });
        }
      } else {
        // If it's the final element just skip.
      }
    }
    // Top-level elements must not point back at the synthetic root.
    response.childNodes.forEach((node) => {
      if (node instanceof HTMLElement) {
        node.parentNode = null;
      }
    });
    return response;
  } else {
    const response = new TextNode(data) as Response;
    response.valid = valid;
    return response;
  }
}