require('apollojs'); var entities = require('entities'); /** * Node Class as base class for TextNode and HTMLElement. */ function Node() { } $declare(Node, { }); $defenum(Node, { ELEMENT_NODE: 1, TEXT_NODE: 3 }); /** * TextNode to contain a text element in DOM tree. * @param {string} value [description] */ function TextNode(value) { this.rawText = value; } $inherit(TextNode, Node, { /** * Node Type declaration. * @type {Number} */ nodeType: Node.TEXT_NODE, /** * Get unescaped text value of current node and its children. * @return {string} text content */ get text() { return entities.decodeHTML5(this.rawText); }, /** * Detect if the node contains only white space. * @return {bool} */ get isWhitespace() { return /^(\s| )*$/.test(this.rawText); } }); var kBlockElements = { div: true, p: true, // ul: true, // ol: true, li: true, // table: true, // tr: true, td: true, section: true, br: true }; /** * HTMLElement, which contains a set of children. * Note: this is a minimalist implementation, no complete tree * structure provided (no parentNode, nextSibling, * previousSibling etc). * @param {string} name tagName * @param {Object} keyAttrs id and class attribute * @param {Object} rawAttrs attributes in string */ function HTMLElement(name, keyAttrs, rawAttrs) { this.tagName = name; this.rawAttrs = rawAttrs || ''; // this.parentNode = null; this.childNodes = []; if (keyAttrs.id) this.id = keyAttrs.id; if (keyAttrs.class) this.classNames = keyAttrs.class.split(/\s+/); else this.classNames = []; } $inherit(HTMLElement, Node, { /** * Node Type declaration. * @type {Number} */ nodeType: Node.ELEMENT_NODE, /** * Get unescaped text value of current node and its children. * @return {string} text content */ get text() { return entities.decodeHTML5(this.rawText); }, /** * Get escpaed (as-it) text value of current node and its children. * @return {string} text content */ get rawText() { var res = ''; for (var i = 0; i < this.childNodes.length; i++) res += this.childNodes[i].rawText; return res; }, /** * Get structured Text (with '\n' etc.) * @return {string} structured text */ get structuredText() { var currentBlock = []; var blocks = [currentBlock]; function dfs(node) { if (node.nodeType === Node.ELEMENT_NODE) { if (kBlockElements[node.tagName]) { if (currentBlock.length > 0) blocks.push(currentBlock = []); node.childNodes.forEach(dfs); if (currentBlock.length > 0) blocks.push(currentBlock = []); } else { node.childNodes.forEach(dfs); } } else if (node.nodeType === Node.TEXT_NODE) { if (node.isWhitespace) { // Whitespace node, postponed output currentBlock.prependWhitespace = true; } else { var text = node.text; if (currentBlock.prependWhitespace) { text = ' ' + text; currentBlock.prependWhitespace = false; } currentBlock.push(text); } } } dfs(this); return blocks .map(function(block) { // Normalize each line's whitespace return block.join('').trim().replace(/\s{2,}/g, ' '); }) .join('\n').trimRight(); }, /** * Trim element from right (in block) after seeing pattern in a TextNode. * @param {RegExp} pattern pattern to find * @return {HTMLElement} reference to current node */ trimRight: function(pattern) { function dfs(node) { for (var i = 0; i < node.childNodes.length; i++) { var childNode = node.childNodes[i]; if (childNode.nodeType === Node.ELEMENT_NODE) { dfs(childNode); } else { var index = childNode.rawText.search(pattern); if (index > -1) { childNode.rawText = childNode.rawText.substr(0, index); // trim all following nodes. node.childNodes.length = i+1; } } } } dfs(this); return this; }, /** * Get DOM structure * @return {string} strucutre */ get structure() { var res = []; var indention = 0; function write(str) { res.push(' '.repeat(indention) + str); } function dfs(node) { var idStr = node.id ? ('#' + node.id) : ''; var classStr = node.classNames.length ? ('.' + node.classNames.join('.')) : ''; write(node.tagName + idStr + classStr); indention++; for (var i = 0; i < node.childNodes.length; i++) { var childNode = node.childNodes[i]; if (childNode.nodeType === Node.ELEMENT_NODE) { dfs(childNode); } else if (childNode.nodeType === Node.TEXT_NODE) { if (!childNode.isWhitespace) write('#text'); } } indention--; } dfs(this); return res.join('\n'); }, /** * Remove whitespaces in this sub tree. * @return {HTMLElement} pointer to this */ removeWhitespace: function() { var i = 0, o = 0; for (; i < this.childNodes.length; i++) { var node = this.childNodes[i]; if (node.nodeType === Node.TEXT_NODE) { if (node.isWhitespace) continue; node.rawText = node.rawText.trim(); } else if (node.nodeType === Node.ELEMENT_NODE) { node.removeWhitespace(); } this.childNodes[o++] = node; } this.childNodes.length = o; return this; }, /** * Query CSS selector to find matching nodes. * @param {string} selector Simplified CSS selector * @param {Matcher} selector A Matcher instance * @return {HTMLElement[]} matching elements */ querySelectorAll: function(selector) { var matcher; if (selector instanceof Matcher) { matcher = selector; matcher.reset(); } else { matcher = new Matcher(selector); } var res = []; var stack = []; for (var i = 0; i < this.childNodes.length; i++) { stack.push([this.childNodes[i], 0, false]); while (stack.length) { var state = stack.back; var el = state[0]; if (state[1] === 0) { // Seen for first time. if (el.nodeType !== Node.ELEMENT_NODE) { stack.pop(); continue; } if (state[2] = matcher.advance(el)) { if (matcher.matched) { res.push(el); // no need to go further. matcher.rewind(); stack.pop(); continue; } } } if (state[1] < el.childNodes.length) { stack.push([el.childNodes[state[1]++], 0, false]); } else { if (state[2]) matcher.rewind(); stack.pop(); } } } return res; }, /** * Query CSS Selector to find matching node. * @param {string} selector Simplified CSS selector * @param {Matcher} selector A Matcher instance * @return {HTMLElement} matching node */ querySelector: function(selector) { var matcher; if (selector instanceof Matcher) { matcher = selector; matcher.reset(); } else { matcher = new Matcher(selector); } var stack = []; for (var i = 0; i < this.childNodes.length; i++) { stack.push([this.childNodes[i], 0, false]); while (stack.length) { var state = stack.back; var el = state[0]; if (state[1] === 0) { // Seen for first time. if (el.nodeType !== Node.ELEMENT_NODE) { stack.pop(); continue; } if (state[2] = matcher.advance(el)) { if (matcher.matched) { return el; } } } if (state[1] < el.childNodes.length) { stack.push([el.childNodes[state[1]++], 0, false]); } else { if (state[2]) matcher.rewind(); stack.pop(); } } } return null; }, /** * Append a child node to childNodes * @param {Node} node node to append * @return {Node} node appended */ appendChild: function(node) { // node.parentNode = this; this.childNodes.push(node); return node; }, /** * Get first child node * @return {Node} first child node */ get firstChild() { return this.childNodes.front; }, /** * Get last child node * @return {Node} last child node */ get lastChild() { return this.childNodes.back; }, /** * Get attributes * @return {Object} parsed and unescaped attributes */ get attributes() { if (this._attrs) return this._attrs; this._attrs = {}; var attrs = this.rawAttributes; for (var key in attrs) { this._attrs[key] = entities.decodeHTML5(attrs[key]); } return this._attrs; }, /** * Get escaped (as-it) attributes * @return {Object} parsed attributes */ get rawAttributes() { if (this._rawAttrs) return this._rawAttrs; var attrs = {}; if (this.rawAttrs) { var re = /\b([a-z][a-z0-9\-]*)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; for (var match; match = re.exec(this.rawAttrs); ) attrs[match[1]] = match[3] || match[4] || match[5]; } this._rawAttrs = attrs; return attrs; } }); $define(HTMLElement, { __wrap: function(el) { el.childNodes.forEach(function(node) { if (node.rawText) { $wrap(node, TextNode); } else { $wrap(node, HTMLElement); } }); } }); /** * Cache to store generated match functions * @type {Object} */ var gMatchFunctionCache = {}; /** * Matcher class to make CSS match * @param {string} selector Selector */ function Matcher(selector) { this.matchers = selector.split(' ').map(function(matcher) { if (gMatchFunctionCache[matcher]) return gMatchFunctionCache[matcher]; var parts = matcher.split('.'); var tagName = parts[0]; var classes = parts.slice(1).sort(); var source = ''; if (tagName && tagName != '*') { if (tagName[0] == '#') source += 'if (el.id != ' + JSON.stringify(tagName.substr(1)) + ') return false;'; else source += 'if (el.tagName != ' + JSON.stringify(tagName) + ') return false;'; } if (classes.length > 0) source += 'for (var cls = ' + JSON.stringify(classes) + ', i = 0; i < cls.length; i++) if (el.classNames.indexOf(cls[i]) === -1) return false;'; source += 'return true;'; return gMatchFunctionCache[matcher] = new Function('el', source); }); this.nextMatch = 0; } $declare(Matcher, { /** * Trying to advance match pointer * @param {HTMLElement} el element to make the match * @return {bool} true when pointer advanced. */ advance: function(el) { if (this.nextMatch < this.matchers.length && this.matchers[this.nextMatch](el)) { this.nextMatch++; return true; } return false; }, /** * Rewind the match pointer */ rewind: function() { this.nextMatch--; }, /** * Trying to determine if match made. * @return {bool} true when the match is made */ get matched() { return this.nextMatch == this.matchers.length; }, /** * Rest match pointer. * @return {[type]} [description] */ reset: function() { this.nextMatch = 0; } }); $define(Matcher, { /** * flush cache to free memory */ flushCache: function() { gMatchFunctionCache = {}; } }); var kMarkupPattern = /)-->|<(\/?)([a-z][a-z0-9]*)\s*([^>]*?)(\/?)>/ig; var kAttributePattern = /\b(id|class)\s*=\s*("([^"]+)"|'([^']+)'|(\S+))/ig; var kSelfClosingElements = { meta: true, img: true, link: true, input: true, area: true, br: true, hr: true }; var kElementsClosedByOpening = { li: {li: true}, p: {p: true, div: true}, td: {td: true, th: true}, th: {td: true, th: true} }; var kElementsClosedByClosing = { li: {ul: true, ol: true}, p: {div: true}, td: {tr: true, table: true}, th: {tr: true, table: true} }; var kBlockTextElements = { script: true, noscript: true, style: true, pre: true }; /** * Parses HTML and returns a root element */ module.exports = { Matcher: Matcher, Node: Node, HTMLElement: HTMLElement, TextNode: TextNode, /** * Parse a chuck of HTML source. * @param {string} data html * @return {HTMLElement} root element */ parse: function(data, options) { var root = new HTMLElement(null, {}); var currentParent = root; var stack = [root]; var lastTextPos = -1; options = options || {}; for (var match, text; match = kMarkupPattern.exec(data); ) { if (lastTextPos > -1) { if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) { // if has content text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length); currentParent.appendChild(new TextNode(text)); } } lastTextPos = kMarkupPattern.lastIndex; if (match[0][1] == '!') { // this is a comment continue; } if (options.lowerCaseTagName) match[2] = match[2].toLowerCase(); if (!match[1]) { // not or ... var closeMarkup = ''; var index = data.indexOf(closeMarkup, kMarkupPattern.lastIndex); if (options[match[2]]) { if (index == -1) { // there is no matching ending for the text element. text = data.substr(kMarkupPattern.lastIndex); } else { text = data.substring(kMarkupPattern.lastIndex, index); } if (text.length > 0) currentParent.appendChild(new TextNode(text)); } if (index == -1) { lastTextPos = kMarkupPattern.lastIndex = data.length + 1; } else { lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length; match[1] = true; } } } if (match[1] || match[4] || kSelfClosingElements[match[2]]) { // or
etc. while (true) { if (currentParent.tagName == match[2]) { stack.pop(); currentParent = stack.back; break; } else { // Trying to close current tag, and move on if (kElementsClosedByClosing[currentParent.tagName]) { if (kElementsClosedByClosing[currentParent.tagName][match[2]]) { stack.pop(); currentParent = stack.back; continue; } } // Use aggressive strategy to handle unmatching markups. break; } } } } return root; } };