var bufferFrom = require('buffer-from');
var Transform = require('readable-stream').Transform;
var inherits = require('inherits');

inherits(Tokenize, Transform);
module.exports = Tokenize;

// Byte values of the ASCII characters the tokenizer reacts to.
var codes = {
    lt: '<'.charCodeAt(0),
    gt: '>'.charCodeAt(0),
    slash: '/'.charCodeAt(0),
    dquote: '"'.charCodeAt(0),
    squote: "'".charCodeAt(0),
    equal: '='.charCodeAt(0)
};

// Byte sequences that start or end a "raw" section. While `this.raw` holds
// one of the end markers, incoming bytes are not interpreted as tags; the
// tokenizer only watches for that marker. The longest entry is 9 bytes,
// which is why `_last` keeps a 9-byte window.
var strings = {
    endScript: bufferFrom('</script'),
    endStyle: bufferFrom('</style'),
    endTitle: bufferFrom('</title'),
    comment: bufferFrom('<!--'),
    endComment: bufferFrom('-->'),
    cdata: bufferFrom('<![CDATA['),
    endCdata: bufferFrom(']]>')
};

// Sub-states used while the scanner is inside a tag (`state === 'open'`).
var states = {
    'TagNameState': 1,
    'AttributeNameState': 2,
    'BeforeAttributeValueState': 3,
    'AttributeValueState': 4
};

/**
 * Transform stream that consumes HTML bytes and emits rows of the form
 * `[ type, buffer ]`, where `type` is 'open', 'close' or 'text'.
 * Can be called with or without `new`.
 */
function Tokenize () {
    if (!(this instanceof Tokenize)) return new Tokenize;
    Transform.call(this);
    // Rows are arrays, so the readable side must be in object mode.
    this._readableState.objectMode = true;
    this.state = 'text';      // 'text' or 'open' (inside a tag)
    this.tagState = null;     // one of `states` while inside a tag
    this.quoteState = null;   // 'double', 'single' or null
    this.raw = null;          // end marker being searched for, if any
    this.buffers = [];        // pieces of the token being accumulated
    this._last = [];          // sliding window of the most recent bytes
    this._prev = null;        // chunk held back because it ended with '<'
    this._offset = 0;         // start of unconsumed data inside `_prev`
}

/**
 * Scan one chunk byte by byte, emitting a row whenever a token is complete.
 *
 * @param {Buffer} buf - incoming chunk
 * @param {string} enc - encoding (unused)
 * @param {Function} next - called when the chunk has been consumed
 */
Tokenize.prototype._transform = function (buf, enc, next) {
    var i = 0;
    var offset = 0;

    if (this._prev) {
        // Resume on the trailing '<' of the held-back chunk, now that the
        // byte following it is available.
        buf = Buffer.concat([ this._prev, buf ]);
        i = this._prev.length - 1;
        offset = this._offset;
        this._prev = null;
        this._offset = 0;
    }

    for (; i < buf.length; i++) {
        var b = buf[i];
        this._last.push(b);
        if (this._last.length > 9) this._last.shift();

        // detect end of raw character mode (comment, script,..)
        if (this.raw) {
            var parts = this._testRaw(buf, offset, i);
            if (parts) {
                this.push([ 'text', parts[0] ]);

                if (this.raw === strings.endComment
                || this.raw === strings.endCdata) {
                    // The end marker is a complete token by itself.
                    this.state = 'text';
                    this.buffers = [];
                    this.push([ 'close', parts[1] ]);
                }
                else {
                    // The marker is only the start of a closing tag; keep
                    // scanning for its '>'.
                    this.state = 'open';
                    this.buffers = [ parts[1] ];
                }

                this.raw = null;
                offset = i + 1;
            }
        }
        // ask for more data if last byte is '<'
        else if (this.state === 'text' && b === codes.lt
        && i === buf.length - 1) {
            this._prev = buf;
            this._offset = offset;
            return next();
        }
        // detect a tag opening
        else if (this.state === 'text' && b === codes.lt
        && !isWhiteSpace(buf[i+1])) {
            if (i > 0 && i - offset > 0) {
                this.buffers.push(buf.slice(offset, i));
            }
            offset = i;
            this.state = 'open';
            this.tagState = states.TagNameState;
            this._pushState('text');
        }
        else if (
            this.tagState === states.TagNameState &&
            isWhiteSpace(b)
        ) {
            this.tagState = states.AttributeNameState;
        }
        else if (
            this.tagState === states.AttributeNameState &&
            b === codes.equal
        ) {
            this.tagState = states.BeforeAttributeValueState;
        }
        else if (
            this.tagState === states.BeforeAttributeValueState &&
            isWhiteSpace(b)
        ) {
            // whitespace between '=' and the value: stay in this state
        }
        else if (
            this.tagState === states.BeforeAttributeValueState &&
            b !== codes.gt
        ) {
            this.tagState = states.AttributeValueState;
            if (b === codes.dquote) {
                this.quoteState = 'double';
            } else if (b === codes.squote) {
                this.quoteState = 'single';
            } else {
                this.quoteState = null;
            }
        }
        else if (
            this.tagState === states.AttributeValueState &&
            !this.quoteState &&
            isWhiteSpace(b)
        ) {
            this.tagState = states.AttributeNameState;
        }
        else if (
            this.tagState === states.AttributeValueState &&
            this.quoteState === 'double' &&
            b === codes.dquote
        ) {
            this.quoteState = null;
            this.tagState = states.AttributeNameState;
        }
        else if (
            this.tagState === states.AttributeValueState &&
            this.quoteState === 'single' &&
            b === codes.squote
        ) {
            this.quoteState = null;
            this.tagState = states.AttributeNameState;
        }
        // a '>' outside of a quoted attribute value ends the tag
        else if (this.state === 'open' && b === codes.gt && !this.quoteState) {
            this.buffers.push(buf.slice(offset, i + 1));
            offset = i + 1;
            this.state = 'text';
            this.tagState = null;
            if (this._getChar(1) === codes.slash) {
                this._pushState('close');
            }
            else {
                var tag = this._getTag();
                if (tag === 'script') this.raw = strings.endScript;
                if (tag === 'style') this.raw = strings.endStyle;
                if (tag === 'title') this.raw = strings.endTitle;
                this._pushState('open');
            }
        }
        else if (this.state === 'open' && compare(this._last, strings.comment)) {
            this.buffers.push(buf.slice(offset, i + 1));
            offset = i + 1;
            this.state = 'text';
            this.raw = strings.endComment;
            this._pushState('open');
        }
        else if (this.state === 'open' && compare(this._last, strings.cdata)) {
            this.buffers.push(buf.slice(offset, i + 1));
            offset = i + 1;
            this.state = 'text';
            this.raw = strings.endCdata;
            this._pushState('open');
        }
    }

    // Keep whatever is left of this chunk for the token still in progress.
    if (offset < buf.length) this.buffers.push(buf.slice(offset));
    next();
};

/**
 * Emit any pending text and end the readable side.
 *
 * @param {Function} next - called once flushing is done
 */
Tokenize.prototype._flush = function (next) {
    if (this._prev) {
        // The input ended on a lone '<' that was held back waiting for more
        // data; treat the unconsumed remainder as text so it is not lost.
        this.buffers.push(this._prev.slice(this._offset));
        this._prev = null;
        this._offset = 0;
    }
    if (this.state === 'text') this._pushState('text');
    this.push(null);
    next();
};

/**
 * Emit the accumulated buffers as a single `[ ev, buffer ]` row and reset
 * the accumulator. Does nothing when no data has been accumulated.
 *
 * @param {string} ev - row type: 'open', 'close' or 'text'
 */
Tokenize.prototype._pushState = function (ev) {
    if (this.buffers.length === 0) return;
    var buf = Buffer.concat(this.buffers);
    this.buffers = [];
    this.push([ ev, buf ]);
};

/**
 * Return the byte at logical position `i` of the accumulated buffers,
 * as if they were concatenated.
 *
 * @param {number} i - zero-based position
 * @returns {number|undefined} the byte, or undefined when out of range
 */
Tokenize.prototype._getChar = function (i) {
    var offset = 0;
    for (var j = 0; j < this.buffers.length; j++) {
        var buf = this.buffers[j];
        if (offset + buf.length > i) {
            return buf[i - offset];
        }
        // Advance by the buffer's length (not the buffer itself) so that
        // positions beyond the first buffer resolve correctly.
        offset += buf.length;
    }
};

/**
 * Read the tag name from the accumulated buffers, skipping the very first
 * byte and stopping at the first character the pattern below rejects.
 *
 * @returns {string} the lower-cased tag name
 */
Tokenize.prototype._getTag = function () {
    var offset = 0;
    var tag = '';
    for (var j = 0; j < this.buffers.length; j++) {
        var buf = this.buffers[j];
        for (var k = 0; k < buf.length; k++) {
            if (offset === 0 && k === 0) continue;
            var c = String.fromCharCode(buf[k]);
            if (/[^\w-!\[\]]/.test(c)) {
                return tag.toLowerCase();
            }
            else tag += c;
        }
        offset += buf.length;
    }
    // No terminating character seen; return what was collected.
    return tag.toLowerCase();
};

/**
 * Check whether the most recent bytes end with the active raw end marker.
 * On a match, the current chunk's pending bytes are appended to
 * `this.buffers` and the accumulated data is split around the marker.
 *
 * @param {Buffer} buf - chunk being scanned
 * @param {number} offset - start of the unconsumed data in `buf`
 * @param {number} index - position of the byte just read
 * @returns {Buffer[]|undefined} `[ before, marker ]`, or undefined when the
 *   marker has not been reached
 */
Tokenize.prototype._testRaw = function (buf, offset, index) {
    var raw = this.raw, last = this._last;
    if (!compare(last, raw)) return;

    this.buffers.push(buf.slice(offset, index + 1));
    var joined = Buffer.concat(this.buffers);
    var k = joined.length - raw.length;
    return [ joined.slice(0, k), joined.slice(k) ];
};

/**
 * Case-insensitive (ASCII) check that sequence `a` ends with sequence `b`.
 *
 * @param {ArrayLike<number>} a - bytes to inspect
 * @param {ArrayLike<number>} b - expected trailing bytes
 * @returns {boolean}
 */
function compare (a, b) {
    if (a.length < b.length) return false;
    for (var i=a.length-1, j=b.length-1; i >= 0 && j >= 0; i--, j--) {
        if (lower(a[i]) !== lower(b[j])) return false;
    }
    return true;
}

/**
 * Map an upper-case ASCII letter code (65-90) to its lower-case code;
 * any other value is returned unchanged.
 *
 * @param {number} n - byte value
 * @returns {number}
 */
function lower (n) {
    if (n >= 65 && n <= 90) return n + 32;
    return n;
}

/**
 * Tell whether a byte is a space, tab, line feed, form feed or
 * carriage return.
 *
 * @param {number} b - byte value
 * @returns {boolean}
 */
function isWhiteSpace(b) {
    return b === 0x20 || b === 0x09 || b === 0x0A || b === 0x0C || b === 0x0D;
}