Path: blob/trunk/third_party/closure/goog/format/htmlprettyprinter.js
2868 views
// Copyright 2008 The Closure Library Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * @fileoverview Provides functions to parse and pretty-print HTML strings.
 *
 */

goog.provide('goog.format.HtmlPrettyPrinter');
goog.provide('goog.format.HtmlPrettyPrinter.Buffer');

goog.require('goog.dom.TagName');
goog.require('goog.object');
goog.require('goog.string.StringBuffer');



/**
 * This class formats HTML to be more human-readable.
 * TODO(user): Add hierarchical indentation.
 * @param {number=} opt_timeOutMillis Max # milliseconds to spend on #format. If
 *     this time is exceeded, return partially formatted. 0 or negative number
 *     indicates no timeout.
 * @constructor
 * @final
 */
goog.format.HtmlPrettyPrinter = function(opt_timeOutMillis) {
  /**
   * Max # milliseconds to spend on #format. 0 means "no timeout"; missing,
   * zero and negative constructor arguments are all normalized to 0.
   * @type {number}
   * @private
   */
  this.timeOutMillis_ =
      opt_timeOutMillis && opt_timeOutMillis > 0 ? opt_timeOutMillis : 0;
};


/**
 * Singleton.
 * @private {goog.format.HtmlPrettyPrinter?}
 */
goog.format.HtmlPrettyPrinter.instance_ = null;


/**
 * Singleton lazy initializer. The singleton is created without a timeout.
 * @return {!goog.format.HtmlPrettyPrinter} Singleton.
 * @private
 */
goog.format.HtmlPrettyPrinter.getInstance_ = function() {
  if (!goog.format.HtmlPrettyPrinter.instance_) {
    goog.format.HtmlPrettyPrinter.instance_ =
        new goog.format.HtmlPrettyPrinter();
  }
  return goog.format.HtmlPrettyPrinter.instance_;
};


/**
 * Static utility function. See prototype #format.
 * @param {string} html The HTML text to pretty print.
 * @return {string} Formatted result.
 */
goog.format.HtmlPrettyPrinter.format = function(html) {
  return goog.format.HtmlPrettyPrinter.getInstance_().format(html);
};


/**
 * List of patterns used to tokenize HTML for pretty printing. Cache
 * subexpression for tag name.
 * comment|meta-tag|tag|text|other-less-than-characters
 *
 * Capture group 1 is the optional '/' of an end tag and group 2 is the tag
 * name; both are only set when the tag alternative matched.
 *
 * NOTE: '.' does not match line terminators, so a comment or <!...> construct
 * that spans several lines is not matched by the first two alternatives and
 * is tokenized piecewise by the remaining ones instead.
 *
 * This is a shared regex with the 'g' flag, so it carries lastIndex state
 * between exec() calls.
 * @private {!RegExp}
 * @const
 */
goog.format.HtmlPrettyPrinter.TOKEN_REGEX_ =
    /(?:<!--.*?-->|<!.*?>|<(\/?)(\w+)[^<>]*>|[^<]+|<)/g;


/**
 * Tags whose contents we don't want pretty printed.
 * @private {!Object}
 * @const
 */
goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_ = goog.object.createSet(
    goog.dom.TagName.SCRIPT, goog.dom.TagName.STYLE, goog.dom.TagName.PRE,
    'XMP');


/**
 * 'Block' tags. We should add newlines before and after these tags during
 * pretty printing. Tags drawn mostly from HTML4 definitions for block and other
 * non-inline tags, excepting the ones in
 * #goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_.
 * @private {!Object}
 * @const
 */
goog.format.HtmlPrettyPrinter.BLOCK_TAGS_ = goog.object.createSet(
    goog.dom.TagName.ADDRESS, goog.dom.TagName.APPLET, goog.dom.TagName.AREA,
    goog.dom.TagName.BASE, goog.dom.TagName.BASEFONT,
    goog.dom.TagName.BLOCKQUOTE, goog.dom.TagName.BODY,
    goog.dom.TagName.CAPTION, goog.dom.TagName.CENTER, goog.dom.TagName.COL,
    goog.dom.TagName.COLGROUP, goog.dom.TagName.DIR, goog.dom.TagName.DIV,
    goog.dom.TagName.DL, goog.dom.TagName.FIELDSET, goog.dom.TagName.FORM,
    goog.dom.TagName.FRAME, goog.dom.TagName.FRAMESET, goog.dom.TagName.H1,
    goog.dom.TagName.H2, goog.dom.TagName.H3, goog.dom.TagName.H4,
    goog.dom.TagName.H5, goog.dom.TagName.H6, goog.dom.TagName.HEAD,
    goog.dom.TagName.HR, goog.dom.TagName.HTML, goog.dom.TagName.IFRAME,
    goog.dom.TagName.ISINDEX, goog.dom.TagName.LEGEND, goog.dom.TagName.LINK,
    goog.dom.TagName.MENU, goog.dom.TagName.META, goog.dom.TagName.NOFRAMES,
    goog.dom.TagName.NOSCRIPT, goog.dom.TagName.OL, goog.dom.TagName.OPTGROUP,
    goog.dom.TagName.OPTION, goog.dom.TagName.P, goog.dom.TagName.PARAM,
    goog.dom.TagName.TABLE, goog.dom.TagName.TBODY, goog.dom.TagName.TD,
    goog.dom.TagName.TFOOT, goog.dom.TagName.TH, goog.dom.TagName.THEAD,
    goog.dom.TagName.TITLE, goog.dom.TagName.TR, goog.dom.TagName.UL);


/**
 * Non-block tags that break flow. We insert a line break after, but not before
 * these. Tags drawn from HTML4 definitions.
 *
 * NOTE: NOFRAMES is also listed in BLOCK_TAGS_, and #format consults
 * BLOCK_TAGS_ first, so the NOFRAMES entry here has no effect.
 * @private {!Object}
 * @const
 */
goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_ = goog.object.createSet(
    goog.dom.TagName.BR, goog.dom.TagName.DD, goog.dom.TagName.DT,
    goog.dom.TagName.LI, goog.dom.TagName.NOFRAMES);


/**
 * Empty tags. These are treated as both start and end tags.
 * @private {!Object}
 * @const
 */
goog.format.HtmlPrettyPrinter.EMPTY_TAGS_ = goog.object.createSet(
    goog.dom.TagName.BR, goog.dom.TagName.HR, goog.dom.TagName.ISINDEX);


/**
 * Breaks up HTML so it's easily readable by the user.
 *
 * The input is tokenized with TOKEN_REGEX_ and every token is forwarded to a
 * goog.format.HtmlPrettyPrinter.Buffer together with "break before" and
 * "break after" hints derived from the tag tables above. Only '\n' characters
 * are ever added; no input character is removed after the initial trimming.
 *
 * @param {string} html The HTML text to pretty print.
 * @return {string} Formatted result.
 * @throws {Error} Regex error, data loss, or endless loop detected.
 */
goog.format.HtmlPrettyPrinter.prototype.format = function(html) {
  // Trim leading whitespace, but preserve first indent; in other words, keep
  // any spaces immediately before the first non-whitespace character (that's
  // what $1 is), but remove all other leading whitespace. This adjustment
  // historically had been made in Docs. The motivation is that some
  // browsers prepend several line breaks in designMode.
  html = html.replace(/^\s*?( *\S)/, '$1');

  // Trim trailing whitespace.
  html = html.replace(/\s+$/, '');

  // Keep track of how much time we've used.
  var timeOutMillis = this.timeOutMillis_;
  var startMillis = timeOutMillis ? goog.now() : 0;

  // Handles concatenation of the result and required line breaks.
  var buffer = new goog.format.HtmlPrettyPrinter.Buffer();

  // Declare these for efficiency since we access them in a loop.
  var tokenRegex = goog.format.HtmlPrettyPrinter.TOKEN_REGEX_;
  var nonPpTags = goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_;
  var blockTags = goog.format.HtmlPrettyPrinter.BLOCK_TAGS_;
  var breaksFlowTags = goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_;
  var emptyTags = goog.format.HtmlPrettyPrinter.EMPTY_TAGS_;

  // The tokenizer is a shared global regex. Always start from the beginning of
  // the input, even if a previous run was abandoned part-way (for example by
  // one of the errors thrown below) and left a stale lastIndex behind.
  tokenRegex.lastIndex = 0;

  // Used to verify we're making progress through our regex tokenization.
  var lastIndex = 0;

  // Use this to track non-pretty-printed tags and children.
  var nonPpTagStack = [];

  // Loop through each matched token.
  var match;
  while ((match = tokenRegex.exec(html))) {
    // Get token.
    var token = match[0];

    // Is this token a tag? The match array always has the same length; the
    // capture groups are simply undefined when the tag alternative did not
    // match, so the tag name (group 2) is what tells tags from non-tags.
    var tagName = match[2] ? match[2].toUpperCase() : '';
    var isEndTag = match[1] === '/';

    if (!tagName) {
      // Non-tags, no line break.
      buffer.pushToken(false, token, false);
    } else if (nonPpTags.hasOwnProperty(tagName)) {
      // Non-pretty-printed tags.
      if (isEndTag) {
        // Do we have a matching start tag?
        var stackSize = nonPpTagStack.length;
        var startTagName = stackSize ? nonPpTagStack[stackSize - 1] : null;
        if (startTagName === tagName) {
          // End of non-pretty-printed block. Line break after, but only once
          // the outermost non-pretty-printed block has been closed.
          nonPpTagStack.pop();
          buffer.pushToken(false, token, !nonPpTagStack.length);
        } else {
          // Malformed HTML. No line breaks.
          buffer.pushToken(false, token, false);
        }
      } else {
        // Start of non-pretty-printed block. Line break before, unless we are
        // already nested inside another non-pretty-printed block.
        buffer.pushToken(!nonPpTagStack.length, token, false);
        nonPpTagStack.push(tagName);
      }
    } else if (nonPpTagStack.length) {
      // Inside non-pretty-printed block, no new line breaks.
      buffer.pushToken(false, token, false);
    } else {
      var isEmpty = emptyTags.hasOwnProperty(tagName);
      if (blockTags.hasOwnProperty(tagName)) {
        // Put line break before start block and after end block tags. Empty
        // tags count as both start and end tag.
        buffer.pushToken(isEmpty || !isEndTag, token, isEmpty || isEndTag);
      } else if (breaksFlowTags.hasOwnProperty(tagName)) {
        // Put line break after end flow-breaking tags.
        buffer.pushToken(false, token, isEndTag || isEmpty);
      } else {
        // All other tags, no line break.
        buffer.pushToken(false, token, false);
      }
    }

    // Double check that we're making progress.
    var newLastIndex = tokenRegex.lastIndex;
    if (!token || newLastIndex <= lastIndex) {
      throw new Error('Regex failed to make progress through source html.');
    }
    lastIndex = newLastIndex;

    // Out of time?
    if (timeOutMillis) {
      if (goog.now() - startMillis > timeOutMillis) {
        // Push unprocessed data as one big token and reset regex object.
        buffer.pushToken(false, html.substring(tokenRegex.lastIndex), false);
        tokenRegex.lastIndex = 0;
        break;
      }
    }
  }

  // Ensure we end in a line break.
  buffer.lineBreak();

  // Construct result string.
  var result = String(buffer);

  // Length should be original length plus # line breaks added.
  var expectedLength = html.length + buffer.breakCount;
  if (result.length !== expectedLength) {
    throw new Error('Lost data pretty printing html.');
  }

  return result;
};



/**
 * This class is a buffer to which we push our output. It tracks line breaks to
 * make sure we don't add unnecessary ones.
 * @constructor
 * @final
 */
goog.format.HtmlPrettyPrinter.Buffer = function() {
  /**
   * Tokens to be output in #toString.
   * @type {goog.string.StringBuffer}
   * @private
   */
  this.out_ = new goog.string.StringBuffer();
};


/**
 * Tracks number of line breaks added.
 * @type {number}
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.breakCount = 0;


/**
 * Tracks if we are at the start of a new line.
 * @type {boolean}
 * @private
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.isBeginningOfNewLine_ = true;


/**
 * Tracks if we need a new line before the next token.
 * @type {boolean}
 * @private
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.needsNewLine_ = false;


/**
 * Adds token and necessary line breaks to output buffer.
 * @param {boolean} breakBefore If true, add line break before token if
 *     necessary.
 * @param {string} token Token to push.
 * @param {boolean} breakAfter If true, add line break after token if
 *     necessary.
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.pushToken = function(
    breakBefore, token, breakAfter) {
  // If this token needs a preceding line break, and
  // we haven't already added a line break, and
  // this token does not start with a line break,
  // then add line break.
  // Due to FF3.0 bug with lists, we don't insert a \n
  // right before </ul>. See bug 1520665.
  // The </ul> test is anchored to the start of the token so that only a real
  // </ul> end tag suppresses the break, not any token that merely contains
  // the characters "/ul" (e.g. an attribute value or plain text).
  if ((this.needsNewLine_ || breakBefore) && !/^\r?\n/.test(token) &&
      !/^<\/ul\b/i.test(token)) {
    this.lineBreak();
  }

  // Token.
  this.out_.append(token);

  // Remember if this string ended with a line break so we know we don't have to
  // insert another one before the next token.
  this.isBeginningOfNewLine_ = /\r?\n$/.test(token);

  // Remember if this token requires a line break after it. We don't insert it
  // here because we might not have to if the next token starts with a line
  // break.
  this.needsNewLine_ = breakAfter && !this.isBeginningOfNewLine_;
};


/**
 * Append line break if we need one, i.e. unless the output already sits at the
 * beginning of a line. Every appended break is counted in #breakCount.
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.lineBreak = function() {
  if (!this.isBeginningOfNewLine_) {
    this.out_.append('\n');
    ++this.breakCount;
  }
};


/**
 * @return {string} String representation of tokens.
 * @override
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.toString = function() {
  return this.out_.toString();
};