// Path: blob/trunk/third_party/closure/goog/labs/html/sanitizer.js
// 2868 views
// Copyright 2014 The Closure Library Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


/**
 * @fileoverview
 * An HTML sanitizer that takes untrusted HTML snippets and produces
 * safe HTML by filtering/rewriting tags and attributes that contain
 * high-privilege instructions.
 */


goog.provide('goog.labs.html.Sanitizer');

goog.require('goog.asserts');
goog.require('goog.html.SafeUrl');
goog.require('goog.labs.html.attributeRewriterPresubmitWorkaround');
goog.require('goog.labs.html.scrubber');
goog.require('goog.object');
goog.require('goog.string');



/**
 * A sanitizer that converts untrusted, messy HTML into more regular HTML
 * that cannot abuse high-authority constructs like the ability to execute
 * arbitrary JavaScript.
 * @constructor
 */
goog.labs.html.Sanitizer = function() {
  /**
   * Maps the lower-case names of allowed elements to attribute white-lists.
   * An attribute white-list maps lower-case attribute names to functions
   * from values to values or undefined to disallow.
   *
   * The special element name {@code "*"} contains a white-list of attributes
   * allowed on any tag, which is useful for attributes like {@code title} and
   * {@code id} which are widely available with element-agnostic meanings.
   * It should not be used for attributes like {@code type} whose meaning
   * differs based on the element on which it appears:
   * e.g. {@code <input type=text>} vs {@code <style type=text/css>}.
   *
   * @type {!Object<string, !Object<string, goog.labs.html.AttributeRewriter>>}
   * @private
   */
  this.whitelist_ = goog.labs.html.Sanitizer.createBlankObject_();
  this.whitelist_['*'] = goog.labs.html.Sanitizer.createBlankObject_();

  // To use the sanitizer, we build inputs for the scrubber.
  // These inputs are invalidated by changes to the policy, so we (re)build them
  // lazily.

  /**
   * Maps element names to {@code true} so the scrubber does not have to do
   * own property checks for every tag filtered.
   *
   * Built lazily and invalidated when the white-list is modified.
   *
   * @type {Object<string, boolean>}
   * @private
   */
  this.allowedElementSet_ = null;
};


// TODO(user): Should the return type be goog.html.SafeHtml?
// If we receive a safe HTML string as input, should we simply rebalance
// tags?
/**
 * Yields a string of safe HTML that contains all and only the safe
 * text-nodes and elements in the input.
 *
 * <p>
 * For the purposes of this function, "safe" is defined thus:
 * <ul>
 *   <li>Contains only elements explicitly allowed via {@code this.allow*}.
 *   <li>Contains only attributes explicitly allowed via {@code this.allow*}
 *       and having had all relevant transformations applied.
 *   <li>Contains an end tag for all and only non-void open tags.
 *   <li>Tags nest per XHTML rules.
 *   <li>Tags do not nest beyond a finite but fairly large level.
 * </ul>
 *
 * @param {!string} unsafeHtml A string of HTML which need not originate with
 *     a trusted source.
 * @return {!string} A string of HTML that contains only tags and attributes
 *     explicitly allowed by this sanitizer, and with end tags for all and only
 *     non-void elements.
 */
goog.labs.html.Sanitizer.prototype.sanitize = function(unsafeHtml) {
  // Coerce to a primitive string before handing the input to the scrubber.
  var unsafeHtmlString = '' + unsafeHtml;

  /**
   * @type {!Object<string, !Object<string, goog.labs.html.AttributeRewriter>>}
   */
  var whitelist = this.whitelist_;
  if (!this.allowedElementSet_) {
    // Rebuild the lazily computed element set; allowElements() resets it to
    // null whenever it is called.
    this.allowedElementSet_ = goog.object.createSet(
        // This can lead to '*' in the allowed element set, but the scrubber
        // will not parse "<*" as a tag beginning.
        goog.object.getKeys(whitelist));
  }

  return goog.labs.html.scrubber.scrub(
      this.allowedElementSet_, whitelist, unsafeHtmlString);
};


/**
 * Adds the element names to the white-list of elements that are allowed
 * in the safe HTML output.
 * <p>
 * Allowing elements does not, by itself, allow any attributes on
 * those elements.
 *
 * @param {...!string} var_args element names that should be allowed in the
 *     safe HTML output.
 * @return {!goog.labs.html.Sanitizer} {@code this}.
 */
goog.labs.html.Sanitizer.prototype.allowElements = function(var_args) {
  this.allowedElementSet_ = null;  // Invalidate.
  var whitelist = this.whitelist_;
  for (var i = 0; i < arguments.length; ++i) {
    var elementName = arguments[i].toLowerCase();

    goog.asserts.assert(
        goog.labs.html.Sanitizer.isValidHtmlName_(elementName), elementName);

    // Keep any attribute white-list already registered for this element.
    if (!Object.prototype.hasOwnProperty.call(whitelist, elementName)) {
      whitelist[elementName] = goog.labs.html.Sanitizer.createBlankObject_();
    }
  }
  return this;
};


/**
 * Allows in the sanitized output
 * <tt>&lt;<i>element</i> <i>attr</i>="..."&gt;</tt>
 * when <i>element</i> is in {@code elementNames} and
 * <i>attr</i> is in {@code attrNames}.
 *
 * If specified, {@code opt_rewriteValue} is a function that takes the
 * HTML-entity-decoded attribute value, and can choose to disallow the
 * attribute by returning {@code null} or substitute a new value
 * by returning a string with the new value.
 *
 * @param {!Array<string>|string} elementNames names (or name) on which the
 *     attributes are allowed.
 *
 *     Element names should be allowed via {@code allowElements(...)} prior
 *     to white-listing attributes.
 *
 *     The special element name {@code "*"} has the same meaning as in CSS
 *     selectors: it can be used to white-list attributes like {@code title}
 *     and {@code id} which are widely available with element-agnostic
 *     meanings.
 *
 *     It should not be used for attributes like {@code type} whose meaning
 *     differs based on the element on which it appears:
 *     e.g. {@code <input type=text>} vs {@code <style type=text/css>}.
 *
 * @param {!Array<string>|string} attrNames names (or name) of the attribute
 *     that should be allowed.
 *
 * @param {goog.labs.html.AttributeRewriter=} opt_rewriteValue A function
 *     that receives the HTML-entity-decoded attribute value and can return
 *     {@code null} to disallow the attribute entirely or the value for the
 *     attribute as a string.
 *     <p>
 *     The default is the identity function ({@code function(x){return x}}),
 *     and the value rewriter is composed with an attribute specific handler:
 *     <table>
 *      <tr>
 *        <th>href, src</th>
 *        <td>Requires that the value be an absolute URL with a protocol in
 *            (http, https, mailto) or a protocol relative URL.
 *      </tr>
 *     </table>
 *
 * @return {!goog.labs.html.Sanitizer} {@code this}.
 * @throws {Error} if an element name has not previously been white-listed
 *     (via {@code allowElements}, or {@code "*"} which always exists).
 */
goog.labs.html.Sanitizer.prototype.allowAttributes = function(
    elementNames, attrNames, opt_rewriteValue) {
  // Accept a single name as shorthand for a one-element list.
  if (!goog.isArray(elementNames)) {
    elementNames = [elementNames];
  }
  if (!goog.isArray(attrNames)) {
    attrNames = [attrNames];
  }
  goog.asserts.assert(
      !opt_rewriteValue || 'function' === typeof opt_rewriteValue,
      'opt_rewriteValue should be a function');

  var whitelist = this.whitelist_;
  for (var ei = 0; ei < elementNames.length; ++ei) {
    var elementName = elementNames[ei].toLowerCase();
    goog.asserts.assert(
        goog.labs.html.Sanitizer.isValidHtmlName_(elementName) ||
        '*' === elementName,
        elementName);
    // If the element has not been white-listed then panic.
    // TODO(user): allow allow{Elements,Attributes} to be called in any
    // order if someone needs it.
    if (!Object.prototype.hasOwnProperty.call(whitelist, elementName)) {
      throw new Error(
          'Element "' + elementName + '" must be allowed via ' +
          'allowElements() before attributes can be allowed on it');
    }
    var attrWhitelist = whitelist[elementName];
    for (var ai = 0, an = attrNames.length; ai < an; ++ai) {
      var attrName = attrNames[ai].toLowerCase();
      goog.asserts.assert(
          goog.labs.html.Sanitizer.isValidHtmlName_(attrName), attrName);

      // If the value has already been allowed, then chain the rewriters
      // so that both white-listers concerns are met.
      // We do not use the default rewriter here since it should have
      // been introduced by the call that created the initial white-list
      // entry.
      attrWhitelist[attrName] = goog.labs.html.Sanitizer.chain_(
          opt_rewriteValue || goog.labs.html.Sanitizer.valueIdentity_,
          Object.prototype.hasOwnProperty.call(attrWhitelist, attrName) ?
          attrWhitelist[attrName] :
          goog.labs.html.Sanitizer.defaultRewriterForAttr_(attrName));
    }
  }
  return this;
};


/**
 * A new object that is as blank as possible.
 *
 * Using {@code Object.create} to create an object with
 * no prototype speeds up whitelist access since there's fewer prototypes
 * to fall-back to for a common case where an element is not in the
 * white-list, and reduces the chance of confusing a member of
 * {@code Object.prototype} with a whitelist entry.
 *
 * @return {!Object<string, ?>} a reference to a newly allocated object that
 *     does not alias any reference that existed prior.
 * @private
 */
goog.labs.html.Sanitizer.createBlankObject_ = function() {
  // Where Object.create is unavailable this falls back to Object(null), which
  // yields an ordinary empty object that still inherits Object.prototype.
  return (Object.create || Object)(null);
};


/**
 * HTML element and attribute names may be almost arbitrary strings, but the
 * sanitizer is more restrictive as to what can be white-listed.
 *
 * Since HTML is case-insensitive, only lower-case identifiers composed of
 * ASCII letters, digits, and select punctuation are allowed.
 *
 * @param {string} name
 * @return {boolean} true iff name is a valid white-list key.
 * @private
 */
goog.labs.html.Sanitizer.isValidHtmlName_ = function(name) {
  return 'string' === typeof name &&  // Names must be strings.
      // Names must be lower-case and ASCII identifier chars only.
      /^[a-z][a-z0-9\-:]*$/.test(name);
};


/**
 * The identity rewriter: allows the attribute value unchanged.
 *
 * @param {goog.labs.html.AttributeValue} x
 * @return {goog.labs.html.AttributeValue}
 * @private
 */
goog.labs.html.Sanitizer.valueIdentity_ = function(x) {
  return x;
};


/**
 * A rewriter that rejects every value by returning {@code null}.
 *
 * @param {goog.labs.html.AttributeValue} x
 * @return {null}
 * @private
 */
goog.labs.html.Sanitizer.disallow_ = function(x) {
  return null;
};


/**
 * Chains attribute rewriters.
 *
 * @param {goog.labs.html.AttributeRewriter} f
 * @param {goog.labs.html.AttributeRewriter} g
 * @return {goog.labs.html.AttributeRewriter}
 *     a function that returns g(f(x)) or null if f(x) is null.
 * @private
 */
goog.labs.html.Sanitizer.chain_ = function(f, g) {
  // Sometimes white-listing code ends up allowing things multiple times.
  if (f === goog.labs.html.Sanitizer.valueIdentity_) {
    return g;
  }
  if (g === goog.labs.html.Sanitizer.valueIdentity_) {
    return f;
  }
  // If someone tries to white-list a really problematic value, we reject
  // it by returning disallow_.  Disallow it quickly.
  if (f === goog.labs.html.Sanitizer.disallow_) {
    return f;
  }
  if (g === goog.labs.html.Sanitizer.disallow_) {
    return g;
  }
  return (
      /**
       * @param {goog.labs.html.AttributeValue} x
       * @return {goog.labs.html.AttributeValue}
       */
      function(x) {
        var y = f(x);
        // Loose comparison on purpose: both null and undefined from f stop
        // the chain.
        return y != null ? g(y) : null;
      });
};


/**
 * Given an attribute name, returns a value rewriter that enforces some
 * minimal safety properties.
 *
 * <p>
 * For url attributes, it checks that any protocol is on a safe set that
 * doesn't allow script execution.
 * <p>
 * It also blanket disallows CSS and event handler attributes.
 *
 * @param {string} attrName lower-cased attribute name.
 * @return {goog.labs.html.AttributeRewriter}
 * @private
 */
goog.labs.html.Sanitizer.defaultRewriterForAttr_ = function(attrName) {
  // NOTE(review): only href and src are URL-checked here; every other
  // attribute name that is not style/on* gets the identity rewriter, so
  // callers presumably need to pass their own opt_rewriteValue for other
  // URL-valued attributes -- confirm.
  if ('href' === attrName || 'src' === attrName) {
    return goog.labs.html.Sanitizer.checkUrl_;
  } else if ('style' === attrName || attrName.lastIndexOf('on', 0) === 0) {
    // lastIndexOf(prefix, 0) === 0 is a prefix test: any name starting with
    // "on" is treated as an event handler attribute.
    // TODO(user): delegate to a CSS sanitizer if one is available.
    return goog.labs.html.Sanitizer.disallow_;
  }
  return goog.labs.html.Sanitizer.valueIdentity_;
};


/**
 * Applied automatically to URL attributes to check that they are safe as per
 * {@link SafeUrl}.
 *
 * @param {goog.labs.html.AttributeValue} attrValue a decoded attribute value.
 * @return {goog.html.SafeUrl | null} a URL that is equivalent to the
 *     input or {@code null} if the input is not a safe URL.
 * @private
 */
goog.labs.html.Sanitizer.checkUrl_ = function(attrValue) {
  if (attrValue == null) {
    return null;
  }
  /** @type {!goog.html.SafeUrl} */
  var safeUrl;
  if (attrValue instanceof goog.html.SafeUrl) {
    safeUrl = /** @type {!goog.html.SafeUrl} */ (attrValue);
  } else {
    if (typeof attrValue === 'string') {
      // Whitespace at the ends of URL-valued attributes in HTML is ignored.
      attrValue = goog.string.trim(/** @type {string} */ (attrValue));
    }
    safeUrl = goog.html.SafeUrl.sanitize(
        /** @type {!goog.string.TypedString | string} */ (attrValue));
  }
  // A URL whose unwrapped value equals INNOCUOUS_STRING is treated as
  // rejected, so the attribute is dropped rather than emitted.
  if (goog.html.SafeUrl.unwrap(safeUrl) ===
      goog.html.SafeUrl.INNOCUOUS_STRING) {
    return null;
  } else {
    return safeUrl;
  }
};


goog.labs.html.attributeRewriterPresubmitWorkaround();