Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
seleniumhq
GitHub Repository: seleniumhq/selenium
Path: blob/trunk/third_party/closure/goog/labs/html/sanitizer.js
2868 views
1
// Copyright 2014 The Closure Library Authors. All Rights Reserved.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
// http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS-IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
16
/**
 * @fileoverview
 * An HTML sanitizer that takes untrusted HTML snippets and produces
 * safe HTML by filtering/rewriting tags and attributes that contain
 * high-privilege instructions.
 */
22
23
24
goog.provide('goog.labs.html.Sanitizer');
25
26
goog.require('goog.asserts');
27
goog.require('goog.html.SafeUrl');
28
goog.require('goog.labs.html.attributeRewriterPresubmitWorkaround');
29
goog.require('goog.labs.html.scrubber');
30
goog.require('goog.object');
31
goog.require('goog.string');
32
33
34
35
/**
 * Sanitizer for untrusted, irregular HTML.  It produces more regular HTML
 * in which high-authority constructs, such as the ability to execute
 * arbitrary JavaScript, cannot be abused.
 * @constructor
 */
goog.labs.html.Sanitizer = function() {
  var attrsByElement = goog.labs.html.Sanitizer.createBlankObject_();
  // The wildcard entry exists from the start, so it never has to be added
  // through allowElements().
  attrsByElement['*'] = goog.labs.html.Sanitizer.createBlankObject_();

  /**
   * Element white-list: lower-case element name -> attribute white-list.
   * Each attribute white-list maps a lower-case attribute name to a rewriter,
   * i.e. a function from a value to a value, or to undefined to disallow.
   *
   * The key {@code "*"} holds attributes allowed on every tag.  It suits
   * attributes such as {@code title} and {@code id}, whose meaning does not
   * depend on the element.  It is a poor fit for attributes such as
   * {@code type}, whose meaning varies by element:
   * e.g. {@code <input type=text>} vs {@code <style type=text/css>}.
   *
   * @type {!Object<string, !Object<string, goog.labs.html.AttributeRewriter>>}
   * @private
   */
  this.whitelist_ = attrsByElement;

  // The scrubber inputs derived from the policy are rebuilt lazily, because
  // any change to the policy invalidates them.

  /**
   * Set of allowed element names (name -> {@code true}), kept so the scrubber
   * does not need an own-property check for every tag it filters.
   *
   * {@code null} until first needed, and reset to {@code null} whenever the
   * white-list is modified.
   *
   * @type {Object<string, boolean>}
   * @private
   */
  this.allowedElementSet_ = null;
};
75
76
77
// TODO(user): Should the return type be goog.html.SafeHtml?
// If we receive a safe HTML string as input, should we simply rebalance
// tags?
80
/**
 * Yields a string of safe HTML that contains all and only the safe
 * text-nodes and elements in the input.
 *
 * <p>
 * For the purposes of this function, "safe" is defined thus:
 * <ul>
 *   <li>Contains only elements explicitly allowed via {@code this.allow*}.
 *   <li>Contains only attributes explicitly allowed via {@code this.allow*}
 *       and having had all relevant transformations applied.
 *   <li>Contains an end tag for all and only non-void open tags.
 *   <li>Tags nest per XHTML rules.
 *   <li>Tags do not nest beyond a finite but fairly large level.
 * </ul>
 *
 * <p>
 * The set of allowed element names handed to the scrubber is built on the
 * first call after any change to the white-list and then reused.
 *
 * @param {string} unsafeHtml A string of HTML which need not originate with
 *     a trusted source.  It is coerced to a string before scrubbing.
 * @return {string} A string of HTML that contains only tags and attributes
 *     explicitly allowed by this sanitizer, and with end tags for all and
 *     only non-void elements.
 */
goog.labs.html.Sanitizer.prototype.sanitize = function(unsafeHtml) {
  // Coerce rather than trust the declared type: the scrubber is always
  // handed a primitive string.
  var unsafeHtmlString = '' + unsafeHtml;

  /**
   * @type {!Object<string, !Object<string, goog.labs.html.AttributeRewriter>>}
   */
  var whitelist = this.whitelist_;
  if (!this.allowedElementSet_) {
    this.allowedElementSet_ = goog.object.createSet(
        // This can lead to '*' in the allowed element set, but the scrubber
        // will not parse "<*" as a tag beginning.
        goog.object.getKeys(whitelist));
  }

  return goog.labs.html.scrubber.scrub(
      this.allowedElementSet_, whitelist, unsafeHtmlString);
};
118
119
120
/**
 * Adds the element names to the white-list of elements that are allowed
 * in the safe HTML output.
 * <p>
 * Allowing elements does not, by itself, allow any attributes on
 * those elements.
 * <p>
 * Names are lower-cased before being validated and stored.  An element that
 * is already white-listed keeps its existing attribute white-list.
 *
 * @param {...string} var_args element names that should be allowed in the
 *     safe HTML output.
 * @return {!goog.labs.html.Sanitizer} {@code this}.
 */
goog.labs.html.Sanitizer.prototype.allowElements = function(var_args) {
  // The cached element set is derived from the white-list, so drop it and
  // let sanitize() rebuild it on demand.
  this.allowedElementSet_ = null;  // Invalidate.
  var whitelist = this.whitelist_;
  for (var i = 0; i < arguments.length; ++i) {
    var elementName = arguments[i].toLowerCase();

    goog.asserts.assert(
        goog.labs.html.Sanitizer.isValidHtmlName_(elementName), elementName);

    // Only create an entry for new names; re-allowing an element must not
    // discard attributes that were already white-listed on it.
    if (!Object.prototype.hasOwnProperty.call(whitelist, elementName)) {
      whitelist[elementName] = goog.labs.html.Sanitizer.createBlankObject_();
    }
  }
  return this;
};
146
147
148
/**
 * Allows in the sanitized output
 * <tt>&lt;<i>element</i> <i>attr</i>="..."&gt;</tt>
 * when <i>element</i> is in {@code elementNames} and
 * <i>attr</i> is in {@code attrNames}.
 *
 * If specified, {@code opt_rewriteValue} is a function that takes the
 * HTML-entity-decoded attribute value, and can choose to disallow the
 * attribute by returning {@code null} or substitute a new value
 * by returning a string with the new value.
 *
 * @param {!Array<string>|string} elementNames names (or name) on which the
 *     attributes are allowed.
 *
 *     Element names must be allowed via {@code allowElements(...)} prior
 *     to white-listing attributes; otherwise an {@code Error} is thrown.
 *
 *     The special element name {@code "*"} has the same meaning as in CSS
 *     selectors: it can be used to white-list attributes like {@code title}
 *     and {@code id} which are widely available with element-agnostic
 *     meanings.
 *
 *     It should not be used for attributes like {@code type} whose meaning
 *     differs based on the element on which it appears:
 *     e.g. {@code <input type=text>} vs {@code <style type=text/css>}.
 *
 * @param {!Array<string>|string} attrNames names (or name) of the attribute
 *     that should be allowed.
 *
 * @param {goog.labs.html.AttributeRewriter=} opt_rewriteValue A function
 *     that receives the HTML-entity-decoded attribute value and can return
 *     {@code null} to disallow the attribute entirely or the value for the
 *     attribute as a string.
 *     <p>
 *     The default is the identity function ({@code function(x){return x}}),
 *     and the value rewriter is composed with an attribute specific handler:
 *     <table>
 *      <tr>
 *        <th>href, src</th>
 *        <td>Requires that the value be an absolute URL with a protocol in
 *            (http, https, mailto) or a protocol relative URL.
 *      </tr>
 *     </table>
 *
 * @return {!goog.labs.html.Sanitizer} {@code this}.
 * @throws {Error} if an element name has not been white-listed yet.
 */
goog.labs.html.Sanitizer.prototype.allowAttributes = function(
    elementNames, attrNames, opt_rewriteValue) {
  if (!goog.isArray(elementNames)) {
    elementNames = [elementNames];
  }
  if (!goog.isArray(attrNames)) {
    attrNames = [attrNames];
  }
  goog.asserts.assert(
      !opt_rewriteValue || 'function' === typeof opt_rewriteValue,
      'opt_rewriteValue should be a function');

  // The same rewriter is applied to every (element, attribute) pair, so
  // resolve the default once up front.
  var rewriter = opt_rewriteValue || goog.labs.html.Sanitizer.valueIdentity_;

  var whitelist = this.whitelist_;
  for (var ei = 0, en = elementNames.length; ei < en; ++ei) {
    var elementName = elementNames[ei].toLowerCase();
    goog.asserts.assert(
        goog.labs.html.Sanitizer.isValidHtmlName_(elementName) ||
            '*' === elementName,
        elementName);
    // If the element has not been white-listed then panic.
    // TODO(user): allow allow{Elements,Attributes} to be called in any
    // order if someone needs it.
    if (!Object.prototype.hasOwnProperty.call(whitelist, elementName)) {
      throw new Error(
          'Element must be allowed via allowElements() before its ' +
          'attributes can be white-listed: ' + elementName);
    }
    var attrWhitelist = whitelist[elementName];
    for (var ai = 0, an = attrNames.length; ai < an; ++ai) {
      var attrName = attrNames[ai].toLowerCase();
      goog.asserts.assert(
          goog.labs.html.Sanitizer.isValidHtmlName_(attrName), attrName);

      // If the value has already been allowed, then chain the rewriters
      // so that both white-listers' concerns are met.
      // In that case the default rewriter is not added again, since it was
      // introduced by the call that created the initial white-list entry.
      // The caller's rewriter runs first; the existing (or default) rewriter
      // then sees its output.
      attrWhitelist[attrName] = goog.labs.html.Sanitizer.chain_(
          rewriter,
          Object.prototype.hasOwnProperty.call(attrWhitelist, attrName) ?
              attrWhitelist[attrName] :
              goog.labs.html.Sanitizer.defaultRewriterForAttr_(attrName));
    }
  }
  return this;
};
239
240
241
/**
 * Allocates an object that is as blank as possible.
 *
 * Where {@code Object.create} is available the result has no prototype.
 * That speeds up white-list lookups in the common case of an element that
 * is not white-listed, since there are fewer prototypes to fall back to, and
 * it makes it less likely that a member of {@code Object.prototype} is
 * mistaken for a white-list entry.  Without {@code Object.create} the result
 * is whatever {@code Object(null)} produces.
 *
 * @return {!Object<string, ?>} a reference to a newly allocated object that
 *     does not alias any reference that existed prior.
 * @private
 */
goog.labs.html.Sanitizer.createBlankObject_ = function() {
  if (Object.create) {
    return Object.create(null);
  }
  return Object(null);
};
257
258
259
/**
 * HTML element and attribute names may be almost arbitrary strings, but the
 * sanitizer is more restrictive as to what can be white-listed.
 *
 * Since HTML is case-insensitive, only lower-case identifiers are allowed:
 * an ASCII lower-case letter followed by any number of ASCII lower-case
 * letters, digits, hyphens and colons.
 *
 * @param {string} name
 * @return {boolean} true iff name is a valid white-list key.
 * @private
 */
goog.labs.html.Sanitizer.isValidHtmlName_ = function(name) {
  // Names must be strings.  The explicit check matters because
  // RegExp.prototype.test would otherwise coerce its argument to a string.
  return 'string' === typeof name &&
      // Names must be lower-case and ASCII identifier chars only.
      /^[a-z][a-z0-9\-:]*$/.test(name);
};
275
276
277
/**
 * Attribute rewriter that passes the value through untouched.  Other code in
 * this file compares against this exact function reference to recognize
 * "no rewriting", so it must stay a single shared function.
 *
 * @param {goog.labs.html.AttributeValue} x the attribute value.
 * @return {goog.labs.html.AttributeValue} {@code x}, unchanged.
 * @private
 */
goog.labs.html.Sanitizer.valueIdentity_ = function(x) {
  return x;
};
285
286
287
/**
 * Attribute rewriter that rejects every value.  Other code in this file
 * compares against this exact function reference to recognize
 * "always disallowed", so it must stay a single shared function.
 *
 * @param {goog.labs.html.AttributeValue} x the attribute value; ignored.
 * @return {null} always {@code null}, which disallows the attribute.
 * @private
 */
goog.labs.html.Sanitizer.disallow_ = function(x) {
  return null;
};
295
296
297
/**
 * Chains attribute rewriters so that {@code f} runs first and {@code g} is
 * applied to its result.
 *
 * Two shortcuts avoid building needless wrappers:
 * <ul>
 *   <li>If either rewriter is the identity, the other one is returned as is.
 *   <li>Otherwise, if either rewriter is {@code disallow_}, {@code disallow_}
 *       is returned.
 * </ul>
 *
 * @param {goog.labs.html.AttributeRewriter} f the rewriter applied first.
 * @param {goog.labs.html.AttributeRewriter} g the rewriter applied to the
 *     result of {@code f}.
 * @return {goog.labs.html.AttributeRewriter}
 *     a function that returns g(f(x)), or null if f(x) is null or undefined.
 * @private
 */
goog.labs.html.Sanitizer.chain_ = function(f, g) {
  // Sometimes white-listing code ends up allowing things multiple times.
  if (f === goog.labs.html.Sanitizer.valueIdentity_) {
    return g;
  }
  if (g === goog.labs.html.Sanitizer.valueIdentity_) {
    return f;
  }
  // If someone tries to white-list a really problematic value, we reject
  // it by returning disallow_.  Disallow it quickly.
  if (f === goog.labs.html.Sanitizer.disallow_) {
    return f;
  }
  if (g === goog.labs.html.Sanitizer.disallow_) {
    return g;
  }
  return (
      /**
       * @param {goog.labs.html.AttributeValue} x
       * @return {goog.labs.html.AttributeValue}
       */
      function(x) {
        var y = f(x);
        // Loose comparison on purpose: both null and undefined from f mean
        // "disallowed", and g must not see either.
        return y != null ? g(y) : null;
      });
};
332
333
334
/**
 * Given an attribute name, returns a value rewriter that enforces some
 * minimal safety properties.
 *
 * <p>
 * For the URL attributes {@code href} and {@code src}, it checks that any
 * protocol is on a safe set that doesn't allow script execution.
 * <p>
 * It also blanket disallows CSS ({@code style}) and event handler attributes
 * (any name starting with {@code on}).
 * <p>
 * Every other attribute gets the identity rewriter.
 *
 * @param {string} attrName lower-cased attribute name.
 * @return {goog.labs.html.AttributeRewriter}
 * @private
 */
goog.labs.html.Sanitizer.defaultRewriterForAttr_ = function(attrName) {
  if ('href' === attrName || 'src' === attrName) {
    return goog.labs.html.Sanitizer.checkUrl_;
  }
  // substring() replaces the deprecated substr(); for (0, 2) both yield the
  // first two characters.
  if ('style' === attrName || 'on' === attrName.substring(0, 2)) {
    // TODO(user): delegate to a CSS sanitizer if one is available.
    return goog.labs.html.Sanitizer.disallow_;
  }
  // NOTE(review): only href and src are URL-checked.  Other URL-valued
  // attributes (e.g. action, formaction, xlink:href) fall through to the
  // identity rewriter, so callers white-listing them must supply their own
  // rewriter -- consider whether they should be covered here.
  return goog.labs.html.Sanitizer.valueIdentity_;
};
357
358
359
/**
 * Applied automatically to URL attributes to check that they are safe as per
 * {@link SafeUrl}.
 *
 * A value that already is a {@code goog.html.SafeUrl} is used as is.
 * Anything else is passed through {@code goog.html.SafeUrl.sanitize}, with
 * strings trimmed first.  A result that unwraps to
 * {@code goog.html.SafeUrl.INNOCUOUS_STRING} is treated as unsafe.
 *
 * @param {goog.labs.html.AttributeValue} attrValue a decoded attribute value.
 * @return {goog.html.SafeUrl | null} a URL that is equivalent to the
 *     input or {@code null} if the input is not a safe URL.
 * @private
 */
goog.labs.html.Sanitizer.checkUrl_ = function(attrValue) {
  if (attrValue == null) {
    return null;
  }
  /** @type {!goog.html.SafeUrl} */
  var safeUrl;
  if (attrValue instanceof goog.html.SafeUrl) {
    safeUrl = /** @type {!goog.html.SafeUrl} */ (attrValue);
  } else {
    var candidate = attrValue;
    if (typeof candidate === 'string') {
      // Whitespace at the ends of URL-valued attributes in HTML is ignored.
      candidate = goog.string.trim(/** @type {string} */ (candidate));
    }
    safeUrl = goog.html.SafeUrl.sanitize(
        /** @type {!goog.string.TypedString | string} */ (candidate));
  }
  // Strict comparison: the operands are the unwrapped URL and the
  // INNOCUOUS_STRING constant, so no coercion is wanted.
  if (goog.html.SafeUrl.unwrap(safeUrl) ===
      goog.html.SafeUrl.INNOCUOUS_STRING) {
    return null;
  }
  return safeUrl;
};
390
391
392
// NOTE(review): presumably a no-op invoked only so that the goog.require of
// the same name at the top of this file counts as used -- its definition is
// not visible here, so confirm before removing or moving this call.
goog.labs.html.attributeRewriterPresubmitWorkaround();
393
394