// File : core/htmlSanitizer/HTMLSanitizer.js

/*
Copyright - 2017 2023 - wwwouaiebe - Contact: https://www.ouaie.be/

This  program is free software;
you can redistribute it and/or modify it under the terms of the
GNU General Public License as published by the Free Software Foundation;
either version 3 of the License, or any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

/*
Changes:
    - v2.0.0:
        - created
    - v3.0.0:
        - Issue ♯175 : Private and static fields and methods are coming
    - v3.1.0:
        - Issue ♯2 : Set all properties as private and use accessors.
Doc reviewed 20210914
Tests ...
*/

import HTMLSanitizerData from './HTMLSanitizerData.js';
32
import HtmlStringValidationResult from './HtmlStringValidationResult.js';
33
import UrlValidationResult from './UrlValidationResult.js';
34
import { SVG_NS, ZERO, NOT_FOUND } from '../../main/Constants.js';
35
/* ------------------------------------------------------------------------------------------------------------------------- */
/**
This class contains methods to sanitize url and string, filtering html tags and attributes
present in the string.

See theHTMLSanitizer for the one and only one instance of this class
*/
/* ------------------------------------------------------------------------------------------------------------------------- */

class HTMLSanitizer {

    /**
    The html string built by the #stringify method
    @type {String}
    */

    #stringifiedHTML = '';

    /**
    The error messages accumulated by the #stringify method
    @type {String}
    */

    #stringifyErrors = '';

    /**
    The HTMLSanitizerData instance used to look up the valid node names and the valid attributes names
    @type {HTMLSanitizerData}
    */

    static #htmlSanitizerData = new HTMLSanitizerData ( );

    /**
    Replace the < > ' " chars and the \u0a00 placeholder with html entities.
    \u0a00 is the placeholder that sanitizeToHtmlString substitutes for the nbsp entity before parsing.
    The ampersand of each entity is written \u0026, so that a tool decoding html entities in this source file
    cannot silently turn a replacement string into the char it is supposed to replace.
    NOTE(review): the ampersand char itself is not escaped by this method - confirm that this is wanted.
    @param {String} htmlString the string to transform
    @return {String} a string with htmlEntities
    */

    #addHtmlEntities ( htmlString ) {
        return htmlString
            .replaceAll ( /\u003c/g, '\u0026lt;' )
            .replaceAll ( /\u003e/g, '\u0026gt;' )
            .replaceAll ( /\u0022/g, '\u0026quot;' )
            .replaceAll ( /\u0027/g, '\u0026apos;' )
            .replaceAll ( /\u0a00/g, '\u0026nbsp;' );
    }

    /**
    Helper method for the #stringify method. Validate an url present in a htmlString. A valid url is added
    to the stringified html as an attribute, an invalid url is reported in the errors and not stringified.
    @param {String} url The url to validate
    @param {String} attributeName The attribute name in which the url was found
    */

    #stringifyUrl ( url, attributeName ) {
        const validUrl = this.sanitizeToUrl ( url, attributeName ).url;
        if ( '' === validUrl && '' !== url ) {
            this.#stringifyErrors +=
                '\nAn invalid url (' + url + ') was removed from a ' + attributeName + ' attribute';
        }
        else {
            this.#stringifiedHTML += ' ' + attributeName + '="' + validUrl + '"';
        }
    }

    /**
    Helper method for the #stringify method. Validate and stringify the attributes of a svg node.
    Each stringified attribute is removed from the node, so that only the rejected attributes remain on the node
    when #addStringifyErrors is called.
    @param {SVGElement} currentNode The svg node for which the attributes are stringified.
    @param {String} nodeName the name of the currentNode
    */

    #stringifySvgAttributes ( currentNode, nodeName ) {
        HTMLSanitizer.#htmlSanitizerData.getValidAttributesNames ( nodeName ).forEach (
            validAttributeName => {
                if ( currentNode.hasAttributeNS ( null, validAttributeName ) ) {
                    this.#stringifiedHTML += ' ' + validAttributeName + '="' +
                        this.#addHtmlEntities ( currentNode.getAttributeNS ( null, validAttributeName ) ) +
                        '"';
                    currentNode.removeAttributeNS ( null, validAttributeName );
                }
            }
        );
    }

    /**
    Helper method for the #stringify method. Validate and stringify the attributes of a HTML node.
    A rel="noopener noreferrer" attribute is added when the node has a target attribute. The href and src
    attributes are validated as url, the others valid attributes are stringified with html entities.
    Each processed attribute is removed from the node, so that only the rejected attributes remain on the node
    when #addStringifyErrors is called.
    @param {HTMLElement} currentNode The HTML node for which the attributes are stringified.
    @param {String} nodeName the name of the currentNode
    */

    #stringifyHTMLAttributes ( currentNode, nodeName ) {
        if ( currentNode.hasAttribute ( 'target' ) ) {
            this.#stringifiedHTML += ' rel="noopener noreferrer"';
        }
        HTMLSanitizer.#htmlSanitizerData.getValidAttributesNames ( nodeName ).forEach (
            validAttributeName => {
                if ( currentNode.hasAttribute ( validAttributeName ) ) {
                    if ( 'href' === validAttributeName || 'src' === validAttributeName ) {
                        this.#stringifyUrl ( currentNode.getAttribute ( validAttributeName ), validAttributeName );
                    }
                    else {
                        this.#stringifiedHTML += ' ' + validAttributeName + '="' +
                            this.#addHtmlEntities ( currentNode.getAttribute ( validAttributeName ) ) +
                            '"';
                    }
                    currentNode.removeAttribute ( validAttributeName );
                }
            }
        );
    }

    /**
    Helper method for the #stringify method. Add the attributes still present on the node to the error string.
    The rel attribute is never reported.
    @param {HTMLElement} currentNode The HTML node for which the remaining attributes are reported.
    */

    #addStringifyErrors ( currentNode ) {
        for ( let attCounter = ZERO; attCounter < currentNode.attributes.length; attCounter ++ ) {
            const attribute = currentNode.attributes [ attCounter ];
            if ( 'rel' !== attribute.name ) {
                this.#stringifyErrors +=
                    '\nAn unsecure attribute ' +
                    attribute.name +
                    '="' +
                    attribute.value +
                    '" was removed.';
            }
        }
    }

    /**
    Transform a node and its descendants into a string, removing all the invalid tags, invalid attributes,
    invalid texts and invalid url's. The result is accumulated in #stringifiedHTML and the errors
    in #stringifyErrors. An invalid tag is removed with all its descendants.
    @param {HTMLElement} sourceNode The node to stringify
    */

    #stringify ( sourceNode ) {
        const childs = sourceNode.childNodes;
        for ( let nodeCounter = 0; nodeCounter < childs.length; nodeCounter ++ ) {
            const currentNode = childs [ nodeCounter ];
            const nodeName = HTMLSanitizer.#htmlSanitizerData.getValidNodeName ( currentNode.nodeName );
            if ( '' === nodeName ) {
                this.#stringifyErrors += '\nAn invalid tag ' + currentNode.nodeName + ' was removed';
            }
            else if ( '\u0023text' === nodeName ) {
                this.#stringifiedHTML += this.#addHtmlEntities ( currentNode.nodeValue );
            }
            else {
                this.#stringifiedHTML += '<' + nodeName;
                if ( 'svg' === nodeName || 'text' === nodeName || 'polyline' === nodeName ) {
                    this.#stringifySvgAttributes ( currentNode, nodeName );
                }
                else {
                    this.#stringifyHTMLAttributes ( currentNode, nodeName );
                }
                this.#stringifiedHTML += '>';
                this.#stringify ( currentNode );
                this.#stringifiedHTML += '</' + nodeName + '>';

                // the valid attributes were removed from the node by the helpers, the remaining ones are errors
                if ( currentNode.attributes ) {
                    this.#addStringifyErrors ( currentNode );
                }
            }
        }
    }

    /**
    Helper function for the #cloneNode method. Clone a svg node. Only the valid attributes are copied
    and the copied attributes are removed from the currentNode.
    @param {SVGElement} currentNode The svg node to clone
    @param {String} nodeName The name of the currentNode
    @return {SVGElement} The new svg node, without child nodes
    */

    #cloneSvg ( currentNode, nodeName ) {
        const newChildNode = document.createElementNS ( SVG_NS, nodeName );
        HTMLSanitizer.#htmlSanitizerData.getValidAttributesNames ( nodeName ).forEach (
            validAttributeName => {
                if ( currentNode.hasAttributeNS ( null, validAttributeName ) ) {
                    newChildNode.setAttributeNS (
                        null,
                        validAttributeName,
                        currentNode.getAttributeNS ( null, validAttributeName )
                    );
                    currentNode.removeAttributeNS ( null, validAttributeName );
                }
            }
        );
        return newChildNode;
    }

    /**
    Helper function for the #cloneNode method. Clone a HTML node. Only the valid attributes are copied,
    the href and src attributes are copied only when the url is valid (see sanitizeToUrl method) and
    a rel="noopener noreferrer" attribute is added when the currentNode has a target attribute.
    @param {HTMLElement} currentNode The html node to clone
    @param {String} nodeName The name of the currentNode
    @return {HTMLElement} The new html node, without child nodes
    */

    #cloneHTML ( currentNode, nodeName ) {
        const newChildNode = document.createElement ( nodeName );
        HTMLSanitizer.#htmlSanitizerData.getValidAttributesNames ( nodeName ).forEach (
            validAttributeName => {
                if ( currentNode.hasAttribute ( validAttributeName ) ) {
                    if ( 'href' === validAttributeName || 'src' === validAttributeName ) {
                        const attributeValue = this.sanitizeToUrl (
                            currentNode.getAttribute ( validAttributeName ),
                            validAttributeName
                        ).url;
                        if ( '' !== attributeValue ) {
                            newChildNode.setAttribute ( validAttributeName, attributeValue );
                        }
                    }
                    else {
                        newChildNode.setAttribute (
                            validAttributeName,
                            currentNode.getAttribute ( validAttributeName )
                        );
                    }
                }
            }
        );
        if ( currentNode.hasAttribute ( 'target' ) ) {
            newChildNode.setAttribute ( 'rel', 'noopener noreferrer' );
        }
        return newChildNode;
    }

    /**
    Deep clone the contents of an HTML node into another node. Only valid tags, valid attributes, valid url's
    and valid texts are cloned. An invalid tag is skipped with all its descendants.

    @param {HTMLElement} clonedNode The node to clone
    @param {HTMLElement} newNode The destination node
    */

    #cloneNode ( clonedNode, newNode ) {
        const childs = clonedNode.childNodes;
        for ( let nodeCounter = 0; nodeCounter < childs.length; nodeCounter ++ ) {
            const currentNode = childs [ nodeCounter ];
            const nodeName = HTMLSanitizer.#htmlSanitizerData.getValidNodeName ( currentNode.nodeName );
            if ( '\u0023text' === nodeName ) {
                newNode.appendChild ( document.createTextNode ( currentNode.nodeValue ) );
            }
            else if ( '' !== nodeName ) {
                const newChildNode =
                    'svg' === nodeName || 'text' === nodeName || 'polyline' === nodeName
                        ?
                        this.#cloneSvg ( currentNode, nodeName )
                        :
                        this.#cloneHTML ( currentNode, nodeName );

                newNode.appendChild ( newChildNode );
                this.#cloneNode ( currentNode, newChildNode );
            }
        }
    }

    /**
    The constructor. The instance is frozen (this has no effect on the private fields).
    */

    constructor ( ) {
        Object.freeze ( this );
    }

    /**
    This method transform a string containing html and svg tags into html and svg elements and copy these elements
    as child nodes of the targetNode. Only tags and attributes present in the HTMLSanitizerData.#validityMap variable
    are copied in the targetNode. Url in the href and src attributes must be valid url (see sanitizeToUrl method).
    When the parsing fails, the content of the targetNode is cleared.
    @param {String} htmlString the string to transform
    @param {HTMLElement} targetNode the node in which the created elements are placed
    */

    sanitizeToHtmlElement ( htmlString, targetNode ) {

        // the string is wrapped in a div, so that the parsing always gives one single root node
        const parseResult = new DOMParser ( ).parseFromString ( '<div>' + htmlString + '</div>', 'text/html' );

        const docFragment = new DocumentFragment ( );
        if ( parseResult && '\u0023document' === parseResult.nodeName ) {
            this.#cloneNode ( parseResult.body.firstChild, docFragment );
            targetNode.appendChild ( docFragment );
        }
        else {
            targetNode.textContent = '';
        }
    }

    /**
    This method clone a DOM node, removing all invalid childs and attributes. The returned node is created
    from the tag name of the given node only: the attributes of the given node itself are not copied.
    @param {HTMLElement} htmlElement The node to clone
    @return {HTMLElement} The cloned node
    */

    clone ( htmlElement ) {
        const clone = document.createElement ( htmlElement.tagName );
        this.#cloneNode ( htmlElement, clone );

        return clone;
    }

    /**
    This method transform a string containing html and svg tags. Tags and attributes not present in the
    HTMLSanitizerData.#validityMap variable are removed. Invalid Url in the href and src attributes are
    also removed (see sanitizeToUrl method)
    @param {String} htmlString the string to transform
    @return {HtmlStringValidationResult} a HtmlStringValidationResult with the result of the validation
    */

    sanitizeToHtmlString ( htmlString ) {

        // ! don't use XMLSerializer. Problems with the quot, apos and nbsp entities and with xmlns

        this.#stringifiedHTML = '';
        this.#stringifyErrors = '';

        // Every nbsp entity is replaced with the \u0a00 placeholder before the parsing, and the placeholder
        // is changed back to the entity by #addHtmlEntities. replaceAll is needed: replace with a string
        // pattern changes only the first occurrence and the others nbsp entities are then lost.
        const parseResult = new DOMParser ( ).parseFromString (
            '<div>' + htmlString.replaceAll ( '\u0026nbsp;', '\u0a00' ) + '</div>',
            'text/html'
        );
        if ( parseResult && '\u0023document' === parseResult.nodeName ) {
            this.#stringify ( parseResult.body.firstChild );
            return new HtmlStringValidationResult ( this.#stringifiedHTML, this.#stringifyErrors );
        }
        return new HtmlStringValidationResult ( '', 'Parsing error' );
    }

    /**
    This method verify that a string contains a valid url.

    A valid url must not contains html tags or html entities or invalid characters
    and must start with a valid protocol.

    Valid protocols are http: and https:. For href attributes mailto:, sms: and tel: are also valid
    and for src attributes, data: is also valid. For src attributes, http: is valid only when the
    current page is loaded with the http: protocol.

    sms: and tel: url's must start with a + followed by digits, * or # chars.
    NOTE(review): the regular expression used for sms: and tel: also accepts the comma char - confirm
    that this is wanted.

    For href attributes, a hash followed by letters, digits or underscores (a link to the document itself)
    is also valid.
    @param {String} urlString The url to validate
    @param {String} attributeName The attribute name in which the url will be placed. must be 'src' or
    null (in this case 'href' is used as default)
    @return {UrlValidationResult} a UrlValidationResult with the result of the validation. The url is an empty
    string when the validation fails.
    */

    sanitizeToUrl ( urlString, attributeName ) {

        const tmpAttributeName = attributeName || 'href';

        // set the url inside a div and then parsing...
        const parseResult = new DOMParser ( ).parseFromString ( '<div>' + urlString + '</div>', 'text/html' );
        if ( ! parseResult || '\u0023document' !== parseResult.nodeName ) {

            // strange: no result or not a document. We return an empty string
            return new UrlValidationResult ( '', 'Parsing error' );
        }

        // Taking the first child node of the parsing and concatenate the childnodes of this node...
        const resultNode = parseResult.body.firstChild;
        let newUrlString = '';
        for ( let nodeCounter = 0; nodeCounter < resultNode.childNodes.length; nodeCounter ++ ) {
            if ( '\u0023text' === resultNode.childNodes [ nodeCounter ].nodeName ) {

                // ...  if only text nodes are found
                newUrlString += resultNode.childNodes [ nodeCounter ].nodeValue;
            }
            else {

                // otherwise returning an empty string
                return new UrlValidationResult ( '', 'Invalid characters found in the url' );
            }
        }

        // removing the < > " ' characters, the corresponding html entities and the corresponding
        // percent-encoded values in a copy of the url ... The ampersand of the entities is written \u0026
        // so that the patterns cannot be corrupted by a tool decoding the html entities in this source file.
        newUrlString = newUrlString
            .replaceAll ( /</g, '' )
            .replaceAll ( />/g, '' )
            .replaceAll ( /"/g, '' )
            .replaceAll ( /\u0027/g, '' )
            .replaceAll ( /\u0026lt;/g, '' )
            .replaceAll ( /\u0026gt;/g, '' )
            .replaceAll ( /\u0026quot;/g, '' )
            .replaceAll ( /\u0026apos;/g, '' )
            .replaceAll ( /%3C/g, '' )
            .replaceAll ( /%3c/g, '' )
            .replaceAll ( /%3E/g, '' )
            .replaceAll ( /%3e/g, '' )
            .replaceAll ( /%22/g, '' )
            .replaceAll ( /%27/g, '' );

        // and comparing the result with the url
        if ( newUrlString !== urlString ) {

            // < > " ' characters found in the url. Returning an empty string
            return new UrlValidationResult ( '', 'Invalid characters found in the url' );
        }

        // creating a list of valid protocols for the url
        const validProtocols = [ 'https:' ];
        if ( 'http:' === window.location.protocol || 'href' === tmpAttributeName ) {
            validProtocols.push ( 'http:' );
        }
        if ( 'href' === tmpAttributeName ) {
            validProtocols.push ( 'mailto:' );
            validProtocols.push ( 'sms:' );
            validProtocols.push ( 'tel:' );

            // the url contains only letters and numbers chars and start with a hash. It's a link to the document itself
            const urlHash = newUrlString.match ( /^\u0023\w*/ );
            if ( urlHash && newUrlString === urlHash [ ZERO ] ) {
                return new UrlValidationResult ( newUrlString, '' );
            }
        }
        if ( 'src' === tmpAttributeName ) {
            validProtocols.push ( 'data:' );
        }

        // We try to create a url object from the url string
        let url = null;
        try {
            url = new URL ( newUrlString );
        }
        catch ( err ) {

            // not possible to create an url. Returning an empty string
            return new UrlValidationResult ( '', 'Invalid url string' );
        }
        if ( NOT_FOUND === validProtocols.indexOf ( url.protocol ) ) {

            // the url protocol is not in the list of valid protocol. Returning an empty string
            return new UrlValidationResult ( '', 'Invalid protocol ' + url.protocol );
        }
        if ( NOT_FOUND !== [ 'sms:', 'tel:' ].indexOf ( url.protocol ) ) {

            // sms and tel url must start with a + and contains only numbers, hash or star
            if ( url.pathname.match ( /^\+[0-9,*,\u0023]*$/ ) ) {
                return new UrlValidationResult ( newUrlString, '' );
            }

            return new UrlValidationResult ( '', 'Invalid sms: or tel: url' );

        }

        // try the encodeURIComponent function on the href part of the url
        try {
            encodeURIComponent ( url.href );
        }
        catch ( err ) {
            return new UrlValidationResult ( '', 'Invalid character in url' );
        }
        return new UrlValidationResult ( newUrlString, '' );
    }

    /**
    Remove all html tags from a string and replace htmlEntities and < > ' " chars with others similar unicode chars.
    An empty string is returned when the parsing fails or when the string contains html tags.
    @param {String} stringToSanitize the string to transform
    @return {String} a string with htmlEntities and < > ' " chars replaced, or an empty string
    */

    sanitizeToJsString ( stringToSanitize ) {

        // Parsing the string inside a div...
        const parseResult = new DOMParser ( ).parseFromString ( '<div>' + stringToSanitize + '</div>', 'text/html' );
        if ( ! parseResult || '\u0023document' !== parseResult.nodeName ) {

            // Bad results from the parsing... Returning an empty string
            return '';
        }
        const resultNode = parseResult.body.firstChild;
        let sanitizedString = '';
        for ( let nodeCounter = 0; nodeCounter < resultNode.childNodes.length; nodeCounter ++ ) {
            if ( '\u0023text' === resultNode.childNodes [ nodeCounter ].nodeName ) {
                sanitizedString += resultNode.childNodes [ nodeCounter ].nodeValue;
            }
            else {

                // The parsing contains others nodes than text string... returning an empty string
                return '';
            }
        }

        // replacing <>'" with others similar chars
        return sanitizedString
            .replaceAll ( /</g, '\u227a' )
            .replaceAll ( />/g, '\u227b' )
            .replaceAll ( /"/g, '\u2033' )
            .replaceAll ( /\u0027/g, '\u2032' );
    }

    /**
    This method verify that a string describe a css color. A valid css color must start with a hash followed by 6 hex numbers
    @param {String} colorString the string to test
    @return {String} the verified color or null if the given color is invalid
    */

    sanitizeToColor ( colorString ) {

        // no comma in the character class: a comma is not a hex digit and must not be accepted
        const newColor = colorString.match ( /^\u0023[0-9A-Fa-f]{6}$/ );
        if ( newColor ) {
            return newColor [ ZERO ];
        }
        return null;
    }
}

/* ------------------------------------------------------------------------------------------------------------------------- */
/**
The one and only one instance of HTMLSanitizer class
@type {HTMLSanitizer}
*/
/* ------------------------------------------------------------------------------------------------------------------------- */

// The single shared instance of the class, exported as the default export of the module.
// The HTMLSanitizer constructor freezes the instance.
const theHTMLSanitizer = new HTMLSanitizer ( );

export default theHTMLSanitizer;

/* --- End of file --------------------------------------------------------------------------------------------------------- */