From a5f42fd007bf3646261b431c52cde53657e21564 Mon Sep 17 00:00:00 2001
From: Dries Buytaert
Date: Sun, 24 May 2009 07:17:14 +0000
Subject: [PATCH] - Patch #470632 by sun: move filter_xss*() into common.inc.

---
 includes/common.inc          | 291 +++++++++++++++++++++++++++++++++++
 modules/filter/filter.module | 282 ---------------------------------
 2 files changed, 291 insertions(+), 282 deletions(-)

diff --git a/includes/common.inc b/includes/common.inc
index b2508709f03..7ef5a7deba0 100644
--- a/includes/common.inc
+++ b/includes/common.inc
@@ -1256,6 +1256,12 @@ function check_file($filename) {
   return is_uploaded_file($filename);
 }
 
+/**
+ * @defgroup sanitization Sanitization functions
+ * @{
+ * Functions to sanitize values.
+ */
+
 /**
  * Prepare a URL for use in an HTML attribute. Strips harmful protocols.
  */
@@ -1263,6 +1269,291 @@ function check_url($uri) {
   return filter_xss_bad_protocol($uri, FALSE);
 }
 
+/**
+ * Very permissive XSS/HTML filter for admin-only use.
+ *
+ * Use only for fields where it is impractical to use the
+ * whole filter system, but where some (mainly inline) mark-up
+ * is desired (so check_plain() is not acceptable).
+ *
+ * Allows all tags that can be used inside an HTML body, save
+ * for scripts and styles.
+ */
+function filter_xss_admin($string) {
+  return filter_xss($string, array('a', 'abbr', 'acronym', 'address', 'b', 'bdo', 'big', 'blockquote', 'br', 'caption', 'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'div', 'dl', 'dt', 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'ol', 'p', 'pre', 'q', 'samp', 'small', 'span', 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'tt', 'ul', 'var'));
+}
+
+/**
+ * Filter XSS.
+ *
+ * Based on kses by Ulf Harnhammar, see
+ * http://sourceforge.net/projects/kses
+ *
+ * For examples of various XSS attacks, see:
+ * http://ha.ckers.org/xss.html
+ *
+ * This code does four things:
+ * - Removes characters and constructs that can trick browsers
+ * - Makes sure all HTML entities are well-formed
+ * - Makes sure all HTML tags and attributes are well-formed
+ * - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g. javascript:)
+ *
+ * @param $string
+ *   The string with raw HTML in it. It will be stripped of everything that can cause
+ *   an XSS attack.
+ * @param $allowed_tags
+ *   An array of allowed tags.
+ */
+function filter_xss($string, $allowed_tags = array('a', 'em', 'strong', 'cite', 'blockquote', 'code', 'ul', 'ol', 'li', 'dl', 'dt', 'dd')) {
+  // Only operate on valid UTF-8 strings. This is necessary to prevent cross
+  // site scripting issues on Internet Explorer 6.
+  if (!drupal_validate_utf8($string)) {
+    return '';
+  }
+  // Store the text format
+  _filter_xss_split($allowed_tags, TRUE);
+  // Remove NULL characters (ignored by some browsers)
+  $string = str_replace(chr(0), '', $string);
+  // Remove Netscape 4 JS entities
+  $string = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);
+
+  // Defuse all HTML entities
+  $string = str_replace('&', '&amp;', $string);
+  // Change back only well-formed entities in our whitelist
+  // Named entities
+  $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\1', $string);
+  // Decimal numeric entities
+  $string = preg_replace('/&amp;#([0-9]+;)/', '&#\1', $string);
+  // Hexadecimal numeric entities
+  $string = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $string);
+
+  return preg_replace_callback('%
+    (
+    <(?=[^a-zA-Z!/])  # a lone <
+    |                 # or
+    <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
+    |                 # or
+    >                 # just a >
+    )%x', '_filter_xss_split', $string);
+}
+
+/**
+ * Processes an HTML tag.
+ *
+ * @param $m
+ *   An array with various meaning depending on the value of $store.
+ *   If $store is TRUE then the array contains the allowed tags.
+ *   If $store is FALSE then the array has one element, the HTML tag to process.
+ * @param $store
+ *   Whether to store $m.
+ * @return
+ *   If the element isn't allowed, an empty string. Otherwise, the cleaned up
+ *   version of the HTML element.
+ */
+function _filter_xss_split($m, $store = FALSE) {
+  static $allowed_html;
+
+  if ($store) {
+    $allowed_html = array_flip($m);
+    return;
+  }
+
+  $string = $m[1];
+
+  if (substr($string, 0, 1) != '<') {
+    // We matched a lone ">" character
+    return '&gt;';
+  }
+  elseif (strlen($string) == 1) {
+    // We matched a lone "<" character
+    return '&lt;';
+  }
+
+  if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?$%', $string, $matches)) {
+    // Seriously malformed
+    return '';
+  }
+
+  $slash = trim($matches[1]);
+  $elem = &$matches[2];
+  $attrlist = &$matches[3];
+
+  if (!isset($allowed_html[strtolower($elem)])) {
+    // Disallowed HTML element
+    return '';
+  }
+
+  if ($slash != '') {
+    return "</$elem>";
+  }
+
+  // Is there a closing XHTML slash at the end of the attributes?
+  $attrlist = preg_replace('%(\s?)/\s*$%', '\1', $attrlist, -1, $count);
+  $xhtml_slash = $count ? ' /' : '';
+
+  // Clean up attributes
+  $attr2 = implode(' ', _filter_xss_attributes($attrlist));
+  $attr2 = preg_replace('/[<>]/', '', $attr2);
+  $attr2 = strlen($attr2) ? ' ' . $attr2 : '';
+
+  return "<$elem$attr2$xhtml_slash>";
+}
+
+/**
+ * Processes a string of HTML attributes.
+ *
+ * @return
+ *   Cleaned up version of the HTML attributes.
+ */
+function _filter_xss_attributes($attr) {
+  $attrarr = array();
+  $mode = 0;
+  $attrname = '';
+
+  while (strlen($attr) != 0) {
+    // Was the last operation successful?
+    $working = 0;
+
+    switch ($mode) {
+      case 0:
+        // Attribute name, href for instance
+        if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {
+          $attrname = strtolower($match[1]);
+          $skip = ($attrname == 'style' || substr($attrname, 0, 2) == 'on');
+          $working = $mode = 1;
+          $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
+        }
+        break;
+
+      case 1:
+        // Equals sign or valueless ("selected")
+        if (preg_match('/^\s*=\s*/', $attr)) {
+          $working = 1; $mode = 2;
+          $attr = preg_replace('/^\s*=\s*/', '', $attr);
+          break;
+        }
+
+        if (preg_match('/^\s+/', $attr)) {
+          $working = 1; $mode = 0;
+          if (!$skip) {
+            $attrarr[] = $attrname;
+          }
+          $attr = preg_replace('/^\s+/', '', $attr);
+        }
+        break;
+
+      case 2:
+        // Attribute value, a URL after href= for instance
+        if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) {
+          $thisval = filter_xss_bad_protocol($match[1]);
+
+          if (!$skip) {
+            $attrarr[] = "$attrname=\"$thisval\"";
+          }
+          $working = 1;
+          $mode = 0;
+          $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
+          break;
+        }
+
+        if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) {
+          $thisval = filter_xss_bad_protocol($match[1]);
+
+          if (!$skip) {
+            $attrarr[] = "$attrname='$thisval'";
+          }
+          $working = 1; $mode = 0;
+          $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
+          break;
+        }
+
+        if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) {
+          $thisval = filter_xss_bad_protocol($match[1]);
+
+          if (!$skip) {
+            $attrarr[] = "$attrname=\"$thisval\"";
+          }
+          $working = 1; $mode = 0;
+          $attr = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr);
+        }
+        break;
+    }
+
+    if ($working == 0) {
+      // not well formed, remove and try again
+      $attr = preg_replace('/
+        ^
+        (
+        "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
+        |               # or
+        \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
+        |               # or
+        \S              # - a non-whitespace character
+        )*              # any number of the above three
+        \s*             # any number of whitespaces
+        /x', '', $attr);
+      $mode = 0;
+    }
+  }
+
+  // the attribute list ends with a valueless attribute like "selected"
+  if ($mode == 1) {
+    $attrarr[] = $attrname;
+  }
+  return $attrarr;
+}
+
+/**
+ * Processes an HTML attribute value and ensures it does not contain an URL with a disallowed protocol (e.g. javascript:).
+ *
+ * @param $string
+ *   The string with the attribute value.
+ * @param $decode
+ *   Whether to decode entities in the $string. Set to FALSE if the $string
+ *   is in plain text, TRUE otherwise. Defaults to TRUE.
+ * @return
+ *   Cleaned up and HTML-escaped version of $string.
+ */
+function filter_xss_bad_protocol($string, $decode = TRUE) {
+  static $allowed_protocols;
+
+  if (!isset($allowed_protocols)) {
+    $allowed_protocols = array_flip(variable_get('filter_allowed_protocols', array('ftp', 'http', 'https', 'irc', 'mailto', 'news', 'nntp', 'rtsp', 'sftp', 'ssh', 'telnet', 'webcal')));
+  }
+
+  // Get the plain text representation of the attribute value (i.e. its meaning).
+  if ($decode) {
+    $string = decode_entities($string);
+  }
+
+  // Iteratively remove any invalid protocol found.
+  do {
+    $before = $string;
+    $colonpos = strpos($string, ':');
+    if ($colonpos > 0) {
+      // We found a colon, possibly a protocol. Verify.
+      $protocol = substr($string, 0, $colonpos);
+      // If a colon is preceded by a slash, question mark or hash, it cannot
+      // possibly be part of the URL scheme. This must be a relative URL,
+      // which inherits the (safe) protocol of the base document.
+      if (preg_match('![/?#]!', $protocol)) {
+        break;
+      }
+      // Per RFC2616, section 3.2.3 (URI Comparison) scheme comparison must be case-insensitive
+      // Check if this is a disallowed protocol.
+      if (!isset($allowed_protocols[strtolower($protocol)])) {
+        $string = substr($string, $colonpos + 1);
+      }
+    }
+  } while ($before != $string);
+
+  return check_plain($string);
+}
+
+/**
+ * @} End of "defgroup sanitization".
+ */
+
 /**
  * @defgroup format Formatting
  * @{
diff --git a/modules/filter/filter.module b/modules/filter/filter.module
index 2cef2ad5ee4..d3a452b7c6b 100644
--- a/modules/filter/filter.module
+++ b/modules/filter/filter.module
@@ -911,288 +911,6 @@ function _filter_autop($text) {
   return $output;
 }
 
-/**
- * Very permissive XSS/HTML filter for admin-only use.
- *
- * Use only for fields where it is impractical to use the
- * whole filter system, but where some (mainly inline) mark-up
- * is desired (so check_plain() is not acceptable).
- *
- * Allows all tags that can be used inside an HTML body, save
- * for scripts and styles.
- */
-function filter_xss_admin($string) {
-  return filter_xss($string, array('a', 'abbr', 'acronym', 'address', 'b', 'bdo', 'big', 'blockquote', 'br', 'caption', 'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'div', 'dl', 'dt', 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'ol', 'p', 'pre', 'q', 'samp', 'small', 'span', 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'tt', 'ul', 'var'));
-}
-
-/**
- * Filters XSS. Based on kses by Ulf Harnhammar, see
- * http://sourceforge.net/projects/kses
- *
- * For examples of various XSS attacks, see:
- * http://ha.ckers.org/xss.html
- *
- * This code does four things:
- * - Removes characters and constructs that can trick browsers
- * - Makes sure all HTML entities are well-formed
- * - Makes sure all HTML tags and attributes are well-formed
- * - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g. javascript:)
- *
- * @param $string
- *   The string with raw HTML in it. It will be stripped of everything that can cause
- *   an XSS attack.
- * @param $allowed_tags
- *   An array of allowed tags.
- */
-function filter_xss($string, $allowed_tags = array('a', 'em', 'strong', 'cite', 'blockquote', 'code', 'ul', 'ol', 'li', 'dl', 'dt', 'dd')) {
-  // Only operate on valid UTF-8 strings. This is necessary to prevent cross
-  // site scripting issues on Internet Explorer 6.
-  if (!drupal_validate_utf8($string)) {
-    return '';
-  }
-  // Store the text format
-  _filter_xss_split($allowed_tags, TRUE);
-  // Remove NULL characters (ignored by some browsers)
-  $string = str_replace(chr(0), '', $string);
-  // Remove Netscape 4 JS entities
-  $string = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);
-
-  // Defuse all HTML entities
-  $string = str_replace('&', '&amp;', $string);
-  // Change back only well-formed entities in our whitelist
-  // Named entities
-  $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\1', $string);
-  // Decimal numeric entities
-  $string = preg_replace('/&amp;#([0-9]+;)/', '&#\1', $string);
-  // Hexadecimal numeric entities
-  $string = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $string);
-
-  return preg_replace_callback('%
-    (
-    <(?=[^a-zA-Z!/])  # a lone <
-    |                 # or
-    <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
-    |                 # or
-    >                 # just a >
-    )%x', '_filter_xss_split', $string);
-}
-
-/**
- * Processes an HTML tag.
- *
- * @param $m
- *   An array with various meaning depending on the value of $store.
- *   If $store is TRUE then the array contains the allowed tags.
- *   If $store is FALSE then the array has one element, the HTML tag to process.
- * @param $store
- *   Whether to store $m.
- * @return
- *   If the element isn't allowed, an empty string. Otherwise, the cleaned up
- *   version of the HTML element.
- */
-function _filter_xss_split($m, $store = FALSE) {
-  static $allowed_html;
-
-  if ($store) {
-    $allowed_html = array_flip($m);
-    return;
-  }
-
-  $string = $m[1];
-
-  if (substr($string, 0, 1) != '<') {
-    // We matched a lone ">" character
-    return '&gt;';
-  }
-  elseif (strlen($string) == 1) {
-    // We matched a lone "<" character
-    return '&lt;';
-  }
-
-  if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?$%', $string, $matches)) {
-    // Seriously malformed
-    return '';
-  }
-
-  $slash = trim($matches[1]);
-  $elem = &$matches[2];
-  $attrlist = &$matches[3];
-
-  if (!isset($allowed_html[strtolower($elem)])) {
-    // Disallowed HTML element
-    return '';
-  }
-
-  if ($slash != '') {
-    return "</$elem>";
-  }
-
-  // Is there a closing XHTML slash at the end of the attributes?
-  $attrlist = preg_replace('%(\s?)/\s*$%', '\1', $attrlist, -1, $count);
-  $xhtml_slash = $count ? ' /' : '';
-
-  // Clean up attributes
-  $attr2 = implode(' ', _filter_xss_attributes($attrlist));
-  $attr2 = preg_replace('/[<>]/', '', $attr2);
-  $attr2 = strlen($attr2) ? ' ' . $attr2 : '';
-
-  return "<$elem$attr2$xhtml_slash>";
-}
-
-/**
- * Processes a string of HTML attributes.
- *
- * @return
- *   Cleaned up version of the HTML attributes.
- */
-function _filter_xss_attributes($attr) {
-  $attrarr = array();
-  $mode = 0;
-  $attrname = '';
-
-  while (strlen($attr) != 0) {
-    // Was the last operation successful?
-    $working = 0;
-
-    switch ($mode) {
-      case 0:
-        // Attribute name, href for instance
-        if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {
-          $attrname = strtolower($match[1]);
-          $skip = ($attrname == 'style' || substr($attrname, 0, 2) == 'on');
-          $working = $mode = 1;
-          $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
-        }
-
-        break;
-
-      case 1:
-        // Equals sign or valueless ("selected")
-        if (preg_match('/^\s*=\s*/', $attr)) {
-          $working = 1; $mode = 2;
-          $attr = preg_replace('/^\s*=\s*/', '', $attr);
-          break;
-        }
-
-        if (preg_match('/^\s+/', $attr)) {
-          $working = 1; $mode = 0;
-          if (!$skip) {
-            $attrarr[] = $attrname;
-          }
-          $attr = preg_replace('/^\s+/', '', $attr);
-        }
-
-        break;
-
-      case 2:
-        // Attribute value, a URL after href= for instance
-        if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) {
-          $thisval = filter_xss_bad_protocol($match[1]);
-
-          if (!$skip) {
-            $attrarr[] = "$attrname=\"$thisval\"";
-          }
-          $working = 1;
-          $mode = 0;
-          $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
-          break;
-        }
-
-        if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) {
-          $thisval = filter_xss_bad_protocol($match[1]);
-
-          if (!$skip) {
-            $attrarr[] = "$attrname='$thisval'";
-          }
-          $working = 1; $mode = 0;
-          $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
-          break;
-        }
-
-        if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) {
-          $thisval = filter_xss_bad_protocol($match[1]);
-
-          if (!$skip) {
-            $attrarr[] = "$attrname=\"$thisval\"";
-          }
-          $working = 1; $mode = 0;
-          $attr = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr);
-        }
-
-        break;
-    }
-
-    if ($working == 0) {
-      // not well formed, remove and try again
-      $attr = preg_replace('/
-        ^
-        (
-        "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
-        |               # or
-        \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
-        |               # or
-        \S              # - a non-whitespace character
-        )*              # any number of the above three
-        \s*             # any number of whitespaces
-        /x', '', $attr);
-      $mode = 0;
-    }
-  }
-
-  // the attribute list ends with a valueless attribute like "selected"
-  if ($mode == 1) {
-    $attrarr[] = $attrname;
-  }
-  return $attrarr;
-}
-
-/**
- * Processes an HTML attribute value and ensures it does not contain an URL
- * with a disallowed protocol (e.g. javascript:)
- *
- * @param $string
- *   The string with the attribute value.
- * @param $decode
- *   Whether to decode entities in the $string. Set to FALSE if the $string
- *   is in plain text, TRUE otherwise. Defaults to TRUE.
- * @return
- *   Cleaned up and HTML-escaped version of $string.
- */
-function filter_xss_bad_protocol($string, $decode = TRUE) {
-  static $allowed_protocols;
-  if (!isset($allowed_protocols)) {
-    $allowed_protocols = array_flip(variable_get('filter_allowed_protocols', array('ftp', 'http', 'https', 'irc', 'mailto', 'news', 'nntp', 'rtsp', 'sftp', 'ssh', 'telnet', 'webcal')));
-  }
-
-  // Get the plain text representation of the attribute value (i.e. its meaning).
-  if ($decode) {
-    $string = decode_entities($string);
-  }
-
-  // Iteratively remove any invalid protocol found.
-
-  do {
-    $before = $string;
-    $colonpos = strpos($string, ':');
-    if ($colonpos > 0) {
-      // We found a colon, possibly a protocol. Verify.
-      $protocol = substr($string, 0, $colonpos);
-      // If a colon is preceded by a slash, question mark or hash, it cannot
-      // possibly be part of the URL scheme. This must be a relative URL,
-      // which inherits the (safe) protocol of the base document.
-      if (preg_match('![/?#]!', $protocol)) {
-        break;
-      }
-      // Per RFC2616, section 3.2.3 (URI Comparison) scheme comparison must be case-insensitive
-      // Check if this is a disallowed protocol.
-      if (!isset($allowed_protocols[strtolower($protocol)])) {
-        $string = substr($string, $colonpos + 1);
-      }
-    }
-  } while ($before != $string);
-  return check_plain($string);
-}
-
 /**
  * @} End of "Standard filters".
  */