diff --git a/core/modules/filter/filter.module b/core/modules/filter/filter.module index cf0eaef299f..bb9b748d1dd 100644 --- a/core/modules/filter/filter.module +++ b/core/modules/filter/filter.module @@ -1086,34 +1086,46 @@ function _filter_url($text, $filter) { $protocols = config('system.filter')->get('protocols'); $protocols = implode(':(?://)?|', $protocols) . ':(?://)?'; + $valid_url_path_characters = "[\p{L}\p{M}\p{N}!\*\';:=\+,\.\$\/%#\[\]\-_~@&]"; + + // Allow URL paths to contain balanced parens + // 1. Used in Wikipedia URLs like /Primer_(film) + // 2. Used in IIS sessions like /S(dfd346)/ + $valid_url_balanced_parens = '\('. $valid_url_path_characters . '+\)'; + + // Valid end-of-path characters (so /foo. does not gobble the period). + // 1. Allow =&# for empty URL parameters and other URL-join artifacts + $valid_url_ending_characters = '[\p{L}\p{M}\p{N}:_+~#=/]|(?:' . $valid_url_balanced_parens . ')'; + + $valid_url_query_chars = '[a-z0-9!?\*\'@\(\);:&=\+\$\/%#\[\]\-_\.,~|]'; + $valid_url_query_ending_chars = '[a-z0-9_&=#\/]'; + + //full path + //and allow @ in a url, but only in the middle. Catch things like http://example.com/@user/ + $valid_url_path = '(?:(?:'.$valid_url_path_characters . '*(?:'.$valid_url_balanced_parens .$valid_url_path_characters . '*)*'. $valid_url_ending_characters . ')|(?:@' . $valid_url_path_characters . '+\/))'; + // Prepare domain name pattern. // The ICANN seems to be on track towards accepting more diverse top level // domains, so this pattern has been "future-proofed" to allow for TLDs // of length 2-64. - $domain = '(?:[A-Za-z0-9._+-]+\.)?[A-Za-z]{2,64}\b'; + $domain = '(?:[\p{L}\p{M}\p{N}._+-]+\.)?[\p{L}\p{M}]{2,64}\b'; $ip = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}'; - $auth = '[a-zA-Z0-9:%_+*~#?&=.,/;-]+@'; - $trail = '[a-zA-Z0-9:%_+*~#&\[\]=/;?!\.,-]*[a-zA-Z0-9:%_+*~#&\[\]=/;-]'; - - // Prepare pattern for optional trailing punctuation. 
- // Even these characters could have a valid meaning for the URL, such usage is - // rare compared to using a URL at the end of or within a sentence, so these - // trailing characters are optionally excluded. - $punctuation = '[\.,?!]*?'; + $auth = '[\p{L}\p{M}\p{N}:%_+*~#?&=.,/;-]+@'; + $trail = '('.$valid_url_path.'*)?(\\?'.$valid_url_query_chars .'*'.$valid_url_query_ending_chars.')?'; // Match absolute URLs. $url_pattern = "(?:$auth)?(?:$domain|$ip)/?(?:$trail)?"; - $pattern = "`((?:$protocols)(?:$url_pattern))($punctuation)`"; + $pattern = "`((?:$protocols)(?:$url_pattern))`u"; $tasks['_filter_url_parse_full_links'] = $pattern; // Match e-mail addresses. - $url_pattern = "[A-Za-z0-9._-]{1,254}@(?:$domain)"; - $pattern = "`($url_pattern)`"; + $url_pattern = "[\p{L}\p{M}\p{N}._-]{1,254}@(?:$domain)"; + $pattern = "`($url_pattern)`u"; $tasks['_filter_url_parse_email_links'] = $pattern; // Match www domains. $url_pattern = "www\.(?:$domain)/?(?:$trail)?"; - $pattern = "`($url_pattern)($punctuation)`"; + $pattern = "`($url_pattern)`u"; $tasks['_filter_url_parse_partial_links'] = $pattern; // Each type of URL needs to be processed separately. The text is joined and @@ -1188,7 +1200,7 @@ function _filter_url_parse_full_links($match) { $match[$i] = decode_entities($match[$i]); $caption = check_plain(_filter_url_trim($match[$i])); $match[$i] = check_plain($match[$i]); - return '' . $caption . '' . $match[$i + 1]; + return '' . $caption . ''; } /** @@ -1218,7 +1230,7 @@ function _filter_url_parse_partial_links($match) { $match[$i] = decode_entities($match[$i]); $caption = check_plain(_filter_url_trim($match[$i])); $match[$i] = check_plain($match[$i]); - return '' . $caption . '' . $match[$i + 1]; + return '' . $caption . 
''; } /** diff --git a/core/modules/filter/lib/Drupal/filter/Tests/FilterUnitTest.php b/core/modules/filter/lib/Drupal/filter/Tests/FilterUnitTest.php index de9f194a9ea..bd6ae23d8c6 100644 --- a/core/modules/filter/lib/Drupal/filter/Tests/FilterUnitTest.php +++ b/core/modules/filter/lib/Drupal/filter/Tests/FilterUnitTest.php @@ -459,6 +459,7 @@ person@example.com or mailto:person2@example.com or ' . $long_email . ' but not http://trailingslash.com/ or www.trailingslash.com/ http://host.com/some/path?query=foo&bar[baz]=beer#fragment or www.host.com/some/path?query=foo&bar[baz]=beer#fragment http://twitter.com/#!/example/status/22376963142324226 +http://example.com/@user/ ftp://user:pass@ftp.example.com/~home/dir1 sftp://user@nonstandardport:222/dir ssh://192.168.0.100/srv/git/drupal.git @@ -468,10 +469,29 @@ ssh://192.168.0.100/srv/git/drupal.git 'http://host.com/some/path?query=foo&bar[baz]=beer#fragment' => TRUE, 'www.host.com/some/path?query=foo&bar[baz]=beer#fragment' => TRUE, 'http://twitter.com/#!/example/status/22376963142324226' => TRUE, + 'http://example.com/@user/' => TRUE, 'ftp://user:pass@ftp.example.com/~home/dir1' => TRUE, 'sftp://user@nonstandardport:222/dir' => TRUE, 'ssh://192.168.0.100/srv/git/drupal.git' => TRUE, ), + // International Unicode characters. + ' +http://пример.испытание/ +http://مثال.إختبار/ +http://例子.測試/ +http://12345.中国/ +http://例え.テスト/ +http://dréißig-bücher.de/ +http://méxico-mañana.es/ +' => array( + 'http://пример.испытание/' => TRUE, + 'http://مثال.إختبار/' => TRUE, + 'http://例子.測試/' => TRUE, + 'http://12345.中国/' => TRUE, + 'http://例え.テスト/' => TRUE, + 'http://dréißig-bücher.de/' => TRUE, + 'http://méxico-mañana.es/' => TRUE, + ), // Encoding. ' http://ampersand.com/?a=1&b=2 @@ -530,6 +550,10 @@ Query string with trailing exclamation www.query.com/index.php?a=! Partial URL with 3 trailing www.partial.periods... E-mail with 3 trailing exclamations@example.com!!! 
Absolute URL and query string with 2 different punctuation characters (http://www.example.com/q=abc). +Partial URL with brackets in the URL as well as surrounded brackets (www.foo.com/more_(than)_one_(parens)). +Absolute URL with square brackets in the URL as well as surrounded brackets [http://www.drupal.org/?class[]=1] +Absolute URL with quotes "http://www.drupal.org/sample" + ' => array( 'period www.partial.com.' => TRUE, 'comma person@example.com,' => TRUE, @@ -538,6 +562,9 @@ Absolute URL and query string with 2 different punctuation characters (http://ww 'trailing www.partial.periods...' => TRUE, 'trailing exclamations@example.com!!!' => TRUE, 'characters (http://www.example.com/q=abc).' => TRUE, + 'brackets (www.foo.com/more_(than)_one_(parens)).' => TRUE, + 'brackets [http://www.drupal.org/?class[]=1]' => TRUE, + 'quotes "http://www.drupal.org/sample"' => TRUE, ), ' (www.parenthesis.com/dir?a=1&b=2#a)