diff --git a/core/modules/filter/filter.module b/core/modules/filter/filter.module
index cf0eaef299f..bb9b748d1dd 100644
--- a/core/modules/filter/filter.module
+++ b/core/modules/filter/filter.module
@@ -1086,34 +1086,46 @@ function _filter_url($text, $filter) {
$protocols = config('system.filter')->get('protocols');
$protocols = implode(':(?://)?|', $protocols) . ':(?://)?';
+ $valid_url_path_characters = "[\p{L}\p{M}\p{N}!\*\';:=\+,\.\$\/%#\[\]\-_~@&]";
+
+ // Allow URL paths to contain balanced parens
+ // 1. Used in Wikipedia URLs like /Primer_(film)
+ // 2. Used in IIS sessions like /S(dfd346)/
+ $valid_url_balanced_parens = '\('. $valid_url_path_characters . '+\)';
+
+ // Valid end-of-path characters (so /foo. does not gobble the period).
+ // 1. Allow = for empty URL parameters and other URL-join artifacts
+ $valid_url_ending_characters = '[\p{L}\p{M}\p{N}:_+~#=/]|(?:' . $valid_url_balanced_parens . ')';
+
+ $valid_url_query_chars = '[a-z0-9!?\*\'@\(\);:&=\+\$\/%#\[\]\-_\.,~|]';
+ $valid_url_query_ending_chars = '[a-z0-9_&=#\/]';
+
+ // Full path.
+ // Also allow @ in a URL, but only in the middle, to catch things like http://example.com/@user/
+ $valid_url_path = '(?:(?:'.$valid_url_path_characters . '*(?:'.$valid_url_balanced_parens .$valid_url_path_characters . '*)*'. $valid_url_ending_characters . ')|(?:@' . $valid_url_path_characters . '+\/))';
+
// Prepare domain name pattern.
// The ICANN seems to be on track towards accepting more diverse top level
// domains, so this pattern has been "future-proofed" to allow for TLDs
// of length 2-64.
- $domain = '(?:[A-Za-z0-9._+-]+\.)?[A-Za-z]{2,64}\b';
+ $domain = '(?:[\p{L}\p{M}\p{N}._+-]+\.)?[\p{L}\p{M}]{2,64}\b';
$ip = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}';
- $auth = '[a-zA-Z0-9:%_+*~#?&=.,/;-]+@';
- $trail = '[a-zA-Z0-9:%_+*~#&\[\]=/;?!\.,-]*[a-zA-Z0-9:%_+*~#&\[\]=/;-]';
-
- // Prepare pattern for optional trailing punctuation.
- // Even these characters could have a valid meaning for the URL, such usage is
- // rare compared to using a URL at the end of or within a sentence, so these
- // trailing characters are optionally excluded.
- $punctuation = '[\.,?!]*?';
+ $auth = '[\p{L}\p{M}\p{N}:%_+*~#?&=.,/;-]+@';
+ $trail = '('.$valid_url_path.'*)?(\\?'.$valid_url_query_chars .'*'.$valid_url_query_ending_chars.')?';
// Match absolute URLs.
$url_pattern = "(?:$auth)?(?:$domain|$ip)/?(?:$trail)?";
- $pattern = "`((?:$protocols)(?:$url_pattern))($punctuation)`";
+ $pattern = "`((?:$protocols)(?:$url_pattern))`u";
$tasks['_filter_url_parse_full_links'] = $pattern;
// Match e-mail addresses.
- $url_pattern = "[A-Za-z0-9._-]{1,254}@(?:$domain)";
- $pattern = "`($url_pattern)`";
+ $url_pattern = "[\p{L}\p{M}\p{N}._-]{1,254}@(?:$domain)";
+ $pattern = "`($url_pattern)`u";
$tasks['_filter_url_parse_email_links'] = $pattern;
// Match www domains.
$url_pattern = "www\.(?:$domain)/?(?:$trail)?";
- $pattern = "`($url_pattern)($punctuation)`";
+ $pattern = "`($url_pattern)`u";
$tasks['_filter_url_parse_partial_links'] = $pattern;
// Each type of URL needs to be processed separately. The text is joined and
@@ -1188,7 +1200,7 @@ function _filter_url_parse_full_links($match) {
$match[$i] = decode_entities($match[$i]);
$caption = check_plain(_filter_url_trim($match[$i]));
$match[$i] = check_plain($match[$i]);
- return '' . $caption . '' . $match[$i + 1];
+ return '' . $caption . '';
}
/**
@@ -1218,7 +1230,7 @@ function _filter_url_parse_partial_links($match) {
$match[$i] = decode_entities($match[$i]);
$caption = check_plain(_filter_url_trim($match[$i]));
$match[$i] = check_plain($match[$i]);
- return '' . $caption . '' . $match[$i + 1];
+ return '' . $caption . '';
}
/**
diff --git a/core/modules/filter/lib/Drupal/filter/Tests/FilterUnitTest.php b/core/modules/filter/lib/Drupal/filter/Tests/FilterUnitTest.php
index de9f194a9ea..bd6ae23d8c6 100644
--- a/core/modules/filter/lib/Drupal/filter/Tests/FilterUnitTest.php
+++ b/core/modules/filter/lib/Drupal/filter/Tests/FilterUnitTest.php
@@ -459,6 +459,7 @@ person@example.com or mailto:person2@example.com or ' . $long_email . ' but not
http://trailingslash.com/ or www.trailingslash.com/
http://host.com/some/path?query=foo&bar[baz]=beer#fragment or www.host.com/some/path?query=foo&bar[baz]=beer#fragment
http://twitter.com/#!/example/status/22376963142324226
+http://example.com/@user/
ftp://user:pass@ftp.example.com/~home/dir1
sftp://user@nonstandardport:222/dir
ssh://192.168.0.100/srv/git/drupal.git
@@ -468,10 +469,29 @@ ssh://192.168.0.100/srv/git/drupal.git
'http://host.com/some/path?query=foo&bar[baz]=beer#fragment' => TRUE,
'www.host.com/some/path?query=foo&bar[baz]=beer#fragment' => TRUE,
'http://twitter.com/#!/example/status/22376963142324226' => TRUE,
+ 'http://example.com/@user/' => TRUE,
'ftp://user:pass@ftp.example.com/~home/dir1' => TRUE,
'sftp://user@nonstandardport:222/dir' => TRUE,
'ssh://192.168.0.100/srv/git/drupal.git' => TRUE,
),
+ // International Unicode characters.
+ '
+http://пример.испытание/
+http://مثال.إختبار/
+http://例子.測試/
+http://12345.中国/
+http://例え.テスト/
+http://dréißig-bücher.de/
+http://méxico-mañana.es/
+' => array(
+ 'http://пример.испытание/' => TRUE,
+ 'http://مثال.إختبار/' => TRUE,
+ 'http://例子.測試/' => TRUE,
+ 'http://12345.中国/' => TRUE,
+ 'http://例え.テスト/' => TRUE,
+ 'http://dréißig-bücher.de/' => TRUE,
+ 'http://méxico-mañana.es/' => TRUE,
+ ),
// Encoding.
'
http://ampersand.com/?a=1&b=2
@@ -530,6 +550,10 @@ Query string with trailing exclamation www.query.com/index.php?a=!
Partial URL with 3 trailing www.partial.periods...
E-mail with 3 trailing exclamations@example.com!!!
Absolute URL and query string with 2 different punctuation characters (http://www.example.com/q=abc).
+Partial URL with brackets in the URL as well as surrounded brackets (www.foo.com/more_(than)_one_(parens)).
+Absolute URL with square brackets in the URL as well as surrounded brackets [http://www.drupal.org/?class[]=1]
+Absolute URL with quotes "http://www.drupal.org/sample"
+
' => array(
'period www.partial.com.' => TRUE,
'comma person@example.com,' => TRUE,
@@ -538,6 +562,9 @@ Absolute URL and query string with 2 different punctuation characters (http://ww
'trailing www.partial.periods...' => TRUE,
'trailing exclamations@example.com!!!' => TRUE,
'characters (http://www.example.com/q=abc).' => TRUE,
+ 'brackets (www.foo.com/more_(than)_one_(parens)).' => TRUE,
+ 'brackets [http://www.drupal.org/?class[]=1]' => TRUE,
+ 'quotes "http://www.drupal.org/sample"' => TRUE,
),
'
(www.parenthesis.com/dir?a=1&b=2#a)