Issue #1657886 by Hanno: Filter 'Convert URLs into links' doesn't support multilingual web addresses.

8.0.x
Alex Pott 2013-06-14 15:23:25 +02:00
parent 010ffd4508
commit 41edb242f8
2 changed files with 54 additions and 15 deletions

View File

@ -1086,34 +1086,46 @@ function _filter_url($text, $filter) {
$protocols = config('system.filter')->get('protocols');
$protocols = implode(':(?://)?|', $protocols) . ':(?://)?';
$valid_url_path_characters = "[\p{L}\p{M}\p{N}!\*\';:=\+,\.\$\/%#\[\]\-_~@&]";
// Allow URL paths to contain balanced parens
// 1. Used in Wikipedia URLs like /Primer_(film)
// 2. Used in IIS sessions like /S(dfd346)/
$valid_url_balanced_parens = '\('. $valid_url_path_characters . '+\)';
// Valid end-of-path characters (so /foo. does not gobble the period).
// 1. Allow =&# for empty URL parameters and other URL-join artifacts
$valid_url_ending_characters = '[\p{L}\p{M}\p{N}:_+~#=/]|(?:' . $valid_url_balanced_parens . ')';
$valid_url_query_chars = '[a-z0-9!?\*\'@\(\);:&=\+\$\/%#\[\]\-_\.,~|]';
$valid_url_query_ending_chars = '[a-z0-9_&=#\/]';
// Full path.
// Also allow @ in a URL, but only in the middle, to catch things like http://example.com/@user/
$valid_url_path = '(?:(?:'.$valid_url_path_characters . '*(?:'.$valid_url_balanced_parens .$valid_url_path_characters . '*)*'. $valid_url_ending_characters . ')|(?:@' . $valid_url_path_characters . '+\/))';
// Prepare domain name pattern.
// The ICANN seems to be on track towards accepting more diverse top level
// domains, so this pattern has been "future-proofed" to allow for TLDs
// of length 2-64.
$domain = '(?:[A-Za-z0-9._+-]+\.)?[A-Za-z]{2,64}\b';
$domain = '(?:[\p{L}\p{M}\p{N}._+-]+\.)?[\p{L}\p{M}]{2,64}\b';
$ip = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}';
$auth = '[a-zA-Z0-9:%_+*~#?&=.,/;-]+@';
$trail = '[a-zA-Z0-9:%_+*~#&\[\]=/;?!\.,-]*[a-zA-Z0-9:%_+*~#&\[\]=/;-]';
// Prepare pattern for optional trailing punctuation.
// Even though these characters could have a valid meaning for the URL, such
// usage is rare compared to using a URL at the end of or within a sentence, so
// these trailing characters are optionally excluded.
$punctuation = '[\.,?!]*?';
$auth = '[\p{L}\p{M}\p{N}:%_+*~#?&=.,/;-]+@';
$trail = '('.$valid_url_path.'*)?(\\?'.$valid_url_query_chars .'*'.$valid_url_query_ending_chars.')?';
// Match absolute URLs.
$url_pattern = "(?:$auth)?(?:$domain|$ip)/?(?:$trail)?";
$pattern = "`((?:$protocols)(?:$url_pattern))($punctuation)`";
$pattern = "`((?:$protocols)(?:$url_pattern))`u";
$tasks['_filter_url_parse_full_links'] = $pattern;
// Match e-mail addresses.
$url_pattern = "[A-Za-z0-9._-]{1,254}@(?:$domain)";
$pattern = "`($url_pattern)`";
$url_pattern = "[\p{L}\p{M}\p{N}._-]{1,254}@(?:$domain)";
$pattern = "`($url_pattern)`u";
$tasks['_filter_url_parse_email_links'] = $pattern;
// Match www domains.
$url_pattern = "www\.(?:$domain)/?(?:$trail)?";
$pattern = "`($url_pattern)($punctuation)`";
$pattern = "`($url_pattern)`u";
$tasks['_filter_url_parse_partial_links'] = $pattern;
// Each type of URL needs to be processed separately. The text is joined and
@ -1188,7 +1200,7 @@ function _filter_url_parse_full_links($match) {
$match[$i] = decode_entities($match[$i]);
$caption = check_plain(_filter_url_trim($match[$i]));
$match[$i] = check_plain($match[$i]);
return '<a href="' . $match[$i] . '">' . $caption . '</a>' . $match[$i + 1];
return '<a href="' . $match[$i] . '">' . $caption . '</a>';
}
/**
@ -1218,7 +1230,7 @@ function _filter_url_parse_partial_links($match) {
$match[$i] = decode_entities($match[$i]);
$caption = check_plain(_filter_url_trim($match[$i]));
$match[$i] = check_plain($match[$i]);
return '<a href="http://' . $match[$i] . '">' . $caption . '</a>' . $match[$i + 1];
return '<a href="http://' . $match[$i] . '">' . $caption . '</a>';
}
/**

View File

@ -459,6 +459,7 @@ person@example.com or mailto:person2@example.com or ' . $long_email . ' but not
http://trailingslash.com/ or www.trailingslash.com/
http://host.com/some/path?query=foo&bar[baz]=beer#fragment or www.host.com/some/path?query=foo&bar[baz]=beer#fragment
http://twitter.com/#!/example/status/22376963142324226
http://example.com/@user/
ftp://user:pass@ftp.example.com/~home/dir1
sftp://user@nonstandardport:222/dir
ssh://192.168.0.100/srv/git/drupal.git
@ -468,10 +469,29 @@ ssh://192.168.0.100/srv/git/drupal.git
'<a href="http://host.com/some/path?query=foo&amp;bar[baz]=beer#fragment">http://host.com/some/path?query=foo&amp;bar[baz]=beer#fragment</a>' => TRUE,
'<a href="http://www.host.com/some/path?query=foo&amp;bar[baz]=beer#fragment">www.host.com/some/path?query=foo&amp;bar[baz]=beer#fragment</a>' => TRUE,
'<a href="http://twitter.com/#!/example/status/22376963142324226">http://twitter.com/#!/example/status/22376963142324226</a>' => TRUE,
'<a href="http://example.com/@user/">http://example.com/@user/</a>' => TRUE,
'<a href="ftp://user:pass@ftp.example.com/~home/dir1">ftp://user:pass@ftp.example.com/~home/dir1</a>' => TRUE,
'<a href="sftp://user@nonstandardport:222/dir">sftp://user@nonstandardport:222/dir</a>' => TRUE,
'<a href="ssh://192.168.0.100/srv/git/drupal.git">ssh://192.168.0.100/srv/git/drupal.git</a>' => TRUE,
),
// International Unicode characters.
'
http://пример.испытание/
http://مثال.إختبار/
http://例子.測試/
http://12345.中国/
http://例え.テスト/
http://dréißig-bücher.de/
http://méxico-mañana.es/
' => array(
'<a href="http://пример.испытание/">http://пример.испытание/</a>' => TRUE,
'<a href="http://مثال.إختبار/">http://مثال.إختبار/</a>' => TRUE,
'<a href="http://例子.測試/">http://例子.測試/</a>' => TRUE,
'<a href="http://12345.中国/">http://12345.中国/</a>' => TRUE,
'<a href="http://例え.テスト/">http://例え.テスト/</a>' => TRUE,
'<a href="http://dréißig-bücher.de/">http://dréißig-bücher.de/</a>' => TRUE,
'<a href="http://méxico-mañana.es/">http://méxico-mañana.es/</a>' => TRUE,
),
// Encoding.
'
http://ampersand.com/?a=1&b=2
@ -530,6 +550,10 @@ Query string with trailing exclamation www.query.com/index.php?a=!
Partial URL with 3 trailing www.partial.periods...
E-mail with 3 trailing exclamations@example.com!!!
Absolute URL and query string with 2 different punctuation characters (http://www.example.com/q=abc).
Partial URL with brackets in the URL as well as surrounded brackets (www.foo.com/more_(than)_one_(parens)).
Absolute URL with square brackets in the URL as well as surrounded brackets [http://www.drupal.org/?class[]=1]
Absolute URL with quotes "http://www.drupal.org/sample"
' => array(
'period <a href="http://www.partial.com">www.partial.com</a>.' => TRUE,
'comma <a href="mailto:person@example.com">person@example.com</a>,' => TRUE,
@ -538,6 +562,9 @@ Absolute URL and query string with 2 different punctuation characters (http://ww
'trailing <a href="http://www.partial.periods">www.partial.periods</a>...' => TRUE,
'trailing <a href="mailto:exclamations@example.com">exclamations@example.com</a>!!!' => TRUE,
'characters (<a href="http://www.example.com/q=abc">http://www.example.com/q=abc</a>).' => TRUE,
'brackets (<a href="http://www.foo.com/more_(than)_one_(parens)">www.foo.com/more_(than)_one_(parens)</a>).' => TRUE,
'brackets [<a href="http://www.drupal.org/?class[]=1">http://www.drupal.org/?class[]=1</a>]' => TRUE,
'quotes "<a href="http://www.drupal.org/sample">http://www.drupal.org/sample</a>"' => TRUE,
),
'
(www.parenthesis.com/dir?a=1&b=2#a)