From a8c62e707af0971a62b7601f4ac6ea46f57b16c2 Mon Sep 17 00:00:00 2001 From: rxu Date: Tue, 25 Nov 2014 22:16:30 +0700 Subject: [ticket/12926] Support for IDN (IRI) Add international domain name support for URLs. PHPBB3-12926 --- phpBB/develop/regex_idn.php | 143 ++++++++++++++++++++++++++++ phpBB/includes/functions.php | 8 +- phpBB/includes/message_parser.php | 2 +- phpBB/phpbb/profilefields/type/type_url.php | 2 +- 4 files changed, 149 insertions(+), 6 deletions(-) create mode 100644 phpBB/develop/regex_idn.php (limited to 'phpBB') diff --git a/phpBB/develop/regex_idn.php b/phpBB/develop/regex_idn.php new file mode 100644 index 0000000000..4124829a11 --- /dev/null +++ b/phpBB/develop/regex_idn.php @@ -0,0 +1,143 @@ +
\n\nIPv6: " . $ipv6 . "

\n\n"; + +// URL regular expressions + +/* IDN2008 characters derivation +** http://unicode.org/faq/idn.html#33 - IDN FAQ: derivation of valid characters in terms of Unicode properties +** http://unicode.org/reports/tr46/ - Unicode Technical Standard #46. Unicode IDNA Compatibility Processing +** http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt - Unicode Character Database +*/ +/* +** Remove Control Characters and Whitespace (as in IDNA2003) +*/ +$no_cc = '\p{C}\p{Z}'; +/* +** Remove Symbols, Punctuation, non-decimal Numbers, and Enclosing Marks +*/ +$no_symbol = '\p{S}\p{P}\p{Nl}\p{No}\p{Me}'; +/* +** Remove characters used for archaic Hangul (Korean) - \p{HST=L} and \p{HST=V} +** as per http://unicode.org/Public/UNIDATA/HangulSyllableType.txt +*/ +$no_hangul = '\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}'; +/* +** Remove three blocks of technical or archaic symbols. +*/ +$no_cdm = '\x{20D0}-\x{20FF}'; // \p{block=Combining_Diacritical_Marks_For_Symbols} +$no_musical = '\x{1D100}-\x{1D1FF}'; // \p{block=Musical_Symbols} +$no_ancient_greek_musical = '\x{1D200}-\x{1D24F}'; // \p{block=Ancient_Greek_Musical_Notation} +/* Remove certain exceptions: +** U+0640 ARABIC TATWEEL +** U+07FA NKO LAJANYALAN +** U+302E HANGUL SINGLE DOT TONE MARK +** U+302F HANGUL DOUBLE DOT TONE MARK +** U+3031 VERTICAL KANA REPEAT MARK +** U+3032 VERTICAL KANA REPEAT WITH VOICED SOUND MARK +** .. +** U+3035 VERTICAL KANA REPEAT MARK LOWER HALF +** U+303B VERTICAL IDEOGRAPHIC ITERATION MARK +*/ +$no_certain_exceptions = '\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}'; +/* Add certain exceptions: +** U+00B7 MIDDLE DOT +** U+0375 GREEK LOWER NUMERAL SIGN +** U+05F3 HEBREW PUNCTUATION GERESH +** U+05F4 HEBREW PUNCTUATION GERSHAYIM +** U+30FB KATAKANA MIDDLE DOT +** U+002D HYPHEN-MINUS +** U+06FD ARABIC SIGN SINDHI AMPERSAND +** U+06FE ARABIC SIGN SINDHI POSTPOSITION MEN +** U+0F0B TIBETAN MARK INTERSYLLABIC TSHEG +** U+3007 IDEOGRAPHIC NUMBER ZERO +*/ +$add_certain_exceptions = '\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}'; +/* Add special exceptions (Deviations): +** U+00DF LATIN SMALL LETTER SHARP S +** U+03C2 GREEK SMALL LETTER FINAL SIGMA +** U+200C ZERO WIDTH NON-JOINER +** U+200D ZERO WIDTH JOINER +*/ +$add_deviations = '\x{00DF}\x{03C2}\x{200C}\x{200D}'; + +// Concatenate remove/add regexes respectively +$remove_chars = "$no_cc$no_symbol$no_hangul$no_cdm$no_musical$no_ancient_greek_musical$no_certain_exceptions"; +$add_chars = "$add_certain_exceptions$add_deviations"; + +$pct_encoded = "%[\dA-F]{2}"; +$unreserved = "$add_chars\pL0-9\-._~"; +$sub_delims = '!$&\'()*+,;='; +$pchar = "(?:[^$remove_chars]*[$unreserved$sub_delims:@|]+|$pct_encoded)"; // rfc: no "|" + +$scheme = '[a-z][a-z\d+\-.]*'; +$reg_name = "(?:[^$remove_chars]*[$unreserved$sub_delims:@|]+|$pct_encoded)+"; // rfc: * instead of + and no "|" and no "@" and no ":" (included instead of userinfo) +//$userinfo = "(?:(?:[$unreserved$sub_delims:]+|$pct_encoded))*"; +$ipv4_simple = '[0-9.]+'; +$ipv6_simple = '\[[a-z0-9.]+:[a-z0-9.]+:[a-z0-9.:]+\]'; +$host = "(?:$reg_name|$ipv4_simple|$ipv6_simple)"; +$port = '\d*'; +//$authority = "(?:$userinfo@)?$host(?::$port)?"; +$authority = "$host(?::$port)?"; +$segment = "$pchar*"; +$path_abempty = "(?:/$segment)*"; +$hier_part = "/{2}$authority$path_abempty"; +$query = "(?:[^$remove_chars]*[$unreserved$sub_delims:@/?|]+|$pct_encoded)*"; // pchar | "/" | "?", rfc: no "|" +$fragment = $query; + +$url = "$scheme:$hier_part(?:\?$query)?(?:\#$fragment)?"; +echo 'URL: ' . $url . "

\n\n"; + +// no scheme, shortened authority, but host has to start with www. +$www_url = "www\.$reg_name(?::$port)?$path_abempty(?:\?$query)?(?:\#$fragment)?"; +echo 'www.URL: ' . $www_url . "

\n\n"; + +// no schema and no authority +$relative_url = "$segment$path_abempty(?:\?$query)?(?:\#$fragment)?"; +echo 'relative URL: ' . $relative_url . "

\n\n"; diff --git a/phpBB/includes/functions.php b/phpBB/includes/functions.php index 1a3560dbb1..d62e2111c3 100644 --- a/phpBB/includes/functions.php +++ b/phpBB/includes/functions.php @@ -3330,20 +3330,20 @@ function get_preg_expression($mode) case 'url_inline': $inline = ($mode == 'url') ? ')' : ''; $scheme = ($mode == 'url') ? '[a-z\d+\-.]' : '[a-z\d+]'; // avoid automatic parsing of "word" in "last word.http://..." - // generated with regex generation file in the develop folder - return "[a-z]$scheme*:/{2}(?:(?:[a-z0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})+|[0-9.]+|\[[a-z0-9.]+:[a-z0-9.]+:[a-z0-9.:]+\])(?::\d*)?(?:/(?:[a-z0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})*)*(?:\?(?:[a-z0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?(?:\#(?:[a-z0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?"; + // generated with regex_idn.php file in the develop folder + return "[a-z]$scheme*:/{2}(?:(?:[^\p{C}\p{Z}\p{S}\p{P}\p{Nl}\p{No}\p{Me}\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}\x{20D0}-\x{20FF}\x{1D100}-\x{1D1FF}\x{1D200}-\x{1D24F}\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]*[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}\x{00DF}\x{03C2}\x{200C}\x{200D}\pL0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})+|[0-9.]+|\[[a-z0-9.]+:[a-z0-9.]+:[a-z0-9.:]+\])(?::\d*)?(?:/(?:[^\p{C}\p{Z}\p{S}\p{P}\p{Nl}\p{No}\p{Me}\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}\x{20D0}-\x{20FF}\x{1D100}-\x{1D1FF}\x{1D200}-\x{1D24F}\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]*[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}\x{00DF}\x{03C2}\x{200C}\x{200D}\pL0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})*)*(?:\?(?:[^\p{C}\p{Z}\p{S}\p{P}\p{Nl}\p{No}\p{Me}\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}\x{20D0}-\x{20FF}\x{1D100}-\x{1D1FF}\x{1D200}-\x{1D24F}\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]*[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}\x{00DF}\x{03C2}\x{200C}\x{200D}\pL0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?(?:\#(?:[^\p{C}\p{Z}\p{S}\p{P}\p{Nl}\p{No}\p{Me}\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}\x{20D0}-\x{20FF}\x{1D100}-\x{1D1FF}\x{1D200}-\x{1D24F}\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]*[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}\x{00DF}\x{03C2}\x{200C}\x{200D}\pL0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?"; break; case 'www_url': case 'www_url_inline': $inline = ($mode == 'www_url') ? ')' : ''; - return "www\.(?:[a-z0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})+(?::\d*)?(?:/(?:[a-z0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})*)*(?:\?(?:[a-z0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?(?:\#(?:[a-z0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?"; + return "www\.(?:[^\p{C}\p{Z}\p{S}\p{P}\p{Nl}\p{No}\p{Me}\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}\x{20D0}-\x{20FF}\x{1D100}-\x{1D1FF}\x{1D200}-\x{1D24F}\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]*[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}\x{00DF}\x{03C2}\x{200C}\x{200D}\pL0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})+(?::\d*)?(?:/(?:[^\p{C}\p{Z}\p{S}\p{P}\p{Nl}\p{No}\p{Me}\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}\x{20D0}-\x{20FF}\x{1D100}-\x{1D1FF}\x{1D200}-\x{1D24F}\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]*[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}\x{00DF}\x{03C2}\x{200C}\x{200D}\pL0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})*)*(?:\?(?:[^\p{C}\p{Z}\p{S}\p{P}\p{Nl}\p{No}\p{Me}\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}\x{20D0}-\x{20FF}\x{1D100}-\x{1D1FF}\x{1D200}-\x{1D24F}\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]*[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}\x{00DF}\x{03C2}\x{200C}\x{200D}\pL0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?(?:\#(?:[^\p{C}\p{Z}\p{S}\p{P}\p{Nl}\p{No}\p{Me}\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}\x{20D0}-\x{20FF}\x{1D100}-\x{1D1FF}\x{1D200}-\x{1D24F}\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]*[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}\x{00DF}\x{03C2}\x{200C}\x{200D}\pL0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?"; break; case 'relative_url': case 'relative_url_inline': $inline = ($mode == 'relative_url') ? ')' : ''; - return "(?:[a-z0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})*(?:/(?:[a-z0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})*)*(?:\?(?:[a-z0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?(?:\#(?:[a-z0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?"; + return "(?:[^\p{C}\p{Z}\p{S}\p{P}\p{Nl}\p{No}\p{Me}\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}\x{20D0}-\x{20FF}\x{1D100}-\x{1D1FF}\x{1D200}-\x{1D24F}\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]*[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}\x{00DF}\x{03C2}\x{200C}\x{200D}\pL0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})*(?:/(?:[^\p{C}\p{Z}\p{S}\p{P}\p{Nl}\p{No}\p{Me}\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}\x{20D0}-\x{20FF}\x{1D100}-\x{1D1FF}\x{1D200}-\x{1D24F}\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]*[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}\x{00DF}\x{03C2}\x{200C}\x{200D}\pL0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})*)*(?:\?(?:[^\p{C}\p{Z}\p{S}\p{P}\p{Nl}\p{No}\p{Me}\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}\x{20D0}-\x{20FF}\x{1D100}-\x{1D1FF}\x{1D200}-\x{1D24F}\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]*[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}\x{00DF}\x{03C2}\x{200C}\x{200D}\pL0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?(?:\#(?:[^\p{C}\p{Z}\p{S}\p{P}\p{Nl}\p{No}\p{Me}\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}\x{20D0}-\x{20FF}\x{1D100}-\x{1D1FF}\x{1D200}-\x{1D24F}\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]*[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}\x{00DF}\x{03C2}\x{200C}\x{200D}\pL0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?"; break; case 'table_prefix': diff --git a/phpBB/includes/message_parser.php b/phpBB/includes/message_parser.php index 12ef94c07a..07fe969ce2 100644 --- a/phpBB/includes/message_parser.php +++ b/phpBB/includes/message_parser.php @@ -313,7 +313,7 @@ class bbcode_firstpass extends bbcode $in = str_replace(' ', '%20', $in); // Checking urls - if (!preg_match('#^' . get_preg_expression('url') . '$#i', $in) && !preg_match('#^' . get_preg_expression('www_url') . '$#iu', $in)) + if (!preg_match('#^' . get_preg_expression('url') . '$#iu', $in) && !preg_match('#^' . get_preg_expression('www_url') . '$#iu', $in)) { return '[img]' . $in . '[/img]'; } diff --git a/phpBB/phpbb/profilefields/type/type_url.php b/phpBB/phpbb/profilefields/type/type_url.php index bc8ac869d0..fe0bffd582 100644 --- a/phpBB/phpbb/profilefields/type/type_url.php +++ b/phpBB/phpbb/profilefields/type/type_url.php @@ -64,7 +64,7 @@ class type_url extends type_string return false; } - if (!preg_match('#^' . get_preg_expression('url') . '$#i', $field_value)) + if (!preg_match('#^' . get_preg_expression('url') . '$#iu', $field_value)) { return $this->user->lang('FIELD_INVALID_URL', $this->get_field_name($field_data['lang_name'])); } -- cgit v1.2.1