Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow unicode domain name and path #423

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions java/src/main/java/com/twitter/twittertext/Regex.java
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ protected Regex() {
// Regex fragment for what may come immediately before a URL. It is a non-capturing
// group with three alternatives: one character that is NOT a-z, 0-9, '@', '$', '#'
// or any of INVALID_CHARACTERS; one character from DIRECTIONAL_CHARACTERS; or the
// start anchor (^).
// NOTE(review): '@' and '#' each appear twice in the negated class, which is redundant
// as written — presumably one of each pair was a full-width variant before this text
// was extracted; verify against the repository.
private static final String URL_VALID_PRECEDING_CHARS =
"(?:[^a-z0-9@@$##" + INVALID_CHARACTERS + "]|[" + DIRECTIONAL_CHARACTERS + "]|^)";

// NOTE(review): URL_VALID_CHARS is declared twice below (first and third declaration).
// This looks like the before/after pair of a rendered diff; Java rejects duplicate field
// declarations, so only one of the two can exist in the real source — TODO confirm which.
private static final String URL_VALID_CHARS = "[a-z0-9" + LATIN_ACCENTS_CHARS + "]";
// Character-class body (no surrounding brackets) naming the Unicode general categories
// M (marks) and L (letters). The definitions that use it concatenate it inside their own
// [...] class.
private static final String URL_UNICODE_CHARS ="\\p{M}\\p{L}";
// Single-character class: a-z, 0-9, any Unicode mark or letter, plus LATIN_ACCENTS_CHARS.
private static final String URL_VALID_CHARS = "[a-z0-9" + URL_UNICODE_CHARS +LATIN_ACCENTS_CHARS + "]";
// One dot-terminated subdomain label, wrapped in an atomic group (?>...) so the regex
// engine does not backtrack into it once it has matched. Shape: an optional prefix (one
// URL_VALID_CHARS character followed by any run of URL_VALID_CHARS characters, '-' or
// '_'), then one URL_VALID_CHARS character, then a literal '.'. The character right
// before the dot must therefore come from URL_VALID_CHARS.
// URL_VALID_CHARS is itself a bracketed class, so "[" + URL_VALID_CHARS + "\\-_]" nests
// one class inside another; java.util.regex treats a nested class as a union.
private static final String URL_VALID_SUBDOMAIN =
"(?>(?:" + URL_VALID_CHARS + "[" + URL_VALID_CHARS + "\\-_]*)?" + URL_VALID_CHARS + "\\.)";
private static final String URL_VALID_DOMAIN_NAME =
Expand All @@ -159,7 +160,7 @@ protected Regex() {
// Any non-space, non-punctuation characters.
// \p{Z} = any kind of whitespace or invisible separator.
// NOTE(review): two pattern lines follow (apparently the before/after pair of a rendered
// diff). In the second one URL_UNICODE_CHARS (\p{M}\p{L}) sits inside the NEGATED class
// [^...], so that variant excludes every Unicode letter and mark rather than allowing
// them. That looks contrary to the stated goal of accepting Unicode domain names —
// confirm the intent before keeping that line.
private static final String URL_VALID_UNICODE_CHARS =
"[^" + PUNCTUATION_CHARS + "\\s\\p{Z}\\p{InGeneralPunctuation}]";
"[^" + PUNCTUATION_CHARS+ URL_UNICODE_CHARS + "\\s\\p{Z}\\p{InGeneralPunctuation}]";
// One dot-terminated domain label built from URL_VALID_UNICODE_CHARS, in a non-capturing
// group. Shape: an optional prefix (one valid character followed by any run of valid
// characters or '-'), then one valid character, then a literal '.'. Unlike
// URL_VALID_SUBDOMAIN this group is not atomic and '_' is not accepted in the run.
// URL_VALID_UNICODE_CHARS is a bracketed (negated) class nested inside "[" ... "\\-]";
// java.util.regex treats the nested class as a union with the hyphen.
private static final String URL_VALID_UNICODE_DOMAIN_NAME =
"(?:(?:" + URL_VALID_UNICODE_CHARS + "[" + URL_VALID_UNICODE_CHARS + "\\-]*)?" +
URL_VALID_UNICODE_CHARS + "\\.)";
Expand Down Expand Up @@ -188,7 +189,7 @@ protected Regex() {

// Single-character class for the general body of a URL path: a-z, 0-9, the listed ASCII
// punctuation (! * ' ; : = + , . $ / % # [ ] - _ ~ | & @), the en dash, and the
// characters contributed by the concatenated constants.
// "\\u2013" reaches the regex engine as the escape \u2013 (EN DASH); the doubled backslash
// keeps the Java compiler from translating it as a source-level Unicode escape.
// NOTE(review): the two continuation lines below appear to be the before/after pair of a
// rendered diff; the second additionally appends URL_UNICODE_CHARS (Unicode marks and
// letters). Only one of them can terminate the statement in real source — TODO confirm.
private static final String URL_VALID_GENERAL_PATH_CHARS =
"[a-z0-9!\\*';:=\\+,.\\$/%#\\[\\]\\-\\u2013_~\\|&@" +
LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + "]";
LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + URL_UNICODE_CHARS + "]";

/**
* Allow URL paths to contain up to two nested levels of balanced parens
Expand Down Expand Up @@ -216,7 +217,7 @@ protected Regex() {
* 2. Allow =&# for empty URL parameters and other URL-join artifacts
*/
// Alternation of two options: a single character from the class (a-z, 0-9, = _ # / - +
// and the concatenated character sets), or a URL_BALANCED_PARENS match in a non-capturing
// group. The '|' is not enclosed in a group of its own in this fragment, so any pattern
// embedding it has to supply the surrounding group.
// NOTE(review): the second and third lines below appear to be the before/after pair of a
// rendered diff; the later one additionally appends URL_UNICODE_CHARS (Unicode marks and
// letters) to the class. Only one of them can be part of the statement in real source —
// TODO confirm.
private static final String URL_VALID_PATH_ENDING_CHARS =
"[a-z0-9=_#/\\-\\+" + LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + "]|(?:" +
"[a-z0-9=_#/\\-\\+" + LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + URL_UNICODE_CHARS + "]|(?:" +
URL_BALANCED_PARENS + ")";

private static final String URL_VALID_PATH = "(?:" +
Expand Down
12 changes: 12 additions & 0 deletions java/src/test/java/com/twitter/twittertext/ExtractorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,18 @@ public void testUrlWithSpecialCCTLDWithoutProtocol() {
assertTrue("Should not extract URLs w/o protocol", extractor.extractURLs(text).isEmpty());
}


/**
 * Checks that URLs containing non-ASCII letters are extracted in full: first a pair of
 * URLs with a Unicode host (one of them also carrying a Unicode path), then a single
 * URL whose subdomain is Unicode.
 */
public void testUrlWithUnicode() {
  final String failureMessage = "Failed to extract URLs with unicode";

  // Two space-separated URLs; both must come back, in order.
  final String hostAndPathInput = "http://www.詹姆斯.com http://www.詹姆斯.com/詹姆斯";
  final String[] hostAndPathExpected = {"http://www.詹姆斯.com", "http://www.詹姆斯.com/詹姆斯"};
  assertList(failureMessage, hostAndPathExpected, extractor.extractURLs(hostAndPathInput));

  // Unicode subdomain in front of an ASCII domain, with a trailing slash on the path.
  final String subdomainInput = "https://简体中文.winshipway.com/good/";
  final String[] subdomainExpected = {"https://简体中文.winshipway.com/good/"};
  assertList(failureMessage, subdomainExpected, extractor.extractURLs(subdomainInput));
}
/**
* Helper method for asserting that the List of extracted Strings match the expected values.
*
Expand Down