diff --git a/ark-tweet-nlp/pom.xml b/ark-tweet-nlp/pom.xml
index 526bbfb..1c3597e 100644
--- a/ark-tweet-nlp/pom.xml
+++ b/ark-tweet-nlp/pom.xml
@@ -10,6 +10,7 @@
UTF-8
+
@@ -27,17 +28,30 @@
+
+
+ add-test-source
+ generate-test-sources
+
+ add-test-source
+
+
+
+
+
+
+
-
+
org.apache.maven.plugins
maven-shade-plugin
1.6
- package
+ none
shade
@@ -50,7 +64,7 @@
-
+
org.apache.maven.plugins
maven-compiler-plugin
@@ -108,11 +122,11 @@
**/*.jar
-
+
-
+
@@ -133,9 +147,9 @@
1.4
- commons-lang
- commons-lang
- 2.3
+ org.apache.commons
+ commons-lang3
+ 3.4
org.apache.commons
@@ -162,6 +176,13 @@
guava
10.0.1
+
+
+ org.pirkaengine
+ pirka-mobile
+ 0.3.0
+
+
junit
@@ -181,7 +202,23 @@
jackson-databind
2.0.0
-
+
+
+ edu.stanford.nlp
+ stanford-corenlp
+ 1.2.0
+
+
+ edu.berkeley.nlp
+ berkeleyparser
+ r32
+
+
+ net.sf.trove4j
+ trove4j
+ 3.0.3
+
+
diff --git a/src/cmu/arktweetnlp/EmojiExtractor.java b/src/cmu/arktweetnlp/EmojiExtractor.java
new file mode 100644
index 0000000..259b053
--- /dev/null
+++ b/src/cmu/arktweetnlp/EmojiExtractor.java
@@ -0,0 +1,13 @@
+package cmu.arktweetnlp;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.pirkaengine.mobile.Emoji;
+import java.util.List;
+
+
+/**
+ * Extracts emojis from text: returns the text to keep tokenizing (left) paired with the emojis found (right).
+ */
+public interface EmojiExtractor {
+    public Pair<String, List<Emoji>> extractEmojis(final String text);
+}
diff --git a/src/cmu/arktweetnlp/PatternContext.java b/src/cmu/arktweetnlp/PatternContext.java
new file mode 100644
index 0000000..9bb0e18
--- /dev/null
+++ b/src/cmu/arktweetnlp/PatternContext.java
@@ -0,0 +1,44 @@
+package cmu.arktweetnlp;
+
+import java.util.regex.Pattern;
+
+/**
+ * Interface for the collection of Patterns needed by the Twokenize module.
+ * Allows users to pass in a custom set of patterns or use the DefaultPatternContext bundled with the library.
+ */
+public interface PatternContext {
+ /**
+ * @return A pattern that can be used to detect contractions
+ */
+ public Pattern getContractionPattern();
+
+ /**
+ * @return A pattern that can be used to detect whitespace
+ */
+ public Pattern getWhitespacePattern();
+
+ /**
+ * @return A pattern that can be used to detect any desired
+ * "protected" tokens -- tokens that should not be split any further.
+ */
+ public Pattern getProtectedTokenPattern();
+
+ /**
+ * @return A pattern that can be used to detect left edge punctuation
+ */
+ public Pattern getLeftEdgePunctuationPattern();
+
+ /**
+ * @return A pattern that can be used to detect right edge punctuation
+ */
+ public Pattern getRightEdgePunctuationPattern();
+
+ /** @return The input with edge punctuation separated from the adjacent word by a space (see DefaultPatternContext) */
+ public String splitEdgePunctuation(String input);
+
+ /**
+ * @return The input with each run of whitespace collapsed into a single
+ * space; the bundled DefaultPatternContext also trims both ends of the result
+ */
+ public String squeezeWhitespace(String input);
+}
diff --git a/src/cmu/arktweetnlp/TokenCategorizer.java b/src/cmu/arktweetnlp/TokenCategorizer.java
new file mode 100644
index 0000000..ebc45ec
--- /dev/null
+++ b/src/cmu/arktweetnlp/TokenCategorizer.java
@@ -0,0 +1,15 @@
+package cmu.arktweetnlp;
+
+
+import org.pirkaengine.mobile.Emoji;
+import java.util.*;
+
+/**
+ * Interface for objects that know how to group tokens output from Twokenize
+ * into categories of a given type. E.g. mapping certain token types to an enum.
+ * @param <T> the category type -- TODO confirm: the generic parameters appear to be missing from the declarations below
+ */
+public interface TokenCategorizer {
+ public Map> categorize(final String text, final List> splitTokens, final List> protectedTokens, final List emojis);
+}
+
diff --git a/src/cmu/arktweetnlp/Twokenize.java b/src/cmu/arktweetnlp/Twokenize.java
index 4397a40..285db24 100644
--- a/src/cmu/arktweetnlp/Twokenize.java
+++ b/src/cmu/arktweetnlp/Twokenize.java
@@ -4,17 +4,23 @@
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
+import java.util.Map;
import java.util.regex.*;
import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
-import org.apache.commons.lang.StringEscapeUtils;
+import cmu.arktweetnlp.impl.DefaultPatternContext;
+import cmu.arktweetnlp.impl.NoOpEmojiExtractor;
+import org.apache.commons.lang3.StringEscapeUtils;
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.pirkaengine.mobile.Emoji;
/**
* Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
* This is the Java version. If you want the old Python version, see: http://github.com/brendano/tweetmotif
- *
+ *
* This tokenizer code has gone through a long history:
*
* (1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
@@ -24,208 +30,102 @@
* (2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
* (2b) Jason Baldridge and David Snyder ported it to Scala
* (3) Brendan bugfixed the Scala port and merged with POS-specific changes
- * for the CMU ARK Twitter POS Tagger
+ * for the CMU ARK Twitter POS Tagger
* (4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)
- *
+ *
* Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP
*
* There have been at least 2 other Java ports, but they are not in the lineage for the code here.
*/
public class Twokenize {
- static Pattern Contractions = Pattern.compile("(?i)(\\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$");
- static Pattern Whitespace = Pattern.compile("[\\s\\p{Zs}]+");
-
- static String punctChars = "['\"“”‘’.?!…,:;]";
- //static String punctSeq = punctChars+"+"; //'anthem'. => ' anthem '.
- static String punctSeq = "['\"“”‘’]+|[.?!,…]+|[:;]+"; //'anthem'. => ' anthem ' .
- static String entity = "&(?:amp|lt|gt|quot);";
- // URLs
-
- // BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
- // If you actually empirically test it the results are bad.
- // Please see https://github.com/brendano/ark-tweet-nlp/pull/9
-
- static String urlStart1 = "(?:https?://|\\bwww\\.)";
- static String commonTLDs = "(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)";
- static String ccTLDs = "(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" +
- "bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" +
- "er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" +
- "hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" +
- "lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" +
- "nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" +
- "sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" +
- "va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)"; //TODO: remove obscure country domains?
- static String urlStart2 = "\\b(?:[A-Za-z\\d-])+(?:\\.[A-Za-z0-9]+){0,3}\\." + "(?:"+commonTLDs+"|"+ccTLDs+")"+"(?:\\."+ccTLDs+")?(?=\\W|$)";
- static String urlBody = "(?:[^\\.\\s<>][^\\s<>]*?)?";
- static String urlExtraCrapBeforeEnd = "(?:"+punctChars+"|"+entity+")+?";
- static String urlEnd = "(?:\\.\\.+|[<>]|\\s|$)";
- public static String url = "(?:"+urlStart1+"|"+urlStart2+")"+urlBody+"(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")";
-
-
- // Numeric
- static String timeLike = "\\d+(?::\\d+){1,2}";
- //static String numNum = "\\d+\\.\\d+";
- static String numberWithCommas = "(?:(?> splitTokens;
+ private final List> preservedTokens;
+ private final List emojis;
+
+ public TwokenizedTweet(String originalText, List> splitGoodTokens, List> badTokens, List emojis) {
+ this.originalText = originalText;
+ this.splitTokens = splitGoodTokens;
+ this.preservedTokens = badTokens;
+ this.emojis = emojis;
+ }
+
+ /**
+ * @return The original Tweet text
+ */
+ public String getOriginalText() {
+ return originalText;
+ }
+
+ /**
+ * @return The tokens that were split into single tokens
+ */
+ public List> getSplitTokens() {
+ return splitTokens;
+ }
+
+ /**
+ * @return The preserved tokens that were not split up. E.g. what's, http://test.com, etc.
+ */
+ public List> getPreservedTokens() {
+ return preservedTokens;
+ }
+
+ /**
+ * @return The emojis detected in the tweet
+ */
+ public List getEmojis() {
+ return emojis;
}
- sb.append(")");
- return sb.toString();
- }
-
- // Emoticons
- static String normalEyes = "(?iu)[:=]"; // 8 and x are eyes but cause problems
- static String wink = "[;]";
- static String noseArea = "(?:|-|[^a-zA-Z0-9 ])"; // doesn't get :'-(
- static String happyMouths = "[D\\)\\]\\}]+";
- static String sadMouths = "[\\(\\[\\{]+";
- static String tongue = "[pPd3]+";
- static String otherMouths = "(?:[oO]+|[/\\\\]+|[vV]+|[Ss]+|[|]+)"; // remove forward slash if http://'s aren't cleaned
-
- // mouth repetition examples:
- // @aliciakeys Put it in a love song :-))
- // @hellocalyclops =))=))=)) Oh well
-
- static String bfLeft = "(♥|0|o|°|v|\\$|t|x|;|\\u0CA0|@|ʘ|•|・|◕|\\^|¬|\\*)";
- static String bfCenter = "(?:[\\.]|[_-]+)";
- static String bfRight = "\\2";
- static String s3 = "(?:--['\"])";
- static String s4 = "(?:<|<|>|>)[\\._-]+(?:<|<|>|>)";
- static String s5 = "(?:[.][_]+[.])";
- static String basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5;
-
- static String eeLeft = "[\\\\\ƪԄ\\((<>;ヽ\\-=~\\*]+";
- static String eeRight= "[\\-=\\);'\\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+";
- static String eeSymbol = "[^A-Za-z0-9\\s\\(\\)\\*:=-]";
- static String eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight;
-
-
- public static String emoticon = OR(
- // Standard version :) :( :] :D :P
- "(?:>|>)?" + OR(normalEyes, wink) + OR(noseArea,"[Oo]") +
- OR(tongue+"(?=\\W|$|RT|rt|Rt)", otherMouths+"(?=\\W|$|RT|rt|Rt)", sadMouths, happyMouths),
-
- // reversed version (: D: use positive lookbehind to remove "(word):"
- // because eyes on the right side is more ambiguous with the standard usage of : ;
- "(?<=(?: |^))" + OR(sadMouths,happyMouths,otherMouths) + noseArea + OR(normalEyes, wink) + "(?:<|<)?",
-
- //inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
- eastEmote.replaceFirst("2", "1"), basicface
- // iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
- // TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
- );
-
- static String Hearts = "(?:<+/?3+)+"; //the other hearts are in decorations
-
- static String Arrows = "(?:<*[-―—=]*>+|<+[-―—=]*>*)|\\p{InArrows}+";
-
- // BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
- // "hello (#hashtag)" ==> "hello (#hashtag )" WRONG
- // "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT
- // "hello (@person)" ==> "hello (@person )" WRONG
- // "hello (@person)" ==> "hello ( @person )" RIGHT
- // ... Some sort of weird interaction with edgepunct I guess, because edgepunct
- // has poor content-symbol detection.
-
- // This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
- // If you want good hashtag identification, use a different regex.
- static String Hashtag = "#[a-zA-Z0-9_]+"; //optional: lookbehind for \b
- //optional: lookbehind for \b, max length 15
- static String AtMention = "[@@][a-zA-Z0-9_]+";
-
- // I was worried this would conflict with at-mentions
- // but seems ok in sample of 5800: 7 changes all email fixes
- // http://www.regular-expressions.info/email.html
- static String Bound = "(?:\\W|^|$)";
- public static String Email = "(?<=" +Bound+ ")[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,4}(?=" +Bound+")";
-
- // We will be tokenizing using these regexps as delimiters
- // Additionally, these things are "protected", meaning they shouldn't be further split themselves.
- static Pattern Protected = Pattern.compile(
- OR(
- Hearts,
- url,
- Email,
- timeLike,
- //numNum,
- numberWithCommas,
- numComb,
- emoticon,
- Arrows,
- entity,
- punctSeq,
- arbitraryAbbrev,
- separators,
- decorations,
- embeddedApostrophe,
- Hashtag,
- AtMention
- ));
-
- // Edge punctuation
- // Want: 'foo' => ' foo '
- // While also: don't => don't
- // the first is considered "edge punctuation".
- // the second is word-internal punctuation -- don't want to mess with it.
- // BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
- // I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate.
-
- // Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
- static String edgePunctChars = "'\"“”‘’«»{}\\(\\)\\[\\]\\*&"; //add \\p{So}? (symbols)
- static String edgePunct = "[" + edgePunctChars + "]";
- static String notEdgePunct = "[a-zA-Z0-9]"; // content characters
- static String offEdge = "(^|$|:|;|\\s|\\.|,)"; // colon here gets "(hello):" ==> "( hello ):"
- static Pattern EdgePunctLeft = Pattern.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")");
- static Pattern EdgePunctRight = Pattern.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge);
-
- public static String splitEdgePunct (String input) {
- Matcher m1 = EdgePunctLeft.matcher(input);
- input = m1.replaceAll("$1$2 $3");
- m1 = EdgePunctRight.matcher(input);
- input = m1.replaceAll("$1 $2$3");
- return input;
- }
-
- private static class Pair {
- public T1 first;
- public T2 second;
- public Pair(T1 x, T2 y) { first=x; second=y; }
}
+
// The main work of tokenizing a tweet.
- private static List simpleTokenize (String text) {
+ private static List simpleTokenize (final String text, final PatternContext patterns, final EmojiExtractor emojiExtractor) {
+ final TwokenizedTweet twokenizedTweet = tokenizeTweet(text, patterns, emojiExtractor);
+
+ // Re-interleave the split ('good') and preserved ('bad') token lists in order, ensuring that
+ // the additional tokens from the last good item get included
+ List zippedStr= new ArrayList();
+ int i;
+ for(i=0; i < twokenizedTweet.getPreservedTokens().size(); i++) {
+ zippedStr = addAllnonempty(zippedStr, twokenizedTweet.getSplitTokens().get(i));
+ zippedStr = addAllnonempty(zippedStr,twokenizedTweet.getPreservedTokens().get(i));
+ }
+ zippedStr = addAllnonempty(zippedStr,twokenizedTweet.getSplitTokens().get(i)); // i == preserved count here: relies on the split list holding one more entry than the preserved list
+
+ // BTO: our POS tagger wants "ur" and "you're" to both be one token.
+ // Uncomment to get "you 're"
+ /*ArrayList splitStr = new ArrayList(zippedStr.size());
+ for(String tok:zippedStr)
+ splitStr.addAll(splitToken(tok));
+ zippedStr=splitStr;*/
+ return zippedStr;
+ }
+
+ protected static TwokenizedTweet tokenizeTweet(final String text, final PatternContext patterns, final EmojiExtractor emojiExtractor) {
// Do the no-brainers first
- String splitPunctText = splitEdgePunct(text);
+ String splitPunctText = patterns.splitEdgePunctuation(text);
int textLength = splitPunctText.length();
-
+
// BTO: the logic here got quite convoluted via the Scala porting detour
// It would be good to switch back to a nice simple procedural style like in the Python version
// ... Scala is such a pain. Never again.
// Find the matches for subsequences that should be protected,
// e.g. URLs, 1.0, U.N.K.L.E., 12:53
- Matcher matches = Protected.matcher(splitPunctText);
- //Storing as List[List[String]] to make zip easier later on
+ Matcher matches = patterns.getProtectedTokenPattern().matcher(splitPunctText);
+ //Storing as List[List[String]] to make zip easier later on
List> bads = new ArrayList>(); //linked list?
List> badSpans = new ArrayList>();
while(matches.find()){
@@ -234,52 +134,44 @@ private static List simpleTokenize (String text) {
List bad = new ArrayList(1);
bad.add(splitPunctText.substring(matches.start(),matches.end()));
bads.add(bad);
- badSpans.add(new Pair(matches.start(),matches.end()));
+ badSpans.add(new ImmutablePair(matches.start(),matches.end()));
}
}
// Create a list of indices to create the "goods", which can be
- // split. We are taking "bad" spans like
- // List((2,5), (8,10))
- // to create
+ // split. We are taking "bad" spans like
+ // List((2,5), (8,10))
+ // to create
/// List(0, 2, 5, 8, 10, 12)
// where, e.g., "12" here would be the textLength
// has an even length and no indices are the same
List indices = new ArrayList(2+2*badSpans.size());
indices.add(0);
for(Pair p:badSpans){
- indices.add(p.first);
- indices.add(p.second);
+ indices.add(p.getLeft());
+ indices.add(p.getRight());
}
indices.add(textLength);
// Group the indices and map them to their respective portion of the string
List> splitGoods = new ArrayList>(indices.size()/2);
+ final List emojis = new ArrayList();
for (int i=0; i> goodStrAndEmojis = emojiExtractor.extractEmojis(goodstr);
+ goodstr = goodStrAndEmojis.getLeft();
+
+ emojis.addAll(goodStrAndEmojis.getRight());
+
List splitstr = Arrays.asList(goodstr.trim().split(" "));
splitGoods.add(splitstr);
}
- // Reinterpolate the 'good' and 'bad' Lists, ensuring that
- // additonal tokens from last good item get included
- List zippedStr= new ArrayList();
- int i;
- for(i=0; i < bads.size(); i++) {
- zippedStr = addAllnonempty(zippedStr,splitGoods.get(i));
- zippedStr = addAllnonempty(zippedStr,bads.get(i));
- }
- zippedStr = addAllnonempty(zippedStr,splitGoods.get(i));
-
- // BTO: our POS tagger wants "ur" and "you're" to both be one token.
- // Uncomment to get "you 're"
- /*ArrayList splitStr = new ArrayList(zippedStr.size());
- for(String tok:zippedStr)
- splitStr.addAll(splitToken(tok));
- zippedStr=splitStr;*/
-
- return zippedStr;
- }
+
+ return new TwokenizedTweet(text, splitGoods, bads, emojis);
+ }
private static List addAllnonempty(List master, List smaller){
for (String s : smaller){
@@ -289,42 +181,38 @@ private static List addAllnonempty(List master, List sma
}
return master;
}
- /** "foo bar " => "foo bar" */
- public static String squeezeWhitespace (String input){
- return Whitespace.matcher(input).replaceAll(" ").trim();
- }
-
- // Final pass tokenization based on special patterns
- private static List splitToken (String token) {
- Matcher m = Contractions.matcher(token);
- if (m.find()){
- String[] contract = {m.group(1), m.group(2)};
- return Arrays.asList(contract);
- }
- String[] contract = {token};
- return Arrays.asList(contract);
+ /** Tokenizes 'text' using the default pattern context and the default emoji extractor. Assume 'text' has no HTML escaping. **/
+ public static List tokenize(String text) {
+ return simpleTokenize(DEFAULT_PATTERN_CONTEXT.squeezeWhitespace(text), DEFAULT_PATTERN_CONTEXT, DEFAULT_EMOJI_EXTRACTOR);
+ }
/** Assume 'text' has no HTML escaping. **/
- public static List tokenize(String text){
- return simpleTokenize(squeezeWhitespace(text));
+ public static List tokenize(final String text, final PatternContext patternContext, final EmojiExtractor emojiExtractor) {
+ return simpleTokenize(patternContext.squeezeWhitespace(text), patternContext, emojiExtractor); // whitespace is squeezed with the caller-supplied context before tokenizing with that same context
+ }
+ /**
+ * Tokenizes the given text and applies the given categorizer to group the tokens into categories,
+ * using the default pattern context and the default emoji extractor.
+ */
+ public static Map> tokenizeIntoCategories(final String text, final TokenCategorizer categorizer) {
+ return tokenizeIntoCategories(text, categorizer, DEFAULT_PATTERN_CONTEXT, DEFAULT_EMOJI_EXTRACTOR);
+ }
/**
- * Twitter text comes HTML-escaped, so unescape it.
- * We also first unescape &'s, in case the text has been buggily double-escaped.
+ * Same as tokenizeIntoCategories but uses a custom PatternContext and EmojiExtractor
*/
- public static String normalizeTextForTagger(String text) {
- text = text.replaceAll("&", "&");
- text = StringEscapeUtils.unescapeHtml(text);
- return text;
+ public static Map> tokenizeIntoCategories(final String text, final TokenCategorizer categorizer, final PatternContext patterns, final EmojiExtractor emojiExtractor) {
+ final String cleaned = patterns.squeezeWhitespace(text);
+ final TwokenizedTweet twokenizedTweet = tokenizeTweet(cleaned, patterns, emojiExtractor);
+ final Map> tokenCategories = categorizer.categorize(twokenizedTweet.getOriginalText(), twokenizedTweet.getSplitTokens(), twokenizedTweet.getPreservedTokens(), twokenizedTweet.getEmojis());
+ return tokenCategories;
}
+
/**
* This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
- *
+ *
* This function normalizes the input text BEFORE calling the tokenizer.
* So the tokens you get back may not exactly correspond to
* substrings of the original text.
@@ -334,6 +222,16 @@ public static List tokenizeRawTweetText(String text) {
return tokens;
}
+ /**
+  * Twitter text comes HTML-escaped, so unescape it.
+  * We first turn every {@code &amp;} into {@code &}, so that text which has been buggily
+  * double-escaped (e.g. {@code &amp;lt;}) is fully unescaped by the HTML unescaping step.
+  *
+  * @param text the raw, HTML-escaped tweet text
+  * @return the text with HTML entities unescaped
+  */
+ public static String normalizeTextForTagger(String text) {
+     // Literal replacement: the previous replaceAll("&", "&") replaced '&' with itself and did nothing.
+     text = text.replace("&amp;", "&");
+     text = StringEscapeUtils.unescapeHtml4(text);
+     return text;
+ }
+
/** Tokenizes tweet texts on standard input, tokenizations on standard output. Input and output UTF-8. */
public static void main(String[] args) throws IOException {
BufferedReader input = new BufferedReader(new InputStreamReader(System.in,"UTF-8"));
@@ -350,5 +248,5 @@ public static void main(String[] args) throws IOException {
output.print("\n");
}
}
-
+
}
diff --git a/src/cmu/arktweetnlp/impl/DefaultPatternContext.java b/src/cmu/arktweetnlp/impl/DefaultPatternContext.java
new file mode 100644
index 0000000..362ca4b
--- /dev/null
+++ b/src/cmu/arktweetnlp/impl/DefaultPatternContext.java
@@ -0,0 +1,226 @@
+package cmu.arktweetnlp.impl;
+
+import cmu.arktweetnlp.PatternContext;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class DefaultPatternContext implements PatternContext {
+
+ public static Pattern Contractions = Pattern.compile("(?i)(\\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$");
+ public static Pattern Whitespace = Pattern.compile("[\\s\\p{Zs}]+");
+
+ public static String punctChars = "['\"“”‘’.?!…,:;]";
+ //public static String punctSeq = punctChars+"+"; //'anthem'. => ' anthem '.
+ public static String punctSeq = "['\"“”‘’]+|[.?!,…]+|[:;]+"; //'anthem'. => ' anthem ' .
+ public static String entity = "&(?:amp|lt|gt|quot);";
+ // URLs
+
+ // BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
+ // If you actually empirically test it the results are bad.
+ // Please see https://github.com/brendano/ark-tweet-nlp/pull/9
+
+ public static String urlStart1 = "(?:https?://|\\bwww\\.)";
+ public static String commonTLDs = "(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)";
+ public static String ccTLDs = "(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" +
+ "bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" +
+ "er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" +
+ "hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" +
+ "lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" +
+ "nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" +
+ "sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" +
+ "va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)"; //TODO: remove obscure country domains?
+ public static String urlStart2 = "\\b(?:[A-Za-z\\d-])+(?:\\.[A-Za-z0-9]+){0,3}\\." + "(?:"+commonTLDs+"|"+ccTLDs+")"+"(?:\\."+ccTLDs+")?(?=\\W|$)";
+ public static String urlBody = "(?:[^\\.\\s<>][^\\s<>]*?)?";
+ public static String urlExtraCrapBeforeEnd = "(?:"+punctChars+"|"+entity+")+?";
+ public static String urlEnd = "(?:\\.\\.+|[<>]|\\s|$)";
+ public static String url = "(?:"+urlStart1+"|"+urlStart2+")"+urlBody+"(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")";
+
+
+ // Numeric
+ public static String timeLike = "\\d+(?::\\d+){1,2}";
+ //public static String numNum = "\\d+\\.\\d+";
+ public static String numberWithCommas = "(?:(?|>)[\\._-]+(?:<|<|>|>)";
+ public static String s5 = "(?:[.][_]+[.])";
+ public static String basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5;
+
+ public static String eeLeft = "[\\\\\ƪԄ\\((<>;ヽ\\-=~\\*]+";
+ public static String eeRight= "[\\-=\\);'\\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+";
+ public static String eeSymbol = "[^A-Za-z0-9\\s\\(\\)\\*:=-]";
+ public static String eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight;
+
+
+ public static String emoticon = OR(
+ // Standard version :) :( :] :D :P
+ "(?:>|>)?" + OR(normalEyes, wink) + OR(noseArea,"[Oo]") +
+ OR(tongue+"(?=\\W|$|RT|rt|Rt)", otherMouths+"(?=\\W|$|RT|rt|Rt)", sadMouths, happyMouths),
+
+ // reversed version (: D: use positive lookbehind to remove "(word):"
+ // because eyes on the right side is more ambiguous with the standard usage of : ;
+ "(?<=(?: |^))" + OR(sadMouths,happyMouths,otherMouths) + noseArea + OR(normalEyes, wink) + "(?:<|<)?",
+
+ //inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
+ eastEmote.replaceFirst("2", "1"), basicface
+ // iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
+ // TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
+ );
+
+ public static String Hearts = "(?:<+/?3+)+"; //the other hearts are in decorations
+
+ public static String Arrows = "(?:<*[-―—=]*>+|<+[-―—=]*>*)|\\p{InArrows}+";
+
+ // BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
+ // "hello (#hashtag)" ==> "hello (#hashtag )" WRONG
+ // "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT
+ // "hello (@person)" ==> "hello (@person )" WRONG
+ // "hello (@person)" ==> "hello ( @person )" RIGHT
+ // ... Some sort of weird interaction with edgepunct I guess, because edgepunct
+ // has poor content-symbol detection.
+
+ // This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
+ // If you want good hashtag identification, use a different regex.
+ public static String Hashtag = "#[a-zA-Z0-9_]+"; //optional: lookbehind for \b
+ //optional: lookbehind for \b, max length 15
+ public static String AtMention = "[@@][a-zA-Z0-9_]+";
+
+ // I was worried this would conflict with at-mentions
+ // but seems ok in sample of 5800: 7 changes all email fixes
+ // http://www.regular-expressions.info/email.html
+ public static String Bound = "(?:\\W|^|$)";
+ public static String Email = "(?<=" +Bound+ ")[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,4}(?=" +Bound+")";
+
+ // We will be tokenizing using these regexps as delimiters
+ // Additionally, these things are "protected", meaning they shouldn't be further split themselves.
+ public static Pattern Protected = Pattern.compile(
+ OR(
+ Hearts,
+ url,
+ Email,
+ timeLike,
+ //numNum,
+ numberWithCommas,
+ numComb,
+ emoticon,
+ Arrows,
+ entity,
+ punctSeq,
+ arbitraryAbbrev,
+ separators,
+ decorations,
+ embeddedApostrophe,
+ Hashtag,
+ AtMention
+ ));
+
+ // Edge punctuation
+ // Want: 'foo' => ' foo '
+ // While also: don't => don't
+ // the first is considered "edge punctuation".
+ // the second is word-internal punctuation -- don't want to mess with it.
+ // BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
+ // I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate.
+
+ // Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
+ public static String edgePunctChars = "'\"“”‘’«»{}\\(\\)\\[\\]\\*&"; //add \\p{So}? (symbols)
+ public static String edgePunct = "[" + edgePunctChars + "]";
+ public static String notEdgePunct = "[a-zA-Z0-9]"; // content characters
+ public static String offEdge = "(^|$|:|;|\\s|\\.|,)"; // colon here gets "(hello):" ==> "( hello ):"
+ public static Pattern EdgePunctLeft = Pattern.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")");
+ public static Pattern EdgePunctRight = Pattern.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge);
+
+ @Override
+ public Pattern getContractionPattern() {
+ return Contractions; // static pattern of this class: a word followed by a trailing contraction suffix (n't, 've, 'll, 'd, 're, 's, 'm)
+ }
+
+ @Override
+ public Pattern getWhitespacePattern() {
+ return Whitespace; // static pattern of this class: one or more \s or Unicode space-separator (Zs) characters
+ }
+
+ @Override
+ public Pattern getProtectedTokenPattern() {
+ return Protected; // static pattern of this class: alternation of the hearts, URL, email, number, emoticon, hashtag, at-mention etc. sub-patterns
+ }
+
+ @Override
+ public Pattern getLeftEdgePunctuationPattern() {
+ return EdgePunctLeft; // static pattern of this class with groups (off-edge)(edge punctuation)(content character)
+ }
+
+ @Override
+ public Pattern getRightEdgePunctuationPattern() {
+ return EdgePunctRight; // static pattern of this class with groups (content character)(edge punctuation)(off-edge)
+ }
+
+
+ /**
+  * Separates edge punctuation from the content character it touches by inserting a space,
+  * first on the left edge of words and then on the right edge.
+  *
+  * @param input the text to process
+  * @return the input with a space inserted between edge punctuation and adjacent content
+  */
+ @Override
+ public String splitEdgePunctuation(String input) {
+     // Left pattern groups: (off-edge)(punctuation)(content) -- the space goes before the content.
+     final String leftSplit = getLeftEdgePunctuationPattern().matcher(input).replaceAll("$1$2 $3");
+     // Right pattern groups: (content)(punctuation)(off-edge) -- the space goes after the content.
+     return getRightEdgePunctuationPattern().matcher(leftSplit).replaceAll("$1 $2$3");
+ }
+
+ /**
+  * Replaces every match of the whitespace pattern with a single space and trims the result,
+  * so runs of spaces collapse to one space and leading/trailing whitespace is dropped.
+  *
+  * @param input the text to normalize
+  * @return the whitespace-normalized text
+  */
+ @Override
+ public String squeezeWhitespace (String input){
+     return getWhitespacePattern().matcher(input).replaceAll(" ").trim();
+ }
+
+ /**
+  * Final pass tokenization based on special patterns: splits a token on the contraction pattern.
+  *
+  * @param token the token to examine
+  * @return a two-element list holding groups 1 and 2 of the contraction match when the pattern
+  *         is found in the token, otherwise a single-element list holding the token unchanged
+  */
+ public List<String> splitToken (String token) {
+     final Matcher m = getContractionPattern().matcher(token);
+     if (m.find()) {
+         return Arrays.asList(m.group(1), m.group(2));
+     }
+     return Arrays.asList(token);
+ }
+}
diff --git a/src/cmu/arktweetnlp/impl/EmojiExtractorImpl.java b/src/cmu/arktweetnlp/impl/EmojiExtractorImpl.java
new file mode 100644
index 0000000..1602f2f
--- /dev/null
+++ b/src/cmu/arktweetnlp/impl/EmojiExtractorImpl.java
@@ -0,0 +1,15 @@
+package cmu.arktweetnlp.impl;
+
+import cmu.arktweetnlp.EmojiExtractor;
+import cmu.arktweetnlp.util.EmojiUtil;
+import org.apache.commons.lang3.tuple.Pair;
+import org.pirkaengine.mobile.Emoji;
+
+import java.util.List;
+/** EmojiExtractor that delegates the extraction to EmojiUtil.filterEmoji. */
+public class EmojiExtractorImpl implements EmojiExtractor {
+    @Override
+    public Pair<String, List<Emoji>> extractEmojis(final String text) {
+        return EmojiUtil.filterEmoji(text);
+    }
+}
diff --git a/src/cmu/arktweetnlp/impl/NoOpEmojiExtractor.java b/src/cmu/arktweetnlp/impl/NoOpEmojiExtractor.java
new file mode 100644
index 0000000..a266fed
--- /dev/null
+++ b/src/cmu/arktweetnlp/impl/NoOpEmojiExtractor.java
@@ -0,0 +1,16 @@
+package cmu.arktweetnlp.impl;
+
+import cmu.arktweetnlp.EmojiExtractor;
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.pirkaengine.mobile.Emoji;
+
+import java.util.Collections;
+import java.util.List;
+/** EmojiExtractor that extracts nothing: returns the text unchanged, paired with an empty emoji list. */
+public class NoOpEmojiExtractor implements EmojiExtractor {
+    @Override
+    public Pair<String, List<Emoji>> extractEmojis(String text) {
+        return new ImmutablePair<String, List<Emoji>>(text, Collections.<Emoji>emptyList());
+    }
+}
diff --git a/src/cmu/arktweetnlp/impl/features/FeatureUtil.java b/src/cmu/arktweetnlp/impl/features/FeatureUtil.java
index ed40bd2..184d815 100644
--- a/src/cmu/arktweetnlp/impl/features/FeatureUtil.java
+++ b/src/cmu/arktweetnlp/impl/features/FeatureUtil.java
@@ -8,6 +8,7 @@
import cmu.arktweetnlp.Twokenize;
+import cmu.arktweetnlp.impl.DefaultPatternContext;
import com.twitter.Regex;
@@ -16,7 +17,7 @@
**/
public class FeatureUtil {
- public static Pattern URL = Pattern.compile(Twokenize.OR(Twokenize.url, Twokenize.Email));
+ public static Pattern URL = Pattern.compile(DefaultPatternContext.OR(DefaultPatternContext.url, DefaultPatternContext.Email));
public static Pattern justbase = Pattern.compile("(?!www\\.|ww\\.|w\\.|@)[a-zA-Z0-9]+\\.[A-Za-z0-9\\.]+");
// Pattern URL = Pattern.compile(Twokenize.url);
diff --git a/src/cmu/arktweetnlp/impl/features/MiscFeatures.java b/src/cmu/arktweetnlp/impl/features/MiscFeatures.java
index 96b99ce..42aac5b 100644
--- a/src/cmu/arktweetnlp/impl/features/MiscFeatures.java
+++ b/src/cmu/arktweetnlp/impl/features/MiscFeatures.java
@@ -5,6 +5,7 @@
import java.util.regex.Pattern;
import cmu.arktweetnlp.Twokenize;
+import cmu.arktweetnlp.impl.DefaultPatternContext;
import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface;
import cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs;
@@ -121,7 +122,7 @@ public static class SimpleOrthFeatures implements FeatureExtractorInterface {
/** TODO change to punctuation class, or better from Twokenize **/
//Pattern allPunct = Pattern.compile("^[^a-zA-Z0-9]*$");
Pattern allPunct = Pattern.compile("^\\W*$");
- Pattern emoticon = Pattern.compile(Twokenize.emoticon);
+ Pattern emoticon = Pattern.compile(DefaultPatternContext.emoticon);
public void addFeatures(List tokens, PositionFeaturePairs pairs) {
for (int t=0; t < tokens.size(); t++) {
String tok = tokens.get(t);
@@ -154,8 +155,8 @@ public void addFeatures(List tokens, PositionFeaturePairs pairs) {
}
}
public static class URLFeatures implements FeatureExtractorInterface {
- Pattern validURL = Pattern.compile(Twokenize.url);
- Pattern validEmail = Pattern.compile(Twokenize.Email);
+ Pattern validURL = Pattern.compile(DefaultPatternContext.url);
+ Pattern validEmail = Pattern.compile(DefaultPatternContext.Email);
public void addFeatures(List tokens, PositionFeaturePairs pairs) {
for (int t=0; t < tokens.size(); t++) {
String tok = tokens.get(t);
diff --git a/src/cmu/arktweetnlp/impl/features/WordListFeatures.java b/src/cmu/arktweetnlp/impl/features/WordListFeatures.java
index e4146d2..cdb6d04 100644
--- a/src/cmu/arktweetnlp/impl/features/WordListFeatures.java
+++ b/src/cmu/arktweetnlp/impl/features/WordListFeatures.java
@@ -8,6 +8,7 @@
import java.util.regex.Pattern;
import cmu.arktweetnlp.Twokenize;
+import cmu.arktweetnlp.impl.DefaultPatternContext;
import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface;
import cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs;
import cmu.arktweetnlp.util.BasicFileIO;
@@ -15,7 +16,7 @@
public class WordListFeatures {
public static class POSTagDict implements FeatureExtractorInterface {
- Pattern URL = Pattern.compile(Twokenize.url);
+ Pattern URL = Pattern.compile(DefaultPatternContext.url);
Pattern letter = Pattern.compile("[A-Za-z]{3,}");
public void addFeatures(List tokens, PositionFeaturePairs pairs) {
for (int t=0; t < tokens.size(); t++) {
diff --git a/src/cmu/arktweetnlp/util/EmojiUtil.java b/src/cmu/arktweetnlp/util/EmojiUtil.java
new file mode 100644
index 0000000..07cf31c
--- /dev/null
+++ b/src/cmu/arktweetnlp/util/EmojiUtil.java
@@ -0,0 +1,34 @@
+package cmu.arktweetnlp.util;
+
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.pirkaengine.mobile.Emoji;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Helpers for separating emoji from ordinary text.
+ */
+public class EmojiUtil {
+
+    /**
+     * Splits {@code text} into its non-emoji characters and the emojis it contains.
+     *
+     * <p>The text is walked one Unicode code point at a time, so a surrogate pair is
+     * handled as a single unit. Every code point for which {@link Emoji#charOf} returns
+     * a value is dropped from the text and collected; everything else is copied to the
+     * output unchanged.
+     *
+     * @param text the text to filter; {@code null} is treated like the empty string
+     * @return a pair whose left element is the text with the recognized emojis removed
+     *         and whose right element lists those emojis in order of appearance
+     */
+    public static Pair<String, List<Emoji>> filterEmoji(String text) {
+        final List<Emoji> emojis = new ArrayList<Emoji>();
+        if (text == null || text.isEmpty()) {
+            return new ImmutablePair<String, List<Emoji>>("", emojis);
+        }
+
+        // The buffer never leaves this method, so the unsynchronized StringBuilder suffices.
+        final StringBuilder term = new StringBuilder(text.length());
+        for (int i = 0; i < text.length(); ) {
+            final int codePoint = text.codePointAt(i);
+            // Code points at or above 0x1FFFF are never looked up and always count as text.
+            final Emoji em = codePoint < 0x1FFFF ? Emoji.charOf(codePoint) : null;
+            if (em == null) {
+                term.appendCodePoint(codePoint);
+            } else {
+                emojis.add(em);
+            }
+            // Advance by one or two chars depending on whether this was a surrogate pair.
+            i += Character.charCount(codePoint);
+        }
+        return new ImmutablePair<String, List<Emoji>>(term.toString(), emojis);
+    }
+}
diff --git a/test/cmu/arktweetnlp/TwokenizeTest.java b/test/cmu/arktweetnlp/TwokenizeTest.java
new file mode 100644
index 0000000..9006887
--- /dev/null
+++ b/test/cmu/arktweetnlp/TwokenizeTest.java
@@ -0,0 +1,105 @@
+package cmu.arktweetnlp;
+
+import org.junit.Test;
+
+import java.util.*;
+
+import static org.junit.Assert.*;
+
+public class TwokenizeTest {
+
+ public static final String INPUT_1 = "What's the greatest invention of all time? — Tumblr. http://t.co/IPZPnKqVk2";
+ public static final List EXPECTED_TOKENS_1 = Arrays.asList("What's", "the", "greatest", "invention", "of", "all", "time", "?", "—", "Tumblr", ".", "http://t.co/IPZPnKqVk2");
+
+ public static final String INPUT_2 = "Looking for Apple Mac repairers near Naas....Anyone able to help? #kildare https://t.co/1lhLT6EtWs";
+ public static final List EXPECTED_TOKENS_2 = Arrays.asList("Looking", "for", "Apple", "Mac", "repairers", "near", "Naas", "....", "Anyone", "able", "to", "help", "?", "#kildare", "https://t.co/1lhLT6EtWs");
+
+ public static final String INPUT_3 = "RT @EKM94: The best thing I've seen on Twitter all day. http://t.co/lhYh13jUD0";
+ public static final List EXPECTED_TOKENS_3 = Arrays.asList("RT", "@EKM94", ":", "The", "best", "thing", "I've", "seen", "on", "Twitter", "all", "day", ".", "http://t.co/lhYh13jUD0");
+
+ public static final String INPUT_4 = "Butterball Turkey Bacon Only $.54 At Walgreens! via Couponing For 4 - Starting the week of 6/28, ... http://t.co/0AdaJsqwIR";
+ public static final List EXPECTED_TOKENS_4 = Arrays.asList("Butterball", "Turkey", "Bacon", "Only", "$", ".", "54", "At", "Walgreens", "!", "via", "Couponing", "For", "4", "-", "Starting", "the", "week", "of", "6/28", ",", "...", "http://t.co/0AdaJsqwIR");
+
+ public static final String INPUT_5 = "RT @beingactress: ♥Taking joy in living is a woman’s best cosmetic♥ @actressharshika http://t.co/AF8Bl69Uyu";
+ public static final List EXPECTED_TOKENS_5 = Arrays.asList("RT", "@beingactress", ":", "♥", "Taking", "joy", "in", "living", "is", "a", "woman’s", "best", "cosmetic", "♥", "@actressharshika", "http://t.co/AF8Bl69Uyu");
+
+ public static final String INPUT_6 = "@larysaG Well thanks! Making me feel better already lol I'm Nancy btw :) Nice to meet u! I'll try remembering that when I'm terrified there.";
+ public static final List EXPECTED_TOKENS_6 = Arrays.asList("@larysaG", "Well", "thanks", "!", "Making", "me", "feel", "better", "already", "lol", "I'm", "Nancy", "btw", ":)", "Nice", "to", "meet", "u", "!", "I'll", "try", "remembering", "that", "when", "I'm", "terrified", "there", ".");
+
+ public static final String INPUT_7 = "*✲゚*。✧٩(・ิᴗ・ิ๑)۶ luke hemmings from 5sos you make me happy i love you so much , follow me please?@luke5sos*✲゚*。✧٩(・ิᴗ・ิ๑)۶ 77";
+ public static final List EXPECTED_TOKENS_7 = Arrays.asList("*✲゚*", "。✧٩", "(・ิᴗ・ิ๑)", "۶", "luke", "hemmings", "from", "5sos", "you", "make", "me", "happy", "i", "love", "you", "so", "much", ",", "follow", "me", "please", "?", "@luke5sos", "*✲゚*", "。✧٩", "(・ิᴗ・ิ๑)", "۶", "77");
+
+
+    /**
+     * Tokenizes each of the seven sample tweets declared in this class and checks
+     * the result against the matching expected token list.
+     */
+    @Test
+    public void itShouldTokenizeTweets() throws Exception {
+        assertTokenizesTo(EXPECTED_TOKENS_1, INPUT_1);
+        assertTokenizesTo(EXPECTED_TOKENS_2, INPUT_2);
+        assertTokenizesTo(EXPECTED_TOKENS_3, INPUT_3);
+        assertTokenizesTo(EXPECTED_TOKENS_4, INPUT_4);
+        assertTokenizesTo(EXPECTED_TOKENS_5, INPUT_5);
+        assertTokenizesTo(EXPECTED_TOKENS_6, INPUT_6);
+        assertTokenizesTo(EXPECTED_TOKENS_7, INPUT_7);
+    }
+
+    /**
+     * Runs {@code Twokenize.tokenize} on {@code input} and asserts that the result is
+     * non-empty, has the expected length and equals {@code expected} element for element.
+     *
+     * @param expected the tokens the tokenizer should produce, in order
+     * @param input    the raw tweet text to tokenize
+     */
+    private static void assertTokenizesTo(final List<String> expected, final String input) {
+        final List<String> actual = Twokenize.tokenize(input);
+        assertFalse(actual.isEmpty());
+        // Checked separately so a length mismatch is reported before the full list diff.
+        assertEquals(expected.size(), actual.size());
+        assertEquals(expected, actual);
+    }
+
+    /**
+     * Verifies that an input made of a single word, punctuation mark, URL or
+     * emoticon comes back as exactly one token equal to the input itself.
+     */
+    @Test
+    public void itShouldProduceASingleTokenFromAOneWordTweet() {
+        final List<String> singleTokenTweets =
+                Arrays.asList("test", "Test", "?", "!", ".", "http://test.com", ":)");
+
+        for (final String tweet : singleTokenTweets) {
+            final List<String> expected = Collections.singletonList(tweet);
+            assertEquals(expected, Twokenize.tokenize(tweet));
+        }
+    }
+
+    /** Verifies that tokenizing the empty string yields no tokens. */
+    @Test
+    public void itShouldProduceAnEmptyListOfTokensForTheEmptyString() {
+        assertTrue(Twokenize.tokenize("").isEmpty());
+    }
+
+    /** Verifies that a {@code null} tweet is rejected with a {@link NullPointerException}. */
+    @Test(expected = NullPointerException.class)
+    public void itShouldThrowNullPointerExceptionWithNullInput() {
+        final String absentTweet = null;
+        Twokenize.tokenize(absentTweet);
+    }
+}
diff --git a/test/cmu/arktweetnlp/impl/DefaultPatternContextTest.java b/test/cmu/arktweetnlp/impl/DefaultPatternContextTest.java
new file mode 100644
index 0000000..b9ffbd0
--- /dev/null
+++ b/test/cmu/arktweetnlp/impl/DefaultPatternContextTest.java
@@ -0,0 +1,38 @@
+package cmu.arktweetnlp.impl;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Unit tests for the whitespace and edge-punctuation helpers of
+ * {@link DefaultPatternContext}.
+ */
+public class DefaultPatternContextTest {
+
+    final DefaultPatternContext context = new DefaultPatternContext();
+
+    /**
+     * Empty and already-normalized input must come back unchanged, and a run of
+     * spaces between words must collapse to a single space.
+     */
+    @Test
+    public void itShouldSqeeuzeWhitespace() {
+        assertEquals("", context.squeezeWhitespace(""));
+        assertEquals("Hello world.", context.squeezeWhitespace("Hello world."));
+        // Input deliberately contains several consecutive spaces.
+        assertEquals("Hello world.", context.squeezeWhitespace("Hello    world."));
+    }
+
+    /** Squeezing {@code null} is expected to fail with a {@link NullPointerException}. */
+    @Test(expected = NullPointerException.class)
+    public void itShouldThrowNullPointerWhenSqueezingNull() {
+        context.squeezeWhitespace(null);
+    }
+
+    /**
+     * Empty input stays empty, and punctuation hugging a word is separated from
+     * it by a space: {@code "*hello*"} becomes {@code "* hello *"}.
+     */
+    @Test
+    public void itShouldSplitEdgePunctuation() {
+        assertEquals("", context.splitEdgePunctuation(""));
+
+        final String split = context.splitEdgePunctuation("*hello*");
+        assertEquals("* hello *", split);
+    }
+
+    /** Splitting edge punctuation on {@code null} is expected to fail with a {@link NullPointerException}. */
+    @Test(expected = NullPointerException.class)
+    public void itShouldThrowNullPointerWhenSplittingEdgePunctOnNull() {
+        // Must exercise splitEdgePunctuation; squeezeWhitespace(null) has its own test in this class.
+        context.splitEdgePunctuation(null);
+    }
+}
diff --git a/test/cmu/arktweetnlp/util/EmojiUtilTest.java b/test/cmu/arktweetnlp/util/EmojiUtilTest.java
new file mode 100644
index 0000000..04cd845
--- /dev/null
+++ b/test/cmu/arktweetnlp/util/EmojiUtilTest.java
@@ -0,0 +1,59 @@
+package cmu.arktweetnlp.util;
+
+import org.junit.Test;
+import org.pirkaengine.mobile.Emoji;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import org.apache.commons.lang3.tuple.Pair;
+import static org.junit.Assert.*;
+
+/**
+ * Unit tests for {@link EmojiUtil#filterEmoji(String)}.
+ */
+public class EmojiUtilTest {
+
+    /** An empty string yields empty text and no emojis. */
+    @Test
+    public void itShouldProduceEmptyOutputForEmptyInput() {
+        final Pair<String, List<Emoji>> emojis = EmojiUtil.filterEmoji("");
+        assertTrue(emojis.getLeft().isEmpty());
+        assertTrue(emojis.getRight().isEmpty());
+    }
+
+    /** {@code null} input is handled like the empty string rather than throwing. */
+    @Test
+    public void itShouldProduceEmptyOutputForNullInput() {
+        final Pair<String, List<Emoji>> emojis = EmojiUtil.filterEmoji(null);
+        assertTrue(emojis.getLeft().isEmpty());
+        assertTrue(emojis.getRight().isEmpty());
+    }
+
+    /**
+     * Known emojis are removed from the text and reported in order of appearance,
+     * whether they sit at the end of the text, inside a word, or between words.
+     */
+    @Test
+    public void itShouldExtractEmojisFromText() {
+        //"RT @Annam1181orM: @LotAgar @Dom70Bcn @paquifer1969 @V_alf_V @MnicaRebullCome PPS\uD83D\uDC7EE=\uD83D\uDC01Tots!Son\uD83D\uDC01\uD83D\uDC00=FRANKISTESfeixistesQ\uD83D\uDC00varen\uD83D\uDC01MATAR×ODI aCAT➡Ca…";
+
+        Pair<String, List<Emoji>> emojis = EmojiUtil.filterEmoji("Hello \uD83D\uDC7E");
+        assertEmojiEquals("Hello ", Arrays.asList(Emoji.ALIEN_MONSTER), emojis);
+
+        emojis = EmojiUtil.filterEmoji("He\uD83D\uDC7Ello");
+        assertEmojiEquals("Hello", Arrays.asList(Emoji.ALIEN_MONSTER), emojis);
+
+        emojis = EmojiUtil.filterEmoji("This has 2 \uD83D\uDC7E \uD83D\uDC7D emojis!");
+        // Only the emojis are removed, so the three spaces that surrounded them remain.
+        assertEmojiEquals("This has 2   emojis!", Arrays.asList(Emoji.ALIEN_MONSTER, Emoji.EXTRATERRESTRIAL_ALIEN), emojis);
+    }
+
+    /**
+     * A supplementary character that is not a known emoji stays in the text, while a
+     * known emoji in the same string is still extracted.
+     */
+    @Test
+    public void itShouldLeaveUnknownEmojisUntouched() {
+        final String text = "The following unicode characters are not a known emoji:\uD83D\uDC01. But this one is: \u27A1!";
+        final Pair<String, List<Emoji>> emojis = EmojiUtil.filterEmoji(text);
+        assertEmojiEquals("The following unicode characters are not a known emoji:\uD83D\uDC01. But this one is: !", Arrays.asList(Emoji.BLACK_RIGHTWARDS_ARROW), emojis);
+    }
+
+    /**
+     * Asserts that {@code actual} is non-null, carries {@code expectedText} on the left
+     * and exactly {@code expectedEmoji}, in the same order, on the right.
+     */
+    private void assertEmojiEquals(final String expectedText, final Collection<Emoji> expectedEmoji, final Pair<String, List<Emoji>> actual) {
+        assertNotNull(actual);
+        assertEquals(expectedText, actual.getLeft());
+        // Checked separately so a count mismatch is reported before the full list diff.
+        assertEquals(expectedEmoji.size(), actual.getRight().size());
+        assertEquals(expectedEmoji, actual.getRight());
+    }
+}