diff --git a/ark-tweet-nlp/pom.xml b/ark-tweet-nlp/pom.xml index 526bbfb..1c3597e 100644 --- a/ark-tweet-nlp/pom.xml +++ b/ark-tweet-nlp/pom.xml @@ -10,6 +10,7 @@ UTF-8 + @@ -27,17 +28,30 @@ ${basedir}/../src + + + add-test-source + generate-test-sources + + add-test-source + + + + ${basedir}/../test + + + - + org.apache.maven.plugins maven-shade-plugin 1.6 - package + none shade @@ -50,7 +64,7 @@ - + org.apache.maven.plugins maven-compiler-plugin @@ -108,11 +122,11 @@ **/*.jar - + - + @@ -133,9 +147,9 @@ 1.4 - commons-lang - commons-lang - 2.3 + org.apache.commons + commons-lang3 + 3.4 org.apache.commons @@ -162,6 +176,13 @@ guava 10.0.1 + + + org.pirkaengine + pirka-mobile + 0.3.0 + + junit @@ -181,7 +202,23 @@ jackson-databind 2.0.0 - + + + edu.stanford.nlp + stanford-corenlp + 1.2.0 + + + edu.berkeley.nlp + berkeleyparser + r32 + + + net.sf.trove4j + trove4j + 3.0.3 + + diff --git a/src/cmu/arktweetnlp/EmojiExtractor.java b/src/cmu/arktweetnlp/EmojiExtractor.java new file mode 100644 index 0000000..259b053 --- /dev/null +++ b/src/cmu/arktweetnlp/EmojiExtractor.java @@ -0,0 +1,13 @@ +package cmu.arktweetnlp; + +import org.apache.commons.lang3.tuple.Pair; +import org.pirkaengine.mobile.Emoji; +import java.util.List; + + +/** + * Interface for objects that know how to extract emojis from text. + */ +public interface EmojiExtractor { + public Pair> extractEmojis(final String text); +} diff --git a/src/cmu/arktweetnlp/PatternContext.java b/src/cmu/arktweetnlp/PatternContext.java new file mode 100644 index 0000000..9bb0e18 --- /dev/null +++ b/src/cmu/arktweetnlp/PatternContext.java @@ -0,0 +1,44 @@ +package cmu.arktweetnlp; + +import java.util.regex.Pattern; + +/** + * Interface for the collection of Patterns needed by the Twokenize module. + * Allows users to pass in a custom set of patterns or use the DefaultPatternContext bundled with the library. 
+ */ +public interface PatternContext { + /** + * @return A pattern that can be used to detect contractions + */ + public Pattern getContractionPattern(); + + /** + * @return A pattern that can be used to detect whitespace + */ + public Pattern getWhitespacePattern(); + + /** + * @return A pattern that can be used to detect any desired + * "protected" tokens -- tokens that should not be split any further. + */ + public Pattern getProtectedTokenPattern(); + + /** + * @return A pattern that can be used to detect left edge punctuation + */ + public Pattern getLeftEdgePunctuationPattern(); + + /** + * @return A pattern that can be used to detect right edge punctuation + */ + public Pattern getRightEdgePunctuationPattern(); + + + public String splitEdgePunctuation(String input); + + /** + * @return Trims multiple consecutive white spaces into a single + * space. E.g. "foo bar " => "foo bar" + */ + public String squeezeWhitespace(String input); +} diff --git a/src/cmu/arktweetnlp/TokenCategorizer.java b/src/cmu/arktweetnlp/TokenCategorizer.java new file mode 100644 index 0000000..ebc45ec --- /dev/null +++ b/src/cmu/arktweetnlp/TokenCategorizer.java @@ -0,0 +1,15 @@ +package cmu.arktweetnlp; + + +import org.pirkaengine.mobile.Emoji; +import java.util.*; + +/** + * Interface for objects that know how to group tokens output from Twokenize + * into categories of a given type. E.g. mapping certain token types to an enum. 
+ * @param + */ +public interface TokenCategorizer { + public Map> categorize(final String text, final List> splitTokens, final List> protectedTokens, final List emojis); +} + diff --git a/src/cmu/arktweetnlp/Twokenize.java b/src/cmu/arktweetnlp/Twokenize.java index 4397a40..285db24 100644 --- a/src/cmu/arktweetnlp/Twokenize.java +++ b/src/cmu/arktweetnlp/Twokenize.java @@ -4,17 +4,23 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintStream; +import java.util.Map; import java.util.regex.*; import java.util.Arrays; import java.util.List; import java.util.ArrayList; -import org.apache.commons.lang.StringEscapeUtils; +import cmu.arktweetnlp.impl.DefaultPatternContext; +import cmu.arktweetnlp.impl.NoOpEmojiExtractor; +import org.apache.commons.lang3.StringEscapeUtils; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.pirkaengine.mobile.Emoji; /** * Twokenize -- a tokenizer designed for Twitter text in English and some other European languages. * This is the Java version. If you want the old Python version, see: http://github.com/brendano/tweetmotif - * + * * This tokenizer code has gone through a long history: * * (1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif @@ -24,208 +30,102 @@ * (2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger * (2b) Jason Baldridge and David Snyder ported it to Scala * (3) Brendan bugfixed the Scala port and merged with POS-specific changes - * for the CMU ARK Twitter POS Tagger + * for the CMU ARK Twitter POS Tagger * (4) Tobi Owoputi ported it back to Java and added many improvements (2012-06) - * + * * Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP * * There have been at least 2 other Java ports, but they are not in the lineage for the code here. 
*/ public class Twokenize { - static Pattern Contractions = Pattern.compile("(?i)(\\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$"); - static Pattern Whitespace = Pattern.compile("[\\s\\p{Zs}]+"); - - static String punctChars = "['\"“”‘’.?!…,:;]"; - //static String punctSeq = punctChars+"+"; //'anthem'. => ' anthem '. - static String punctSeq = "['\"“”‘’]+|[.?!,…]+|[:;]+"; //'anthem'. => ' anthem ' . - static String entity = "&(?:amp|lt|gt|quot);"; - // URLs - - // BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong. - // If you actually empirically test it the results are bad. - // Please see https://github.com/brendano/ark-tweet-nlp/pull/9 - - static String urlStart1 = "(?:https?://|\\bwww\\.)"; - static String commonTLDs = "(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"; - static String ccTLDs = "(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + - "bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + - "er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + - "hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + - "lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + - "nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + - "sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + - "va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)"; //TODO: remove obscure country domains? - static String urlStart2 = "\\b(?:[A-Za-z\\d-])+(?:\\.[A-Za-z0-9]+){0,3}\\." 
+ "(?:"+commonTLDs+"|"+ccTLDs+")"+"(?:\\."+ccTLDs+")?(?=\\W|$)"; - static String urlBody = "(?:[^\\.\\s<>][^\\s<>]*?)?"; - static String urlExtraCrapBeforeEnd = "(?:"+punctChars+"|"+entity+")+?"; - static String urlEnd = "(?:\\.\\.+|[<>]|\\s|$)"; - public static String url = "(?:"+urlStart1+"|"+urlStart2+")"+urlBody+"(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"; - - - // Numeric - static String timeLike = "\\d+(?::\\d+){1,2}"; - //static String numNum = "\\d+\\.\\d+"; - static String numberWithCommas = "(?:(?> splitTokens; + private final List> preservedTokens; + private final List emojis; + + public TwokenizedTweet(String originalText, List> splitGoodTokens, List> badTokens, List emojis) { + this.originalText = originalText; + this.splitTokens = splitGoodTokens; + this.preservedTokens = badTokens; + this.emojis = emojis; + } + + /** + * @return The original Tweet text + */ + public String getOriginalText() { + return originalText; + } + + /** + * @return The tokens that were split into single tokens + */ + public List> getSplitTokens() { + return splitTokens; + } + + /** + * @return The preserved tokens that were not split up. E.g. what's, http://test.com, etc. 
+ */ + public List> getPreservedTokens() { + return preservedTokens; + } + + /** + * @return The emojis detected in the tweet + */ + public List getEmojis() { + return emojis; } - sb.append(")"); - return sb.toString(); - } - - // Emoticons - static String normalEyes = "(?iu)[:=]"; // 8 and x are eyes but cause problems - static String wink = "[;]"; - static String noseArea = "(?:|-|[^a-zA-Z0-9 ])"; // doesn't get :'-( - static String happyMouths = "[D\\)\\]\\}]+"; - static String sadMouths = "[\\(\\[\\{]+"; - static String tongue = "[pPd3]+"; - static String otherMouths = "(?:[oO]+|[/\\\\]+|[vV]+|[Ss]+|[|]+)"; // remove forward slash if http://'s aren't cleaned - - // mouth repetition examples: - // @aliciakeys Put it in a love song :-)) - // @hellocalyclops =))=))=)) Oh well - - static String bfLeft = "(♥|0|o|°|v|\\$|t|x|;|\\u0CA0|@|ʘ|•|・|◕|\\^|¬|\\*)"; - static String bfCenter = "(?:[\\.]|[_-]+)"; - static String bfRight = "\\2"; - static String s3 = "(?:--['\"])"; - static String s4 = "(?:<|<|>|>)[\\._-]+(?:<|<|>|>)"; - static String s5 = "(?:[.][_]+[.])"; - static String basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5; - - static String eeLeft = "[\\\\\ƪԄ\\((<>;ヽ\\-=~\\*]+"; - static String eeRight= "[\\-=\\);'\\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+"; - static String eeSymbol = "[^A-Za-z0-9\\s\\(\\)\\*:=-]"; - static String eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight; - - - public static String emoticon = OR( - // Standard version :) :( :] :D :P - "(?:>|>)?" 
+ OR(normalEyes, wink) + OR(noseArea,"[Oo]") + - OR(tongue+"(?=\\W|$|RT|rt|Rt)", otherMouths+"(?=\\W|$|RT|rt|Rt)", sadMouths, happyMouths), - - // reversed version (: D: use positive lookbehind to remove "(word):" - // because eyes on the right side is more ambiguous with the standard usage of : ; - "(?<=(?: |^))" + OR(sadMouths,happyMouths,otherMouths) + noseArea + OR(normalEyes, wink) + "(?:<|<)?", - - //inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style - eastEmote.replaceFirst("2", "1"), basicface - // iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb] - // TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this - ); - - static String Hearts = "(?:<+/?3+)+"; //the other hearts are in decorations - - static String Arrows = "(?:<*[-―—=]*>+|<+[-―—=]*>*)|\\p{InArrows}+"; - - // BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes - // "hello (#hashtag)" ==> "hello (#hashtag )" WRONG - // "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT - // "hello (@person)" ==> "hello (@person )" WRONG - // "hello (@person)" ==> "hello ( @person )" RIGHT - // ... Some sort of weird interaction with edgepunct I guess, because edgepunct - // has poor content-symbol detection. - - // This also gets #1 #40 which probably aren't hashtags .. but good as tokens. - // If you want good hashtag identification, use a different regex. 
- static String Hashtag = "#[a-zA-Z0-9_]+"; //optional: lookbehind for \b - //optional: lookbehind for \b, max length 15 - static String AtMention = "[@@][a-zA-Z0-9_]+"; - - // I was worried this would conflict with at-mentions - // but seems ok in sample of 5800: 7 changes all email fixes - // http://www.regular-expressions.info/email.html - static String Bound = "(?:\\W|^|$)"; - public static String Email = "(?<=" +Bound+ ")[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,4}(?=" +Bound+")"; - - // We will be tokenizing using these regexps as delimiters - // Additionally, these things are "protected", meaning they shouldn't be further split themselves. - static Pattern Protected = Pattern.compile( - OR( - Hearts, - url, - Email, - timeLike, - //numNum, - numberWithCommas, - numComb, - emoticon, - Arrows, - entity, - punctSeq, - arbitraryAbbrev, - separators, - decorations, - embeddedApostrophe, - Hashtag, - AtMention - )); - - // Edge punctuation - // Want: 'foo' => ' foo ' - // While also: don't => don't - // the first is considered "edge punctuation". - // the second is word-internal punctuation -- don't want to mess with it. - // BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days. - // I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate. - - // Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes) - static String edgePunctChars = "'\"“”‘’«»{}\\(\\)\\[\\]\\*&"; //add \\p{So}? 
(symbols) - static String edgePunct = "[" + edgePunctChars + "]"; - static String notEdgePunct = "[a-zA-Z0-9]"; // content characters - static String offEdge = "(^|$|:|;|\\s|\\.|,)"; // colon here gets "(hello):" ==> "( hello ):" - static Pattern EdgePunctLeft = Pattern.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")"); - static Pattern EdgePunctRight = Pattern.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge); - - public static String splitEdgePunct (String input) { - Matcher m1 = EdgePunctLeft.matcher(input); - input = m1.replaceAll("$1$2 $3"); - m1 = EdgePunctRight.matcher(input); - input = m1.replaceAll("$1 $2$3"); - return input; - } - - private static class Pair { - public T1 first; - public T2 second; - public Pair(T1 x, T2 y) { first=x; second=y; } } + // The main work of tokenizing a tweet. - private static List simpleTokenize (String text) { + private static List simpleTokenize (final String text, final PatternContext patterns, final EmojiExtractor emojiExtractor) { + final TwokenizedTweet twokenizedTweet = tokenizeTweet(text, patterns, emojiExtractor); + + // Reinterpolate the 'good' and 'bad' Lists, ensuring that + // additonal tokens from last good item get included + List zippedStr= new ArrayList(); + int i; + for(i=0; i < twokenizedTweet.getPreservedTokens().size(); i++) { + zippedStr = addAllnonempty(zippedStr, twokenizedTweet.getSplitTokens().get(i)); + zippedStr = addAllnonempty(zippedStr,twokenizedTweet.getPreservedTokens().get(i)); + } + zippedStr = addAllnonempty(zippedStr,twokenizedTweet.getSplitTokens().get(i)); + + // BTO: our POS tagger wants "ur" and "you're" to both be one token. 
+ // Uncomment to get "you 're" + /*ArrayList splitStr = new ArrayList(zippedStr.size()); + for(String tok:zippedStr) + splitStr.addAll(splitToken(tok)); + zippedStr=splitStr;*/ + return zippedStr; + } + + protected static TwokenizedTweet tokenizeTweet(final String text, final PatternContext patterns, final EmojiExtractor emojiExtractor) { // Do the no-brainers first - String splitPunctText = splitEdgePunct(text); + String splitPunctText = patterns.splitEdgePunctuation(text); int textLength = splitPunctText.length(); - + // BTO: the logic here got quite convoluted via the Scala porting detour // It would be good to switch back to a nice simple procedural style like in the Python version // ... Scala is such a pain. Never again. // Find the matches for subsequences that should be protected, // e.g. URLs, 1.0, U.N.K.L.E., 12:53 - Matcher matches = Protected.matcher(splitPunctText); - //Storing as List[List[String]] to make zip easier later on + Matcher matches = patterns.getProtectedTokenPattern().matcher(splitPunctText); + //Storing as List[List[String]] to make zip easier later on List> bads = new ArrayList>(); //linked list? List> badSpans = new ArrayList>(); while(matches.find()){ @@ -234,52 +134,44 @@ private static List simpleTokenize (String text) { List bad = new ArrayList(1); bad.add(splitPunctText.substring(matches.start(),matches.end())); bads.add(bad); - badSpans.add(new Pair(matches.start(),matches.end())); + badSpans.add(new ImmutablePair(matches.start(),matches.end())); } } // Create a list of indices to create the "goods", which can be - // split. We are taking "bad" spans like - // List((2,5), (8,10)) - // to create + // split. 
We are taking "bad" spans like + // List((2,5), (8,10)) + // to create /// List(0, 2, 5, 8, 10, 12) // where, e.g., "12" here would be the textLength // has an even length and no indices are the same List indices = new ArrayList(2+2*badSpans.size()); indices.add(0); for(Pair p:badSpans){ - indices.add(p.first); - indices.add(p.second); + indices.add(p.getLeft()); + indices.add(p.getRight()); } indices.add(textLength); // Group the indices and map them to their respective portion of the string List> splitGoods = new ArrayList>(indices.size()/2); + final List emojis = new ArrayList(); for (int i=0; i> goodStrAndEmojis = emojiExtractor.extractEmojis(goodstr); + goodstr = goodStrAndEmojis.getLeft(); + + emojis.addAll(goodStrAndEmojis.getRight()); + List splitstr = Arrays.asList(goodstr.trim().split(" ")); splitGoods.add(splitstr); } - // Reinterpolate the 'good' and 'bad' Lists, ensuring that - // additonal tokens from last good item get included - List zippedStr= new ArrayList(); - int i; - for(i=0; i < bads.size(); i++) { - zippedStr = addAllnonempty(zippedStr,splitGoods.get(i)); - zippedStr = addAllnonempty(zippedStr,bads.get(i)); - } - zippedStr = addAllnonempty(zippedStr,splitGoods.get(i)); - - // BTO: our POS tagger wants "ur" and "you're" to both be one token. 
- // Uncomment to get "you 're" - /*ArrayList splitStr = new ArrayList(zippedStr.size()); - for(String tok:zippedStr) - splitStr.addAll(splitToken(tok)); - zippedStr=splitStr;*/ - - return zippedStr; - } + + return new TwokenizedTweet(text, splitGoods, bads, emojis); + } private static List addAllnonempty(List master, List smaller){ for (String s : smaller){ @@ -289,42 +181,38 @@ private static List addAllnonempty(List master, List sma } return master; } - /** "foo bar " => "foo bar" */ - public static String squeezeWhitespace (String input){ - return Whitespace.matcher(input).replaceAll(" ").trim(); - } - - // Final pass tokenization based on special patterns - private static List splitToken (String token) { - Matcher m = Contractions.matcher(token); - if (m.find()){ - String[] contract = {m.group(1), m.group(2)}; - return Arrays.asList(contract); - } - String[] contract = {token}; - return Arrays.asList(contract); + /** Assume 'text' has no HTML escaping. **/ + public static List tokenize(String text) { + return simpleTokenize(DEFAULT_PATTERN_CONTEXT.squeezeWhitespace(text), DEFAULT_PATTERN_CONTEXT, DEFAULT_EMOJI_EXTRACTOR); } /** Assume 'text' has no HTML escaping. **/ - public static List tokenize(String text){ - return simpleTokenize(squeezeWhitespace(text)); + public static List tokenize(final String text, final PatternContext patternContext, final EmojiExtractor emojiExtractor) { + return simpleTokenize(patternContext.squeezeWhitespace(text), patternContext, emojiExtractor); } + /** + * Tokenizes the given text and applies the given categorization function to categorize the tokens into groups + */ + public static Map> tokenizeIntoCategories(final String text, final TokenCategorizer categorizer) { + return tokenizeIntoCategories(text, categorizer, DEFAULT_PATTERN_CONTEXT, DEFAULT_EMOJI_EXTRACTOR); + } /** - * Twitter text comes HTML-escaped, so unescape it. - * We also first unescape &'s, in case the text has been buggily double-escaped. 
+ * Same as tokenizeIntoCategories but uses a custom PatternContext and EmojiExtractor */ - public static String normalizeTextForTagger(String text) { - text = text.replaceAll("&", "&"); - text = StringEscapeUtils.unescapeHtml(text); - return text; + public static Map> tokenizeIntoCategories(final String text, final TokenCategorizer categorizer, final PatternContext patterns, final EmojiExtractor emojiExtractor) { + final String cleaned = patterns.squeezeWhitespace(text); + final TwokenizedTweet twokenizedTweet = tokenizeTweet(cleaned, patterns, emojiExtractor); + final Map> tokenCategories = categorizer.categorize(twokenizedTweet.getOriginalText(), twokenizedTweet.getSplitTokens(), twokenizedTweet.getPreservedTokens(), twokenizedTweet.getEmojis()); + return tokenCategories; } + /** * This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger. - * + * * This function normalizes the input text BEFORE calling the tokenizer. * So the tokens you get back may not exactly correspond to * substrings of the original text. @@ -334,6 +222,16 @@ public static List tokenizeRawTweetText(String text) { return tokens; } + /** + * Twitter text comes HTML-escaped, so unescape it. + * We also first unescape &'s, in case the text has been buggily double-escaped. + */ + public static String normalizeTextForTagger(String text) { + text = text.replaceAll("&", "&"); + text = StringEscapeUtils.unescapeHtml4(text); + return text; + } + /** Tokenizes tweet texts on standard input, tokenizations on standard output. Input and output UTF-8. 
*/ public static void main(String[] args) throws IOException { BufferedReader input = new BufferedReader(new InputStreamReader(System.in,"UTF-8")); @@ -350,5 +248,5 @@ public static void main(String[] args) throws IOException { output.print("\n"); } } - + } diff --git a/src/cmu/arktweetnlp/impl/DefaultPatternContext.java b/src/cmu/arktweetnlp/impl/DefaultPatternContext.java new file mode 100644 index 0000000..362ca4b --- /dev/null +++ b/src/cmu/arktweetnlp/impl/DefaultPatternContext.java @@ -0,0 +1,226 @@ +package cmu.arktweetnlp.impl; + +import cmu.arktweetnlp.PatternContext; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class DefaultPatternContext implements PatternContext { + + public static Pattern Contractions = Pattern.compile("(?i)(\\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$"); + public static Pattern Whitespace = Pattern.compile("[\\s\\p{Zs}]+"); + + public static String punctChars = "['\"“”‘’.?!…,:;]"; + //public static String punctSeq = punctChars+"+"; //'anthem'. => ' anthem '. + public static String punctSeq = "['\"“”‘’]+|[.?!,…]+|[:;]+"; //'anthem'. => ' anthem ' . + public static String entity = "&(?:amp|lt|gt|quot);"; + // URLs + + // BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong. + // If you actually empirically test it the results are bad. 
+ // Please see https://github.com/brendano/ark-tweet-nlp/pull/9 + + public static String urlStart1 = "(?:https?://|\\bwww\\.)"; + public static String commonTLDs = "(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"; + public static String ccTLDs = "(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + + "bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + + "er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + + "hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + + "lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + + "nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + + "sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + + "va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)"; //TODO: remove obscure country domains? + public static String urlStart2 = "\\b(?:[A-Za-z\\d-])+(?:\\.[A-Za-z0-9]+){0,3}\\." 
+ "(?:"+commonTLDs+"|"+ccTLDs+")"+"(?:\\."+ccTLDs+")?(?=\\W|$)"; + public static String urlBody = "(?:[^\\.\\s<>][^\\s<>]*?)?"; + public static String urlExtraCrapBeforeEnd = "(?:"+punctChars+"|"+entity+")+?"; + public static String urlEnd = "(?:\\.\\.+|[<>]|\\s|$)"; + public static String url = "(?:"+urlStart1+"|"+urlStart2+")"+urlBody+"(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"; + + + // Numeric + public static String timeLike = "\\d+(?::\\d+){1,2}"; + //public static String numNum = "\\d+\\.\\d+"; + public static String numberWithCommas = "(?:(?|>)[\\._-]+(?:<|<|>|>)"; + public static String s5 = "(?:[.][_]+[.])"; + public static String basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5; + + public static String eeLeft = "[\\\\\ƪԄ\\((<>;ヽ\\-=~\\*]+"; + public static String eeRight= "[\\-=\\);'\\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+"; + public static String eeSymbol = "[^A-Za-z0-9\\s\\(\\)\\*:=-]"; + public static String eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight; + + + public static String emoticon = OR( + // Standard version :) :( :] :D :P + "(?:>|>)?" 
+ OR(normalEyes, wink) + OR(noseArea,"[Oo]") + + OR(tongue+"(?=\\W|$|RT|rt|Rt)", otherMouths+"(?=\\W|$|RT|rt|Rt)", sadMouths, happyMouths), + + // reversed version (: D: use positive lookbehind to remove "(word):" + // because eyes on the right side is more ambiguous with the standard usage of : ; + "(?<=(?: |^))" + OR(sadMouths,happyMouths,otherMouths) + noseArea + OR(normalEyes, wink) + "(?:<|<)?", + + //inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style + eastEmote.replaceFirst("2", "1"), basicface + // iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb] + // TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this + ); + + public static String Hearts = "(?:<+/?3+)+"; //the other hearts are in decorations + + public static String Arrows = "(?:<*[-―—=]*>+|<+[-―—=]*>*)|\\p{InArrows}+"; + + // BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes + // "hello (#hashtag)" ==> "hello (#hashtag )" WRONG + // "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT + // "hello (@person)" ==> "hello (@person )" WRONG + // "hello (@person)" ==> "hello ( @person )" RIGHT + // ... Some sort of weird interaction with edgepunct I guess, because edgepunct + // has poor content-symbol detection. + + // This also gets #1 #40 which probably aren't hashtags .. but good as tokens. + // If you want good hashtag identification, use a different regex. 
+ public static String Hashtag = "#[a-zA-Z0-9_]+"; //optional: lookbehind for \b + //optional: lookbehind for \b, max length 15 + public static String AtMention = "[@@][a-zA-Z0-9_]+"; + + // I was worried this would conflict with at-mentions + // but seems ok in sample of 5800: 7 changes all email fixes + // http://www.regular-expressions.info/email.html + public static String Bound = "(?:\\W|^|$)"; + public static String Email = "(?<=" +Bound+ ")[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,4}(?=" +Bound+")"; + + // We will be tokenizing using these regexps as delimiters + // Additionally, these things are "protected", meaning they shouldn't be further split themselves. + public static Pattern Protected = Pattern.compile( + OR( + Hearts, + url, + Email, + timeLike, + //numNum, + numberWithCommas, + numComb, + emoticon, + Arrows, + entity, + punctSeq, + arbitraryAbbrev, + separators, + decorations, + embeddedApostrophe, + Hashtag, + AtMention + )); + + // Edge punctuation + // Want: 'foo' => ' foo ' + // While also: don't => don't + // the first is considered "edge punctuation". + // the second is word-internal punctuation -- don't want to mess with it. + // BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days. + // I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate. + + // Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes) + public static String edgePunctChars = "'\"“”‘’«»{}\\(\\)\\[\\]\\*&"; //add \\p{So}? 
(symbols) + public static String edgePunct = "[" + edgePunctChars + "]"; + public static String notEdgePunct = "[a-zA-Z0-9]"; // content characters + public static String offEdge = "(^|$|:|;|\\s|\\.|,)"; // colon here gets "(hello):" ==> "( hello ):" + public static Pattern EdgePunctLeft = Pattern.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")"); + public static Pattern EdgePunctRight = Pattern.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge); + + @Override + public Pattern getContractionPattern() { + return Contractions; + } + + @Override + public Pattern getWhitespacePattern() { + return Whitespace; + } + + @Override + public Pattern getProtectedTokenPattern() { + return Protected; + } + + @Override + public Pattern getLeftEdgePunctuationPattern() { + return EdgePunctLeft; + } + + @Override + public Pattern getRightEdgePunctuationPattern() { + return EdgePunctRight; + } + + + public String splitEdgePunctuation(String input) { + Matcher m1 = getLeftEdgePunctuationPattern().matcher(input); + input = m1.replaceAll("$1$2 $3"); + m1 = getRightEdgePunctuationPattern().matcher(input); + input = m1.replaceAll("$1 $2$3"); + return input; + } + + /** "foo bar " => "foo bar" */ + public String squeezeWhitespace (String input){ + return getWhitespacePattern().matcher(input).replaceAll(" ").trim(); + } + + // Final pass tokenization based on special patterns + public List splitToken (String token) { + + Matcher m = getContractionPattern().matcher(token); + if (m.find()){ + String[] contract = {m.group(1), m.group(2)}; + return Arrays.asList(contract); + } + String[] contract = {token}; + return Arrays.asList(contract); + } +} diff --git a/src/cmu/arktweetnlp/impl/EmojiExtractorImpl.java b/src/cmu/arktweetnlp/impl/EmojiExtractorImpl.java new file mode 100644 index 0000000..1602f2f --- /dev/null +++ b/src/cmu/arktweetnlp/impl/EmojiExtractorImpl.java @@ -0,0 +1,15 @@ +package cmu.arktweetnlp.impl; + +import cmu.arktweetnlp.EmojiExtractor; +import 
cmu.arktweetnlp.util.EmojiUtil; +import org.apache.commons.lang3.tuple.Pair; +import org.pirkaengine.mobile.Emoji; + +import java.util.List; + +public class EmojiExtractorImpl implements EmojiExtractor { + @Override + public Pair> extractEmojis(final String text) { + return EmojiUtil.filterEmoji(text); + } +} diff --git a/src/cmu/arktweetnlp/impl/NoOpEmojiExtractor.java b/src/cmu/arktweetnlp/impl/NoOpEmojiExtractor.java new file mode 100644 index 0000000..a266fed --- /dev/null +++ b/src/cmu/arktweetnlp/impl/NoOpEmojiExtractor.java @@ -0,0 +1,16 @@ +package cmu.arktweetnlp.impl; + +import cmu.arktweetnlp.EmojiExtractor; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.pirkaengine.mobile.Emoji; + +import java.util.Collections; +import java.util.List; + +public class NoOpEmojiExtractor implements EmojiExtractor { + @Override + public Pair> extractEmojis(String text) { + return new ImmutablePair>(text, Collections.emptyList()); + } +} diff --git a/src/cmu/arktweetnlp/impl/features/FeatureUtil.java b/src/cmu/arktweetnlp/impl/features/FeatureUtil.java index ed40bd2..184d815 100644 --- a/src/cmu/arktweetnlp/impl/features/FeatureUtil.java +++ b/src/cmu/arktweetnlp/impl/features/FeatureUtil.java @@ -8,6 +8,7 @@ import cmu.arktweetnlp.Twokenize; +import cmu.arktweetnlp.impl.DefaultPatternContext; import com.twitter.Regex; @@ -16,7 +17,7 @@ **/ public class FeatureUtil { - public static Pattern URL = Pattern.compile(Twokenize.OR(Twokenize.url, Twokenize.Email)); + public static Pattern URL = Pattern.compile(DefaultPatternContext.OR(DefaultPatternContext.url, DefaultPatternContext.Email)); public static Pattern justbase = Pattern.compile("(?!www\\.|ww\\.|w\\.|@)[a-zA-Z0-9]+\\.[A-Za-z0-9\\.]+"); // Pattern URL = Pattern.compile(Twokenize.url); diff --git a/src/cmu/arktweetnlp/impl/features/MiscFeatures.java b/src/cmu/arktweetnlp/impl/features/MiscFeatures.java index 96b99ce..42aac5b 100644 --- 
a/src/cmu/arktweetnlp/impl/features/MiscFeatures.java +++ b/src/cmu/arktweetnlp/impl/features/MiscFeatures.java @@ -5,6 +5,7 @@ import java.util.regex.Pattern; import cmu.arktweetnlp.Twokenize; +import cmu.arktweetnlp.impl.DefaultPatternContext; import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface; import cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs; @@ -121,7 +122,7 @@ public static class SimpleOrthFeatures implements FeatureExtractorInterface { /** TODO change to punctuation class, or better from Twokenize **/ //Pattern allPunct = Pattern.compile("^[^a-zA-Z0-9]*$"); Pattern allPunct = Pattern.compile("^\\W*$"); - Pattern emoticon = Pattern.compile(Twokenize.emoticon); + Pattern emoticon = Pattern.compile(DefaultPatternContext.emoticon); public void addFeatures(List tokens, PositionFeaturePairs pairs) { for (int t=0; t < tokens.size(); t++) { String tok = tokens.get(t); @@ -154,8 +155,8 @@ public void addFeatures(List tokens, PositionFeaturePairs pairs) { } } public static class URLFeatures implements FeatureExtractorInterface { - Pattern validURL = Pattern.compile(Twokenize.url); - Pattern validEmail = Pattern.compile(Twokenize.Email); + Pattern validURL = Pattern.compile(DefaultPatternContext.url); + Pattern validEmail = Pattern.compile(DefaultPatternContext.Email); public void addFeatures(List tokens, PositionFeaturePairs pairs) { for (int t=0; t < tokens.size(); t++) { String tok = tokens.get(t); diff --git a/src/cmu/arktweetnlp/impl/features/WordListFeatures.java b/src/cmu/arktweetnlp/impl/features/WordListFeatures.java index e4146d2..cdb6d04 100644 --- a/src/cmu/arktweetnlp/impl/features/WordListFeatures.java +++ b/src/cmu/arktweetnlp/impl/features/WordListFeatures.java @@ -8,6 +8,7 @@ import java.util.regex.Pattern; import cmu.arktweetnlp.Twokenize; +import cmu.arktweetnlp.impl.DefaultPatternContext; import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface; import 
cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs; import cmu.arktweetnlp.util.BasicFileIO; @@ -15,7 +16,7 @@ public class WordListFeatures { public static class POSTagDict implements FeatureExtractorInterface { - Pattern URL = Pattern.compile(Twokenize.url); + Pattern URL = Pattern.compile(DefaultPatternContext.url); Pattern letter = Pattern.compile("[A-Za-z]{3,}"); public void addFeatures(List tokens, PositionFeaturePairs pairs) { for (int t=0; t < tokens.size(); t++) { diff --git a/src/cmu/arktweetnlp/util/EmojiUtil.java b/src/cmu/arktweetnlp/util/EmojiUtil.java new file mode 100644 index 0000000..07cf31c --- /dev/null +++ b/src/cmu/arktweetnlp/util/EmojiUtil.java @@ -0,0 +1,34 @@ +package cmu.arktweetnlp.util; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.pirkaengine.mobile.Emoji; + +import java.util.ArrayList; +import java.util.List; + +public class EmojiUtil { + + public static Pair<String, List<Emoji>> filterEmoji(String text) { + StringBuilder term = new StringBuilder(); + List<Emoji> emojis = new ArrayList<Emoji>(); + + if (text != null && !text.isEmpty()) { + Emoji em; + for (int i = 0; i < text.length(); ) { + final int codePoint = text.codePointAt(i); + em = codePoint < 0x1FFFF ? 
Emoji.charOf(codePoint) : null; + if (null == em) { + final String chars = new String(Character.toChars(codePoint)); + term.append(chars); + } else { + emojis.add(em); + } + i += Character.charCount(codePoint); + } + } + + Pair<String, List<Emoji>> out = new ImmutablePair<String, List<Emoji>>(term.toString(), emojis); + return out; + } +} diff --git a/test/cmu/arktweetnlp/TwokenizeTest.java b/test/cmu/arktweetnlp/TwokenizeTest.java new file mode 100644 index 0000000..9006887 --- /dev/null +++ b/test/cmu/arktweetnlp/TwokenizeTest.java @@ -0,0 +1,105 @@ +package cmu.arktweetnlp; + +import org.junit.Test; + +import java.util.*; + +import static org.junit.Assert.*; + +public class TwokenizeTest { + + public static final String INPUT_1 = "What's the greatest invention of all time? — Tumblr. http://t.co/IPZPnKqVk2"; + public static final List EXPECTED_TOKENS_1 = Arrays.asList("What's", "the", "greatest", "invention", "of", "all", "time", "?", "—", "Tumblr", ".", "http://t.co/IPZPnKqVk2"); + + public static final String INPUT_2 = "Looking for Apple Mac repairers near Naas....Anyone able to help? #kildare https://t.co/1lhLT6EtWs"; + public static final List EXPECTED_TOKENS_2 = Arrays.asList("Looking", "for", "Apple", "Mac", "repairers", "near", "Naas", "....", "Anyone", "able", "to", "help", "?", "#kildare", "https://t.co/1lhLT6EtWs"); + + public static final String INPUT_3 = "RT @EKM94: The best thing I've seen on Twitter all day. http://t.co/lhYh13jUD0"; + public static final List EXPECTED_TOKENS_3 = Arrays.asList("RT", "@EKM94", ":", "The", "best", "thing", "I've", "seen", "on", "Twitter", "all", "day", ".", "http://t.co/lhYh13jUD0"); + + public static final String INPUT_4 = "Butterball Turkey Bacon Only $.54 At Walgreens! via Couponing For 4 - Starting the week of 6/28, ... 
http://t.co/0AdaJsqwIR"; + public static final List EXPECTED_TOKENS_4 = Arrays.asList("Butterball", "Turkey", "Bacon", "Only", "$", ".", "54", "At", "Walgreens", "!", "via", "Couponing", "For", "4", "-", "Starting", "the", "week", "of", "6/28", ",", "...", "http://t.co/0AdaJsqwIR"); + + public static final String INPUT_5 = "RT @beingactress: ♥Taking joy in living is a woman’s best cosmetic♥ @actressharshika http://t.co/AF8Bl69Uyu"; + public static final List EXPECTED_TOKENS_5 = Arrays.asList("RT", "@beingactress", ":", "♥", "Taking", "joy", "in", "living", "is", "a", "woman’s", "best", "cosmetic", "♥", "@actressharshika", "http://t.co/AF8Bl69Uyu"); + + public static final String INPUT_6 = "@larysaG Well thanks! Making me feel better already lol I'm Nancy btw :) Nice to meet u! I'll try remembering that when I'm terrified there."; + public static final List EXPECTED_TOKENS_6 = Arrays.asList("@larysaG", "Well", "thanks", "!", "Making", "me", "feel", "better", "already", "lol", "I'm", "Nancy", "btw", ":)", "Nice", "to", "meet", "u", "!", "I'll", "try", "remembering", "that", "when", "I'm", "terrified", "there", "."); + + public static final String INPUT_7 = "*✲゚*。✧٩(・ิᴗ・ิ๑)۶ luke hemmings from 5sos you make me happy i love you so much , follow me please?@luke5sos*✲゚*。✧٩(・ิᴗ・ิ๑)۶ 77"; + public static final List EXPECTED_TOKENS_7 = Arrays.asList("*✲゚*", "。✧٩", "(・ิᴗ・ิ๑)", "۶", "luke", "hemmings", "from", "5sos", "you", "make", "me", "happy", "i", "love", "you", "so", "much", ",", "follow", "me", "please", "?", "@luke5sos", "*✲゚*", "。✧٩", "(・ิᴗ・ิ๑)", "۶", "77"); + + + @Test + public void itShouldTokenizeTweets() throws Exception { + final List tokens1 = Twokenize.tokenize(INPUT_1); + assertFalse(tokens1.isEmpty()); + assertEquals(EXPECTED_TOKENS_1.size(), tokens1.size()); + assertEquals(EXPECTED_TOKENS_1, tokens1); + + + + final List tokens2 = Twokenize.tokenize(INPUT_2); + assertFalse(tokens2.isEmpty()); + assertEquals(EXPECTED_TOKENS_2.size(), tokens2.size()); + 
assertEquals(EXPECTED_TOKENS_2, tokens2); + + + final List tokens3 = Twokenize.tokenize(INPUT_3); + assertFalse(tokens3.isEmpty()); + assertEquals(EXPECTED_TOKENS_3.size(), tokens3.size()); + assertEquals(EXPECTED_TOKENS_3, tokens3); + + + final List tokens4 = Twokenize.tokenize(INPUT_4); + assertFalse(tokens4.isEmpty()); + assertEquals(EXPECTED_TOKENS_4.size(), tokens4.size()); + assertEquals(EXPECTED_TOKENS_4, tokens4); + + + final List tokens5 = Twokenize.tokenize(INPUT_5); + assertFalse(tokens5.isEmpty()); + assertEquals(EXPECTED_TOKENS_5.size(), tokens5.size()); + assertEquals(EXPECTED_TOKENS_5, tokens5); + + + final List tokens6 = Twokenize.tokenize(INPUT_6); + assertFalse(tokens6.isEmpty()); + assertEquals(EXPECTED_TOKENS_6.size(), tokens6.size()); + assertEquals(EXPECTED_TOKENS_6, tokens6); + + + final List tokens7 = Twokenize.tokenize(INPUT_7); + assertFalse(tokens7.isEmpty()); + assertEquals(EXPECTED_TOKENS_7.size(), tokens7.size()); + assertEquals(EXPECTED_TOKENS_7, tokens7); + } + + @Test + public void itShouldProduceASingleTokenFromAOneWordTweet() { + final String[] oneTokenInputs = { + "test", + "Test", + "?", + "!", + ".", + "http://test.com", + ":)" + }; + + for (final String input : oneTokenInputs) { + assertEquals(Arrays.asList(input), Twokenize.tokenize(input)); + } + } + + @Test + public void itShouldProduceAnEmptyListOfTokensForTheEmptyString() { + final List tokens = Twokenize.tokenize(""); + assertTrue(tokens.isEmpty()); + } + + @Test(expected = NullPointerException.class) + public void itShouldThrowNullPointerExceptionWithNullInput() { + Twokenize.tokenize(null); + } +} diff --git a/test/cmu/arktweetnlp/impl/DefaultPatternContextTest.java b/test/cmu/arktweetnlp/impl/DefaultPatternContextTest.java new file mode 100644 index 0000000..b9ffbd0 --- /dev/null +++ b/test/cmu/arktweetnlp/impl/DefaultPatternContextTest.java @@ -0,0 +1,38 @@ +package cmu.arktweetnlp.impl; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + 
+public class DefaultPatternContextTest { + + final DefaultPatternContext context = new DefaultPatternContext(); + + @Test + public void itShouldSqeeuzeWhitespace() { + assertEquals("", context.squeezeWhitespace("")); + assertEquals("Hello world.", context.squeezeWhitespace("Hello world.")); + assertEquals("Hello world.", context.squeezeWhitespace("Hello world.")); + } + + @Test(expected = NullPointerException.class) + public void itShouldThrowNullPointerWhenSqueezingNull() { + context.squeezeWhitespace(null); + } + + @Test + public void itShouldSplitEdgePunctuation() { + assertEquals("", context.splitEdgePunctuation("")); + + final String split = context.splitEdgePunctuation("*hello*"); + assertEquals("* hello *", split); + } + + @Test(expected = NullPointerException.class) + public void itShouldThrowNullPointerWhenSplittingEdgePunctOnNull() { + context.splitEdgePunctuation(null); + } + + + +} diff --git a/test/cmu/arktweetnlp/util/EmojiUtilTest.java b/test/cmu/arktweetnlp/util/EmojiUtilTest.java new file mode 100644 index 0000000..04cd845 --- /dev/null +++ b/test/cmu/arktweetnlp/util/EmojiUtilTest.java @@ -0,0 +1,59 @@ +package cmu.arktweetnlp.util; + +import org.junit.Test; +import org.pirkaengine.mobile.Emoji; + +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import org.apache.commons.lang3.tuple.Pair; +import static org.junit.Assert.*; + +public class EmojiUtilTest { + + @Test + public void itShouldProduceEmptyOutputForEmptyInput() { + final Pair<String, List<Emoji>> emojis = EmojiUtil.filterEmoji(""); + assertTrue(emojis.getLeft().isEmpty()); + assertTrue(emojis.getRight().isEmpty()); + } + + @Test + public void itShouldProduceEmptyOutputForNullInput() { + final Pair<String, List<Emoji>> emojis = EmojiUtil.filterEmoji(null); + assertTrue(emojis.getLeft().isEmpty()); + assertTrue(emojis.getRight().isEmpty()); + } + + @Test + public void itShouldExtractEmojisFromText() { + //"RT @Annam1181orM: @LotAgar @Dom70Bcn @paquifer1969 @V_alf_V @MnicaRebullCome 
PPS\uD83D\uDC7EE=\uD83D\uDC01Tots!Son\uD83D\uDC01\uD83D\uDC00=FRANKISTESfeixistesQ\uD83D\uDC00varen\uD83D\uDC01MATAR×ODI aCAT➡Ca…"; + + + Pair<String, List<Emoji>> emojis = null; + + emojis = EmojiUtil.filterEmoji("Hello \uD83D\uDC7E"); + assertEmojiEquals("Hello ", Arrays.asList(Emoji.ALIEN_MONSTER), emojis); + + emojis = EmojiUtil.filterEmoji("He\uD83D\uDC7Ello"); + assertEmojiEquals("Hello", Arrays.asList(Emoji.ALIEN_MONSTER), emojis); + + + emojis = EmojiUtil.filterEmoji("This has 2 \uD83D\uDC7E \uD83D\uDC7D emojis!"); + assertEmojiEquals("This has 2 emojis!", Arrays.asList(Emoji.ALIEN_MONSTER, Emoji.EXTRATERRESTRIAL_ALIEN), emojis); + } + + @Test + public void itShouldLeaveUnknownEmojisUntouched() { + final String text = "The following unicode characters are not a known emoji:\uD83D\uDC01. But this one is: \u27A1!"; + final Pair<String, List<Emoji>> emojis = EmojiUtil.filterEmoji(text); + assertEmojiEquals("The following unicode characters are not a known emoji:\uD83D\uDC01. But this one is: !", Arrays.asList(Emoji.BLACK_RIGHTWARDS_ARROW), emojis); + } + + private void assertEmojiEquals(final String expectedText, final Collection expectedEmoji, final Pair<String, List<Emoji>> actual) { + assertTrue(actual != null); + assertEquals(expectedText, actual.getLeft()); + assertEquals(expectedEmoji.size(), actual.getRight().size()); + assertEquals(expectedEmoji, actual.getRight()); + } +}