diff --git a/ark-tweet-nlp/pom.xml b/ark-tweet-nlp/pom.xml
index 526bbfb..1c3597e 100644
--- a/ark-tweet-nlp/pom.xml
+++ b/ark-tweet-nlp/pom.xml
@@ -10,6 +10,7 @@
     <properties>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     </properties>
+
     <build>
 		<plugins>
              <plugin>
@@ -27,17 +28,30 @@
                       <sources>
                           <source>${basedir}/../src</source>
                       </sources>
+
                     </configuration>
                   </execution>
+                  <execution>
+                        <id>add-test-source</id>
+                        <phase>generate-test-sources</phase>
+                        <goals>
+                            <goal>add-test-source</goal>
+                        </goals>
+                        <configuration>
+                            <sources>
+                                <source>${basedir}/../test</source>
+                            </sources>
+                        </configuration>
+                    </execution>
                 </executions>
-            </plugin>		
+            </plugin>
 			<plugin>
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-shade-plugin</artifactId>
 				<version>1.6</version>
 				<executions>
 					<execution>
-						<phase>package</phase>
+						<phase>none</phase>
 						<goals>
 							<goal>shade</goal>
 						</goals>
@@ -50,7 +64,7 @@
 						</configuration>
 					</execution>
 				</executions>
-			</plugin>	
+			</plugin>
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-compiler-plugin</artifactId>
@@ -108,11 +122,11 @@
 										<include>**/*.jar</include>
 									</includes>
 								</JarResource>
-							</resources>						
+							</resources>
 						</configuration>
 					</execution>
 				</executions>
-			</plugin>			
+			</plugin>
         </plugins>
     </build>
     <repositories>
@@ -133,9 +147,9 @@
             <version>1.4</version>
         </dependency>
 		<dependency>
-			<groupId>commons-lang</groupId>
-			<artifactId>commons-lang</artifactId>
-			<version>2.3</version>
+			<groupId>org.apache.commons</groupId>
+			<artifactId>commons-lang3</artifactId>
+			<version>3.4</version>
 		</dependency>
         <dependency>
             <groupId>org.apache.commons</groupId>
@@ -162,6 +176,13 @@
             <artifactId>guava</artifactId>
             <version>10.0.1</version>
         </dependency>
+
+        <dependency>
+            <groupId>org.pirkaengine</groupId>
+            <artifactId>pirka-mobile</artifactId>
+            <version>0.3.0</version>
+        </dependency>
+
         <!-- START testing dependecies -->
         <dependency>
             <groupId>junit</groupId>
@@ -181,7 +202,23 @@
 		  <artifactId>jackson-databind</artifactId>
 		  <version>2.0.0</version>
 		</dependency>
-		
+
+        <dependency>
+            <groupId>edu.stanford.nlp</groupId>
+            <artifactId>stanford-corenlp</artifactId>
+            <version>1.2.0</version>
+        </dependency>
+        <dependency>
+            <groupId>edu.berkeley.nlp</groupId>
+            <artifactId>berkeleyparser</artifactId>
+            <version>r32</version>
+        </dependency>
+        <dependency>
+            <groupId>net.sf.trove4j</groupId>
+            <artifactId>trove4j</artifactId>
+            <version>3.0.3</version>
+        </dependency>
+
         <!-- END testing dependecies -->
     </dependencies>
 </project>
diff --git a/src/cmu/arktweetnlp/EmojiExtractor.java b/src/cmu/arktweetnlp/EmojiExtractor.java
new file mode 100644
index 0000000..259b053
--- /dev/null
+++ b/src/cmu/arktweetnlp/EmojiExtractor.java
@@ -0,0 +1,13 @@
+package cmu.arktweetnlp;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.pirkaengine.mobile.Emoji;
+import java.util.List;
+
+/**
+ * Interface for objects that know how to extract emojis from text.
+ * Twokenize keeps tokenizing the left element of the returned pair and collects the emojis in its right element.
+ */
+public interface EmojiExtractor {
+    public Pair<String, List<Emoji>> extractEmojis(final String text);
+}
diff --git a/src/cmu/arktweetnlp/PatternContext.java b/src/cmu/arktweetnlp/PatternContext.java
new file mode 100644
index 0000000..9bb0e18
--- /dev/null
+++ b/src/cmu/arktweetnlp/PatternContext.java
@@ -0,0 +1,44 @@
+package cmu.arktweetnlp;
+
+import java.util.regex.Pattern;
+
+/**
+ * Interface for the collection of Patterns needed by the Twokenize module.
+ * Allows users to pass in a custom set of patterns or use the DefaultPatternContext bundled with the library.
+ */
+public interface PatternContext {
+    /**
+     * @return A pattern that can be used to detect contractions
+     */
+    public Pattern getContractionPattern();
+
+    /**
+     * @return A pattern that can be used to detect whitespace
+     */
+    public Pattern getWhitespacePattern();
+
+    /**
+     * @return A pattern that can be used to detect any desired
+     * "protected" tokens -- tokens that should not be split any further.
+     */
+    public Pattern getProtectedTokenPattern();
+
+    /**
+     * @return A pattern that can be used to detect left edge punctuation
+     */
+    public Pattern getLeftEdgePunctuationPattern();
+
+    /**
+     * @return A pattern that can be used to detect right edge punctuation
+     */
+    public Pattern getRightEdgePunctuationPattern();
+
+    /** First pass Twokenize runs on the text; presumably pads edge punctuation apart from words, e.g. 'foo' => ' foo ' (confirm per implementation). */
+    public String splitEdgePunctuation(String input);
+
+    /**
+     * Collapses consecutive whitespace into a single space, e.g. "foo   bar " => "foo bar".
+     * @return the input with its whitespace squeezed
+     */
+    public String squeezeWhitespace(String input);
+}
diff --git a/src/cmu/arktweetnlp/TokenCategorizer.java b/src/cmu/arktweetnlp/TokenCategorizer.java
new file mode 100644
index 0000000..ebc45ec
--- /dev/null
+++ b/src/cmu/arktweetnlp/TokenCategorizer.java
@@ -0,0 +1,15 @@
+package cmu.arktweetnlp;
+
+
+import org.pirkaengine.mobile.Emoji;
+import java.util.*;
+
+/**
+ * Interface for objects that know how to group tokens output from Twokenize
+ * into categories of a given type, e.g. mapping certain token types to an enum.
+ * @param <T> the category type used as the key of the map returned by categorize
+ */
+public interface TokenCategorizer<T> {
+    public Map<T, List<String>> categorize(final String text, final List<List<String>> splitTokens, final List<List<String>> protectedTokens, final List<Emoji> emojis);
+}
+
diff --git a/src/cmu/arktweetnlp/Twokenize.java b/src/cmu/arktweetnlp/Twokenize.java
index 4397a40..285db24 100644
--- a/src/cmu/arktweetnlp/Twokenize.java
+++ b/src/cmu/arktweetnlp/Twokenize.java
@@ -4,17 +4,23 @@
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.PrintStream;
+import java.util.Map;
 import java.util.regex.*;
 import java.util.Arrays;
 import java.util.List;
 import java.util.ArrayList;
 
-import org.apache.commons.lang.StringEscapeUtils;
+import cmu.arktweetnlp.impl.DefaultPatternContext;
+import cmu.arktweetnlp.impl.NoOpEmojiExtractor;
+import org.apache.commons.lang3.StringEscapeUtils;
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.pirkaengine.mobile.Emoji;
 
 /**
  * Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
  * This is the Java version. If you want the old Python version, see: http://github.com/brendano/tweetmotif
- * 
+ *
  * This tokenizer code has gone through a long history:
  *
  * (1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
@@ -24,208 +30,102 @@
  * (2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
  * (2b) Jason Baldridge and David Snyder ported it to Scala
  * (3) Brendan bugfixed the Scala port and merged with POS-specific changes
- *     for the CMU ARK Twitter POS Tagger  
+ *     for the CMU ARK Twitter POS Tagger
  * (4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)
- * 
+ *
  * Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP
  *
  * There have been at least 2 other Java ports, but they are not in the lineage for the code here.
  */
 public class Twokenize {
-    static Pattern Contractions = Pattern.compile("(?i)(\\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$");
-    static Pattern Whitespace = Pattern.compile("[\\s\\p{Zs}]+");
-
-    static String punctChars = "['\"“”‘’.?!…,:;]"; 
-    //static String punctSeq   = punctChars+"+";	//'anthem'. => ' anthem '.
-    static String punctSeq   = "['\"“”‘’]+|[.?!,…]+|[:;]+";	//'anthem'. => ' anthem ' .
-    static String entity     = "&(?:amp|lt|gt|quot);";
-    //  URLs
-
-    // BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
-    // If you actually empirically test it the results are bad.
-    // Please see https://github.com/brendano/ark-tweet-nlp/pull/9
-
-    static String urlStart1  = "(?:https?://|\\bwww\\.)";
-    static String commonTLDs = "(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)";
-    static String ccTLDs	 = "(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" +
-    "bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" +
-    "er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" +
-    "hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" +
-    "lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" +
-    "nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" +
-    "sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" +
-    "va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)";	//TODO: remove obscure country domains?
-    static String urlStart2  = "\\b(?:[A-Za-z\\d-])+(?:\\.[A-Za-z0-9]+){0,3}\\." + "(?:"+commonTLDs+"|"+ccTLDs+")"+"(?:\\."+ccTLDs+")?(?=\\W|$)";
-    static String urlBody    = "(?:[^\\.\\s<>][^\\s<>]*?)?";
-    static String urlExtraCrapBeforeEnd = "(?:"+punctChars+"|"+entity+")+?";
-    static String urlEnd     = "(?:\\.\\.+|[<>]|\\s|$)";
-    public static String url        = "(?:"+urlStart1+"|"+urlStart2+")"+urlBody+"(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")";
-
-
-    // Numeric
-    static String timeLike   = "\\d+(?::\\d+){1,2}";
-    //static String numNum     = "\\d+\\.\\d+";
-    static String numberWithCommas = "(?:(?<!\\d)\\d{1,3},)+?\\d{3}" + "(?=(?:[^,\\d]|$))";
-    static String numComb	 = "\\p{Sc}?\\d+(?:\\.\\d+)+%?";
-
-    // Abbreviations
-    static String boundaryNotDot = "(?:$|\\s|[“\\u0022?!,:;]|" + entity + ")";
-    static String aa1  = "(?:[A-Za-z]\\.){2,}(?=" + boundaryNotDot + ")";
-    static String aa2  = "[^A-Za-z](?:[A-Za-z]\\.){1,}[A-Za-z](?=" + boundaryNotDot + ")";
-    static String standardAbbreviations = "\\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\\.";
-    static String arbitraryAbbrev = "(?:" + aa1 +"|"+ aa2 + "|" + standardAbbreviations + ")";
-    static String separators  = "(?:--+|―|—|~|–|=)";
-    static String decorations = "(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\\u2639-\\u263b]+|[\\ue001-\\uebbb]+)";
-    static String thingsThatSplitWords = "[^\\s\\.,?\"]";
-    static String embeddedApostrophe = thingsThatSplitWords+"+['’′]" + thingsThatSplitWords + "*";
-    
-    public static String OR(String... parts) {
-        String prefix="(?:";
-        StringBuilder sb = new StringBuilder();
-        for (String s:parts){
-            sb.append(prefix);
-            prefix="|";
-            sb.append(s);
+
+    private static final PatternContext DEFAULT_PATTERN_CONTEXT = new DefaultPatternContext();
+    private static final EmojiExtractor DEFAULT_EMOJI_EXTRACTOR = new NoOpEmojiExtractor();
+
+    /**
+     * Represents the results of tokenizing a tweet.
+     */
+    protected static class TwokenizedTweet {
+        private final String originalText;
+        private final List<List<String>> splitTokens;
+        private final List<List<String>> preservedTokens;
+        private final List<Emoji> emojis;
+
+        public TwokenizedTweet(String originalText, List<List<String>> splitGoodTokens, List<List<String>> badTokens, List<Emoji> emojis) {
+            this.originalText = originalText;
+            this.splitTokens = splitGoodTokens; // "good" = spans that were split into tokens; stored as given, not copied
+            this.preservedTokens = badTokens; // "bad" = tokens that were preserved whole; stored as given, not copied
+            this.emojis = emojis;
+        }
+
+        /**
+         * @return The text handed to tokenizeTweet; callers squeeze whitespace first, so it may differ from the raw tweet
+         */
+        public String getOriginalText() {
+            return originalText;
+        }
+
+        /**
+         * @return One inner list per unprotected span of the text, holding that span split on single spaces
+         */
+        public List<List<String>> getSplitTokens() {
+            return splitTokens;
+        }
+
+        /**
+         * @return The preserved tokens that were not split up (e.g. what's, http://test.com), one single-element list per match
+         */
+        public List<List<String>> getPreservedTokens() {
+            return preservedTokens;
+        }
+
+        /**
+         * @return The emojis the EmojiExtractor reported for the unprotected spans of the tweet
+         */
+        public List<Emoji> getEmojis() {
+            return emojis;
         }
-        sb.append(")");
-        return sb.toString();
-    }
-    
-    //  Emoticons
-    static String normalEyes = "(?iu)[:=]"; // 8 and x are eyes but cause problems
-    static String wink = "[;]";
-    static String noseArea = "(?:|-|[^a-zA-Z0-9 ])"; // doesn't get :'-(
-    static String happyMouths = "[D\\)\\]\\}]+";
-    static String sadMouths = "[\\(\\[\\{]+";
-    static String tongue = "[pPd3]+";
-    static String otherMouths = "(?:[oO]+|[/\\\\]+|[vV]+|[Ss]+|[|]+)"; // remove forward slash if http://'s aren't cleaned
-
-    // mouth repetition examples:
-    // @aliciakeys Put it in a love song :-))
-    // @hellocalyclops =))=))=)) Oh well
-
-    static String bfLeft = "(♥|0|o|°|v|\\$|t|x|;|\\u0CA0|@|ʘ|•|・|◕|\\^|¬|\\*)";
-    static String bfCenter = "(?:[\\.]|[_-]+)";
-    static String bfRight = "\\2";
-    static String s3 = "(?:--['\"])";
-    static String s4 = "(?:<|&lt;|>|&gt;)[\\._-]+(?:<|&lt;|>|&gt;)";
-    static String s5 = "(?:[.][_]+[.])";
-    static String basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5;
-
-    static String eeLeft = "[＼\\\\ƪԄ\\(（<>;ヽ\\-=~\\*]+";
-    static String eeRight= "[\\-=\\);'\\u0022<>ʃ）/／ノﾉ丿╯σっµ~\\*]+";
-    static String eeSymbol = "[^A-Za-z0-9\\s\\(\\)\\*:=-]";
-    static String eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight;
-
-    
-    public static String emoticon = OR(
-            // Standard version  :) :( :] :D :P
-    		"(?:>|&gt;)?" + OR(normalEyes, wink) + OR(noseArea,"[Oo]") + 
-            	OR(tongue+"(?=\\W|$|RT|rt|Rt)", otherMouths+"(?=\\W|$|RT|rt|Rt)", sadMouths, happyMouths),
-
-            // reversed version (: D:  use positive lookbehind to remove "(word):"
-            // because eyes on the right side is more ambiguous with the standard usage of : ;
-            "(?<=(?: |^))" + OR(sadMouths,happyMouths,otherMouths) + noseArea + OR(normalEyes, wink) + "(?:<|&lt;)?",
-
-            //inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
-            eastEmote.replaceFirst("2", "1"), basicface
-            // iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]  
-            // TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
-    );
-
-    static String Hearts = "(?:<+/?3+)+"; //the other hearts are in decorations
-
-    static String Arrows = "(?:<*[-―—=]*>+|<+[-―—=]*>*)|\\p{InArrows}+";
-
-    // BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
-    // "hello (#hashtag)" ==> "hello (#hashtag )"  WRONG
-    // "hello (#hashtag)" ==> "hello ( #hashtag )"  RIGHT
-    // "hello (@person)" ==> "hello (@person )"  WRONG
-    // "hello (@person)" ==> "hello ( @person )"  RIGHT
-    // ... Some sort of weird interaction with edgepunct I guess, because edgepunct 
-    // has poor content-symbol detection.
-
-    // This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
-    // If you want good hashtag identification, use a different regex.
-    static String Hashtag = "#[a-zA-Z0-9_]+";  //optional: lookbehind for \b
-    //optional: lookbehind for \b, max length 15
-    static String AtMention = "[@＠][a-zA-Z0-9_]+"; 
-
-    // I was worried this would conflict with at-mentions
-    // but seems ok in sample of 5800: 7 changes all email fixes
-    // http://www.regular-expressions.info/email.html
-    static String Bound = "(?:\\W|^|$)";
-    public static String Email = "(?<=" +Bound+ ")[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,4}(?=" +Bound+")";
-
-    // We will be tokenizing using these regexps as delimiters
-    // Additionally, these things are "protected", meaning they shouldn't be further split themselves.
-    static Pattern Protected  = Pattern.compile(
-            OR(
-                    Hearts,
-                    url,
-                    Email,
-                    timeLike,
-                    //numNum,
-                    numberWithCommas,
-                    numComb,
-                    emoticon,
-                    Arrows,
-                    entity,
-                    punctSeq,
-                    arbitraryAbbrev,
-                    separators,
-                    decorations,
-                    embeddedApostrophe,
-                    Hashtag,  
-                    AtMention
-            ));
-
-    // Edge punctuation
-    // Want: 'foo' => ' foo '
-    // While also:   don't => don't
-    // the first is considered "edge punctuation".
-    // the second is word-internal punctuation -- don't want to mess with it.
-    // BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.  
-    // I remember it causing lots of trouble in the past as well.  Would be good to revisit or eliminate.
-
-    // Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
-    static String edgePunctChars    = "'\"“”‘’«»{}\\(\\)\\[\\]\\*&"; //add \\p{So}? (symbols)
-    static String edgePunct    = "[" + edgePunctChars + "]";
-    static String notEdgePunct = "[a-zA-Z0-9]"; // content characters
-    static String offEdge = "(^|$|:|;|\\s|\\.|,)";  // colon here gets "(hello):" ==> "( hello ):"
-    static Pattern EdgePunctLeft  = Pattern.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")");
-    static Pattern EdgePunctRight = Pattern.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge);
-
-    public static String splitEdgePunct (String input) {
-        Matcher m1 = EdgePunctLeft.matcher(input);
-        input = m1.replaceAll("$1$2 $3");
-        m1 = EdgePunctRight.matcher(input);
-        input = m1.replaceAll("$1 $2$3");
-        return input;
-    }
-    
-    private static class Pair<T1, T2> {
-        public T1 first;
-        public T2 second;
-        public Pair(T1 x, T2 y) { first=x; second=y; }
     }
 
+
     // The main work of tokenizing a tweet.
-    private static List<String> simpleTokenize (String text) {
+    private static List<String> simpleTokenize (final String text, final PatternContext patterns, final EmojiExtractor emojiExtractor) {
+        final TwokenizedTweet twokenizedTweet = tokenizeTweet(text, patterns, emojiExtractor);
+
+        //  Reinterpolate the 'good' and 'bad' Lists, ensuring that
+        //  additional tokens from last good item get included
+        List<String> zippedStr= new ArrayList<String>();
+        int i;
+        for(i=0; i < twokenizedTweet.getPreservedTokens().size(); i++) {
+            zippedStr = addAllnonempty(zippedStr, twokenizedTweet.getSplitTokens().get(i));
+            zippedStr = addAllnonempty(zippedStr,twokenizedTweet.getPreservedTokens().get(i));
+        }
+        zippedStr = addAllnonempty(zippedStr,twokenizedTweet.getSplitTokens().get(i));
+
+        // BTO: our POS tagger wants "ur" and "you're" to both be one token.
+        // Uncommenting yields "you 're", but needs a splitToken helper that this class no longer defines.
+        /*ArrayList<String> splitStr = new ArrayList<String>(zippedStr.size());
+        for(String tok:zippedStr)
+        	splitStr.addAll(splitToken(tok));
+        zippedStr=splitStr;*/
+
+        return zippedStr;
+    }
+
+    protected static TwokenizedTweet tokenizeTweet(final String text, final PatternContext patterns, final EmojiExtractor emojiExtractor) {
         // Do the no-brainers first
-        String splitPunctText = splitEdgePunct(text);
+        String splitPunctText = patterns.splitEdgePunctuation(text);
 
         int textLength = splitPunctText.length();
-        
+
         // BTO: the logic here got quite convoluted via the Scala porting detour
         // It would be good to switch back to a nice simple procedural style like in the Python version
         // ... Scala is such a pain.  Never again.
 
         // Find the matches for subsequences that should be protected,
         // e.g. URLs, 1.0, U.N.K.L.E., 12:53
-        Matcher matches = Protected.matcher(splitPunctText);
-        //Storing as List[List[String]] to make zip easier later on 
+        Matcher matches = patterns.getProtectedTokenPattern().matcher(splitPunctText);
+        //Storing as List[List[String]] to make zip easier later on
         List<List<String>> bads = new ArrayList<List<String>>();	//linked list?
         List<Pair<Integer,Integer>> badSpans = new ArrayList<Pair<Integer,Integer>>();
         while(matches.find()){
@@ -234,52 +134,44 @@ private static List<String> simpleTokenize (String text) {
                 List<String> bad = new ArrayList<String>(1);
                 bad.add(splitPunctText.substring(matches.start(),matches.end()));
                 bads.add(bad);
-                badSpans.add(new Pair<Integer, Integer>(matches.start(),matches.end()));
+                badSpans.add(new ImmutablePair<Integer, Integer>(matches.start(),matches.end()));
             }
         }
 
         // Create a list of indices to create the "goods", which can be
-        // split. We are taking "bad" spans like 
-        //     List((2,5), (8,10)) 
-        // to create 
+        // split. We are taking "bad" spans like
+        //     List((2,5), (8,10))
+        // to create
         ///    List(0, 2, 5, 8, 10, 12)
         // where, e.g., "12" here would be the textLength
         // has an even length and no indices are the same
         List<Integer> indices = new ArrayList<Integer>(2+2*badSpans.size());
         indices.add(0);
         for(Pair<Integer,Integer> p:badSpans){
-            indices.add(p.first);
-            indices.add(p.second);
+            indices.add(p.getLeft());
+            indices.add(p.getRight());
         }
         indices.add(textLength);
 
         // Group the indices and map them to their respective portion of the string
         List<List<String>> splitGoods = new ArrayList<List<String>>(indices.size()/2);
+        final List<Emoji> emojis = new ArrayList<Emoji>();
         for (int i=0; i<indices.size(); i+=2) {
             String goodstr = splitPunctText.substring(indices.get(i),indices.get(i+1));
+
+
+            final Pair<String, List<Emoji>> goodStrAndEmojis = emojiExtractor.extractEmojis(goodstr);
+            goodstr = goodStrAndEmojis.getLeft();
+
+            emojis.addAll(goodStrAndEmojis.getRight());
+
             List<String> splitstr = Arrays.asList(goodstr.trim().split(" "));
             splitGoods.add(splitstr);
         }
 
-        //  Reinterpolate the 'good' and 'bad' Lists, ensuring that
-        //  additonal tokens from last good item get included
-        List<String> zippedStr= new ArrayList<String>();
-        int i;
-        for(i=0; i < bads.size(); i++) {
-            zippedStr = addAllnonempty(zippedStr,splitGoods.get(i));
-            zippedStr = addAllnonempty(zippedStr,bads.get(i));
-        }
-        zippedStr = addAllnonempty(zippedStr,splitGoods.get(i));
-        
-        // BTO: our POS tagger wants "ur" and "you're" to both be one token.
-        // Uncomment to get "you 're"
-        /*ArrayList<String> splitStr = new ArrayList<String>(zippedStr.size());
-        for(String tok:zippedStr)
-        	splitStr.addAll(splitToken(tok));
-        zippedStr=splitStr;*/
-        
-        return zippedStr;
-    }  
+
+        return new TwokenizedTweet(text, splitGoods, bads, emojis);
+    }
 
     private static List<String> addAllnonempty(List<String> master, List<String> smaller){
         for (String s : smaller){
@@ -289,42 +181,38 @@ private static List<String> addAllnonempty(List<String> master, List<String> sma
         }
         return master;
     }
-    /** "foo   bar " => "foo bar" */
-    public static String squeezeWhitespace (String input){
-        return Whitespace.matcher(input).replaceAll(" ").trim();
-    }
-
-    // Final pass tokenization based on special patterns
-    private static List<String> splitToken (String token) {
 
-        Matcher m = Contractions.matcher(token);
-        if (m.find()){
-        	String[] contract = {m.group(1), m.group(2)};
-        	return Arrays.asList(contract);
-        }
-        String[] contract = {token};
-        return Arrays.asList(contract);
+    /** Tokenizes 'text' with the default PatternContext and EmojiExtractor after squeezing whitespace. Assume 'text' has no HTML escaping. **/
+    public static List<String> tokenize(String text) {
+        return simpleTokenize(DEFAULT_PATTERN_CONTEXT.squeezeWhitespace(text), DEFAULT_PATTERN_CONTEXT, DEFAULT_EMOJI_EXTRACTOR);
+    }
 
     /** Assume 'text' has no HTML escaping. **/
-    public static List<String> tokenize(String text){
-        return simpleTokenize(squeezeWhitespace(text));
+    public static List<String> tokenize(final String text, final PatternContext patternContext, final EmojiExtractor emojiExtractor) {
+        return simpleTokenize(patternContext.squeezeWhitespace(text), patternContext, emojiExtractor);
     }
 
+    /**
+     * Tokenizes the given text with the default PatternContext and EmojiExtractor, then returns the categorizer's grouping of the tokens
+     */
+    public static<T> Map<T, List<String>> tokenizeIntoCategories(final String text, final TokenCategorizer<T> categorizer) {
+        return tokenizeIntoCategories(text, categorizer, DEFAULT_PATTERN_CONTEXT, DEFAULT_EMOJI_EXTRACTOR);
+    }
 
     /**
-     * Twitter text comes HTML-escaped, so unescape it.
-     * We also first unescape &amp;'s, in case the text has been buggily double-escaped.
+     * Same as tokenizeIntoCategories but uses a custom PatternContext and EmojiExtractor
      */
-    public static String normalizeTextForTagger(String text) {
-    	text = text.replaceAll("&amp;", "&");
-    	text = StringEscapeUtils.unescapeHtml(text);
-    	return text;
+    public static<T> Map<T, List<String>> tokenizeIntoCategories(final String text, final TokenCategorizer<T> categorizer, final PatternContext patterns, final EmojiExtractor emojiExtractor) {
+        final String cleaned = patterns.squeezeWhitespace(text);
+        final TwokenizedTweet twokenizedTweet = tokenizeTweet(cleaned, patterns, emojiExtractor);
+        final Map<T, List<String>> tokenCategories = categorizer.categorize(twokenizedTweet.getOriginalText(), twokenizedTweet.getSplitTokens(), twokenizedTweet.getPreservedTokens(), twokenizedTweet.getEmojis());
+        return tokenCategories;
     }
 
+
     /**
      * This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
-     * 
+     *
      * This function normalizes the input text BEFORE calling the tokenizer.
      * So the tokens you get back may not exactly correspond to
      * substrings of the original text.
@@ -334,6 +222,16 @@ public static List<String> tokenizeRawTweetText(String text) {
         return tokens;
     }
 
+    /**
+     * Twitter text comes HTML-escaped, so unescape it.
+     * We also first unescape &amp;'s, in case the text has been buggily double-escaped.
+     */
+    public static String normalizeTextForTagger(String text) {
+        text = text.replaceAll("&amp;", "&"); // first pass: a double-escaped "&amp;lt;" becomes "&lt;" so the next line can decode it
+        text = StringEscapeUtils.unescapeHtml4(text);
+        return text;
+    }
+
     /** Tokenizes tweet texts on standard input, tokenizations on standard output.  Input and output UTF-8. */
     public static void main(String[] args) throws IOException {
         BufferedReader input = new BufferedReader(new InputStreamReader(System.in,"UTF-8"));
@@ -350,5 +248,5 @@ public static void main(String[] args) throws IOException {
     		output.print("\n");
     	}
     }
-    
+
 }
diff --git a/src/cmu/arktweetnlp/impl/DefaultPatternContext.java b/src/cmu/arktweetnlp/impl/DefaultPatternContext.java
new file mode 100644
index 0000000..362ca4b
--- /dev/null
+++ b/src/cmu/arktweetnlp/impl/DefaultPatternContext.java
@@ -0,0 +1,226 @@
+package cmu.arktweetnlp.impl;
+
+import cmu.arktweetnlp.PatternContext;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+/** Default {@link PatternContext}: static regex fragments assembled into the compiled patterns that the getters return. */
+public class DefaultPatternContext implements PatternContext {
+
+    public static Pattern Contractions = Pattern.compile("(?i)(\\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$"); // group 1: word stem, group 2: trailing contraction suffix
+    public static Pattern Whitespace = Pattern.compile("[\\s\\p{Zs}]+"); // one or more whitespace / space-separator (Zs) characters
+
+    public static String punctChars = "['\"“”‘’.?!…,:;]";
+    //public static String punctSeq   = punctChars+"+";	//'anthem'. => ' anthem '.
+    public static String punctSeq   = "['\"“”‘’]+|[.?!,…]+|[:;]+";	//'anthem'. => ' anthem ' .
+    public static String entity     = "&(?:amp|lt|gt|quot);";
+    // URLs: the fragments below are concatenated into 'url'.
+
+    // BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
+    // If you actually empirically test it the results are bad.
+    // Please see https://github.com/brendano/ark-tweet-nlp/pull/9
+
+    public static String urlStart1  = "(?:https?://|\\bwww\\.)";
+    public static String commonTLDs = "(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)";
+    public static String ccTLDs	 = "(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" +
+            "bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" +
+            "er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" +
+            "hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" +
+            "lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" +
+            "nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" +
+            "sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" +
+            "va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)";	//TODO: remove obscure country domains?
+    public static String urlStart2  = "\\b(?:[A-Za-z\\d-])+(?:\\.[A-Za-z0-9]+){0,3}\\." + "(?:"+commonTLDs+"|"+ccTLDs+")"+"(?:\\."+ccTLDs+")?(?=\\W|$)";
+    public static String urlBody    = "(?:[^\\.\\s<>][^\\s<>]*?)?";
+    public static String urlExtraCrapBeforeEnd = "(?:"+punctChars+"|"+entity+")+?";
+    public static String urlEnd     = "(?:\\.\\.+|[<>]|\\s|$)";
+    public static String url        = "(?:"+urlStart1+"|"+urlStart2+")"+urlBody+"(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")";
+
+
+    // Numeric: colon-separated times, comma-grouped numbers, dotted decimals with optional currency sign / percent.
+    public static String timeLike   = "\\d+(?::\\d+){1,2}";
+    //public static String numNum     = "\\d+\\.\\d+";
+    public static String numberWithCommas = "(?:(?<!\\d)\\d{1,3},)+?\\d{3}" + "(?=(?:[^,\\d]|$))";
+    public static String numComb	 = "\\p{Sc}?\\d+(?:\\.\\d+)+%?";
+
+    // Abbreviations
+    public static String boundaryNotDot = "(?:$|\\s|[“\\u0022?!,:;]|" + entity + ")";
+    public static String aa1  = "(?:[A-Za-z]\\.){2,}(?=" + boundaryNotDot + ")";
+    public static String aa2  = "[^A-Za-z](?:[A-Za-z]\\.){1,}[A-Za-z](?=" + boundaryNotDot + ")";
+    public static String standardAbbreviations = "\\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\\.";
+    public static String arbitraryAbbrev = "(?:" + aa1 +"|"+ aa2 + "|" + standardAbbreviations + ")";
+    public static String separators  = "(?:--+|―|—|~|–|=)";
+    public static String decorations = "(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\\u2639-\\u263b]+|[\\ue001-\\uebbb]+)";
+    public static String thingsThatSplitWords = "[^\\s\\.,?\"]";
+    public static String embeddedApostrophe = thingsThatSplitWords+"+['’′]" + thingsThatSplitWords + "*";
+    /** Joins the given regex fragments into one non-capturing alternation: OR("a", "b") gives "(?:a|b)". */
+    public static String OR(String... parts) {
+        String prefix="(?:";
+        StringBuilder sb = new StringBuilder();
+        for (String s:parts){
+            sb.append(prefix);
+            prefix="|";
+            sb.append(s);
+        }
+        sb.append(")");
+        return sb.toString();
+    }
+
+    // Emoticons: eye, nose and mouth fragments that are combined into 'emoticon' below.
+    public static String normalEyes = "(?iu)[:=]"; // 8 and x are eyes but cause problems
+    public static String wink = "[;]";
+    public static String noseArea = "(?:|-|[^a-zA-Z0-9 ])"; // doesn't get :'-(
+    public static String happyMouths = "[D\\)\\]\\}]+";
+    public static String sadMouths = "[\\(\\[\\{]+";
+    public static String tongue = "[pPd3]+";
+    public static String otherMouths = "(?:[oO]+|[/\\\\]+|[vV]+|[Ss]+|[|]+)"; // remove forward slash if http://'s aren't cleaned
+
+    // mouth repetition examples:
+    // @aliciakeys Put it in a love song :-))
+    // @hellocalyclops =))=))=)) Oh well
+
+    public static String bfLeft = "(♥|0|o|°|v|\\$|t|x|;|\\u0CA0|@|ʘ|•|・|◕|\\^|¬|\\*)";
+    public static String bfCenter = "(?:[\\.]|[_-]+)";
+    public static String bfRight = "\\2"; // backreference to the bfLeft capture (group 2 once basicface sits inside 'emoticon')
+    public static String s3 = "(?:--['\"])";
+    public static String s4 = "(?:<|&lt;|>|&gt;)[\\._-]+(?:<|&lt;|>|&gt;)";
+    public static String s5 = "(?:[.][_]+[.])";
+    public static String basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5;
+
+    public static String eeLeft = "[＼\\\\ƪԄ\\(（<>;ヽ\\-=~\\*]+";
+    public static String eeRight= "[\\-=\\);'\\u0022<>ʃ）/／ノﾉ丿╯σっµ~\\*]+";
+    public static String eeSymbol = "[^A-Za-z0-9\\s\\(\\)\\*:=-]";
+    public static String eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight;
+
+
+    public static String emoticon = OR(
+            // Standard version  :) :( :] :D :P
+            "(?:>|&gt;)?" + OR(normalEyes, wink) + OR(noseArea,"[Oo]") +
+                    OR(tongue+"(?=\\W|$|RT|rt|Rt)", otherMouths+"(?=\\W|$|RT|rt|Rt)", sadMouths, happyMouths),
+
+            // reversed version (: D:  use positive lookbehind to remove "(word):"
+            // because eyes on the right side is more ambiguous with the standard usage of : ;
+            "(?<=(?: |^))" + OR(sadMouths,happyMouths,otherMouths) + noseArea + OR(normalEyes, wink) + "(?:<|&lt;)?",
+
+            //inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
+            eastEmote.replaceFirst("2", "1"), basicface // replaceFirst renumbers eastEmote's backreference from 2 to 1: its bfLeft is the first capture group here
+            // iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
+            // TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
+    );
+
+    public static String Hearts = "(?:<+/?3+)+"; //the other hearts are in decorations
+
+    public static String Arrows = "(?:<*[-―—=]*>+|<+[-―—=]*>*)|\\p{InArrows}+";
+
+    // BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
+    // "hello (#hashtag)" ==> "hello (#hashtag )"  WRONG
+    // "hello (#hashtag)" ==> "hello ( #hashtag )"  RIGHT
+    // "hello (@person)" ==> "hello (@person )"  WRONG
+    // "hello (@person)" ==> "hello ( @person )"  RIGHT
+    // ... Some sort of weird interaction with edgepunct I guess, because edgepunct
+    // has poor content-symbol detection.
+
+    // This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
+    // If you want good hashtag identification, use a different regex.
+    public static String Hashtag = "#[a-zA-Z0-9_]+";  //optional: lookbehind for \b
+    //optional: lookbehind for \b, max length 15
+    public static String AtMention = "[@＠][a-zA-Z0-9_]+";
+
+    // I was worried this would conflict with at-mentions
+    // but seems ok in sample of 5800: 7 changes all email fixes
+    // http://www.regular-expressions.info/email.html
+    public static String Bound = "(?:\\W|^|$)";
+    public static String Email = "(?<=" +Bound+ ")[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,4}(?=" +Bound+")";
+
+    // We will be tokenizing using these regexps as delimiters
+    // Additionally, these things are "protected", meaning they shouldn't be further split themselves.
+    public static Pattern Protected  = Pattern.compile(
+            OR(
+                    Hearts,
+                    url,
+                    Email,
+                    timeLike,
+                    //numNum,
+                    numberWithCommas,
+                    numComb,
+                    emoticon,
+                    Arrows,
+                    entity,
+                    punctSeq,
+                    arbitraryAbbrev,
+                    separators,
+                    decorations,
+                    embeddedApostrophe,
+                    Hashtag,
+                    AtMention
+            ));
+
+    // Edge punctuation
+    // Want: 'foo' => ' foo '
+    // While also:   don't => don't
+    // the first is considered "edge punctuation".
+    // the second is word-internal punctuation -- don't want to mess with it.
+    // BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
+    // I remember it causing lots of trouble in the past as well.  Would be good to revisit or eliminate.
+
+    // Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
+    public static String edgePunctChars    = "'\"“”‘’«»{}\\(\\)\\[\\]\\*&"; //add \\p{So}? (symbols)
+    public static String edgePunct    = "[" + edgePunctChars + "]";
+    public static String notEdgePunct = "[a-zA-Z0-9]"; // content characters
+    public static String offEdge = "(^|$|:|;|\\s|\\.|,)";  // colon here gets "(hello):" ==> "( hello ):"
+    public static Pattern EdgePunctLeft  = Pattern.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")");
+    public static Pattern EdgePunctRight = Pattern.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge);
+
+    @Override
+    public Pattern getContractionPattern() {
+        return Contractions;
+    }
+
+    @Override
+    public Pattern getWhitespacePattern() {
+        return Whitespace;
+    }
+
+    @Override
+    public Pattern getProtectedTokenPattern() {
+        return Protected;
+    }
+
+    @Override
+    public Pattern getLeftEdgePunctuationPattern() {
+        return EdgePunctLeft;
+    }
+
+    @Override
+    public Pattern getRightEdgePunctuationPattern() {
+        return EdgePunctRight;
+    }
+
+    /** Inserts a space between edge punctuation and the adjacent content character: left edges first, then right edges. */
+    public String splitEdgePunctuation(String input) {
+        Matcher m1 = getLeftEdgePunctuationPattern().matcher(input);
+        input = m1.replaceAll("$1$2 $3");
+        m1 = getRightEdgePunctuationPattern().matcher(input);
+        input = m1.replaceAll("$1 $2$3");
+        return input;
+    }
+
+    /** Replaces each whitespace run with a single space and trims both ends: "foo   bar " => "foo bar" */
+    public String squeezeWhitespace (String input){
+        return getWhitespacePattern().matcher(input).replaceAll(" ").trim();
+    }
+
+    // Final pass: a token ending in a contraction is split into [stem, suffix]; any other token comes back as a one-element list.
+    public List<String> splitToken (String token) {
+
+        Matcher m = getContractionPattern().matcher(token);
+        if (m.find()){
+            String[] contract = {m.group(1), m.group(2)};
+            return Arrays.asList(contract);
+        }
+        String[] contract = {token};
+        return Arrays.asList(contract);
+    }
+}
diff --git a/src/cmu/arktweetnlp/impl/EmojiExtractorImpl.java b/src/cmu/arktweetnlp/impl/EmojiExtractorImpl.java
new file mode 100644
index 0000000..1602f2f
--- /dev/null
+++ b/src/cmu/arktweetnlp/impl/EmojiExtractorImpl.java
@@ -0,0 +1,15 @@
+package cmu.arktweetnlp.impl;
+
+import cmu.arktweetnlp.EmojiExtractor;
+import cmu.arktweetnlp.util.EmojiUtil;
+import org.apache.commons.lang3.tuple.Pair;
+import org.pirkaengine.mobile.Emoji;
+
+import java.util.List;
+/** {@link EmojiExtractor} that delegates to {@link EmojiUtil#filterEmoji(String)}. */
+public class EmojiExtractorImpl implements EmojiExtractor {
+    @Override
+    public Pair<String, List<Emoji>> extractEmojis(final String text) {
+        return EmojiUtil.filterEmoji(text);
+    }
+}
diff --git a/src/cmu/arktweetnlp/impl/NoOpEmojiExtractor.java b/src/cmu/arktweetnlp/impl/NoOpEmojiExtractor.java
new file mode 100644
index 0000000..a266fed
--- /dev/null
+++ b/src/cmu/arktweetnlp/impl/NoOpEmojiExtractor.java
@@ -0,0 +1,16 @@
+package cmu.arktweetnlp.impl;
+
+import cmu.arktweetnlp.EmojiExtractor;
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.pirkaengine.mobile.Emoji;
+
+import java.util.Collections;
+import java.util.List;
+/** {@link EmojiExtractor} that extracts nothing: the text is returned unchanged with an empty emoji list. */
+public class NoOpEmojiExtractor implements EmojiExtractor {
+    @Override
+    public Pair<String, List<Emoji>> extractEmojis(String text) {
+        return ImmutablePair.of(text, Collections.<Emoji>emptyList());
+    }
+}
diff --git a/src/cmu/arktweetnlp/impl/features/FeatureUtil.java b/src/cmu/arktweetnlp/impl/features/FeatureUtil.java
index ed40bd2..184d815 100644
--- a/src/cmu/arktweetnlp/impl/features/FeatureUtil.java
+++ b/src/cmu/arktweetnlp/impl/features/FeatureUtil.java
@@ -8,6 +8,7 @@
 
 import cmu.arktweetnlp.Twokenize;
 
+import cmu.arktweetnlp.impl.DefaultPatternContext;
 import com.twitter.Regex;
 
 
@@ -16,7 +17,7 @@
  **/
 public class FeatureUtil {
 	
-	public static Pattern URL = Pattern.compile(Twokenize.OR(Twokenize.url, Twokenize.Email));
+	public static Pattern URL = Pattern.compile(DefaultPatternContext.OR(DefaultPatternContext.url, DefaultPatternContext.Email));
 	public static Pattern justbase = Pattern.compile("(?!www\\.|ww\\.|w\\.|@)[a-zA-Z0-9]+\\.[A-Za-z0-9\\.]+"); 
 
 //	Pattern URL = Pattern.compile(Twokenize.url);
diff --git a/src/cmu/arktweetnlp/impl/features/MiscFeatures.java b/src/cmu/arktweetnlp/impl/features/MiscFeatures.java
index 96b99ce..42aac5b 100644
--- a/src/cmu/arktweetnlp/impl/features/MiscFeatures.java
+++ b/src/cmu/arktweetnlp/impl/features/MiscFeatures.java
@@ -5,6 +5,7 @@
 import java.util.regex.Pattern;
 
 import cmu.arktweetnlp.Twokenize;
+import cmu.arktweetnlp.impl.DefaultPatternContext;
 import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface;
 import cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs;
 
@@ -121,7 +122,7 @@ public static class SimpleOrthFeatures implements FeatureExtractorInterface {
 		/** TODO change to punctuation class, or better from Twokenize **/
 		//Pattern allPunct = Pattern.compile("^[^a-zA-Z0-9]*$");
 		Pattern allPunct = Pattern.compile("^\\W*$");
-		Pattern emoticon = Pattern.compile(Twokenize.emoticon);
+		Pattern emoticon = Pattern.compile(DefaultPatternContext.emoticon);
 		public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
 			for (int t=0; t < tokens.size(); t++) {
 				String tok = tokens.get(t);
@@ -154,8 +155,8 @@ public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
 		}    
 	}
 	public static class URLFeatures implements FeatureExtractorInterface {	
-		Pattern validURL = Pattern.compile(Twokenize.url);
-		Pattern validEmail = Pattern.compile(Twokenize.Email);
+		Pattern validURL = Pattern.compile(DefaultPatternContext.url);
+		Pattern validEmail = Pattern.compile(DefaultPatternContext.Email);
 		public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
 			for (int t=0; t < tokens.size(); t++) {
 				String tok = tokens.get(t);
diff --git a/src/cmu/arktweetnlp/impl/features/WordListFeatures.java b/src/cmu/arktweetnlp/impl/features/WordListFeatures.java
index e4146d2..cdb6d04 100644
--- a/src/cmu/arktweetnlp/impl/features/WordListFeatures.java
+++ b/src/cmu/arktweetnlp/impl/features/WordListFeatures.java
@@ -8,6 +8,7 @@
 import java.util.regex.Pattern;
 
 import cmu.arktweetnlp.Twokenize;
+import cmu.arktweetnlp.impl.DefaultPatternContext;
 import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface;
 import cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs;
 import cmu.arktweetnlp.util.BasicFileIO;
@@ -15,7 +16,7 @@
 public class WordListFeatures {
 
 	public static class POSTagDict implements FeatureExtractorInterface {
-		Pattern URL = Pattern.compile(Twokenize.url);
+		Pattern URL = Pattern.compile(DefaultPatternContext.url);
 		Pattern letter = Pattern.compile("[A-Za-z]{3,}");
 		public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
 			for (int t=0; t < tokens.size(); t++) {
diff --git a/src/cmu/arktweetnlp/util/EmojiUtil.java b/src/cmu/arktweetnlp/util/EmojiUtil.java
new file mode 100644
index 0000000..07cf31c
--- /dev/null
+++ b/src/cmu/arktweetnlp/util/EmojiUtil.java
@@ -0,0 +1,34 @@
+package cmu.arktweetnlp.util;
+
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.pirkaengine.mobile.Emoji;
+
+import java.util.ArrayList;
+import java.util.List;
+/** Helpers for separating emoji from ordinary tweet text. */
+public class EmojiUtil {
+
+    /**
+     * Splits {@code text} into plain text and the emoji recognized by {@link Emoji#charOf}.
+     * @param text input text; may be null or empty
+     * @return pair of (text with recognized emoji removed, those emoji in order); ("", empty list) for null/empty input
+     */
+    public static Pair<String, List<Emoji>> filterEmoji(String text) {
+        final StringBuilder term = new StringBuilder();
+        final List<Emoji> emojis = new ArrayList<Emoji>();
+        final int length = (text == null) ? 0 : text.length();
+        // Advance by code point so a surrogate pair is looked up (and copied) as one unit.
+        for (int i = 0; i < length; ) {
+            final int codePoint = text.codePointAt(i);
+            final Emoji em = codePoint < 0x1FFFF ? Emoji.charOf(codePoint) : null;
+            if (em == null) {
+                term.appendCodePoint(codePoint);
+            } else {
+                emojis.add(em);
+            }
+            i += Character.charCount(codePoint);
+        }
+        return new ImmutablePair<String, List<Emoji>>(term.toString(), emojis);
+    }
+}
diff --git a/test/cmu/arktweetnlp/TwokenizeTest.java b/test/cmu/arktweetnlp/TwokenizeTest.java
new file mode 100644
index 0000000..9006887
--- /dev/null
+++ b/test/cmu/arktweetnlp/TwokenizeTest.java
@@ -0,0 +1,105 @@
+package cmu.arktweetnlp;
+
+import org.junit.Test;
+
+import java.util.*;
+
+import static org.junit.Assert.*;
+/** Checks Twokenize.tokenize against hand-written expected token lists and edge cases (single token, empty string, null). */
+public class TwokenizeTest {
+
+    public static final String INPUT_1 = "What's the greatest invention of all time? — Tumblr. http://t.co/IPZPnKqVk2";
+    public static final List<String> EXPECTED_TOKENS_1 = Arrays.asList("What's", "the", "greatest", "invention", "of", "all", "time", "?", "—", "Tumblr", ".", "http://t.co/IPZPnKqVk2");
+
+    public static final String INPUT_2 = "Looking for Apple Mac repairers near Naas....Anyone able to help? #kildare  https://t.co/1lhLT6EtWs";
+    public static final List<String> EXPECTED_TOKENS_2 = Arrays.asList("Looking", "for", "Apple", "Mac", "repairers", "near", "Naas", "....", "Anyone", "able", "to", "help", "?", "#kildare", "https://t.co/1lhLT6EtWs");
+
+    public static final String INPUT_3 = "RT @EKM94: The best thing I've seen on Twitter all day. http://t.co/lhYh13jUD0";
+    public static final List<String> EXPECTED_TOKENS_3 = Arrays.asList("RT", "@EKM94", ":", "The", "best", "thing", "I've", "seen", "on", "Twitter", "all", "day", ".", "http://t.co/lhYh13jUD0");
+
+    public static final String INPUT_4 = "Butterball Turkey Bacon Only $.54 At Walgreens! via Couponing For 4 - Starting the week of 6/28, ... http://t.co/0AdaJsqwIR";
+    public static final List<String> EXPECTED_TOKENS_4 = Arrays.asList("Butterball", "Turkey", "Bacon", "Only", "$", ".", "54", "At", "Walgreens", "!", "via", "Couponing", "For", "4", "-", "Starting", "the", "week", "of", "6/28", ",", "...", "http://t.co/0AdaJsqwIR");
+
+    public static final String INPUT_5 = "RT @beingactress: ♥Taking joy in living is a woman’s best cosmetic♥  @actressharshika http://t.co/AF8Bl69Uyu";
+    public static final List<String> EXPECTED_TOKENS_5 = Arrays.asList("RT", "@beingactress", ":", "♥", "Taking", "joy", "in", "living", "is", "a", "woman’s", "best", "cosmetic", "♥", "@actressharshika", "http://t.co/AF8Bl69Uyu");
+
+    public static final String INPUT_6 = "@larysaG Well thanks! Making me feel better already lol I'm Nancy btw :) Nice to meet u! I'll try remembering that when I'm terrified there.";
+    public static final List<String> EXPECTED_TOKENS_6 = Arrays.asList("@larysaG", "Well", "thanks", "!", "Making", "me", "feel", "better", "already", "lol", "I'm", "Nancy", "btw", ":)", "Nice", "to", "meet", "u", "!", "I'll", "try", "remembering", "that", "when", "I'm", "terrified", "there", ".");
+
+    public static final String INPUT_7 = "*✲ﾟ*｡✧٩(･ิᴗ･ิ๑)۶ luke hemmings from 5sos you make me happy i love you so much , follow me please?@luke5sos*✲ﾟ*｡✧٩(･ิᴗ･ิ๑)۶ 77";
+    public static final List<String> EXPECTED_TOKENS_7 = Arrays.asList("*✲ﾟ*", "｡✧٩", "(･ิᴗ･ิ๑)", "۶", "luke", "hemmings", "from", "5sos", "you", "make", "me", "happy", "i", "love", "you", "so", "much", ",", "follow", "me", "please", "?", "@luke5sos", "*✲ﾟ*", "｡✧٩", "(･ิᴗ･ิ๑)", "۶", "77");
+
+    // Each sample must tokenize to exactly its expected list; the size check runs before the element-wise comparison.
+    @Test
+    public void itShouldTokenizeTweets() throws Exception {
+        final List<String> tokens1 = Twokenize.tokenize(INPUT_1);
+        assertFalse(tokens1.isEmpty());
+        assertEquals(EXPECTED_TOKENS_1.size(), tokens1.size());
+        assertEquals(EXPECTED_TOKENS_1, tokens1);
+
+
+
+        final List<String> tokens2 = Twokenize.tokenize(INPUT_2);
+        assertFalse(tokens2.isEmpty());
+        assertEquals(EXPECTED_TOKENS_2.size(), tokens2.size());
+        assertEquals(EXPECTED_TOKENS_2, tokens2);
+
+
+        final List<String> tokens3 = Twokenize.tokenize(INPUT_3);
+        assertFalse(tokens3.isEmpty());
+        assertEquals(EXPECTED_TOKENS_3.size(), tokens3.size());
+        assertEquals(EXPECTED_TOKENS_3, tokens3);
+
+
+        final List<String> tokens4 = Twokenize.tokenize(INPUT_4);
+        assertFalse(tokens4.isEmpty());
+        assertEquals(EXPECTED_TOKENS_4.size(), tokens4.size());
+        assertEquals(EXPECTED_TOKENS_4, tokens4);
+
+
+        final List<String> tokens5 = Twokenize.tokenize(INPUT_5);
+        assertFalse(tokens5.isEmpty());
+        assertEquals(EXPECTED_TOKENS_5.size(), tokens5.size());
+        assertEquals(EXPECTED_TOKENS_5, tokens5);
+
+
+        final List<String> tokens6 = Twokenize.tokenize(INPUT_6);
+        assertFalse(tokens6.isEmpty());
+        assertEquals(EXPECTED_TOKENS_6.size(), tokens6.size());
+        assertEquals(EXPECTED_TOKENS_6, tokens6);
+
+
+        final List<String> tokens7 = Twokenize.tokenize(INPUT_7);
+        assertFalse(tokens7.isEmpty());
+        assertEquals(EXPECTED_TOKENS_7.size(), tokens7.size());
+        assertEquals(EXPECTED_TOKENS_7, tokens7);
+    }
+
+    @Test
+    public void itShouldProduceASingleTokenFromAOneWordTweet() {
+        final String[] oneTokenInputs = {
+                "test",
+                "Test",
+                "?",
+                "!",
+                ".",
+                "http://test.com",
+                ":)"
+        };
+
+        for (final String input : oneTokenInputs) {
+            assertEquals(Arrays.asList(input), Twokenize.tokenize(input)); // the whole input must come back as the only token
+        }
+    }
+
+    @Test
+    public void itShouldProduceAnEmptyListOfTokensForTheEmptyString() {
+        final List<String> tokens = Twokenize.tokenize("");
+        assertTrue(tokens.isEmpty());
+    }
+
+    @Test(expected = NullPointerException.class)
+    public void itShouldThrowNullPointerExceptionWithNullInput() {
+        Twokenize.tokenize(null);
+    }
+}
diff --git a/test/cmu/arktweetnlp/impl/DefaultPatternContextTest.java b/test/cmu/arktweetnlp/impl/DefaultPatternContextTest.java
new file mode 100644
index 0000000..b9ffbd0
--- /dev/null
+++ b/test/cmu/arktweetnlp/impl/DefaultPatternContextTest.java
@@ -0,0 +1,38 @@
+package cmu.arktweetnlp.impl;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+/** Unit tests for DefaultPatternContext's whitespace squeezing and edge-punctuation splitting. */
+public class DefaultPatternContextTest {
+
+    private final DefaultPatternContext context = new DefaultPatternContext();
+
+    @Test
+    public void itShouldSqueezeWhitespace() {
+        assertEquals("", context.squeezeWhitespace(""));
+        assertEquals("Hello world.", context.squeezeWhitespace("Hello world."));
+        assertEquals("Hello world.", context.squeezeWhitespace("Hello      world."));
+    }
+
+    @Test(expected = NullPointerException.class)
+    public void itShouldThrowNullPointerWhenSqueezingNull() {
+        context.squeezeWhitespace(null);
+    }
+
+    @Test
+    public void itShouldSplitEdgePunctuation() {
+        assertEquals("", context.splitEdgePunctuation(""));
+
+        final String split = context.splitEdgePunctuation("*hello*");
+        assertEquals("* hello *", split);
+    }
+
+    @Test(expected = NullPointerException.class)
+    public void itShouldThrowNullPointerWhenSplittingEdgePunctOnNull() {
+        // Exercise splitEdgePunctuation itself; squeezeWhitespace(null) already has its own test.
+        context.splitEdgePunctuation(null);
+    }
+
+
+}
diff --git a/test/cmu/arktweetnlp/util/EmojiUtilTest.java b/test/cmu/arktweetnlp/util/EmojiUtilTest.java
new file mode 100644
index 0000000..04cd845
--- /dev/null
+++ b/test/cmu/arktweetnlp/util/EmojiUtilTest.java
@@ -0,0 +1,59 @@
+package cmu.arktweetnlp.util;
+
+import org.junit.Test;
+import org.pirkaengine.mobile.Emoji;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import org.apache.commons.lang3.tuple.Pair;
+import static org.junit.Assert.*;
+/** Tests for EmojiUtil.filterEmoji: empty and null input, extraction of known emoji, pass-through of unknown ones. */
+public class EmojiUtilTest {
+
+    @Test
+    public void itShouldProduceEmptyOutputForEmptyInput() {
+        final Pair<String, List<Emoji>> result = EmojiUtil.filterEmoji("");
+        assertTrue(result.getLeft().isEmpty());
+        assertTrue(result.getRight().isEmpty());
+    }
+
+    @Test
+    public void itShouldProduceEmptyOutputForNullInput() {
+        final Pair<String, List<Emoji>> result = EmojiUtil.filterEmoji(null);
+        assertTrue(result.getLeft().isEmpty());
+        assertTrue(result.getRight().isEmpty());
+    }
+
+    @Test
+    public void itShouldExtractEmojisFromText() {
+        //"RT @Annam1181orM: @LotAgar @Dom70Bcn @paquifer1969 @V_alf_V @MnicaRebullCome PPS\uD83D\uDC7EE=\uD83D\uDC01Tots!Son\uD83D\uDC01\uD83D\uDC00=FRANKISTESfeixistesQ\uD83D\uDC00varen\uD83D\uDC01MATAR×ODI aCAT➡Ca…";
+
+        // Emoji at the end of the text.
+        final Pair<String, List<Emoji>> trailing = EmojiUtil.filterEmoji("Hello \uD83D\uDC7E");
+        assertEmojiEquals("Hello ", Arrays.asList(Emoji.ALIEN_MONSTER), trailing);
+
+        // Emoji embedded inside a word.
+        final Pair<String, List<Emoji>> embedded = EmojiUtil.filterEmoji("He\uD83D\uDC7Ello");
+        assertEmojiEquals("Hello", Arrays.asList(Emoji.ALIEN_MONSTER), embedded);
+
+        // Two different emoji; the spaces around them are kept.
+        final Pair<String, List<Emoji>> twoEmoji = EmojiUtil.filterEmoji("This has 2 \uD83D\uDC7E \uD83D\uDC7D emojis!");
+        assertEmojiEquals("This has 2   emojis!", Arrays.asList(Emoji.ALIEN_MONSTER, Emoji.EXTRATERRESTRIAL_ALIEN), twoEmoji);
+    }
+
+    @Test
+    public void itShouldLeaveUnknownEmojisUntouched() {
+        final String input = "The following unicode characters are not a known emoji:\uD83D\uDC01. But this one is: \u27A1!";
+        final Pair<String, List<Emoji>> result = EmojiUtil.filterEmoji(input);
+        assertEmojiEquals("The following unicode characters are not a known emoji:\uD83D\uDC01. But this one is: !", Arrays.asList(Emoji.BLACK_RIGHTWARDS_ARROW), result);
+    }
+
+    private void assertEmojiEquals(final String expectedText, final Collection<Emoji> expectedEmoji, final Pair<String, List<Emoji>> actual) {
+        assertNotNull(actual);
+        final List<Emoji> actualEmoji = actual.getRight();
+        assertEquals(expectedText, actual.getLeft());
+        assertEquals(expectedEmoji.size(), actualEmoji.size());
+        assertEquals(expectedEmoji, actualEmoji);
+    }
+}