brendano · spomerville · Oct 21, 2015 · Nov 10, 2015 · Nov 10, 2015
diff --git a/ark-tweet-nlp/pom.xml b/ark-tweet-nlp/pom.xml
@@ -10,6 +10,7 @@
     <properties>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     </properties>
+
     <build>
 		<plugins>
              <plugin>
@@ -27,17 +28,30 @@
                       <sources>
                           <source>${basedir}/../src</source>
                       </sources>
+
                     </configuration>
                   </execution>
+                  <execution>
+                        <id>add-test-source</id>
+                        <phase>generate-test-sources</phase>
+                        <goals>
+                            <goal>add-test-source</goal>
+                        </goals>
+                        <configuration>
+                            <sources>
+                                <source>${basedir}/../test</source>
+                            </sources>
+                        </configuration>
+                    </execution>
                 </executions>
-            </plugin>		
+            </plugin>
 			<plugin>
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-shade-plugin</artifactId>
 				<version>1.6</version>
 				<executions>
 					<execution>
-						<phase>package</phase>
+						<phase>none</phase>
 						<goals>
 							<goal>shade</goal>
 						</goals>
@@ -50,7 +64,7 @@
 						</configuration>
 					</execution>
 				</executions>
-			</plugin>	
+			</plugin>
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-compiler-plugin</artifactId>
@@ -108,11 +122,11 @@
 										<include>**/*.jar</include>
 									</includes>
 								</JarResource>
-							</resources>						
+							</resources>
 						</configuration>
 					</execution>
 				</executions>
-			</plugin>			
+			</plugin>
         </plugins>
     </build>
     <repositories>
@@ -133,9 +147,9 @@
             <version>1.4</version>
         </dependency>
 		<dependency>
-			<groupId>commons-lang</groupId>
-			<artifactId>commons-lang</artifactId>
-			<version>2.3</version>
+			<groupId>org.apache.commons</groupId>
+			<artifactId>commons-lang3</artifactId>
+			<version>3.4</version>
 		</dependency>
         <dependency>
             <groupId>org.apache.commons</groupId>
@@ -162,6 +176,13 @@
             <artifactId>guava</artifactId>
             <version>10.0.1</version>
         </dependency>
+
+        <dependency>
+            <groupId>org.pirkaengine</groupId>
+            <artifactId>pirka-mobile</artifactId>
+            <version>0.3.0</version>
+        </dependency>
+
         <!-- START testing dependecies -->
         <dependency>
             <groupId>junit</groupId>
@@ -181,7 +202,23 @@
 		  <artifactId>jackson-databind</artifactId>
 		  <version>2.0.0</version>
 		</dependency>
-
+
+        <dependency>
+            <groupId>edu.stanford.nlp</groupId>
+            <artifactId>stanford-corenlp</artifactId>
+            <version>1.2.0</version>
+        </dependency>
+        <dependency>
+            <groupId>edu.berkeley.nlp</groupId>
+            <artifactId>berkeleyparser</artifactId>
+            <version>r32</version>
+        </dependency>
+        <dependency>
+            <groupId>net.sf.trove4j</groupId>
+            <artifactId>trove4j</artifactId>
+            <version>3.0.3</version>
+        </dependency>
+
         <!-- END testing dependecies -->
     </dependencies>
 </project>
diff --git a/src/cmu/arktweetnlp/EmojiExtractor.java b/src/cmu/arktweetnlp/EmojiExtractor.java
@@ -0,0 +1,13 @@
+package cmu.arktweetnlp;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.pirkaengine.mobile.Emoji;
+import java.util.List;
+
+
+/** Interface for objects that know how to extract emojis from text. */
+public interface EmojiExtractor {
+    /** Extracts emojis from {@code text}; returns a String paired with the list of emojis.
+     *  NOTE(review): presumably the String is the text left after extraction - confirm with implementers. */
+    public Pair<String, List<Emoji>> extractEmojis(final String text);
+}
diff --git a/src/cmu/arktweetnlp/PatternContext.java b/src/cmu/arktweetnlp/PatternContext.java
@@ -0,0 +1,44 @@
+package cmu.arktweetnlp;
+
+import java.util.regex.Pattern;
+
+/**
+ * Interface for the collection of Patterns (plus two String helpers) needed by the Twokenize module.
+ * Allows users to pass in a custom set of patterns or use the DefaultPatternContext bundled with the library.
+ */
+public interface PatternContext {
+    /**
+     * @return A pattern that can be used to detect contractions
+     */
+    public Pattern getContractionPattern();
+
+    /**
+     * @return A pattern that can be used to detect whitespace
+     */
+    public Pattern getWhitespacePattern();
+
+    /**
+     * @return A pattern that can be used to detect any desired
+     * "protected" tokens -- tokens that should not be split any further.
+     */
+    public Pattern getProtectedTokenPattern();
+
+    /**
+     * @return A pattern that can be used to detect left edge punctuation
+     */
+    public Pattern getLeftEdgePunctuationPattern();
+
+    /**
+     * @return A pattern that can be used to detect right edge punctuation
+     */
+    public Pattern getRightEdgePunctuationPattern();
+
+    /** NOTE(review): presumably splits edge punctuation off tokens in {@code input}; output format unspecified here - confirm. */
+    public String splitEdgePunctuation(String input);
+
+    /**
+     * Trims multiple consecutive white spaces into a single space.
+     * @return the squeezed input, e.g. "foo   bar " => "foo bar"
+     */
+    public String squeezeWhitespace(String input);
+}
diff --git a/src/cmu/arktweetnlp/TokenCategorizer.java b/src/cmu/arktweetnlp/TokenCategorizer.java
@@ -0,0 +1,15 @@
+package cmu.arktweetnlp;
+
+
+import org.pirkaengine.mobile.Emoji;
+import java.util.*;
+
+/**
+ * Groups tokens output from Twokenize into categories of a given type, e.g. mapping certain token types to an enum.
+ * @param <T> the category type; it is the key type of the map returned by {@code categorize}
+ */
+public interface TokenCategorizer<T> {
+    /** Returns a map from category to a list of Strings. NOTE(review): the roles of the four inputs are only implied by their names - confirm with implementers. */
+    public Map<T, List<String>> categorize(final String text, final List<List<String>> splitTokens, final List<List<String>> protectedTokens, final List<Emoji> emojis);
+}
+