diff --git a/pom.xml b/pom.xml
index 69b994a..cb5211d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
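<groupId>org.mediameter</groupId>
<artifactId>CLIFF</artifactId>
<packaging>war</packaging>
- <version>1.2.0</version>
+ <version>1.3.0</version>
<name>CLIFF</name>
<url>http://cliff.mediameter.org/</url>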
diff --git a/src/main/java/org/mediameter/cliff/EntityParser.java b/src/main/java/org/mediameter/cliff/EntityParser.java
index 37407c8..550f4d3 100644
--- a/src/main/java/org/mediameter/cliff/EntityParser.java
+++ b/src/main/java/org/mediameter/cliff/EntityParser.java
@@ -1,6 +1,7 @@
package org.mediameter.cliff;
import java.util.List;
+import java.util.Map;
import org.mediameter.cliff.extractor.ExtractedEntities;
import org.mediameter.cliff.extractor.StanfordNamedEntityExtractor;
@@ -54,7 +55,21 @@ public ExtractedEntities extractAndResolve(String inputText, boolean manuallyRep
logger.debug("extractAndResolve: "+extract+" / "+resolve);
return entities;
}
-
+
+ /**
+  * Extracts entities from pre-split sentences and then resolves them, logging how
+  * long each of the two phases took (in nanoseconds).
+  *
+  * @param sentences sentence maps, handed unchanged to the extractor
+  * @param manuallyReplaceDemonyms handed unchanged to the extractor
+  * @return the result of calling {@code resolve} on the extracted entities
+  * @throws Exception propagated from extraction or resolution
+  */
+ @SuppressWarnings("rawtypes")
+ public ExtractedEntities extractAndResolveFromSentences(Map[] sentences, boolean manuallyReplaceDemonyms) throws Exception {
+     // Cast to Object on purpose: a bare array argument binds to the Object... overload,
+     // which would treat each sentence as a separate format argument and log only the first.
+     logger.trace("input: {}", (Object) sentences);
+
+     long startTime = System.nanoTime();
+     ExtractedEntities extractedEntities = extractor.extractEntitiesFromSentences(sentences, manuallyReplaceDemonyms);
+     long extractNanos = System.nanoTime() - startTime;
+     logger.trace("extracted: {}", extractedEntities.getLocations());
+
+     startTime = System.nanoTime();
+     ExtractedEntities entities = resolve(extractedEntities);
+     long resolveNanos = System.nanoTime() - startTime;
+
+     // Same message text as before, but only formatted when debug logging is enabled.
+     logger.debug("extractAndResolve: {} / {}", extractNanos, resolveNanos);
+     return entities;
+ }
+
public ExtractedEntities resolve(ExtractedEntities entities) throws Exception{
// resolve the extracted location names against a
diff --git a/src/main/java/org/mediameter/cliff/ParseManager.java b/src/main/java/org/mediameter/cliff/ParseManager.java
index 5e7169b..227a4de 100644
--- a/src/main/java/org/mediameter/cliff/ParseManager.java
+++ b/src/main/java/org/mediameter/cliff/ParseManager.java
@@ -5,6 +5,7 @@
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import org.mediameter.cliff.extractor.CliffConfig;
import org.mediameter.cliff.extractor.ExtractedEntities;
@@ -28,6 +29,7 @@
import com.bericotech.clavin.gazetteer.GeoName;
import com.bericotech.clavin.resolver.LocationResolver;
import com.bericotech.clavin.resolver.ResolvedLocation;
+import com.google.gson.Gson;
/**
* Singleton-style wrapper around a GeoParser. Call GeoParser.locate(someText) to use this class.
@@ -39,7 +41,7 @@ public class ParseManager {
* Minor: change in json result format
* Revision: minor change or bug fix
*/
- static final String PARSER_VERSION = "1.2.0";
+ static final String PARSER_VERSION = "1.3.0";
private static final Logger logger = LoggerFactory.getLogger(ParseManager.class);
@@ -96,6 +98,27 @@ public static HashMap parseFromText(String text,boolean manuallyReplaceDemonyms)
return results;
}
+ /**
+  * Parses a JSON array of sentence objects, extracts and resolves the entities they
+  * mention, and packages the outcome via {@code parseFromEntities}.
+  *
+  * @param jsonText JSON text that is deserialized into a {@code Map[]} (one map per sentence)
+  * @param manuallyReplaceDemonyms passed through to entity extraction
+  * @return the "No text" error map when the input is null or blank; otherwise the parse
+  *         results (or an error map built from the exception) with a "milliseconds" entry added
+  */
+ @SuppressWarnings({ "unchecked", "rawtypes" })
+ public static HashMap parseFromSentences(String jsonText, boolean manuallyReplaceDemonyms) {
+     long startTime = System.currentTimeMillis();
+     // Treat null like empty input instead of failing with a NullPointerException.
+     if (jsonText == null || jsonText.trim().length() == 0) {
+         return getErrorText("No text");
+     }
+     HashMap results;
+     try {
+         Gson gson = new Gson();
+         Map[] sentences = gson.fromJson(jsonText, Map[].class);
+         ExtractedEntities entities = extractAndResolveFromSentences(sentences, manuallyReplaceDemonyms);
+         results = parseFromEntities(entities);
+     } catch (Exception e) {
+         // Keep the stack trace in the server log; the client only receives e.toString().
+         logger.warn("Unable to parse sentences", e);
+         results = getErrorText(e.toString());
+     }
+     long elapsedMillis = System.currentTimeMillis() - startTime;
+     results.put("milliseconds", elapsedMillis);
+     return results;
+ }
+
@SuppressWarnings({ "unchecked", "rawtypes" })
public static HashMap parseFromNlpJson(String nlpJsonString){
long startTime = System.currentTimeMillis();
@@ -104,7 +127,7 @@ public static HashMap parseFromNlpJson(String nlpJsonString){
return getErrorText("No text");
}
try {
- ExtractedEntities entities = MuckUtils.entitiesFromJsonString(nlpJsonString);
+ ExtractedEntities entities = MuckUtils.entitiesFromNlpJsonString(nlpJsonString);
entities = getParserInstance().resolve(entities);;
results = parseFromEntities(entities);
} catch (Exception e) {
@@ -259,6 +282,11 @@ public static ExtractedEntities extractAndResolve(String text,boolean manuallyRe
return getParserInstance().extractAndResolve(text,manuallyReplaceDemonyms);
}
+ /**
+  * Static convenience entry point: forwards the sentences to the parser instance
+  * returned by {@code getParserInstance()}.
+  *
+  * @param sentences sentence maps, forwarded unchanged
+  * @param manuallyReplaceDemonyms forwarded unchanged
+  * @return whatever the parser instance returns for these sentences
+  * @throws Exception propagated from the parser instance
+  */
+ @SuppressWarnings("rawtypes")
+ public static ExtractedEntities extractAndResolveFromSentences(Map[] sentences, boolean manuallyReplaceDemonyms) throws Exception{
+     final ExtractedEntities resolved =
+             getParserInstance().extractAndResolveFromSentences(sentences, manuallyReplaceDemonyms);
+     return resolved;
+ }
+
/**
* We want all error messages sent to the client to have the same format
* @param msg
diff --git a/src/main/java/org/mediameter/cliff/extractor/StanfordNamedEntityExtractor.java b/src/main/java/org/mediameter/cliff/extractor/StanfordNamedEntityExtractor.java
index 330df33..f72d453 100644
--- a/src/main/java/org/mediameter/cliff/extractor/StanfordNamedEntityExtractor.java
+++ b/src/main/java/org/mediameter/cliff/extractor/StanfordNamedEntityExtractor.java
@@ -3,6 +3,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
+import java.util.Map;
import java.util.Properties;
import org.mediameter.cliff.places.substitutions.Blacklist;
@@ -156,6 +157,79 @@ public ExtractedEntities extractEntities(String textToParse,boolean manuallyRepl
return entities;
}
+
+ /**
+  * Get extracted people, organizations and locations from a batch of pre-split sentences.
+  * Each sentence is classified on its own, so entity positions are character offsets
+  * within that sentence's (possibly demonym-replaced) text.
+  *
+  * @param sentences One map per sentence; each must contain the keys
+  *        "story_sentences_id" and "sentence" (a missing key raises a NullPointerException)
+  * @param manuallyReplaceDemonyms Can slow down performance quite a bit
+  * @return All the entities mentioned; empty when sentences is null or has no elements
+  */
+ @SuppressWarnings("rawtypes")
+ public ExtractedEntities extractEntitiesFromSentences(Map[] sentences,boolean manuallyReplaceDemonyms) {
+     ExtractedEntities entities = new ExtractedEntities();
+
+     // The warning has always said "null or zero", so actually guard against null too.
+     if (sentences == null || sentences.length==0){
+         logger.warn("input to extractEntities was null or zero!");
+         return entities;
+     }
+
+     if(manuallyReplaceDemonyms){ // this is a noticeable performance hit
+         logger.debug("Replacing all demonyms by hand");
+     }
+
+     for(Map s:sentences){
+         // NOTE(review): if the ids arrive as JSON numbers rather than strings, an untyped
+         // Map deserialization may hand back Doubles and toString() would not round-trip
+         // the id - confirm callers always send string ids.
+         String storySentencesId = s.get("story_sentences_id").toString();
+         String text = s.get("sentence").toString();
+         if(manuallyReplaceDemonyms){ // this is a noticeable performance hit
+             text = demonyms.replaceAll(text);
+         }
+         // extract entities as <Entity Type, Start Index, Stop Index>
+         List<Triple<String, Integer, Integer>> extractedEntities =
+                 namedEntityRecognizer.classifyToCharacterOffsets(text);
+         if (extractedEntities == null) {
+             continue;
+         }
+         for (Triple<String, Integer, Integer> extractedEntity : extractedEntities) {
+             String entityName = text.substring(extractedEntity.second(), extractedEntity.third());
+             int position = extractedEntity.second();
+             switch(extractedEntity.first){
+             case "PERSON":
+                 if(personToPlaceSubstitutions.contains(entityName)){
+                     // NOTE(review): this location is not tagged with the sentence id,
+                     // unlike the LOCATION case below - confirm that is intended.
+                     entities.addLocation( getLocationOccurrence(personToPlaceSubstitutions.getSubstitution(entityName), position) );
+                     logger.debug("Changed person "+entityName+" to a place");
+                 } else {
+                     PersonOccurrence person = new PersonOccurrence(entityName, position);
+                     entities.addPerson( person );
+                 }
+                 break;
+             case "LOCATION":
+                 if(!locationBlacklist.contains(entityName)){
+                     LocationOccurrence loc = getLocationOccurrence(entityName, position);
+                     // save the sentence id here (the character position is not carried over)
+                     entities.addLocation( new SentenceLocationOccurrence(loc.text, storySentencesId) );
+                 } else {
+                     logger.debug("Ignored blacklisted location "+entityName);
+                 }
+                 break;
+             case "ORGANIZATION":
+                 OrganizationOccurrence organization = new OrganizationOccurrence(entityName, position);
+                 entities.addOrganization( organization );
+                 break;
+             case "MISC": // if you're using the slower 4class model
+                 if (demonyms.contains(entityName)) {
+                     logger.debug("Found and adding a MISC demonym "+entityName);
+                     entities.addLocation( getLocationOccurrence(entityName, position) );
+                 }
+                 break;
+             default:
+                 logger.error("Unknown NER type :"+ extractedEntity.first);
+             }
+         }
+     }
+
+     return entities;
+ }
+
private LocationOccurrence getLocationOccurrence(String entityName, int position){
String fixedName = entityName;
if (demonyms.contains(entityName)) {
diff --git a/src/main/java/org/mediameter/cliff/servlet/ParseSentencesServlet.java b/src/main/java/org/mediameter/cliff/servlet/ParseSentencesServlet.java
new file mode 100644
index 0000000..84fc3c3
--- /dev/null
+++ b/src/main/java/org/mediameter/cliff/servlet/ParseSentencesServlet.java
@@ -0,0 +1,67 @@
+package org.mediameter.cliff.servlet;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.mediameter.cliff.ParseManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+
+/**
+ * Wraps the CLAVIN geoparser behind some ports so we can integrate it into other workflows.
+ *
+ * @author rahulb
+ */
+public class ParseSentencesServlet extends HttpServlet{
+
+ private static final Logger logger = LoggerFactory.getLogger(ParseSentencesServlet.class);
+
+ private static Gson gson = new Gson();
+
+ public ParseSentencesServlet() {
+ }
+
+ @Override
+ protected void doPost(HttpServletRequest request, HttpServletResponse response) throws IOException{
+ doGet(request,response);
+ }
+
+ @Override
+ @SuppressWarnings("rawtypes")
+ protected void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException{
+
+ logger.info("Sentences Parse Request from "+request.getRemoteAddr());
+ request.setCharacterEncoding(StandardCharsets.UTF_8.name());
+ response.setContentType("application/json;charset=UTF=8");
+ response.setCharacterEncoding(StandardCharsets.UTF_8.name());
+
+ HashMap results = null;
+ String text = request.getParameter("q");
+ String replaceAllDemonymsStr = request.getParameter("replaceAllDemonyms");
+ boolean manuallyReplaceDemonyms = (replaceAllDemonymsStr==null) ? false : Boolean.parseBoolean(replaceAllDemonymsStr);
+ logger.debug("q="+text);
+ logger.debug("replaceAllDemonyms="+manuallyReplaceDemonyms);
+
+ if(text==null){
+ response.sendError(HttpServletResponse.SC_BAD_REQUEST);
+ } else {
+ try {
+ results = ParseManager.parseFromSentences(text,manuallyReplaceDemonyms);
+ } catch(Exception e){ // try to give the user something useful
+ logger.error(e.toString());
+ results = ParseManager.getErrorText(e.toString());
+ }
+ String jsonResults = gson.toJson(results);
+ logger.info(jsonResults);
+ response.getWriter().write(jsonResults);
+ }
+ }
+
+}
diff --git a/src/main/java/org/mediameter/cliff/util/MuckUtils.java b/src/main/java/org/mediameter/cliff/util/MuckUtils.java
index ee1f8e3..00fdb09 100644
--- a/src/main/java/org/mediameter/cliff/util/MuckUtils.java
+++ b/src/main/java/org/mediameter/cliff/util/MuckUtils.java
@@ -14,9 +14,9 @@
@SuppressWarnings({ "rawtypes", "unchecked" })
public class MuckUtils {
- public static ExtractedEntities entitiesFromJsonString(String nlpJsonString){
+ public static ExtractedEntities entitiesFromNlpJsonString(String nlpJsonString){
Map sentences = sentencesFromJsonString(nlpJsonString);
- return entitiesFromSentenceMap(sentences);
+ return entitiesFromNlpSentenceMap(sentences);
}
public static Map sentencesFromJsonString(String nlpJsonString) {
@@ -26,9 +26,9 @@ public static Map sentencesFromJsonString(String nlpJsonString) {
}
/**
- * I've overloaded "position" in each of the occurrences to be sentenceIndex
+ *
*/
- private static ExtractedEntities entitiesFromSentenceMap(Map mcSentences){
+ private static ExtractedEntities entitiesFromNlpSentenceMap(Map mcSentences){
ExtractedEntities entities = new ExtractedEntities();
Iterator it = mcSentences.entrySet().iterator();
while (it.hasNext()) {
diff --git a/src/main/webapp/WEB-INF/web.xml b/src/main/webapp/WEB-INF/web.xml
index 24e23ae..859ab58 100755
--- a/src/main/webapp/WEB-INF/web.xml
+++ b/src/main/webapp/WEB-INF/web.xml
@@ -23,6 +23,15 @@
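<url-pattern>/parse/json</url-pattern>
+ <servlet>
+ <servlet-name>ParseSentencesServlet</servlet-name>
+ <servlet-class>org.mediameter.cliff.servlet.ParseSentencesServlet</servlet-class>
+ </servlet>
+ <servlet-mapping>
+ <servlet-name>ParseSentencesServlet</servlet-name>
+ <url-pattern>/parse/sentences</url-pattern>
+ </servlet-mapping>
+
<servlet-name>GeonamesLookupServlet</servlet-name>
<servlet-class>org.mediameter.cliff.servlet.GeonamesLookupServlet</servlet-class>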
diff --git a/src/test/java/org/mediameter/cliff/test/EntityParserTest.java b/src/test/java/org/mediameter/cliff/test/EntityParserTest.java
new file mode 100644
index 0000000..079fa34
--- /dev/null
+++ b/src/test/java/org/mediameter/cliff/test/EntityParserTest.java
@@ -0,0 +1,40 @@
+package org.mediameter.cliff.test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.FileUtils;
+import org.junit.Test;
+import org.mediameter.cliff.ParseManager;
+import org.mediameter.cliff.extractor.ExtractedEntities;
+import org.mediameter.cliff.extractor.SentenceLocationOccurrence;
+import org.mediameter.cliff.test.util.TestPlaces;
+
+import com.bericotech.clavin.resolver.ResolvedLocation;
+import com.google.gson.Gson;
+
+/**
+ * Checks that sentence-based parsing resolves locations and keeps track of which
+ * sentence each location came from.
+ */
+public class EntityParserTest {
+
+    /**
+     * Parses the sample story's sentences and expects exactly one resolved location,
+     * Rikers Island, tagged with the id of the sentence that mentions it.
+     */
+    @Test
+    @SuppressWarnings("rawtypes")
+    public void extractAndResolveFromSentences() throws Exception {
+        String fileName = "story-sentences-278413513.json";
+        File file = new File("src/test/resources/sample-sentence-docs/"+fileName);
+        // the fixture contains non-ASCII quotes, so do not rely on the platform charset
+        String jsonText = FileUtils.readFileToString(file, "UTF-8");
+        Gson gson = new Gson();
+        Map[] sentences = gson.fromJson(jsonText, Map[].class);
+        ExtractedEntities entities = ParseManager.extractAndResolveFromSentences(sentences, false);
+        List<ResolvedLocation> locations = entities.getResolvedLocations();
+        // JUnit convention: expected value first, so failure messages read correctly
+        assertEquals(1, locations.size());
+        ResolvedLocation loc = locations.get(0);
+        assertEquals(TestPlaces.RIKERS_ISLAND, loc.geoname.geonameID);
+        assertTrue(loc.location instanceof SentenceLocationOccurrence);
+        SentenceLocationOccurrence sentenceLoc = (SentenceLocationOccurrence) loc.location;
+        assertEquals("3279940188", sentenceLoc.storySentenceId);
+    }
+
+}
diff --git a/src/test/java/org/mediameter/cliff/test/MuckUtilsTest.java b/src/test/java/org/mediameter/cliff/test/MuckUtilsTest.java
index 720139c..06a4544 100644
--- a/src/test/java/org/mediameter/cliff/test/MuckUtilsTest.java
+++ b/src/test/java/org/mediameter/cliff/test/MuckUtilsTest.java
@@ -23,7 +23,7 @@ public void testStory1() throws IOException {
File file = new File("src/test/resources/sample-muck-json/"+fileName);
String json = FileUtils.readFileToString(file);
- ExtractedEntities entities = MuckUtils.entitiesFromJsonString(json);
+ ExtractedEntities entities = MuckUtils.entitiesFromNlpJsonString(json);
assertEquals("Wrong number of location occurrences", 19, entities.getLocations().size());
assertEquals("Wrong number of people occurrences", 15, entities.getPeople().size());
assertEquals("Wrong number of organization occurrences", 4, entities.getOrganizations().size());
diff --git a/src/test/java/org/mediameter/cliff/test/util/TestPlaces.java b/src/test/java/org/mediameter/cliff/test/util/TestPlaces.java
index 386410b..1dcf313 100644
--- a/src/test/java/org/mediameter/cliff/test/util/TestPlaces.java
+++ b/src/test/java/org/mediameter/cliff/test/util/TestPlaces.java
@@ -27,5 +27,6 @@ public class TestPlaces {
public static final int CITY_BANGOR = 4957280;
public static final int COUNTRY_RUSSIA = 2017370;
public static final int STATE_OKLAHOMA = 4544379;
+ public static final int RIKERS_ISLAND = 5133874;
}
diff --git a/src/test/java/org/mediameter/cliff/test/util/TestUtils.java b/src/test/java/org/mediameter/cliff/test/util/TestUtils.java
index b73716b..024c946 100644
--- a/src/test/java/org/mediameter/cliff/test/util/TestUtils.java
+++ b/src/test/java/org/mediameter/cliff/test/util/TestUtils.java
@@ -14,7 +14,6 @@
import org.slf4j.Logger;
import com.bericotech.clavin.gazetteer.CountryCode;
-import com.bericotech.clavin.gazetteer.GeoName;
import com.bericotech.clavin.resolver.ResolvedLocation;
import com.bericotech.clavin.util.TextUtils;
import com.google.gson.Gson;
diff --git a/src/test/resources/sample-sentence-docs/story-sentences-278413513.json b/src/test/resources/sample-sentence-docs/story-sentences-278413513.json
new file mode 100644
index 0000000..58d21b5
--- /dev/null
+++ b/src/test/resources/sample-sentence-docs/story-sentences-278413513.json
@@ -0,0 +1 @@
+[{"sentence":"City officials had no comment on Tuesday.","language":"en","sentence_number":0,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940186"},{"sentence":"On Monday, Mayor Bill de Blasio said at a news conference that the city would be unveiling reforms over time, particularly for the youngest inmates and for those with mental health issues.","language":"en","sentence_number":1,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940187"},{"sentence":"“The status quo at Rikers is unacceptable,” he said.","language":"en","sentence_number":2,"tags":["8878953"],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:14.25158-05","story_sentences_id":"3279940188"},{"sentence":"“As we outlined in the report that we made public some 56 days ago,” Mr. Bharara said, “there’s a problem of accountability and of culture there that rivals all the problems that I’ve been talking about in other areas as well.","language":"en","sentence_number":3,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940189"},{"sentence":"Mr. Bharara noted that when his office first issued its report, he felt there was “a lot of reason for optimism.”","language":"en","sentence_number":4,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940190"},{"sentence":"He said on Tuesday that he had talked with the new correction commissioner, Joseph Ponte, and the corporation counsel’s office, and that Mr. 
de Blasio had also said “very positive things.”","language":"en","sentence_number":5,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940191"},{"sentence":"Mr. Bharara twice referred to the continuing talks with the city.","language":"en","sentence_number":6,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940192"}]
\ No newline at end of file