From 0a27e6e6aa32c32b419cef81243fc53d6d0b540a Mon Sep 17 00:00:00 2001 From: Rahul Bhargava Date: Fri, 27 Mar 2015 13:36:26 -0400 Subject: [PATCH] changes to support parsing from mediacloud sentences array --- pom.xml | 2 +- .../org/mediameter/cliff/EntityParser.java | 17 ++++- .../org/mediameter/cliff/ParseManager.java | 32 +++++++- .../StanfordNamedEntityExtractor.java | 74 +++++++++++++++++++ .../cliff/servlet/ParseSentencesServlet.java | 67 +++++++++++++++++ .../org/mediameter/cliff/util/MuckUtils.java | 8 +- src/main/webapp/WEB-INF/web.xml | 9 +++ .../cliff/test/EntityParserTest.java | 40 ++++++++++ .../mediameter/cliff/test/MuckUtilsTest.java | 2 +- .../cliff/test/util/TestPlaces.java | 1 + .../mediameter/cliff/test/util/TestUtils.java | 1 - .../story-sentences-278413513.json | 1 + 12 files changed, 244 insertions(+), 10 deletions(-) create mode 100644 src/main/java/org/mediameter/cliff/servlet/ParseSentencesServlet.java create mode 100644 src/test/java/org/mediameter/cliff/test/EntityParserTest.java create mode 100644 src/test/resources/sample-sentence-docs/story-sentences-278413513.json diff --git a/pom.xml b/pom.xml index 69b994a..cb5211d 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ org.mediameter CLIFF war - 1.2.0 + 1.3.0 CLIFF http://cliff.mediameter.org/ diff --git a/src/main/java/org/mediameter/cliff/EntityParser.java b/src/main/java/org/mediameter/cliff/EntityParser.java index 37407c8..550f4d3 100644 --- a/src/main/java/org/mediameter/cliff/EntityParser.java +++ b/src/main/java/org/mediameter/cliff/EntityParser.java @@ -1,6 +1,7 @@ package org.mediameter.cliff; import java.util.List; +import java.util.Map; import org.mediameter.cliff.extractor.ExtractedEntities; import org.mediameter.cliff.extractor.StanfordNamedEntityExtractor; @@ -54,7 +55,21 @@ public ExtractedEntities extractAndResolve(String inputText, boolean manuallyRep logger.debug("extractAndResolve: "+extract+" / "+resolve); return entities; } - + + @SuppressWarnings("rawtypes") + 
public ExtractedEntities extractAndResolveFromSentences(Map[] sentences, boolean manuallyReplaceDemonyms) throws Exception { + logger.trace("input: {}", sentences); + long startTime = System.nanoTime(); + ExtractedEntities extractedEntities = extractor.extractEntitiesFromSentences(sentences,manuallyReplaceDemonyms); + long extract = System.nanoTime() - startTime; + logger.trace("extracted: {}", extractedEntities.getLocations()); + startTime = System.nanoTime(); + ExtractedEntities entities = resolve(extractedEntities); + long resolve = System.nanoTime() - startTime; + logger.debug("extractAndResolve: "+extract+" / "+resolve); + return entities; + } + public ExtractedEntities resolve(ExtractedEntities entities) throws Exception{ // resolve the extracted location names against a diff --git a/src/main/java/org/mediameter/cliff/ParseManager.java b/src/main/java/org/mediameter/cliff/ParseManager.java index 5e7169b..227a4de 100644 --- a/src/main/java/org/mediameter/cliff/ParseManager.java +++ b/src/main/java/org/mediameter/cliff/ParseManager.java @@ -5,6 +5,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; +import java.util.Map; import org.mediameter.cliff.extractor.CliffConfig; import org.mediameter.cliff.extractor.ExtractedEntities; @@ -28,6 +29,7 @@ import com.bericotech.clavin.gazetteer.GeoName; import com.bericotech.clavin.resolver.LocationResolver; import com.bericotech.clavin.resolver.ResolvedLocation; +import com.google.gson.Gson; /** * Singleton-style wrapper around a GeoParser. Call GeoParser.locate(someText) to use this class. 
@@ -39,7 +41,7 @@ public class ParseManager { * Minor: change in json result format * Revision: minor change or bug fix */ - static final String PARSER_VERSION = "1.2.0"; + static final String PARSER_VERSION = "1.3.0"; private static final Logger logger = LoggerFactory.getLogger(ParseManager.class); @@ -96,6 +98,27 @@ public static HashMap parseFromText(String text,boolean manuallyReplaceDemonyms) return results; } + @SuppressWarnings({ "unchecked", "rawtypes" }) + public static HashMap parseFromSentences(String jsonText, boolean manuallyReplaceDemonyms) { + long startTime = System.currentTimeMillis(); + HashMap results = null; + if(jsonText.trim().length()==0){ + return getErrorText("No text"); + } + try { + Gson gson = new Gson(); + Map[] sentences = gson.fromJson(jsonText, Map[].class); + ExtractedEntities entities = extractAndResolveFromSentences(sentences,manuallyReplaceDemonyms); + results = parseFromEntities(entities); + } catch (Exception e) { + results = getErrorText(e.toString()); + } + long endTime = System.currentTimeMillis(); + long elapsedMillis = endTime - startTime; + results.put("milliseconds", elapsedMillis); + return results; + } + @SuppressWarnings({ "unchecked", "rawtypes" }) public static HashMap parseFromNlpJson(String nlpJsonString){ long startTime = System.currentTimeMillis(); @@ -104,7 +127,7 @@ public static HashMap parseFromNlpJson(String nlpJsonString){ return getErrorText("No text"); } try { - ExtractedEntities entities = MuckUtils.entitiesFromJsonString(nlpJsonString); + ExtractedEntities entities = MuckUtils.entitiesFromNlpJsonString(nlpJsonString); entities = getParserInstance().resolve(entities);; results = parseFromEntities(entities); } catch (Exception e) { @@ -259,6 +282,11 @@ public static ExtractedEntities extractAndResolve(String text,boolean manuallyRe return getParserInstance().extractAndResolve(text,manuallyReplaceDemonyms); } + @SuppressWarnings("rawtypes") + public static ExtractedEntities 
extractAndResolveFromSentences(Map[] sentences, boolean manuallyReplaceDemonyms) throws Exception{ + return getParserInstance().extractAndResolveFromSentences(sentences, manuallyReplaceDemonyms); + } + /** * We want all error messages sent to the client to have the same format * @param msg diff --git a/src/main/java/org/mediameter/cliff/extractor/StanfordNamedEntityExtractor.java b/src/main/java/org/mediameter/cliff/extractor/StanfordNamedEntityExtractor.java index 330df33..f72d453 100644 --- a/src/main/java/org/mediameter/cliff/extractor/StanfordNamedEntityExtractor.java +++ b/src/main/java/org/mediameter/cliff/extractor/StanfordNamedEntityExtractor.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.List; +import java.util.Map; import java.util.Properties; import org.mediameter.cliff.places.substitutions.Blacklist; @@ -156,6 +157,79 @@ public ExtractedEntities extractEntities(String textToParse,boolean manuallyRepl return entities; } + + /** + * Get extracted entities from an array of sentence maps. + * + * @param sentences Sentence maps to extract from; each must have "story_sentences_id" and "sentence" entries. 
+ * @param manuallyReplaceDemonyms If true, run demonym replacement on each sentence before NER (can slow down performance quite a bit) + * @return All the entities mentioned + */ + @SuppressWarnings("rawtypes") + public ExtractedEntities extractEntitiesFromSentences(Map[] sentences,boolean manuallyReplaceDemonyms) { + ExtractedEntities entities = new ExtractedEntities(); + + if (sentences==null || sentences.length==0){ + logger.warn("input to extractEntities was null or zero!"); + return entities; + } + + if(manuallyReplaceDemonyms){ // this is a noticeable performance hit + logger.debug("Replacing all demonyms by hand"); + } + + for(Map s:sentences){ + String storySentencesId = s.get("story_sentences_id").toString(); + String text = s.get("sentence").toString(); + if(manuallyReplaceDemonyms){ // this is a noticeable performance hit + text = demonyms.replaceAll(text); + } + // extract entities as (type, start offset, end offset) triples; offsets are relative to this sentence + List<Triple<String, Integer, Integer>> extractedEntities = + namedEntityRecognizer.classifyToCharacterOffsets(text); + if (extractedEntities != null) { + for (Triple<String, Integer, Integer> extractedEntity : extractedEntities) { + String entityName = text.substring(extractedEntity.second(), extractedEntity.third()); + int position = extractedEntity.second(); + switch(extractedEntity.first){ + case "PERSON": + if(personToPlaceSubstitutions.contains(entityName)){ + entities.addLocation( getLocationOccurrence(personToPlaceSubstitutions.getSubstitution(entityName), position) ); + logger.debug("Changed person "+entityName+" to a place"); + } else { + PersonOccurrence person = new PersonOccurrence(entityName, position); + entities.addPerson( person ); + } + break; + case "LOCATION": + if(!locationBlacklist.contains(entityName)){ + LocationOccurrence loc = getLocationOccurrence(entityName, position); + // save the sentence id here + entities.addLocation( new SentenceLocationOccurrence(loc.text, storySentencesId) ); + } else { + logger.debug("Ignored blacklisted location "+entityName); + } + break; + case "ORGANIZATION": + OrganizationOccurrence organization = new OrganizationOccurrence(entityName, position); + 
entities.addOrganization( organization ); + break; + case "MISC": // if you're using the slower 4class model + if (demonyms.contains(entityName)) { + logger.debug("Found and adding a MISC demonym "+entityName); + entities.addLocation( getLocationOccurrence(entityName, position) ); + } + break; + default: + logger.error("Unknown NER type :"+ extractedEntity.first); + } + } + } + } + + return entities; + } + private LocationOccurrence getLocationOccurrence(String entityName, int position){ String fixedName = entityName; if (demonyms.contains(entityName)) { diff --git a/src/main/java/org/mediameter/cliff/servlet/ParseSentencesServlet.java b/src/main/java/org/mediameter/cliff/servlet/ParseSentencesServlet.java new file mode 100644 index 0000000..84fc3c3 --- /dev/null +++ b/src/main/java/org/mediameter/cliff/servlet/ParseSentencesServlet.java @@ -0,0 +1,67 @@ +package org.mediameter.cliff.servlet; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; + +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.mediameter.cliff.ParseManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; + +/** + * Wraps the CLAVIN geoparser behind some ports so we can integrate it into other workflows. 
+ * + * @author rahulb + */ +public class ParseSentencesServlet extends HttpServlet{ + + private static final Logger logger = LoggerFactory.getLogger(ParseSentencesServlet.class); + + private static final Gson gson = new Gson(); + + public ParseSentencesServlet() { + } + + @Override + protected void doPost(HttpServletRequest request, HttpServletResponse response) throws IOException{ + doGet(request,response); + } + + @Override + @SuppressWarnings("rawtypes") + protected void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException{ + + logger.info("Sentences Parse Request from "+request.getRemoteAddr()); + request.setCharacterEncoding(StandardCharsets.UTF_8.name()); + response.setContentType("application/json;charset=UTF-8"); + response.setCharacterEncoding(StandardCharsets.UTF_8.name()); + + HashMap results = null; + String text = request.getParameter("q"); + String replaceAllDemonymsStr = request.getParameter("replaceAllDemonyms"); + boolean manuallyReplaceDemonyms = (replaceAllDemonymsStr==null) ? 
false : Boolean.parseBoolean(replaceAllDemonymsStr); + logger.debug("q="+text); + logger.debug("replaceAllDemonyms="+manuallyReplaceDemonyms); + + if(text==null){ + response.sendError(HttpServletResponse.SC_BAD_REQUEST); + } else { + try { + results = ParseManager.parseFromSentences(text,manuallyReplaceDemonyms); + } catch(Exception e){ // try to give the user something useful + logger.error(e.toString()); + results = ParseManager.getErrorText(e.toString()); + } + String jsonResults = gson.toJson(results); + logger.info(jsonResults); + response.getWriter().write(jsonResults); + } + } + +} diff --git a/src/main/java/org/mediameter/cliff/util/MuckUtils.java b/src/main/java/org/mediameter/cliff/util/MuckUtils.java index ee1f8e3..00fdb09 100644 --- a/src/main/java/org/mediameter/cliff/util/MuckUtils.java +++ b/src/main/java/org/mediameter/cliff/util/MuckUtils.java @@ -14,9 +14,9 @@ @SuppressWarnings({ "rawtypes", "unchecked" }) public class MuckUtils { - public static ExtractedEntities entitiesFromJsonString(String nlpJsonString){ + public static ExtractedEntities entitiesFromNlpJsonString(String nlpJsonString){ Map sentences = sentencesFromJsonString(nlpJsonString); - return entitiesFromSentenceMap(sentences); + return entitiesFromNlpSentenceMap(sentences); } public static Map sentencesFromJsonString(String nlpJsonString) { @@ -26,9 +26,9 @@ public static Map sentencesFromJsonString(String nlpJsonString) { } /** - * I've overloaded "position" in each of the occurrences to be sentenceIndex + * */ - private static ExtractedEntities entitiesFromSentenceMap(Map mcSentences){ + private static ExtractedEntities entitiesFromNlpSentenceMap(Map mcSentences){ ExtractedEntities entities = new ExtractedEntities(); Iterator it = mcSentences.entrySet().iterator(); while (it.hasNext()) { diff --git a/src/main/webapp/WEB-INF/web.xml b/src/main/webapp/WEB-INF/web.xml index 24e23ae..859ab58 100755 --- a/src/main/webapp/WEB-INF/web.xml +++ b/src/main/webapp/WEB-INF/web.xml @@ -23,6 
+23,15 @@ /parse/json + + ParseSentencesServlet + org.mediameter.cliff.servlet.ParseSentencesServlet + + + ParseSentencesServlet + /parse/sentences + + GeonamesLookupServlet org.mediameter.cliff.servlet.GeonamesLookupServlet diff --git a/src/test/java/org/mediameter/cliff/test/EntityParserTest.java b/src/test/java/org/mediameter/cliff/test/EntityParserTest.java new file mode 100644 index 0000000..079fa34 --- /dev/null +++ b/src/test/java/org/mediameter/cliff/test/EntityParserTest.java @@ -0,0 +1,40 @@ +package org.mediameter.cliff.test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.util.List; +import java.util.Map; + +import org.apache.commons.io.FileUtils; +import org.junit.Test; +import org.mediameter.cliff.ParseManager; +import org.mediameter.cliff.extractor.ExtractedEntities; +import org.mediameter.cliff.extractor.SentenceLocationOccurrence; +import org.mediameter.cliff.test.util.TestPlaces; + +import com.bericotech.clavin.resolver.ResolvedLocation; +import com.google.gson.Gson; + +public class EntityParserTest { + + @Test + @SuppressWarnings("rawtypes") + public void extractAndResolveFromSentences() throws Exception { + String fileName = "story-sentences-278413513.json"; + File file = new File("src/test/resources/sample-sentence-docs/"+fileName); + String jsonText = FileUtils.readFileToString(file, "UTF-8"); + Gson gson = new Gson(); + Map[] sentences = gson.fromJson(jsonText, Map[].class); + ExtractedEntities entities = ParseManager.extractAndResolveFromSentences(sentences, false); + List<ResolvedLocation> locations = entities.getResolvedLocations(); + assertEquals(1,locations.size()); + ResolvedLocation loc = locations.get(0); + assertEquals(TestPlaces.RIKERS_ISLAND,loc.geoname.geonameID); + assertTrue(loc.location instanceof SentenceLocationOccurrence); + SentenceLocationOccurrence sentenceLoc = (SentenceLocationOccurrence) loc.location; + assertEquals("3279940188",sentenceLoc.storySentenceId); + } + 
+} diff --git a/src/test/java/org/mediameter/cliff/test/MuckUtilsTest.java b/src/test/java/org/mediameter/cliff/test/MuckUtilsTest.java index 720139c..06a4544 100644 --- a/src/test/java/org/mediameter/cliff/test/MuckUtilsTest.java +++ b/src/test/java/org/mediameter/cliff/test/MuckUtilsTest.java @@ -23,7 +23,7 @@ public void testStory1() throws IOException { File file = new File("src/test/resources/sample-muck-json/"+fileName); String json = FileUtils.readFileToString(file); - ExtractedEntities entities = MuckUtils.entitiesFromJsonString(json); + ExtractedEntities entities = MuckUtils.entitiesFromNlpJsonString(json); assertEquals("Wrong number of location occurrences", 19, entities.getLocations().size()); assertEquals("Wrong number of people occurrences", 15, entities.getPeople().size()); assertEquals("Wrong number of organization occurrences", 4, entities.getOrganizations().size()); diff --git a/src/test/java/org/mediameter/cliff/test/util/TestPlaces.java b/src/test/java/org/mediameter/cliff/test/util/TestPlaces.java index 386410b..1dcf313 100644 --- a/src/test/java/org/mediameter/cliff/test/util/TestPlaces.java +++ b/src/test/java/org/mediameter/cliff/test/util/TestPlaces.java @@ -27,5 +27,6 @@ public class TestPlaces { public static final int CITY_BANGOR = 4957280; public static final int COUNTRY_RUSSIA = 2017370; public static final int STATE_OKLAHOMA = 4544379; + public static final int RIKERS_ISLAND = 5133874; } diff --git a/src/test/java/org/mediameter/cliff/test/util/TestUtils.java b/src/test/java/org/mediameter/cliff/test/util/TestUtils.java index b73716b..024c946 100644 --- a/src/test/java/org/mediameter/cliff/test/util/TestUtils.java +++ b/src/test/java/org/mediameter/cliff/test/util/TestUtils.java @@ -14,7 +14,6 @@ import org.slf4j.Logger; import com.bericotech.clavin.gazetteer.CountryCode; -import com.bericotech.clavin.gazetteer.GeoName; import com.bericotech.clavin.resolver.ResolvedLocation; import com.bericotech.clavin.util.TextUtils; import 
com.google.gson.Gson; diff --git a/src/test/resources/sample-sentence-docs/story-sentences-278413513.json b/src/test/resources/sample-sentence-docs/story-sentences-278413513.json new file mode 100644 index 0000000..58d21b5 --- /dev/null +++ b/src/test/resources/sample-sentence-docs/story-sentences-278413513.json @@ -0,0 +1 @@ +[{"sentence":"City officials had no comment on Tuesday.","language":"en","sentence_number":0,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940186"},{"sentence":"On Monday, Mayor Bill de Blasio said at a news conference that the city would be unveiling reforms over time, particularly for the youngest inmates and for those with mental health issues.","language":"en","sentence_number":1,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940187"},{"sentence":"“The status quo at Rikers is unacceptable,” he said.","language":"en","sentence_number":2,"tags":["8878953"],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:14.25158-05","story_sentences_id":"3279940188"},{"sentence":"“As we outlined in the report that we made public some 56 days ago,” Mr. Bharara said, “there’s a problem of accountability and of culture there that rivals all the problems that I’ve been talking about in other areas as well.","language":"en","sentence_number":3,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940189"},{"sentence":"Mr. 
Bharara noted that when his office first issued its report, he felt there was “a lot of reason for optimism.”","language":"en","sentence_number":4,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940190"},{"sentence":"He said on Tuesday that he had talked with the new correction commissioner, Joseph Ponte, and the corporation counsel’s office, and that Mr. de Blasio had also said “very positive things.”","language":"en","sentence_number":5,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940191"},{"sentence":"Mr. Bharara twice referred to the continuing talks with the city.","language":"en","sentence_number":6,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940192"}] \ No newline at end of file