
changes to support parsing from mediacloud sentences array
rahulbot committed Mar 27, 2015
1 parent 40e0df1 commit 0a27e6e
Showing 12 changed files with 244 additions and 10 deletions.
2 changes: 1 addition & 1 deletion pom.xml
@@ -5,7 +5,7 @@
<groupId>org.mediameter</groupId>
<artifactId>CLIFF</artifactId>
<packaging>war</packaging>
<version>1.2.0</version>
<version>1.3.0</version>

<name>CLIFF</name>
<url>http://cliff.mediameter.org/</url>
17 changes: 16 additions & 1 deletion src/main/java/org/mediameter/cliff/EntityParser.java
@@ -1,6 +1,7 @@
package org.mediameter.cliff;

import java.util.List;
import java.util.Map;

import org.mediameter.cliff.extractor.ExtractedEntities;
import org.mediameter.cliff.extractor.StanfordNamedEntityExtractor;
@@ -54,7 +55,21 @@ public ExtractedEntities extractAndResolve(String inputText, boolean manuallyRep
logger.debug("extractAndResolve: "+extract+" / "+resolve);
return entities;
}


@SuppressWarnings("rawtypes")
public ExtractedEntities extractAndResolveFromSentences(Map[] sentences, boolean manuallyReplaceDemonyms) throws Exception {
logger.trace("input: {}", sentences);
long startTime = System.nanoTime();
ExtractedEntities extractedEntities = extractor.extractEntitiesFromSentences(sentences,manuallyReplaceDemonyms);
long extract = System.nanoTime() - startTime;
logger.trace("extracted: {}", extractedEntities.getLocations());
startTime = System.nanoTime();
ExtractedEntities entities = resolve(extractedEntities);
long resolve = System.nanoTime() - startTime;
logger.debug("extractAndResolve: "+extract+" / "+resolve);
return entities;
}

public ExtractedEntities resolve(ExtractedEntities entities) throws Exception{

// resolve the extracted location names against a
32 changes: 30 additions & 2 deletions src/main/java/org/mediameter/cliff/ParseManager.java
@@ -5,6 +5,7 @@
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.mediameter.cliff.extractor.CliffConfig;
import org.mediameter.cliff.extractor.ExtractedEntities;
@@ -28,6 +29,7 @@
import com.bericotech.clavin.gazetteer.GeoName;
import com.bericotech.clavin.resolver.LocationResolver;
import com.bericotech.clavin.resolver.ResolvedLocation;
import com.google.gson.Gson;

/**
* Singleton-style wrapper around a GeoParser. Call GeoParser.locate(someText) to use this class.
Expand All @@ -39,7 +41,7 @@ public class ParseManager {
* Minor: change in json result format
* Revision: minor change or bug fix
*/
static final String PARSER_VERSION = "1.2.0";
static final String PARSER_VERSION = "1.3.0";

private static final Logger logger = LoggerFactory.getLogger(ParseManager.class);

@@ -96,6 +98,27 @@ public static HashMap parseFromText(String text,boolean manuallyReplaceDemonyms)
return results;
}

@SuppressWarnings({ "unchecked", "rawtypes" })
public static HashMap parseFromSentences(String jsonText, boolean manuallyReplaceDemonyms) {
long startTime = System.currentTimeMillis();
HashMap results = null;
if(jsonText.trim().length()==0){
return getErrorText("No text");
}
try {
Gson gson = new Gson();
Map[] sentences = gson.fromJson(jsonText, Map[].class);
ExtractedEntities entities = extractAndResolveFromSentences(sentences,manuallyReplaceDemonyms);
results = parseFromEntities(entities);
} catch (Exception e) {
results = getErrorText(e.toString());
}
long endTime = System.currentTimeMillis();
long elapsedMillis = endTime - startTime;
results.put("milliseconds", elapsedMillis);
return results;
}

@SuppressWarnings({ "unchecked", "rawtypes" })
public static HashMap parseFromNlpJson(String nlpJsonString){
long startTime = System.currentTimeMillis();
@@ -104,7 +127,7 @@ public static HashMap parseFromNlpJson(String nlpJsonString){
return getErrorText("No text");
}
try {
ExtractedEntities entities = MuckUtils.entitiesFromJsonString(nlpJsonString);
ExtractedEntities entities = MuckUtils.entitiesFromNlpJsonString(nlpJsonString);
entities = getParserInstance().resolve(entities);;
results = parseFromEntities(entities);
} catch (Exception e) {
@@ -259,6 +282,11 @@ public static ExtractedEntities extractAndResolve(String text,boolean manuallyRe
return getParserInstance().extractAndResolve(text,manuallyReplaceDemonyms);
}

@SuppressWarnings("rawtypes")
public static ExtractedEntities extractAndResolveFromSentences(Map[] sentences, boolean manuallyReplaceDemonyms) throws Exception{
return getParserInstance().extractAndResolveFromSentences(sentences, manuallyReplaceDemonyms);
}

/**
* We want all error messages sent to the client to have the same format
* @param msg
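
For context, the new ParseManager.parseFromSentences() entry point takes the MediaCloud sentences array as a raw JSON string and returns the same kind of result map that parseFromText() produces, with a "milliseconds" timing entry added. A minimal usage sketch (illustrative only; the sentence values are made up, and only the "sentence" and "story_sentences_id" keys are read downstream by the extractor):

import java.util.HashMap;

import org.mediameter.cliff.ParseManager;

public class SentencesParseExample {
    public static void main(String[] args) {
        // two fake MediaCloud-style sentence objects
        String sentencesJson = "["
                + "{\"story_sentences_id\":\"1\",\"sentence\":\"The mayor spoke about Rikers Island.\"},"
                + "{\"story_sentences_id\":\"2\",\"sentence\":\"Officials in Boston had no comment.\"}"
                + "]";
        // false = skip the slower manual demonym replacement
        HashMap results = ParseManager.parseFromSentences(sentencesJson, false);
        System.out.println(results);  // parse results plus the "milliseconds" timing entry
    }
}
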
74 changes: 74 additions & 0 deletions src/main/java/org/mediameter/cliff/extractor/StanfordNamedEntityExtractor.java
@@ -3,6 +3,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.mediameter.cliff.places.substitutions.Blacklist;
@@ -156,6 +157,79 @@ public ExtractedEntities extractEntities(String textToParse,boolean manuallyRepl
return entities;
}


/**
* Get extracted locations from an array of pre-parsed sentences (MediaCloud story_sentences maps).
*
* @param sentences Sentence maps to extract from; each needs "sentence" text and a "story_sentences_id".
* @param manuallyReplaceDemonyms Can slow down performance quite a bit
* @return All the entities mentioned
*/
@SuppressWarnings("rawtypes")
public ExtractedEntities extractEntitiesFromSentences(Map[] sentences,boolean manuallyReplaceDemonyms) {
ExtractedEntities entities = new ExtractedEntities();

if (sentences==null || sentences.length==0){
logger.warn("input to extractEntitiesFromSentences was null or empty!");
return entities;
}

if(manuallyReplaceDemonyms){ // this is a noticeable performance hit
logger.debug("Replacing all demonyms by hand");
}

for(Map s:sentences){
String storySentencesId = s.get("story_sentences_id").toString();
String text = s.get("sentence").toString();
if(manuallyReplaceDemonyms){ // this is a noticeable performance hit
text = demonyms.replaceAll(text);
}
// extract entities as <Entity Type, Start Index, Stop Index>
List<Triple<String, Integer, Integer>> extractedEntities =
namedEntityRecognizer.classifyToCharacterOffsets(text);
if (extractedEntities != null) {
for (Triple<String, Integer, Integer> extractedEntity : extractedEntities) {
String entityName = text.substring(extractedEntity.second(), extractedEntity.third());
int position = extractedEntity.second();
switch(extractedEntity.first){
case "PERSON":
if(personToPlaceSubstitutions.contains(entityName)){
entities.addLocation( getLocationOccurrence(personToPlaceSubstitutions.getSubstitution(entityName), position) );
logger.debug("Changed person "+entityName+" to a place");
} else {
PersonOccurrence person = new PersonOccurrence(entityName, position);
entities.addPerson( person );
}
break;
case "LOCATION":
if(!locationBlacklist.contains(entityName)){
LocationOccurrence loc = getLocationOccurrence(entityName, position);
// save the sentence id here
entities.addLocation( new SentenceLocationOccurrence(loc.text, storySentencesId) );
} else {
logger.debug("Ignored blacklisted location "+entityName);
}
break;
case "ORGANIZATION":
OrganizationOccurrence organization = new OrganizationOccurrence(entityName, position);
entities.addOrganization( organization );
break;
case "MISC": // if you're using the slower 4class model
if (demonyms.contains(entityName)) {
logger.debug("Found and adding a MISC demonym "+entityName);
entities.addLocation( getLocationOccurrence(entityName, position) );
}
break;
default:
logger.error("Unknown NER type :"+ extractedEntity.first);
}
}
}
}

return entities;
}

private LocationOccurrence getLocationOccurrence(String entityName, int position){
String fixedName = entityName;
if (demonyms.contains(entityName)) {
67 changes: 67 additions & 0 deletions src/main/java/org/mediameter/cliff/servlet/ParseSentencesServlet.java
@@ -0,0 +1,67 @@
package org.mediameter.cliff.servlet;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;

import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.mediameter.cliff.ParseManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.gson.Gson;

/**
* Wraps the CLAVIN geoparser behind an HTTP endpoint that accepts a MediaCloud sentences JSON array, so we can integrate it into other workflows.
*
* @author rahulb
*/
public class ParseSentencesServlet extends HttpServlet{

private static final Logger logger = LoggerFactory.getLogger(ParseSentencesServlet.class);

private static Gson gson = new Gson();

public ParseSentencesServlet() {
}

@Override
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws IOException{
doGet(request,response);
}

@Override
@SuppressWarnings("rawtypes")
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException{

logger.info("Sentences Parse Request from "+request.getRemoteAddr());
request.setCharacterEncoding(StandardCharsets.UTF_8.name());
response.setContentType("application/json;charset=UTF-8");
response.setCharacterEncoding(StandardCharsets.UTF_8.name());

HashMap results = null;
String text = request.getParameter("q");
String replaceAllDemonymsStr = request.getParameter("replaceAllDemonyms");
boolean manuallyReplaceDemonyms = (replaceAllDemonymsStr==null) ? false : Boolean.parseBoolean(replaceAllDemonymsStr);
logger.debug("q="+text);
logger.debug("replaceAllDemonyms="+manuallyReplaceDemonyms);

if(text==null){
response.sendError(HttpServletResponse.SC_BAD_REQUEST);
} else {
try {
results = ParseManager.parseFromSentences(text,manuallyReplaceDemonyms);
} catch(Exception e){ // try to give the user something useful
logger.error(e.toString());
results = ParseManager.getErrorText(e.toString());
}
String jsonResults = gson.toJson(results);
logger.info(jsonResults);
response.getWriter().write(jsonResults);
}
}

}
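
The servlet reads the sentences JSON array from the q parameter (with an optional replaceAllDemonyms flag) and writes the parse results back as JSON. A minimal client sketch; the /parse/sentences path and parameter names come from the servlet and the web.xml mapping below, while the http://localhost:8080/CLIFF base URL is just an assumed local deployment of the war:

import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Scanner;

public class ParseSentencesClient {
    public static void main(String[] args) throws Exception {
        // assumed local deployment of the CLIFF war
        URL url = new URL("http://localhost:8080/CLIFF/parse/sentences");
        String sentencesJson = "[{\"story_sentences_id\":\"1\","
                + "\"sentence\":\"The mayor spoke about Rikers Island.\"}]";
        String body = "q=" + URLEncoder.encode(sentencesJson, "UTF-8")
                + "&replaceAllDemonyms=false";

        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type",
                "application/x-www-form-urlencoded;charset=UTF-8");
        try (OutputStream out = conn.getOutputStream()) {
            out.write(body.getBytes(StandardCharsets.UTF_8));
        }

        // print the JSON response the servlet writes
        try (Scanner sc = new Scanner(conn.getInputStream(), StandardCharsets.UTF_8.name())) {
            System.out.println(sc.useDelimiter("\\A").next());
        }
    }
}
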
8 changes: 4 additions & 4 deletions src/main/java/org/mediameter/cliff/util/MuckUtils.java
@@ -14,9 +14,9 @@
@SuppressWarnings({ "rawtypes", "unchecked" })
public class MuckUtils {

public static ExtractedEntities entitiesFromJsonString(String nlpJsonString){
public static ExtractedEntities entitiesFromNlpJsonString(String nlpJsonString){
Map sentences = sentencesFromJsonString(nlpJsonString);
return entitiesFromSentenceMap(sentences);
return entitiesFromNlpSentenceMap(sentences);
}

public static Map sentencesFromJsonString(String nlpJsonString) {
@@ -26,9 +26,9 @@ public static Map sentencesFromJsonString(String nlpJsonString) {
}

/**
* I've overloaded "position" in each of the occurrences to be sentenceIndex
*
*/
private static ExtractedEntities entitiesFromSentenceMap(Map mcSentences){
private static ExtractedEntities entitiesFromNlpSentenceMap(Map mcSentences){
ExtractedEntities entities = new ExtractedEntities();
Iterator it = mcSentences.entrySet().iterator();
while (it.hasNext()) {
9 changes: 9 additions & 0 deletions src/main/webapp/WEB-INF/web.xml
@@ -23,6 +23,15 @@
<url-pattern>/parse/json</url-pattern>
</servlet-mapping>

<servlet>
<servlet-name>ParseSentencesServlet</servlet-name>
<servlet-class>org.mediameter.cliff.servlet.ParseSentencesServlet</servlet-class>
</servlet>
<servlet-mapping>
<servlet-name>ParseSentencesServlet</servlet-name>
<url-pattern>/parse/sentences</url-pattern>
</servlet-mapping>

<servlet>
<servlet-name>GeonamesLookupServlet</servlet-name>
<servlet-class>org.mediameter.cliff.servlet.GeonamesLookupServlet</servlet-class>
40 changes: 40 additions & 0 deletions src/test/java/org/mediameter/cliff/test/EntityParserTest.java
@@ -0,0 +1,40 @@
package org.mediameter.cliff.test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.junit.Test;
import org.mediameter.cliff.ParseManager;
import org.mediameter.cliff.extractor.ExtractedEntities;
import org.mediameter.cliff.extractor.SentenceLocationOccurrence;
import org.mediameter.cliff.test.util.TestPlaces;

import com.bericotech.clavin.resolver.ResolvedLocation;
import com.google.gson.Gson;

public class EntityParserTest {

@Test
@SuppressWarnings("rawtypes")
public void extractAndResolveFromSentences() throws Exception {
String fileName = "story-sentences-278413513.json";
File file = new File("src/test/resources/sample-sentence-docs/"+fileName);
String jsonText = FileUtils.readFileToString(file);
Gson gson = new Gson();
Map[] sentences = gson.fromJson(jsonText, Map[].class);
ExtractedEntities entities = ParseManager.extractAndResolveFromSentences(sentences, false);
List<ResolvedLocation> locations = entities.getResolvedLocations();
assertEquals(1, locations.size());
ResolvedLocation loc = locations.get(0);
assertEquals(TestPlaces.RIKERS_ISLAND, loc.geoname.geonameID);
assertTrue(loc.location instanceof SentenceLocationOccurrence);
SentenceLocationOccurrence sentenceLoc = (SentenceLocationOccurrence) loc.location;
assertEquals("3279940188", sentenceLoc.storySentenceId);
}

}
2 changes: 1 addition & 1 deletion src/test/java/org/mediameter/cliff/test/MuckUtilsTest.java
@@ -23,7 +23,7 @@ public void testStory1() throws IOException {
File file = new File("src/test/resources/sample-muck-json/"+fileName);
String json = FileUtils.readFileToString(file);

ExtractedEntities entities = MuckUtils.entitiesFromJsonString(json);
ExtractedEntities entities = MuckUtils.entitiesFromNlpJsonString(json);
assertEquals("Wrong number of location occurrences", 19, entities.getLocations().size());
assertEquals("Wrong number of people occurrences", 15, entities.getPeople().size());
assertEquals("Wrong number of organization occurrences", 4, entities.getOrganizations().size());
1 change: 1 addition & 0 deletions src/test/java/org/mediameter/cliff/test/util/TestPlaces.java
@@ -27,5 +27,6 @@ public class TestPlaces {
public static final int CITY_BANGOR = 4957280;
public static final int COUNTRY_RUSSIA = 2017370;
public static final int STATE_OKLAHOMA = 4544379;
public static final int RIKERS_ISLAND = 5133874;

}
@@ -14,7 +14,6 @@
import org.slf4j.Logger;

import com.bericotech.clavin.gazetteer.CountryCode;
import com.bericotech.clavin.gazetteer.GeoName;
import com.bericotech.clavin.resolver.ResolvedLocation;
import com.bericotech.clavin.util.TextUtils;
import com.google.gson.Gson;
1 change: 1 addition & 0 deletions src/test/resources/sample-sentence-docs/story-sentences-278413513.json
@@ -0,0 +1 @@
[{"sentence":"City officials had no comment on Tuesday.","language":"en","sentence_number":0,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940186"},{"sentence":"On Monday, Mayor Bill de Blasio said at a news conference that the city would be unveiling reforms over time, particularly for the youngest inmates and for those with mental health issues.","language":"en","sentence_number":1,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940187"},{"sentence":"“The status quo at Rikers is unacceptable,” he said.","language":"en","sentence_number":2,"tags":["8878953"],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:14.25158-05","story_sentences_id":"3279940188"},{"sentence":"“As we outlined in the report that we made public some 56 days ago,” Mr. Bharara said, “there’s a problem of accountability and of culture there that rivals all the problems that I’ve been talking about in other areas as well.","language":"en","sentence_number":3,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940189"},{"sentence":"Mr. Bharara noted that when his office first issued its report, he felt there was “a lot of reason for optimism.”","language":"en","sentence_number":4,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940190"},{"sentence":"He said on Tuesday that he had talked with the new correction commissioner, Joseph Ponte, and the corporation counsel’s office, and that Mr. de Blasio had also said “very positive things.”","language":"en","sentence_number":5,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940191"},{"sentence":"Mr. Bharara twice referred to the continuing talks with the city.","language":"en","sentence_number":6,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940192"}]
