This repository has been archived by the owner on May 30, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
changes to support parsing from mediacloud sentences array
- Loading branch information
Showing
12 changed files
with
244 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
67 changes: 67 additions & 0 deletions
67
src/main/java/org/mediameter/cliff/servlet/ParseSentencesServlet.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
package org.mediameter.cliff.servlet; | ||
|
||
import java.io.IOException; | ||
import java.nio.charset.StandardCharsets; | ||
import java.util.HashMap; | ||
|
||
import javax.servlet.http.HttpServlet; | ||
import javax.servlet.http.HttpServletRequest; | ||
import javax.servlet.http.HttpServletResponse; | ||
|
||
import org.mediameter.cliff.ParseManager; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import com.google.gson.Gson; | ||
|
||
/** | ||
* Wraps the CLAVIN geoparser behind some ports so we can integrate it into other workflows. | ||
* | ||
* @author rahulb | ||
*/ | ||
public class ParseSentencesServlet extends HttpServlet{ | ||
|
||
private static final Logger logger = LoggerFactory.getLogger(ParseSentencesServlet.class); | ||
|
||
private static Gson gson = new Gson(); | ||
|
||
public ParseSentencesServlet() { | ||
} | ||
|
||
@Override | ||
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws IOException{ | ||
doGet(request,response); | ||
} | ||
|
||
@Override | ||
@SuppressWarnings("rawtypes") | ||
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException{ | ||
|
||
logger.info("Sentences Parse Request from "+request.getRemoteAddr()); | ||
request.setCharacterEncoding(StandardCharsets.UTF_8.name()); | ||
response.setContentType("application/json;charset=UTF=8"); | ||
response.setCharacterEncoding(StandardCharsets.UTF_8.name()); | ||
|
||
HashMap results = null; | ||
String text = request.getParameter("q"); | ||
String replaceAllDemonymsStr = request.getParameter("replaceAllDemonyms"); | ||
boolean manuallyReplaceDemonyms = (replaceAllDemonymsStr==null) ? false : Boolean.parseBoolean(replaceAllDemonymsStr); | ||
logger.debug("q="+text); | ||
logger.debug("replaceAllDemonyms="+manuallyReplaceDemonyms); | ||
|
||
if(text==null){ | ||
response.sendError(HttpServletResponse.SC_BAD_REQUEST); | ||
} else { | ||
try { | ||
results = ParseManager.parseFromSentences(text,manuallyReplaceDemonyms); | ||
} catch(Exception e){ // try to give the user something useful | ||
logger.error(e.toString()); | ||
results = ParseManager.getErrorText(e.toString()); | ||
} | ||
String jsonResults = gson.toJson(results); | ||
logger.info(jsonResults); | ||
response.getWriter().write(jsonResults); | ||
} | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
40 changes: 40 additions & 0 deletions
40
src/test/java/org/mediameter/cliff/test/EntityParserTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
package org.mediameter.cliff.test; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
import static org.junit.Assert.assertTrue; | ||
|
||
import java.io.File; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
import org.apache.commons.io.FileUtils; | ||
import org.junit.Test; | ||
import org.mediameter.cliff.ParseManager; | ||
import org.mediameter.cliff.extractor.ExtractedEntities; | ||
import org.mediameter.cliff.extractor.SentenceLocationOccurrence; | ||
import org.mediameter.cliff.test.util.TestPlaces; | ||
|
||
import com.bericotech.clavin.resolver.ResolvedLocation; | ||
import com.google.gson.Gson; | ||
|
||
public class EntityParserTest { | ||
|
||
@Test | ||
@SuppressWarnings("rawtypes") | ||
public void extractAndResolveFromSentences() throws Exception { | ||
String fileName = "story-sentences-278413513.json"; | ||
File file = new File("src/test/resources/sample-sentence-docs/"+fileName); | ||
String jsonText = FileUtils.readFileToString(file); | ||
Gson gson = new Gson(); | ||
Map[] sentences = gson.fromJson(jsonText, Map[].class); | ||
ExtractedEntities entities = ParseManager.extractAndResolveFromSentences(sentences, false); | ||
List<ResolvedLocation> locations = entities.getResolvedLocations(); | ||
assertEquals(locations.size(),1); | ||
ResolvedLocation loc = locations.get(0); | ||
assertEquals(loc.geoname.geonameID,TestPlaces.RIKERS_ISLAND); | ||
assertTrue(loc.location instanceof SentenceLocationOccurrence); | ||
SentenceLocationOccurrence sentenceLoc = (SentenceLocationOccurrence) loc.location; | ||
assertEquals(sentenceLoc.storySentenceId,"3279940188"); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1 change: 1 addition & 0 deletions
1
src/test/resources/sample-sentence-docs/story-sentences-278413513.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[{"sentence":"City officials had no comment on Tuesday.","language":"en","sentence_number":0,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940186"},{"sentence":"On Monday, Mayor Bill de Blasio said at a news conference that the city would be unveiling reforms over time, particularly for the youngest inmates and for those with mental health issues.","language":"en","sentence_number":1,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940187"},{"sentence":"“The status quo at Rikers is unacceptable,” he said.","language":"en","sentence_number":2,"tags":["8878953"],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:14.25158-05","story_sentences_id":"3279940188"},{"sentence":"“As we outlined in the report that we made public some 56 days ago,” Mr. Bharara said, “there’s a problem of accountability and of culture there that rivals all the problems that I’ve been talking about in other areas as well.","language":"en","sentence_number":3,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940189"},{"sentence":"Mr. Bharara noted that when his office first issued its report, he felt there was “a lot of reason for optimism.”","language":"en","sentence_number":4,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940190"},{"sentence":"He said on Tuesday that he had talked with the new correction commissioner, Joseph Ponte, and the corporation counsel’s office, and that Mr. de Blasio had also said “very positive things.”","language":"en","sentence_number":5,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940191"},{"sentence":"Mr. Bharara twice referred to the continuing talks with the city.","language":"en","sentence_number":6,"tags":[],"media_id":1,"publish_date":"2014-10-01 04:00:00","stories_id":"278413513","db_row_last_updated":"2014-12-22 19:08:13.754986-05","story_sentences_id":"3279940192"}] |