dbpedia-spotlight · abhishekg2389 · May 11, 2015 · May 11, 2015 · May 11, 2015 · May 11, 2015
diff --git a/examples/macros/nerd_commons.pig b/examples/macros/nerd_commons.pig
@@ -7,7 +7,7 @@ DEFINE read(WIKIPEDIA_DUMP, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS ids, articles
     -- Parse the wikipedia dump and extract text and links data
     parsed = LOAD '$WIKIPEDIA_DUMP'
       USING pignlproc.storage.ParsingWikipediaLoader('$LANG')
-      AS (title, id, pageUrl, text, redirect, links, headers, paragraphs);
+      AS (title, id, pageUrl, text, redirect, links, headers, paragraphs, boldforms);
 
     -- Normalize pageUrls to DBpedia URIs
     parsed = FOREACH parsed GENERATE
@@ -18,7 +18,8 @@ DEFINE read(WIKIPEDIA_DUMP, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS ids, articles
       dbpediaEncode(redirect) AS redirect,
       links,
       headers,
-      paragraphs;
+      paragraphs,
+      boldforms;
 
     -- Separate redirects from non-redirects
     SPLIT parsed INTO
@@ -36,7 +37,8 @@ DEFINE read(WIKIPEDIA_DUMP, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS ids, articles
       pageUrl,
       text,
       links,
-      paragraphs;
+      paragraphs,
+      boldforms;
 
     -- Build transitive closure of redirects
     redirects = redirectTransClo(parsedRedirects);
@@ -46,12 +48,12 @@ DEFINE read(WIKIPEDIA_DUMP, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS ids, articles
       redirectTarget AS uri;
 
     -- Get Links
-    pageLinksNonEmptySf = getLinks(articles, $LANG, $MIN_SURFACE_FORM_LENGTH);
+    pageLinksAndBoldsNonEmptySf = getLinksAndBolds(articles, $LANG, $MIN_SURFACE_FORM_LENGTH);
 
     -- Resolve redirects
     pageLinksRedirectsJoin = JOIN
       redirects BY redirectSource RIGHT,
-      pageLinksNonEmptySf BY uri;
+      pageLinksAndBoldsNonEmptySf BY uri;
     resolvedLinks = FOREACH pageLinksRedirectsJoin GENERATE
       surfaceForm,
       FLATTEN(resolve(uri, redirectTarget)) AS uri,
@@ -63,7 +65,7 @@ DEFINE read(WIKIPEDIA_DUMP, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS ids, articles
       distinctLinks;
 };
 
-DEFINE getLinks(articles, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS pageLinksNonEmptySf {
+DEFINE getLinksAndBolds(articles, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS pageLinksAndBoldsNonEmptySf {
     -- get link pairs
 
     DEFINE dbpediaEncode pignlproc.evaluation.DBpediaUriEncode('$LANG');
@@ -81,8 +83,28 @@ DEFINE getLinks(articles, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS pageLinksNonEmp
       pageUrl AS pageUrl;
 
     -- Filter out surfaceForms that have zero or one character
-    $pageLinksNonEmptySf = FILTER pageLinks
+    pageLinksNonEmptySf = FILTER pageLinks
       BY SIZE(surfaceForm) >= $MIN_SURFACE_FORM_LENGTH;
+
+    -- get bold pairs
+
+    -- Extract sentence contexts of the initial bold surface forms respecting the paragraph boundaries
+    sentences_ = FOREACH $articles GENERATE
+      pageUrl,
+      FLATTEN(pignlproc.evaluation.SentencesWithLink(text, boldforms, paragraphs))
+      AS (sentenceIdx, sentence, targetUri, startPos, endPos);
+
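+    -- Project to three fields: surfaceForm, uri, pageUrl. NOTE(review): targetUri derives from the boldforms tuples, whose first field the loader schema names "tagname"; confirm uri should not be pageUrl instead.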
+    pageInitBoldforms = FOREACH sentences_ GENERATE
+      TRIM(SUBSTRING(sentence, startPos, endPos)) AS surfaceForm,
+      dbpediaEncode(targetUri) AS uri,
+      pageUrl AS pageUrl;
+
+    -- Filter out surfaceForms shorter than MIN_SURFACE_FORM_LENGTH
+    pageInitBoldformsNonEmptySf = FILTER pageInitBoldforms
+      BY SIZE(surfaceForm) >= $MIN_SURFACE_FORM_LENGTH;
+
+    $pageLinksAndBoldsNonEmptySf = UNION pageLinksNonEmptySf, pageInitBoldformsNonEmptySf;
 };
 
 DEFINE redirectTransClo(parsedRedirects) RETURNS redirects {

diff --git a/src/main/java/pignlproc/markup/AnnotatingMarkupParser.java b/src/main/java/pignlproc/markup/AnnotatingMarkupParser.java
@@ -62,6 +62,8 @@ public class AnnotatingMarkupParser implements ITextConverter {
 
     protected final List<Annotation> paragraphs = new ArrayList<Annotation>();
 
+    protected final List<Annotation> boldforms = new ArrayList<Annotation>();
+
     protected String languageCode;
 
     protected final WikiModel model;
@@ -385,6 +387,13 @@ public void nodesToText(List<? extends Object> nodes, Appendable buffer,
                                     tagName));
                             countingBuffer.append("\n\n");
                         }
+                        if ("b".equals(tagName)) {
+                            if (tagBegin < 500) {
+                                boldforms.add(new Annotation(tagBegin,
+                                        countingBuffer.currentPosition, "boldform",
+                                        tagName));
+                            }
+                        }
                     }
                 }
             } finally {
@@ -413,6 +422,10 @@ public List<Annotation> getHeaderAnnotations() {
     public List<Annotation> getParagraphAnnotations() {
         return paragraphs;
     }
+
+    public List<Annotation> getBoldformAnnotations() { // accessor for the bold-tag annotations collected in the boldforms field
+        return boldforms; // hands back the field's own list instance, not a copy
+    }
 
     public List<String> getParagraphs() {
         List<String> texts = new ArrayList<String>();
@@ -429,6 +442,14 @@ public List<String> getHeaders() {
         }
         return texts;
     }
+
+    public List<String> getBoldforms() { // returns the text covered by each recorded boldform annotation, in list order
+        List<String> texts = new ArrayList<String>();
+        for (Annotation b : boldforms) {
+            texts.add(text.substring(b.begin, b.end)); // begin/end are used as offsets into the text field
+        }
+        return texts; // a fresh list built on every call
+    }
 
     public String getRedirect() {
         return redirect;

diff --git a/src/main/java/pignlproc/storage/ParsingWikipediaLoader.java b/src/main/java/pignlproc/storage/ParsingWikipediaLoader.java
@@ -61,8 +61,13 @@ public Tuple getNext() throws IOException {
                 paragraphs.add(tupleFactory.newTupleNoCopy(Arrays.asList(
                         p.value, p.begin, p.end)));
             }
+            DataBag boldforms = bagFactory.newDefaultBag();
+            for (Annotation b : converter.getBoldformAnnotations()) {
+                boldforms.add(tupleFactory.newTupleNoCopy(Arrays.asList(
+                        b.value, b.begin, b.end)));
+            }
             return tupleFactory.newTupleNoCopy(Arrays.asList(title, id, uri, text,
-                    redirect, links, headers, paragraphs));
+                    redirect, links, headers, paragraphs, boldforms));
         } catch (InterruptedException e) {
             throw new IOException(e);
         }
@@ -106,6 +111,15 @@ public ResourceSchema getSchema(String location, Job job)
         Schema paragraphInfoWrapper = new Schema(new FieldSchema("t", paragraphInfoSchema));
         paragraphInfoWrapper.setTwoLevelAccessRequired(true);
         schema.add(new FieldSchema("paragraphs", paragraphInfoWrapper, DataType.BAG));
+
+        Schema boldformInfoSchema = new Schema();
+        boldformInfoSchema.add(new FieldSchema("tagname", DataType.CHARARRAY));
+        boldformInfoSchema.add(new FieldSchema("begin", DataType.INTEGER));
+        boldformInfoSchema.add(new FieldSchema("end", DataType.INTEGER));
+
+        Schema boldformInfoWrapper = new Schema(new FieldSchema("t", boldformInfoSchema));
+        boldformInfoWrapper.setTwoLevelAccessRequired(true);
+        schema.add(new FieldSchema("boldforms", boldformInfoWrapper, DataType.BAG));
 
         return new ResourceSchema(schema);
     }

diff --git a/src/test/java/pignlproc/storage/TestWikipediaLoader.java b/src/test/java/pignlproc/storage/TestWikipediaLoader.java
@@ -48,7 +48,7 @@ public void testParsingWikipediaLoader() throws Exception {
         String query = "A = LOAD 'file:" + filename
                 + "' USING pignlproc.storage.ParsingWikipediaLoader('en')"
                 + " as (title: chararray, id: chararray, uri: chararray, text: chararray,"
-                + " redirect: chararray, links, headers, paragraphs);";
+                + " redirect: chararray, links, headers, paragraphs, boldforms);";
         pig.registerQuery(query);
         Iterator<Tuple> it = pig.openIterator("A");
         int tupleCount = 0;