Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Bold Surface Forms #17

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 29 additions & 7 deletions examples/macros/nerd_commons.pig
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ DEFINE read(WIKIPEDIA_DUMP, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS ids, articles
-- Parse the wikipedia dump and extract text and links data
parsed = LOAD '$WIKIPEDIA_DUMP'
USING pignlproc.storage.ParsingWikipediaLoader('$LANG')
AS (title, id, pageUrl, text, redirect, links, headers, paragraphs);
AS (title, id, pageUrl, text, redirect, links, headers, paragraphs, boldforms);

-- Normalize pageUrls to DBpedia URIs
parsed = FOREACH parsed GENERATE
Expand All @@ -18,7 +18,8 @@ DEFINE read(WIKIPEDIA_DUMP, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS ids, articles
dbpediaEncode(redirect) AS redirect,
links,
headers,
paragraphs;
paragraphs,
boldforms;

-- Separate redirects from non-redirects
SPLIT parsed INTO
Expand All @@ -36,7 +37,8 @@ DEFINE read(WIKIPEDIA_DUMP, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS ids, articles
pageUrl,
text,
links,
paragraphs;
paragraphs,
boldforms;

-- Build transitive closure of redirects
redirects = redirectTransClo(parsedRedirects);
Expand All @@ -46,12 +48,12 @@ DEFINE read(WIKIPEDIA_DUMP, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS ids, articles
redirectTarget AS uri;

-- Get Links
pageLinksNonEmptySf = getLinks(articles, $LANG, $MIN_SURFACE_FORM_LENGTH);
pageLinksAndBoldsNonEmptySf = getLinksAndBolds(articles, $LANG, $MIN_SURFACE_FORM_LENGTH);

-- Resolve redirects
pageLinksRedirectsJoin = JOIN
redirects BY redirectSource RIGHT,
pageLinksNonEmptySf BY uri;
pageLinksAndBoldsNonEmptySf BY uri;
resolvedLinks = FOREACH pageLinksRedirectsJoin GENERATE
surfaceForm,
FLATTEN(resolve(uri, redirectTarget)) AS uri,
Expand All @@ -63,7 +65,7 @@ DEFINE read(WIKIPEDIA_DUMP, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS ids, articles
distinctLinks;
};

DEFINE getLinks(articles, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS pageLinksNonEmptySf {
DEFINE getLinksAndBolds(articles, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS pageLinksAndBoldsNonEmptySf {
-- get link pairs

DEFINE dbpediaEncode pignlproc.evaluation.DBpediaUriEncode('$LANG');
Expand All @@ -81,8 +83,28 @@ DEFINE getLinks(articles, LANG, MIN_SURFACE_FORM_LENGTH) RETURNS pageLinksNonEmp
pageUrl AS pageUrl;

-- Filter out surfaceForms shorter than MIN_SURFACE_FORM_LENGTH characters
$pageLinksNonEmptySf = FILTER pageLinks
pageLinksNonEmptySf = FILTER pageLinks
BY SIZE(surfaceForm) >= $MIN_SURFACE_FORM_LENGTH;

-- get bold pairs

-- Extract sentence contexts of the initial bold surface forms respecting the paragraph boundaries
sentences_ = FOREACH $articles GENERATE
pageUrl,
FLATTEN(pignlproc.evaluation.SentencesWithLink(text, boldforms, paragraphs))
AS (sentenceIdx, sentence, targetUri, startPos, endPos);

-- Project to the three fields of interest: surfaceForm, uri and pageUrl
pageInitBoldforms = FOREACH sentences_ GENERATE
TRIM(SUBSTRING(sentence, startPos, endPos)) AS surfaceForm,
dbpediaEncode(targetUri) AS uri,
pageUrl AS pageUrl;

-- Filter out surfaceForms shorter than MIN_SURFACE_FORM_LENGTH characters
pageInitBoldformsNonEmptySf = FILTER pageInitBoldforms
BY SIZE(surfaceForm) >= $MIN_SURFACE_FORM_LENGTH;

$pageLinksAndBoldsNonEmptySf = UNION pageLinksNonEmptySf, pageInitBoldformsNonEmptySf;
};

DEFINE redirectTransClo(parsedRedirects) RETURNS redirects {
Expand Down
21 changes: 21 additions & 0 deletions src/main/java/pignlproc/markup/AnnotatingMarkupParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ public class AnnotatingMarkupParser implements ITextConverter {

protected final List<Annotation> paragraphs = new ArrayList<Annotation>();

protected final List<Annotation> boldforms = new ArrayList<Annotation>();

protected String languageCode;

protected final WikiModel model;
Expand Down Expand Up @@ -385,6 +387,13 @@ public void nodesToText(List<? extends Object> nodes, Appendable buffer,
tagName));
countingBuffer.append("\n\n");
}
if ("b".equals(tagName)) {
if (tagBegin < 500) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a question: is this so that you only get the bold forms that occur in the first 500 characters of the page?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @tgalery,
Actually, I want to extract bold forms only from the definition/introduction part of the page, not from the whole page. If I include bold forms from the whole page, it may introduce some false surface forms. The following are some examples:

  1. http://en.wikipedia.org/wiki/Sidewalk: "roadway"
  2. http://en.wikipedia.org/wiki/Analysis_of_variance: "most used" and "most useful"
  3. http://en.wikipedia.org/wiki/Phrase_(music): "antecedent phrase", "consequent phrase", "phrase-group" and "Phrase rhythm"
  4. http://en.wikipedia.org/wiki/Afroasiatic_languages: Several bold forms in table
    We can also get some valid surface forms if we extract all bold forms from the page, but we cannot be sure of their validity in all cases, as we have seen above.
  5. http://en.wikipedia.org/wiki/Radio_Warwick: "URW312", "University Radio Warwick"

boldforms.add(new Annotation(tagBegin,
countingBuffer.currentPosition, "boldform",
tagName));
}
}
}
}
} finally {
Expand Down Expand Up @@ -413,6 +422,10 @@ public List<Annotation> getHeaderAnnotations() {
/**
 * Returns the parser's paragraph annotations.
 *
 * <p>This is the parser's own {@code paragraphs} list, not a copy, so
 * changes made through the returned reference affect the parser's state.
 *
 * @return the live list of paragraph annotations
 */
public List<Annotation> getParagraphAnnotations() {
return paragraphs;
}

/**
 * Returns the parser's bold-form annotations.
 *
 * <p>This is the parser's own {@code boldforms} list, not a copy, so
 * changes made through the returned reference affect the parser's state.
 *
 * @return the live list of bold-form annotations
 */
public List<Annotation> getBoldformAnnotations() {
return boldforms;
}

public List<String> getParagraphs() {
List<String> texts = new ArrayList<String>();
Expand All @@ -429,6 +442,14 @@ public List<String> getHeaders() {
}
return texts;
}

/**
 * Returns the text covered by each bold-form annotation.
 *
 * <p>Every entry is the substring of {@code text} between the
 * annotation's {@code begin} and {@code end} offsets. Entries appear in
 * the same order as the annotations in {@code boldforms}. A new list is
 * built on each call.
 *
 * @return a newly created list with one string per bold-form annotation
 */
public List<String> getBoldforms() {
    final List<String> surfaceForms = new ArrayList<String>(boldforms.size());
    for (final Annotation boldform : boldforms) {
        final String covered = text.substring(boldform.begin, boldform.end);
        surfaceForms.add(covered);
    }
    return surfaceForms;
}

public String getRedirect() {
return redirect;
Expand Down
16 changes: 15 additions & 1 deletion src/main/java/pignlproc/storage/ParsingWikipediaLoader.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,13 @@ public Tuple getNext() throws IOException {
paragraphs.add(tupleFactory.newTupleNoCopy(Arrays.asList(
p.value, p.begin, p.end)));
}
DataBag boldforms = bagFactory.newDefaultBag();
for (Annotation b : converter.getBoldformAnnotations()) {
boldforms.add(tupleFactory.newTupleNoCopy(Arrays.asList(
b.value, b.begin, b.end)));
}
return tupleFactory.newTupleNoCopy(Arrays.asList(title, id, uri, text,
redirect, links, headers, paragraphs));
redirect, links, headers, paragraphs, boldforms));
} catch (InterruptedException e) {
throw new IOException(e);
}
Expand Down Expand Up @@ -106,6 +111,15 @@ public ResourceSchema getSchema(String location, Job job)
Schema paragraphInfoWrapper = new Schema(new FieldSchema("t", paragraphInfoSchema));
paragraphInfoWrapper.setTwoLevelAccessRequired(true);
schema.add(new FieldSchema("paragraphs", paragraphInfoWrapper, DataType.BAG));

Schema boldformInfoSchema = new Schema();
boldformInfoSchema.add(new FieldSchema("tagname", DataType.CHARARRAY));
boldformInfoSchema.add(new FieldSchema("begin", DataType.INTEGER));
boldformInfoSchema.add(new FieldSchema("end", DataType.INTEGER));

Schema boldformInfoWrapper = new Schema(new FieldSchema("t", boldformInfoSchema));
boldformInfoWrapper.setTwoLevelAccessRequired(true);
schema.add(new FieldSchema("boldforms", boldformInfoWrapper, DataType.BAG));

return new ResourceSchema(schema);
}
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/pignlproc/storage/TestWikipediaLoader.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public void testParsingWikipediaLoader() throws Exception {
String query = "A = LOAD 'file:" + filename
+ "' USING pignlproc.storage.ParsingWikipediaLoader('en')"
+ " as (title: chararray, id: chararray, uri: chararray, text: chararray,"
+ " redirect: chararray, links, headers, paragraphs);";
+ " redirect: chararray, links, headers, paragraphs, boldforms);";
pig.registerQuery(query);
Iterator<Tuple> it = pig.openIterator("A");
int tupleCount = 0;
Expand Down