From 7b8bf91aab517757f4b62bd3ca22546e105a736b Mon Sep 17 00:00:00 2001 From: Yasser Ganjisaffar Date: Sun, 11 Mar 2012 23:35:26 -0700 Subject: [PATCH] New crawling order: priority, depth, docid Extracting anchor texts --- pom.xml | 2 +- .../frontier/WebURLTupleBinding.java | 2 + .../ics/crawler4j/frontier/WorkQueues.java | 26 +++++++-- .../parser/ExtractedUrlAnchorPair.java | 24 ++++++++ .../crawler4j/parser/HtmlContentHandler.java | 56 ++++++++++++++----- .../edu/uci/ics/crawler4j/parser/Parser.java | 18 +++--- .../edu/uci/ics/crawler4j/url/WebURL.java | 52 ++++++++++++----- .../java/edu/uci/ics/crawler4j/util/Util.java | 9 ++- 8 files changed, 143 insertions(+), 46 deletions(-) create mode 100644 src/main/java/edu/uci/ics/crawler4j/parser/ExtractedUrlAnchorPair.java diff --git a/pom.xml b/pom.xml index ba1adacad..a5c5030cc 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ crawler4j jar crawler4j - 3.3.1-SNAPSHOT + 3.3.2-SNAPSHOT Open Source Web Crawler for Java http://code.google.com/p/crawler4j/ diff --git a/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java b/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java index 80f1249e0..d7e9b34a4 100644 --- a/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java +++ b/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java @@ -36,6 +36,7 @@ public WebURL entryToObject(TupleInput input) { webURL.setParentDocid(input.readInt()); webURL.setParentUrl(input.readString()); webURL.setDepth(input.readShort()); + webURL.setPriority(input.readByte()); return webURL; } @@ -46,5 +47,6 @@ public void objectToEntry(WebURL url, TupleOutput output) { output.writeInt(url.getParentDocid()); output.writeString(url.getParentUrl()); output.writeShort(url.getDepth()); + output.writeByte(url.getPriority()); } } diff --git a/src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java b/src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java index f524f2f4e..255905099 100644 
--- a/src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java +++ b/src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java @@ -134,7 +134,23 @@ public void delete(int count) throws DatabaseException { } public void put(WebURL url) throws DatabaseException { - byte[] keyData = Util.int2ByteArray(url.getDocid()); + + /* + * The key that is used for storing URLs determines the order + * in which they are crawled. Lower key values result in earlier crawling. + * Here our keys are 6 bytes. The first byte comes from the URL priority. + * The second byte comes from the depth of crawl at which this URL is first found. + * The remaining 4 bytes come from the docid of the URL. As a result, + * URLs with lower priority numbers will be crawled earlier. If priority + * numbers are the same, those found at lower depths will be crawled earlier. + * If depth is also equal, those found earlier (therefore, smaller docid) will + * be crawled earlier. + */ + byte[] keyData = new byte[6]; + keyData[0] = url.getPriority(); + keyData[1] = (url.getDepth() > Byte.MAX_VALUE ? 
Byte.MAX_VALUE : (byte) url.getDepth()); + Util.putIntInByteArray(url.getDocid(), keyData, 2); + DatabaseEntry value = new DatabaseEntry(); webURLBinding.objectToEntry(url, value); Transaction txn; @@ -145,10 +161,10 @@ public void put(WebURL url) throws DatabaseException { } urlsDB.put(txn, new DatabaseEntry(keyData), value); if (resumable) { - if (txn != null) { - txn.commit(); - } - } + if (txn != null) { + txn.commit(); + } + } } public long getLength() { diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/ExtractedUrlAnchorPair.java b/src/main/java/edu/uci/ics/crawler4j/parser/ExtractedUrlAnchorPair.java new file mode 100644 index 000000000..12c9a1ff0 --- /dev/null +++ b/src/main/java/edu/uci/ics/crawler4j/parser/ExtractedUrlAnchorPair.java @@ -0,0 +1,24 @@ +package edu.uci.ics.crawler4j.parser; + +public class ExtractedUrlAnchorPair { + + private String href; + private String anchor; + + public String getHref() { + return href; + } + + public void setHref(String href) { + this.href = href; + } + + public String getAnchor() { + return anchor; + } + + public void setAnchor(String anchor) { + this.anchor = anchor; + } + +} diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java b/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java index bd7596285..471c0969a 100644 --- a/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java +++ b/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java @@ -17,20 +17,20 @@ package edu.uci.ics.crawler4j.parser; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - public class HtmlContentHandler extends DefaultHandler { private enum Element { A, AREA, LINK, IFRAME, FRAME, EMBED, IMG, BASE, META, BODY - } + } 
private static class HtmlFactory { private static Map name2Element; @@ -54,12 +54,16 @@ public static Element getElement(String name) { private boolean isWithinBodyElement; private StringBuilder bodyText; - private Set outgoingUrls; + private List outgoingUrls; + + private ExtractedUrlAnchorPair curUrl = null; + private boolean anchorFlag = false; + private StringBuilder anchorText = new StringBuilder(); public HtmlContentHandler() { isWithinBodyElement = false; bodyText = new StringBuilder(); - outgoingUrls = new HashSet(); + outgoingUrls = new ArrayList(); } @Override @@ -69,7 +73,10 @@ public void startElement(String uri, String localName, String qName, Attributes if (element == Element.A || element == Element.AREA || element == Element.LINK) { String href = attributes.getValue("href"); if (href != null) { - outgoingUrls.add(href); + anchorFlag = true; + curUrl = new ExtractedUrlAnchorPair(); + curUrl.setHref(href); + outgoingUrls.add(curUrl); } return; } @@ -77,7 +84,9 @@ public void startElement(String uri, String localName, String qName, Attributes if (element == Element.IMG) { String imgSrc = attributes.getValue("src"); if (imgSrc != null) { - outgoingUrls.add(imgSrc); + curUrl = new ExtractedUrlAnchorPair(); + curUrl.setHref(imgSrc); + outgoingUrls.add(curUrl); } return; } @@ -85,7 +94,9 @@ public void startElement(String uri, String localName, String qName, Attributes if (element == Element.IFRAME || element == Element.FRAME || element == Element.EMBED) { String src = attributes.getValue("src"); if (src != null) { - outgoingUrls.add(src); + curUrl = new ExtractedUrlAnchorPair(); + curUrl.setHref(src); + outgoingUrls.add(curUrl); } return; } @@ -125,12 +136,23 @@ public void startElement(String uri, String localName, String qName, Attributes if (element == Element.BODY) { isWithinBodyElement = true; - } + } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { Element element = 
HtmlFactory.getElement(localName); + if (element == Element.A || element == Element.AREA || element == Element.LINK) { + anchorFlag = false; + if (curUrl != null) { + String anchor = anchorText.toString().trim(); + if (!anchor.isEmpty()) { + curUrl.setAnchor(anchor); + } + anchorText.delete(0, anchorText.length()); + } + curUrl = null; + } if (element == Element.BODY) { isWithinBodyElement = false; } @@ -140,17 +162,21 @@ public void endElement(String uri, String localName, String qName) throws SAXExc public void characters(char ch[], int start, int length) throws SAXException { if (isWithinBodyElement) { bodyText.append(ch, start, length); + + if (anchorFlag) { + anchorText.append(new String(ch, start, length).replaceAll("\n", "").replaceAll("\t", "").trim()); + } } } public String getBodyText() { return bodyText.toString(); } - - public Set getOutgoingUrls() { + + public List getOutgoingUrls() { return outgoingUrls; } - + public String getBaseUrl() { return base; } diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java index c010f5bd7..bd44ff740 100644 --- a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java +++ b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java @@ -22,9 +22,7 @@ import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.util.ArrayList; -import java.util.HashSet; import java.util.List; -import java.util.Set; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; @@ -98,7 +96,7 @@ public boolean parse(Page page, String contextURL) { parseData.setText(contentHandler.getBodyText().trim()); parseData.setTitle(metadata.get(Metadata.TITLE)); - Set urls = new HashSet(); + List outgoingUrls = new ArrayList(); String baseURL = contentHandler.getBaseUrl(); if (baseURL != null) { @@ -106,7 +104,8 @@ public boolean parse(Page page, String contextURL) { } int urlCount = 0; - for (String href : 
contentHandler.getOutgoingUrls()) { + for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) { + String href = urlAnchorPair.getHref(); href = href.trim(); if (href.length() == 0) { continue; @@ -118,7 +117,10 @@ public boolean parse(Page page, String contextURL) { if (!hrefWithoutProtocol.contains("javascript:") && !hrefWithoutProtocol.contains("@")) { String url = URLCanonicalizer.getCanonicalURL(href, contextURL); if (url != null) { - urls.add(url); + WebURL webURL = new WebURL(); + webURL.setURL(url); + webURL.setAnchor(urlAnchorPair.getAnchor()); + outgoingUrls.add(webURL); urlCount++; if (urlCount > config.getMaxOutgoingLinksToFollow()) { break; @@ -127,12 +129,6 @@ public boolean parse(Page page, String contextURL) { } } - List outgoingUrls = new ArrayList(); - for (String url : urls) { - WebURL webURL = new WebURL(); - webURL.setURL(url); - outgoingUrls.add(webURL); - } parseData.setOutgoingUrls(outgoingUrls); try { diff --git a/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java b/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java index 1d0bdc8ea..5a093ae16 100644 --- a/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java +++ b/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java @@ -41,6 +41,8 @@ public class WebURL implements Serializable { private String domain; private String subDomain; private String path; + private String anchor; + private byte priority; /** * Returns the unique document id assigned to this Url. @@ -109,9 +111,8 @@ public void setURL(String url) { } /** - * Returns the unique document id of the parent page. - * The parent page is the page in which the Url of this - * page is first observed. + * Returns the unique document id of the parent page. The parent page is the + * page in which the Url of this page is first observed. */ public int getParentDocid() { return parentDocid; @@ -122,9 +123,8 @@ public void setParentDocid(int parentDocid) { } /** - * Returns the url of the parent page. 
- * The parent page is the page in which the Url of this - * page is first observed. + * Returns the url of the parent page. The parent page is the page in which + * the Url of this page is first observed. */ public String getParentUrl() { return parentUrl; @@ -135,9 +135,9 @@ public void setParentUrl(String parentUrl) { } /** - * Returns the crawl depth at which this Url is first observed. - * Seed Urls are at depth 0. Urls that are extracted from seed Urls - * are at depth 1, etc. + * Returns the crawl depth at which this Url is first observed. Seed Urls + * are at depth 0. Urls that are extracted from seed Urls are at depth 1, + * etc. */ public short getDepth() { return depth; @@ -148,8 +148,8 @@ public void setDepth(short depth) { } /** - * Returns the domain of this Url. - * For 'http://www.example.com/sample.htm', domain will be 'example.com' + * Returns the domain of this Url. For 'http://www.example.com/sample.htm', + * domain will be 'example.com' */ public String getDomain() { return domain; @@ -160,8 +160,8 @@ public String getSubDomain() { } /** - * Returns the path of this Url. - * For 'http://www.example.com/sample.htm', domain will be 'sample.htm' + * Returns the path of this Url. For 'http://www.example.com/sample.htm', + * path will be 'sample.htm' */ public String getPath() { return path; @@ -170,4 +170,30 @@ public String getPath() { public void setPath(String path) { this.path = path; } + + /** + * Returns the anchor string. For example, in <a href="example.com">A sample anchor</a> + * the anchor string is 'A sample anchor' + */ + public String getAnchor() { + return anchor; + } + + public void setAnchor(String anchor) { + this.anchor = anchor; + } + + /** + * Returns the priority for crawling this URL. + * A lower number results in higher priority. 
+ */ + public byte getPriority() { + return priority; + } + + public void setPriority(byte priority) { + this.priority = priority; + } + + } diff --git a/src/main/java/edu/uci/ics/crawler4j/util/Util.java b/src/main/java/edu/uci/ics/crawler4j/util/Util.java index 4bd061429..54077a6b2 100644 --- a/src/main/java/edu/uci/ics/crawler4j/util/Util.java +++ b/src/main/java/edu/uci/ics/crawler4j/util/Util.java @@ -35,12 +35,19 @@ public static byte[] long2ByteArray(long l) { public static byte[] int2ByteArray(int value) { byte[] b = new byte[4]; for (int i = 0; i < 4; i++) { - int offset = (b.length - 1 - i) * 8; + int offset = (3 - i) * 8; b[i] = (byte) ((value >>> offset) & 0xFF); } return b; } + public static void putIntInByteArray(int value, byte[] buf, int offset) { + for (int i = 0; i < 4; i++) { + int valueOffset = (3 - i) * 8; + buf[offset + i] = (byte) ((value >>> valueOffset) & 0xFF); + } + } + public static int byteArray2Int(byte[] b) { int value = 0; for (int i = 0; i < 4; i++) {