New crawling order: priority, depth, docid
Extracting anchor texts
yasserg committed Mar 12, 2012
1 parent 1c23e32 commit 7b8bf91
Showing 8 changed files with 143 additions and 46 deletions.
2 changes: 1 addition & 1 deletion pom.xml
@@ -4,7 +4,7 @@
<artifactId>crawler4j</artifactId>
<packaging>jar</packaging>
<name>crawler4j</name>
<version>3.3.1-SNAPSHOT</version>
<version>3.3.2-SNAPSHOT</version>
<description>Open Source Web Crawler for Java</description>
<url>http://code.google.com/p/crawler4j/</url>
<licenses>
2 changes: 2 additions & 0 deletions src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java
@@ -36,6 +36,7 @@ public WebURL entryToObject(TupleInput input) {
webURL.setParentDocid(input.readInt());
webURL.setParentUrl(input.readString());
webURL.setDepth(input.readShort());
webURL.setPriority(input.readByte());
return webURL;
}

@@ -46,5 +47,6 @@ public void objectToEntry(WebURL url, TupleOutput output) {
output.writeInt(url.getParentDocid());
output.writeString(url.getParentUrl());
output.writeShort(url.getDepth());
output.writeByte(url.getPriority());
}
}
26 changes: 21 additions & 5 deletions src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java
@@ -134,7 +134,23 @@ public void delete(int count) throws DatabaseException {
}

public void put(WebURL url) throws DatabaseException {
byte[] keyData = Util.int2ByteArray(url.getDocid());

/*
* The key that is used for storing URLs determines the order
* in which they are crawled. Lower key values result in earlier crawling.
* Here our keys are 6 bytes. The first byte comes from the URL priority.
* The second byte comes from the depth of crawl at which this URL is first found.
* The remaining 4 bytes come from the docid of the URL. As a result,
* URLs with lower priority numbers will be crawled earlier. If priority
* numbers are the same, those found at lower depths will be crawled earlier.
* If depth is also equal, those found earlier (and therefore having a
* smaller docid) will be crawled earlier.
*/
byte[] keyData = new byte[6];
keyData[0] = url.getPriority();
keyData[1] = (url.getDepth() > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte) url.getDepth());
Util.putIntInByteArray(url.getDocid(), keyData, 2);

DatabaseEntry value = new DatabaseEntry();
webURLBinding.objectToEntry(url, value);
Transaction txn;
@@ -145,10 +161,10 @@ public void put(WebURL url) throws DatabaseException {
}
urlsDB.put(txn, new DatabaseEntry(keyData), value);
if (resumable) {
if (txn != null) {
txn.commit();
}
}
if (txn != null) {
txn.commit();
}
}
}

public long getLength() {
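The effect of this key layout can be checked in isolation. Below is a minimal sketch, not part of the commit (the class and helper names are mine), that rebuilds the 6-byte key and compares keys the way Berkeley DB JE does by default, as unsigned byte arrays. Note the documented order holds only for non-negative priority values, since a negative byte compares as a large unsigned value.

public class KeyOrderDemo {

    static byte[] key(byte priority, short depth, int docid) {
        byte[] keyData = new byte[6];
        keyData[0] = priority;
        keyData[1] = (depth > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte) depth);
        // Big-endian docid, same as Util.putIntInByteArray(docid, keyData, 2)
        for (int i = 0; i < 4; i++) {
            keyData[2 + i] = (byte) ((docid >>> ((3 - i) * 8)) & 0xFF);
        }
        return keyData;
    }

    // Unsigned lexicographic comparison, mirroring BDB JE's default key order.
    static int compare(byte[] a, byte[] b) {
        for (int i = 0; i < a.length; i++) {
            int diff = (a[i] & 0xFF) - (b[i] & 0xFF);
            if (diff != 0) {
                return diff;
            }
        }
        return 0;
    }

    public static void main(String[] args) {
        byte[] urgent    = key((byte) 0, (short) 5, 1000);
        byte[] deferred  = key((byte) 1, (short) 0, 1);
        byte[] shallower = key((byte) 0, (short) 2, 2000);

        // Priority dominates depth and docid.
        System.out.println(compare(urgent, deferred) < 0);   // true
        // Equal priority: lower depth wins despite a larger docid.
        System.out.println(compare(shallower, urgent) < 0);  // true
    }
}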
24 changes: 24 additions & 0 deletions src/main/java/edu/uci/ics/crawler4j/parser/ExtractedUrlAnchorPair.java
@@ -0,0 +1,24 @@
package edu.uci.ics.crawler4j.parser;

public class ExtractedUrlAnchorPair {

private String href;
private String anchor;

public String getHref() {
return href;
}

public void setHref(String href) {
this.href = href;
}

public String getAnchor() {
return anchor;
}

public void setAnchor(String anchor) {
this.anchor = anchor;
}

}
56 changes: 41 additions & 15 deletions src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
@@ -17,20 +17,20 @@

package edu.uci.ics.crawler4j.parser;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class HtmlContentHandler extends DefaultHandler {

private enum Element {
A, AREA, LINK, IFRAME, FRAME, EMBED, IMG, BASE, META, BODY
}
}

private static class HtmlFactory {
private static Map<String, Element> name2Element;
@@ -54,12 +54,16 @@ public static Element getElement(String name) {
private boolean isWithinBodyElement;
private StringBuilder bodyText;

private Set<String> outgoingUrls;
private List<ExtractedUrlAnchorPair> outgoingUrls;

private ExtractedUrlAnchorPair curUrl = null;
private boolean anchorFlag = false;
private StringBuilder anchorText = new StringBuilder();

public HtmlContentHandler() {
isWithinBodyElement = false;
bodyText = new StringBuilder();
outgoingUrls = new HashSet<String>();
outgoingUrls = new ArrayList<ExtractedUrlAnchorPair>();
}

@Override
@@ -69,23 +73,30 @@ public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
if (element == Element.A || element == Element.AREA || element == Element.LINK) {
String href = attributes.getValue("href");
if (href != null) {
outgoingUrls.add(href);
anchorFlag = true;
curUrl = new ExtractedUrlAnchorPair();
curUrl.setHref(href);
outgoingUrls.add(curUrl);
}
return;
}

if (element == Element.IMG) {
String imgSrc = attributes.getValue("src");
if (imgSrc != null) {
outgoingUrls.add(imgSrc);
curUrl = new ExtractedUrlAnchorPair();
curUrl.setHref(imgSrc);
outgoingUrls.add(curUrl);
}
return;
}

if (element == Element.IFRAME || element == Element.FRAME || element == Element.EMBED) {
String src = attributes.getValue("src");
if (src != null) {
outgoingUrls.add(src);
curUrl = new ExtractedUrlAnchorPair();
curUrl.setHref(src);
outgoingUrls.add(curUrl);
}
return;
}
@@ -125,12 +136,23 @@ public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {

if (element == Element.BODY) {
isWithinBodyElement = true;
}
}
}

@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
Element element = HtmlFactory.getElement(localName);
if (element == Element.A || element == Element.AREA || element == Element.LINK) {
anchorFlag = false;
if (curUrl != null) {
String anchor = anchorText.toString().trim();
if (!anchor.isEmpty()) {
curUrl.setAnchor(anchor);
}
anchorText.delete(0, anchorText.length());
}
curUrl = null;
}
if (element == Element.BODY) {
isWithinBodyElement = false;
}
@@ -140,17 +162,21 @@ public void endElement(String uri, String localName, String qName) throws SAXException {
public void characters(char ch[], int start, int length) throws SAXException {
if (isWithinBodyElement) {
bodyText.append(ch, start, length);

if (anchorFlag) {
anchorText.append(new String(ch, start, length).replaceAll("\n", "").replaceAll("\t", "").trim());
}
}
}

public String getBodyText() {
return bodyText.toString();
}
public Set<String> getOutgoingUrls() {

public List<ExtractedUrlAnchorPair> getOutgoingUrls() {
return outgoingUrls;
}

public String getBaseUrl() {
return base;
}
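To see the new anchor extraction end to end, here is a small driver sketch (not from the commit; inside crawler4j this handler is normally driven by Tika's HTML parser, and the plain JDK SAX parser used here works only because the input is well-formed XHTML):

import java.io.StringReader;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.InputSource;
import edu.uci.ics.crawler4j.parser.ExtractedUrlAnchorPair;
import edu.uci.ics.crawler4j.parser.HtmlContentHandler;

public class AnchorExtractionDemo {
    public static void main(String[] args) throws Exception {
        String html = "<html><body>"
                + "<a href=\"http://example.com/a\">First link</a>"
                + "<a href=\"http://example.com/b\">Second\nlink</a>"
                + "</body></html>";
        HtmlContentHandler handler = new HtmlContentHandler();
        // Well-formed XHTML, so a JDK SAX parser can stand in for Tika here.
        SAXParserFactory.newInstance().newSAXParser()
                .parse(new InputSource(new StringReader(html)), handler);
        for (ExtractedUrlAnchorPair pair : handler.getOutgoingUrls()) {
            System.out.println(pair.getHref() + " -> " + pair.getAnchor());
        }
        // Prints:
        //   http://example.com/a -> First link
        //   http://example.com/b -> Secondlink   (newlines are stripped, not replaced by spaces)
    }
}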
18 changes: 7 additions & 11 deletions src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
@@ -22,9 +22,7 @@
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
@@ -98,15 +96,16 @@ public boolean parse(Page page, String contextURL) {
parseData.setText(contentHandler.getBodyText().trim());
parseData.setTitle(metadata.get(Metadata.TITLE));

Set<String> urls = new HashSet<String>();
List<WebURL> outgoingUrls = new ArrayList<WebURL>();

String baseURL = contentHandler.getBaseUrl();
if (baseURL != null) {
contextURL = baseURL;
}

int urlCount = 0;
for (String href : contentHandler.getOutgoingUrls()) {
for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
String href = urlAnchorPair.getHref();
href = href.trim();
if (href.length() == 0) {
continue;
@@ -118,7 +117,10 @@
if (!hrefWithoutProtocol.contains("javascript:") && !hrefWithoutProtocol.contains("@")) {
String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
if (url != null) {
urls.add(url);
WebURL webURL = new WebURL();
webURL.setURL(url);
webURL.setAnchor(urlAnchorPair.getAnchor());
outgoingUrls.add(webURL);
urlCount++;
if (urlCount > config.getMaxOutgoingLinksToFollow()) {
break;
@@ -127,12 +129,6 @@
}
}

List<WebURL> outgoingUrls = new ArrayList<WebURL>();
for (String url : urls) {
WebURL webURL = new WebURL();
webURL.setURL(url);
outgoingUrls.add(webURL);
}
parseData.setOutgoingUrls(outgoingUrls);

try {
52 changes: 39 additions & 13 deletions src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
@@ -41,6 +41,8 @@ public class WebURL implements Serializable {
private String domain;
private String subDomain;
private String path;
private String anchor;
private byte priority;

/**
* Returns the unique document id assigned to this Url.
@@ -109,9 +111,8 @@ public void setURL(String url) {
}

/**
* Returns the unique document id of the parent page.
* The parent page is the page in which the Url of this
* page is first observed.
* Returns the unique document id of the parent page. The parent page is the
* page in which the Url of this page is first observed.
*/
public int getParentDocid() {
return parentDocid;
@@ -122,9 +123,8 @@ public void setParentDocid(int parentDocid) {
}

/**
* Returns the url of the parent page.
* The parent page is the page in which the Url of this
* page is first observed.
* Returns the url of the parent page. The parent page is the page in which
* the Url of this page is first observed.
*/
public String getParentUrl() {
return parentUrl;
@@ -135,9 +135,9 @@ public void setParentUrl(String parentUrl) {
}

/**
* Returns the crawl depth at which this Url is first observed.
* Seed Urls are at depth 0. Urls that are extracted from seed Urls
* are at depth 1, etc.
* Returns the crawl depth at which this Url is first observed. Seed Urls
* are at depth 0. Urls that are extracted from seed Urls are at depth 1,
* etc.
*/
public short getDepth() {
return depth;
@@ -148,8 +148,8 @@ public void setDepth(short depth) {
}

/**
* Returns the domain of this Url.
* For 'http://www.example.com/sample.htm', domain will be 'example.com'
* Returns the domain of this Url. For 'http://www.example.com/sample.htm',
* domain will be 'example.com'
*/
public String getDomain() {
return domain;
@@ -160,8 +160,8 @@ public String getSubDomain() {
}

/**
* Returns the path of this Url.
* For 'http://www.example.com/sample.htm', domain will be 'sample.htm'
* Returns the path of this Url. For 'http://www.example.com/sample.htm',
* path will be '/sample.htm'
*/
public String getPath() {
return path;
@@ -170,4 +170,30 @@ public String getPath() {
public void setPath(String path) {
this.path = path;
}

/**
* Returns the anchor string. For example, in <a href="example.com">A sample anchor</a>
* the anchor string is 'A sample anchor'
*/
public String getAnchor() {
return anchor;
}

public void setAnchor(String anchor) {
this.anchor = anchor;
}

/**
* Returns the priority for crawling this URL.
* A lower number results in higher priority.
*/
public byte getPriority() {
return priority;
}

public void setPriority(byte priority) {
this.priority = priority;
}


}
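Nothing in this commit assigns a non-zero priority yet; that is left to calling code. A hypothetical usage sketch (the URLs and values are illustrative only):

WebURL urgent = new WebURL();
urgent.setURL("http://example.com/breaking-news");
urgent.setPriority((byte) 0);      // crawled before any URL with priority 1 or higher

WebURL background = new WebURL();
background.setURL("http://example.com/old-archive");
background.setPriority((byte) 10); // crawled only after all URLs with smaller priority values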
9 changes: 8 additions & 1 deletion src/main/java/edu/uci/ics/crawler4j/util/Util.java
@@ -35,12 +35,19 @@ public static byte[] long2ByteArray(long l) {
public static byte[] int2ByteArray(int value) {
byte[] b = new byte[4];
for (int i = 0; i < 4; i++) {
int offset = (b.length - 1 - i) * 8;
int offset = (3 - i) * 8;
b[i] = (byte) ((value >>> offset) & 0xFF);
}
return b;
}

public static void putIntInByteArray(int value, byte[] buf, int offset) {
for (int i = 0; i < 4; i++) {
int valueOffset = (3 - i) * 8;
buf[offset + i] = (byte) ((value >>> valueOffset) & 0xFF);
}
}

public static int byteArray2Int(byte[] b) {
int value = 0;
for (int i = 0; i < 4; i++) {
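As a quick sanity check of the packing direction (a sketch, not part of the commit):

byte[] buf = new byte[6];
Util.putIntInByteArray(0x01020304, buf, 2);
// buf is now {0x00, 0x00, 0x01, 0x02, 0x03, 0x04}: the most significant
// byte of the int lands at buf[offset]. This matches int2ByteArray, so
// docids packed into queue keys compare correctly byte by byte.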
