New crawling order: priority, depth, docid
Extracting anchor texts
yasserg committed Mar 12, 2012
1 parent 1c23e32 commit 7b8bf91
Showing 8 changed files with 143 additions and 46 deletions.
2 changes: 1 addition & 1 deletion pom.xml
@@ -4,7 +4,7 @@
<artifactId>crawler4j</artifactId>
<packaging>jar</packaging>
<name>crawler4j</name>
<version>3.3.1-SNAPSHOT</version>
<version>3.3.2-SNAPSHOT</version>
<description>Open Source Web Crawler for Java</description>
<url>http://code.google.com/p/crawler4j/</url>
<licenses>
2 changes: 2 additions & 0 deletions src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java
@@ -36,6 +36,7 @@ public WebURL entryToObject(TupleInput input) {
webURL.setParentDocid(input.readInt());
webURL.setParentUrl(input.readString());
webURL.setDepth(input.readShort());
webURL.setPriority(input.readByte());
return webURL;
}

@@ -46,5 +47,6 @@ public void objectToEntry(WebURL url, TupleOutput output) {
output.writeInt(url.getParentDocid());
output.writeString(url.getParentUrl());
output.writeShort(url.getDepth());
output.writeByte(url.getPriority());
}
}
26 changes: 21 additions & 5 deletions src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java
@@ -134,7 +134,23 @@ public void delete(int count) throws DatabaseException {
}

public void put(WebURL url) throws DatabaseException {
byte[] keyData = Util.int2ByteArray(url.getDocid());

/*
* The key that is used for storing URLs determines the order
* in which they are crawled. Lower key values result in earlier crawling.
* Here our keys are 6 bytes. The first byte comes from the URL priority.
* The second byte comes from the depth of crawl at which this URL is first found.
* The remaining 4 bytes come from the docid of the URL. As a result,
* URLs with lower priority numbers will be crawled earlier. If priority
* numbers are the same, those found at lower depths will be crawled earlier.
* If depth is also equal, those found earlier (and therefore having a
* smaller docid) will be crawled earlier.
*/
byte[] keyData = new byte[6];
keyData[0] = url.getPriority();
keyData[1] = (url.getDepth() > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte) url.getDepth());
Util.putIntInByteArray(url.getDocid(), keyData, 2);

DatabaseEntry value = new DatabaseEntry();
webURLBinding.objectToEntry(url, value);
Transaction txn;
@@ -145,10 +161,10 @@ public void put(WebURL url) throws DatabaseException {
}
urlsDB.put(txn, new DatabaseEntry(keyData), value);
if (resumable) {
if (txn != null) {
txn.commit();
}
}
if (txn != null) {
txn.commit();
}
}
}

public long getLength() {
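The effect of this key layout can be checked in isolation. Below is a minimal sketch, not part of the commit (the class and helper names are mine), that rebuilds the 6-byte key and compares keys the way Berkeley DB JE does by default, as unsigned byte arrays. Note the documented order holds only for non-negative priority values, since a negative byte compares as a large unsigned value.

public class KeyOrderDemo {

    static byte[] key(byte priority, short depth, int docid) {
        byte[] keyData = new byte[6];
        keyData[0] = priority;
        keyData[1] = (depth > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte) depth);
        // Big-endian docid, same as Util.putIntInByteArray(docid, keyData, 2)
        for (int i = 0; i < 4; i++) {
            keyData[2 + i] = (byte) ((docid >>> ((3 - i) * 8)) & 0xFF);
        }
        return keyData;
    }

    // Unsigned lexicographic comparison, mirroring BDB JE's default key order.
    static int compare(byte[] a, byte[] b) {
        for (int i = 0; i < a.length; i++) {
            int diff = (a[i] & 0xFF) - (b[i] & 0xFF);
            if (diff != 0) {
                return diff;
            }
        }
        return 0;
    }

    public static void main(String[] args) {
        byte[] urgent    = key((byte) 0, (short) 5, 1000);
        byte[] deferred  = key((byte) 1, (short) 0, 1);
        byte[] shallower = key((byte) 0, (short) 2, 2000);

        // Priority dominates depth and docid.
        System.out.println(compare(urgent, deferred) < 0);   // true
        // Equal priority: lower depth wins despite a larger docid.
        System.out.println(compare(shallower, urgent) < 0);  // true
    }
}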
24 changes: 24 additions & 0 deletions src/main/java/edu/uci/ics/crawler4j/parser/ExtractedUrlAnchorPair.java
@@ -0,0 +1,24 @@
package edu.uci.ics.crawler4j.parser;

public class ExtractedUrlAnchorPair {

private String href;
private String anchor;

public String getHref() {
return href;
}

public void setHref(String href) {
this.href = href;
}

public String getAnchor() {
return anchor;
}

public void setAnchor(String anchor) {
this.anchor = anchor;
}

}
56 changes: 41 additions & 15 deletions src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
@@ -17,20 +17,20 @@

package edu.uci.ics.crawler4j.parser;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class HtmlContentHandler extends DefaultHandler {

private enum Element {
A, AREA, LINK, IFRAME, FRAME, EMBED, IMG, BASE, META, BODY
}
}

private static class HtmlFactory {
private static Map<String, Element> name2Element;
@@ -54,12 +54,16 @@ public static Element getElement(String name) {
private boolean isWithinBodyElement;
private StringBuilder bodyText;

private Set<String> outgoingUrls;
private List<ExtractedUrlAnchorPair> outgoingUrls;

private ExtractedUrlAnchorPair curUrl = null;
private boolean anchorFlag = false;
private StringBuilder anchorText = new StringBuilder();

public HtmlContentHandler() {
isWithinBodyElement = false;
bodyText = new StringBuilder();
outgoingUrls = new HashSet<String>();
outgoingUrls = new ArrayList<ExtractedUrlAnchorPair>();
}

@Override
@@ -69,23 +73,30 @@ public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
if (element == Element.A || element == Element.AREA || element == Element.LINK) {
String href = attributes.getValue("href");
if (href != null) {
outgoingUrls.add(href);
anchorFlag = true;
curUrl = new ExtractedUrlAnchorPair();
curUrl.setHref(href);
outgoingUrls.add(curUrl);
}
return;
}

if (element == Element.IMG) {
String imgSrc = attributes.getValue("src");
if (imgSrc != null) {
outgoingUrls.add(imgSrc);
curUrl = new ExtractedUrlAnchorPair();
curUrl.setHref(imgSrc);
outgoingUrls.add(curUrl);
}
return;
}

if (element == Element.IFRAME || element == Element.FRAME || element == Element.EMBED) {
String src = attributes.getValue("src");
if (src != null) {
outgoingUrls.add(src);
curUrl = new ExtractedUrlAnchorPair();
curUrl.setHref(src);
outgoingUrls.add(curUrl);
}
return;
}
@@ -125,12 +136,23 @@ public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {

if (element == Element.BODY) {
isWithinBodyElement = true;
}
}
}

@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
Element element = HtmlFactory.getElement(localName);
if (element == Element.A || element == Element.AREA || element == Element.LINK) {
anchorFlag = false;
if (curUrl != null) {
String anchor = anchorText.toString().trim();
if (!anchor.isEmpty()) {
curUrl.setAnchor(anchor);
}
anchorText.delete(0, anchorText.length());
}
curUrl = null;
}
if (element == Element.BODY) {
isWithinBodyElement = false;
}
@@ -140,17 +162,21 @@ public void endElement(String uri, String localName, String qName) throws SAXException {
public void characters(char ch[], int start, int length) throws SAXException {
if (isWithinBodyElement) {
bodyText.append(ch, start, length);

if (anchorFlag) {
anchorText.append(new String(ch, start, length).replaceAll("\n", "").replaceAll("\t", "").trim());
}
}
}

public String getBodyText() {
return bodyText.toString();
}
public Set<String> getOutgoingUrls() {

public List<ExtractedUrlAnchorPair> getOutgoingUrls() {
return outgoingUrls;
}

public String getBaseUrl() {
return base;
}
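To see the new anchor extraction end to end, here is a small driver sketch (not from the commit; inside crawler4j this handler is normally driven by Tika's HTML parser, and the plain JDK SAX parser used here works only because the input is well-formed XHTML):

import java.io.StringReader;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.InputSource;
import edu.uci.ics.crawler4j.parser.ExtractedUrlAnchorPair;
import edu.uci.ics.crawler4j.parser.HtmlContentHandler;

public class AnchorExtractionDemo {
    public static void main(String[] args) throws Exception {
        String html = "<html><body>"
                + "<a href=\"http://example.com/a\">First link</a>"
                + "<a href=\"http://example.com/b\">Second\nlink</a>"
                + "</body></html>";
        HtmlContentHandler handler = new HtmlContentHandler();
        // Well-formed XHTML, so a JDK SAX parser can stand in for Tika here.
        SAXParserFactory.newInstance().newSAXParser()
                .parse(new InputSource(new StringReader(html)), handler);
        for (ExtractedUrlAnchorPair pair : handler.getOutgoingUrls()) {
            System.out.println(pair.getHref() + " -> " + pair.getAnchor());
        }
        // Prints:
        //   http://example.com/a -> First link
        //   http://example.com/b -> Secondlink   (newlines are stripped, not replaced by spaces)
    }
}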
18 changes: 7 additions & 11 deletions src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
@@ -22,9 +22,7 @@
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
@@ -98,15 +96,16 @@ public boolean parse(Page page, String contextURL) {
parseData.setText(contentHandler.getBodyText().trim());
parseData.setTitle(metadata.get(Metadata.TITLE));

Set<String> urls = new HashSet<String>();
List<WebURL> outgoingUrls = new ArrayList<WebURL>();

String baseURL = contentHandler.getBaseUrl();
if (baseURL != null) {
contextURL = baseURL;
}

int urlCount = 0;
for (String href : contentHandler.getOutgoingUrls()) {
for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
String href = urlAnchorPair.getHref();
href = href.trim();
if (href.length() == 0) {
continue;
@@ -118,7 +117,10 @@
if (!hrefWithoutProtocol.contains("javascript:") && !hrefWithoutProtocol.contains("@")) {
String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
if (url != null) {
urls.add(url);
WebURL webURL = new WebURL();
webURL.setURL(url);
webURL.setAnchor(urlAnchorPair.getAnchor());
outgoingUrls.add(webURL);
urlCount++;
if (urlCount > config.getMaxOutgoingLinksToFollow()) {
break;
@@ -127,12 +129,6 @@
}
}

List<WebURL> outgoingUrls = new ArrayList<WebURL>();
for (String url : urls) {
WebURL webURL = new WebURL();
webURL.setURL(url);
outgoingUrls.add(webURL);
}
parseData.setOutgoingUrls(outgoingUrls);

try {
52 changes: 39 additions & 13 deletions src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
@@ -41,6 +41,8 @@ public class WebURL implements Serializable {
private String domain;
private String subDomain;
private String path;
private String anchor;
private byte priority;

/**
* Returns the unique document id assigned to this Url.
@@ -109,9 +111,8 @@ public void setURL(String url) {
}

/**
* Returns the unique document id of the parent page.
* The parent page is the page in which the Url of this
* page is first observed.
* Returns the unique document id of the parent page. The parent page is the
* page in which the Url of this page is first observed.
*/
public int getParentDocid() {
return parentDocid;
@@ -122,9 +123,8 @@ public void setParentDocid(int parentDocid) {
}

/**
* Returns the url of the parent page.
* The parent page is the page in which the Url of this
* page is first observed.
* Returns the url of the parent page. The parent page is the page in which
* the Url of this page is first observed.
*/
public String getParentUrl() {
return parentUrl;
@@ -135,9 +135,9 @@ public void setParentUrl(String parentUrl) {
}

/**
* Returns the crawl depth at which this Url is first observed.
* Seed Urls are at depth 0. Urls that are extracted from seed Urls
* are at depth 1, etc.
* Returns the crawl depth at which this Url is first observed. Seed Urls
* are at depth 0. Urls that are extracted from seed Urls are at depth 1,
* etc.
*/
public short getDepth() {
return depth;
@@ -148,8 +148,8 @@ public void setDepth(short depth) {
}

/**
* Returns the domain of this Url.
* For 'http://www.example.com/sample.htm', domain will be 'example.com'
* Returns the domain of this Url. For 'http://www.example.com/sample.htm',
* domain will be 'example.com'
*/
public String getDomain() {
return domain;
@@ -160,8 +160,8 @@ public String getSubDomain() {
}

/**
* Returns the path of this Url.
* For 'http://www.example.com/sample.htm', domain will be 'sample.htm'
* Returns the path of this Url. For 'http://www.example.com/sample.htm',
* path will be '/sample.htm'
*/
public String getPath() {
return path;
@@ -170,4 +170,30 @@ public String getPath() {
public void setPath(String path) {
this.path = path;
}

/**
* Returns the anchor string. For example, in <a href="example.com">A sample anchor</a>
* the anchor string is 'A sample anchor'
*/
public String getAnchor() {
return anchor;
}

public void setAnchor(String anchor) {
this.anchor = anchor;
}

/**
* Returns the priority for crawling this URL.
* A lower number results in higher priority.
*/
public byte getPriority() {
return priority;
}

public void setPriority(byte priority) {
this.priority = priority;
}


}
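Nothing in this commit assigns a non-zero priority yet; that is left to calling code. A hypothetical usage sketch (the URLs and values are illustrative only):

WebURL urgent = new WebURL();
urgent.setURL("http://example.com/breaking-news");
urgent.setPriority((byte) 0);      // crawled before any URL with priority 1 or higher

WebURL background = new WebURL();
background.setURL("http://example.com/old-archive");
background.setPriority((byte) 10); // crawled only after all URLs with smaller priority values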
9 changes: 8 additions & 1 deletion src/main/java/edu/uci/ics/crawler4j/util/Util.java
@@ -35,12 +35,19 @@ public static byte[] long2ByteArray(long l) {
public static byte[] int2ByteArray(int value) {
byte[] b = new byte[4];
for (int i = 0; i < 4; i++) {
int offset = (b.length - 1 - i) * 8;
int offset = (3 - i) * 8;
b[i] = (byte) ((value >>> offset) & 0xFF);
}
return b;
}

public static void putIntInByteArray(int value, byte[] buf, int offset) {
for (int i = 0; i < 4; i++) {
int valueOffset = (3 - i) * 8;
buf[offset + i] = (byte) ((value >>> valueOffset) & 0xFF);
}
}

public static int byteArray2Int(byte[] b) {
int value = 0;
for (int i = 0; i < 4; i++) {
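As a quick sanity check of the packing direction (a sketch, not part of the commit):

byte[] buf = new byte[6];
Util.putIntInByteArray(0x01020304, buf, 2);
// buf is now {0x00, 0x00, 0x01, 0x02, 0x03, 0x04}: the most significant
// byte of the int lands at buf[offset]. This matches int2ByteArray, so
// docids packed into queue keys compare correctly byte by byte.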
