-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #23 from aecio/dev
Dependency upgrades and maintenance changes
- Loading branch information
Showing
6 changed files
with
103 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
version: 2 | ||
updates: | ||
- package-ecosystem: maven | ||
directory: "/" | ||
schedule: | ||
interval: weekly |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
name: crawler-commons-http-fetcher build | ||
|
||
on: [push] | ||
|
||
jobs: | ||
build: | ||
runs-on: ubuntu-latest | ||
strategy: | ||
matrix: | ||
java: [ 8, 11, 17 ] | ||
name: Java ${{ matrix.java }} | ||
steps: | ||
- uses: actions/checkout@v2 | ||
|
||
- name: Setup JDK | ||
uses: actions/setup-java@v2 | ||
with: | ||
distribution: 'temurin' | ||
java-version: ${{ matrix.java }} | ||
cache: 'maven' | ||
|
||
- name: Build | ||
run: mvn install javadoc:aggregate |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,38 @@ | ||
# http-fetcher | ||
Wrapper code for Apache HttpClient that provides common page fetching functionality | ||
|
||
TODO - add more context here. | ||
The Crawler Commons' http-fetcher is a Java library that provides common page fetching functionality needed in web crawlers. | ||
Currently, it uses the Apache HttpClient library to implement low-level HTTP communication. | ||
|
||
An example of creating a fetcher with five threads that will only accept content identified by the server as text/html: | ||
## Requirements | ||
Currently, http-fetcher requires Java 8+. | ||
|
||
## API | ||
|
||
An example of creating a fetcher with a single thread that will only accept content identified by the server as `text/html`: | ||
|
||
``` java | ||
BaseFetcher fetcher = new SimpleHttpFetcher(1, new UserAgent("mycrawler", "[email protected]", "http://domain.com")); | ||
Set<String> validMimeTypes = new HashSet<String>(); | ||
// The data passed to UserAgent is used to automatically build the HTTP 'User-Agent' header | ||
UserAgent userAgent = new UserAgent("mycrawler", "[email protected]", "http://domain.com"); | ||
|
||
// Instantiate the BaseFetcher object used to fetch pages | ||
BaseFetcher fetcher = new SimpleHttpFetcher(1, userAgent); | ||
|
||
// Configure the accepted mime-types | ||
Set<String> validMimeTypes = new HashSet<>(); | ||
validMimeTypes.add("text/html"); | ||
fetcher.setValidMimeTypes(validMimeTypes); | ||
FetchedResult result = fetcher.get("http://localhost:8089/"); | ||
|
||
try { | ||
// Fetch the web page from the Web | ||
FetchedResult result = fetcher.get("http://localhost:8089/"); | ||
|
||
// Read the downloaded content (additional data is available via the remaining methods of the FetchedResult object) | ||
String requestedUrl = result.getBaseUrl(); // the requested URL (same as above) | ||
String finalUrl = result.getFetchedUrl(); // the final URL after redirects (if any) | ||
byte[] page = result.getContent(); // the page data returned by server as a byte array | ||
long fetchTime = result.getFetchTime(); // the time taken to download the page | ||
String address = result.getHostAddress(); // the host address | ||
} catch (BaseFetchException e) { | ||
// The download has failed. Check the actual subclass of BaseFetchException to get error details. | ||
} | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -92,6 +92,12 @@ | |
<name>Avi Hayun</name> | ||
<email>[email protected]</email> | ||
</developer> | ||
|
||
<developer> | ||
<id>aecio</id> | ||
<name>Aécio Santos</name> | ||
<email>[email protected]</email> | ||
</developer> | ||
</developers> | ||
|
||
<build> | ||
|
@@ -221,7 +227,7 @@ | |
<plugin> | ||
<groupId>de.thetaphi</groupId> | ||
<artifactId>forbiddenapis</artifactId> | ||
<version>1.8</version> | ||
<version>3.3</version> | ||
<configuration> | ||
<!-- disallow undocumented classes like sun.misc.Unsafe: --> | ||
<internalRuntimeForbidden>true</internalRuntimeForbidden> | ||
|
@@ -335,24 +341,24 @@ | |
|
||
<properties> | ||
<!-- Dependencies --> | ||
<httpclient.version>4.5.8</httpclient.version> | ||
<commons-io.version>2.4</commons-io.version> | ||
<slf4j-api.version>1.7.7</slf4j-api.version> | ||
<httpclient.version>4.5.13</httpclient.version> | ||
<commons-io.version>2.11.0</commons-io.version> | ||
<slf4j-api.version>1.7.36</slf4j-api.version> | ||
|
||
<!-- Dependencies for testing --> | ||
<slf4j-log4j12.version>1.7.7</slf4j-log4j12.version> | ||
<junit.version>4.7</junit.version> | ||
<mockito-core.version>1.8.0</mockito-core.version> | ||
<jetty.version>9.3.6.v20151106</jetty.version> | ||
<slf4j-log4j12.version>1.7.33</slf4j-log4j12.version> | ||
<junit.version>4.13.2</junit.version> | ||
<mockito-core.version>4.6.1</mockito-core.version> | ||
<jetty.version>9.4.48.v20220622</jetty.version> | ||
|
||
<!-- Maven Plugin Dependencies --> | ||
<maven-compiler-plugin.version>2.3.2</maven-compiler-plugin.version> | ||
<maven-resources-plugin.version>2.5</maven-resources-plugin.version> | ||
<maven-jar-plugin.version>2.4</maven-jar-plugin.version> | ||
<maven-surfire-plugin.version>2.12</maven-surfire-plugin.version> | ||
<maven-release-plugin.version>2.5.1</maven-release-plugin.version> | ||
<maven-source-plugin.version>2.1.2</maven-source-plugin.version> | ||
<maven-javadoc-plugin.version>2.9.1</maven-javadoc-plugin.version> | ||
<maven-surfire-plugin.version>2.22.2</maven-surfire-plugin.version> | ||
<maven-release-plugin.version>2.5.3</maven-release-plugin.version> | ||
<maven-source-plugin.version>3.2.1</maven-source-plugin.version> | ||
<maven-javadoc-plugin.version>3.4.0</maven-javadoc-plugin.version> | ||
<maven-gpg-plugin.version>1.4</maven-gpg-plugin.version> | ||
<apache-rat-plugin.version>0.8</apache-rat-plugin.version> | ||
<maven-assembly-plugin.version>2.2.2</maven-assembly-plugin.version> | ||
|
@@ -361,9 +367,9 @@ | |
|
||
<!-- General Properties --> | ||
<implementation.build>${scmBranch}@r${buildNumber}</implementation.build> | ||
<javac.src.version>1.7</javac.src.version> | ||
<javac.target.version>1.7</javac.target.version> | ||
<maven.compiler.target>1.7</maven.compiler.target> | ||
<javac.src.version>1.8</javac.src.version> | ||
<javac.target.version>1.8</javac.target.version> | ||
<maven.compiler.target>1.8</maven.compiler.target> | ||
<maven.build.timestamp.format>yyyy-MM-dd HH:mm:ssZ</maven.build.timestamp.format> | ||
<skipTests>false</skipTests> | ||
<assembly.finalName>${project.build.finalName}</assembly.finalName> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters