Skip to content

Commit

Permalink
DOC 调整
Browse files Browse the repository at this point in the history
  • Loading branch information
xuxueli committed Oct 24, 2018
1 parent 05a4f09 commit d1a7cc3
Show file tree
Hide file tree
Showing 17 changed files with 46 additions and 49 deletions.
2 changes: 2 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
<maven.compiler.source>1.7</maven.compiler.source>
<maven.compiler.target>1.7</maven.compiler.target>

<maven.test.skip>true</maven.test.skip>

<jsoup.version>1.11.2</jsoup.version>
<htmlunit.version>2.24</htmlunit.version>
<selenium-java.version>2.53.1</selenium-java.version>
Expand Down
34 changes: 17 additions & 17 deletions src/main/java/com/xuxueli/crawler/XxlCrawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public static class Builder {
* 设置运行数据类型
*
* @param runData
* @return
* @return Builder
*/
public Builder setRunData(RunData runData){
crawler.runData = runData;
Expand All @@ -66,7 +66,7 @@ public Builder setRunData(RunData runData){
* 待爬的URL列表
*
* @param urls
* @return
* @return Builder
*/
public Builder setUrls(String... urls) {
if (urls!=null && urls.length>0) {
Expand All @@ -82,7 +82,7 @@ public Builder setUrls(String... urls) {
* 允许扩散爬取,将会以现有URL为起点扩散爬取整站
*
* @param allowSpread
* @return
* @return Builder
*/
public Builder setAllowSpread(boolean allowSpread) {
crawler.runConf.setAllowSpread(allowSpread);
Expand All @@ -93,7 +93,7 @@ public Builder setAllowSpread(boolean allowSpread) {
* URL白名单正则,非空时进行URL白名单过滤页面
*
* @param whiteUrlRegexs
* @return
* @return Builder
*/
public Builder setWhiteUrlRegexs(String... whiteUrlRegexs) {
if (whiteUrlRegexs!=null && whiteUrlRegexs.length>0) {
Expand All @@ -108,7 +108,7 @@ public Builder setWhiteUrlRegexs(String... whiteUrlRegexs) {
* 页面解析器
*
* @param pageParser
* @return
* @return Builder
*/
public Builder setPageParser(PageParser pageParser){
crawler.runConf.setPageParser(pageParser);
Expand All @@ -119,7 +119,7 @@ public Builder setPageParser(PageParser pageParser){
* 页面下载器
*
* @param pageLoader
* @return
* @return Builder
*/
public Builder setPageLoader(PageLoader pageLoader){
crawler.runConf.setPageLoader(pageLoader);
Expand All @@ -131,7 +131,7 @@ public Builder setPageLoader(PageLoader pageLoader){
* 请求参数
*
* @param paramMap
* @return
* @return Builder
*/
public Builder setParamMap(Map<String, String> paramMap){
crawler.runConf.setParamMap(paramMap);
Expand All @@ -142,7 +142,7 @@ public Builder setParamMap(Map<String, String> paramMap){
* 请求Cookie
*
* @param cookieMap
* @return
* @return Builder
*/
public Builder setCookieMap(Map<String, String> cookieMap){
crawler.runConf.setCookieMap(cookieMap);
Expand All @@ -153,7 +153,7 @@ public Builder setCookieMap(Map<String, String> cookieMap){
* 请求Header
*
* @param headerMap
* @return
* @return Builder
*/
public Builder setHeaderMap(Map<String, String> headerMap){
crawler.runConf.setHeaderMap(headerMap);
Expand All @@ -164,7 +164,7 @@ public Builder setHeaderMap(Map<String, String> headerMap){
* 请求UserAgent
*
* @param userAgents
* @return
* @return Builder
*/
public Builder setUserAgent(String... userAgents){
if (userAgents!=null && userAgents.length>0) {
Expand All @@ -181,7 +181,7 @@ public Builder setUserAgent(String... userAgents){
* 请求Referrer
*
* @param referrer
* @return
* @return Builder
*/
public Builder setReferrer(String referrer){
crawler.runConf.setReferrer(referrer);
Expand All @@ -192,7 +192,7 @@ public Builder setReferrer(String referrer){
* 请求方式:true=POST请求、false=GET请求
*
* @param ifPost
* @return
* @return Builder
*/
public Builder setIfPost(boolean ifPost){
crawler.runConf.setIfPost(ifPost);
Expand All @@ -203,7 +203,7 @@ public Builder setIfPost(boolean ifPost){
* 超时时间,毫秒
*
* @param timeoutMillis
* @return
* @return Builder
*/
public Builder setTimeoutMillis(int timeoutMillis){
crawler.runConf.setTimeoutMillis(timeoutMillis);
Expand All @@ -214,7 +214,7 @@ public Builder setTimeoutMillis(int timeoutMillis){
* 停顿时间,爬虫线程处理完页面之后进行主动停顿,避免过于频繁被拦截;
*
* @param pauseMillis
* @return
* @return Builder
*/
public Builder setPauseMillis(int pauseMillis){
crawler.runConf.setPauseMillis(pauseMillis);
Expand All @@ -225,7 +225,7 @@ public Builder setPauseMillis(int pauseMillis){
* 代理生成器
*
* @param proxyMaker
* @return
* @return Builder
*/
public Builder setProxyMaker(ProxyMaker proxyMaker){
crawler.runConf.setProxyMaker(proxyMaker);
Expand All @@ -236,7 +236,7 @@ public Builder setProxyMaker(ProxyMaker proxyMaker){
* 失败重试次数,大于零时生效
*
* @param failRetryCount
* @return
* @return Builder
*/
public Builder setFailRetryCount(int failRetryCount){
if (failRetryCount > 0) {
Expand All @@ -250,7 +250,7 @@ public Builder setFailRetryCount(int failRetryCount){
* 爬虫并发线程数
*
* @param threadCount
* @return
* @return Builder
*/
public Builder setThreadCount(int threadCount) {
crawler.threadCount = threadCount;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
*
* CSS选择器, 如 "#title"
*
* @return
* @return String
*/
public String cssQuery() default "";

Expand All @@ -32,7 +32,7 @@
*
* @see com.xuxueli.crawler.conf.XxlCrawlerConf.SelectType
*
* @return
* @return SelectType
*/
public XxlCrawlerConf.SelectType selectType() default XxlCrawlerConf.SelectType.TEXT;

Expand All @@ -41,7 +41,7 @@
*
* jquery 数据抽取参数,SelectType=ATTR/HAS_CLASS 时有效,如 ".attr("abs:src")"
*
* @return
* @return String
*/
public String selectVal() default "";

Expand All @@ -50,7 +50,7 @@
*
* 时间格式化,日期类型数据有效
*
* @return
* @return String
*/
String datePattern() default "yyyy-MM-dd HH:mm:ss";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
*
* CSS选择器, 如 "#body"
*
* @return
* @return String
*/
public String cssQuery() default "";

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/com/xuxueli/crawler/loader/PageLoader.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public abstract class PageLoader {
* load page
*
* @param pageRequest
* @return
* @return Document
*/
public abstract Document load(PageRequest pageRequest);

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/com/xuxueli/crawler/model/RunConf.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public class RunConf {
* valid url, include white url
*
* @param link
* @return
* @return boolean
*/
public boolean validWhiteUrl(String link){
if (!UrlUtil.isUrl(link)) {
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/com/xuxueli/crawler/proxy/ProxyMaker.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public ProxyMaker clear() {
/**
* make proxy
*
* @return
* @return Proxy
*/
public abstract Proxy make();

Expand Down
6 changes: 3 additions & 3 deletions src/main/java/com/xuxueli/crawler/rundata/RunData.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,21 @@ public abstract class RunData {
* add link
*
* @param link
* @return
* @return boolean
*/
public abstract boolean addUrl(String link);

/**
* get link, remove from unVisitedUrlQueue and add to visitedUrlSet
*
* @return
* @return String
*/
public abstract String getUrl();

/**
* get url num
*
* @return
* @return int
*/
public abstract int getUrlNum();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public boolean addUrl(String link) {

/**
* url take
* @return
* @return String
* @throws InterruptedException
*/
@Override
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/com/xuxueli/crawler/thread/CrawlerThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ public void run() {
* make page request
*
* @param link
* @return
* @return PageRequest
*/
private PageRequest makePageRequest(String link){
String userAgent = crawler.getRunConf().getUserAgentList().size()>1
Expand Down Expand Up @@ -145,7 +145,7 @@ private PageRequest makePageRequest(String link){
/**
* process non page
* @param pageRequest
* @return
* @return boolean
*/
private boolean processNonPage(PageRequest pageRequest){
NonPageParser nonPageParser = (NonPageParser) crawler.getRunConf().getPageParser();
Expand All @@ -161,7 +161,7 @@ private boolean processNonPage(PageRequest pageRequest){
/**
* process page
* @param pageRequest
* @return
* @return boolean
*/
private boolean processPage(PageRequest pageRequest) throws IllegalAccessException, InstantiationException {
Document html = crawler.getRunConf().getPageLoader().load(pageRequest);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ public static Date parseDate(PageFieldSelect apiRequestParam, String value) {
*
* @param field
* @param value
* @return
* @return Object
*/
public static Object parseValue(Field field, String value) {

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/com/xuxueli/crawler/util/FileUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ public class FileUtil {
*
* @param url
* @param contentType
* @return
* @return String
*/
public static String getFileNameByUrl(String url, String contentType) {
url = url.replaceAll("[\\?/:*|<>\"]", "_");
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/com/xuxueli/crawler/util/IOUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ public class IOUtil {
* String 2 InputStream
*
* @param str
* @return
* @return InputStream
*/
public static InputStream toInputStream(String str, String encoding) {
try {
Expand All @@ -33,7 +33,7 @@ public static InputStream toInputStream(String str, String encoding) {
* InputStream 2 String
*
* @param inputStream
* @return
* @return String
* @throws IOException
*/
public static String toString(InputStream inputStream, String encoding){
Expand Down
8 changes: 4 additions & 4 deletions src/main/java/com/xuxueli/crawler/util/JsoupUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public class JsoupUtil {
*
* @param pageRequest
*
* @return
* @return Document
*/
public static Document load(PageRequest pageRequest) {
if (!UrlUtil.isUrl(pageRequest.getUrl())) {
Expand Down Expand Up @@ -125,7 +125,7 @@ public static String loadPageSource(PageRequest pageRequest) {
* @param fieldElement
* @param selectType
* @param selectVal
* @return
* @return String
*/
public static String parseElement(Element fieldElement, XxlCrawlerConf.SelectType selectType, String selectVal) {
String fieldElementOrigin = null;
Expand All @@ -149,7 +149,7 @@ public static String parseElement(Element fieldElement, XxlCrawlerConf.SelectTyp
* 获取页面上所有超链接地址 (<a>标签的href值)
*
* @param html 页面文档
* @return
* @return Set<String>
*/
public static Set<String> findLinks(Document html) {

Expand Down Expand Up @@ -185,7 +185,7 @@ public static Set<String> findLinks(Document html) {
* 获取页面上所有图片地址 (<img>标签的src值)
*
* @param html
* @return
* @return Set<String>
*/
public static Set<String> findImages(Document html) {

Expand Down
4 changes: 2 additions & 2 deletions src/main/java/com/xuxueli/crawler/util/ProxyIpUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ public class ProxyIpUtil {
*
* @param proxy
* @param validSite
* @return
* @return int
*/
public static int checkProxy(Proxy proxy, String validSite){
try {
Expand Down Expand Up @@ -55,7 +55,7 @@ public static int checkProxy(Proxy proxy, String validSite){
*
* @param proxy
* @param validSite
* @return
* @return int
*/
public static int checkProxyRepeat(Proxy proxy, String validSite){
for (int i = 0; i < 3; i++) {
Expand Down
Loading

0 comments on commit d1a7cc3

Please sign in to comment.