diff --git a/pom.xml b/pom.xml
index cf44a00..1cc2ede 100644
--- a/pom.xml
+++ b/pom.xml
@@ -19,6 +19,8 @@
 		1.7
 		1.7
+		true
+		1.11.2
 		2.24
 		2.53.1
diff --git a/src/main/java/com/xuxueli/crawler/XxlCrawler.java b/src/main/java/com/xuxueli/crawler/XxlCrawler.java
index 9c134ab..0add917 100644
--- a/src/main/java/com/xuxueli/crawler/XxlCrawler.java
+++ b/src/main/java/com/xuxueli/crawler/XxlCrawler.java
@@ -55,7 +55,7 @@ public static class Builder {
          * 设置运行数据类型
          *
          * @param runData
-         * @return
+         * @return Builder
          */
         public Builder setRunData(RunData runData){
             crawler.runData = runData;
@@ -66,7 +66,7 @@ public Builder setRunData(RunData runData){
          * 待爬的URL列表
          *
          * @param urls
-         * @return
+         * @return Builder
          */
         public Builder setUrls(String... urls) {
             if (urls!=null && urls.length>0) {
@@ -82,7 +82,7 @@ public Builder setUrls(String... urls) {
          * 允许扩散爬取,将会以现有URL为起点扩散爬取整站
          *
          * @param allowSpread
-         * @return
+         * @return Builder
          */
         public Builder setAllowSpread(boolean allowSpread) {
             crawler.runConf.setAllowSpread(allowSpread);
@@ -93,7 +93,7 @@ public Builder setAllowSpread(boolean allowSpread) {
          * URL白名单正则,非空时进行URL白名单过滤页面
          *
          * @param whiteUrlRegexs
-         * @return
+         * @return Builder
          */
         public Builder setWhiteUrlRegexs(String... whiteUrlRegexs) {
             if (whiteUrlRegexs!=null && whiteUrlRegexs.length>0) {
@@ -108,7 +108,7 @@ public Builder setWhiteUrlRegexs(String... whiteUrlRegexs) {
          * 页面解析器
          *
          * @param pageParser
-         * @return
+         * @return Builder
          */
         public Builder setPageParser(PageParser pageParser){
             crawler.runConf.setPageParser(pageParser);
@@ -119,7 +119,7 @@ public Builder setPageParser(PageParser pageParser){
          * 页面下载器
          *
          * @param pageLoader
-         * @return
+         * @return Builder
          */
         public Builder setPageLoader(PageLoader pageLoader){
             crawler.runConf.setPageLoader(pageLoader);
@@ -131,7 +131,7 @@ public Builder setPageLoader(PageLoader pageLoader){
          * 请求参数
          *
          * @param paramMap
-         * @return
+         * @return Builder
          */
         public Builder setParamMap(Map paramMap){
             crawler.runConf.setParamMap(paramMap);
@@ -142,7 +142,7 @@ public Builder setParamMap(Map paramMap){
          * 请求Cookie
          *
          * @param cookieMap
-         * @return
+         * @return Builder
          */
         public Builder setCookieMap(Map cookieMap){
             crawler.runConf.setCookieMap(cookieMap);
@@ -153,7 +153,7 @@ public Builder setCookieMap(Map cookieMap){
          * 请求Header
          *
          * @param headerMap
-         * @return
+         * @return Builder
          */
         public Builder setHeaderMap(Map headerMap){
             crawler.runConf.setHeaderMap(headerMap);
@@ -164,7 +164,7 @@ public Builder setHeaderMap(Map headerMap){
          * 请求UserAgent
          *
          * @param userAgents
-         * @return
+         * @return Builder
          */
         public Builder setUserAgent(String... userAgents){
             if (userAgents!=null && userAgents.length>0) {
@@ -181,7 +181,7 @@ public Builder setUserAgent(String... userAgents){
          * 请求Referrer
          *
          * @param referrer
-         * @return
+         * @return Builder
          */
         public Builder setReferrer(String referrer){
             crawler.runConf.setReferrer(referrer);
@@ -192,7 +192,7 @@ public Builder setReferrer(String referrer){
          * 请求方式:true=POST请求、false=GET请求
          *
          * @param ifPost
-         * @return
+         * @return Builder
          */
         public Builder setIfPost(boolean ifPost){
             crawler.runConf.setIfPost(ifPost);
@@ -203,7 +203,7 @@ public Builder setIfPost(boolean ifPost){
          * 超时时间,毫秒
          *
          * @param timeoutMillis
-         * @return
+         * @return Builder
          */
         public Builder setTimeoutMillis(int timeoutMillis){
             crawler.runConf.setTimeoutMillis(timeoutMillis);
@@ -214,7 +214,7 @@ public Builder setTimeoutMillis(int timeoutMillis){
          * 停顿时间,爬虫线程处理完页面之后进行主动停顿,避免过于频繁被拦截;
          *
          * @param pauseMillis
-         * @return
+         * @return Builder
          */
         public Builder setPauseMillis(int pauseMillis){
             crawler.runConf.setPauseMillis(pauseMillis);
@@ -225,7 +225,7 @@ public Builder setPauseMillis(int pauseMillis){
          * 代理生成器
          *
          * @param proxyMaker
-         * @return
+         * @return Builder
          */
         public Builder setProxyMaker(ProxyMaker proxyMaker){
             crawler.runConf.setProxyMaker(proxyMaker);
@@ -236,7 +236,7 @@ public Builder setProxyMaker(ProxyMaker proxyMaker){
          * 失败重试次数,大于零时生效
          *
          * @param failRetryCount
-         * @return
+         * @return Builder
         */
         public Builder setFailRetryCount(int failRetryCount){
             if (failRetryCount > 0) {
@@ -250,7 +250,7 @@ public Builder setFailRetryCount(int failRetryCount){
          * 爬虫并发线程数
          *
          * @param threadCount
-         * @return
+         * @return Builder
          */
         public Builder setThreadCount(int threadCount) {
             crawler.threadCount = threadCount;
diff --git a/src/main/java/com/xuxueli/crawler/annotation/PageFieldSelect.java b/src/main/java/com/xuxueli/crawler/annotation/PageFieldSelect.java
index b2db9d5..bf26702 100644
--- a/src/main/java/com/xuxueli/crawler/annotation/PageFieldSelect.java
+++ b/src/main/java/com/xuxueli/crawler/annotation/PageFieldSelect.java
@@ -21,7 +21,7 @@
      *
      * CSS选择器, 如 "#title"
      *
-     * @return
+     * @return String
      */
     public String cssQuery() default "";
@@ -32,7 +32,7 @@
      *
      * @see com.xuxueli.crawler.conf.XxlCrawlerConf.SelectType
      *
-     * @return
+     * @return SelectType
      */
     public XxlCrawlerConf.SelectType selectType() default XxlCrawlerConf.SelectType.TEXT;
@@ -41,7 +41,7 @@
      *
      * jquery 数据抽取参数,SelectType=ATTR/HAS_CLASS 时有效,如 ".attr("abs:src")"
      *
-     * @return
+     * @return String
      */
     public String selectVal() default "";
@@ -50,7 +50,7 @@
      *
      * 时间格式化,日期类型数据有效
      *
-     * @return
+     * @return String
      */
     String datePattern() default "yyyy-MM-dd HH:mm:ss";
diff --git a/src/main/java/com/xuxueli/crawler/annotation/PageSelect.java b/src/main/java/com/xuxueli/crawler/annotation/PageSelect.java
index bf1a135..4308989 100644
--- a/src/main/java/com/xuxueli/crawler/annotation/PageSelect.java
+++ b/src/main/java/com/xuxueli/crawler/annotation/PageSelect.java
@@ -19,7 +19,7 @@
      *
      * CSS选择器, 如 "#body"
      *
-     * @return
+     * @return String
      */
     public String cssQuery() default "";
diff --git a/src/main/java/com/xuxueli/crawler/loader/PageLoader.java b/src/main/java/com/xuxueli/crawler/loader/PageLoader.java
index c7fcebf..a55c497 100644
--- a/src/main/java/com/xuxueli/crawler/loader/PageLoader.java
+++ b/src/main/java/com/xuxueli/crawler/loader/PageLoader.java
@@ -14,7 +14,7 @@ public abstract class PageLoader {
      * load page
      *
      * @param pageRequest
-     * @return
+     * @return Document
      */
     public abstract Document load(PageRequest pageRequest);
diff --git a/src/main/java/com/xuxueli/crawler/model/RunConf.java b/src/main/java/com/xuxueli/crawler/model/RunConf.java
index 24250ff..887c8fc 100644
--- a/src/main/java/com/xuxueli/crawler/model/RunConf.java
+++ b/src/main/java/com/xuxueli/crawler/model/RunConf.java
@@ -39,7 +39,7 @@ public class RunConf {
      * valid url, include white url
      *
      * @param link
-     * @return
+     * @return boolean
      */
     public boolean validWhiteUrl(String link){
         if (!UrlUtil.isUrl(link)) {
diff --git a/src/main/java/com/xuxueli/crawler/proxy/ProxyMaker.java b/src/main/java/com/xuxueli/crawler/proxy/ProxyMaker.java
index 5a9ab5b..8270d2f 100644
--- a/src/main/java/com/xuxueli/crawler/proxy/ProxyMaker.java
+++ b/src/main/java/com/xuxueli/crawler/proxy/ProxyMaker.java
@@ -31,7 +31,7 @@ public ProxyMaker clear() {
     /**
      * make proxy
      *
-     * @return
+     * @return Proxy
      */
     public abstract Proxy make();
diff --git a/src/main/java/com/xuxueli/crawler/rundata/RunData.java b/src/main/java/com/xuxueli/crawler/rundata/RunData.java
index ea14681..11cb2d9 100644
--- a/src/main/java/com/xuxueli/crawler/rundata/RunData.java
+++ b/src/main/java/com/xuxueli/crawler/rundata/RunData.java
@@ -11,21 +11,21 @@ public abstract class RunData {
      * add link
      *
      * @param link
-     * @return
+     * @return boolean
      */
     public abstract boolean addUrl(String link);
 
     /**
      * get link, remove from unVisitedUrlQueue and add to visitedUrlSet
      *
-     * @return
+     * @return String
      */
     public abstract String getUrl();
 
     /**
      * get url num
      *
-     * @return
+     * @return int
      */
     public abstract int getUrlNum();
diff --git a/src/main/java/com/xuxueli/crawler/rundata/strategy/LocalRunData.java b/src/main/java/com/xuxueli/crawler/rundata/strategy/LocalRunData.java
index 16cb843..ecc8e9a 100644
--- a/src/main/java/com/xuxueli/crawler/rundata/strategy/LocalRunData.java
+++ b/src/main/java/com/xuxueli/crawler/rundata/strategy/LocalRunData.java
@@ -49,7 +49,7 @@ public boolean addUrl(String link) {
     /**
      * url take
-     * @return
+     * @return String
      * @throws InterruptedException
      */
     @Override
diff --git a/src/main/java/com/xuxueli/crawler/thread/CrawlerThread.java b/src/main/java/com/xuxueli/crawler/thread/CrawlerThread.java
index c7b19e8..4a28667 100644
--- a/src/main/java/com/xuxueli/crawler/thread/CrawlerThread.java
+++ b/src/main/java/com/xuxueli/crawler/thread/CrawlerThread.java
@@ -116,7 +116,7 @@ public void run() {
      * make page request
      *
      * @param link
-     * @return
+     * @return PageRequest
      */
     private PageRequest makePageRequest(String link){
         String userAgent = crawler.getRunConf().getUserAgentList().size()>1
@@ -145,7 +145,7 @@ private PageRequest makePageRequest(String link){
     /**
      * process non page
      * @param pageRequest
-     * @return
+     * @return boolean
      */
     private boolean processNonPage(PageRequest pageRequest){
         NonPageParser nonPageParser = (NonPageParser) crawler.getRunConf().getPageParser();
@@ -161,7 +161,7 @@ private boolean processNonPage(PageRequest pageRequest){
     /**
      * process page
      * @param pageRequest
-     * @return
+     * @return boolean
      */
     private boolean processPage(PageRequest pageRequest) throws IllegalAccessException, InstantiationException {
         Document html = crawler.getRunConf().getPageLoader().load(pageRequest);
diff --git a/src/main/java/com/xuxueli/crawler/util/FieldReflectionUtil.java b/src/main/java/com/xuxueli/crawler/util/FieldReflectionUtil.java
index 468ddea..0c87cd8 100644
--- a/src/main/java/com/xuxueli/crawler/util/FieldReflectionUtil.java
+++ b/src/main/java/com/xuxueli/crawler/util/FieldReflectionUtil.java
@@ -101,7 +101,7 @@ public static Date parseDate(PageFieldSelect apiRequestParam, String value) {
      *
      * @param field
      * @param value
-     * @return
+     * @return Object
      */
     public static Object parseValue(Field field, String value) {
diff --git a/src/main/java/com/xuxueli/crawler/util/FileUtil.java b/src/main/java/com/xuxueli/crawler/util/FileUtil.java
index 4dd3478..72be170 100644
--- a/src/main/java/com/xuxueli/crawler/util/FileUtil.java
+++ b/src/main/java/com/xuxueli/crawler/util/FileUtil.java
@@ -22,7 +22,7 @@ public class FileUtil {
      *
      * @param url
      * @param contentType
-     * @return
+     * @return String
      */
     public static String getFileNameByUrl(String url, String contentType) {
         url = url.replaceAll("[\\?/:*|<>\"]", "_");
diff --git a/src/main/java/com/xuxueli/crawler/util/IOUtil.java b/src/main/java/com/xuxueli/crawler/util/IOUtil.java
index a49b608..6de3796 100644
--- a/src/main/java/com/xuxueli/crawler/util/IOUtil.java
+++ b/src/main/java/com/xuxueli/crawler/util/IOUtil.java
@@ -17,7 +17,7 @@ public class IOUtil {
      * String 2 InputStream
      *
      * @param str
-     * @return
+     * @return InputStream
      */
     public static InputStream toInputStream(String str, String encoding) {
         try {
@@ -33,7 +33,7 @@ public static InputStream toInputStream(String str, String encoding) {
      * InputStream 2 String
      *
      * @param inputStream
-     * @return
+     * @return String
      * @throws IOException
      */
     public static String toString(InputStream inputStream, String encoding){
diff --git a/src/main/java/com/xuxueli/crawler/util/JsoupUtil.java b/src/main/java/com/xuxueli/crawler/util/JsoupUtil.java
index a7767ea..94a1dbe 100644
--- a/src/main/java/com/xuxueli/crawler/util/JsoupUtil.java
+++ b/src/main/java/com/xuxueli/crawler/util/JsoupUtil.java
@@ -27,7 +27,7 @@ public class JsoupUtil {
      *
      * @param pageRequest
      *
-     * @return
+     * @return Document
      */
     public static Document load(PageRequest pageRequest) {
         if (!UrlUtil.isUrl(pageRequest.getUrl())) {
@@ -125,7 +125,7 @@ public static String loadPageSource(PageRequest pageRequest) {
      * @param fieldElement
      * @param selectType
      * @param selectVal
-     * @return
+     * @return String
      */
     public static String parseElement(Element fieldElement, XxlCrawlerConf.SelectType selectType, String selectVal) {
         String fieldElementOrigin = null;
@@ -149,7 +149,7 @@ public static String parseElement(Element fieldElement, XxlCrawlerConf.SelectTyp
      * 获取页面上所有超链接地址 (标签的href值)
      *
      * @param html 页面文档
-     * @return
+     * @return Set
      */
     public static Set findLinks(Document html) {
@@ -185,7 +185,7 @@ public static Set findLinks(Document html) {
      * 获取页面上所有图片地址 (标签的href值)
      *
      * @param html
-     * @return
+     * @return Set
      */
     public static Set findImages(Document html) {
diff --git a/src/main/java/com/xuxueli/crawler/util/ProxyIpUtil.java b/src/main/java/com/xuxueli/crawler/util/ProxyIpUtil.java
index d925050..52331d8 100644
--- a/src/main/java/com/xuxueli/crawler/util/ProxyIpUtil.java
+++ b/src/main/java/com/xuxueli/crawler/util/ProxyIpUtil.java
@@ -22,7 +22,7 @@ public class ProxyIpUtil {
      *
      * @param proxy
      * @param validSite
-     * @return
+     * @return int
      */
     public static int checkProxy(Proxy proxy, String validSite){
         try {
@@ -55,7 +55,7 @@ public static int checkProxy(Proxy proxy, String validSite){
      *
      * @param proxy
      * @param validSite
-     * @return
+     * @return int
      */
     public static int checkProxyRepeat(Proxy proxy, String validSite){
         for (int i = 0; i < 3; i++) {
diff --git a/src/main/java/com/xuxueli/crawler/util/RegexUtil.java b/src/main/java/com/xuxueli/crawler/util/RegexUtil.java
index 3f550c4..755b404 100644
--- a/src/main/java/com/xuxueli/crawler/util/RegexUtil.java
+++ b/src/main/java/com/xuxueli/crawler/util/RegexUtil.java
@@ -13,7 +13,7 @@ public class RegexUtil {
      * 正则匹配
      * @param regex : 正则表达式
      * @param str : 待匹配字符串
-     * @return
+     * @return boolean
      */
     public static boolean matches(String regex, String str) {
         Pattern pattern = Pattern.compile(regex);
@@ -27,7 +27,7 @@ public static boolean matches(String regex, String str) {
      * url格式校验
      *
      * @param str
-     * @return
+     * @return boolean
      */
     public static boolean isUrl(String str) {
         if (str==null || str.trim().length()==0) {
diff --git a/src/test/java/com/xuxueli/crawler/test/XxlCrawlerTest06.java b/src/test/java/com/xuxueli/crawler/test/XxlCrawlerTest06.java
index 3cda1b7..7468883 100644
--- a/src/test/java/com/xuxueli/crawler/test/XxlCrawlerTest06.java
+++ b/src/test/java/com/xuxueli/crawler/test/XxlCrawlerTest06.java
@@ -30,7 +30,6 @@ public static void main(String[] args) {
              * 新增一个待采集的URL,接口需要做URL去重,爬虫线程将会获取到并进行处理;
              *
              * @param link
-             * @return
              */
             @Override
             public boolean addUrl(String link) {
@@ -46,8 +45,6 @@ public boolean addUrl(String link) {
             /**
              * 获取一个待采集的URL,并且将它从"待采集URL池"中移除,并且添加到"已采集URL池"中;
-             *
-             * @return
              */
             @Override
             public String getUrl() {
@@ -65,8 +62,6 @@ public String getUrl() {
             /**
              * 获取待采集URL数量;
-             *
-             * @return
              */
             @Override
             public int getUrlNum() {
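
Note (not part of the patch): the Builder setters whose Javadoc is corrected above are normally chained. Below is a minimal usage sketch based only on method names visible in this diff; build() and start(true) are assumed from the project's typical usage and are not introduced by this change.

    // Minimal sketch, assuming the usual XxlCrawler entry points build()/start(true).
    // The seed URL and regex are illustrative; a real crawl must also supply a PageParser
    // via setPageParser(..), omitted here because its parse(..) signature is outside this diff.
    import com.xuxueli.crawler.XxlCrawler;

    public class BuilderUsageSketch {
        public static void main(String[] args) {
            XxlCrawler crawler = new XxlCrawler.Builder()
                    .setUrls("https://example.com/")                // 待爬的URL列表 (illustrative seed URL)
                    .setAllowSpread(true)                           // 允许扩散爬取
                    .setWhiteUrlRegexs("https://example\\.com/.*")  // URL白名单正则
                    .setThreadCount(3)                              // 爬虫并发线程数
                    .setTimeoutMillis(5000)                         // 超时时间,毫秒
                    .setPauseMillis(1000)                           // 停顿时间
                    .setFailRetryCount(2)                           // 失败重试次数
                    .build();                                       // assumed Builder terminal method
            crawler.start(true);                                    // assumed synchronous start
        }
    }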