Skip to content

Commit

Permalink
fix #2856 Add crawl order configuration to control URL processing order
Browse files Browse the repository at this point in the history
  • Loading branch information
marevol committed Nov 3, 2024
1 parent 70ef1c5 commit 036ebd6
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Copyright 2012-2024 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.crawler.service;

import java.util.List;
import java.util.Map;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.fess.crawler.entity.EsUrlQueue;
import org.codelibs.fess.crawler.service.impl.EsUrlQueueService;
import org.codelibs.fess.crawler.util.EsCrawlerConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.helper.CrawlingConfigHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.opensearch.index.query.QueryBuilders;
import org.opensearch.index.query.functionscore.FunctionScoreQueryBuilder;
import org.opensearch.index.query.functionscore.RandomScoreFunctionBuilder;
import org.opensearch.search.sort.SortBuilders;
import org.opensearch.search.sort.SortOrder;

/**
 * URL queue service that selects the order in which queued URLs are fetched
 * based on the {@code crawl.order} config parameter of the crawling
 * configuration associated with the crawl session.
 *
 * <p>Recognised values are {@code "sequential"} (the default; oldest queue
 * entries first) and {@code "random"} (pseudo-random order seeded by the
 * session ID). Any other value is logged as a warning and treated as
 * sequential.</p>
 */
public class FessUrlQueueService extends EsUrlQueueService {
    private static final Logger logger = LogManager.getLogger(FessUrlQueueService.class);

    /** Crawl order value selecting fetch by ascending creation time. */
    private static final String ORDER_SEQUENTIAL = "sequential";

    /** Crawl order value selecting fetch by a seeded random score. */
    private static final String ORDER_RANDOM = "random";

    /**
     * Creates the service on top of the given crawler configuration.
     *
     * @param crawlerConfig crawler configuration handed to the parent service
     */
    public FessUrlQueueService(final EsCrawlerConfig crawlerConfig) {
        super(crawlerConfig);
    }

    /**
     * Fetches the next batch (at most {@code pollingFetchSize} entries) of URL
     * queue entries for the session, ordered according to the configured
     * crawl order.
     *
     * @param sessionId ID of the crawl session whose queue is read
     * @return the queue entries returned by the inherited {@code getList} call
     */
    @Override
    protected List<EsUrlQueue> fetchUrlQueueList(final String sessionId) {
        final String crawlOrder = resolveCrawlOrder(sessionId);
        if (ORDER_RANDOM.equals(crawlOrder)) {
            // Seeding with the session ID hash keeps the scoring seed constant
            // across polling calls of the same session.
            return getList(EsUrlQueue.class, sessionId,
                    QueryBuilders.functionScoreQuery(QueryBuilders.matchAllQuery(),
                            new FunctionScoreQueryBuilder.FilterFunctionBuilder[] { new FunctionScoreQueryBuilder.FilterFunctionBuilder(
                                    new RandomScoreFunctionBuilder().seed(sessionId.hashCode())) }),
                    0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.ASC));
        }
        if (!ORDER_SEQUENTIAL.equals(crawlOrder)) {
            logger.warn("Invalid crawl order specified: {}. Falling back to sequential.", crawlOrder);
        }
        return getList(EsUrlQueue.class, sessionId, null, 0, pollingFetchSize, SortBuilders.fieldSort(CREATE_TIME).order(SortOrder.ASC));
    }

    /**
     * Looks up the crawl order configured for the session.
     *
     * <p>Falls back to sequential when no crawling configuration or no config
     * parameter map is available for the session, so that a missing
     * configuration cannot break URL fetching with a
     * {@link NullPointerException}.</p>
     *
     * @param sessionId ID of the crawl session
     * @return the configured crawl order, or {@code "sequential"} when none is available
     */
    private String resolveCrawlOrder(final String sessionId) {
        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
        final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(sessionId);
        if (crawlingConfig == null) {
            logger.debug("No crawling config found for session {}. Using sequential crawl order.", sessionId);
            return ORDER_SEQUENTIAL;
        }
        final Map<String, String> configParams = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
        if (configParams == null) {
            return ORDER_SEQUENTIAL;
        }
        return configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, ORDER_SEQUENTIAL);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ public static class Config {
public static final String IGNORE_ROBOTS_TAGS = "ignore.robots.tags";
public static final String SCRIPT_TYPE = "script.type";
public static final String HTML_CHILD_URL_RULES = "html.child.url.rules";
public static final String CRAWL_ORDER = "crawl.order";
}

// meta.*
Expand Down
9 changes: 9 additions & 0 deletions src/main/resources/crawler_es+urlQueueService.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
"http://dbflute.org/meta/lastadi10.dtd">
<components namespace="fessCrawler">
<!--
  Registers FessUrlQueueService under the component name "urlQueueService",
  passing the "crawlerConfig" component as its constructor argument.
  NOTE(review): presumably this replaces the crawler library's default
  urlQueueService definition; confirm against the base crawler DI files.
-->
<component name="urlQueueService"
class="org.codelibs.fess.crawler.service.FessUrlQueueService">
<arg>crawlerConfig</arg>
</component>
</components>

0 comments on commit 036ebd6

Please sign in to comment.