Use of net.yacy.grid.crawler.Crawler.CrawlstartURLSplitter in the project yacy_grid_crawler by yacy.
Shown below: the method serviceImpl of the class CrawlStartService.
/**
 * Builds a crawl start from the default attributes and the request parameters and
 * sends one crawl start message per start URL to the crawler queue.
 *
 * <p>Every key of the object returned by
 * {@code CrawlerDefaultValuesService.crawlStartDefaultClone()} may be overridden by a
 * request parameter of the same name. The {@code crawlingURL} attribute is split into
 * individual URLs by {@code CrawlstartURLSplitter}; each URL gets its own crawl id and
 * its own message.
 *
 * @param call the request parameters; values override the defaults key by key
 * @param response the servlet response; not used by this method
 * @return a response wrapping a {@code SusiThought} whose data is the effective crawl
 *         start and whose actions are the crawl actions that were sent without an
 *         {@code IOException}. {@code ObjectAPIHandler.SUCCESS_KEY} is {@code true}
 *         when at least one action was sent; {@code ObjectAPIHandler.COMMENT_KEY}
 *         holds the message of the most recent {@code IOException}, if any occurred.
 */
@Override
public ServiceResponse serviceImpl(Query call, HttpServletResponse response) {
    // Start from the default attributes and overlay the values supplied in the request.
    // Only existing keys are re-put, so the key set is not structurally changed while iterating.
    JSONObject crawlstart = CrawlerDefaultValuesService.crawlStartDefaultClone();
    for (String key : crawlstart.keySet()) {
        Object object = crawlstart.get(key);
        if (object instanceof String) {
            crawlstart.put(key, call.get(key, crawlstart.getString(key)));
        } else if (object instanceof Integer) {
            crawlstart.put(key, call.get(key, crawlstart.getInt(key)));
        } else if (object instanceof Long) {
            crawlstart.put(key, call.get(key, crawlstart.getLong(key)));
        } else if (object instanceof JSONArray) {
            // Array defaults are replaced only when the request carries a value for the key.
            Object cv = call.get(key);
            if (cv != null) {
                crawlstart.put(key, cv);
            }
        } else {
            // Same logger as the error path below, instead of writing to stdout.
            Data.logger.warn("unrecognized type: " + object.getClass().toString());
        }
    }

    // split the crawlingURL attribute into the individual start URLs
    CrawlstartURLSplitter crawlstartURLs = new CrawlstartURLSplitter(crawlstart.getString("crawlingURL"));
    Date now = new Date();

    // start the crawls; each of the url in a separate crawl to enforce parallel loading from different hosts
    SusiThought allCrawlstarts = new SusiThought();
    int count = 0;
    for (MultiProtocolURL url : crawlstartURLs.getURLs()) {
        // create a shallow copy of crawlstart and give it its own crawl id
        JSONObject singlecrawl = new JSONObject();
        for (String key : crawlstart.keySet()) {
            singlecrawl.put(key, crawlstart.get(key));
        }
        singlecrawl.put("id", Crawler.getCrawlID(url, now, count++));

        try {
            GridQueue queueName = Data.gridBroker.queueName(
                    YaCyServices.crawler,
                    YaCyServices.crawler.getQueues(),
                    ShardingMethod.BALANCE,
                    Crawler.CRAWLER_PRIORITY_DIMENSIONS,
                    singlecrawl.getInt("priority"),
                    url.getHost());

            SusiThought json = new SusiThought();
            json.setData(new JSONArray().put(singlecrawl));

            JSONObject action = new JSONObject()
                    .put("type", YaCyServices.crawler.name())
                    .put("queue", queueName.name())
                    .put("id", singlecrawl.getString("id"))
                    .put("depth", 0)
                    .put("sourcegraph", "rootasset");
            SusiAction crawlAction = new SusiAction(action);

            // the root asset holds a single graph entry with the normalized start URL
            JSONObject graph = new JSONObject(true).put(WebMapping.canonical_s.getMapping().name(), url.toNormalform(true));
            crawlAction.setJSONListAsset("rootasset", new JSONList().add(graph));
            json.addAction(crawlAction);

            byte[] b = json.toString().getBytes(StandardCharsets.UTF_8);
            Data.gridBroker.send(YaCyServices.crawler, queueName, b);

            // Record the action only after send() returned without an exception, so that a
            // failed send is neither listed in the response nor counted as a success.
            allCrawlstarts.addAction(crawlAction);
        } catch (IOException e) {
            Data.logger.warn("error when starting crawl for " + url.toNormalform(true), e);
            // note: a later failure overwrites the comment of an earlier one
            allCrawlstarts.put(ObjectAPIHandler.COMMENT_KEY, e.getMessage());
        }
    }

    // construct the crawl start response message
    allCrawlstarts.setData(new JSONArray().put(crawlstart));
    allCrawlstarts.put(ObjectAPIHandler.SUCCESS_KEY, allCrawlstarts.getActions().size() > 0);

    // finally hand the collected crawl starts back to the caller
    return new ServiceResponse(allCrawlstarts);
}
Aggregations