Search in sources :

Example 1 with CrawlstartURLSplitter

use of net.yacy.grid.crawler.Crawler.CrawlstartURLSplitter in project yacy_grid_crawler by yacy.

The method serviceImpl of the class CrawlStartService.

/**
 * Starts one crawl per start URL and reports which crawl actions were queued.
 *
 * <p>The crawl start defaults are cloned and each default key is overridden with the value
 * found under the same key in {@code call}, if present. The "crawlingURL" attribute is then
 * split into single URLs; for each URL a copy of the crawl start with its own "id" is
 * serialized and sent to a crawler queue chosen by the grid broker.
 *
 * @param call the request query that may carry overrides for the crawl start attributes
 * @param response the servlet response; not used by this method
 * @return a response wrapping a {@code SusiThought} whose data is the effective crawl start,
 *         whose actions are the crawl actions whose broker send completed without an
 *         {@code IOException}, and whose success flag is true if there is at least one
 *         such action
 */
@Override
public ServiceResponse serviceImpl(Query call, HttpServletResponse response) {
    JSONObject crawlstart = CrawlerDefaultValuesService.crawlStartDefaultClone();
    // overwrite every default with the caller-supplied value, keeping the type of the default
    for (String key : crawlstart.keySet()) {
        Object object = crawlstart.get(key);
        if (object instanceof String) {
            crawlstart.put(key, call.get(key, crawlstart.getString(key)));
        } else if (object instanceof Integer) {
            crawlstart.put(key, call.get(key, crawlstart.getInt(key)));
        } else if (object instanceof Long) {
            crawlstart.put(key, call.get(key, crawlstart.getLong(key)));
        } else if (object instanceof JSONArray) {
            // arrays have no typed default lookup: replace only if the caller supplied a value
            Object cv = call.get(key);
            if (cv != null) {
                crawlstart.put(key, cv);
            }
        } else {
            System.out.println("unrecognized type: " + object.getClass().toString());
        }
    }

    // split the start URL attribute into the single URLs to be crawled
    CrawlstartURLSplitter crawlstartURLs = new CrawlstartURLSplitter(crawlstart.getString("crawlingURL"));
    Date now = new Date();

    // start the crawls; each of the url in a separate crawl to enforce parallel loading from different hosts
    SusiThought allCrawlstarts = new SusiThought();
    int count = 0;
    for (MultiProtocolURL url : crawlstartURLs.getURLs()) {
        // create a (shallow) clone of crawlstart and give it its own crawl id
        JSONObject singlecrawl = new JSONObject();
        for (String key : crawlstart.keySet()) {
            singlecrawl.put(key, crawlstart.get(key));
        }
        singlecrawl.put("id", Crawler.getCrawlID(url, now, count++));

        try {
            GridQueue queueName = Data.gridBroker.queueName(YaCyServices.crawler, YaCyServices.crawler.getQueues(), ShardingMethod.BALANCE, Crawler.CRAWLER_PRIORITY_DIMENSIONS, singlecrawl.getInt("priority"), url.getHost());
            SusiThought json = new SusiThought();
            json.setData(new JSONArray().put(singlecrawl));
            JSONObject action = new JSONObject().put("type", YaCyServices.crawler.name()).put("queue", queueName.name()).put("id", singlecrawl.getString("id")).put("depth", 0).put("sourcegraph", "rootasset");
            SusiAction crawlAction = new SusiAction(action);
            JSONObject graph = new JSONObject(true).put(WebMapping.canonical_s.getMapping().name(), url.toNormalform(true));
            crawlAction.setJSONListAsset("rootasset", new JSONList().add(graph));
            json.addAction(crawlAction);
            byte[] b = json.toString().getBytes(StandardCharsets.UTF_8);
            Data.gridBroker.send(YaCyServices.crawler, queueName, b);
            // record the action only after the send returned without an exception, so that
            // a failed send is not counted as a started crawl in the success flag below
            allCrawlstarts.addAction(crawlAction);
        } catch (IOException e) {
            Data.logger.warn("error when starting crawl for " + url.toNormalform(true), e);
            allCrawlstarts.put(ObjectAPIHandler.COMMENT_KEY, e.getMessage());
        }
    }

    // construct a crawl start message
    allCrawlstarts.setData(new JSONArray().put(crawlstart));
    allCrawlstarts.put(ObjectAPIHandler.SUCCESS_KEY, allCrawlstarts.getActions().size() > 0);
    // hand the summary of all crawl starts back to the caller
    return new ServiceResponse(allCrawlstarts);
}
Also used : SusiThought(ai.susi.mind.SusiThought) JSONArray(org.json.JSONArray) IOException(java.io.IOException) Date(java.util.Date) SusiAction(ai.susi.mind.SusiAction) ServiceResponse(net.yacy.grid.http.ServiceResponse) JSONObject(org.json.JSONObject) GridQueue(net.yacy.grid.io.messages.GridQueue) MultiProtocolURL(net.yacy.grid.tools.MultiProtocolURL) CrawlstartURLSplitter(net.yacy.grid.crawler.Crawler.CrawlstartURLSplitter) JSONList(net.yacy.grid.tools.JSONList)

Aggregations

SusiAction (ai.susi.mind.SusiAction)1 SusiThought (ai.susi.mind.SusiThought)1 IOException (java.io.IOException)1 Date (java.util.Date)1 CrawlstartURLSplitter (net.yacy.grid.crawler.Crawler.CrawlstartURLSplitter)1 ServiceResponse (net.yacy.grid.http.ServiceResponse)1 GridQueue (net.yacy.grid.io.messages.GridQueue)1 JSONList (net.yacy.grid.tools.JSONList)1 MultiProtocolURL (net.yacy.grid.tools.MultiProtocolURL)1 JSONArray (org.json.JSONArray)1 JSONObject (org.json.JSONObject)1