Search in sources :

Example 1 with MultiProtocolURL

use of net.yacy.grid.tools.MultiProtocolURL in project yacy_grid_loader by yacy.

The following code is from the class ContentLoader, method loadHTTP.

private static void loadHTTP(final WarcWriter warcWriter, final String url, final String threadName, final boolean useHeadlessLoader) throws IOException {
    // check short memory status
    if (Memory.shortStatus()) {
        ApacheHttpClient.initClient(userAgentDefault);
    }
    Date loaddate = new Date();
    // first do a HEAD request to find the mime type
    ApacheHttpClient ac = new ApacheHttpClient(url, true);
    // here we know the content type
    byte[] content = null;
    MultiProtocolURL u = new MultiProtocolURL(url);
    if (useHeadlessLoader && (ac.getMime().endsWith("/html") || ac.getMime().endsWith("/xhtml+xml") || u.getContentDomainFromExt() == ContentDomain.TEXT))
        try {
            // use htmlunit to load this
            HtmlUnitLoader htmlUnitLoader = new HtmlUnitLoader(url, threadName);
            String xml = htmlUnitLoader.getXml();
            content = xml.getBytes(StandardCharsets.UTF_8);
        } catch (Throwable e) {
            // do nothing here, input stream is not set
            String cause = e == null ? "null" : e.getMessage();
            if (cause != null && cause.indexOf("404") >= 0) {
                throw new IOException("" + url + " fail: " + cause);
            }
            Data.logger.debug("Loader - HtmlUnit failed (will retry): " + cause);
        }
    if (content == null) {
        // do another http request. This can either happen because mime type is not html
        // or it was html and HtmlUnit has failed - we retry the normal way here.
        ac = new ApacheHttpClient(url, false);
        content = ac.getContent();
    }
    JwatWarcWriter.writeRequest(warcWriter, url, null, loaddate, null, null, ac.getRequestHeader().getBytes(StandardCharsets.UTF_8));
    // add the request header before the content
    ByteArrayOutputStream r = new ByteArrayOutputStream();
    r.write(ac.getResponseHeader().toString().getBytes(StandardCharsets.UTF_8));
    r.write(content);
    content = r.toByteArray();
    Data.logger.info("ContentLoader writing WARC for " + url + " - " + content.length + " bytes");
    JwatWarcWriter.writeResponse(warcWriter, url, null, loaddate, null, null, content);
}
Also used : MultiProtocolURL(net.yacy.grid.tools.MultiProtocolURL) IOException(java.io.IOException) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Date(java.util.Date)

Example 2 with MultiProtocolURL

use of net.yacy.grid.tools.MultiProtocolURL in project yacy_grid_crawler by yacy.

The following code is from the class Crawler, method newLoaderAction.

/**
 * Create a new loader action. This action contains all follow-up actions after
 * loading to create a steering of parser, indexing and follow-up crawler actions.
 * @param id the crawl id
 * @param urls the urls which are part of the same actions
 * @param depth the depth of the crawl step (0 is start depth)
 * @param retry the number of load re-tries (0 is no retry, shows that this is the first attempt)
 * @param timestamp the current time when the crawler created the action
 * @param partition unique number of the url set partition. This is used to create asset names.
 * @param doCrawling flag: if true, create a follow-up crawling action. set this to false to terminate crawling afterwards
 * @param doIndexing flag: if true, do an indexing after loading. set this to false if the purpose is only a follow-up crawl after parsing
 * @return the action json
 * @throws IOException
 */
public static JSONObject newLoaderAction(int priority, String id, JSONArray urls, int depth, int retry, long timestamp, int partition, boolean doCrawling, boolean doIndexing) throws IOException {
    // create file names for the assets: this uses depth and partition information
    // we must create this here to prevent concurrency bugs which are there in the date formatter :((
    SimpleDateFormat FORMAT_TIMEF = new SimpleDateFormat(PATTERN_TIMEF, Locale.US);
    String namestub = id + "/d" + intf(depth) + "-t" + FORMAT_TIMEF.format(new Date(timestamp)) + "-p" + intf(partition);
    String warcasset = namestub + ".warc.gz";
    String webasset = namestub + ".web.jsonlist";
    String graphasset = namestub + ".graph.jsonlist";
    String hashKey = new MultiProtocolURL(urls.getString(0)).getHost();
    // create actions to be done in reverse order:
    // at the end of the processing we simultaneously place actions on the indexing and crawling queue
    JSONArray postParserActions = new JSONArray();
    // one or both must be true; doing none of that does not make sense
    assert doIndexing || doCrawling;
    // if all of the urls shall be indexed (see indexing patterns) then do indexing actions
    if (doIndexing) {
        GridQueue indexerQueueName = Data.gridBroker.queueName(YaCyServices.indexer, YaCyServices.indexer.getQueues(), ShardingMethod.BALANCE, INDEXER_PRIORITY_DIMENSIONS, priority, hashKey);
        postParserActions.put(new JSONObject(true).put("type", YaCyServices.indexer.name()).put("queue", indexerQueueName.name()).put("id", id).put("sourceasset", webasset));
    }
    // if all of the urls shall be crawled at depth + 1, add a crawling action. Don't do this only if the crawling depth is at the depth limit.
    if (doCrawling) {
        GridQueue crawlerQueueName = Data.gridBroker.queueName(YaCyServices.crawler, YaCyServices.crawler.getQueues(), ShardingMethod.BALANCE, CRAWLER_PRIORITY_DIMENSIONS, priority, hashKey);
        postParserActions.put(new JSONObject(true).put("type", YaCyServices.crawler.name()).put("queue", crawlerQueueName.name()).put("id", id).put("depth", depth + 1).put("sourcegraph", graphasset));
    }
    // bevor that and after loading we have a parsing action
    GridQueue parserQueueName = Data.gridBroker.queueName(YaCyServices.parser, YaCyServices.parser.getQueues(), ShardingMethod.BALANCE, PARSER_PRIORITY_DIMENSIONS, priority, hashKey);
    JSONArray parserActions = new JSONArray().put(new JSONObject(true).put("type", YaCyServices.parser.name()).put("queue", parserQueueName.name()).put("id", id).put("sourceasset", warcasset).put("targetasset", webasset).put("targetgraph", graphasset).put("actions", // actions after parsing
    postParserActions));
    // at the beginning of the process, we do a loading.
    GridQueue loaderQueueName = Data.gridBroker.queueName(YaCyServices.loader, YaCyServices.loader.getQueues(), ShardingMethod.BALANCE, LOADER_PRIORITY_DIMENSIONS, priority, hashKey);
    JSONObject loaderAction = new JSONObject(true).put("type", YaCyServices.loader.name()).put("queue", loaderQueueName.name()).put("id", id).put("urls", urls).put("targetasset", warcasset).put("actions", // actions after loading
    parserActions);
    return loaderAction;
}
Also used : GridQueue(net.yacy.grid.io.messages.GridQueue) JSONObject(org.json.JSONObject) MultiProtocolURL(net.yacy.grid.tools.MultiProtocolURL) JSONArray(org.json.JSONArray) SimpleDateFormat(java.text.SimpleDateFormat) Date(java.util.Date)

Example 3 with MultiProtocolURL

use of net.yacy.grid.tools.MultiProtocolURL in project yacy_grid_mcp by yacy.

The following code is from the class GridStorage, method connectFTP.

public boolean connectFTP(String url) {
    try {
        MultiProtocolURL u = new MultiProtocolURL(url);
        StorageFactory<byte[]> ftp = new FTPStorageFactory(u.getHost(), u.getPort(), u.getUser(), u.getPassword(), this.deleteafterread);
        // test the connection
        ftp.getStorage().checkConnection();
        this.ftp = ftp;
        return true;
    } catch (IOException e) {
        Data.logger.debug("GridStorage.connectFTP/1 trying to connect to the ftp server failed", e);
        return false;
    }
}
Also used : MultiProtocolURL(net.yacy.grid.tools.MultiProtocolURL) IOException(java.io.IOException)

Example 4 with MultiProtocolURL

use of net.yacy.grid.tools.MultiProtocolURL in project yacy_grid_crawler by yacy.

The following code is from the class CrawlStartService, method serviceImpl.

@Override
public ServiceResponse serviceImpl(Query call, HttpServletResponse response) {
    JSONObject crawlstart = CrawlerDefaultValuesService.crawlStartDefaultClone();
    for (String key : crawlstart.keySet()) {
        Object object = crawlstart.get(key);
        if (object instanceof String)
            crawlstart.put(key, call.get(key, crawlstart.getString(key)));
        else if (object instanceof Integer)
            crawlstart.put(key, call.get(key, crawlstart.getInt(key)));
        else if (object instanceof Long)
            crawlstart.put(key, call.get(key, crawlstart.getLong(key)));
        else if (object instanceof JSONArray) {
            JSONArray a = crawlstart.getJSONArray(key);
            Object cv = call.get(key);
            if (cv != null)
                crawlstart.put(key, cv);
        } else {
            System.out.println("unrecognized type: " + object.getClass().toString());
        }
    }
    // set the crawl id
    CrawlstartURLSplitter crawlstartURLs = new CrawlstartURLSplitter(crawlstart.getString("crawlingURL"));
    Date now = new Date();
    // start the crawls; each of the url in a separate crawl to enforce parallel loading from different hosts
    SusiThought allCrawlstarts = new SusiThought();
    int count = 0;
    for (MultiProtocolURL url : crawlstartURLs.getURLs()) {
        JSONObject singlecrawl = new JSONObject();
        // create a clone of crawlstart
        for (String key : crawlstart.keySet()) singlecrawl.put(key, crawlstart.get(key));
        singlecrawl.put("id", Crawler.getCrawlID(url, now, count++));
        try {
            GridQueue queueName = Data.gridBroker.queueName(YaCyServices.crawler, YaCyServices.crawler.getQueues(), ShardingMethod.BALANCE, Crawler.CRAWLER_PRIORITY_DIMENSIONS, singlecrawl.getInt("priority"), url.getHost());
            SusiThought json = new SusiThought();
            json.setData(new JSONArray().put(singlecrawl));
            JSONObject action = new JSONObject().put("type", YaCyServices.crawler.name()).put("queue", queueName.name()).put("id", singlecrawl.getString("id")).put("depth", 0).put("sourcegraph", "rootasset");
            SusiAction crawlAction = new SusiAction(action);
            JSONObject graph = new JSONObject(true).put(WebMapping.canonical_s.getMapping().name(), url.toNormalform(true));
            crawlAction.setJSONListAsset("rootasset", new JSONList().add(graph));
            json.addAction(crawlAction);
            allCrawlstarts.addAction(crawlAction);
            byte[] b = json.toString().getBytes(StandardCharsets.UTF_8);
            Data.gridBroker.send(YaCyServices.crawler, queueName, b);
        } catch (IOException e) {
            Data.logger.warn("error when starting crawl for " + url.toNormalform(true), e);
            allCrawlstarts.put(ObjectAPIHandler.COMMENT_KEY, e.getMessage());
        }
    }
    // construct a crawl start message
    allCrawlstarts.setData(new JSONArray().put(crawlstart));
    allCrawlstarts.put(ObjectAPIHandler.SUCCESS_KEY, allCrawlstarts.getActions().size() > 0);
    // finally add the crawl start on the queue
    return new ServiceResponse(allCrawlstarts);
}
Also used : SusiThought(ai.susi.mind.SusiThought) JSONArray(org.json.JSONArray) IOException(java.io.IOException) Date(java.util.Date) SusiAction(ai.susi.mind.SusiAction) ServiceResponse(net.yacy.grid.http.ServiceResponse) JSONObject(org.json.JSONObject) GridQueue(net.yacy.grid.io.messages.GridQueue) MultiProtocolURL(net.yacy.grid.tools.MultiProtocolURL) CrawlstartURLSplitter(net.yacy.grid.crawler.Crawler.CrawlstartURLSplitter) JSONObject(org.json.JSONObject) JSONList(net.yacy.grid.tools.JSONList)

Aggregations

MultiProtocolURL (net.yacy.grid.tools.MultiProtocolURL)4 IOException (java.io.IOException)3 Date (java.util.Date)3 GridQueue (net.yacy.grid.io.messages.GridQueue)2 JSONArray (org.json.JSONArray)2 JSONObject (org.json.JSONObject)2 SusiAction (ai.susi.mind.SusiAction)1 SusiThought (ai.susi.mind.SusiThought)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 SimpleDateFormat (java.text.SimpleDateFormat)1 CrawlstartURLSplitter (net.yacy.grid.crawler.Crawler.CrawlstartURLSplitter)1 ServiceResponse (net.yacy.grid.http.ServiceResponse)1 JSONList (net.yacy.grid.tools.JSONList)1