use of net.yacy.grid.tools.MultiProtocolURL in project yacy_grid_loader by yacy.
the class ContentLoader method loadHTTP.
private static void loadHTTP(final WarcWriter warcWriter, final String url, final String threadName, final boolean useHeadlessLoader) throws IOException {
    // on short memory status, re-initialize the http client
    if (Memory.shortStatus()) {
        ApacheHttpClient.initClient(userAgentDefault);
    }
    Date loaddate = new Date();

    // first do a HEAD request to find the mime type
    ApacheHttpClient ac = new ApacheHttpClient(url, true);

    // here we know the content type
    byte[] content = null;
    MultiProtocolURL u = new MultiProtocolURL(url);
    if (useHeadlessLoader && (ac.getMime().endsWith("/html") || ac.getMime().endsWith("/xhtml+xml") || u.getContentDomainFromExt() == ContentDomain.TEXT)) try {
        // use htmlunit to load this
        HtmlUnitLoader htmlUnitLoader = new HtmlUnitLoader(url, threadName);
        String xml = htmlUnitLoader.getXml();
        content = xml.getBytes(StandardCharsets.UTF_8);
    } catch (Throwable e) {
        // HtmlUnit failed; content stays null, so the plain http fallback below is used
        String cause = e == null ? "null" : e.getMessage();
        if (cause != null && cause.indexOf("404") >= 0) {
            throw new IOException(url + " fail: " + cause);
        }
        Data.logger.debug("Loader - HtmlUnit failed (will retry): " + cause);
    }
    if (content == null) {
        // do another http request. This can happen either because the mime type is not html
        // or because it was html and HtmlUnit has failed - we retry the normal way here.
        ac = new ApacheHttpClient(url, false);
        content = ac.getContent();
    }
    JwatWarcWriter.writeRequest(warcWriter, url, null, loaddate, null, null, ac.getRequestHeader().getBytes(StandardCharsets.UTF_8));

    // prepend the response header to the content before writing the WARC response record
    ByteArrayOutputStream r = new ByteArrayOutputStream();
    r.write(ac.getResponseHeader().toString().getBytes(StandardCharsets.UTF_8));
    r.write(content);
    content = r.toByteArray();
    Data.logger.info("ContentLoader writing WARC for " + url + " - " + content.length + " bytes");
    JwatWarcWriter.writeResponse(warcWriter, url, null, loaddate, null, null, content);
}
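The headless branch above is taken only if the HEAD request reported an (X)HTML mime type or if MultiProtocolURL classifies the URL as text content from its file extension. A minimal sketch of that extension check, using a hypothetical URL:

    MultiProtocolURL u = new MultiProtocolURL("https://example.org/docs/page.html"); // hypothetical URL; throws MalformedURLException for invalid input
    boolean looksLikeText = u.getContentDomainFromExt() == ContentDomain.TEXT;       // purely extension-based, no network access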
use of net.yacy.grid.tools.MultiProtocolURL in project yacy_grid_crawler by yacy.
the class Crawler method newLoaderAction.
/**
 * Create a new loader action. This action contains all follow-up actions after
 * loading to create a steering of parser, indexing and follow-up crawler actions.
 * @param priority the priority of the action; used together with the priority dimensions to select the target queues
 * @param id the crawl id
 * @param urls the urls which are part of the same action
 * @param depth the depth of the crawl step (0 is the start depth)
 * @param retry the number of load re-tries (0 means no retry, i.e. this is the first attempt)
 * @param timestamp the current time when the crawler created the action
 * @param partition unique number of the url set partition. This is used to create asset names.
 * @param doCrawling flag: if true, create a follow-up crawling action. Set this to false to terminate crawling afterwards.
 * @param doIndexing flag: if true, do an indexing after loading. Set this to false if the purpose is only a follow-up crawl after parsing.
 * @return the action json
 * @throws IOException
 */
public static JSONObject newLoaderAction(int priority, String id, JSONArray urls, int depth, int retry, long timestamp, int partition, boolean doCrawling, boolean doIndexing) throws IOException {
    // create file names for the assets: this uses depth and partition information
    // we must create this here to prevent concurrency bugs which are there in the date formatter :((
    SimpleDateFormat FORMAT_TIMEF = new SimpleDateFormat(PATTERN_TIMEF, Locale.US);
    String namestub = id + "/d" + intf(depth) + "-t" + FORMAT_TIMEF.format(new Date(timestamp)) + "-p" + intf(partition);
    String warcasset = namestub + ".warc.gz";
    String webasset = namestub + ".web.jsonlist";
    String graphasset = namestub + ".graph.jsonlist";
    String hashKey = new MultiProtocolURL(urls.getString(0)).getHost();

    // create actions to be done in reverse order:
    // at the end of the processing we simultaneously place actions on the indexing and crawling queue
    JSONArray postParserActions = new JSONArray();

    // one or both must be true; doing neither of them does not make sense
    assert doIndexing || doCrawling;

    // if all of the urls shall be indexed (see indexing patterns) then add indexing actions
    if (doIndexing) {
        GridQueue indexerQueueName = Data.gridBroker.queueName(YaCyServices.indexer, YaCyServices.indexer.getQueues(), ShardingMethod.BALANCE, INDEXER_PRIORITY_DIMENSIONS, priority, hashKey);
        postParserActions.put(new JSONObject(true)
                .put("type", YaCyServices.indexer.name())
                .put("queue", indexerQueueName.name())
                .put("id", id)
                .put("sourceasset", webasset));
    }

    // if all of the urls shall be crawled at depth + 1, add a crawling action. Omit this only if the crawling depth is at the depth limit.
    if (doCrawling) {
        GridQueue crawlerQueueName = Data.gridBroker.queueName(YaCyServices.crawler, YaCyServices.crawler.getQueues(), ShardingMethod.BALANCE, CRAWLER_PRIORITY_DIMENSIONS, priority, hashKey);
        postParserActions.put(new JSONObject(true)
                .put("type", YaCyServices.crawler.name())
                .put("queue", crawlerQueueName.name())
                .put("id", id)
                .put("depth", depth + 1)
                .put("sourcegraph", graphasset));
    }

    // before that and after loading we have a parsing action
    GridQueue parserQueueName = Data.gridBroker.queueName(YaCyServices.parser, YaCyServices.parser.getQueues(), ShardingMethod.BALANCE, PARSER_PRIORITY_DIMENSIONS, priority, hashKey);
    JSONArray parserActions = new JSONArray().put(new JSONObject(true)
            .put("type", YaCyServices.parser.name())
            .put("queue", parserQueueName.name())
            .put("id", id)
            .put("sourceasset", warcasset)
            .put("targetasset", webasset)
            .put("targetgraph", graphasset)
            .put("actions", postParserActions)); // actions after parsing

    // at the beginning of the process, we do a loading.
    GridQueue loaderQueueName = Data.gridBroker.queueName(YaCyServices.loader, YaCyServices.loader.getQueues(), ShardingMethod.BALANCE, LOADER_PRIORITY_DIMENSIONS, priority, hashKey);
    JSONObject loaderAction = new JSONObject(true)
            .put("type", YaCyServices.loader.name())
            .put("queue", loaderQueueName.name())
            .put("id", id)
            .put("urls", urls)
            .put("targetasset", warcasset)
            .put("actions", parserActions); // actions after loading
    return loaderAction;
}
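For orientation, the returned loader action nests its follow-up actions roughly as in the sketch below. All placeholders in angle brackets, the example url and the depth value (1, assuming a start depth of 0) are hypothetical; the indexer and crawler entries appear only when doIndexing and doCrawling are set:

    {
      "type": "loader",
      "queue": "<loader queue>",
      "id": "<crawl id>",
      "urls": ["https://example.org/"],
      "targetasset": "<namestub>.warc.gz",
      "actions": [{
        "type": "parser",
        "queue": "<parser queue>",
        "id": "<crawl id>",
        "sourceasset": "<namestub>.warc.gz",
        "targetasset": "<namestub>.web.jsonlist",
        "targetgraph": "<namestub>.graph.jsonlist",
        "actions": [
          {"type": "indexer", "queue": "<indexer queue>", "id": "<crawl id>", "sourceasset": "<namestub>.web.jsonlist"},
          {"type": "crawler", "queue": "<crawler queue>", "id": "<crawl id>", "depth": 1, "sourcegraph": "<namestub>.graph.jsonlist"}
        ]
      }]
    }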
use of net.yacy.grid.tools.MultiProtocolURL in project yacy_grid_mcp by yacy.
the class GridStorage method connectFTP.
public boolean connectFTP(String url) {
    try {
        MultiProtocolURL u = new MultiProtocolURL(url);
        StorageFactory<byte[]> ftp = new FTPStorageFactory(u.getHost(), u.getPort(), u.getUser(), u.getPassword(), this.deleteafterread);
        // test the connection
        ftp.getStorage().checkConnection();
        this.ftp = ftp;
        return true;
    } catch (IOException e) {
        Data.logger.debug("GridStorage.connectFTP/1 trying to connect to the ftp server failed", e);
        return false;
    }
}
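A minimal usage sketch, assuming an existing GridStorage instance named storage; host, port and credentials are hypothetical. MultiProtocolURL extracts user, password, host and port from the single connection URL:

    boolean connected = storage.connectFTP("ftp://anonymous:secret@ftp.example.org:2121");
    if (!connected) {
        // the connection test failed; this.ftp was left unchanged
    }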
use of net.yacy.grid.tools.MultiProtocolURL in project yacy_grid_crawler by yacy.
the class CrawlStartService method serviceImpl.
@Override
public ServiceResponse serviceImpl(Query call, HttpServletResponse response) {
    JSONObject crawlstart = CrawlerDefaultValuesService.crawlStartDefaultClone();

    // overwrite the defaults with values from the request, preserving the type of each default value
    for (String key : crawlstart.keySet()) {
        Object object = crawlstart.get(key);
        if (object instanceof String)
            crawlstart.put(key, call.get(key, crawlstart.getString(key)));
        else if (object instanceof Integer)
            crawlstart.put(key, call.get(key, crawlstart.getInt(key)));
        else if (object instanceof Long)
            crawlstart.put(key, call.get(key, crawlstart.getLong(key)));
        else if (object instanceof JSONArray) {
            JSONArray a = crawlstart.getJSONArray(key);
            Object cv = call.get(key);
            if (cv != null) crawlstart.put(key, cv);
        } else {
            System.out.println("unrecognized type: " + object.getClass().toString());
        }
    }

    // set the crawl id
    CrawlstartURLSplitter crawlstartURLs = new CrawlstartURLSplitter(crawlstart.getString("crawlingURL"));
    Date now = new Date();

    // start the crawls; each url goes into a separate crawl to enforce parallel loading from different hosts
    SusiThought allCrawlstarts = new SusiThought();
    int count = 0;
    for (MultiProtocolURL url : crawlstartURLs.getURLs()) {
        JSONObject singlecrawl = new JSONObject();
        // create a clone of crawlstart
        for (String key : crawlstart.keySet()) singlecrawl.put(key, crawlstart.get(key));
        singlecrawl.put("id", Crawler.getCrawlID(url, now, count++));
        try {
            GridQueue queueName = Data.gridBroker.queueName(YaCyServices.crawler, YaCyServices.crawler.getQueues(), ShardingMethod.BALANCE, Crawler.CRAWLER_PRIORITY_DIMENSIONS, singlecrawl.getInt("priority"), url.getHost());
            SusiThought json = new SusiThought();
            json.setData(new JSONArray().put(singlecrawl));
            JSONObject action = new JSONObject()
                    .put("type", YaCyServices.crawler.name())
                    .put("queue", queueName.name())
                    .put("id", singlecrawl.getString("id"))
                    .put("depth", 0)
                    .put("sourcegraph", "rootasset");
            SusiAction crawlAction = new SusiAction(action);
            JSONObject graph = new JSONObject(true).put(WebMapping.canonical_s.getMapping().name(), url.toNormalform(true));
            crawlAction.setJSONListAsset("rootasset", new JSONList().add(graph));
            json.addAction(crawlAction);
            allCrawlstarts.addAction(crawlAction);
            byte[] b = json.toString().getBytes(StandardCharsets.UTF_8);
            Data.gridBroker.send(YaCyServices.crawler, queueName, b);
        } catch (IOException e) {
            Data.logger.warn("error when starting crawl for " + url.toNormalform(true), e);
            allCrawlstarts.put(ObjectAPIHandler.COMMENT_KEY, e.getMessage());
        }
    }

    // construct a crawl start message
    allCrawlstarts.setData(new JSONArray().put(crawlstart));
    allCrawlstarts.put(ObjectAPIHandler.SUCCESS_KEY, allCrawlstarts.getActions().size() > 0);

    // finally add the crawl start on the queue
    return new ServiceResponse(allCrawlstarts);
}
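A crawl is therefore started with a plain HTTP request whose query parameters override the defaults from CrawlerDefaultValuesService; only keys that exist in the default set are taken over. A hedged example call follows: the host, port and servlet path are assumptions about a typical deployment, while crawlingURL and priority are the parameter names actually read by the code above:

    curl "http://localhost:8300/yacy/grid/crawler/crawlstart.json?crawlingURL=https://example.org/&priority=0"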