Use of org.apache.nutch.protocol.Content in project nutch by apache: class FetcherThread, method run().
@SuppressWarnings("fallthrough")
public void run() {
// count threads
activeThreads.incrementAndGet();
FetchItem fit = null;
try {
// checking for the server to be running and fetcher.parse to be true
if (parsing && NutchServer.getInstance().isRunning())
reportToNutchServer = true;
while (true) {
// creating FetchNode for storing in FetchNodeDb
if (reportToNutchServer)
this.fetchNode = new FetchNode();
else
this.fetchNode = null;
// check whether must be stopped
if (isHalted()) {
LOG.debug("{} set to halted", getName());
fit = null;
return;
}
fit = fetchQueues.getFetchItem();
if (fit == null) {
if (feeder.isAlive() || fetchQueues.getTotalSize() > 0) {
LOG.debug("{} spin-waiting ...", getName());
// spin-wait.
spinWaiting.incrementAndGet();
try {
Thread.sleep(500);
} catch (Exception e) {
}
spinWaiting.decrementAndGet();
continue;
} else {
// all done, finish this thread
LOG.info("{} {} has no more work available", getName(), Thread.currentThread().getId());
return;
}
}
lastRequestStart.set(System.currentTimeMillis());
Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
if (reprUrlWritable == null) {
setReprUrl(fit.url.toString());
} else {
setReprUrl(reprUrlWritable.toString());
}
try {
// fetch the page
redirecting = false;
redirectCount = 0;
// Publisher event
if (activatePublisher) {
FetcherThreadEvent startEvent = new FetcherThreadEvent(PublishEventType.START, fit.getUrl().toString());
publisher.publish(startEvent, conf);
}
do {
if (LOG.isInfoEnabled()) {
LOG.info("{} {} fetching {} (queue crawl delay={}ms)", getName(), Thread.currentThread().getId(), fit.url, fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay);
}
LOG.debug("redirectCount={}", redirectCount);
redirecting = false;
Protocol protocol = this.protocolFactory.getProtocol(fit.u);
BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum, robotsTxtContent);
if (robotsTxtContent != null) {
outputRobotsTxt(robotsTxtContent);
robotsTxtContent.clear();
}
if (rules.isDeferVisits()) {
LOG.info("Defer visits for queue {} : {}", fit.queueID, fit.url);
// retry the fetch item
if (fetchQueues.timelimitExceeded()) {
fetchQueues.finishFetchItem(fit, true);
} else {
fetchQueues.addFetchItem(fit);
}
// but check whether it's time to cancel the queue
int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID(), this.robotsDeferVisitsRetries + 1, this.robotsDeferVisitsDelay);
if (killedURLs != 0) {
context.getCounter("FetcherStatus", "robots_defer_visits_dropped").increment(killedURLs);
}
continue;
}
if (!rules.isAllowed(fit.url.toString())) {
// unblock
fetchQueues.finishFetchItem(fit, true);
LOG.info("Denied by robots.txt: {}", fit.url);
output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
context.getCounter("FetcherStatus", "robots_denied").increment(1);
continue;
}
if (rules.getCrawlDelay() > 0) {
if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
// unblock
fetchQueues.finishFetchItem(fit, true);
LOG.info("Crawl-Delay for {} too long ({} ms), skipping", fit.url, rules.getCrawlDelay());
output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
context.getCounter("FetcherStatus", "robots_denied_maxcrawldelay").increment(1);
continue;
} else {
FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
long crawlDelay = rules.getCrawlDelay();
if (crawlDelay < minCrawlDelay) {
LOG.info("Crawl-Delay for {} too short ({} ms), adjusting to {} ms", fit.url, rules.getCrawlDelay(), minCrawlDelay);
crawlDelay = minCrawlDelay;
}
fiq.crawlDelay = crawlDelay;
LOG.debug("Crawl delay for queue: {} is set to {} as per robots.txt. url: ", fit.queueID, fiq.crawlDelay, fit.url);
}
}
ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);
ProtocolStatus status = output.getStatus();
Content content = output.getContent();
ParseStatus pstatus = null;
// unblock queue
fetchQueues.finishFetchItem(fit);
// used for FetchNode
if (fetchNode != null) {
fetchNode.setStatus(status.getCode());
fetchNode.setFetchTime(System.currentTimeMillis());
fetchNode.setUrl(fit.url);
}
// Publish fetch finish event
if (activatePublisher) {
FetcherThreadEvent endEvent = new FetcherThreadEvent(PublishEventType.END, fit.getUrl().toString());
endEvent.addEventData("status", status.getName());
publisher.publish(endEvent, conf);
}
context.getCounter("FetcherStatus", status.getName()).increment(1);
switch(status.getCode()) {
case ProtocolStatus.WOULDBLOCK:
// retry ?
fetchQueues.addFetchItem(fit);
break;
// got a page
case ProtocolStatus.SUCCESS:
pstatus = output(fit.url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
updateStatus(content.getContent().length);
if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
String newUrl = pstatus.getMessage();
int refreshTime = Integer.parseInt(pstatus.getArgs()[1]);
Text redirUrl = handleRedirect(fit, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
if (redirUrl != null) {
fit = queueRedirect(redirUrl, fit);
}
}
break;
// redirect
case ProtocolStatus.MOVED:
case ProtocolStatus.TEMP_MOVED:
int code;
boolean temp;
if (status.getCode() == ProtocolStatus.MOVED) {
code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
temp = false;
} else {
code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
temp = true;
}
output(fit.url, fit.datum, content, status, code);
String newUrl = status.getMessage();
Text redirUrl = handleRedirect(fit, newUrl, temp, Fetcher.PROTOCOL_REDIR);
if (redirUrl != null) {
fit = queueRedirect(redirUrl, fit);
} else {
// stop redirecting
redirecting = false;
}
break;
case ProtocolStatus.EXCEPTION:
logError(fit.url, status.getMessage());
int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID());
if (killedURLs != 0)
context.getCounter("FetcherStatus", "AboveExceptionThresholdInQueue").increment(killedURLs);
// retry
case ProtocolStatus.RETRY:
case ProtocolStatus.BLOCKED:
output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
break;
// gone
case ProtocolStatus.GONE:
case ProtocolStatus.NOTFOUND:
case ProtocolStatus.ACCESS_DENIED:
case ProtocolStatus.ROBOTS_DENIED:
output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
break;
case ProtocolStatus.NOTMODIFIED:
output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
break;
default:
if (LOG.isWarnEnabled()) {
LOG.warn("{} {} Unknown ProtocolStatus: {}", getName(), Thread.currentThread().getId(), status.getCode());
}
output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
}
if (redirecting && redirectCount > maxRedirect) {
fetchQueues.finishFetchItem(fit);
context.getCounter("FetcherStatus", "redirect_count_exceeded").increment(1);
if (LOG.isInfoEnabled()) {
LOG.info("{} {} - redirect count exceeded {} ({})", getName(), Thread.currentThread().getId(), fit.url, maxRedirectExceededSkip ? "skipped" : "linked");
}
if (maxRedirectExceededSkip) {
// skip redirect target when redirect count is exceeded
} else {
Text newUrl = new Text(status.getMessage());
CrawlDatum newDatum = createRedirDatum(newUrl, fit, CrawlDatum.STATUS_LINKED);
output(newUrl, newDatum, null, null, CrawlDatum.STATUS_LINKED);
}
}
} while (redirecting && (redirectCount <= maxRedirect));
} catch (Throwable t) {
// unexpected exception
// unblock
fetchQueues.finishFetchItem(fit);
String message;
if (LOG.isDebugEnabled()) {
message = StringUtils.stringifyException(t);
} else if (logUtil.logShort(t)) {
message = t.getClass().getName();
} else {
message = StringUtils.stringifyException(t);
}
logError(fit.url, message);
output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY);
}
}
} catch (Throwable e) {
if (LOG.isErrorEnabled()) {
LOG.error("fetcher caught:", e);
}
} finally {
if (fit != null) {
fetchQueues.finishFetchItem(fit);
}
// count threads
activeThreads.decrementAndGet();
LOG.info("{} {} -finishing thread {}, activeThreads={}", getName(), Thread.currentThread().getId(), getName(), activeThreads);
}
}
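The loop's central use of Content is the hand-off from ProtocolOutput to output(...): the protocol plugin returns the fetched page as a Content carrying the raw bytes, the content type and the protocol response headers. Below is a minimal sketch of that hand-off, outside FetcherThread and under stated assumptions: protocol, url and datum are illustrative parameters, not fields of the class.
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;

public class FetchContentSketch {
  // Fetch one URL and expose the pieces of Content the fetcher loop consumes.
  static Content fetchOnce(Protocol protocol, Text url, CrawlDatum datum) {
    ProtocolOutput out = protocol.getProtocolOutput(url, datum);
    Content content = out.getContent();        // payload plus response metadata
    byte[] raw = content.getContent();         // fetched bytes; their length drives updateStatus()
    String type = content.getContentType();    // e.g. "text/html"
    Metadata headers = content.getMetadata();  // protocol response headers
    return content;
  }
}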
Use of org.apache.nutch.protocol.Content in project nutch by apache: class HttpRobotRulesParser, method addRobotsContent().
/**
* Append {@link Content} of robots.txt to {@literal robotsTxtContent}
*
* @param robotsTxtContent
* container to store robots.txt response content
* @param robotsUrl
* robots.txt URL
* @param robotsResponse
* response object to be stored
*/
protected void addRobotsContent(List<Content> robotsTxtContent, URL robotsUrl, Response robotsResponse) {
byte[] robotsBytes = robotsResponse.getContent();
if (robotsBytes == null)
robotsBytes = new byte[0];
Content content = new Content(robotsUrl.toString(), robotsUrl.toString(), robotsBytes, robotsResponse.getHeader("Content-Type"), robotsResponse.getHeaders(), getConf());
robotsTxtContent.add(content);
}
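For reference, the Content constructor used above can be exercised on its own; a minimal sketch with made-up values, where only the argument order (url, base, content, contentType, metadata, conf) mirrors the call in addRobotsContent:
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;

public class RobotsContentSketch {
  static Content sampleRobotsContent() {
    Configuration conf = NutchConfiguration.create();
    Metadata headers = new Metadata();            // stands in for the response headers
    headers.add("Content-Type", "text/plain");
    byte[] bytes = "User-agent: *\nDisallow:".getBytes(StandardCharsets.UTF_8);
    // Same argument order as above: url, base, content, contentType, metadata, conf
    return new Content("https://example.org/robots.txt", "https://example.org/robots.txt",
        bytes, "text/plain", headers, conf);
  }
}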
Use of org.apache.nutch.protocol.Content in project nutch by apache: class TestMetadataScoringFilter, method passScoreAfterParsing().
@Test
public void passScoreAfterParsing() {
Configuration conf = NutchConfiguration.create();
conf.set(MetadataScoringFilter.METADATA_DATUM, "parent,depth");
conf.set(MetadataScoringFilter.METADATA_CONTENT, "parent,depth");
MetadataScoringFilter metadataScoringFilter = new MetadataScoringFilter();
metadataScoringFilter.setConf(conf);
CrawlDatum crawlDatum = new CrawlDatum();
Text from = new Text("https://nutch.apache.org/");
String PARENT = "parent";
String DEPTH = "depth";
String parentMD = "https://nutch.apache.org/";
String depthMD = "1";
crawlDatum.getMetaData().put(new Text(PARENT), new Text(parentMD));
crawlDatum.getMetaData().put(new Text(DEPTH), new Text(depthMD));
Content content = new Content();
metadataScoringFilter.passScoreBeforeParsing(from, crawlDatum, content);
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, null, null, content.getMetadata());
Parse parse = new ParseImpl(from.toString(), parseData);
metadataScoringFilter.passScoreAfterParsing(from, content, parse);
Assert.assertEquals(parentMD, parse.getData().getMeta(PARENT));
Assert.assertEquals(depthMD, parse.getData().getMeta(DEPTH));
}
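The assertions boil down to a simple round trip: whatever passScoreBeforeParsing writes into the Content's Metadata is still readable when the parse data is built from it. An illustrative sketch of that round trip on a bare Content (the key "depth" is arbitrary):
import org.apache.nutch.protocol.Content;

public class ContentMetadataSketch {
  static String roundTrip() {
    Content content = new Content();             // empty Content, as in the test
    content.getMetadata().add("depth", "1");     // what a scoring filter might store
    return content.getMetadata().get("depth");   // returns "1"
  }
}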
Use of org.apache.nutch.protocol.Content in project nutch by apache: class SegmentHandler, method handle().
@Override
public void handle(Request req, HttpServletResponse res, String target, int dispatch) throws IOException, ServletException {
try {
String uri = req.getUri().toString();
LOG.info("URI: " + uri);
addMyHeader(res, "URI", uri);
Text url = new Text(uri.toString());
CrawlDatum cd = seg.getCrawlDatum(url);
if (cd != null) {
addMyHeader(res, "Res", "found");
LOG.info("-got " + cd.toString());
ProtocolStatus ps = (ProtocolStatus) cd.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
if (ps != null) {
Integer TrCode = protoCodes.get(ps.getCode());
if (TrCode != null) {
res.setStatus(TrCode.intValue());
} else {
res.setStatus(HttpServletResponse.SC_OK);
}
addMyHeader(res, "ProtocolStatus", ps.toString());
} else {
res.setStatus(HttpServletResponse.SC_OK);
}
Content c = seg.getContent(url);
if (c == null) {
// missing content
req.setHandled(true);
res.addHeader("X-Handled-By", getClass().getSimpleName());
return;
}
byte[] data = c.getContent();
LOG.debug("-data len=" + data.length);
Metadata meta = c.getMetadata();
String[] names = meta.names();
LOG.debug("- " + names.length + " meta");
for (int i = 0; i < names.length; i++) {
boolean my = true;
char ch = names[i].charAt(0);
if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
// pretty good chance it's a standard header
my = false;
}
String[] values = meta.getValues(names[i]);
for (int k = 0; k < values.length; k++) {
if (my) {
addMyHeader(res, names[i], values[k]);
} else {
res.addHeader(names[i], values[k]);
}
}
}
req.setHandled(true);
res.addHeader("X-Handled-By", getClass().getSimpleName());
res.setContentType(meta.get(Metadata.CONTENT_TYPE));
res.setContentLength(data.length);
OutputStream os = res.getOutputStream();
os.write(data, 0, data.length);
res.flushBuffer();
} else {
addMyHeader(res, "Res", "not found");
LOG.info(" -not found " + url);
}
} catch (Exception e) {
e.printStackTrace();
LOG.warn(StringUtils.stringifyException(e));
addMyHeader(res, "Res", "Exception: " + StringUtils.stringifyException(e));
}
}
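The handler obtains its Content through a segment helper (seg.getContent(url)). Independently of that helper, a segment's content part files are readable as Hadoop SequenceFiles keyed by URL, which is what the CommonCrawlDataDumper example below also relies on. A hedged stand-alone sketch of such a lookup, with an illustrative path and URL (this is not how SegmentHandler itself is implemented):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.protocol.Content;

public class SegmentContentLookupSketch {
  // Scan e.g. segment/content/part-00000/data for the given URL and return its Content.
  static Content lookup(Configuration conf, Path dataFile, String url) throws Exception {
    try (SequenceFile.Reader reader =
        new SequenceFile.Reader(conf, SequenceFile.Reader.file(dataFile))) {
      Text key = new Text();
      Content value = new Content();
      while (reader.next(key)) {
        reader.getCurrentValue(value);
        if (key.toString().equals(url)) {
          return value;  // bytes via getContent(), headers via getMetadata()
        }
      }
    }
    return null;
  }
}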
Use of org.apache.nutch.protocol.Content in project nutch by apache: class CommonCrawlDataDumper, method dump().
/**
* Dumps the reverse-engineered CBOR content from the provided segment
* directories. If a parent directory contains more than one segment, all of
* them are dumped; otherwise a single segment can be passed as an argument.
* If the gzip flag is set, the CBOR output is also compressed.
*
* @param outputDir the directory you wish to dump the raw content to. This
* directory will be created.
* @param segmentRootDir a directory containing one or more segments.
* @param linkdb Path to linkdb.
* @param gzip a boolean flag indicating whether the CBOR content should also
* be gzipped.
* @param mimeTypes a string array of MIME types to filter on; everything else is excluded
* @param epochFilename if {@code true}, output files will be named using the epoch time (in milliseconds).
* @param extension a file extension to use with output documents.
* @param warc if {@code true}, write the output in WARC format.
* @throws Exception if any exception occurs.
*/
public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes, boolean epochFilename, String extension, boolean warc) throws Exception {
if (gzip) {
LOG.info("Gzipping CBOR data has been skipped");
}
// total file counts
Map<String, Integer> typeCounts = new HashMap<>();
// filtered file counters
Map<String, Integer> filteredCounts = new HashMap<>();
Configuration nutchConfig = NutchConfiguration.create();
Path segmentRootPath = new Path(segmentRootDir.toString());
FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);
// get all paths
List<Path> parts = new ArrayList<>();
RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}" + File.separator + "data";
while (files.hasNext()) {
LocatedFileStatus next = files.next();
if (next.isFile()) {
Path path = next.getPath();
if (path.toString().matches(partPattern)) {
parts.add(path);
}
}
}
LinkDbReader linkDbReader = null;
if (linkdb != null) {
linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
}
if (parts == null || parts.size() == 0) {
LOG.error("No segment directories found in {} ", segmentRootDir.getAbsolutePath());
System.exit(1);
}
LOG.info("Found {} segment parts", parts.size());
if (gzip && !warc) {
fileList = new ArrayList<>();
constructNewStream(outputDir);
}
for (Path segmentPart : parts) {
LOG.info("Processing segment Part : [ {} ]", segmentPart);
try {
SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig, SequenceFile.Reader.file(segmentPart));
Writable key = (Writable) reader.getKeyClass().getConstructor().newInstance();
Content content = null;
while (reader.next(key)) {
content = new Content();
reader.getCurrentValue(content);
Metadata metadata = content.getMetadata();
String url = key.toString();
String baseName = FilenameUtils.getBaseName(url);
String extensionName = FilenameUtils.getExtension(url);
if (!extension.isEmpty()) {
extensionName = extension;
} else if ((extensionName == null) || extensionName.isEmpty()) {
extensionName = "html";
}
String outputFullPath = null;
String outputRelativePath = null;
String filename = null;
String timestamp = null;
String reverseKey = null;
if (epochFilename || config.getReverseKey()) {
try {
long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(getDate(metadata.get("Date"))).getTime();
timestamp = String.valueOf(epoch);
} catch (ParseException pe) {
LOG.warn(pe.getMessage());
}
reverseKey = reverseUrl(url);
config.setReverseKeyValue(reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url) + "_" + timestamp);
}
if (!warc) {
if (epochFilename) {
outputFullPath = DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(), reverseKey, url, timestamp, extensionName, !gzip);
outputRelativePath = outputFullPath.substring(0, outputFullPath.lastIndexOf(File.separator) - 1);
filename = content.getMetadata().get(Metadata.DATE) + "." + extensionName;
} else {
String md5Ofurl = DumpFileUtil.getUrlMD5(url);
String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, !gzip);
filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
outputFullPath = String.format("%s/%s", fullDir, filename);
String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
String firstLevelDirName = fullPathLevels[fullPathLevels.length - 2];
String secondLevelDirName = fullPathLevels[fullPathLevels.length - 1];
outputRelativePath = firstLevelDirName + secondLevelDirName;
}
}
// Encode all filetypes if no mimetypes have been given
Boolean filter = (mimeTypes == null);
String jsonData = "";
try {
String mimeType = new Tika().detect(content.getContent());
// Maps file to JSON-based structure
// there may be duplicates, so using set
Set<String> inUrls = null;
if (linkDbReader != null) {
Inlinks inlinks = linkDbReader.getInlinks((Text) key);
if (inlinks != null) {
Iterator<Inlink> iterator = inlinks.iterator();
inUrls = new LinkedHashSet<>();
while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()) {
inUrls.add(iterator.next().getFromUrl());
}
}
}
// TODO: Make this Jackson Format implementation reusable
try (CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
if (inUrls != null) {
format.setInLinks(new ArrayList<>(inUrls));
}
jsonData = format.getJsonData(url, content, metadata);
}
collectStats(typeCounts, mimeType);
// collects statistics for the given mimetypes
if ((mimeType != null) && (mimeTypes != null) && Arrays.asList(mimeTypes).contains(mimeType)) {
collectStats(filteredCounts, mimeType);
filter = true;
}
} catch (IOException ioe) {
LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
return;
}
if (!warc) {
if (filter) {
byte[] byteData = serializeCBORData(jsonData);
if (!gzip) {
File outputFile = new File(outputFullPath);
if (outputFile.exists()) {
LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
} else {
LOG.info("Writing: [" + outputFullPath + "]");
IOUtils.copy(new ByteArrayInputStream(byteData), new FileOutputStream(outputFile));
}
} else {
if (fileList.contains(outputFullPath)) {
LOG.info("Skipping compressing: [" + outputFullPath + "]: file already exists");
} else {
fileList.add(outputFullPath);
LOG.info("Compressing: [" + outputFullPath + "]");
// TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
TarArchiveEntry tarEntry = new TarArchiveEntry(outputRelativePath + File.separator + filename);
tarEntry.setSize(byteData.length);
tarOutput.putArchiveEntry(tarEntry);
tarOutput.write(byteData);
tarOutput.closeArchiveEntry();
}
}
}
}
}
reader.close();
} catch (Exception e) {
LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
} finally {
fs.close();
}
}
if (gzip && !warc) {
closeStream();
}
if (!typeCounts.isEmpty()) {
LOG.info("CommonsCrawlDataDumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
}
}
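A hedged usage sketch of dump() per the javadoc above, assuming a CommonCrawlDataDumper instance has already been constructed and configured. All paths are illustrative; the null arguments simply disable the inlink lookup and MIME filtering, as the code above shows:
import java.io.File;

import org.apache.nutch.tools.CommonCrawlDataDumper;

public class DumpUsageSketch {
  static void run(CommonCrawlDataDumper dumper) throws Exception {
    File outputDir = new File("/tmp/commoncrawl-dump");  // will be created by the dumper
    File segmentRootDir = new File("crawl/segments");    // one or more segments underneath
    dumper.dump(outputDir, segmentRootDir,
        null,    // linkdb: skip inlink lookup
        false,   // gzip: write plain CBOR files
        null,    // mimeTypes: keep every MIME type
        false,   // epochFilename: use MD5-based file names instead
        "",      // extension: derive from the URL, defaulting to "html"
        false);  // warc: write CBOR/JSON rather than WARC
  }
}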