Use of org.apache.nutch.protocol.Protocol in project nutch by apache.
The class TestAny23ParseFilter, method extract.
public String[] extract(String urlString, File file, String contentType) {
  try {
    System.out.println(urlString);
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    content.setContentType(contentType);
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
  } catch (Exception e) {
    e.printStackTrace();
    Assert.fail(e.toString());
  }
  return null;
}
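For context, a minimal sketch of how a test might call this helper; the sample path, content type, and assertions are hypothetical placeholders, not Nutch test data.

// Hedged usage sketch; the sample path is a hypothetical placeholder.
@Test
public void extractTriplesSketch() {
  File sampleFile = new File("sample.html"); // hypothetical sample document
  String url = "file://" + sampleFile.getAbsolutePath();
  String[] triples = extract(url, sampleFile, "text/html");
  Assert.assertNotNull("no triples extracted", triples);
  for (String triple : triples) {
    System.out.println(triple); // each value is one extracted triple
  }
}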
Use of org.apache.nutch.protocol.Protocol in project nutch by apache.
The class TestExtParser, method setUp.
@Before
public void setUp() throws ProtocolException, IOException {
  // prepare a temp file with expectedText as its content
  // This system property is defined in ./src/plugin/build-plugin.xml
  String path = System.getProperty("test.data");
  if (path != null) {
    File tempDir = new File(path);
    if (!tempDir.exists())
      tempDir.mkdir();
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt", tempDir);
  } else {
    // otherwise in java.io.tmpdir
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt");
  }
  urlString = tempFile.toURI().toURL().toString();
  FileOutputStream fos = new FileOutputStream(tempFile);
  fos.write(expectedText.getBytes());
  fos.close();
  // get nutch content
  Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
  content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
  protocol = null;
}
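The matching cleanup is not shown here; a minimal sketch, assuming tempFile is the field populated above and JUnit 4's @After is available:

// Hedged cleanup sketch; assumes tempFile is the field set in setUp().
@After
public void tearDown() {
  if (tempFile != null) {
    tempFile.delete(); // remove the temp file created in setUp()
  }
}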
Use of org.apache.nutch.protocol.Protocol in project nutch by apache.
The class TestRTFParser, method testIt.
@Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;
  Configuration conf = NutchConfiguration.create();
  urlString = "file:" + sampleDir + fileSeparator + rtfFile;
  protocol = new ProtocolFactory(conf).getProtocol(urlString);
  content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
  parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
  String text = parse.getText();
  Assert.assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
  String title = parse.getData().getTitle();
  Metadata meta = parse.getData().getParseMeta();
  Assert.assertEquals("test rft document", title);
  Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));
}
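Note the design difference: parseByExtensionId("parse-tika", content) forces a specific parser plugin, whereas the Any23 example above calls ParseUtil.parse(content), which selects a parser based on the detected content type.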
Use of org.apache.nutch.protocol.Protocol in project nutch by apache.
The class TestZipParser, method testIt.
@Test
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;
  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get(content.getUrl());
    Assert.assertTrue("Extracted text does not start with <" + expectedText + ">: <" + parse.getText() + ">", parse.getText().startsWith(expectedText));
  }
}
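All of the tests above share one retrieval pattern: build a file: URL, ask ProtocolFactory for the matching Protocol, and read the Content from its ProtocolOutput. A minimal standalone sketch of that pattern, with a hypothetical local path:

// Standalone sketch of the shared retrieval pattern; the path is a
// hypothetical placeholder.
Configuration conf = NutchConfiguration.create();
String url = "file:/tmp/sample.txt"; // hypothetical local file
Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
System.out.println(content.getContentType()); // MIME type detected by Nutch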
Use of org.apache.nutch.protocol.Protocol in project nutch by apache.
The class FetcherThread, method run.
@SuppressWarnings("fallthrough")
public void run() {
  // count threads
  activeThreads.incrementAndGet();
  Text url = new Text();
  FetchItem fit = null;
  try {
    // checking for the server to be running and fetcher.parse to be true
    if (parsing && NutchServer.getInstance().isRunning())
      reportToNutchServer = true;
    while (true) {
      // creating FetchNode for storing in FetchNodeDb
      if (reportToNutchServer)
        this.fetchNode = new FetchNode();
      else
        this.fetchNode = null;
      // check whether must be stopped
      if (isHalted()) {
        LOG.debug(getName() + " set to halted");
        fit = null;
        return;
      }
      fit = ((FetchItemQueues) fetchQueues).getFetchItem();
      if (fit != null) {
        URL u = fit.u;
        String temp_url = u.getProtocol() + "://" + u.getHost() + ":" + u.getPort() + u.getFile();
        url = new Text(temp_url);
      }
      if (fit == null) {
        if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
          LOG.debug(getName() + " spin-waiting ...");
          // spin-wait.
          ((AtomicInteger) spinWaiting).incrementAndGet();
          try {
            Thread.sleep(500);
          } catch (Exception e) {
          }
          ((AtomicInteger) spinWaiting).decrementAndGet();
          continue;
        } else {
          // all done, finish this thread
          LOG.info(getName() + " " + Thread.currentThread().getId() + " has no more work available");
          return;
        }
      }
      lastRequestStart.set(System.currentTimeMillis());
      Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
      if (reprUrlWritable == null) {
        setReprUrl(url.toString());
      } else {
        setReprUrl(reprUrlWritable.toString());
      }
      try {
        // fetch the page
        redirecting = false;
        redirectCount = 0;
        // Publisher event
        if (activatePublisher) {
          FetcherThreadEvent startEvent = new FetcherThreadEvent(PublishEventType.START, fit.getUrl().toString());
          publisher.publish(startEvent, conf);
        }
        do {
          if (LOG.isInfoEnabled()) {
            LOG.info(getName() + " " + Thread.currentThread().getId() + " fetching " + url + " (queue crawl delay=" + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay + "ms)");
          }
          if (LOG.isDebugEnabled()) {
            LOG.debug("redirectCount=" + redirectCount);
          }
          redirecting = false;
          Protocol protocol = this.protocolFactory.getProtocol(url.toString());
          BaseRobotRules rules = protocol.getRobotRules(url, fit.datum, robotsTxtContent);
          if (robotsTxtContent != null) {
            outputRobotsTxt(robotsTxtContent);
            robotsTxtContent.clear();
          }
          if (!rules.isAllowed(fit.u.toString())) {
            // unblock
            ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
            if (LOG.isDebugEnabled()) {
              LOG.debug("Denied by robots.txt: " + url);
            }
            output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
            context.getCounter("FetcherStatus", "robots_denied").increment(1);
            continue;
          }
          if (rules.getCrawlDelay() > 0) {
            if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
              // unblock
              ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
              LOG.debug("Crawl-Delay for " + url + " too long (" + rules.getCrawlDelay() + "), skipping");
              output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
              context.getCounter("FetcherStatus", "robots_denied_maxcrawldelay").increment(1);
              continue;
            } else {
              FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
              fiq.crawlDelay = rules.getCrawlDelay();
              if (LOG.isDebugEnabled()) {
                LOG.debug("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + url);
              }
            }
          }
          ProtocolOutput output = protocol.getProtocolOutput(url, fit.datum);
          ProtocolStatus status = output.getStatus();
          Content content = output.getContent();
          ParseStatus pstatus = null;
          // unblock queue
          ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
          String urlString = url.toString();
          // used for FetchNode
          if (fetchNode != null) {
            fetchNode.setStatus(status.getCode());
            fetchNode.setFetchTime(System.currentTimeMillis());
            fetchNode.setUrl(url);
          }
          // Publish fetch finish event
          if (activatePublisher) {
            FetcherThreadEvent endEvent = new FetcherThreadEvent(PublishEventType.END, fit.getUrl().toString());
            endEvent.addEventData("status", status.getName());
            publisher.publish(endEvent, conf);
          }
          context.getCounter("FetcherStatus", status.getName()).increment(1);
          switch (status.getCode()) {
            case ProtocolStatus.WOULDBLOCK:
              // retry ?
              ((FetchItemQueues) fetchQueues).addFetchItem(fit);
              break;
            case ProtocolStatus.SUCCESS:
              // got a page
              pstatus = output(url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
              updateStatus(content.getContent().length);
              if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                String newUrl = pstatus.getMessage();
                int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
                if (redirUrl != null) {
                  fit = queueRedirect(redirUrl, fit);
                }
              }
              break;
            // redirect
            case ProtocolStatus.MOVED:
            case ProtocolStatus.TEMP_MOVED:
              int code;
              boolean temp;
              if (status.getCode() == ProtocolStatus.MOVED) {
                code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                temp = false;
              } else {
                code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                temp = true;
              }
              output(url, fit.datum, content, status, code);
              String newUrl = status.getMessage();
              Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR);
              if (redirUrl != null) {
                fit = queueRedirect(redirUrl, fit);
              } else {
                // stop redirecting
                redirecting = false;
              }
              break;
            case ProtocolStatus.EXCEPTION:
              logError(url, status.getMessage());
              int killedURLs = ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit.getQueueID());
              if (killedURLs != 0)
                context.getCounter("FetcherStatus", "AboveExceptionThresholdInQueue").increment(killedURLs);
              // retry
            case ProtocolStatus.RETRY:
            case ProtocolStatus.BLOCKED:
              output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
              break;
            // gone
            case ProtocolStatus.GONE:
            case ProtocolStatus.NOTFOUND:
            case ProtocolStatus.ACCESS_DENIED:
            case ProtocolStatus.ROBOTS_DENIED:
              output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
              break;
            case ProtocolStatus.NOTMODIFIED:
              output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
              break;
            default:
              if (LOG.isWarnEnabled()) {
                LOG.warn(getName() + " " + Thread.currentThread().getId() + " Unknown ProtocolStatus: " + status.getCode());
              }
              output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
          }
          if (redirecting && redirectCount > maxRedirect) {
            ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
            if (LOG.isInfoEnabled()) {
              LOG.info(getName() + " " + Thread.currentThread().getId() + " - redirect count exceeded " + url);
            }
            output(url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatum.STATUS_FETCH_GONE);
          }
        } while (redirecting && (redirectCount <= maxRedirect));
      } catch (Throwable t) {
        // unexpected exception
        // unblock
        ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
        logError(url, StringUtils.stringifyException(t));
        output(url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY);
      }
    }
  } catch (Throwable e) {
    if (LOG.isErrorEnabled()) {
      LOG.error("fetcher caught:" + e.toString());
    }
  } finally {
    if (fit != null)
      ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
    // count threads
    activeThreads.decrementAndGet();
    LOG.info(getName() + " " + Thread.currentThread().getId() + " -finishing thread " + getName() + ", activeThreads=" + activeThreads);
  }
}
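Stripped of queue management, redirect handling, and event publishing, the per-URL protocol interaction in the loop above reduces to the following hedged sketch; it reuses only calls shown above, assumes variables such as url, datum, protocolFactory, and maxCrawlDelay from the surrounding class, and simplifies the crawl-delay check.

// Hedged distillation of the per-URL interaction; variables are assumed
// from the surrounding class and the crawl-delay check is simplified.
Protocol protocol = protocolFactory.getProtocol(url.toString());
BaseRobotRules rules = protocol.getRobotRules(url, datum, null);
if (rules.isAllowed(url.toString())
    && (rules.getCrawlDelay() <= 0 || maxCrawlDelay < 0 || rules.getCrawlDelay() <= maxCrawlDelay)) {
  ProtocolOutput out = protocol.getProtocolOutput(url, datum);
  if (out.getStatus().getCode() == ProtocolStatus.SUCCESS) {
    Content content = out.getContent(); // fetched bytes plus protocol metadata
  }
}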