Search in sources :

Example 1 with Protocol

use of org.apache.nutch.protocol.Protocol in project nutch by apache.

the class TestAny23ParseFilter method extract.

/**
 * Fetches the document behind {@code urlString}, forces its content type to
 * {@code contentType}, parses it and returns the values stored in the parse
 * metadata under {@link Any23ParseFilter#ANY23_TRIPLES}.
 *
 * <p>Any exception raised along the way is printed and reported through
 * {@code Assert.fail}.
 *
 * @param urlString   URL of the document to fetch; also echoed to standard output
 * @param file        not read by this method
 * @param contentType content type set on the fetched content before parsing
 * @return the metadata values for the triples key, or {@code null} if the
 *         failure path returns normally
 */
public String[] extract(String urlString, File file, String contentType) {
    String[] triples = null;
    try {
        System.out.println(urlString);
        ProtocolFactory protocolFactory = new ProtocolFactory(conf);
        Protocol fetcher = protocolFactory.getProtocol(urlString);
        Text fetchKey = new Text(urlString);
        CrawlDatum emptyDatum = new CrawlDatum();
        Content fetched = fetcher.getProtocolOutput(fetchKey, emptyDatum).getContent();
        // Override whatever type the protocol layer reported before parsing.
        fetched.setContentType(contentType);
        ParseUtil parser = new ParseUtil(conf);
        Parse parsed = parser.parse(fetched).get(fetched.getUrl());
        triples = parsed.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.toString());
    }
    return triples;
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) ParseException(org.apache.nutch.parse.ParseException) IOException(java.io.IOException)

Example 2 with Protocol

use of org.apache.nutch.protocol.Protocol in project nutch by apache.

the class TestExtParser method setUp.

/**
 * Prepares the fixture: writes {@code expectedText} to a fresh temporary file,
 * stores the file's URL in {@code urlString} and fetches it through the Nutch
 * protocol layer into {@code content}.
 *
 * <p>The temporary file is created under the directory named by the
 * {@code test.data} system property when that property is set, otherwise in
 * the default temporary-file directory.
 *
 * <p>NOTE(review): JUnit 4 normally expects {@code @Before} methods to be
 * public; the {@code protected} modifier is kept as found - confirm against
 * the runner in use.
 *
 * @throws ProtocolException declared for the protocol lookup/fetch
 * @throws IOException       if the temporary file cannot be created or written
 */
@Before
protected void setUp() throws ProtocolException, IOException {
    // prepare a temp file with expectedText as its content
    // This system property is defined in ./src/plugin/build-plugin.xml
    String path = System.getProperty("test.data");
    if (path != null) {
        File tempDir = new File(path);
        if (!tempDir.exists()) {
            // mkdirs() also creates missing parent directories, so a nested
            // test.data path does not make createTempFile fail below.
            tempDir.mkdirs();
        }
        tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt", tempDir);
    } else {
        // otherwise in java.io.tmpdir
        tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt");
    }
    urlString = tempFile.toURI().toURL().toString();
    FileOutputStream fos = new FileOutputStream(tempFile);
    try {
        fos.write(expectedText.getBytes());
    } finally {
        // Close even when the write fails so the file handle is not leaked.
        fos.close();
    }
    // get nutch content
    Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) FileOutputStream(java.io.FileOutputStream) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) File(java.io.File) Before(org.junit.Before)

Example 3 with Protocol

use of org.apache.nutch.protocol.Protocol in project nutch by apache.

the class TestRTFParser method testIt.

/**
 * Fetches the sample RTF file through the protocol layer, parses it with the
 * {@code parse-tika} extension and checks the extracted text, title and
 * Dublin Core subject. Currently disabled via {@code @Ignore}.
 *
 * @throws ProtocolException declared for the protocol lookup/fetch
 * @throws ParseException    declared for the parse step
 */
@Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
    Configuration conf = NutchConfiguration.create();
    String urlString = "file:" + sampleDir + fileSeparator + rtfFile;

    // Fetch the raw content.
    ProtocolFactory protocolFactory = new ProtocolFactory(conf);
    Protocol protocol = protocolFactory.getProtocol(urlString);
    Text fetchKey = new Text(urlString);
    Content content = protocol.getProtocolOutput(fetchKey, new CrawlDatum()).getContent();

    // Parse with the Tika plugin and pick the result keyed by the content URL.
    ParseUtil parseUtil = new ParseUtil(conf);
    Parse parse = parseUtil.parseByExtensionId("parse-tika", content).get(content.getUrl());

    String extractedText = parse.getText();
    Assert.assertEquals("The quick brown fox jumps over the lazy dog", extractedText.trim());

    String title = parse.getData().getTitle();
    Metadata parseMeta = parse.getData().getParseMeta();
    Assert.assertEquals("test rft document", title);
    Assert.assertEquals("tests", parseMeta.get(DublinCore.SUBJECT));
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 4 with Protocol

use of org.apache.nutch.protocol.Protocol in project nutch by apache.

the class TestZipParser method testIt.

/**
 * For every entry in {@code sampleFiles}, fetches the file through the
 * protocol layer, parses it with the {@code parse-zip} extension and asserts
 * that the extracted text starts with {@code expectedText}.
 *
 * @throws ProtocolException declared for the protocol lookup/fetch
 * @throws ParseException    declared for the parse step
 */
@Test
public void testIt() throws ProtocolException, ParseException {
    Configuration conf = NutchConfiguration.create();
    int index = 0;
    while (index < sampleFiles.length) {
        String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[index];

        // A fresh factory and parser per sample, exactly one fetch + parse each.
        Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
        Text fetchKey = new Text(urlString);
        Content content = protocol.getProtocolOutput(fetchKey, new CrawlDatum()).getContent();
        Parse parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get(content.getUrl());

        String failureMessage = "Extracted text does not start with <" + expectedText + ">: <" + parse.getText() + ">";
        boolean startsWithExpected = parse.getText().startsWith(expectedText);
        Assert.assertTrue(failureMessage, startsWithExpected);

        index++;
    }
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) Test(org.junit.Test)

Example 5 with Protocol

use of org.apache.nutch.protocol.Protocol in project nutch by apache.

the class FetcherThread method run.

/**
 * Main loop of a fetcher thread.
 *
 * <p>Repeatedly takes a {@code FetchItem} from {@code fetchQueues}, checks it
 * against the robots rules, fetches it through the matching {@link Protocol}
 * and hands the result to {@code output(...)} with a {@code CrawlDatum} status
 * derived from the {@code ProtocolStatus} code. Redirects are followed inside
 * an inner do/while loop for as long as {@code redirecting} is set and
 * {@code redirectCount} does not exceed {@code maxRedirect}.
 *
 * <p>The thread ends when {@code isHalted()} is true, or when no item is
 * available, the feeder is no longer alive and the queues are empty. While the
 * feeder is alive or items remain queued it sleeps 500 ms and retries.
 *
 * <p>{@code activeThreads} is incremented on entry and decremented in the
 * {@code finally} block. Any {@code Throwable} escaping the main loop is
 * logged and not rethrown.
 *
 * <p>The {@code fallthrough} warning is suppressed because the
 * {@code EXCEPTION} case intentionally continues into the retry cases.
 */
@SuppressWarnings("fallthrough")
public void run() {
    // count threads
    activeThreads.incrementAndGet();
    Text url = new Text();
    FetchItem fit = null;
    try {
        // checking for the server to be running and fetcher.parse to be true
        if (parsing && NutchServer.getInstance().isRunning())
            reportToNutchServer = true;
        while (true) {
            // creating FetchNode for storing in FetchNodeDb
            if (reportToNutchServer)
                this.fetchNode = new FetchNode();
            else
                this.fetchNode = null;
            // check whether must be stopped
            if (isHalted()) {
                LOG.debug(getName() + " set to halted");
                // clear fit so the finally block does not finish an item again
                fit = null;
                return;
            }
            fit = ((FetchItemQueues) fetchQueues).getFetchItem();
            if (fit != null) {
                URL u = fit.u;
                // URL.getPort() returns -1 when the URL has no explicit port.
                // Only append the port segment when one is actually present,
                // otherwise the rebuilt URL would read "scheme://host:-1/path".
                int port = u.getPort();
                String temp_url = u.getProtocol() + "://" + u.getHost() + (port == -1 ? "" : ":" + port) + u.getFile();
                url = new Text(temp_url);
            }
            if (fit == null) {
                if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
                    LOG.debug(getName() + " spin-waiting ...");
                    // spin-wait.
                    ((AtomicInteger) spinWaiting).incrementAndGet();
                    try {
                        Thread.sleep(500);
                    } catch (Exception e) {
                        // Best-effort wait: a failed or interrupted sleep just
                        // leads to the next poll of the queues.
                        // NOTE(review): this also swallows InterruptedException
                        // without restoring the interrupt flag - confirm intended.
                    }
                    ((AtomicInteger) spinWaiting).decrementAndGet();
                    continue;
                } else {
                    // all done, finish this thread
                    LOG.info(getName() + " " + Thread.currentThread().getId() + " has no more work available");
                    return;
                }
            }
            lastRequestStart.set(System.currentTimeMillis());
            // Prefer the representative URL stored in the datum metadata, if any.
            Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
            if (reprUrlWritable == null) {
                setReprUrl(url.toString());
            } else {
                setReprUrl(reprUrlWritable.toString());
            }
            try {
                // fetch the page
                redirecting = false;
                redirectCount = 0;
                // Publisher event
                if (activatePublisher) {
                    FetcherThreadEvent startEvent = new FetcherThreadEvent(PublishEventType.START, fit.getUrl().toString());
                    publisher.publish(startEvent, conf);
                }
                do {
                    if (LOG.isInfoEnabled()) {
                        LOG.info(getName() + " " + Thread.currentThread().getId() + " fetching " + url + " (queue crawl delay=" + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay + "ms)");
                    }
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("redirectCount=" + redirectCount);
                    }
                    // Reset per iteration; the loop only repeats if it is set
                    // again before the while condition is evaluated.
                    redirecting = false;
                    Protocol protocol = this.protocolFactory.getProtocol(url.toString());
                    BaseRobotRules rules = protocol.getRobotRules(url, fit.datum, robotsTxtContent);
                    if (robotsTxtContent != null) {
                        outputRobotsTxt(robotsTxtContent);
                        robotsTxtContent.clear();
                    }
                    if (!rules.isAllowed(fit.u.toString())) {
                        // unblock
                        ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Denied by robots.txt: " + url);
                        }
                        output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                        context.getCounter("FetcherStatus", "robots_denied").increment(1);
                        // redirecting is false here, so this leaves the do/while
                        continue;
                    }
                    if (rules.getCrawlDelay() > 0) {
                        if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                            // unblock
                            ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
                            LOG.debug("Crawl-Delay for " + url + " too long (" + rules.getCrawlDelay() + "), skipping");
                            output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                            context.getCounter("FetcherStatus", "robots_denied_maxcrawldelay").increment(1);
                            // redirecting is false here, so this leaves the do/while
                            continue;
                        } else {
                            // Adopt the delay requested by robots.txt for this queue.
                            FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
                            fiq.crawlDelay = rules.getCrawlDelay();
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + url);
                            }
                        }
                    }
                    ProtocolOutput output = protocol.getProtocolOutput(url, fit.datum);
                    ProtocolStatus status = output.getStatus();
                    Content content = output.getContent();
                    ParseStatus pstatus = null;
                    // unblock queue
                    ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                    String urlString = url.toString();
                    // used for FetchNode
                    if (fetchNode != null) {
                        fetchNode.setStatus(status.getCode());
                        fetchNode.setFetchTime(System.currentTimeMillis());
                        fetchNode.setUrl(url);
                    }
                    // Publish fetch finish event
                    if (activatePublisher) {
                        FetcherThreadEvent endEvent = new FetcherThreadEvent(PublishEventType.END, fit.getUrl().toString());
                        endEvent.addEventData("status", status.getName());
                        publisher.publish(endEvent, conf);
                    }
                    context.getCounter("FetcherStatus", status.getName()).increment(1);
                    switch(status.getCode()) {
                        case ProtocolStatus.WOULDBLOCK:
                            // retry ?
                            ((FetchItemQueues) fetchQueues).addFetchItem(fit);
                            break;
                        // got a page
                        case ProtocolStatus.SUCCESS:
                            pstatus = output(url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
                            updateStatus(content.getContent().length);
                            // A successful parse may itself report a redirect.
                            if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                                String newUrl = pstatus.getMessage();
                                int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                                Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
                                if (redirUrl != null) {
                                    fit = queueRedirect(redirUrl, fit);
                                }
                            }
                            break;
                        // redirect
                        case ProtocolStatus.MOVED:
                        case ProtocolStatus.TEMP_MOVED:
                            int code;
                            boolean temp;
                            if (status.getCode() == ProtocolStatus.MOVED) {
                                code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                                temp = false;
                            } else {
                                code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                                temp = true;
                            }
                            output(url, fit.datum, content, status, code);
                            String newUrl = status.getMessage();
                            Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR);
                            if (redirUrl != null) {
                                fit = queueRedirect(redirUrl, fit);
                            } else {
                                // stop redirecting
                                redirecting = false;
                            }
                            break;
                        case ProtocolStatus.EXCEPTION:
                            logError(url, status.getMessage());
                            int killedURLs = ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit.getQueueID());
                            if (killedURLs != 0)
                                context.getCounter("FetcherStatus", "AboveExceptionThresholdInQueue").increment(killedURLs);
                        // intentional fall-through: an exception is also recorded as a retry
                        // retry
                        case ProtocolStatus.RETRY:
                        case ProtocolStatus.BLOCKED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                            break;
                        // gone
                        case ProtocolStatus.GONE:
                        case ProtocolStatus.NOTFOUND:
                        case ProtocolStatus.ACCESS_DENIED:
                        case ProtocolStatus.ROBOTS_DENIED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
                            break;
                        case ProtocolStatus.NOTMODIFIED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
                            break;
                        default:
                            if (LOG.isWarnEnabled()) {
                                LOG.warn(getName() + " " + Thread.currentThread().getId() + " Unknown ProtocolStatus: " + status.getCode());
                            }
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                    }
                    // Too many redirects: release the item and record it as gone.
                    if (redirecting && redirectCount > maxRedirect) {
                        ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                        if (LOG.isInfoEnabled()) {
                            LOG.info(getName() + " " + Thread.currentThread().getId() + "  - redirect count exceeded " + url);
                        }
                        output(url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatum.STATUS_FETCH_GONE);
                    }
                } while (redirecting && (redirectCount <= maxRedirect));
            } catch (Throwable t) {
                // unexpected exception
                // unblock
                ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                logError(url, StringUtils.stringifyException(t));
                output(url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY);
            }
        }
    } catch (Throwable e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:" + e.toString());
        }
    } finally {
        // Release whatever item was still held when the loop ended.
        if (fit != null)
            ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
        // count threads
        activeThreads.decrementAndGet();
        LOG.info(getName() + " " + Thread.currentThread().getId() + " -finishing thread " + getName() + ", activeThreads=" + activeThreads);
    }
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) URL(java.net.URL) ScoringFilterException(org.apache.nutch.scoring.ScoringFilterException) URLFilterException(org.apache.nutch.net.URLFilterException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ParseStatus(org.apache.nutch.parse.ParseStatus) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Content(org.apache.nutch.protocol.Content) Protocol(org.apache.nutch.protocol.Protocol) BaseRobotRules(crawlercommons.robots.BaseRobotRules) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)

Aggregations

Text (org.apache.hadoop.io.Text)16 Protocol (org.apache.nutch.protocol.Protocol)16 ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)15 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)13 Content (org.apache.nutch.protocol.Content)13 Parse (org.apache.nutch.parse.Parse)11 ParseUtil (org.apache.nutch.parse.ParseUtil)11 Configuration (org.apache.hadoop.conf.Configuration)7 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)7 Test (org.junit.Test)7 ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)4 IOException (java.io.IOException)2 Map (java.util.Map)2 Metadata (org.apache.nutch.metadata.Metadata)2 BaseRobotRules (crawlercommons.robots.BaseRobotRules)1 File (java.io.File)1 FileOutputStream (java.io.FileOutputStream)1 MalformedURLException (java.net.MalformedURLException)1 URL (java.net.URL)1 HashMap (java.util.HashMap)1