Example 26 with CrawlDatum

use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

the class Ftp method main.

/**
 * For debugging.
 */
public static void main(String[] args) throws Exception {
    int timeout = Integer.MIN_VALUE;
    int maxContentLength = Integer.MIN_VALUE;
    String logLevel = "info";
    boolean followTalk = false;
    boolean keepConnection = false;
    boolean dumpContent = false;
    String urlString = null;
    String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";
    if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
    }
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-logLevel")) {
            logLevel = args[++i];
        } else if (args[i].equals("-followTalk")) {
            followTalk = true;
        } else if (args[i].equals("-keepConnection")) {
            keepConnection = true;
        } else if (args[i].equals("-timeout")) {
            timeout = Integer.parseInt(args[++i]) * 1000;
        } else if (args[i].equals("-maxContentLength")) {
            maxContentLength = Integer.parseInt(args[++i]);
        } else if (args[i].equals("-dumpContent")) {
            dumpContent = true;
        } else if (i != args.length - 1) {
            System.err.println(usage);
            System.exit(-1);
        } else {
            urlString = args[i];
        }
    }
    Ftp ftp = new Ftp();
    ftp.setFollowTalk(followTalk);
    ftp.setKeepConnection(keepConnection);
    // set timeout
    if (timeout != Integer.MIN_VALUE)
        ftp.setTimeout(timeout);
    // set maxContentLength
    if (maxContentLength != Integer.MIN_VALUE)
        ftp.setMaxContentLength(maxContentLength);
    // set log level
    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
    Content content = ftp.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    System.err.println("Content-Type: " + content.getContentType());
    System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
    System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED));
    if (dumpContent) {
        System.out.print(new String(content.getContent()));
    }
    ftp = null;
}
Also used : Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text)
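
A minimal sketch of the same fetch driven through ProtocolFactory instead of instantiating Ftp directly, so the plugin registered for the URL scheme is looked up from the configuration. This is not taken from the Nutch sources; the class name FtpFetchSketch and the ftp:// URL are placeholders, and the protocol-ftp plugin is assumed to be enabled via plugin.includes.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;

public class FtpFetchSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        // Placeholder URL; assumes the protocol-ftp plugin is enabled in plugin.includes
        String urlString = "ftp://ftp.example.org/pub/README";
        // The factory returns the protocol implementation registered for the URL's scheme
        Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
        Content content = protocol
                .getProtocolOutput(new Text(urlString), new CrawlDatum())
                .getContent();
        System.out.println("Content-Type: " + content.getContentType());
    }
}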

Example 27 with CrawlDatum

use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

the class FtpRobotRulesParser method getRobotRulesSet.

/**
 * For hosts whose robots rules are not yet cached, sends an FTP request to
 * the host of the given {@link URL}, fetches the robots.txt file, parses the
 * rules, and caches the resulting rules object to avoid re-work in the
 * future.
 *
 * @param ftp
 *          The {@link Protocol} object
 * @param url
 *          URL
 * @param robotsTxtContent
 *          container to store responses when fetching the robots.txt file for
 *          debugging or archival purposes. Instead of a robots.txt file, it
 *          may include redirects or an error page (404, etc.). Response
 *          {@link Content} is appended to the passed list. If null is passed,
 *          nothing is stored.
 *
 * @return robotRules A {@link BaseRobotRules} object for the rules
 */
@Override
public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url, List<Content> robotsTxtContent) {
    // normalize to lower case
    String protocol = url.getProtocol().toLowerCase();
    // normalize to lower case
    String host = url.getHost().toLowerCase();
    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
        LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
    }
    BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
    if (robotRules != null) {
        // cached rule
        return robotRules;
    } else if (LOG.isTraceEnabled()) {
        LOG.trace("cache miss " + url);
    }
    boolean cacheRule = true;
    if (isWhiteListed(url)) {
        // check in advance whether a host is whitelisted
        // (we do not need to fetch robots.txt)
        robotRules = EMPTY_RULES;
        LOG.info("Whitelisted host found for: {}", url);
        LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
    } else {
        try {
            Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
            ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl, new CrawlDatum());
            ProtocolStatus status = output.getStatus();
            if (robotsTxtContent != null) {
                robotsTxtContent.add(output.getContent());
            }
            if (status.getCode() == ProtocolStatus.SUCCESS) {
                robotRules = parseRules(url.toString(), output.getContent().getContent(), CONTENT_TYPE, agentNames);
            } else {
                // use default rules
                robotRules = EMPTY_RULES;
            }
        } catch (Throwable t) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
            }
            // try again later to fetch robots.txt
            cacheRule = false;
            robotRules = EMPTY_RULES;
        }
    }
    if (cacheRule)
        // cache rules for host
        CACHE.put(protocol + ":" + host, robotRules);
    return robotRules;
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) BaseRobotRules(crawlercommons.robots.BaseRobotRules) URL(java.net.URL) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)
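
A minimal sketch of how a caller might consult these rules before fetching. This is an assumption rather than code from the Nutch sources: the helper name mayFetch is made up, while isAllowed comes from the crawler-commons BaseRobotRules API and the package of FtpRobotRulesParser matches the protocol-ftp plugin.

import java.net.URL;
import crawlercommons.robots.BaseRobotRules;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ftp.FtpRobotRulesParser;

public class FtpRobotsCheckSketch {
    /** Hypothetical helper: decide whether an FTP URL may be fetched. */
    static boolean mayFetch(FtpRobotRulesParser parser, Protocol ftp, URL url) {
        // Rules are cached per "protocol:host", so repeated calls for the same host are cheap
        BaseRobotRules rules = parser.getRobotRulesSet(ftp, url, null);
        return rules.isAllowed(url.toString());
    }
}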

Example 28 with CrawlDatum

use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

the class TestCrawlDbStates method testCrawlDbStateTransitionMatrix.

/**
 * Test the matrix of state transitions:
 * <ul>
 * <li>for all available {@link FetchSchedule} implementations</li>
 * <li>for every possible status in CrawlDb (including "not in CrawlDb")</li>
 * <li>for every possible fetch status</li>
 * <li>and zero or more (0-3) additional in-links</li>
 * </ul>
 * call {@literal updatedb} and check whether the resulting CrawlDb status is
 * the expected one.
 */
@Test
public void testCrawlDbStateTransitionMatrix() {
    LOG.info("Test CrawlDatum state transitions");
    Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context context = CrawlDBTestUtil.createContext();
    Configuration conf = context.getConfiguration();
    CrawlDbUpdateUtil updateDb = null;
    try {
        updateDb = new CrawlDbUpdateUtil(new CrawlDbReducer(), context);
    } catch (IOException e) {
        e.printStackTrace();
    }
    int retryMax = conf.getInt("db.fetch.retry.max", 3);
    for (String sched : schedules) {
        LOG.info("Testing state transitions with " + sched);
        conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
        FetchSchedule schedule = FetchScheduleFactory.getFetchSchedule(conf);
        for (int i = 0; i < fetchDbStatusPairs.length; i++) {
            byte fromDbStatus = fetchDbStatusPairs[i][1];
            for (int j = 0; j < fetchDbStatusPairs.length; j++) {
                byte fetchStatus = fetchDbStatusPairs[j][0];
                CrawlDatum fromDb = null;
                if (fromDbStatus == -1) {
                    // nothing yet in CrawlDb
                    // CrawlDatum added by FreeGenerator or via outlink
                } else {
                    fromDb = new CrawlDatum();
                    fromDb.setStatus(fromDbStatus);
                    // initialize fetchInterval:
                    schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
                }
                // expected db status
                byte toDbStatus = fetchDbStatusPairs[j][1];
                if (fetchStatus == -1) {
                    if (fromDbStatus == -1) {
                        // nothing fetched yet: new document detected via outlink
                        toDbStatus = STATUS_DB_UNFETCHED;
                    } else {
                        // nothing fetched but new inlinks detected: status is unchanged
                        toDbStatus = fromDbStatus;
                    }
                } else if (fetchStatus == STATUS_FETCH_RETRY) {
                    // a simple test of fetch_retry (without retries)
                    if (fromDb == null || fromDb.getRetriesSinceFetch() < retryMax) {
                        toDbStatus = STATUS_DB_UNFETCHED;
                    } else {
                        toDbStatus = STATUS_DB_GONE;
                    }
                }
                String fromDbStatusName = (fromDbStatus == -1 ? "<not in CrawlDb>" : getStatusName(fromDbStatus));
                String fetchStatusName = (fetchStatus == -1 ? "<only inlinks>" : CrawlDatum.getStatusName(fetchStatus));
                LOG.info(fromDbStatusName + " + " + fetchStatusName + " => " + getStatusName(toDbStatus));
                List<CrawlDatum> values = new ArrayList<CrawlDatum>();
                for (int l = 0; l <= 2; l++) {
                    // number of additional in-links
                    CrawlDatum fetch = null;
                    if (fetchStatus == -1) {
                        // nothing fetched, need at least one in-link
                        if (l == 0)
                            continue;
                    } else {
                        fetch = new CrawlDatum();
                        if (fromDb != null) {
                            fetch.set(fromDb);
                        } else {
                            // not yet in CrawlDb: added by FreeGenerator
                            schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fetch);
                        }
                        fetch.setStatus(fetchStatus);
                        fetch.setFetchTime(System.currentTimeMillis());
                    }
                    if (fromDb != null)
                        values.add(fromDb);
                    if (fetch != null)
                        values.add(fetch);
                    for (int n = 0; n < l; n++) {
                        values.add(linked);
                    }
                    List<CrawlDatum> res = updateDb.update(values);
                    if (res.size() != 1) {
                        fail("CrawlDb update didn't result in one single CrawlDatum per URL");
                        continue;
                    }
                    byte status = res.get(0).getStatus();
                    if (status != toDbStatus) {
                        fail("CrawlDb update for " + fromDbStatusName + " and " + fetchStatusName + " and " + l + " inlinks results in " + getStatusName(status) + " (expected: " + getStatusName(toDbStatus) + ")");
                    }
                    values.clear();
                }
            }
        }
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) IOException(java.io.IOException) Reducer(org.apache.hadoop.mapreduce.Reducer) Test(org.junit.Test)
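
To make one cell of the matrix concrete, the sketch below checks a single transition: db_unfetched plus fetch_success should end up as db_fetched. It is a hypothetical test, assuming the same scaffolding as the class above (CrawlDbUpdateUtil, CrawlDBTestUtil, CrawlDbReducer) and its imports, and has not been verified against every FetchSchedule implementation.

@Test
public void testSingleTransitionSketch() throws Exception {
    // Same test harness as TestCrawlDbStates above
    Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context context = CrawlDBTestUtil.createContext();
    CrawlDbUpdateUtil updateDb = new CrawlDbUpdateUtil(new CrawlDbReducer(), context);
    // Existing CrawlDb entry: unfetched
    CrawlDatum fromDb = new CrawlDatum();
    fromDb.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
    // Fetcher output: success
    CrawlDatum fetch = new CrawlDatum();
    fetch.set(fromDb);
    fetch.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
    fetch.setFetchTime(System.currentTimeMillis());
    List<CrawlDatum> values = new ArrayList<CrawlDatum>();
    values.add(fromDb);
    values.add(fetch);
    List<CrawlDatum> res = updateDb.update(values);
    Assert.assertEquals(CrawlDatum.STATUS_DB_FETCHED, res.get(0).getStatus());
}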

Example 29 with CrawlDatum

use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

the class TestIndexingFilters method testNonExistingIndexingFilter.

/**
 * Test behaviour when a configured indexing filter class does not exist.
 *
 * @throws IndexingException
 */
@Test
public void testNonExistingIndexingFilter() throws IndexingException {
    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");
    String class1 = "NonExistingFilter";
    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)
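
For contrast, a hypothetical happy-path variant (not part of the Nutch test suite): configure only the existing BasicIndexingFilter and check that filtering returns a document instead of failing on a missing class. It assumes the same imports and test resources as the test above.

@Test
public void testExistingIndexingFilterSketch() throws IndexingException {
    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");
    // Only a filter class that actually exists on the classpath
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, "org.apache.nutch.indexer.basic.BasicIndexingFilter");
    IndexingFilters filters = new IndexingFilters(conf);
    NutchDocument doc = filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertNotNull(doc);
}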

Example 30 with CrawlDatum

use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

the class TestSegmentMergerCrawlDatums method createSegment.

protected void createSegment(Path segment, byte status, boolean fetch, boolean redirect) throws Exception {
    LOG.info("\nSegment: " + segment.toString());
    // The URL of our main record
    String url = "http://nutch.apache.org/";
    // The URL of our redirecting URL
    String redirectUrl = "http://nutch.apache.org/i_redirect_to_the_root/";
    // Our value
    CrawlDatum value = new CrawlDatum();
    // Path of the segment's crawl_fetch directory
    Path crawlFetchPath = new Path(new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000");
    // Get a writer for map files containing <Text,CrawlDatum> pairs
    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
    MapFile.Writer writer = new MapFile.Writer(conf, crawlFetchPath, wKeyOpt, wValueOpt);
    // A redirect implies a linked datum for its target URL when merging
    if (redirect) {
        // We're writing our main record URL with status linked
        LOG.info(url + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_LINKED));
        value = new CrawlDatum();
        value.setStatus(CrawlDatum.STATUS_LINKED);
        writer.append(new Text(url), value);
    }
    // Whether we're fetching now
    if (fetch) {
        LOG.info(url + " > " + CrawlDatum.getStatusName(status));
        // Set the status
        value.setStatus(status);
        // Write the pair and ok
        writer.append(new Text(url), value);
    }
    // Whether we're handling a redirect now
    if (redirect) {
        // And the redirect URL with redirect status, pointing to our main URL
        LOG.info(redirectUrl + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_FETCH_REDIR_TEMP));
        value.setStatus(CrawlDatum.STATUS_FETCH_REDIR_TEMP);
        writer.append(new Text(redirectUrl), value);
    }
    // Close the stuff
    writer.close();
}
Also used : Path(org.apache.hadoop.fs.Path) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) MapFile(org.apache.hadoop.io.MapFile) Option(org.apache.hadoop.io.MapFile.Writer.Option) Text(org.apache.hadoop.io.Text)
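
A minimal companion sketch (an assumption, not from the test class): reading the crawl_fetch MapFile back to inspect the <Text, CrawlDatum> pairs that createSegment wrote. The MapFile.Reader(Path, Configuration) constructor is the Hadoop 2.x form; conf and LOG are assumed to come from the surrounding test class as above.

protected void dumpSegmentSketch(Path segment) throws Exception {
    // Same part file that createSegment writes to
    Path crawlFetchPath = new Path(new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000");
    MapFile.Reader reader = new MapFile.Reader(crawlFetchPath, conf);
    Text key = new Text();
    CrawlDatum value = new CrawlDatum();
    // Iterate over all <Text, CrawlDatum> entries in key order
    while (reader.next(key, value)) {
        LOG.info(key + " > " + CrawlDatum.getStatusName(value.getStatus()));
    }
    reader.close();
}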

Aggregations

CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 66 uses
Text (org.apache.hadoop.io.Text): 60 uses
Test (org.junit.Test): 31 uses
Inlinks (org.apache.nutch.crawl.Inlinks): 25 uses
Configuration (org.apache.hadoop.conf.Configuration): 24 uses
ParseData (org.apache.nutch.parse.ParseData): 22 uses
ParseImpl (org.apache.nutch.parse.ParseImpl): 21 uses
NutchDocument (org.apache.nutch.indexer.NutchDocument): 20 uses
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 20 uses
Content (org.apache.nutch.protocol.Content): 19 uses
Parse (org.apache.nutch.parse.Parse): 15 uses
Metadata (org.apache.nutch.metadata.Metadata): 14 uses
ParseStatus (org.apache.nutch.parse.ParseStatus): 14 uses
ParseUtil (org.apache.nutch.parse.ParseUtil): 13 uses
Protocol (org.apache.nutch.protocol.Protocol): 13 uses
ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 13 uses
URL (java.net.URL): 11 uses
Outlink (org.apache.nutch.parse.Outlink): 11 uses
IOException (java.io.IOException): 7 uses
ArrayList (java.util.ArrayList): 5 uses