Example 61 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project.

From the class IndexingFiltersChecker, method process():

protected int process(String url, StringBuilder output) throws Exception {
    if (normalizers != null) {
        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
    }
    LOG.info("fetching: " + url);
    CrawlDatum datum = new CrawlDatum();
    Iterator<String> iter = metadata.keySet().iterator();
    while (iter.hasNext()) {
        String key = iter.next();
        String value = metadata.get(key);
        if (value == null)
            value = "";
        datum.getMetaData().put(new Text(key), new Text(value));
    }
    IndexingFilters indexers = new IndexingFilters(getConf());
    int maxRedirects = 3;
    ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
    Text turl = new Text(url);
    // keep following redirects while the status is a redirect and maxRedirects is not exhausted
    while (!protocolOutput.getStatus().isSuccess() && followRedirects && protocolOutput.getStatus().isRedirect() && maxRedirects != 0) {
        String[] stuff = protocolOutput.getStatus().getArgs();
        url = stuff[0];
        if (normalizers != null) {
            url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
        }
        turl.set(url);
        // try again
        protocolOutput = getProtocolOutput(url, datum);
        maxRedirects--;
    }
    if (!protocolOutput.getStatus().isSuccess()) {
        output.append("Fetch failed with protocol status: " + protocolOutput.getStatus() + "\n");
        return 0;
    }
    Content content = protocolOutput.getContent();
    if (content == null) {
        output.append("No content for " + url + "\n");
        return 0;
    }
    String contentType = content.getContentType();
    if (contentType == null) {
        return -1;
    }
    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    }
    ScoringFilters scfilters = new ScoringFilters(getConf());
    // call the scoring filters
    try {
        scfilters.passScoreBeforeParsing(turl, datum, content);
    } catch (Exception e) {
        LOG.warn("Couldn't pass score, url {} ({})", url, e);
    }
    LOG.info("parsing: {}", url);
    LOG.info("contentType: {}", contentType);
    ParseResult parseResult = new ParseUtil(getConf()).parse(content);
    NutchDocument doc = new NutchDocument();
    doc.add("id", url);
    Text urlText = new Text(url);
    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    if (parse == null) {
        LOG.error("Failed to get parse from parse result");
        LOG.error("Available parses in parse result (by URL key):");
        for (Map.Entry<Text, Parse> entry : parseResult) {
            LOG.error("  " + entry.getKey());
        }
        LOG.error("Parse result does not contain a parse for URL to be checked:");
        LOG.error("  " + urlText);
        return -1;
    }
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
    parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
    String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
    doc.add("digest", digest);
    datum.setSignature(signature);
    // call the scoring filters
    try {
        scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
    } catch (Exception e) {
        LOG.warn("Couldn't pass score, url {} ({})", turl, e);
    }
    try {
        doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
        e.printStackTrace();
    }
    if (doc == null) {
        output.append("Document discarded by indexing filter\n");
        return 0;
    }
    for (String fname : doc.getFieldNames()) {
        List<Object> values = doc.getField(fname).getValues();
        if (values != null) {
            for (Object value : values) {
                String str = value.toString();
                int minText = dumpText ? str.length() : Math.min(100, str.length());
                output.append(fname + " :\t" + str.substring(0, minText) + "\n");
            }
        }
    }
    // For readability if keepClientCnxOpen
    output.append("\n");
    if (getConf().getBoolean("doIndex", false) && doc != null) {
        IndexWriters writers = new IndexWriters(getConf());
        writers.open(getConf(), "IndexingFilterChecker");
        writers.write(doc);
        writers.close();
    }
    return 0;
}
Also used: ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput), ParseResult (org.apache.nutch.parse.ParseResult), ParseUtil (org.apache.nutch.parse.ParseUtil), Parse (org.apache.nutch.parse.Parse), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), Inlinks (org.apache.nutch.crawl.Inlinks), Content (org.apache.nutch.protocol.Content), ScoringFilters (org.apache.nutch.scoring.ScoringFilters), HashMap (java.util.HashMap), Map (java.util.Map)
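
To isolate the metadata-handling step at the top of process(), here is a minimal standalone sketch (not part of the Nutch sources; the metadata keys are made up for illustration) that populates a CrawlDatum's MapWritable metadata in the same way before the datum is handed to the protocol layer:

import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class CrawlDatumMetadataSketch {
    public static void main(String[] args) {
        CrawlDatum datum = new CrawlDatum();
        // Copy user-supplied key/value pairs into the datum's metadata;
        // null values become empty strings, as in IndexingFiltersChecker.
        String[][] metadata = { { "Accept-Language", "en-US" }, { "custom-flag", null } };
        for (String[] entry : metadata) {
            String value = entry[1] == null ? "" : entry[1];
            datum.getMetaData().put(new Text(entry[0]), new Text(value));
        }
        System.out.println(datum);
    }
}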

Example 62 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project.

From the class ParseOutputFormat, method getRecordWriter():

public RecordWriter<Text, Parse> getRecordWriter(TaskAttemptContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    String name = getUniqueFile(context, "part");
    Path dir = FileOutputFormat.getOutputPath(context);
    FileSystem fs = dir.getFileSystem(context.getConfiguration());
    if (conf.getBoolean("parse.filter.urls", true)) {
        filters = new URLFilters(conf);
        exemptionFilters = new URLExemptionFilters(conf);
    }
    if (conf.getBoolean("parse.normalize.urls", true)) {
        normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
    }
    this.scfilters = new ScoringFilters(conf);
    final int interval = conf.getInt("db.fetch.interval.default", 2592000);
    final boolean ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", false);
    final boolean ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
    final String ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost");
    // NUTCH-2435 - the parameter "parser.store.text" controls whether the
    // 'parse_text' directory is stored or not:
    final boolean storeText = conf.getBoolean("parser.store.text", true);
    int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
    final boolean isParsing = conf.getBoolean("fetcher.parse", true);
    final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(context);
    Path out = FileOutputFormat.getOutputPath(context);
    Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
    Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
    Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);
    final String[] parseMDtoCrawlDB = conf.get("db.parsemeta.to.crawldb", "").split(" *, *");
    // textOut Options
    final MapFile.Writer textOut;
    if (storeText) {
        Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
        org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class);
        org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
        org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD);
        textOut = new MapFile.Writer(conf, text, tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt);
    } else {
        textOut = null;
    }
    // dataOut Options
    Option dKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option dValClassOpt = SequenceFile.Writer.valueClass(ParseData.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option dProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
    org.apache.hadoop.io.SequenceFile.Writer.Option dCompOpt = SequenceFile.Writer.compression(compType);
    final MapFile.Writer dataOut = new MapFile.Writer(conf, data, dKeyClassOpt, dValClassOpt, dCompOpt, dProgressOpt);
    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(conf, SequenceFile.Writer.file(crawl), SequenceFile.Writer.keyClass(Text.class), SequenceFile.Writer.valueClass(CrawlDatum.class), SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size", 4096)), SequenceFile.Writer.replication(fs.getDefaultReplication(crawl)), SequenceFile.Writer.blockSize(1073741824), SequenceFile.Writer.compression(compType, new DefaultCodec()), SequenceFile.Writer.progressable((Progressable) context), SequenceFile.Writer.metadata(new Metadata()));
    return new RecordWriter<Text, Parse>() {

        public void write(Text key, Parse parse) throws IOException {
            String fromUrl = key.toString();
            // host or domain name of the source URL
            String origin = null;
            if (textOut != null) {
                textOut.append(key, new ParseText(parse.getText()));
            }
            ParseData parseData = parse.getData();
            // recover the signature prepared by Fetcher or ParseSegment
            String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
            if (sig != null) {
                byte[] signature = StringUtil.fromHexString(sig);
                if (signature != null) {
                    // append a CrawlDatum with a signature
                    CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
                    d.setSignature(signature);
                    crawlOut.append(key, d);
                }
            }
            // see if the parse metadata contains entries that we'd like
            // to pass on to the metadata of the CrawlDb entry
            CrawlDatum parseMDCrawlDatum = null;
            for (String mdname : parseMDtoCrawlDB) {
                String mdvalue = parse.getData().getParseMeta().get(mdname);
                if (mdvalue != null) {
                    if (parseMDCrawlDatum == null)
                        parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, 0);
                    parseMDCrawlDatum.getMetaData().put(new Text(mdname), new Text(mdvalue));
                }
            }
            if (parseMDCrawlDatum != null)
                crawlOut.append(key, parseMDCrawlDatum);
            // need to determine origin (once for all outlinks)
            if (ignoreExternalLinks || ignoreInternalLinks) {
                URL originURL = new URL(fromUrl.toString());
                // based on domain?
                if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
                    origin = URLUtil.getDomainName(originURL).toLowerCase();
                } else {
                    // otherwise, use the host name
                    origin = originURL.getHost().toLowerCase();
                }
            }
            ParseStatus pstatus = parseData.getStatus();
            if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                String newUrl = pstatus.getMessage();
                int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                newUrl = filterNormalize(fromUrl, newUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers, URLNormalizers.SCOPE_FETCHER);
                if (newUrl != null) {
                    String reprUrl = URLUtil.chooseRepr(fromUrl, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME);
                    CrawlDatum newDatum = new CrawlDatum();
                    newDatum.setStatus(CrawlDatum.STATUS_LINKED);
                    if (reprUrl != null && !reprUrl.equals(newUrl)) {
                        newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
                    }
                    crawlOut.append(new Text(newUrl), newDatum);
                }
            }
            // collect outlinks for subsequent db update
            Outlink[] links = parseData.getOutlinks();
            int outlinksToStore = Math.min(maxOutlinks, links.length);
            int validCount = 0;
            CrawlDatum adjust = null;
            List<Entry<Text, CrawlDatum>> targets = new ArrayList<>(outlinksToStore);
            List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
            for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                String toUrl = links[i].getToUrl();
                // only normalize and filter if fetcher.parse = false
                if (!isParsing) {
                    toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers);
                    if (toUrl == null) {
                        continue;
                    }
                }
                CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
                Text targetUrl = new Text(toUrl);
                // see if the outlink has any metadata attached
                // and if so pass that to the crawldatum so that
                // the initial score or distribution can use that
                MapWritable outlinkMD = links[i].getMetadata();
                if (outlinkMD != null) {
                    target.getMetaData().putAll(outlinkMD);
                }
                try {
                    scfilters.initialScore(targetUrl, target);
                } catch (ScoringFilterException e) {
                    LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
                    target.setScore(0.0f);
                }
                targets.add(new SimpleEntry(targetUrl, target));
                // overwrite URL in Outlink object with normalized URL (NUTCH-1174)
                links[i].setUrl(toUrl);
                outlinkList.add(links[i]);
                validCount++;
            }
            try {
                // compute score contributions and adjustment to the original score
                adjust = scfilters.distributeScoreToOutlinks(key, parseData, targets, null, links.length);
            } catch (ScoringFilterException e) {
                LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
            }
            for (Entry<Text, CrawlDatum> target : targets) {
                crawlOut.append(target.getKey(), target.getValue());
            }
            if (adjust != null)
                crawlOut.append(key, adjust);
            Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
            parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks, parseData.getContentMeta(), parseData.getParseMeta());
            dataOut.append(key, parseData);
            if (!parse.isCanonical()) {
                CrawlDatum datum = new CrawlDatum();
                datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
                String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
                try {
                    datum.setFetchTime(Long.parseLong(timeString));
                } catch (Exception e) {
                    LOG.warn("Can't read fetch time for: " + key);
                    datum.setFetchTime(System.currentTimeMillis());
                }
                crawlOut.append(key, datum);
            }
        }

        public void close(TaskAttemptContext context) throws IOException {
            if (textOut != null)
                textOut.close();
            dataOut.close();
            crawlOut.close();
        }
    };
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), Metadata (org.apache.hadoop.io.SequenceFile.Metadata), ArrayList (java.util.ArrayList), MapFile (org.apache.hadoop.io.MapFile), DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec), URL (java.net.URL), Entry (java.util.Map.Entry), RecordWriter (org.apache.hadoop.mapreduce.RecordWriter), SequenceFile (org.apache.hadoop.io.SequenceFile), FileSystem (org.apache.hadoop.fs.FileSystem), ScoringFilters (org.apache.nutch.scoring.ScoringFilters), URLFilters (org.apache.nutch.net.URLFilters), Path (org.apache.hadoop.fs.Path), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), MapWritable (org.apache.hadoop.io.MapWritable), MalformedURLException (java.net.MalformedURLException), ScoringFilterException (org.apache.nutch.scoring.ScoringFilterException), IOException (java.io.IOException), Progressable (org.apache.hadoop.util.Progressable), Option (org.apache.hadoop.io.MapFile.Writer.Option), CompressionType (org.apache.hadoop.io.SequenceFile.CompressionType), URLExemptionFilters (org.apache.nutch.net.URLExemptionFilters), URLNormalizers (org.apache.nutch.net.URLNormalizers)
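
The write() method above emits several kinds of CrawlDatum into the crawl_parse output. As a rough standalone sketch of the two simplest ones (the interval, metadata key, and hex digest below are illustrative assumptions, not values from a real crawl):

import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.util.StringUtil;

public class OutlinkDatumSketch {
    public static void main(String[] args) {
        // A STATUS_LINKED datum, as written for every kept outlink, carrying a
        // fetch interval (db.fetch.interval.default is 2592000 s, i.e. 30 days).
        CrawlDatum linked = new CrawlDatum(CrawlDatum.STATUS_LINKED, 2592000);
        linked.getMetaData().put(new Text("outlink.meta"), new Text("example"));

        // A STATUS_SIGNATURE datum, as written when the parse metadata carries
        // a content digest prepared by Fetcher or ParseSegment.
        CrawlDatum sig = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
        sig.setSignature(StringUtil.fromHexString("d41d8cd98f00b204e9800998ecf8427e"));

        System.out.println(linked);
        System.out.println(sig);
    }
}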

Example 63 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project.

From the class SegmentReader, method getStats():

public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
    long cnt = 0L;
    Text key = new Text();
    Text val = new Text();
    FileSystem fs = segment.getFileSystem(getConf());
    if (ge) {
        SequenceFile.Reader[] readers = SegmentReaderUtil.getReaders(new Path(segment, CrawlDatum.GENERATE_DIR_NAME), getConf());
        for (int i = 0; i < readers.length; i++) {
            while (readers[i].next(key, val)) cnt++;
            readers[i].close();
        }
        stats.generated = cnt;
    }
    if (fe) {
        Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
        if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDirectory()) {
            cnt = 0L;
            long start = Long.MAX_VALUE;
            long end = Long.MIN_VALUE;
            CrawlDatum value = new CrawlDatum();
            MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fetchDir, getConf());
            for (int i = 0; i < mreaders.length; i++) {
                while (mreaders[i].next(key, value)) {
                    cnt++;
                    if (value.getFetchTime() < start)
                        start = value.getFetchTime();
                    if (value.getFetchTime() > end)
                        end = value.getFetchTime();
                }
                mreaders[i].close();
            }
            stats.start = start;
            stats.end = end;
            stats.fetched = cnt;
        }
    }
    if (pd) {
        Path parseDir = new Path(segment, ParseData.DIR_NAME);
        if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDirectory()) {
            cnt = 0L;
            long errors = 0L;
            ParseData value = new ParseData();
            MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(parseDir, getConf());
            for (int i = 0; i < mreaders.length; i++) {
                while (mreaders[i].next(key, value)) {
                    cnt++;
                    if (!value.getStatus().isSuccess())
                        errors++;
                }
                mreaders[i].close();
            }
            stats.parsed = cnt;
            stats.parseErrors = errors;
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ParseData (org.apache.nutch.parse.ParseData), FileSystem (org.apache.hadoop.fs.FileSystem), InputStreamReader (java.io.InputStreamReader), BufferedReader (java.io.BufferedReader), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), ParseText (org.apache.nutch.parse.ParseText)
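
getStats() re-uses a single key and value instance while scanning each part file. As a standalone illustration, here is a minimal sketch under the assumption that args[0] points at the data file of one MapFile under <segment>/crawl_fetch (any sequence file of <Text, CrawlDatum> records will do):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class CrawlFetchScanSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                SequenceFile.Reader.file(new Path(args[0])));
        try {
            // Re-use the same key/value instances across records, as getStats() does.
            Text key = new Text();
            CrawlDatum value = new CrawlDatum();
            while (reader.next(key, value)) {
                System.out.println(key + "\tfetchTime=" + value.getFetchTime());
            }
        } finally {
            reader.close();
        }
    }
}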

Example 64 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project.

From the class TestFeedParser, method testParseFetchChannel():

/**
 * Calls the {@link FeedParser} on a sample RSS file and checks that the
 * {@link ParseResult} contains 3 entries, including the two links below:
 * <ul>
 * <li>http://www-scf.usc.edu/~mattmann/</li>
 * <li>http://www.nutch.org</li>
 * </ul>
 *
 * @throws ProtocolNotFound
 *           If the {@link Protocol} layer cannot be loaded (required to fetch
 *           the {@link Content} for the RSS file).
 * @throws ParseException
 *           If the {@link Parser} layer cannot be loaded.
 */
@Test
public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    ParseResult parseResult;
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
        urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        urlString = urlString.replace('\\', '/');
        protocol = new ProtocolFactory(conf).getProtocol(urlString);
        content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
        Assert.assertEquals(3, parseResult.size());
        boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
        for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j.hasNext(); ) {
            Map.Entry<Text, Parse> entry = j.next();
            if (entry.getKey().toString().equals("http://www-scf.usc.edu/~mattmann/")) {
                hasLink1 = true;
            } else if (entry.getKey().toString().equals("http://www.nutch.org/")) {
                hasLink2 = true;
            } else if (entry.getKey().toString().equals(urlString)) {
                hasLink3 = true;
            }
            Assert.assertNotNull(entry.getValue());
            Assert.assertNotNull(entry.getValue().getData());
        }
        if (!hasLink1 || !hasLink2 || !hasLink3) {
            Assert.fail("Outlinks read from sample rss file are not correct!");
        }
    }
}
Also used: ParseResult (org.apache.nutch.parse.ParseResult), NutchConfiguration (org.apache.nutch.util.NutchConfiguration), Configuration (org.apache.hadoop.conf.Configuration), ParseUtil (org.apache.nutch.parse.ParseUtil), Parse (org.apache.nutch.parse.Parse), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory), Content (org.apache.nutch.protocol.Content), Protocol (org.apache.nutch.protocol.Protocol), Map (java.util.Map), Test (org.junit.Test)
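
The test drives the Protocol layer with an empty CrawlDatum, which is all a one-off fetch needs. A minimal standalone sketch along the same lines (the URL comes from the command line, and the matching protocol plugin, e.g. protocol-file for file: URLs, is assumed to be enabled in the configuration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;

public class ProtocolFetchSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        // args[0]: the URL to fetch, e.g. a file: URL pointing at a local document
        String url = args[0];
        Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
        // An empty CrawlDatum is sufficient for a one-off fetch, as in the test.
        Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum())
                .getContent();
        System.out.println(url + " -> " + content.getContentType() + " ("
                + content.getContent().length + " bytes)");
    }
}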

Example 65 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project.

From the class AdaptiveFetchSchedule, method main():

public static void main(String[] args) throws Exception {
    FetchSchedule fs = new AdaptiveFetchSchedule();
    fs.setConf(NutchConfiguration.create());
    // we start the time at 0, for simplicity
    long curTime = 0;
    // time step: one day, in milliseconds
    long delta = 1000L * 3600L * 24L;
    // the page content is updated every 30 days (in milliseconds)
    long update = 1000L * 3600L * 24L * 30L;
    boolean changed = true;
    long lastModified = 0;
    int miss = 0;
    int totalMiss = 0;
    int maxMiss = 0;
    int fetchCnt = 0;
    int changeCnt = 0;
    // initial status 1, fetch interval of 30 days (in seconds), score 1.0
    CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
    p.setFetchTime(0);
    LOG.info(p.toString());
    // let's move the timeline a couple of deltas
    for (int i = 0; i < 10000; i++) {
        if (lastModified + update < curTime) {
            // System.out.println("i=" + i + ", lastModified=" + lastModified +
            // ", update=" + update + ", curTime=" + curTime);
            changed = true;
            changeCnt++;
            lastModified = curTime;
        }
        LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval " + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed " + miss);
        if (p.getFetchTime() <= curTime) {
            fetchCnt++;
            fs.setFetchSchedule(new Text("http://www.example.com"), p, p.getFetchTime(), p.getModifiedTime(), curTime, lastModified, changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
            LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval " + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
            if (!changed)
                miss++;
            if (miss > maxMiss)
                maxMiss = miss;
            changed = false;
            totalMiss += miss;
            miss = 0;
        }
        if (changed)
            miss++;
        curTime += delta;
    }
    LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
    LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
Also used: CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text)
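
One detail worth keeping straight when reproducing this simulation: a CrawlDatum stores its fetch interval in seconds but its fetch time in milliseconds since the epoch, which is why delta and update above are computed in milliseconds while the constructor argument is in seconds. A minimal sketch of just that distinction:

import org.apache.nutch.crawl.CrawlDatum;

public class CrawlDatumTimeUnitsSketch {
    public static void main(String[] args) {
        // status db_unfetched, a 30-day fetch interval (in seconds), score 1.0
        CrawlDatum p = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 3600 * 24 * 30, 1.0f);
        // the fetch time is an absolute timestamp in milliseconds
        p.setFetchTime(System.currentTimeMillis());
        System.out.println("interval (days): " + p.getFetchInterval() / (3600 * 24));
        System.out.println("fetch time (ms): " + p.getFetchTime());
    }
}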

Aggregations

CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 66 usages
Text (org.apache.hadoop.io.Text): 60 usages
Test (org.junit.Test): 31 usages
Inlinks (org.apache.nutch.crawl.Inlinks): 25 usages
Configuration (org.apache.hadoop.conf.Configuration): 24 usages
ParseData (org.apache.nutch.parse.ParseData): 22 usages
ParseImpl (org.apache.nutch.parse.ParseImpl): 21 usages
NutchDocument (org.apache.nutch.indexer.NutchDocument): 20 usages
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 20 usages
Content (org.apache.nutch.protocol.Content): 19 usages
Parse (org.apache.nutch.parse.Parse): 15 usages
Metadata (org.apache.nutch.metadata.Metadata): 14 usages
ParseStatus (org.apache.nutch.parse.ParseStatus): 14 usages
ParseUtil (org.apache.nutch.parse.ParseUtil): 13 usages
Protocol (org.apache.nutch.protocol.Protocol): 13 usages
ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 13 usages
URL (java.net.URL): 11 usages
Outlink (org.apache.nutch.parse.Outlink): 11 usages
IOException (java.io.IOException): 7 usages
ArrayList (java.util.ArrayList): 5 usages