
Example 56 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

The class TestIndexingFilters, method testFilterCacheIndexingFilter.

/**
 * Tests that resetting the indexing filter order has no effect once the
 * filters have been cached for the given Configuration.
 *
 * @throws IndexingException
 */
@Test
public void testFilterCacheIndexingFilter() throws IndexingException {
    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");
    String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);
    IndexingFilters filters1 = new IndexingFilters(conf);
    NutchDocument fdoc1 = filters1.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    // add another index filter
    String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";
    // set content metadata
    Metadata md = new Metadata();
    md.add("example", "data");
    // set content metadata property defined in MetadataIndexer
    conf.set("index.content.md", "example");
    // add the MetadataIndexer filter
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
    IndexingFilters filters2 = new IndexingFilters(conf);
    NutchDocument fdoc2 = filters2.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], md)), new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertEquals(fdoc1.getFieldNames().size(), fdoc2.getFieldNames().size());
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)
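
The assertion holds because Nutch caches the instantiated filter chain per Configuration, so the second IndexingFilters built from the same conf reuses the chain created before MetadataIndexer was added to the order property. A minimal sketch of that caching pattern, using a hypothetical FilterCache class (illustrative names only, not Nutch's actual internals):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.conf.Configuration;

final class FilterCache {

    // One cached filter chain per Configuration instance.
    private static final Map<Configuration, Object[]> CACHE = new ConcurrentHashMap<>();

    static Object[] getFilters(Configuration conf) {
        // The chain is built only on the first call for this conf; later
        // changes to the order property on the same conf are never seen.
        return CACHE.computeIfAbsent(conf, FilterCache::buildChain);
    }

    private static Object[] buildChain(Configuration conf) {
        // Instantiate the classes named in the order property (stubbed here).
        return new Object[0];
    }
}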

Example 57 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

The class TestSegmentMergerCrawlDatums, method checkMergedSegment.

/**
 * Checks the merged segment and removes the test data again.
 *
 * @param testDir
 *          the test directory
 * @param mergedSegment
 *          the merged segment
 * @return the final fetch status
 * @throws Exception
 */
protected byte checkMergedSegment(Path testDir, Path mergedSegment) throws Exception {
    // Get a MapFile reader for the <Text,CrawlDatum> pairs
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(new Path(mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf);
    Text key = new Text();
    CrawlDatum value = new CrawlDatum();
    byte finalStatus = 0x0;
    for (MapFile.Reader reader : readers) {
        while (reader.next(key, value)) {
            LOG.info("Reading status for: " + key.toString() + " > " + CrawlDatum.getStatusName(value.getStatus()));
            // Only consider fetch status
            if (CrawlDatum.hasFetchStatus(value) && key.toString().equals("http://nutch.apache.org/")) {
                finalStatus = value.getStatus();
            }
        }
        // Close the reader again
        reader.close();
    }
    // Remove the test directory again
    fs.delete(testDir, true);
    LOG.info("Final fetch status for: http://nutch.apache.org/ > " + CrawlDatum.getStatusName(finalStatus));
    // Return the final status
    return finalStatus;
}
Also used : Path(org.apache.hadoop.fs.Path) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) MapFile(org.apache.hadoop.io.MapFile) Text(org.apache.hadoop.io.Text)
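
For context, the crawl_fetch directory this helper reads is an ordinary Hadoop MapFile of <Text, CrawlDatum> pairs. A minimal sketch of writing such a fixture, assuming a default Configuration and an illustrative segment path:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class WriteFetchFixture {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // "segment-test" is an illustrative path, not a Nutch convention.
        Path fetchDir = new Path("segment-test", CrawlDatum.FETCH_DIR_NAME);
        try (MapFile.Writer writer = new MapFile.Writer(conf, fetchDir,
                MapFile.Writer.keyClass(Text.class),
                SequenceFile.Writer.valueClass(CrawlDatum.class))) {
            // MapFile keys must be appended in sorted order.
            CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, 3600);
            writer.append(new Text("http://nutch.apache.org/"), datum);
        }
    }
}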

Example 58 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

The class FetcherOutputFormat, method getRecordWriter.

@Override
public RecordWriter<Text, NutchWritable> getRecordWriter(TaskAttemptContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    String name = getUniqueFile(context, "part", "");
    Path out = FileOutputFormat.getOutputPath(context);
    final Path fetch = new Path(new Path(out, CrawlDatum.FETCH_DIR_NAME), name);
    final Path content = new Path(new Path(out, Content.DIR_NAME), name);
    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(context);
    Option fKeyClassOpt = MapFile.Writer.keyClass(Text.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option fValClassOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option fProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
    org.apache.hadoop.io.SequenceFile.Writer.Option fCompOpt = SequenceFile.Writer.compression(compType);
    final MapFile.Writer fetchOut = new MapFile.Writer(conf, fetch, fKeyClassOpt, fValClassOpt, fCompOpt, fProgressOpt);
    return new RecordWriter<Text, NutchWritable>() {

        private MapFile.Writer contentOut;

        private RecordWriter<Text, Parse> parseOut;

        {
            if (Fetcher.isStoringContent(conf)) {
                Option cKeyClassOpt = MapFile.Writer.keyClass(Text.class);
                org.apache.hadoop.io.SequenceFile.Writer.Option cValClassOpt = SequenceFile.Writer.valueClass(Content.class);
                org.apache.hadoop.io.SequenceFile.Writer.Option cProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
                org.apache.hadoop.io.SequenceFile.Writer.Option cCompOpt = SequenceFile.Writer.compression(compType);
                contentOut = new MapFile.Writer(conf, content, cKeyClassOpt, cValClassOpt, cCompOpt, cProgressOpt);
            }
            if (Fetcher.isParsing(conf)) {
                parseOut = new ParseOutputFormat().getRecordWriter(context);
            }
        }

        public void write(Text key, NutchWritable value) throws IOException, InterruptedException {
            Writable w = value.get();
            if (w instanceof CrawlDatum)
                fetchOut.append(key, w);
            else if (w instanceof Content && contentOut != null)
                contentOut.append(key, w);
            else if (w instanceof Parse && parseOut != null)
                parseOut.write(key, (Parse) w);
        }

        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            fetchOut.close();
            if (contentOut != null) {
                contentOut.close();
            }
            if (parseOut != null) {
                parseOut.close(context);
            }
        }
    };
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Parse(org.apache.nutch.parse.Parse) MapFile(org.apache.hadoop.io.MapFile) NutchWritable(org.apache.nutch.crawl.NutchWritable) Writable(org.apache.hadoop.io.Writable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) RecordWriter(org.apache.hadoop.mapreduce.RecordWriter) Content(org.apache.nutch.protocol.Content) FileSystem(org.apache.hadoop.fs.FileSystem) ParseOutputFormat(org.apache.nutch.parse.ParseOutputFormat) Option(org.apache.hadoop.io.MapFile.Writer.Option) CompressionType(org.apache.hadoop.io.SequenceFile.CompressionType)
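
Callers never touch the three outputs directly: each payload is wrapped in a NutchWritable and the writer above dispatches on the wrapped type. A hypothetical usage fragment (imports as in the example above; writer is the RecordWriter returned by getRecordWriter):

// Hypothetical helper showing how each payload type reaches its output.
void emit(RecordWriter<Text, NutchWritable> writer, Text url, CrawlDatum datum,
        Content content) throws IOException, InterruptedException {
    writer.write(url, new NutchWritable(datum));   // appended to crawl_fetch
    writer.write(url, new NutchWritable(content)); // appended to content, if stored
}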

Example 59 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

The class FetcherThread, method queueRedirect.

private FetchItem queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException {
    CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, fit.datum.getFetchInterval(), fit.datum.getScore());
    // transfer all existing metadata to the redirect
    newDatum.getMetaData().putAll(fit.datum.getMetaData());
    scfilters.initialScore(redirUrl, newDatum);
    if (reprUrl != null) {
        newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
    }
    fit = FetchItem.create(redirUrl, newDatum, queueMode);
    if (fit != null) {
        FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
        fiq.addInProgressFetchItem(fit);
    } else {
        // stop redirecting
        redirecting = false;
        context.getCounter("FetcherStatus", "FetchItem.notCreated.redirect").increment(1);
    }
    return fit;
}
Also used : CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText)
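
The key detail is that the redirect target starts life as an unfetched CrawlDatum inheriting the fetch interval, score, and metadata of the datum that produced it. A standalone sketch of that cloning step (the metadata key "_example_" is illustrative, not a Nutch constant):

import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class RedirectDatumSketch {
    public static void main(String[] args) {
        CrawlDatum original = new CrawlDatum(CrawlDatum.STATUS_FETCH_REDIR_TEMP, 3600, 1.0f);
        original.getMetaData().put(new Text("_example_"), new Text("value"));

        CrawlDatum redirect = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
                original.getFetchInterval(), original.getScore());
        // Carry all existing metadata over to the redirect target.
        redirect.getMetaData().putAll(original.getMetaData());
        System.out.println(redirect);
    }
}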

Example 60 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

The class UpdateHostDbReducer, method reduce.

/**
 * Aggregates all CrawlDatum, HostDatum and score values for a single host
 * key into one HostDatum record, counting statuses and computing min, max,
 * average and percentile statistics over the configured metadata fields.
 */
public void reduce(Text key, Iterable<NutchWritable> values, Context context) throws IOException, InterruptedException {
    Map<String, Map<String, Integer>> stringCounts = new HashMap<>();
    Map<String, Float> maximums = new HashMap<>();
    // used to calc averages
    Map<String, Float> sums = new HashMap<>();
    // used to calc averages
    Map<String, Integer> counts = new HashMap<>();
    Map<String, Float> minimums = new HashMap<>();
    Map<String, TDigest> tdigests = new HashMap<>();
    HostDatum hostDatum = new HostDatum();
    float score = 0;
    if (stringFields != null) {
        for (int i = 0; i < stringFields.length; i++) {
            stringCounts.put(stringFields[i], new HashMap<>());
        }
    }
    // Loop over all values; hostDatum stays empty if this is a new host for the host db
    for (Writable value : values) {
        // Count crawl datum status's and collect metadata from fields
        if (value instanceof CrawlDatum) {
            CrawlDatum buffer = (CrawlDatum) value;
            // Set the correct status field
            switch(buffer.getStatus()) {
                case CrawlDatum.STATUS_DB_UNFETCHED:
                    hostDatum.setUnfetched(hostDatum.getUnfetched() + 1);
                    break;
                case CrawlDatum.STATUS_DB_FETCHED:
                    hostDatum.setFetched(hostDatum.getFetched() + 1);
                    break;
                case CrawlDatum.STATUS_DB_GONE:
                    hostDatum.setGone(hostDatum.getGone() + 1);
                    break;
                case CrawlDatum.STATUS_DB_REDIR_TEMP:
                    hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1);
                    break;
                case CrawlDatum.STATUS_DB_REDIR_PERM:
                    hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1);
                    break;
                case CrawlDatum.STATUS_DB_NOTMODIFIED:
                    hostDatum.setNotModified(hostDatum.getNotModified() + 1);
                    break;
            }
            // Record connection failures
            if (buffer.getRetriesSinceFetch() != 0) {
                hostDatum.incConnectionFailures();
            }
            // Only gather metadata statistics for proper fetched pages
            if (buffer.getStatus() == CrawlDatum.STATUS_DB_FETCHED || buffer.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
                // Deal with the string fields
                if (stringFields != null) {
                    for (int i = 0; i < stringFields.length; i++) {
                        // Does this field exist?
                        if (buffer.getMetaData().get(stringFieldWritables[i]) != null) {
                            // Get it!
                            String metadataValue = null;
                            try {
                                metadataValue = buffer.getMetaData().get(stringFieldWritables[i]).toString();
                            } catch (Exception e) {
                                LOG.error("Metadata field " + stringFields[i] + " is probably not a numeric value");
                            }
                            // Does the value exist?
                            if (stringCounts.get(stringFields[i]).containsKey(metadataValue)) {
                                // Yes, increment it
                                stringCounts.get(stringFields[i]).put(metadataValue, stringCounts.get(stringFields[i]).get(metadataValue) + 1);
                            } else {
                                // Create it!
                                stringCounts.get(stringFields[i]).put(metadataValue, 1);
                            }
                        }
                    }
                }
                // Deal with the numeric fields
                if (numericFields != null) {
                    for (int i = 0; i < numericFields.length; i++) {
                        // Does this field exist?
                        if (buffer.getMetaData().get(numericFieldWritables[i]) != null) {
                            try {
                                // Get it!
                                Float metadataValue = Float.parseFloat(buffer.getMetaData().get(numericFieldWritables[i]).toString());
                                // Does the median value exist?
                                if (tdigests.containsKey(numericFields[i])) {
                                    tdigests.get(numericFields[i]).add(metadataValue);
                                } else {
                                    // Create it!
                                    TDigest tdigest = TDigest.createDigest(100);
                                    tdigest.add((double) metadataValue);
                                    tdigests.put(numericFields[i], tdigest);
                                }
                                // Does the minimum value exist?
                                if (minimums.containsKey(numericFields[i])) {
                                    // Write if this is lower than existing value
                                    if (metadataValue < minimums.get(numericFields[i])) {
                                        minimums.put(numericFields[i], metadataValue);
                                    }
                                } else {
                                    // Create it!
                                    minimums.put(numericFields[i], metadataValue);
                                }
                                // Does the maximum value exist?
                                if (maximums.containsKey(numericFields[i])) {
                                    // Write if this is higher than the existing value
                                    if (metadataValue > maximums.get(numericFields[i])) {
                                        maximums.put(numericFields[i], metadataValue);
                                    }
                                } else {
                                    // Create it!
                                    maximums.put(numericFields[i], metadataValue);
                                }
                                // Sum it up!
                                if (sums.containsKey(numericFields[i])) {
                                    // Increment
                                    sums.put(numericFields[i], sums.get(numericFields[i]) + metadataValue);
                                    counts.put(numericFields[i], counts.get(numericFields[i]) + 1);
                                } else {
                                    // Create it!
                                    sums.put(numericFields[i], metadataValue);
                                    counts.put(numericFields[i], 1);
                                }
                            } catch (Exception e) {
                                LOG.error(e.getMessage() + " when processing values for " + key.toString());
                            }
                        }
                    }
                }
            }
        }
        // Merge fields from an existing HostDatum for this host
        if (value instanceof HostDatum) {
            HostDatum buffer = (HostDatum) value;
            // Check homepage URL
            if (buffer.hasHomepageUrl()) {
                hostDatum.setHomepageUrl(buffer.getHomepageUrl());
            }
            // Check lastCheck timestamp
            if (!buffer.isEmpty()) {
                hostDatum.setLastCheck(buffer.getLastCheck());
            }
            // Check and set DNS failures
            if (buffer.getDnsFailures() > 0) {
                hostDatum.setDnsFailures(buffer.getDnsFailures());
            }
            // Check and set connection failures
            if (buffer.getConnectionFailures() > 0) {
                hostDatum.setConnectionFailures(buffer.getConnectionFailures());
            }
            // Check metadata
            if (!buffer.getMetaData().isEmpty()) {
                hostDatum.setMetaData(buffer.getMetaData());
            }
            // Check and set score (score from Web Graph has precedence)
            if (buffer.getScore() > 0) {
                hostDatum.setScore(buffer.getScore());
            }
        }
        // Check for the score
        if (value instanceof FloatWritable) {
            FloatWritable buffer = (FloatWritable) value;
            score = buffer.get();
        }
    }
    // Check if score was set from Web Graph
    if (score > 0) {
        hostDatum.setScore(score);
    }
    // Set metadata
    for (Map.Entry<String, Map<String, Integer>> entry : stringCounts.entrySet()) {
        for (Map.Entry<String, Integer> subEntry : entry.getValue().entrySet()) {
            hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()), new IntWritable(subEntry.getValue()));
        }
    }
    for (Map.Entry<String, Float> entry : maximums.entrySet()) {
        hostDatum.getMetaData().put(new Text("max." + entry.getKey()), new FloatWritable(entry.getValue()));
    }
    for (Map.Entry<String, Float> entry : sums.entrySet()) {
        hostDatum.getMetaData().put(new Text("avg." + entry.getKey()), new FloatWritable(entry.getValue() / counts.get(entry.getKey())));
    }
    for (Map.Entry<String, TDigest> entry : tdigests.entrySet()) {
        // Emit all percentiles
        for (int i = 0; i < percentiles.length; i++) {
            hostDatum.getMetaData().put(new Text("pct" + Integer.toString(percentiles[i]) + "." + entry.getKey()), new FloatWritable((float) entry.getValue().quantile(0.5)));
        }
    }
    for (Map.Entry<String, Float> entry : minimums.entrySet()) {
        hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue()));
    }
    context.getCounter("UpdateHostDb", "total_hosts").increment(1);
    // See if this record is to be checked
    if (shouldCheck(hostDatum)) {
        // Make an entry
        resolverThread = new ResolverThread(key.toString(), hostDatum, context, purgeFailedHostsThreshold);
        // Add the entry to the queue (blocking)
        try {
            queue.put(resolverThread);
        } catch (InterruptedException e) {
            LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e));
        }
        // Do not write here; the datum will be written by the resolver thread
        return;
    } else {
        context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1);
        LOG.info("UpdateHostDb: " + key.toString() + ": skipped_not_eligible");
    }
    // Write the host datum if it wasn't written by the resolver thread
    context.write(key, hostDatum);
}
Also used : HashMap(java.util.HashMap) Writable(org.apache.hadoop.io.Writable) NutchWritable(org.apache.nutch.crawl.NutchWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) IntWritable(org.apache.hadoop.io.IntWritable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) TDigest(com.tdunning.math.stats.TDigest) Map(java.util.Map)
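
The percentile metadata above comes from t-digest sketches (com.tdunning.math.stats). A small self-contained example of the same calls; note that quantile() takes a fraction, so pct95 corresponds to quantile(0.95):

import com.tdunning.math.stats.TDigest;

public class PercentileSketch {
    public static void main(String[] args) {
        TDigest digest = TDigest.createDigest(100); // compression factor, as above
        for (int i = 1; i <= 1000; i++) {
            digest.add(i);
        }
        for (int p : new int[] { 50, 75, 95, 99 }) {
            System.out.println("pct" + p + " = " + digest.quantile(p / 100.0));
        }
    }
}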

Aggregations

CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 66
Text (org.apache.hadoop.io.Text): 60
Test (org.junit.Test): 31
Inlinks (org.apache.nutch.crawl.Inlinks): 25
Configuration (org.apache.hadoop.conf.Configuration): 24
ParseData (org.apache.nutch.parse.ParseData): 22
ParseImpl (org.apache.nutch.parse.ParseImpl): 21
NutchDocument (org.apache.nutch.indexer.NutchDocument): 20
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 20
Content (org.apache.nutch.protocol.Content): 19
Parse (org.apache.nutch.parse.Parse): 15
Metadata (org.apache.nutch.metadata.Metadata): 14
ParseStatus (org.apache.nutch.parse.ParseStatus): 14
ParseUtil (org.apache.nutch.parse.ParseUtil): 13
Protocol (org.apache.nutch.protocol.Protocol): 13
ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 13
URL (java.net.URL): 11
Outlink (org.apache.nutch.parse.Outlink): 11
IOException (java.io.IOException): 7
ArrayList (java.util.ArrayList): 5