Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class TestIndexingFilters, method testFilterCacheIndexingFilter.
/**
 * Test that resetting the indexing filter order does not take effect,
 * because the filter instances are cached.
 *
 * @throws IndexingException
 */
@Test
public void testFilterCacheIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");
  String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);
  IndexingFilters filters1 = new IndexingFilters(conf);
  NutchDocument fdoc1 = filters1.filter(new NutchDocument(),
      new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())),
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
  // add another indexing filter
  String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";
  // set content metadata
  Metadata md = new Metadata();
  md.add("example", "data");
  // set the content metadata property defined in MetadataIndexer
  conf.set("index.content.md", "example");
  // add the MetadataIndexer filter
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
  IndexingFilters filters2 = new IndexingFilters(conf);
  NutchDocument fdoc2 = filters2.filter(new NutchDocument(),
      new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], md)),
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
  Assert.assertEquals(fdoc1.getFieldNames().size(), fdoc2.getFieldNames().size());
}
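The same filtering call pattern can be used outside the test. The sketch below is a hedged example, assuming the same imports and the `conf` object from the test above; it runs an empty document through the configured filters and prints the resulting field names. A null return from filter() indicates that one of the filters dropped the document.

IndexingFilters filters = new IndexingFilters(conf);
NutchDocument doc = filters.filter(new NutchDocument(),
    new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())),
    new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
// A null return means one of the filters rejected the document.
if (doc != null) {
  for (String fieldName : doc.getFieldNames()) {
    System.out.println(fieldName);
  }
}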
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class TestSegmentMergerCrawlDatums, method checkMergedSegment.
/**
 * Checks the merged segment and removes the test data again.
 *
 * @param testDir
 *          the test directory
 * @param mergedSegment
 *          the merged segment
 * @return the final status
 */
protected byte checkMergedSegment(Path testDir, Path mergedSegment) throws Exception {
  // Get a MapFile reader for the <Text,CrawlDatum> pairs
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(
      new Path(mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf);
  Text key = new Text();
  CrawlDatum value = new CrawlDatum();
  byte finalStatus = 0x0;
  for (MapFile.Reader reader : readers) {
    while (reader.next(key, value)) {
      LOG.info("Reading status for: " + key.toString() + " > "
          + CrawlDatum.getStatusName(value.getStatus()));
      // Only consider fetch status
      if (CrawlDatum.hasFetchStatus(value)
          && key.toString().equals("http://nutch.apache.org/")) {
        finalStatus = value.getStatus();
      }
    }
    // Close the reader again
    reader.close();
  }
  // Remove the test directory again
  fs.delete(testDir, true);
  LOG.info("Final fetch status for: http://nutch.apache.org/ > "
      + CrawlDatum.getStatusName(finalStatus));
  // Return the final status
  return finalStatus;
}
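In the surrounding test, the returned status byte would typically be compared against one of the CrawlDatum fetch status constants. A minimal hedged sketch, assuming `testDir` and `mergedSegment` come from the test setup:

byte status = checkMergedSegment(testDir, mergedSegment);
// Expect the merged record for http://nutch.apache.org/ to carry a successful fetch status.
Assert.assertEquals(CrawlDatum.STATUS_FETCH_SUCCESS, status);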
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class FetcherOutputFormat, method getRecordWriter.
@Override
public RecordWriter<Text, NutchWritable> getRecordWriter(TaskAttemptContext context) throws IOException {
  Configuration conf = context.getConfiguration();
  String name = getUniqueFile(context, "part", "");
  Path dir = FileOutputFormat.getOutputPath(context);
  FileSystem fs = dir.getFileSystem(context.getConfiguration());
  Path out = FileOutputFormat.getOutputPath(context);
  final Path fetch = new Path(new Path(out, CrawlDatum.FETCH_DIR_NAME), name);
  final Path content = new Path(new Path(out, Content.DIR_NAME), name);
  final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(context);
  Option fKeyClassOpt = MapFile.Writer.keyClass(Text.class);
  org.apache.hadoop.io.SequenceFile.Writer.Option fValClassOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
  org.apache.hadoop.io.SequenceFile.Writer.Option fProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
  org.apache.hadoop.io.SequenceFile.Writer.Option fCompOpt = SequenceFile.Writer.compression(compType);
  final MapFile.Writer fetchOut = new MapFile.Writer(conf, fetch, fKeyClassOpt, fValClassOpt, fCompOpt, fProgressOpt);
  return new RecordWriter<Text, NutchWritable>() {

    private MapFile.Writer contentOut;

    private RecordWriter<Text, Parse> parseOut;

    {
      if (Fetcher.isStoringContent(conf)) {
        Option cKeyClassOpt = MapFile.Writer.keyClass(Text.class);
        org.apache.hadoop.io.SequenceFile.Writer.Option cValClassOpt = SequenceFile.Writer.valueClass(Content.class);
        org.apache.hadoop.io.SequenceFile.Writer.Option cProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
        org.apache.hadoop.io.SequenceFile.Writer.Option cCompOpt = SequenceFile.Writer.compression(compType);
        contentOut = new MapFile.Writer(conf, content, cKeyClassOpt, cValClassOpt, cCompOpt, cProgressOpt);
      }
      if (Fetcher.isParsing(conf)) {
        parseOut = new ParseOutputFormat().getRecordWriter(context);
      }
    }

    public void write(Text key, NutchWritable value) throws IOException, InterruptedException {
      Writable w = value.get();
      if (w instanceof CrawlDatum)
        fetchOut.append(key, w);
      else if (w instanceof Content && contentOut != null)
        contentOut.append(key, w);
      else if (w instanceof Parse && parseOut != null)
        parseOut.write(key, (Parse) w);
    }

    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
      fetchOut.close();
      if (contentOut != null) {
        contentOut.close();
      }
      if (parseOut != null) {
        parseOut.close(context);
      }
    }
  };
}
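On the write side, the fetcher wraps each output value in a NutchWritable and the record writer dispatches it by type: CrawlDatum values go to the crawl_fetch MapFile, Content to the content directory, and Parse to the parse output. A hedged usage sketch, assuming a valid TaskAttemptContext named `context`:

RecordWriter<Text, NutchWritable> writer = new FetcherOutputFormat().getRecordWriter(context);
// A CrawlDatum value ends up in the segment's crawl_fetch MapFile.
writer.write(new Text("http://www.example.com/"), new NutchWritable(new CrawlDatum()));
writer.close(context);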
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class FetcherThread, method queueRedirect.
private FetchItem queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException {
  CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
      fit.datum.getFetchInterval(), fit.datum.getScore());
  // transfer all existing metadata to the redirect
  newDatum.getMetaData().putAll(fit.datum.getMetaData());
  scfilters.initialScore(redirUrl, newDatum);
  if (reprUrl != null) {
    newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
  }
  fit = FetchItem.create(redirUrl, newDatum, queueMode);
  if (fit != null) {
    FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
    fiq.addInProgressFetchItem(fit);
  } else {
    // stop redirecting
    redirecting = false;
    context.getCounter("FetcherStatus", "FetchItem.notCreated.redirect").increment(1);
  }
  return fit;
}
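The redirect datum carries over the original datum's metadata and, when available, the representative URL. A minimal hedged sketch of reading that value back, assuming a CrawlDatum named `newDatum` built as above and a LOG field like the one in FetcherThread:

Writable repr = newDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
if (repr != null) {
  LOG.info("Representative URL for redirect: " + repr.toString());
}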
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class UpdateHostDbReducer, method reduce.
/**
 * Aggregates CrawlDatum, HostDatum and score values per host into a single HostDatum.
 */
public void reduce(Text key, Iterable<NutchWritable> values, Context context) throws IOException, InterruptedException {
  Map<String, Map<String, Integer>> stringCounts = new HashMap<>();
  Map<String, Float> maximums = new HashMap<>();
  // used to calculate averages
  Map<String, Float> sums = new HashMap<>();
  // used to calculate averages
  Map<String, Integer> counts = new HashMap<>();
  Map<String, Float> minimums = new HashMap<>();
  Map<String, TDigest> tdigests = new HashMap<>();
  HostDatum hostDatum = new HostDatum();
  float score = 0;
  if (stringFields != null) {
    for (int i = 0; i < stringFields.length; i++) {
      stringCounts.put(stringFields[i], new HashMap<>());
    }
  }
  // hostDatum stays empty if this is a new host for the host db
  for (Writable value : values) {
    // Count crawl datum statuses and collect metadata from fields
    if (value instanceof CrawlDatum) {
      CrawlDatum buffer = (CrawlDatum) value;
      // Set the correct status field
      switch (buffer.getStatus()) {
        case CrawlDatum.STATUS_DB_UNFETCHED:
          hostDatum.setUnfetched(hostDatum.getUnfetched() + 1);
          break;
        case CrawlDatum.STATUS_DB_FETCHED:
          hostDatum.setFetched(hostDatum.getFetched() + 1);
          break;
        case CrawlDatum.STATUS_DB_GONE:
          hostDatum.setGone(hostDatum.getGone() + 1);
          break;
        case CrawlDatum.STATUS_DB_REDIR_TEMP:
          hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1);
          break;
        case CrawlDatum.STATUS_DB_REDIR_PERM:
          hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1);
          break;
        case CrawlDatum.STATUS_DB_NOTMODIFIED:
          hostDatum.setNotModified(hostDatum.getNotModified() + 1);
          break;
      }
      // Record connection failures
      if (buffer.getRetriesSinceFetch() != 0) {
        hostDatum.incConnectionFailures();
      }
      // Only gather metadata statistics for successfully fetched or not-modified pages
      if (buffer.getStatus() == CrawlDatum.STATUS_DB_FETCHED || buffer.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
        // Deal with the string fields
        if (stringFields != null) {
          for (int i = 0; i < stringFields.length; i++) {
            // Does this field exist?
            if (buffer.getMetaData().get(stringFieldWritables[i]) != null) {
              // Get it!
              String metadataValue = null;
              try {
                metadataValue = buffer.getMetaData().get(stringFieldWritables[i]).toString();
              } catch (Exception e) {
                LOG.error("Metadata field " + stringFields[i] + " could not be read as a string value");
              }
              // Does the value exist?
              if (stringCounts.get(stringFields[i]).containsKey(metadataValue)) {
                // Yes, increment it
                stringCounts.get(stringFields[i]).put(metadataValue, stringCounts.get(stringFields[i]).get(metadataValue) + 1);
              } else {
                // Create it!
                stringCounts.get(stringFields[i]).put(metadataValue, 1);
              }
            }
          }
        }
        // Deal with the numeric fields
        if (numericFields != null) {
          for (int i = 0; i < numericFields.length; i++) {
            // Does this field exist?
            if (buffer.getMetaData().get(numericFieldWritables[i]) != null) {
              try {
                // Get it!
                Float metadataValue = Float.parseFloat(buffer.getMetaData().get(numericFieldWritables[i]).toString());
                // Does a digest for this field exist?
                if (tdigests.containsKey(numericFields[i])) {
                  tdigests.get(numericFields[i]).add(metadataValue);
                } else {
                  // Create it!
                  TDigest tdigest = TDigest.createDigest(100);
                  tdigest.add((double) metadataValue);
                  tdigests.put(numericFields[i], tdigest);
                }
                // Does the minimum value exist?
                if (minimums.containsKey(numericFields[i])) {
                  // Overwrite if this is lower than the existing value
                  if (metadataValue < minimums.get(numericFields[i])) {
                    minimums.put(numericFields[i], metadataValue);
                  }
                } else {
                  // Create it!
                  minimums.put(numericFields[i], metadataValue);
                }
                // Does the maximum value exist?
                if (maximums.containsKey(numericFields[i])) {
                  // Overwrite if this is higher than the existing value
                  if (metadataValue > maximums.get(numericFields[i])) {
                    maximums.put(numericFields[i], metadataValue);
                  }
                } else {
                  // Create it!
                  maximums.put(numericFields[i], metadataValue);
                }
                // Sum it up!
                if (sums.containsKey(numericFields[i])) {
                  // Increment
                  sums.put(numericFields[i], sums.get(numericFields[i]) + metadataValue);
                  counts.put(numericFields[i], counts.get(numericFields[i]) + 1);
                } else {
                  // Create it!
                  sums.put(numericFields[i], metadataValue);
                  counts.put(numericFields[i], 1);
                }
              } catch (Exception e) {
                LOG.error(e.getMessage() + " when processing values for " + key.toString());
              }
            }
          }
        }
      }
    }
    if (value instanceof HostDatum) {
      HostDatum buffer = (HostDatum) value;
      // Check homepage URL
      if (buffer.hasHomepageUrl()) {
        hostDatum.setHomepageUrl(buffer.getHomepageUrl());
      }
      // Check lastCheck timestamp
      if (!buffer.isEmpty()) {
        hostDatum.setLastCheck(buffer.getLastCheck());
      }
      // Check and set DNS failures
      if (buffer.getDnsFailures() > 0) {
        hostDatum.setDnsFailures(buffer.getDnsFailures());
      }
      // Check and set connection failures
      if (buffer.getConnectionFailures() > 0) {
        hostDatum.setConnectionFailures(buffer.getConnectionFailures());
      }
      // Check metadata
      if (!buffer.getMetaData().isEmpty()) {
        hostDatum.setMetaData(buffer.getMetaData());
      }
      // Check and set score (score from Web Graph has precedence)
      if (buffer.getScore() > 0) {
        hostDatum.setScore(buffer.getScore());
      }
    }
    // Check for the score
    if (value instanceof FloatWritable) {
      FloatWritable buffer = (FloatWritable) value;
      score = buffer.get();
    }
  }
  // Check if score was set from Web Graph
  if (score > 0) {
    hostDatum.setScore(score);
  }
  // Set metadata
  for (Map.Entry<String, Map<String, Integer>> entry : stringCounts.entrySet()) {
    for (Map.Entry<String, Integer> subEntry : entry.getValue().entrySet()) {
      hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()), new IntWritable(subEntry.getValue()));
    }
  }
  for (Map.Entry<String, Float> entry : maximums.entrySet()) {
    hostDatum.getMetaData().put(new Text("max." + entry.getKey()), new FloatWritable(entry.getValue()));
  }
  for (Map.Entry<String, Float> entry : sums.entrySet()) {
    hostDatum.getMetaData().put(new Text("avg." + entry.getKey()), new FloatWritable(entry.getValue() / counts.get(entry.getKey())));
  }
  for (Map.Entry<String, TDigest> entry : tdigests.entrySet()) {
    // Emit all percentiles
    for (int i = 0; i < percentiles.length; i++) {
      hostDatum.getMetaData().put(new Text("pct" + Integer.toString(percentiles[i]) + "." + entry.getKey()),
          new FloatWritable((float) entry.getValue().quantile((double) percentiles[i] / 100)));
    }
  }
  for (Map.Entry<String, Float> entry : minimums.entrySet()) {
    hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue()));
  }
  context.getCounter("UpdateHostDb", "total_hosts").increment(1);
  // See if this record is to be checked
  if (shouldCheck(hostDatum)) {
    // Make an entry
    resolverThread = new ResolverThread(key.toString(), hostDatum, context, purgeFailedHostsThreshold);
    // Add the entry to the queue (blocking)
    try {
      queue.put(resolverThread);
    } catch (InterruptedException e) {
      LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e));
    }
    // Do not write here; the datum will be written by the resolver thread
    return;
  } else {
    context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1);
    LOG.info("UpdateHostDb: " + key.toString() + ": skipped_not_eligible");
  }
  // Write the host datum if it wasn't written by the resolver thread
  context.write(key, hostDatum);
}
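The percentile metadata relies on the t-digest library used above. The short hedged sketch below shows the same pattern in isolation: values are added to a digest and arbitrary quantiles are read back with quantile(), which takes a fraction between 0 and 1 (which is why the reducer divides each configured percentile by 100).

TDigest digest = TDigest.createDigest(100);
digest.add(120.0);
digest.add(250.0);
digest.add(310.0);
double median = digest.quantile(0.5); // 50th percentile
double p95 = digest.quantile(0.95);   // 95th percentile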