Use of org.apache.nutch.crawl.NutchWritable in project nutch by apache.
The class TestIndexerMapReduce, method runIndexer.
/**
* Run {@link IndexerMapReduce#reduce(...)} to get an "indexed"
* {@link NutchDocument} by passing objects from segment and CrawlDb to the
* indexer.
*
* @param dbDatum
* crawl datum from CrawlDb
* @param fetchDatum
* crawl datum (fetch status) from segment
* @param parseText
* plain text from parsed document
* @param parseData
* parse data
* @param content
* (optional) protocol content; only required when binary content is indexed
* @return "indexed" document
*/
public NutchDocument runIndexer(CrawlDatum dbDatum, CrawlDatum fetchDatum, ParseText parseText, ParseData parseData, Content content) {
List<NutchWritable> values = new ArrayList<NutchWritable>();
values.add(new NutchWritable(dbDatum));
values.add(new NutchWritable(fetchDatum));
values.add(new NutchWritable(parseText));
values.add(new NutchWritable(parseData));
values.add(new NutchWritable(content));
reduceDriver = ReduceDriver.newReduceDriver(reducer);
reduceDriver.getConfiguration().addResource(configuration);
reduceDriver.withInput(testUrlText, values);
List<Pair<Text, NutchIndexAction>> reduceResult;
NutchDocument doc = null;
try {
reduceResult = reduceDriver.run();
for (Pair<Text, NutchIndexAction> p : reduceResult) {
if (p.getSecond().action != NutchIndexAction.DELETE) {
doc = p.getSecond().doc;
}
}
} catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
}
return doc;
}
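NutchWritable is a generic wrapper that lets these heterogeneous segment and CrawlDb records travel through a single reduce input; the consumer calls get() and dispatches on the concrete type. A minimal, self-contained sketch of that unwrapping step is shown below (the helper class and the println output are illustrative assumptions, not the actual IndexerMapReduce code).
import java.util.List;
import org.apache.hadoop.io.Writable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
// Hypothetical helper: unwrap NutchWritable values the way a reducer would.
public class NutchWritableDispatchSketch {
  public static void dispatch(List<NutchWritable> values) {
    for (NutchWritable wrapped : values) {
      // NutchWritable extends GenericWritable: get() returns the wrapped instance
      Writable w = wrapped.get();
      if (w instanceof CrawlDatum) {
        System.out.println("crawl datum, status=" + CrawlDatum.getStatusName(((CrawlDatum) w).getStatus()));
      } else if (w instanceof ParseText) {
        System.out.println("parse text, length=" + ((ParseText) w).getText().length());
      } else if (w instanceof ParseData) {
        System.out.println("parse data, title=" + ((ParseData) w).getTitle());
      } else if (w instanceof Content) {
        System.out.println("raw content, type=" + ((Content) w).getContentType());
      }
    }
  }
}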
Use of org.apache.nutch.crawl.NutchWritable in project nutch by apache.
The class FetcherThread, method output.
private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) throws InterruptedException {
datum.setStatus(status);
datum.setFetchTime(System.currentTimeMillis());
if (pstatus != null)
datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
ParseResult parseResult = null;
if (content != null) {
Metadata metadata = content.getMetadata();
// store the guessed content type in the crawldatum
if (content.getContentType() != null)
datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
// add segment to metadata
metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
// add score to content metadata so that ParseSegment can pick it up.
try {
scfilters.passScoreBeforeParsing(key, datum, content);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
}
}
/*
* Note: Fetcher will only follow meta-redirects coming from the
* original URL.
*/
if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
try {
parseResult = this.parseUtil.parse(content);
} catch (Exception e) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + StringUtils.stringifyException(e));
}
}
if (parseResult == null) {
byte[] signature = SignatureFactory.getSignature(conf).calculate(content, new ParseStatus().getEmptyParse(conf));
datum.setSignature(signature);
}
}
/*
* Store the status code in the content metadata so that we can read it during
* parsing (run as a separate job) and decide whether to parse or not.
*/
content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
}
try {
context.write(key, new NutchWritable(datum));
if (content != null && storingContent)
context.write(key, new NutchWritable(content));
if (parseResult != null) {
for (Entry<Text, Parse> entry : parseResult) {
Text url = entry.getKey();
Parse parse = entry.getValue();
ParseStatus parseStatus = parse.getData().getStatus();
ParseData parseData = parse.getData();
if (!parseStatus.isSuccess()) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + parseStatus);
parse = parseStatus.getEmptyParse(conf);
}
// Calculate page signature. For non-parsing fetchers this will
// be done in ParseSegment
byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
// Ensure segment name and score are in parseData metadata
parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
// Pass fetch time to content meta
parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
if (url.equals(key))
datum.setSignature(signature);
try {
scfilters.passScoreAfterParsing(url, content, parse);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
}
}
String origin = null;
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
int outlinksToStore = Math.min(maxOutlinks, links.length);
if (ignoreExternalLinks || ignoreInternalLinks) {
URL originURL = new URL(url.toString());
// based on domain?
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
origin = URLUtil.getDomainName(originURL).toLowerCase();
} else // use host
{
origin = originURL.getHost().toLowerCase();
}
}
// used by fetchNode
if (fetchNode != null) {
fetchNode.setOutlinks(links);
fetchNode.setTitle(parseData.getTitle());
FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
}
int validCount = 0;
// Process all outlinks, normalize, filter and deduplicate
List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
HashSet<String> outlinks = new HashSet<>(outlinksToStore);
for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
String toUrl = links[i].getToUrl();
toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFiltersForOutlinks, urlExemptionFilters, normalizersForOutlinks);
if (toUrl == null) {
continue;
}
validCount++;
links[i].setUrl(toUrl);
outlinkList.add(links[i]);
outlinks.add(toUrl);
}
// Publish fetch report event
if (activatePublisher) {
FetcherThreadEvent reportEvent = new FetcherThreadEvent(PublishEventType.REPORT, url.toString());
reportEvent.addOutlinksToEventData(outlinkList);
reportEvent.addEventData(Nutch.FETCH_EVENT_TITLE, parseData.getTitle());
reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTTYPE, parseData.getContentMeta().get("content-type"));
reportEvent.addEventData(Nutch.FETCH_EVENT_SCORE, datum.getScore());
reportEvent.addEventData(Nutch.FETCH_EVENT_FETCHTIME, datum.getFetchTime());
reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG, parseData.getContentMeta().get("content-language"));
publisher.publish(reportEvent, conf);
}
// Only process depth N outlinks
if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
FetchItem ft = FetchItem.create(url, null, queueMode);
FetchItemQueue queue = ((FetchItemQueues) fetchQueues).getFetchItemQueue(ft.queueID);
queue.alreadyFetched.add(url.toString().hashCode());
context.getCounter("FetcherOutlinks", "outlinks_detected").increment(outlinks.size());
// Counter to limit num outlinks to follow per page
int outlinkCounter = 0;
// Calculate variable number of outlinks by depth using the
// divisor (outlinks = Math.floor(divisor / (depth + 1) * num.links))
int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
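// For example (hypothetical numbers, assuming these fields are ints): with outlinksDepthDivisor = 2
// and maxOutlinkDepthNumLinks = 100, depth 0 allows floor(2 / 1 * 100) = 200 outlinks and depth 1
// allows floor(2 / 2 * 100) = 100; since the division is integer arithmetic, deeper levels truncate towards 0.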
String followUrl;
// Walk over the outlinks and add as new FetchItem to the queues
Iterator<String> iter = outlinks.iterator();
while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
followUrl = iter.next();
// Check whether we'll follow external outlinks
if (outlinksIgnoreExternal) {
if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
continue;
}
}
// Already followed?
int urlHashCode = followUrl.hashCode();
if (queue.alreadyFetched.contains(urlHashCode)) {
continue;
}
queue.alreadyFetched.add(urlHashCode);
// Create new FetchItem with depth incremented
FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);
((FetchItemQueues) fetchQueues).addFetchItem(fit);
outlinkCounter++;
}
}
// Overwrite the outlinks in ParseData with the normalized and
// filtered set
parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));
context.write(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
}
}
} catch (IOException e) {
if (LOG.isErrorEnabled()) {
LOG.error("fetcher caught:" + e.toString());
}
}
// return parse status if it exists
if (parseResult != null && !parseResult.isEmpty()) {
Parse p = parseResult.get(content.getUrl());
if (p != null) {
context.getCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()]).increment(1);
return p.getData().getStatus();
}
}
return null;
}
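The origin computed above scopes the outlink filtering: depending on ignoreExternalLinksMode it is either the registered domain name or the host of the source URL. A standalone sketch of that decision, with an assumed helper name and simplified signature (not a Nutch API), could look like this:
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.nutch.util.URLUtil;
// Hypothetical helper mirroring the by-domain / by-host outlink scoping above.
public class OutlinkScopeSketch {
  // Returns true if toUrl shares the configured origin (domain or host) with fromUrl.
  public static boolean sameOrigin(String fromUrl, String toUrl, String mode)
      throws MalformedURLException {
    URL from = new URL(fromUrl);
    URL to = new URL(toUrl);
    if ("bydomain".equalsIgnoreCase(mode)) {
      // compare registered domain names, e.g. "example.com"
      return URLUtil.getDomainName(from).equalsIgnoreCase(URLUtil.getDomainName(to));
    }
    // default: compare full host names, e.g. "www.example.com"
    return from.getHost().equalsIgnoreCase(to.getHost());
  }
  public static void main(String[] args) throws MalformedURLException {
    System.out.println(sameOrigin("https://www.example.com/a", "https://example.com/b", "bydomain")); // true
    System.out.println(sameOrigin("https://www.example.com/a", "https://example.com/b", "byhost"));   // false
  }
}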
Use of org.apache.nutch.crawl.NutchWritable in project nutch by apache.
The class UpdateHostDbMapper, method map.
/**
* Mapper ingesting records from the HostDB, CrawlDB and plaintext host
* scores file. Statistics and scores are passed on.
*
* @param key
* record key: a URL (CrawlDb), a hostname (HostDb) or a hostname from the scores file
* @param value
* a CrawlDatum, HostDatum or plain-text Text score
* @param context
* context used to emit (host, NutchWritable) pairs
*/
public void map(Text key, Writable value, Context context) throws IOException, InterruptedException {
// Get the key!
String keyStr = key.toString();
// Check if we process records from the CrawlDB
if (key instanceof Text && value instanceof CrawlDatum) {
// Get the normalized and filtered host of this URL
buffer = filterNormalize(URLUtil.getHost(keyStr));
// Filtered out?
if (buffer == null) {
context.getCounter("UpdateHostDb", "filtered_records").increment(1);
LOG.info("UpdateHostDb: " + URLUtil.getHost(keyStr) + " crawldatum has been filtered");
return;
}
// Set the host of this URL
host.set(buffer);
crawlDatum = (CrawlDatum) value;
hostDatum = new HostDatum();
// Do not resolve homepages when the root URL is unfetched
if (crawlDatum.getStatus() != CrawlDatum.STATUS_DB_UNFETCHED) {
// Get the protocol
String protocol = URLUtil.getProtocol(keyStr);
// Get the proposed homepage URL
String homepage = protocol + "://" + buffer + "/";
// Check if the current key equals the proposed homepage URL
if (keyStr.equals(homepage)) {
// Check if this is a redirect to the real home page
if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
// Obtain the repr url for this redirect via protocolstatus from the metadata
ProtocolStatus z = (ProtocolStatus) crawlDatum.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
// Get the protocol status' arguments
args = z.getArgs();
// ..and the possible redirect URL
reprUrl = args[0];
// Did we get a redirect target?
if (reprUrl != null) {
LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0]);
context.write(host, new NutchWritable(hostDatum));
hostDatum.setHomepageUrl(reprUrl);
} else {
LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0] + " but has been filtered out");
}
} else {
hostDatum.setHomepageUrl(homepage);
context.write(host, new NutchWritable(hostDatum));
LOG.info("UpdateHostDb: homepage: " + homepage);
}
}
}
// Always emit crawl datum
context.write(host, new NutchWritable(crawlDatum));
}
// Check if we got a record from the hostdb
if (key instanceof Text && value instanceof HostDatum) {
buffer = filterNormalize(keyStr);
// Filtered out?
if (buffer == null) {
context.getCounter("UpdateHostDb", "filtered_records").increment(1);
LOG.info("UpdateHostDb: " + key.toString() + " hostdatum has been filtered");
return;
}
// Get a HostDatum
hostDatum = (HostDatum) value;
key.set(buffer);
// Reset the statistics; we're aggregating them from the CrawlDB anyway
if (readingCrawlDb) {
hostDatum.resetStatistics();
}
context.write(key, new NutchWritable(hostDatum));
}
// Check if we got a record with host scores
if (key instanceof Text && value instanceof Text) {
buffer = filterNormalize(keyStr);
// Filtered out?
if (buffer == null) {
context.getCounter("UpdateHostDb", "filtered_records").increment(1);
LOG.info("UpdateHostDb: " + key.toString() + " score has been filtered");
return;
}
key.set(buffer);
context.write(key, new NutchWritable(new FloatWritable(Float.parseFloat(value.toString()))));
}
}
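The CrawlDb branch only derives a homepage when the record's key is exactly protocol://host/. The check can be illustrated in isolation as below; the helper is hypothetical and skips the filterNormalize step the mapper applies first.
import org.apache.nutch.util.URLUtil;
// Hypothetical helper: is this URL the root (homepage) URL of its host?
public class HomepageCheckSketch {
  public static boolean isHomepage(String url) {
    String host = URLUtil.getHost(url);
    String protocol = URLUtil.getProtocol(url);
    if (host == null || protocol == null) {
      return false;
    }
    // the proposed homepage is simply "<protocol>://<host>/"
    return url.equals(protocol + "://" + host + "/");
  }
  public static void main(String[] args) {
    System.out.println(isHomepage("https://example.org/"));      // true
    System.out.println(isHomepage("https://example.org/about")); // false
  }
}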
Use of org.apache.nutch.crawl.NutchWritable in project nutch by apache.
The class FetcherOutputFormat, method getRecordWriter.
@Override
public RecordWriter<Text, NutchWritable> getRecordWriter(TaskAttemptContext context) throws IOException {
Configuration conf = context.getConfiguration();
String name = getUniqueFile(context, "part", "");
Path out = FileOutputFormat.getOutputPath(context);
final Path fetch = new Path(new Path(out, CrawlDatum.FETCH_DIR_NAME), name);
final Path content = new Path(new Path(out, Content.DIR_NAME), name);
final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(context);
Option fKeyClassOpt = MapFile.Writer.keyClass(Text.class);
org.apache.hadoop.io.SequenceFile.Writer.Option fValClassOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
org.apache.hadoop.io.SequenceFile.Writer.Option fProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
org.apache.hadoop.io.SequenceFile.Writer.Option fCompOpt = SequenceFile.Writer.compression(compType);
final MapFile.Writer fetchOut = new MapFile.Writer(conf, fetch, fKeyClassOpt, fValClassOpt, fCompOpt, fProgressOpt);
return new RecordWriter<Text, NutchWritable>() {
private MapFile.Writer contentOut;
private RecordWriter<Text, Parse> parseOut;
{
if (Fetcher.isStoringContent(conf)) {
Option cKeyClassOpt = MapFile.Writer.keyClass(Text.class);
org.apache.hadoop.io.SequenceFile.Writer.Option cValClassOpt = SequenceFile.Writer.valueClass(Content.class);
org.apache.hadoop.io.SequenceFile.Writer.Option cProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
org.apache.hadoop.io.SequenceFile.Writer.Option cCompOpt = SequenceFile.Writer.compression(compType);
contentOut = new MapFile.Writer(conf, content, cKeyClassOpt, cValClassOpt, cCompOpt, cProgressOpt);
}
if (Fetcher.isParsing(conf)) {
parseOut = new ParseOutputFormat().getRecordWriter(context);
}
}
public void write(Text key, NutchWritable value) throws IOException, InterruptedException {
Writable w = value.get();
if (w instanceof CrawlDatum)
fetchOut.append(key, w);
else if (w instanceof Content && contentOut != null)
contentOut.append(key, w);
else if (w instanceof Parse && parseOut != null)
parseOut.write(key, (Parse) w);
}
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
fetchOut.close();
if (contentOut != null) {
contentOut.close();
}
if (parseOut != null) {
parseOut.close(context);
}
}
};
}
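FetcherOutputFormat expects Text keys and NutchWritable values and fans each unwrapped record out to the fetch, content and parse parts of a segment. A rough sketch of wiring a job to it is shown below; the class name, job name and segment path are assumptions for illustration, and mapper/reducer setup is omitted.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.fetcher.FetcherOutputFormat;
// Hypothetical job wiring sketch.
public class FetcherJobSketch {
  public static Job buildJob(Configuration conf, Path segment) throws Exception {
    Job job = Job.getInstance(conf, "fetch " + segment); // illustrative job name
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);
    job.setOutputFormatClass(FetcherOutputFormat.class);
    FileOutputFormat.setOutputPath(job, segment); // the segment directory becomes the output path
    return job;
  }
}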
Use of org.apache.nutch.crawl.NutchWritable in project nutch by apache.
The class UpdateHostDbReducer, method reduce.
/**
* Aggregates all values for a single host key: counts CrawlDatum statuses,
* merges fields from an existing HostDatum, tracks min/max/average and
* percentile statistics for configured metadata fields, and optionally
* queues the host for DNS checking.
*/
public void reduce(Text key, Iterable<NutchWritable> values, Context context) throws IOException, InterruptedException {
Map<String, Map<String, Integer>> stringCounts = new HashMap<>();
Map<String, Float> maximums = new HashMap<>();
// used to calc averages
Map<String, Float> sums = new HashMap<>();
// used to calc averages
Map<String, Integer> counts = new HashMap<>();
Map<String, Float> minimums = new HashMap<>();
Map<String, TDigest> tdigests = new HashMap<String, TDigest>();
HostDatum hostDatum = new HostDatum();
float score = 0;
if (stringFields != null) {
for (int i = 0; i < stringFields.length; i++) {
stringCounts.put(stringFields[i], new HashMap<>());
}
}
// Loop through the values; the host datum stays empty if this is a new host for the host db
for (Writable value : values) {
// Count crawl datum status's and collect metadata from fields
if (value instanceof CrawlDatum) {
CrawlDatum buffer = (CrawlDatum) value;
// Set the correct status field
switch(buffer.getStatus()) {
case CrawlDatum.STATUS_DB_UNFETCHED:
hostDatum.setUnfetched(hostDatum.getUnfetched() + 1);
break;
case CrawlDatum.STATUS_DB_FETCHED:
hostDatum.setFetched(hostDatum.getFetched() + 1);
break;
case CrawlDatum.STATUS_DB_GONE:
hostDatum.setGone(hostDatum.getGone() + 1);
break;
case CrawlDatum.STATUS_DB_REDIR_TEMP:
hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1);
break;
case CrawlDatum.STATUS_DB_REDIR_PERM:
hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1);
break;
case CrawlDatum.STATUS_DB_NOTMODIFIED:
hostDatum.setNotModified(hostDatum.getNotModified() + 1);
break;
}
// Record connection failures
if (buffer.getRetriesSinceFetch() != 0) {
hostDatum.incConnectionFailures();
}
// Only gather metadata statistics for proper fetched pages
if (buffer.getStatus() == CrawlDatum.STATUS_DB_FETCHED || buffer.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
// Deal with the string fields
if (stringFields != null) {
for (int i = 0; i < stringFields.length; i++) {
// Does this field exist?
if (buffer.getMetaData().get(stringFieldWritables[i]) != null) {
// Get it!
String metadataValue = null;
try {
metadataValue = buffer.getMetaData().get(stringFieldWritables[i]).toString();
} catch (Exception e) {
LOG.error("Metadata field " + stringFields[i] + " is probably not a numeric value");
}
// Does the value exist?
if (stringCounts.get(stringFields[i]).containsKey(metadataValue)) {
// Yes, increment it
stringCounts.get(stringFields[i]).put(metadataValue, stringCounts.get(stringFields[i]).get(metadataValue) + 1);
} else {
// Create it!
stringCounts.get(stringFields[i]).put(metadataValue, 1);
}
}
}
}
// Deal with the numeric fields
if (numericFields != null) {
for (int i = 0; i < numericFields.length; i++) {
// Does this field exist?
if (buffer.getMetaData().get(numericFieldWritables[i]) != null) {
try {
// Get it!
Float metadataValue = Float.parseFloat(buffer.getMetaData().get(numericFieldWritables[i]).toString());
// Does a t-digest already exist for this field?
if (tdigests.containsKey(numericFields[i])) {
tdigests.get(numericFields[i]).add(metadataValue);
} else {
// Create it!
TDigest tdigest = TDigest.createDigest(100);
tdigest.add((double) metadataValue);
tdigests.put(numericFields[i], tdigest);
}
// Does the minimum value exist?
if (minimums.containsKey(numericFields[i])) {
// Write if this is lower than existing value
if (metadataValue < minimums.get(numericFields[i])) {
minimums.put(numericFields[i], metadataValue);
}
} else {
// Create it!
minimums.put(numericFields[i], metadataValue);
}
// Does the maximum value exist?
if (maximums.containsKey(numericFields[i])) {
// Write if this is higher than the existing value
if (metadataValue > maximums.get(numericFields[i])) {
maximums.put(numericFields[i], metadataValue);
}
} else {
// Create it!
maximums.put(numericFields[i], metadataValue);
}
// Sum it up!
if (sums.containsKey(numericFields[i])) {
// Increment
sums.put(numericFields[i], sums.get(numericFields[i]) + metadataValue);
counts.put(numericFields[i], counts.get(numericFields[i]) + 1);
} else {
// Create it!
sums.put(numericFields[i], metadataValue);
counts.put(numericFields[i], 1);
}
} catch (Exception e) {
LOG.error(e.getMessage() + " when processing values for " + key.toString());
}
}
}
}
}
}
// Merge fields from an existing HostDatum record for this host
if (value instanceof HostDatum) {
HostDatum buffer = (HostDatum) value;
// Check homepage URL
if (buffer.hasHomepageUrl()) {
hostDatum.setHomepageUrl(buffer.getHomepageUrl());
}
// Check lastCheck timestamp
if (!buffer.isEmpty()) {
hostDatum.setLastCheck(buffer.getLastCheck());
}
// Check and set DNS failures
if (buffer.getDnsFailures() > 0) {
hostDatum.setDnsFailures(buffer.getDnsFailures());
}
// Check and set connection failures
if (buffer.getConnectionFailures() > 0) {
hostDatum.setConnectionFailures(buffer.getConnectionFailures());
}
// Check metadata
if (!buffer.getMetaData().isEmpty()) {
hostDatum.setMetaData(buffer.getMetaData());
}
// Check and set score (score from Web Graph has precedence)
if (buffer.getScore() > 0) {
hostDatum.setScore(buffer.getScore());
}
}
// Check for the score
if (value instanceof FloatWritable) {
FloatWritable buffer = (FloatWritable) value;
score = buffer.get();
}
}
// Check if score was set from Web Graph
if (score > 0) {
hostDatum.setScore(score);
}
// Set metadata
for (Map.Entry<String, Map<String, Integer>> entry : stringCounts.entrySet()) {
for (Map.Entry<String, Integer> subEntry : entry.getValue().entrySet()) {
hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()), new IntWritable(subEntry.getValue()));
}
}
for (Map.Entry<String, Float> entry : maximums.entrySet()) {
hostDatum.getMetaData().put(new Text("max." + entry.getKey()), new FloatWritable(entry.getValue()));
}
for (Map.Entry<String, Float> entry : sums.entrySet()) {
hostDatum.getMetaData().put(new Text("avg." + entry.getKey()), new FloatWritable(entry.getValue() / counts.get(entry.getKey())));
}
for (Map.Entry<String, TDigest> entry : tdigests.entrySet()) {
// Emit all configured percentiles (quantile() expects a fraction in [0, 1])
for (int i = 0; i < percentiles.length; i++) {
hostDatum.getMetaData().put(new Text("pct" + Integer.toString(percentiles[i]) + "." + entry.getKey()), new FloatWritable((float) entry.getValue().quantile(percentiles[i] / 100.0)));
}
}
for (Map.Entry<String, Float> entry : minimums.entrySet()) {
hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue()));
}
context.getCounter("UpdateHostDb", "total_hosts").increment(1);
// See if this record is to be checked
if (shouldCheck(hostDatum)) {
// Make an entry
resolverThread = new ResolverThread(key.toString(), hostDatum, context, purgeFailedHostsThreshold);
// Add the entry to the queue (blocking)
try {
queue.put(resolverThread);
} catch (InterruptedException e) {
LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e));
}
// Do not progress, the datum will be written in the resolver thread
return;
} else {
context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1);
LOG.info("UpdateHostDb: " + key.toString() + ": skipped_not_eligible");
}
// Write the host datum if it wasn't written by the resolver thread
context.write(key, hostDatum);
}
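The per-field percentiles are built from t-digest sketches, which can be filled incrementally from streamed metadata values and then queried for arbitrary quantiles. A compact standalone example of that usage (sample values and percentiles chosen arbitrarily):
import com.tdunning.math.stats.TDigest;
public class TDigestPercentileSketch {
  public static void main(String[] args) {
    // compression factor 100, as in UpdateHostDbReducer
    TDigest digest = TDigest.createDigest(100);
    for (double value : new double[] { 0.2, 1.5, 3.0, 4.5, 7.0, 12.0 }) {
      digest.add(value);
    }
    int[] percentiles = { 50, 75, 95 }; // arbitrary example percentiles
    for (int p : percentiles) {
      // quantile() expects a fraction in [0, 1]
      System.out.println("pct" + p + " = " + digest.quantile(p / 100.0));
    }
  }
}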