use of org.apache.nutch.indexer.IndexingException in project nutch by apache.
the class BasicIndexingFilter method filter.
/**
 * Populates the document with the basic searchable fields: {@code domain}
 * (optional), {@code host}, {@code url}, {@code content}, {@code title}
 * (only when non-empty), {@code cache} (optional) and {@code tstamp}.
 * Behaviour is controlled by {@code indexer.add.domain},
 * {@code indexer.max.title.length} and {@code indexer.max.content.length}
 * in nutch-default.xml.
 *
 * @param doc
 *          The {@link NutchDocument} the fields are added to
 * @param parse
 *          The {@link Parse} supplying the text, title and caching policy
 * @param url
 *          URL of the document; used unless the datum carries a
 *          representative URL
 * @param datum
 *          The {@link CrawlDatum} entry supplying the representative URL
 *          (if any) and the fetch time
 * @param inlinks
 *          The {@link Inlinks} of the document (not read by this filter)
 * @return the same {@code doc} instance with the fields added
 * @throws IndexingException
 *           if the effective URL cannot be parsed
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // A representative URL stored in the datum metadata wins over the plain URL.
  Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
  String representative = (reprUrl == null) ? null : reprUrl.toString();
  String plainUrl = url.toString();
  String effectiveUrl = (representative != null) ? representative : plainUrl;

  String host;
  try {
    URL parsedUrl = new URL(effectiveUrl);
    if (addDomain) {
      doc.add("domain", URLUtil.getDomainName(parsedUrl));
    }
    host = parsedUrl.getHost();
  } catch (MalformedURLException e) {
    throw new IndexingException(e);
  }

  if (host != null) {
    doc.add("host", host);
  }
  doc.add("url", effectiveUrl);

  // content: cut to the configured maximum (a negative maximum disables the limit)
  String text = parse.getText();
  if (MAX_CONTENT_LENGTH > -1 && text.length() > MAX_CONTENT_LENGTH) {
    text = text.substring(0, MAX_CONTENT_LENGTH);
  }
  doc.add("content", StringUtil.cleanField(text));

  // title: cut to the configured maximum (a negative maximum disables the limit)
  String pageTitle = parse.getData().getTitle();
  if (MAX_TITLE_LENGTH > -1 && pageTitle.length() > MAX_TITLE_LENGTH) {
    pageTitle = pageTitle.substring(0, MAX_TITLE_LENGTH);
  }
  // NUTCH-1004 Do not index empty values for title field
  if (!pageTitle.isEmpty()) {
    doc.add("title", StringUtil.cleanField(pageTitle));
  }

  // cached content/summary display policy: indexed only when present and not "none"
  String cachePolicy = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
  boolean policyUnrestricted = cachePolicy == null || cachePolicy.equals(Nutch.CACHING_FORBIDDEN_NONE);
  if (!policyUnrestricted) {
    doc.add("cache", cachePolicy);
  }

  // fetch timestamp, for deduplication
  Date fetchedAt = new Date(datum.getFetchTime());
  doc.add("tstamp", fetchedAt);
  return doc;
}
use of org.apache.nutch.indexer.IndexingException in project nutch by apache.
the class JexlIndexingFilter method filter.
/**
 * Evaluates the configured JEXL expression against a context built from the
 * crawl datum, the parse and the document, and keeps the document only when
 * the expression yields {@code Boolean.TRUE}.
 *
 * @param doc
 *          the document under consideration; its fields are exposed to the
 *          expression under {@code doc}
 * @param parse
 *          supplies text, title, parse status and metadata
 * @param url
 *          the document URL, exposed as {@code url}
 * @param datum
 *          supplies status, times, retries, interval, score and signature
 * @param inlinks
 *          not read by this filter
 * @return {@code doc} when the expression evaluates to TRUE; {@code null}
 *         when it evaluates to anything else or evaluation throws (the
 *         failure is logged as a warning)
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // Top-level context: crawl datum and parse properties
  JexlContext root = new MapContext();
  root.set("status", CrawlDatum.getStatusName(datum.getStatus()));
  root.set("fetchTime", Long.valueOf(datum.getFetchTime()));
  root.set("modifiedTime", Long.valueOf(datum.getModifiedTime()));
  root.set("retries", datum.getRetriesSinceFetch());
  root.set("interval", Integer.valueOf(datum.getFetchInterval()));
  root.set("score", datum.getScore());
  root.set("signature", StringUtil.toHexString(datum.getSignature()));
  root.set("url", url.toString());
  root.set("text", parse.getText());
  root.set("title", parse.getData().getTitle());

  // Nested context describing the parse status
  JexlContext statusContext = new MapContext();
  statusContext.set("majorCode", parse.getData().getStatus().getMajorCode());
  statusContext.set("minorCode", parse.getData().getStatus().getMinorCode());
  statusContext.set("message", parse.getData().getStatus().getMessage());
  root.set("httpStatus", statusContext);

  // Metadata contexts
  root.set("documentMeta", metadataToContext(doc.getDocumentMeta()));
  root.set("contentMeta", metadataToContext(parse.getData().getContentMeta()));
  root.set("parseMeta", metadataToContext(parse.getData().getParseMeta()));

  // Document fields: multi-valued fields are exposed as a list,
  // all others as their first value
  JexlContext fieldContext = new MapContext();
  for (Entry<String, NutchField> field : doc) {
    List<Object> fieldValues = field.getValue().getValues();
    if (fieldValues.size() > 1) {
      fieldContext.set(field.getKey(), fieldValues);
    } else {
      fieldContext.set(field.getKey(), fieldValues.get(0));
    }
  }
  root.set("doc", fieldContext);

  NutchDocument kept = null;
  try {
    Object outcome = expr.execute(root);
    if (Boolean.TRUE.equals(outcome)) {
      kept = doc;
    }
  } catch (Exception e) {
    LOG.warn("Failed evaluating JEXL {}", expr.getSourceText(), e);
  }
  return kept;
}
use of org.apache.nutch.indexer.IndexingException in project nutch by apache.
the class TLDIndexingFilter method filter.
/**
 * Adds a {@code tld} field holding the domain suffix of the document URL.
 * This is best-effort: any failure is logged and the document is returned
 * without the field rather than aborting indexing.
 *
 * @param doc
 *          the document the {@code tld} field is added to
 * @param parse
 *          not read by this filter
 * @param urlText
 *          the document URL
 * @param datum
 *          not read by this filter
 * @param inlinks
 *          not read by this filter
 * @return the same {@code doc} instance, with {@code tld} added when a
 *         domain suffix could be determined
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  try {
    URL url = new URL(urlText.toString());
    DomainSuffix suffix = URLUtil.getDomainSuffix(url);
    // NOTE(review): getDomainSuffix presumably returns null when the host has
    // no recognised suffix (e.g. an IP address) - check explicitly instead of
    // relying on a caught NullPointerException.
    if (suffix != null) {
      doc.add("tld", suffix.getDomain());
    } else {
      LOG.debug("No known domain suffix for " + urlText);
    }
  } catch (Exception ex) {
    // Deliberately broad: a bad URL must not prevent the document from being indexed.
    LOG.warn("Failed to determine TLD for " + urlText + ": " + ex);
  }
  return doc;
}
use of org.apache.nutch.indexer.IndexingException in project nutch by apache.
the class MimeTypeIndexingFilter method main.
/**
 * Command-line entry point for trying out a rules file. Reads one MIME type
 * per line from standard input until end of input or an empty line, runs each
 * through the filter and prints the line prefixed with {@code "+ "} when the
 * filter returns a document or {@code "- "} when it returns {@code null}.
 *
 * @param args run with no arguments to print help
 * @throws IOException if there is a fatal I/O error processing the input args
 * @throws IndexingException if there is a fatal error while indexing
 */
public static void main(String[] args) throws IOException, IndexingException {
  final String toolName = "org.apache.nutch.indexer.filter.MimeTypeIndexingFilter";

  // Command-line definition: -h/--help and the mandatory -rules <file>
  Option help = new Option("h", "help", false, "show this help message");
  @SuppressWarnings("static-access")
  Option rules = OptionBuilder.withArgName("file").hasArg().withDescription("Rules file to be used in the tests relative to the conf directory").isRequired().create("rules");
  Options options = new Options();
  options.addOption(help).addOption(rules);

  CommandLineParser cliParser = new GnuParser();
  HelpFormatter helpFormatter = new HelpFormatter();

  String rulesFile;
  try {
    CommandLine cmd = cliParser.parse(options, args);
    boolean showUsage = cmd.hasOption("help") || !cmd.hasOption("rules");
    if (showUsage) {
      helpFormatter.printHelp(toolName, options, true);
      return;
    }
    rulesFile = cmd.getOptionValue("rules");
  } catch (UnrecognizedOptionException e) {
    // Unknown option: show usage instead of a stack trace
    helpFormatter.printHelp(toolName, options, true);
    return;
  } catch (Exception e) {
    LOG.error(StringUtils.stringifyException(e));
    e.printStackTrace();
    return;
  }

  // Configure a filter instance with the rules file given on the command line
  MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
  Configuration conf = NutchConfiguration.create();
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
  filter.setConf(conf);

  BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
  for (String mimeType = stdin.readLine(); mimeType != null && !mimeType.isEmpty(); mimeType = stdin.readLine()) {
    // Wrap the MIME type in a minimal parse so it can be passed through the filter
    Metadata contentMeta = new Metadata();
    contentMeta.set(Response.CONTENT_TYPE, mimeType);
    ParseImpl parse = new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], contentMeta));
    NutchDocument result = filter.filter(new NutchDocument(), parse, new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

    String marker = (result != null) ? "+ " : "- ";
    System.out.print(marker);
    System.out.println(mimeType);
  }
}
Aggregations