Search in sources:

Example 1 with IndexingException

use of org.apache.nutch.indexer.IndexingException in project nutch by apache.

the class BasicIndexingFilter method filter.

/**
 * Populates the basic searchable fields of a document: {@code domain}
 * (optional), {@code host}, {@code url}, {@code content}, {@code title}
 * (when non-empty), {@code cache} (when a caching policy other than
 * {@code Nutch.CACHING_FORBIDDEN_NONE} is present) and {@code tstamp}.
 * See {@code indexer.add.domain}, {@code indexer.max.title.length} and
 * {@code indexer.max.content.length} in nutch-default.xml.
 *
 * @param doc
 *          The {@link NutchDocument} the fields are added to
 * @param parse
 *          The {@link Parse} supplying the text, the title and the caching
 *          policy
 * @param url
 *          URL of the document; used for the {@code host}, {@code domain}
 *          and {@code url} fields unless the datum carries a representative
 *          URL
 * @param datum
 *          The {@link CrawlDatum} entry supplying the representative URL (if
 *          any) and the fetch time
 * @param inlinks
 *          The {@link Inlinks} of the document; not read by this method
 * @return the same {@code doc} instance with the fields added
 * @throws IndexingException
 *           if the URL chosen for the document cannot be parsed
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    Text representative = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
    String representativeString = null;
    if (representative != null) {
        representativeString = representative.toString();
    }
    String plainUrl = url.toString();
    // The representative URL wins over the plain URL whenever it is present.
    String effectiveUrl = representativeString != null ? representativeString : plainUrl;

    String host;
    try {
        URL parsed = new URL(effectiveUrl);
        if (addDomain) {
            doc.add("domain", URLUtil.getDomainName(parsed));
        }
        host = parsed.getHost();
    } catch (MalformedURLException e) {
        throw new IndexingException(e);
    }
    if (host != null) {
        doc.add("host", host);
    }
    doc.add("url", effectiveUrl);

    // content, cut down to the configured maximum when a limit is set
    String text = parse.getText();
    boolean textTooLong = MAX_CONTENT_LENGTH > -1 && text.length() > MAX_CONTENT_LENGTH;
    doc.add("content", StringUtil.cleanField(textTooLong ? text.substring(0, MAX_CONTENT_LENGTH) : text));

    // title, cut down to the configured maximum when a limit is set
    String heading = parse.getData().getTitle();
    boolean headingTooLong = MAX_TITLE_LENGTH > -1 && heading.length() > MAX_TITLE_LENGTH;
    if (headingTooLong) {
        heading = heading.substring(0, MAX_TITLE_LENGTH);
    }
    // NUTCH-1004 Do not index empty values for title field
    if (!heading.isEmpty()) {
        doc.add("title", StringUtil.cleanField(heading));
    }

    // cached content/summary display policy, if one is set
    String cachePolicy = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
    boolean hasCachePolicy = cachePolicy != null && !cachePolicy.equals(Nutch.CACHING_FORBIDDEN_NONE);
    if (hasCachePolicy) {
        doc.add("cache", cachePolicy);
    }

    // timestamp when fetched, for deduplication
    doc.add("tstamp", new Date(datum.getFetchTime()));
    return doc;
}
Also used : IndexingException(org.apache.nutch.indexer.IndexingException) MalformedURLException(java.net.MalformedURLException) Text(org.apache.hadoop.io.Text) URL(java.net.URL) Date(java.util.Date)

Example 2 with IndexingException

use of org.apache.nutch.indexer.IndexingException in project nutch by apache.

the class JexlIndexingFilter method filter.

/**
 * Evaluates the configured JEXL expression against a context built from the
 * crawl datum, the parse and the document, and keeps the document only when
 * the expression evaluates to {@code Boolean.TRUE}.
 *
 * <p>The context exposes {@code status}, {@code fetchTime},
 * {@code modifiedTime}, {@code retries}, {@code interval}, {@code score},
 * {@code signature}, {@code url}, {@code text}, {@code title},
 * {@code httpStatus} ({@code majorCode}, {@code minorCode}, {@code message}),
 * {@code documentMeta}, {@code contentMeta}, {@code parseMeta} and
 * {@code doc} (one entry per document field).
 *
 * @param doc
 *          the document under consideration
 * @param parse
 *          the parse supplying text, title, status and metadata
 * @param url
 *          the document URL
 * @param datum
 *          the crawl datum supplying status, times, score and signature
 * @param inlinks
 *          the inlinks of the document; not read by this method
 * @return {@code doc} when the expression yields {@code Boolean.TRUE};
 *         {@code null} otherwise, including when evaluation throws (the
 *         failure is logged as a warning)
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    // Root context: crawl datum properties first, then parse properties.
    JexlContext root = new MapContext();
    root.set("status", CrawlDatum.getStatusName(datum.getStatus()));
    root.set("fetchTime", Long.valueOf(datum.getFetchTime()));
    root.set("modifiedTime", Long.valueOf(datum.getModifiedTime()));
    root.set("retries", datum.getRetriesSinceFetch());
    root.set("interval", Integer.valueOf(datum.getFetchInterval()));
    root.set("score", datum.getScore());
    root.set("signature", StringUtil.toHexString(datum.getSignature()));
    root.set("url", url.toString());
    root.set("text", parse.getText());
    root.set("title", parse.getData().getTitle());

    // Nested context describing the parse status.
    JexlContext statusContext = new MapContext();
    statusContext.set("majorCode", parse.getData().getStatus().getMajorCode());
    statusContext.set("minorCode", parse.getData().getStatus().getMinorCode());
    statusContext.set("message", parse.getData().getStatus().getMessage());
    root.set("httpStatus", statusContext);

    root.set("documentMeta", metadataToContext(doc.getDocumentMeta()));
    root.set("contentMeta", metadataToContext(parse.getData().getContentMeta()));
    root.set("parseMeta", metadataToContext(parse.getData().getParseMeta()));

    // Nested context with the document fields: multi-valued fields are
    // exposed as the list itself, all others as their first value.
    JexlContext fieldContext = new MapContext();
    for (Entry<String, NutchField> field : doc) {
        List<Object> fieldValues = field.getValue().getValues();
        if (fieldValues.size() > 1) {
            fieldContext.set(field.getKey(), fieldValues);
        } else {
            fieldContext.set(field.getKey(), fieldValues.get(0));
        }
    }
    root.set("doc", fieldContext);

    boolean accepted = false;
    try {
        accepted = Boolean.TRUE.equals(expr.execute(root));
    } catch (Exception e) {
        LOG.warn("Failed evaluating JEXL {}", expr.getSourceText(), e);
    }
    return accepted ? doc : null;
}
Also used : NutchField(org.apache.nutch.indexer.NutchField) JexlContext(org.apache.commons.jexl3.JexlContext) MapContext(org.apache.commons.jexl3.MapContext) IndexingException(org.apache.nutch.indexer.IndexingException)

Example 3 with IndexingException

use of org.apache.nutch.indexer.IndexingException in project nutch by apache.

the class TLDIndexingFilter method filter.

/**
 * Adds a {@code tld} field holding the domain of the URL's domain suffix.
 *
 * <p>The filter is best-effort: when the URL cannot be parsed or no domain
 * suffix is found for it, a warning naming the URL is logged and the
 * document is returned without the {@code tld} field. The document is never
 * rejected by this filter.
 *
 * @param doc
 *          the document the {@code tld} field is added to
 * @param parse
 *          the parse of the document; not read by this method
 * @param urlText
 *          the document URL
 * @param datum
 *          the crawl datum; not read by this method
 * @param inlinks
 *          the inlinks of the document; not read by this method
 * @return the same {@code doc} instance, with or without the {@code tld}
 *         field
 * @throws IndexingException
 *           declared by the overridden method; not thrown by this
 *           implementation itself
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    try {
        URL url = new URL(urlText.toString());
        DomainSuffix suffix = URLUtil.getDomainSuffix(url);
        if (suffix == null) {
            // Check explicitly instead of relying on a NullPointerException
            // being swallowed by the catch below, which produced a warning
            // that carried neither the cause nor the URL.
            LOG.warn("No domain suffix found for URL " + urlText + ", tld field not added");
        } else {
            doc.add("tld", suffix.getDomain());
        }
    } catch (Exception ex) {
        // Broad catch kept on purpose: a failure here must not abort indexing
        // of the document. The URL is included so the warning is actionable.
        LOG.warn("Failed to determine tld for URL " + urlText + ": " + ex);
    }
    return doc;
}
Also used : DomainSuffix(org.apache.nutch.util.domain.DomainSuffix) URL(java.net.URL) IndexingException(org.apache.nutch.indexer.IndexingException)

Example 4 with IndexingException

use of org.apache.nutch.indexer.IndexingException in project nutch by apache.

the class MimeTypeIndexingFilter method main.

/**
 * Command-line entry point for trying out a MIME-type rules file.
 *
 * <p>Reads one content type per line from standard input until end of input
 * or the first empty line, runs each through the filter configured with the
 * given rules file, and prints the line prefixed with {@code "+ "} when the
 * filter returned a document or {@code "- "} when it returned {@code null}.
 *
 * <p>The usage help is printed when the arguments cannot be parsed (for
 * example when run with no arguments, since {@code -rules} is required) or
 * when {@code -help} is given.
 *
 * @param args run with no arguments to print help
 * @throws IOException if there is a fatal I/O error processing the input args
 * @throws IndexingException if there is a fatal error while indexing
 */
public static void main(String[] args) throws IOException, IndexingException {
    final String usage = "org.apache.nutch.indexer.filter.MimeTypeIndexingFilter";
    Option helpOpt = new Option("h", "help", false, "show this help message");
    @SuppressWarnings("static-access")
    Option rulesOpt = OptionBuilder.withArgName("file").hasArg().withDescription("Rules file to be used in the tests relative to the conf directory").isRequired().create("rules");
    Options options = new Options();
    options.addOption(helpOpt).addOption(rulesOpt);
    CommandLineParser parser = new GnuParser();
    HelpFormatter formatter = new HelpFormatter();
    String rulesFile;
    try {
        CommandLine line = parser.parse(options, args);
        if (line.hasOption("help") || !line.hasOption("rules")) {
            formatter.printHelp(usage, options, true);
            return;
        }
        rulesFile = line.getOptionValue("rules");
    } catch (org.apache.commons.cli.ParseException e) {
        // "rules" is a required option, so running with no arguments (or with
        // only -h) makes the parser throw MissingOptionException rather than
        // return a CommandLine. Catching the common ParseException supertype
        // (which also covers UnrecognizedOptionException) lets every
        // command-line mistake end in the usage text instead of a logged
        // stack trace. Fully qualified to avoid depending on a new import.
        System.err.println(e.getMessage());
        formatter.printHelp(usage, options, true);
        return;
    } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
        return;
    }
    MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
    Configuration conf = NutchConfiguration.create();
    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
    filter.setConf(conf);
    // try-with-resources so the reader is released even if filtering throws
    try (BufferedReader in = new BufferedReader(new InputStreamReader(System.in))) {
        String line;
        while ((line = in.readLine()) != null && !line.isEmpty()) {
            Metadata metadata = new Metadata();
            metadata.set(Response.CONTENT_TYPE, line);
            ParseImpl parse = new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
            NutchDocument doc = filter.filter(new NutchDocument(), parse, new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
            // "+" marks a content type the filter lets through, "-" one it drops
            System.out.println((doc != null ? "+ " : "- ") + line);
        }
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) Options(org.apache.commons.cli.Options) Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) InputStreamReader(java.io.InputStreamReader) NutchDocument(org.apache.nutch.indexer.NutchDocument) GnuParser(org.apache.commons.cli.GnuParser) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) UnrecognizedOptionException(org.apache.commons.cli.UnrecognizedOptionException) UnrecognizedOptionException(org.apache.commons.cli.UnrecognizedOptionException) IOException(java.io.IOException) IndexingException(org.apache.nutch.indexer.IndexingException) HelpFormatter(org.apache.commons.cli.HelpFormatter) ParseStatus(org.apache.nutch.parse.ParseStatus) CommandLine(org.apache.commons.cli.CommandLine) ParseData(org.apache.nutch.parse.ParseData) BufferedReader(java.io.BufferedReader) ParseImpl(org.apache.nutch.parse.ParseImpl) Option(org.apache.commons.cli.Option) CommandLineParser(org.apache.commons.cli.CommandLineParser)

Aggregations

IndexingException (org.apache.nutch.indexer.IndexingException)4 URL (java.net.URL)2 Text (org.apache.hadoop.io.Text)2 BufferedReader (java.io.BufferedReader)1 IOException (java.io.IOException)1 InputStreamReader (java.io.InputStreamReader)1 MalformedURLException (java.net.MalformedURLException)1 Date (java.util.Date)1 CommandLine (org.apache.commons.cli.CommandLine)1 CommandLineParser (org.apache.commons.cli.CommandLineParser)1 GnuParser (org.apache.commons.cli.GnuParser)1 HelpFormatter (org.apache.commons.cli.HelpFormatter)1 Option (org.apache.commons.cli.Option)1 Options (org.apache.commons.cli.Options)1 UnrecognizedOptionException (org.apache.commons.cli.UnrecognizedOptionException)1 JexlContext (org.apache.commons.jexl3.JexlContext)1 MapContext (org.apache.commons.jexl3.MapContext)1 Configuration (org.apache.hadoop.conf.Configuration)1 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)1 Inlinks (org.apache.nutch.crawl.Inlinks)1