Search in sources :

Example 31 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class RelTagParser method filter.

/**
 * Collects the rel-tags found in the HTML document fragment and records each
 * one in the parse metadata of the page under the {@code REL_TAG} key.
 *
 * @param content     fetched content; its URL selects the parse to annotate
 * @param parseResult parse results to look the page's parse up in
 * @param metaTags    HTML meta tags of the page (not used by this filter)
 * @param doc         DOM fragment that is scanned for rel-tags
 * @return the same {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    // Parse entry belonging to the URL of this content
    Parse pageParse = parseResult.get(content.getUrl());
    // Extract the rel-tags from the DOM
    Set<?> relTags = new Parser(doc).getRelTags();
    Metadata parseMeta = pageParse.getData().getParseMeta();
    for (Object relTag : relTags) {
        parseMeta.add(REL_TAG, (String) relTag);
    }
    return parseResult;
}
Also used : Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata)

Example 32 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class MimeTypeIndexingFilter method main.

/**
 * Command-line driver for this tool. Configures a filter with the rules file
 * given by the required {@code rules} option, then reads content types from
 * standard input, one per line, until end of input or the first empty line.
 * For every line it prints {@code "+ "} followed by the line when the filter
 * returns a non-null document, and {@code "- "} followed by the line otherwise.
 * Usage help is printed when {@code help} is given, {@code rules} is missing,
 * or an unrecognized option is passed.
 *
 * @param args command-line arguments
 * @throws IOException
 * @throws IndexingException
 */
public static void main(String[] args) throws IOException, IndexingException {
    final String toolName = "org.apache.nutch.indexer.filter.MimeTypeIndexingFilter";
    Option help = new Option("h", "help", false, "show this help message");
    Option rules = OptionBuilder.withArgName("file").hasArg().withDescription("Rules file to be used in the tests relative to the conf directory").isRequired().create("rules");
    Options cliOptions = new Options();
    cliOptions.addOption(help);
    cliOptions.addOption(rules);
    CommandLineParser cliParser = new GnuParser();
    HelpFormatter usage = new HelpFormatter();
    String rulesPath;
    try {
        CommandLine cmd = cliParser.parse(cliOptions, args);
        boolean wantsUsage = cmd.hasOption("help") || !cmd.hasOption("rules");
        if (wantsUsage) {
            usage.printHelp(toolName, cliOptions, true);
            return;
        }
        rulesPath = cmd.getOptionValue("rules");
    } catch (UnrecognizedOptionException e) {
        // Unknown switch: show usage instead of a stack trace
        usage.printHelp(toolName, cliOptions, true);
        return;
    } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
        return;
    }
    Configuration conf = NutchConfiguration.create();
    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesPath);
    MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
    filter.setConf(conf);
    BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
    // Stop at end of input or at the first empty line
    for (String mimeType = stdin.readLine(); mimeType != null && !mimeType.isEmpty(); mimeType = stdin.readLine()) {
        Metadata parseMeta = new Metadata();
        parseMeta.set(Response.CONTENT_TYPE, mimeType);
        ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], parseMeta);
        ParseImpl parse = new ParseImpl("text", parseData);
        NutchDocument result = filter.filter(new NutchDocument(), parse, new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
        String marker = (result != null) ? "+ " : "- ";
        System.out.println(marker + mimeType);
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) Options(org.apache.commons.cli.Options) Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) InputStreamReader(java.io.InputStreamReader) NutchDocument(org.apache.nutch.indexer.NutchDocument) GnuParser(org.apache.commons.cli.GnuParser) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) UnrecognizedOptionException(org.apache.commons.cli.UnrecognizedOptionException) IOException(java.io.IOException) IndexingException(org.apache.nutch.indexer.IndexingException) HelpFormatter(org.apache.commons.cli.HelpFormatter) ParseStatus(org.apache.nutch.parse.ParseStatus) CommandLine(org.apache.commons.cli.CommandLine) ParseData(org.apache.nutch.parse.ParseData) BufferedReader(java.io.BufferedReader) ParseImpl(org.apache.nutch.parse.ParseImpl) Option(org.apache.commons.cli.Option) CommandLineParser(org.apache.commons.cli.CommandLineParser)

Example 33 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class HtmlParser method main.

/**
 * Parses the local HTML file named by the first argument and prints the
 * resulting parse data and extracted text to standard output.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the
 *             file to parse
 * @throws Exception any error raised while reading the file or parsing it
 *                   is propagated to the caller
 */
public static void main(String[] args) throws Exception {
    String name = args[0];
    String url = "file:" + name;
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    // try-with-resources releases the file handle even if readFully throws
    try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
        in.readFully(bytes);
    }
    Configuration conf = NutchConfiguration.create();
    HtmlParser parser = new HtmlParser();
    parser.setConf(conf);
    // The same "file:" URL is passed for both URL arguments of Content and is
    // then used as the key to fetch the parse from the result.
    Parse parse = parser.getParse(new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Parse(org.apache.nutch.parse.Parse) Content(org.apache.nutch.protocol.Content) Metadata(org.apache.nutch.metadata.Metadata) DataInputStream(java.io.DataInputStream) File(java.io.File) FileInputStream(java.io.FileInputStream)

Example 34 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class MetaTagsParser method filter.

/**
 * Feeds three sources of name/value pairs through {@code addIndexedMetatags}
 * into the parse metadata of the page: the entries already present in that
 * metadata, the general meta tags, and the http-equiv tags.
 *
 * @param content     fetched content; its URL selects the parse to annotate
 * @param parseResult parse results to look the page's parse up in
 * @param metaTags    source of the general and http-equiv meta tags
 * @param doc         DOM fragment of the page (not used by this filter)
 * @return the same {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse pageParse = parseResult.get(content.getUrl());
    Metadata parseMeta = pageParse.getData().getParseMeta();

    // Values that an earlier step might already have stored in the parse metadata
    for (String existingName : parseMeta.names()) {
        addIndexedMetatags(parseMeta, existingName, parseMeta.getValues(existingName));
    }

    // General meta tags
    Metadata general = metaTags.getGeneralTags();
    for (String generalName : general.names()) {
        addIndexedMetatags(parseMeta, generalName, general.getValues(generalName));
    }

    // http-equiv tags, held as single-valued properties
    Properties httpEquivTags = metaTags.getHttpEquivTags();
    Enumeration<?> equivNames = httpEquivTags.propertyNames();
    while (equivNames.hasMoreElements()) {
        String equivName = (String) equivNames.nextElement();
        String equivValue = httpEquivTags.getProperty(equivName);
        addIndexedMetatags(parseMeta, equivName, equivValue);
    }
    return parseResult;
}
Also used : Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) Properties(java.util.Properties)

Example 35 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class TestMetatagParser method parseMeta.

/**
 * Fetches the named sample file through its protocol implementation, parses
 * it, and returns the parse metadata of the result. Any exception is printed
 * and reported through {@code Assert.fail}.
 *
 * @param fileName name of the file below {@code sampleDir}
 * @param conf     configuration handed to the protocol factory and parse utility
 * @return the parse metadata of the parsed file
 */
public Metadata parseMeta(String fileName, Configuration conf) {
    try {
        String location = "file:" + sampleDir + fileSeparator + fileName;
        ProtocolFactory protocols = new ProtocolFactory(conf);
        Protocol fetcher = protocols.getProtocol(location);
        Content fetched = fetcher.getProtocolOutput(new Text(location), new CrawlDatum()).getContent();
        ParseUtil parseUtil = new ParseUtil(conf);
        Parse parsed = parseUtil.parse(fetched).get(fetched.getUrl());
        return parsed.getData().getParseMeta();
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.toString());
        // Needed for the compiler; only reached if Assert.fail returns normally
        return null;
    }
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol)

Aggregations

Metadata (org.apache.nutch.metadata.Metadata): 42, Configuration (org.apache.hadoop.conf.Configuration): 20, NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 20, ParseData (org.apache.nutch.parse.ParseData): 19, Content (org.apache.nutch.protocol.Content): 18, Test (org.junit.Test): 17, Text (org.apache.hadoop.io.Text): 16, Parse (org.apache.nutch.parse.Parse): 16, ParseImpl (org.apache.nutch.parse.ParseImpl): 15, CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 14, Inlinks (org.apache.nutch.crawl.Inlinks): 11, Outlink (org.apache.nutch.parse.Outlink): 10, ParseStatus (org.apache.nutch.parse.ParseStatus): 9, NutchDocument (org.apache.nutch.indexer.NutchDocument): 7, ParseResult (org.apache.nutch.parse.ParseResult): 7, FileInputStream (java.io.FileInputStream): 5, IOException (java.io.IOException): 5, File (java.io.File): 4, ArrayList (java.util.ArrayList): 4, ParseUtil (org.apache.nutch.parse.ParseUtil): 4