Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class RelTagParser, method filter.
/**
 * Collects the rel-tags found in the given document fragment and appends each
 * of them, under the {@code REL_TAG} key, to the parse metadata of the entry
 * registered for the content's URL.
 *
 * @param content the content being filtered; only its URL is read here
 * @param parseResult the parse result holding the entry to enrich
 * @param metaTags not read by this method
 * @param doc the document fragment handed to the {@code Parser}
 * @return the {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse parse = parseResult.get(content.getUrl());
    // Extract the rel-tags before touching the metadata they are added to.
    Set<?> relTags = new Parser(doc).getRelTags();
    Iterator<?> cursor = relTags.iterator();
    Metadata parseMeta = parse.getData().getParseMeta();
    for (; cursor.hasNext(); ) {
        String relTag = (String) cursor.next();
        parseMeta.add(REL_TAG, relTag);
    }
    return parseResult;
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class MimeTypeIndexingFilter, method main.
/**
 * Command-line entry point for trying out a rules file.
 *
 * <p>Requires the {@code -rules <file>} option. Each non-empty line read from
 * standard input is used as a content type; the line is echoed back prefixed
 * with {@code "+ "} when the filter returns a document and with {@code "- "}
 * when it returns {@code null}. Reading stops at end of input or at the first
 * empty line.
 *
 * @param args command-line arguments ({@code -h/--help}, {@code -rules <file>})
 * @throws IOException declared; may be raised while reading standard input
 * @throws IndexingException declared; may be raised by the filter call
 */
public static void main(String[] args) throws IOException, IndexingException {
    final String toolName = "org.apache.nutch.indexer.filter.MimeTypeIndexingFilter";

    Option help = new Option("h", "help", false, "show this help message");
    Option rules = OptionBuilder.withArgName("file").hasArg().withDescription("Rules file to be used in the tests relative to the conf directory").isRequired().create("rules");
    Options options = new Options();
    options.addOption(help);
    options.addOption(rules);

    CommandLineParser cliParser = new GnuParser();
    HelpFormatter usage = new HelpFormatter();
    String rulesFile;
    try {
        CommandLine cli = cliParser.parse(options, args);
        boolean showHelp = cli.hasOption("help") || !cli.hasOption("rules");
        if (showHelp) {
            usage.printHelp(toolName, options, true);
            return;
        }
        rulesFile = cli.getOptionValue("rules");
    } catch (UnrecognizedOptionException e) {
        // Unknown switches are answered with the usage text only.
        usage.printHelp(toolName, options, true);
        return;
    } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
        return;
    }

    // Configure a filter instance with the rules file given on the command line.
    MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
    Configuration conf = NutchConfiguration.create();
    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
    filter.setConf(conf);

    BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
    for (String mimeType = stdin.readLine(); mimeType != null && !mimeType.isEmpty(); mimeType = stdin.readLine()) {
        Metadata parseMeta = new Metadata();
        parseMeta.set(Response.CONTENT_TYPE, mimeType);
        ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], parseMeta);
        ParseImpl parse = new ParseImpl("text", parseData);
        NutchDocument kept = filter.filter(new NutchDocument(), parse, new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
        // "+" marks a document the filter returned, "-" one it dropped (null).
        System.out.print(kept != null ? "+ " : "- ");
        System.out.println(mimeType);
    }
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class HtmlParser, method main.
/**
 * Parses the local HTML file named by the first command-line argument and
 * prints the resulting parse data and extracted text to standard output.
 *
 * @param args {@code args[0]} is the path of the HTML file to parse
 * @throws Exception declared; propagates whatever reading the file or
 *         parsing its content throws
 */
public static void main(String[] args) throws Exception {
    String name = args[0];
    String url = "file:" + name;
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    // try-with-resources closes the stream even if readFully throws;
    // the original never closed it.
    try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
        in.readFully(bytes);
    }
    Configuration conf = NutchConfiguration.create();
    HtmlParser parser = new HtmlParser();
    parser.setConf(conf);
    // The same "file:" URL is passed twice and is also the key used to look up the parse.
    Parse parse = parser.getParse(new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText());
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class MetaTagsParser, method filter.
/**
 * Passes every metatag candidate to {@code addIndexedMetatags}, drawing from
 * three sources in turn: the entries already present in the parse metadata,
 * the general meta tags, and the http-equiv tags.
 *
 * @param content the content being filtered; only its URL is read here
 * @param parseResult the parse result holding the entry for that URL
 * @param metaTags supplies the general and http-equiv tags
 * @param doc not read by this method
 * @return the {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    Metadata parseMeta = parseResult.get(content.getUrl()).getData().getParseMeta();

    // 1. Names already stored in the parse metadata (values might be there already).
    for (String existing : parseMeta.names()) {
        addIndexedMetatags(parseMeta, existing, parseMeta.getValues(existing));
    }

    // 2. General meta tags.
    Metadata general = metaTags.getGeneralTags();
    for (String generalName : general.names()) {
        addIndexedMetatags(parseMeta, generalName, general.getValues(generalName));
    }

    // 3. http-equiv tags, each contributing a single value.
    Properties equivTags = metaTags.getHttpEquivTags();
    Enumeration<?> equivNames = equivTags.propertyNames();
    while (equivNames.hasMoreElements()) {
        String equivName = (String) equivNames.nextElement();
        String equivValue = equivTags.getProperty(equivName);
        addIndexedMetatags(parseMeta, equivName, equivValue);
    }
    return parseResult;
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class TestMetatagParser, method parseMeta.
/**
 * Fetches the named sample file through its protocol, parses it and returns
 * the parse metadata of the entry registered for the content's URL.
 *
 * <p>Any exception is printed and reported through {@code Assert.fail}.
 *
 * @param fileName name of the file, resolved below {@code sampleDir}
 * @param conf configuration given to the protocol factory and parse utility
 * @return the parse metadata; {@code null} is returned only if the failure
 *         path falls through
 */
public Metadata parseMeta(String fileName, Configuration conf) {
    try {
        final String location = "file:" + sampleDir + fileSeparator + fileName;
        final Protocol fetcher = new ProtocolFactory(conf).getProtocol(location);
        final Content fetched = fetcher.getProtocolOutput(new Text(location), new CrawlDatum()).getContent();
        final Parse parsed = new ParseUtil(conf).parse(fetched).get(fetched.getUrl());
        return parsed.getData().getParseMeta();
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.toString());
    }
    return null;
}
Aggregations