use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
the class TestStaticFieldIndexerTest method setUp.
/**
 * Re-creates the fixture fields before every test: a default Nutch
 * configuration, an empty parse, a fixed URL, an empty crawl datum, empty
 * inlinks and a new {@code StaticFieldIndexer}.
 *
 * @throws Exception declared by the signature; nothing in this body throws
 *           a checked exception directly
 */
@Before
public void setUp() throws Exception {
  // Per-document inputs.
  url = new Text("http://nutch.apache.org/index.html");
  parse = new ParseImpl();
  crawlDatum = new CrawlDatum();
  inlinks = new Inlinks();

  // Configuration and the filter instance.
  conf = NutchConfiguration.create();
  filter = new StaticFieldIndexer();
}
use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
the class HttpBase method main.
/**
 * Command-line driver: fetches a single URL through the given protocol
 * implementation and prints the status and, when content was returned, its
 * content type, content length and body to standard output.
 *
 * <p>Usage: {@code Http [-verbose] [-timeout N] url}. The URL must be the
 * last argument. Any malformed command line (no arguments, {@code -timeout}
 * without a value, an unknown argument before the last position, or no URL
 * at all) prints the usage string to standard error and exits with -1.
 *
 * @param http the protocol instance used for the fetch; its {@code timeout}
 *          field is overwritten when {@code -timeout} is given
 * @param args command-line arguments as described above
 * @throws Exception if the timeout value is not an integer or the fetch
 *           fails
 */
protected static void main(HttpBase http, String[] args) throws Exception {
  String url = null;
  String usage = "Usage: Http [-verbose] [-timeout N] url";
  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }
  for (int i = 0; i < args.length; i++) {
    // parse command line
    if (args[i].equals("-timeout")) {
      // found -timeout option; it needs a value right after it
      if (++i >= args.length) {
        System.err.println(usage);
        System.exit(-1);
      }
      // N is multiplied by 1000 before being stored (presumably s -> ms)
      http.timeout = Integer.parseInt(args[i]) * 1000;
    } else if (args[i].equals("-verbose")) {
      // found -verbose option: accepted, but it has no effect in this method
    } else if (i != args.length - 1) {
      // anything else is only allowed in the last position (the URL)
      System.err.println(usage);
      System.exit(-1);
    } else {
      // url is required parameter
      url = args[i];
    }
  }
  if (url == null) {
    // the last argument was an option, so no URL was supplied
    System.err.println(usage);
    System.exit(-1);
  }
  ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum());
  Content content = out.getContent();
  System.out.println("Status: " + out.getStatus());
  if (content != null) {
    System.out.println("Content Type: " + content.getContentType());
    System.out.println("Content Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
    System.out.println("Content:");
    // NOTE(review): decodes with the platform default charset
    String text = new String(content.getContent());
    System.out.println(text);
  }
}
use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
the class MimeTypeIndexingFilter method main.
/**
 * Command-line entry point for trying out a rules file.
 *
 * <p>Parses the command line ({@code -h/--help} and the required
 * {@code -rules <file>} option), configures a
 * {@code MimeTypeIndexingFilter} with that rules file, and then reads lines
 * from standard input until end of input or an empty line. Each line is set
 * as the content type of a synthetic document that is passed through the
 * filter; the line is echoed prefixed with {@code "+ "} when the filter
 * returns a document and {@code "- "} when it returns {@code null}.
 *
 * @param args command-line arguments
 * @throws IOException if reading from standard input fails
 * @throws IndexingException declared by the signature; presumably raised by
 *           the filter call
 */
public static void main(String[] args) throws IOException, IndexingException {
  final String toolName = "org.apache.nutch.indexer.filter.MimeTypeIndexingFilter";

  Options options = new Options();
  options.addOption(new Option("h", "help", false, "show this help message"));
  options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("Rules file to be used in the tests relative to the conf directory").isRequired().create("rules"));

  HelpFormatter formatter = new HelpFormatter();
  String rulesFile;
  try {
    CommandLine cmd = new GnuParser().parse(options, args);
    if (cmd.hasOption("help") || !cmd.hasOption("rules")) {
      formatter.printHelp(toolName, options, true);
      return;
    }
    rulesFile = cmd.getOptionValue("rules");
  } catch (UnrecognizedOptionException e) {
    // Unknown option: show the usage text and stop.
    formatter.printHelp(toolName, options, true);
    return;
  } catch (Exception e) {
    // Any other parsing problem is logged and printed, then the tool stops.
    LOG.error(StringUtils.stringifyException(e));
    e.printStackTrace();
    return;
  }

  Configuration conf = NutchConfiguration.create();
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
  MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
  filter.setConf(conf);

  BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
  for (String mimeType = stdin.readLine(); mimeType != null && !mimeType.isEmpty(); mimeType = stdin.readLine()) {
    // Build a minimal parse whose only interesting field is the content type.
    Metadata metadata = new Metadata();
    metadata.set(Response.CONTENT_TYPE, mimeType);
    ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
    ParseImpl parse = new ParseImpl("text", parseData);

    NutchDocument result = filter.filter(new NutchDocument(), parse, new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

    // "+" when the filter returned a document, "-" when it returned null.
    System.out.print(result != null ? "+ " : "- ");
    System.out.println(mimeType);
  }
}
use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
the class TestMetatagParser method parseMeta.
/**
 * Fetches the named sample file through its protocol implementation, parses
 * it and returns the resulting parse metadata.
 *
 * <p>Any exception raised along the way is printed and turned into a test
 * failure via {@code Assert.fail}.
 *
 * @param fileName name of the file inside {@code sampleDir}
 * @param conf configuration used to build the protocol factory and parser
 * @return the parse metadata of the file
 */
public Metadata parseMeta(String fileName, Configuration conf) {
  try {
    String sampleUrl = "file:" + sampleDir + fileSeparator + fileName;

    // Fetch the raw content of the sample file.
    Protocol fetcher = new ProtocolFactory(conf).getProtocol(sampleUrl);
    ProtocolOutput fetched = fetcher.getProtocolOutput(new Text(sampleUrl), new CrawlDatum());
    Content content = fetched.getContent();

    // Parse it and pick the parse registered under the content's own URL.
    Parse parsed = new ParseUtil(conf).parse(content).get(content.getUrl());
    return parsed.getData().getParseMeta();
  } catch (Exception e) {
    e.printStackTrace();
    Assert.fail(e.toString());
    // Only here to satisfy the compiler after the failing assertion.
    return null;
  }
}
use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
the class TestSWFParser method testIt.
/**
 * Parses every file in {@code sampleFiles} and checks that the extracted
 * text, with each run of spaces, tabs and line breaks collapsed to a single
 * space and then trimmed, equals the matching entry of {@code sampleTexts}.
 *
 * @throws ProtocolException if no protocol can be obtained for a sample URL
 * @throws ParseException declared by the signature; presumably raised while
 *           parsing
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  // Both helpers depend only on the configuration, so build them once.
  ProtocolFactory protocolFactory = new ProtocolFactory(conf);
  ParseUtil parseUtil = new ParseUtil(conf);

  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = protocolFactory.getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = parseUtil.parse(content).get(content.getUrl());

    // Normalize whitespace so layout differences do not affect the comparison.
    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();

    // assertEquals reports expected vs. actual text when the check fails.
    Assert.assertEquals("Extracted text mismatch for " + sampleFiles[i], sampleTexts[i], text);
  }
}
Aggregations