Use of org.apache.nutch.metadata.Metadata in project nutch by apache.
The class ZipTextExtractor, method extractText:
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
  String resultText = "";
  ZipInputStream zin = new ZipInputStream(input);
  ZipEntry entry;
  while ((entry = zin.getNextEntry()) != null) {
    if (!entry.isDirectory()) {
      // Read the entry into a byte array. Note that ZipEntry.getSize()
      // returns -1 when the size is unknown; this code does not guard against that.
      int size = (int) entry.getSize();
      byte[] b = new byte[size];
      for (int x = 0; x < size; x++) {
        int err = zin.read();
        if (err != -1) {
          b[x] = (byte) err;
        }
      }
      // Derive a URL for the entry nested inside the archive.
      String newurl = url + "/";
      String fname = entry.getName();
      newurl += fname;
      URL aURL = new URL(newurl);
      String base = aURL.toString();
      int i = fname.lastIndexOf('.');
      if (i != -1) {
        // Resolve the MIME type from the file name.
        Tika tika = new Tika();
        String contentType = tika.detect(fname);
        try {
          Metadata metadata = new Metadata();
          metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
          metadata.set(Response.CONTENT_TYPE, contentType);
          Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
          Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
          ParseData theParseData = parse.getData();
          // Collect the outlinks found in the nested document.
          Outlink[] theOutlinks = theParseData.getOutlinks();
          for (int count = 0; count < theOutlinks.length; count++) {
            outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
          }
          // Append the entry name and its extracted text to the result.
          resultText += entry.getName() + " " + parse.getText() + " ";
        } catch (ParseException e) {
          if (LOG.isInfoEnabled()) {
            LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
          }
        }
      }
    }
  }
  return resultText;
}
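For orientation, a minimal usage sketch of this method follows. It is not taken from the Nutch source: the Configuration-based ZipTextExtractor constructor, the sample archive path, and the base URL are all assumptions.

// Hypothetical caller, for illustration only.
Configuration conf = NutchConfiguration.create();
ZipTextExtractor extractor = new ZipTextExtractor(conf); // assumed constructor
List<Outlink> outlinks = new ArrayList<Outlink>();
try (InputStream in = new FileInputStream("sample.zip")) { // assumed local archive
  String text = extractor.extractText(in, "http://example.com/sample.zip", outlinks);
  System.out.println("text: " + text);
  System.out.println("outlinks found: " + outlinks.size());
}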
Use of org.apache.nutch.metadata.Metadata in project nutch by apache.
The class TestRegexParseFilter, method testPositiveFilter:
public void testPositiveFilter() throws Exception {
  Configuration conf = NutchConfiguration.create();
  // Load the filter rules from the sample rule file.
  String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
  RegexParseFilter filter = new RegexParseFilter(file);
  filter.setConf(conf);
  String url = "http://nutch.apache.org/";
  String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>";
  Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
  Parse parse = new ParseImpl("nutch this is the extracted text blablabla", new ParseData());
  ParseResult result = ParseResult.createParseResult(url, parse);
  result = filter.filter(content, result, null, null);
  // Both rules match this document, so both flags are set to "true" in the parse metadata.
  Metadata meta = parse.getData().getParseMeta();
  assertEquals("true", meta.get("first"));
  assertEquals("true", meta.get("second"));
}
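Both this test and the negative test below read their rules from the sample regex-parsefilter.txt. The sample file itself is not part of this excerpt; assuming the plugin's name/source/regex rule format (where the source column selects either the raw HTML or the extracted text), it would look roughly like this:

# name    source  regex    (layout assumed, not copied from the sample file)
first     html    h1
second    text    blablabla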
Use of org.apache.nutch.metadata.Metadata in project nutch by apache.
The class TestRegexParseFilter, method testNegativeFilter:
public void testNegativeFilter() throws Exception {
  Configuration conf = NutchConfiguration.create();
  String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
  RegexParseFilter filter = new RegexParseFilter(file);
  filter.setConf(conf);
  String url = "http://nutch.apache.org/";
  String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>";
  Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
  Parse parse = new ParseImpl("nutch this is the extracted text bla", new ParseData());
  ParseResult result = ParseResult.createParseResult(url, parse);
  result = filter.filter(content, result, null, null);
  // Neither rule matches this document, so both flags come back "false".
  Metadata meta = parse.getData().getParseMeta();
  assertEquals("false", meta.get("first"));
  assertEquals("false", meta.get("second"));
}
Use of org.apache.nutch.metadata.Metadata in project nutch by apache.
The class TestFetcher, method testFetch:
@Test
public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
  // generate seed list
  ArrayList<String> urls = new ArrayList<String>();
  addUrl(urls, "index.html");
  addUrl(urls, "pagea.html");
  addUrl(urls, "pageb.html");
  addUrl(urls, "dup_of_pagea.html");
  addUrl(urls, "nested_spider_trap.html");
  addUrl(urls, "exception.html");
  CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
  // inject
  Injector injector = new Injector(conf);
  injector.inject(crawldbPath, urlPath);
  // generate
  Generator g = new Generator(conf);
  Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE, false, false);
  long time = System.currentTimeMillis();
  // fetch
  Fetcher fetcher = new Fetcher(conf);
  // set fetcher.parse to true so pages are parsed during the fetch
  conf.setBoolean("fetcher.parse", true);
  fetcher.fetch(generatedSegment[0], 1);
  time = System.currentTimeMillis() - time;
  // verify politeness: the elapsed time should exceed
  // (number_of_pages + 1) * fetcher.server.delay, converted to milliseconds
  int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat("fetcher.server.delay", 5));
  Assert.assertTrue(time > minimumTime);
  // verify content
  Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME), "part-r-00000/data");
  @SuppressWarnings("resource")
  SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
  ArrayList<String> handledurls = new ArrayList<String>();
  READ_CONTENT: do {
    Text key = new Text();
    Content value = new Content();
    if (!reader.next(key, value))
      break READ_CONTENT;
    String contentString = new String(value.getContent());
    if (contentString.indexOf("Nutch fetcher test page") != -1) {
      handledurls.add(key.toString());
    }
  } while (true);
  reader.close();
  Collections.sort(urls);
  Collections.sort(handledurls);
  // verify that enough pages were handled
  Assert.assertEquals(urls.size(), handledurls.size());
  // verify that the correct pages were handled
  Assert.assertTrue(handledurls.containsAll(urls));
  Assert.assertTrue(urls.containsAll(handledurls));
  handledurls.clear();
  // verify parse data
  Path parseData = new Path(new Path(generatedSegment[0], ParseData.DIR_NAME), "part-r-00000/data");
  reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
  READ_PARSE_DATA: do {
    Text key = new Text();
    ParseData value = new ParseData();
    if (!reader.next(key, value))
      break READ_PARSE_DATA;
    // make sure every record carries the "nutch.segment.name" and
    // "nutch.content.digest" keys in its content metadata
    Metadata contentMeta = value.getContentMeta();
    if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
      handledurls.add(key.toString());
    }
  } while (true);
  Collections.sort(handledurls);
  Assert.assertEquals(urls.size(), handledurls.size());
  Assert.assertTrue(handledurls.containsAll(urls));
  Assert.assertTrue(urls.containsAll(handledurls));
}
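The addUrl helper and the fs, conf, urlPath, crawldbPath, and segmentsPath fields are fixtures of the surrounding test class and are not shown in this excerpt. As a rough sketch, assuming the test serves its pages from a local HTTP server whose port the fixture exposes (the port field is an assumption):

// Hypothetical sketch; the real helper points URLs at the local test server.
private void addUrl(ArrayList<String> urls, String page) {
  urls.add("http://127.0.0.1:" + port + "/" + page); // 'port' is an assumed fixture field
}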
Use of org.apache.nutch.metadata.Metadata in project nutch by apache.
The class TestIndexingFilters, method testNonExistingIndexingFilter:
/**
 * Test behaviour when a defined filter does not exist.
 *
 * @throws IndexingException
 */
@Test
public void testNonExistingIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");
  // Put a non-existing filter class first in the filter order.
  String class1 = "NonExistingFilter";
  String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
  IndexingFilters filters = new IndexingFilters(conf);
  filters.filter(new NutchDocument(),
      new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())),
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
}
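Note that this test makes no explicit assertions: it passes as long as filter() returns without throwing, which indicates that an unresolvable class name in the filter order does not abort filtering of the remaining filters.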