use of org.apache.nutch.parse.ParseUtil in project nutch by apache.
the class TestZipParser method testIt.
/**
 * Fetches every entry of sampleFiles through the protocol resolved for its
 * "file:" URL, parses the content with the "parse-zip" extension and checks
 * that the extracted text begins with expectedText.
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();
  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    // Retrieve the raw content of the sample file.
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    final Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

    // Parse explicitly with the zip parser and pick the parse for the content's own URL.
    final Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-zip", content).get(content.getUrl());

    final String extracted = parse.getText();
    final String failureMessage =
        "Extracted text does not start with <" + expectedText + ">: <" + extracted + ">";
    Assert.assertTrue(failureMessage, extracted.startsWith(expectedText));
  }
}
use of org.apache.nutch.parse.ParseUtil in project nutch by apache.
the class TestIndexReplace method parseAndFilterFile.
/**
 * Run a test file through the Nutch parser and index filters.
 *
 * <p>The file is fetched via the protocol resolved for its "file:" URL, parsed
 * with {@code ParseUtil}, and the resulting parse is passed through the basic,
 * metadata and replace indexing filters, in that order. Any exception raised
 * along the way is printed and turned into a test failure.
 *
 * @param fileName
 *          name of the sample file, resolved against sampleDir
 * @param conf
 *          configuration applied to the three indexing filters, the protocol
 *          factory and the parser
 * @return the Nutch document with the replace indexer applied
 */
public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
  NutchDocument doc = new NutchDocument();

  BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
  basicIndexer.setConf(conf);
  Assert.assertNotNull(basicIndexer);

  MetadataIndexer metaIndexer = new MetadataIndexer();
  metaIndexer.setConf(conf);
  // Check the metadata indexer itself; the original re-checked basicIndexer here.
  Assert.assertNotNull(metaIndexer);

  ReplaceIndexer replaceIndexer = new ReplaceIndexer();
  replaceIndexer.setConf(conf);
  Assert.assertNotNull(replaceIndexer);

  try {
    String urlString = "file:" + sampleDir + fileSeparator + fileName;
    Text text = new Text(urlString);
    CrawlDatum crawlDatum = new CrawlDatum();
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(text, crawlDatum).getContent();
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    // The fetch time is set to a fixed value before the filters run.
    crawlDatum.setFetchTime(100L);
    Inlinks inlinks = new Inlinks();
    // Each filter receives the document returned by the previous one.
    doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
    doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
    doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
  } catch (Exception e) {
    e.printStackTrace();
    Assert.fail(e.toString());
  }
  return doc;
}
use of org.apache.nutch.parse.ParseUtil in project nutch by apache.
the class TestHTMLLanguageParser method testMetaHTMLParsing.
/**
 * Test parsing of language identifiers from html: every entry of docs is
 * parsed and the Metadata.LANGUAGE value found in its parse metadata is
 * compared with the entry of metalanguages at the same index.
 */
@Test
public void testMetaHTMLParsing() {
  try {
    final ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());
    // Walk the test documents and validate the language reported for each one.
    for (int docIndex = 0; docIndex < docs.length; docIndex++) {
      final Content content = getContent(docs[docIndex]);
      final Parse parse = parseUtil.parse(content).get(content.getUrl());
      final String detected =
          (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE);
      Assert.assertEquals(metalanguages[docIndex], detected);
    }
  } catch (Exception failure) {
    // Exceptions are reported on standard output and fail the test.
    failure.printStackTrace(System.out);
    Assert.fail(failure.toString());
  }
}
use of org.apache.nutch.parse.ParseUtil in project nutch by apache.
the class TestMetatagParser method parseMeta.
/**
 * Fetches the named sample file, parses it and returns the parse metadata.
 * Any exception is printed and reported as a test failure.
 *
 * @param fileName name of the sample file, resolved against sampleDir
 * @param conf configuration used for the protocol factory and the parser
 * @return the parse metadata of the parsed file
 */
public Metadata parseMeta(String fileName, Configuration conf) {
  try {
    final String urlString = "file:" + sampleDir + fileSeparator + fileName;
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    final Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    final Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    return parse.getData().getParseMeta();
  } catch (Exception failure) {
    failure.printStackTrace();
    Assert.fail(failure.toString());
  }
  // Only reached if Assert.fail returns normally; mirrors the original's null default.
  return null;
}
use of org.apache.nutch.parse.ParseUtil in project nutch by apache.
the class TestSWFParser method testIt.
/**
 * Fetches every entry of sampleFiles through the protocol resolved for its
 * "file:" URL, parses it with {@code ParseUtil} and compares the
 * whitespace-normalized text with the entry of sampleTexts at the same index.
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

    // Collapse each run of spaces, tabs and line breaks into one space before comparing.
    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();

    // assertEquals reports expected and actual text on failure, unlike assertTrue(equals).
    Assert.assertEquals("Unexpected text extracted from " + urlString, sampleTexts[i], text);
  }
}
Aggregations