use of org.apache.nutch.parse.ParseUtil in project nutch by apache.
the class TestAny23ParseFilter method extract.
/**
 * Fetches the document behind {@code urlString}, overrides its content type, parses it and
 * returns the values stored under {@link Any23ParseFilter#ANY23_TRIPLES} in the parse metadata.
 * Any exception raised along the way is printed and reported through {@code Assert.fail}.
 *
 * @param urlString URL handed to the protocol lookup and used as the fetch key
 * @param file not read by this method
 * @param contentType content type set on the fetched content before it is parsed
 * @return the {@code ANY23_TRIPLES} metadata values, or {@code null} if the failure path
 *         returns without throwing
 */
public String[] extract(String urlString, File file, String contentType) {
  String[] triples = null;
  try {
    System.out.println(urlString);
    ProtocolFactory protocols = new ProtocolFactory(conf);
    Protocol fetcher = protocols.getProtocol(urlString);
    Content fetched = fetcher.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    fetched.setContentType(contentType);
    ParseUtil parser = new ParseUtil(conf);
    Parse parsed = parser.parse(fetched).get(fetched.getUrl());
    triples = parsed.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
  } catch (Exception e) {
    e.printStackTrace();
    Assert.fail(e.toString());
  }
  return triples;
}
use of org.apache.nutch.parse.ParseUtil in project nutch by apache.
the class TestCCParseFilter method pageTest.
/**
 * Loads {@code file} into memory, parses it as {@code text/html} and asserts the license
 * metadata found in the parse result.
 *
 * @param file document to read and parse
 * @param url URL used both as the content URL and as its base
 * @param license expected value of the {@code License-Url} metadata entry
 * @param location expected value of the {@code License-Location} metadata entry
 * @param type expected value of the {@code Work-Type} metadata entry
 * @throws Exception if the file cannot be read or the content cannot be parsed
 */
public void pageTest(File file, String url, String license, String location, String type) throws Exception {
  String contentType = "text/html";
  ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
  // try-with-resources closes the file even when a read or write throws,
  // which the previous explicit close() after the loop did not guarantee.
  try (InputStream in = new FileInputStream(file)) {
    byte[] buffer = new byte[1024];
    int read;
    while ((read = in.read(buffer)) != -1) {
      out.write(buffer, 0, read);
    }
  }
  byte[] bytes = out.toByteArray();
  Configuration conf = NutchConfiguration.create();
  Content content = new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
  Metadata metadata = parse.getData().getParseMeta();
  Assert.assertEquals(license, metadata.get("License-Url"));
  Assert.assertEquals(location, metadata.get("License-Location"));
  Assert.assertEquals(type, metadata.get("Work-Type"));
}
use of org.apache.nutch.parse.ParseUtil in project nutch by apache.
the class TestExtParser method testIt.
/**
 * Invokes the {@code parse-ext} extension ten times for each of two example content types
 * and checks the parsed text against {@code expectedText} and {@code expectedMD5sum}.
 * Returns without running anything when the OS name is not "linux".
 *
 * @throws ParseException declared for the parse calls
 */
@Test
public void testIt() throws ParseException {
  // the external commands are only exercised on linux
  final String osName = System.getProperty("os.name");
  if (!osName.equalsIgnoreCase("linux")) {
    System.err.println("Current OS is " + osName + ".");
    System.err.println("No test is run on OS other than linux.");
    return;
  }
  final String catType = "application/vnd.nutch.example.cat";
  final String md5sumType = "application/vnd.nutch.example.md5sum";
  Configuration conf = NutchConfiguration.create();
  // alternate between the two parsers: 10 rounds, 2 external invocations per round
  int round = 0;
  while (round < 10) {
    // external parser backed by 'cat'
    content.setContentType(catType);
    parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl());
    Assert.assertEquals(expectedText, parse.getText());
    // external parser backed by 'md5sum'
    content.setContentType(md5sumType);
    parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl());
    Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
    round++;
  }
}
use of org.apache.nutch.parse.ParseUtil in project nutch by apache.
the class TestRTFParser method testIt.
/**
 * Fetches the sample RTF file through its {@code file:} URL, parses it with the
 * {@code parse-tika} extension and checks the extracted text, title and subject metadata.
 * Currently disabled via {@code @Ignore}.
 *
 * @throws ProtocolException declared for the protocol lookup/fetch
 * @throws ParseException declared for the parse call
 */
@Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  String sampleUrl = "file:" + sampleDir + fileSeparator + rtfFile;
  Protocol fetcher = new ProtocolFactory(conf).getProtocol(sampleUrl);
  Content fetched = fetcher.getProtocolOutput(new Text(sampleUrl), new CrawlDatum()).getContent();
  Parse parsed = new ParseUtil(conf).parseByExtensionId("parse-tika", fetched).get(fetched.getUrl());

  // body text first, so a text mismatch is reported before any metadata is examined
  String bodyText = parsed.getText();
  Assert.assertEquals("The quick brown fox jumps over the lazy dog", bodyText.trim());

  String docTitle = parsed.getData().getTitle();
  Metadata parseMeta = parsed.getData().getParseMeta();
  Assert.assertEquals("test rft document", docTitle);
  Assert.assertEquals("tests", parseMeta.get(DublinCore.SUBJECT));
}
use of org.apache.nutch.parse.ParseUtil in project nutch by apache.
the class ZipTextExtractor method extractText.
/**
 * Walks every entry of a zip stream, parses each file entry whose name contains a '.' and
 * concatenates {@code "<entry name> <parsed text> "} for all of them. Outlinks found while
 * parsing are appended to {@code outLinksList}.
 *
 * <p>Entry bytes are read until the end of the entry rather than sized from
 * {@link ZipEntry#getSize()}, because that method returns -1 when the size is unknown.
 * The {@code Content-Length} metadata therefore reflects the number of bytes actually read.
 *
 * @param input stream positioned at the start of the zip data; it is not closed here
 * @param url URL of the archive; each entry is addressed as {@code url + "/" + entryName}
 * @param outLinksList receives a copy of every outlink produced by the entry parsers
 * @return the concatenated entry names and parsed texts, or an empty string if nothing was parsed
 * @throws IOException if the zip stream cannot be read or an entry URL is malformed
 */
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
  StringBuilder resultText = new StringBuilder();
  // Deliberately not closed: closing the wrapper would also close the caller-owned input.
  ZipInputStream zin = new ZipInputStream(input);
  // One detector serves all entries; it is only asked to guess a type from the file name.
  Tika tika = new Tika();
  ZipEntry entry;
  while ((entry = zin.getNextEntry()) != null) {
    if (entry.isDirectory()) {
      continue;
    }
    byte[] b = readEntryBytes(zin);
    String fname = entry.getName();
    String newurl = url + "/" + fname;
    // Constructed for every file entry so a malformed URL is reported regardless of extension.
    URL aURL = new URL(newurl);
    String base = aURL.toString();
    if (fname.lastIndexOf('.') == -1) {
      // entries without an extension are not parsed
      continue;
    }
    // Trying to resolve the Mime-Type
    String contentType = tika.detect(fname);
    try {
      Metadata metadata = new Metadata();
      metadata.set(Response.CONTENT_LENGTH, Long.toString(b.length));
      metadata.set(Response.CONTENT_TYPE, contentType);
      Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
      Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
      ParseData theParseData = parse.getData();
      for (Outlink link : theParseData.getOutlinks()) {
        outLinksList.add(new Outlink(link.getToUrl(), link.getAnchor()));
      }
      resultText.append(fname).append(" ").append(parse.getText()).append(" ");
    } catch (ParseException e) {
      if (LOG.isInfoEnabled()) {
        LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
      }
    }
  }
  return resultText.toString();
}

/** Reads the remaining bytes of the current zip entry (the stream reports -1 at entry end). */
private static byte[] readEntryBytes(InputStream in) throws IOException {
  java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
  byte[] chunk = new byte[8192];
  int n;
  while ((n = in.read(chunk)) != -1) {
    collected.write(chunk, 0, n);
  }
  return collected.toByteArray();
}
Aggregations