Example usage of org.apache.tika.fork.ForkParser in the Apache Tika project.
Class ForkParserIntegrationTest, method testParsingErrorInForkedParserShouldBeReported.
/**
 * TIKA-831 Parsers throwing errors should be caught and
 * properly reported.
 *
 * <p>Runs a {@code BrokenParser} through a {@link ForkParser} and checks that
 * the resulting {@link TikaException} carries the parser's own error as its
 * cause. The second half (an error that cannot be serialized) is still
 * disabled; see the TODO below.
 *
 * @throws Exception if the test document cannot be read or a parser cannot be closed
 */
@Test
public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
    BrokenParser brokenParser = new BrokenParser();
    ForkParser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
    // try-with-resources: the test document stream is released on every exit path
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt")) {
        // With a serializable error, we'll get that back
        try {
            ContentHandler output = new BodyContentHandler();
            ParseContext context = new ParseContext();
            parser.parse(stream, output, new Metadata(), context);
            fail("Expected TikaException caused by Error");
        } catch (TikaException e) {
            assertEquals(brokenParser.err, e.getCause());
        } finally {
            parser.close();
        }

        // With a non serializable one, we'll get something else
        brokenParser = new BrokenParser();
        brokenParser.re = new WontBeSerializedError("Can't Serialize");
        parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
        try {
            // TODO Fix this test
            // try {
            // ContentHandler output = new BodyContentHandler();
            // ParseContext context = new ParseContext();
            // parser.parse(stream, output, new Metadata(), context);
            // fail("Expected TikaException caused by Error");
            // } catch (TikaException e) {
            // assertEquals(TikaException.class, e.getCause().getClass());
            // assertEquals("Bang!", e.getCause().getMessage());
            // }
        } finally {
            // Close the second parser too, even while its assertions are disabled
            parser.close();
        }
    }
}
Example usage of org.apache.tika.fork.ForkParser in the Apache Tika project.
Class ForkParserIntegrationTest, method testParserHandlingOfNonSerializable.
/**
 * If we supply a non serializable object on the ParseContext,
 * check we get a helpful exception back.
 *
 * <p>The expected outcome is a {@link TikaException} with a fixed message
 * whose cause is a {@link NotSerializableException}.
 *
 * @throws Exception if the test document cannot be read or the parser cannot be closed
 */
@Test
public void testParserHandlingOfNonSerializable() throws Exception {
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    ParseContext context = new ParseContext();
    // This anonymous Detector is the context entry the test expects to fail serialization
    context.set(Detector.class, new Detector() {
        @Override
        public MediaType detect(InputStream input, Metadata metadata) {
            return MediaType.OCTET_STREAM;
        }
    });
    // try-with-resources: the test document stream is closed whether or not parse() throws
    try (InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt")) {
        ContentHandler output = new BodyContentHandler();
        parser.parse(stream, output, new Metadata(), context);
        fail("Should have blown up with a non serializable ParseContext");
    } catch (TikaException e) {
        // Check the right details
        assertNotNull(e.getCause());
        assertEquals(NotSerializableException.class, e.getCause().getClass());
        assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage());
    } finally {
        parser.close();
    }
}
Example usage of org.apache.tika.fork.ForkParser in the Apache Tika project.
Class ForkParserIntegrationTest, method testAttachingADebuggerOnTheForkedParserShouldWork.
/**
 * TIKA-832
 *
 * <p>Starts the forked parser with a custom java command that includes JDWP
 * debug options, then checks that the test document is still parsed and
 * yields the expected text.
 *
 * @throws Exception if parsing fails or the parser cannot be closed
 */
@Test
public void testAttachingADebuggerOnTheForkedParserShouldWork() throws Exception {
    ParseContext context = new ParseContext();
    context.set(Parser.class, tika.getParser());
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    // NOTE(review): the debug port 54321 is hard-coded; presumably a clash with another
    // process using that port could break this test - confirm before running in parallel.
    parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug", "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));
    // try-with-resources: the test document stream is closed on every exit path
    try (InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt")) {
        ContentHandler body = new BodyContentHandler();
        parser.parse(stream, body, new Metadata(), context);
        String content = body.toString();
        assertContains("Test d'indexation", content);
        assertContains("http://www.apache.org", content);
    } finally {
        parser.close();
    }
}
Example usage of org.apache.tika.fork.ForkParser in the Apache Tika project.
Class BundleIT, method testForkParser.
/**
 * Parses a small in-memory HTML document through a {@link ForkParser} built
 * on the bundle's class loader and checks the extracted body text.
 *
 * @throws Exception if detection or parsing fails
 */
@Test
public void testForkParser() throws Exception {
    ForkParser parser = new ForkParser(Activator.class.getClassLoader(), defaultParser);
    try {
        String data = "<!DOCTYPE html>\n<html><body><p>test <span>content</span></p></body></html>";
        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
        Writer writer = new StringWriter();
        ContentHandler contentHandler = new BodyContentHandler(writer);
        Metadata metadata = new Metadata();
        MediaType type = contentTypeDetector.detect(stream, metadata);
        // Expected value goes first so a failure message reports the values the right way round
        assertEquals("text/html", type.toString());
        metadata.add(Metadata.CONTENT_TYPE, type.toString());
        ParseContext parseCtx = new ParseContext();
        parser.parse(stream, contentHandler, metadata, parseCtx);
        writer.flush();
        String content = writer.toString();
        assertTrue(content.length() > 0);
        assertEquals("test content", content.trim());
    } finally {
        // Close the fork parser once the test is done, pass or fail
        parser.close();
    }
}
Example usage of org.apache.tika.fork.ForkParser in the Apache Jackrabbit project.
Class SearchIndex, method createParser.
/**
 * Builds the Tika parser used by this search index.
 *
 * <p>The Tika configuration is looked up in this order: the file at
 * {@code tikaConfigPath} if it exists, otherwise a class path resource of
 * that name, then the bundled {@code tika-config.xml}, and finally Tika's
 * default configuration. If {@code forkJavaCommand} is set, the parser is
 * wrapped in a {@link ForkParser} using that command and
 * {@code extractorPoolSize}.
 *
 * @return a {@link ForkParser} when a fork command is configured, otherwise
 *         a plain {@link AutoDetectParser}
 */
private Parser createParser() {
    // Step 1: locate a configuration URL, if any
    URL configUrl = null;
    if (tikaConfigPath != null) {
        File configFile = new File(tikaConfigPath);
        if (!configFile.exists()) {
            // Not a file on disk, so try it as a class path resource
            configUrl = SearchIndex.class.getClassLoader().getResource(tikaConfigPath);
        } else {
            try {
                configUrl = configFile.toURI().toURL();
            } catch (MalformedURLException e) {
                log.warn("Invalid Tika configuration path: " + configFile, e);
            }
        }
    }
    if (configUrl == null) {
        configUrl = SearchIndex.class.getResource("tika-config.xml");
    }

    // Step 2: load the configuration, falling back to Tika's defaults
    TikaConfig tikaConfig = null;
    if (configUrl != null) {
        try {
            tikaConfig = new TikaConfig(configUrl);
        } catch (Exception e) {
            log.warn("Tika configuration not available: " + configUrl, e);
        }
    }
    if (tikaConfig == null) {
        tikaConfig = TikaConfig.getDefaultConfig();
    }

    // Step 3: in-process parser unless a fork command has been configured
    if (forkJavaCommand == null) {
        return new AutoDetectParser(tikaConfig);
    }
    ClassLoader loader = SearchIndex.class.getClassLoader();
    ForkParser forked = new ForkParser(loader, new AutoDetectParser(tikaConfig));
    forked.setJavaCommand(forkJavaCommand);
    forked.setPoolSize(extractorPoolSize);
    return forked;
}
Aggregations