use of org.apache.tika.metadata.Metadata in project tika by apache.
the class ForkParserIntegrationTest method testParsingErrorInForkedParserShouldBeReported.
/**
 * TIKA-831 Parsers throwing errors should be caught and
 * properly reported.
 * <p>
 * Runs a {@code BrokenParser} through a {@link ForkParser} and verifies that
 * the {@link TikaException} raised by the parse carries
 * {@code brokenParser.err} as its cause. The non-serializable variant is
 * still disabled (see the TODO below).
 *
 * @throws Exception if the parse fails in a way the test does not expect,
 *                   or if closing the parser or the stream fails
 */
@Test
public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
    BrokenParser brokenParser = new BrokenParser();
    ForkParser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
    InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt");
    // With a serializable error, we'll get that back
    try {
        ContentHandler output = new BodyContentHandler();
        ParseContext context = new ParseContext();
        parser.parse(stream, output, new Metadata(), context);
        fail("Expected TikaException caused by Error");
    } catch (TikaException e) {
        assertEquals(brokenParser.err, e.getCause());
    } finally {
        // Release the parser first, then the test document, even if the
        // parser's close() throws.
        try {
            parser.close();
        } finally {
            if (stream != null) {
                stream.close();
            }
        }
    }
    // With a non serializable one, we'll get something else
    // TODO Fix this test
    // NOTE: when re-enabling the block below, open a fresh stream first; the
    // one used above has already been handed to parse() and closed.
    brokenParser = new BrokenParser();
    brokenParser.re = new WontBeSerializedError("Can't Serialize");
    parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
    try {
        // ContentHandler output = new BodyContentHandler();
        // ParseContext context = new ParseContext();
        // parser.parse(stream, output, new Metadata(), context);
        // fail("Expected TikaException caused by Error");
        // } catch (TikaException e) {
        // assertEquals(TikaException.class, e.getCause().getClass());
        // assertEquals("Bang!", e.getCause().getMessage());
    } finally {
        // The second parser was previously never closed.
        parser.close();
    }
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class ForkParserIntegrationTest method testParserHandlingOfNonSerializable.
/**
 * If we supply a non serializable object on the ParseContext,
 * check we get a helpful exception back.
 * <p>
 * An anonymous {@link Detector} is placed on the {@link ParseContext}; the
 * test then expects {@link ForkParser#parse} to fail with a
 * {@link TikaException} whose cause is a {@link NotSerializableException}
 * and whose message explains that the context could not be serialized.
 *
 * @throws Exception if the parse fails in a way the test does not expect,
 *                   or if closing the parser or the stream fails
 */
@Test
public void testParserHandlingOfNonSerializable() throws Exception {
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    ParseContext context = new ParseContext();
    context.set(Detector.class, new Detector() {
        @Override
        public MediaType detect(InputStream input, Metadata metadata) {
            return MediaType.OCTET_STREAM;
        }
    });
    // Declared outside the try so it can be closed in the finally block.
    InputStream stream = null;
    try {
        ContentHandler output = new BodyContentHandler();
        stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt");
        parser.parse(stream, output, new Metadata(), context);
        fail("Should have blown up with a non serializable ParseContext");
    } catch (TikaException e) {
        // Check the right details
        assertNotNull(e.getCause());
        assertEquals(NotSerializableException.class, e.getCause().getClass());
        assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage());
    } finally {
        // Release the parser first, then the test document, even if the
        // parser's close() throws.
        try {
            parser.close();
        } finally {
            if (stream != null) {
                stream.close();
            }
        }
    }
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class ForkParserIntegrationTest method testAttachingADebuggerOnTheForkedParserShouldWork.
/**
 * TIKA-832
 * <p>
 * Configures the {@link ForkParser} with a java command line that enables a
 * JDWP debug agent, parses the plain-text test document and checks that the
 * expected text is still extracted.
 *
 * @throws Exception if parsing fails, or if closing the parser or the stream
 *                   fails
 */
@Test
public void testAttachingADebuggerOnTheForkedParserShouldWork() throws Exception {
    ParseContext context = new ParseContext();
    context.set(Parser.class, tika.getParser());
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    // NOTE(review): the debug agent listens on the fixed port 54321; this
    // presumably fails if that port is already in use - confirm on CI.
    parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug", "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));
    // Declared outside the try so it can be closed in the finally block.
    InputStream stream = null;
    try {
        ContentHandler body = new BodyContentHandler();
        stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt");
        parser.parse(stream, body, new Metadata(), context);
        String content = body.toString();
        assertContains("Test d'indexation", content);
        assertContains("http://www.apache.org", content);
    } finally {
        // Release the parser first, then the test document, even if the
        // parser's close() throws.
        try {
            parser.close();
        } finally {
            if (stream != null) {
                stream.close();
            }
        }
    }
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class RecursiveParserWrapperTest method testMaxEmbedded.
/**
 * Checks how {@code RecursiveParserWrapper#setMaxEmbeddedResources} affects
 * the number of {@link Metadata} objects collected for
 * {@code test_recursive_embedded.docx}, in three scenarios: the default
 * setting, an explicit limit, and a negative limit.
 *
 * @throws Exception if parsing or closing a test stream fails
 */
@Test
public void testMaxEmbedded() throws Exception {
    int maxEmbedded = 4;
    //including outer container file
    int totalNoLimit = 12;
    ParseContext context = new ParseContext();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

    //test default
    List<Metadata> list = parseRecursiveEmbeddedDocx(wrapper, context);
    assertEquals(totalNoLimit, list.size());
    String limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertNull(limitReached);
    wrapper.reset();

    //test setting value
    wrapper.setMaxEmbeddedResources(maxEmbedded);
    list = parseRecursiveEmbeddedDocx(wrapper, context);
    //add 1 for outer container file
    assertEquals(maxEmbedded + 1, list.size());
    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertEquals("true", limitReached);
    wrapper.reset();

    //test setting value < 0
    wrapper.setMaxEmbeddedResources(-2);
    // Fetch the metadata list again after this parse instead of asserting on
    // the reference obtained for the previous scenario.
    list = parseRecursiveEmbeddedDocx(wrapper, context);
    assertEquals(totalNoLimit, list.size());
    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertNull(limitReached);
}

/**
 * Parses {@code /test-documents/test_recursive_embedded.docx} with the given
 * wrapper using a fresh {@link Metadata}, always closing the stream, and
 * returns the wrapper's metadata list for that parse.
 */
private List<Metadata> parseRecursiveEmbeddedDocx(RecursiveParserWrapper wrapper, ParseContext context) throws Exception {
    InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream("/test-documents/test_recursive_embedded.docx");
    try {
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    } finally {
        // Guard against a missing resource so a close() NPE cannot mask the
        // original parse failure.
        if (stream != null) {
            stream.close();
        }
    }
    return wrapper.getMetadata();
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class RecursiveParserWrapperTest method testPrimaryExcWEmbedded.
/**
 * If embedded content is handled and the parser then hits an exception in
 * the container document, the first element of the returned list must be
 * the container document and the second the embedded content.
 *
 * @throws Exception if parsing fails with anything other than the
 *                   {@link TikaException} this test handles
 */
@Test
public void testPrimaryExcWEmbedded() throws Exception {
    final String resourcePath = "/test-documents/mock/embedded_then_npe.xml";

    Parser autoDetect = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(autoDetect, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);
    ParseContext parseContext = new ParseContext();
    Metadata containerInput = new Metadata();
    containerInput.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");

    boolean sawNpe = false;
    InputStream in = null;
    try {
        in = RecursiveParserWrapperTest.class.getResourceAsStream(resourcePath);
        wrapper.parse(in, new DefaultHandler(), containerInput, parseContext);
    } catch (TikaException e) {
        // Only an exact NullPointerException cause counts.
        Throwable cause = e.getCause();
        sawNpe = cause.getClass().equals(NullPointerException.class);
    } finally {
        IOUtils.closeQuietly(in);
    }
    assertTrue("npe", sawNpe);

    List<Metadata> collected = wrapper.getMetadata();
    assertEquals(2, collected.size());

    // Container document comes first.
    Metadata container = collected.get(0);
    // Embedded document comes second.
    Metadata embedded = collected.get(1);

    assertContains("main_content", container.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embedded_then_npe.xml", container.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("Nikolai Lobachevsky", container.get("author"));

    assertContains("some_embedded_content", embedded.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embed1.xml", embedded.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("embeddedAuthor", embedded.get("author"));
}
Aggregations