use of org.xml.sax.helpers.DefaultHandler in project tika by apache.
the class RecursiveParserWrapperTest method testCharLimit.
/**
 * Parses the recursive-embedded test document with a TEXT handler factory
 * constructed with 60 as its second argument (presumably the write limit in
 * characters, per the test name) and checks that five metadata objects are
 * returned, exactly one of which is flagged with
 * {@code RecursiveParserWrapper.WRITE_LIMIT_REACHED}.
 *
 * @throws Exception if parsing fails
 */
@Test
public void testCharLimit() throws Exception {
    ParseContext context = new ParseContext();
    Metadata metadata = new Metadata();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
    InputStream stream = null;
    try {
        stream = RecursiveParserWrapperTest.class.getResourceAsStream("/test-documents/test_recursive_embedded.docx");
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    } finally {
        // release the resource stream even when parsing throws
        IOUtils.closeQuietly(stream);
    }
    List<Metadata> list = wrapper.getMetadata();
    assertEquals(5, list.size());
    int wlr = 0;
    for (Metadata m : list) {
        // constant-first equals is null-safe, so no separate null check is needed
        if ("true".equals(m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED))) {
            wlr++;
        }
    }
    assertEquals(1, wlr);
}
use of org.xml.sax.helpers.DefaultHandler in project tika by apache.
the class RecursiveParserWrapperTest method getMetadata.
/**
 * Runs a {@code RecursiveParserWrapper} over a test document and returns the
 * metadata list the wrapper collected.
 *
 * The document is looked up under {@code /test-documents/}; its name is read
 * from {@code Metadata.RESOURCE_NAME_KEY} in {@code metadata}, falling back to
 * {@code test_recursive_embedded.docx} when that key is not set.
 *
 * @param metadata metadata handed to the parse call; also supplies the resource name
 * @param contentHandlerFactory factory passed to the wrapper
 * @param catchEmbeddedExceptions flag passed through to the wrapper constructor
 * @param digester optional digester; when non-null the parser is wrapped in a {@code DigestingParser}
 * @return the list returned by {@code wrapper.getMetadata()} after parsing
 * @throws Exception if the resource cannot be resolved or parsing fails
 */
private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory, boolean catchEmbeddedExceptions, DigestingParser.Digester digester) throws Exception {
    ParseContext parseContext = new ParseContext();
    Parser autoDetect = new AutoDetectParser();
    Parser effectiveParser = (digester == null) ? autoDetect : new DigestingParser(autoDetect, digester);
    RecursiveParserWrapper recursiveWrapper = new RecursiveParserWrapper(effectiveParser, contentHandlerFactory, catchEmbeddedExceptions);

    String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    String resourcePath = (resourceName == null)
            ? "/test-documents/test_recursive_embedded.docx"
            : "/test-documents/" + resourceName;

    InputStream input = null;
    try {
        input = TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(resourcePath).toURI());
        recursiveWrapper.parse(input, new DefaultHandler(), metadata, parseContext);
    } finally {
        // closeQuietly accepts the null left behind when opening the stream failed
        IOUtils.closeQuietly(input);
    }
    return recursiveWrapper.getMetadata();
}
use of org.xml.sax.helpers.DefaultHandler in project tika by apache.
the class RecursiveParserWrapperTest method testMaxEmbedded.
/**
 * Exercises {@code RecursiveParserWrapper.setMaxEmbeddedResources} in three passes
 * over the same document: the default setting, an explicit limit of
 * {@code maxEmbedded}, and a negative value, which the assertions expect to
 * behave like "no limit".
 *
 * @throws Exception if parsing fails
 */
@Test
public void testMaxEmbedded() throws Exception {
    int maxEmbedded = 4;
    //including outer container file
    int totalNoLimit = 12;
    String docPath = "/test-documents/test_recursive_embedded.docx";
    ParseContext context = new ParseContext();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

    //test default
    InputStream stream = null;
    try {
        stream = RecursiveParserWrapperTest.class.getResourceAsStream(docPath);
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    } finally {
        // close in finally so a failing parse does not leak the stream
        IOUtils.closeQuietly(stream);
    }
    List<Metadata> list = wrapper.getMetadata();
    assertEquals(totalNoLimit, list.size());
    String limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertNull(limitReached);
    wrapper.reset();

    //test setting value
    wrapper.setMaxEmbeddedResources(maxEmbedded);
    stream = null;
    try {
        stream = RecursiveParserWrapperTest.class.getResourceAsStream(docPath);
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    } finally {
        IOUtils.closeQuietly(stream);
    }
    list = wrapper.getMetadata();
    //add 1 for outer container file
    assertEquals(maxEmbedded + 1, list.size());
    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertEquals("true", limitReached);
    wrapper.reset();

    //test setting value < 0
    wrapper.setMaxEmbeddedResources(-2);
    stream = null;
    try {
        stream = RecursiveParserWrapperTest.class.getResourceAsStream(docPath);
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    } finally {
        IOUtils.closeQuietly(stream);
    }
    // re-fetch the result of THIS parse; reusing the earlier reference would only
    // work if getMetadata() hands out a live list that survives reset()
    list = wrapper.getMetadata();
    assertEquals(totalNoLimit, list.size());
    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertNull(limitReached);
}
use of org.xml.sax.helpers.DefaultHandler in project tika by apache.
the class RecursiveParserWrapperTest method testPrimaryExcWEmbedded.
/**
 * Checks the ordering of results when embedded content is handled first and the
 * parser then hits an exception in the container document: the first element of
 * the returned list must be the container document and the second the embedded
 * content.
 *
 * @throws Exception if an unexpected (non-{@code TikaException}) failure occurs
 */
@Test
public void testPrimaryExcWEmbedded() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");
    ParseContext context = new ParseContext();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);
    String path = "/test-documents/mock/embedded_then_npe.xml";
    InputStream stream = null;
    boolean npe = false;
    try {
        stream = RecursiveParserWrapperTest.class.getResourceAsStream(path);
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    } catch (TikaException e) {
        // getCause() may be null; guard it so a cause-less TikaException fails the
        // "npe" assertion below instead of throwing a NullPointerException here
        Throwable cause = e.getCause();
        if (cause != null && NullPointerException.class.equals(cause.getClass())) {
            npe = true;
        }
    } finally {
        IOUtils.closeQuietly(stream);
    }
    assertTrue("npe", npe);
    List<Metadata> metadataList = wrapper.getMetadata();
    assertEquals(2, metadataList.size());
    // index 0 is expected to be the container, index 1 the embedded document
    Metadata outerMetadata = metadataList.get(0);
    Metadata embeddedMetadata = metadataList.get(1);
    assertContains("main_content", outerMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embedded_then_npe.xml", outerMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("Nikolai Lobachevsky", outerMetadata.get("author"));
    assertContains("some_embedded_content", embeddedMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embed1.xml", embeddedMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
}
use of org.xml.sax.helpers.DefaultHandler in project tika by apache.
the class UnpackerResource method process.
/**
 * Parses the request body and returns a map of file name to content.
 *
 * An embedded-document extractor built from a counter and the result map is
 * registered in the parse context (presumably it stores each embedded file in
 * the map and bumps the counter; its implementation is not shown here). When
 * {@code saveAll} is set, the text written by the content handler and a CSV
 * rendering of the metadata are added under {@code TEXT_FILENAME} and
 * {@code META_FILENAME}.
 *
 * @param is stream to parse
 * @param httpHeaders request headers, used to fill the metadata
 * @param info request URI info, used for logging and as the parse path
 * @param saveAll whether to also capture the text and metadata
 * @return the collected files
 * @throws WebApplicationException with {@code NO_CONTENT} when {@code saveAll}
 *         is false and the counter is still zero after parsing
 * @throws Exception if parsing or metadata serialization fails
 */
private Map<String, byte[]> process(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info, boolean saveAll) throws Exception {
    Metadata metadata = new Metadata();
    ParseContext pc = new ParseContext();

    Parser parser = TikaResource.createParser();
    if (parser instanceof DigestingParser) {
        //no need to digest for unwrapping
        parser = ((DigestingParser) parser).getWrappedParser();
    }
    TikaResource.fillMetadata(parser, metadata, pc, httpHeaders.getRequestHeaders());
    TikaResource.logRequest(LOG, info, metadata);

    ByteArrayOutputStream textBytes = new ByteArrayOutputStream();
    // text is only captured when everything is being saved; otherwise SAX events are discarded
    ContentHandler handler = saveAll
            ? new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(textBytes, UTF_8)))
            : new DefaultHandler();

    Map<String, byte[]> extracted = new HashMap<>();
    MutableInt embeddedCount = new MutableInt();
    pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(embeddedCount, extracted));
    TikaResource.parse(parser, LOG, info.getPath(), is, handler, metadata, pc);

    if (!saveAll) {
        if (embeddedCount.intValue() == 0) {
            throw new WebApplicationException(Response.Status.NO_CONTENT);
        }
        return extracted;
    }

    extracted.put(TEXT_FILENAME, textBytes.toByteArray());
    ByteArrayOutputStream metadataBytes = new ByteArrayOutputStream();
    metadataToCsv(metadata, metadataBytes);
    extracted.put(META_FILENAME, metadataBytes.toByteArray());
    return extracted;
}
Aggregations