use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.
the class RecursiveParserWrapperTest method testMaxEmbedded.
/**
 * Checks how {@code RecursiveParserWrapper} honours the embedded-resource limit
 * when parsing {@code test_recursive_embedded.docx}:
 * <ol>
 * <li>with no limit set, all 12 metadata objects (container included) are
 * returned and the limit-reached flag is absent;</li>
 * <li>with a limit of 4, the container plus 4 embedded metadata objects are
 * returned and the flag is {@code "true"};</li>
 * <li>with a negative limit, the result matches the unlimited case.</li>
 * </ol>
 * Each parse opens its own stream in a try-with-resources block so the stream
 * is closed even when the parse or an assertion fails.
 *
 * @throws Exception if the document cannot be read or parsed
 */
@Test
public void testMaxEmbedded() throws Exception {
    final String resource = "/test-documents/test_recursive_embedded.docx";
    int maxEmbedded = 4;
    //including outer container file
    int totalNoLimit = 12;
    ParseContext context = new ParseContext();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

    //test default
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(resource)) {
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    }
    List<Metadata> list = wrapper.getMetadata();
    assertEquals(totalNoLimit, list.size());
    String limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertNull(limitReached);
    wrapper.reset();

    //test setting value
    wrapper.setMaxEmbeddedResources(maxEmbedded);
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(resource)) {
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    }
    list = wrapper.getMetadata();
    //add 1 for outer container file
    assertEquals(maxEmbedded + 1, list.size());
    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertEquals("true", limitReached);
    wrapper.reset();

    //test setting value < 0
    wrapper.setMaxEmbeddedResources(-2);
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(resource)) {
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    }
    // Fetch the results of THIS parse explicitly; do not rely on the reference
    // obtained for the previous parse still reflecting the wrapper's state.
    list = wrapper.getMetadata();
    assertEquals(totalNoLimit, list.size());
    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertNull(limitReached);
    wrapper.reset();
}
use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.
the class RecursiveParserWrapperTest method testPrimaryExcWEmbedded.
/**
 * Scenario: the embedded content is handled first, and only afterwards does the
 * parser hit an exception in the container document. The test expects the
 * {@code TikaException} to be caused by a {@code NullPointerException}, and the
 * returned list to hold the container document's metadata first and the
 * embedded document's metadata second.
 *
 * @throws Exception on any failure other than the expected {@code TikaException}
 */
@Test
public void testPrimaryExcWEmbedded() throws Exception {
    final String resourcePath = "/test-documents/mock/embedded_then_npe.xml";

    Metadata containerInput = new Metadata();
    containerInput.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");

    RecursiveParserWrapper recursive = new RecursiveParserWrapper(
            new AutoDetectParser(),
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
            true);

    boolean sawNpe = false;
    InputStream in = null;
    try {
        in = RecursiveParserWrapperTest.class.getResourceAsStream(resourcePath);
        recursive.parse(in, new DefaultHandler(), containerInput, new ParseContext());
    } catch (TikaException ex) {
        // exact class match on the cause, not an instanceof check
        sawNpe = ex.getCause().getClass().equals(NullPointerException.class);
    } finally {
        IOUtils.closeQuietly(in);
    }
    assertTrue("npe", sawNpe);

    List<Metadata> parsed = recursive.getMetadata();
    assertEquals(2, parsed.size());

    // index 0: container document
    Metadata container = parsed.get(0);
    // index 1: embedded document
    Metadata embedded = parsed.get(1);

    assertContains("main_content", container.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embedded_then_npe.xml", container.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("Nikolai Lobachevsky", container.get("author"));

    assertContains("some_embedded_content", embedded.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embed1.xml", embedded.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("embeddedAuthor", embedded.get("author"));
}
use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.
the class TesseractOCRParserTest method runOCR.
/**
 * Parses {@code resource} through a {@code RecursiveParserWrapper} around an
 * {@code AutoDetectParser}, with inline image extraction enabled on the PDF
 * parser config, and checks the collected metadata.
 * <p>
 * Assertions made here: the number of metadata objects equals
 * {@code numMetadatas}; the concatenated content contains every string in
 * {@code nonOCRContains}; the first two metadata objects each carry more than
 * 10 names; and the second one reports {@code "deflate"} for
 * {@code "Compression CompressionTypeName"}.
 *
 * @param resource       classpath location of the document to parse
 * @param nonOCRContains strings that must appear in the concatenated content
 * @param numMetadatas   expected number of metadata objects
 * @param handlerType    content handler type used for the extracted content
 * @param outputType     output type set on the Tesseract config
 * @return the content of all metadata objects, concatenated in list order
 * @throws Exception if reading or parsing the resource fails
 */
private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, BasicContentHandlerFactory.HANDLER_TYPE handlerType, TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception {
    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    ocrConfig.setOutputType(outputType);

    PDFParserConfig pdfParserConfig = new PDFParserConfig();
    pdfParserConfig.setExtractInlineImages(true);

    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser(), new BasicContentHandlerFactory(handlerType, -1));

    ParseContext ctx = new ParseContext();
    ctx.set(TesseractOCRConfig.class, ocrConfig);
    // the wrapper itself is registered as the Parser in the context
    ctx.set(Parser.class, wrapper);
    ctx.set(PDFParserConfig.class, pdfParserConfig);

    try (InputStream in = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
        wrapper.parse(in, new DefaultHandler(), new Metadata(), ctx);
    }

    List<Metadata> collected = wrapper.getMetadata();
    assertEquals(numMetadatas, collected.size());

    StringBuilder buffer = new StringBuilder();
    for (int i = 0; i < collected.size(); i++) {
        buffer.append(collected.get(i).get(RecursiveParserWrapper.TIKA_CONTENT));
    }
    String allContent = buffer.toString();

    for (int i = 0; i < nonOCRContains.length; i++) {
        assertContains(nonOCRContains[i], allContent);
    }

    Metadata first = collected.get(0);
    Metadata second = collected.get(1);
    assertTrue(first.names().length > 10);
    assertTrue(second.names().length > 10);
    //test at least one value
    assertEquals("deflate", second.get("Compression CompressionTypeName"));
    return allContent;
}
use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.
the class RecursiveParserWrapperFSConsumerTest method testEmbeddedWithNPE.
/**
 * Feeds a single file resource ({@code embedded_with_npe.xml}) followed by a
 * poison resource to a {@code RecursiveParserWrapperFSConsumer}, then reads the
 * JSON written to the first mock output stream back into a metadata list.
 * <p>
 * Expected: 4 metadata objects; the one at index 2 records an embedded
 * exception mentioning "another null pointer"; the one at index 0 has author
 * "Nikolai Lobachevsky"; and the objects at indices 1..3 have the numbered
 * embedded author and content values.
 *
 * @throws Exception if the consumer or the JSON round-trip fails
 */
@Test
public void testEmbeddedWithNPE() throws Exception {
    final String path = "/test-documents/embedded_with_npe.xml";
    final Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "embedded_with_npe.xml");

    FileResource testResource = new FileResource() {
        @Override
        public String getResourceId() {
            return "testFile";
        }

        @Override
        public Metadata getMetadata() {
            return metadata;
        }

        @Override
        public InputStream openInputStream() throws IOException {
            return this.getClass().getResourceAsStream(path);
        }
    };

    // capacity 2: the test resource plus the poison marker
    ArrayBlockingQueue<FileResource> resources = new ArrayBlockingQueue<>(2);
    resources.add(testResource);
    resources.add(new PoisonFileResource());

    MockOSFactory outputs = new MockOSFactory();
    RecursiveParserWrapperFSConsumer fsConsumer = new RecursiveParserWrapperFSConsumer(
            resources,
            new AutoDetectParserFactory(),
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
            outputs,
            new TikaConfig());
    fsConsumer.call();

    outputs.getStreams().get(0).flush();
    byte[] json = outputs.getStreams().get(0).toByteArray();
    List<Metadata> parsed = JsonMetadataList.fromJson(new InputStreamReader(new ByteArrayInputStream(json), UTF_8));

    assertEquals(4, parsed.size());
    assertContains("another null pointer", parsed.get(2).get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
    assertEquals("Nikolai Lobachevsky", parsed.get(0).get("author"));
    for (int idx = 1; idx <= 3; idx++) {
        Metadata embedded = parsed.get(idx);
        assertEquals("embeddedAuthor" + idx, embedded.get("author"));
        assertContains("some_embedded_content" + idx, embedded.get(RecursiveParserWrapper.TIKA_CONTENT));
    }
}
use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.
the class ParsingExample method recursiveParserWrapperExample.
/**
 * Shows how to collect one metadata object for a container document and one
 * for each document embedded in it, so that both the extracted content and the
 * metadata of every embedded document are easy to reach.
 * <p>
 * Many document formats can contain embedded documents: traditional container
 * formats such as zip and tar, but also common office formats including
 * MSWord, MSExcel, MSPowerPoint, RTF, PDF, MSG and several others.
 * <p>
 * The format of the "content" is determined by the ContentHandlerFactory
 * (HTML in this example), and the content is stored under
 * {@link org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT}.
 * <p>
 * Drawback: the RecursiveParserWrapper caches metadata and contents in memory,
 * so it should not be used on files whose contents are too big to be handled
 * in memory.
 *
 * @return a list of metadata objects, one for the container file and one for
 *         each embedded file
 * @throws IOException   declared by the parse call and by closing the stream
 * @throws SAXException  declared by the parse call
 * @throws TikaException declared by the parse call
 */
public List<Metadata> recursiveParserWrapperExample() throws IOException, SAXException, TikaException {
    final String docName = "test_recursive_embedded.docx";

    ContentHandlerFactory htmlFactory = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
    RecursiveParserWrapper recursive = new RecursiveParserWrapper(new AutoDetectParser(), htmlFactory);

    Metadata containerMetadata = new Metadata();
    containerMetadata.set(Metadata.RESOURCE_NAME_KEY, docName);

    try (InputStream in = ParsingExample.class.getResourceAsStream(docName)) {
        recursive.parse(in, new DefaultHandler(), containerMetadata, new ParseContext());
    }
    return recursive.getMetadata();
}
Aggregations