Use of org.apache.tika.parser.RecursiveParserWrapper in the Apache Tika project.
Class TikaTest, method getRecursiveMetadata.
/**
 * Parses a test document through a {@link RecursiveParserWrapper} built around an
 * {@link AutoDetectParser} and returns the metadata list the wrapper collected.
 * Content handlers are created by a {@link BasicContentHandlerFactory} of type XML.
 *
 * @param filePath path of the document, appended to {@code /test-documents/}
 * @param context  parse context passed through to the wrapper
 * @return the list returned by the wrapper's {@code getMetadata()} after the parse
 * @throws Exception if opening, parsing or closing the document fails
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    BasicContentHandlerFactory xmlHandlers =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapper recursive = new RecursiveParserWrapper(new AutoDetectParser(), xmlHandlers);

    String resourcePath = "/test-documents/" + filePath;
    try (InputStream in = getResourceAsStream(resourcePath)) {
        // The wrapper gathers content itself, so the handler passed here is a no-op.
        recursive.parse(in, new DefaultHandler(), new Metadata(), context);
    }
    return recursive.getMetadata();
}
Use of org.apache.tika.parser.RecursiveParserWrapper in the Apache Tika project.
Class TesseractOCRParserTest, method runOCR.
/**
 * Parses a resource through a {@link RecursiveParserWrapper} around an
 * {@link AutoDetectParser}, with inline-image extraction switched on in the PDF
 * config, asserts on the collected metadata, and returns the concatenated content.
 *
 * @param resource       resource name, resolved against {@code TesseractOCRParserTest.class}
 * @param nonOCRContains strings that must each appear in the concatenated content
 * @param numMetadatas   expected number of metadata entries
 * @param handlerType    handler type given to the content handler factory
 * @param outputType     output type set on the Tesseract config
 * @return the {@code TIKA_CONTENT} value of every metadata entry, concatenated in list order
 * @throws Exception if opening, parsing or closing the resource fails
 */
private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, BasicContentHandlerFactory.HANDLER_TYPE handlerType, TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception {
    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    ocrConfig.setOutputType(outputType);

    // Declared as the wrapper type so the metadata can be read back without a cast.
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(
            new AutoDetectParser(), new BasicContentHandlerFactory(handlerType, -1));

    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);

    ParseContext parseContext = new ParseContext();
    parseContext.set(TesseractOCRConfig.class, ocrConfig);
    parseContext.set(Parser.class, wrapper);
    parseContext.set(PDFParserConfig.class, pdfConfig);

    try (InputStream in = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
        wrapper.parse(in, new DefaultHandler(), new Metadata(), parseContext);
    }

    List<Metadata> collected = wrapper.getMetadata();
    assertEquals(numMetadatas, collected.size());

    // Join the content of every entry; a missing value is appended as "null".
    StringBuilder joined = new StringBuilder();
    for (int i = 0; i < collected.size(); i++) {
        joined.append(collected.get(i).get(RecursiveParserWrapper.TIKA_CONTENT));
    }
    String allContent = joined.toString();

    for (int i = 0; i < nonOCRContains.length; i++) {
        assertContains(nonOCRContains[i], allContent);
    }

    assertTrue(collected.get(0).names().length > 10);
    Metadata firstEmbedded = collected.get(1);
    assertTrue(firstEmbedded.names().length > 10);
    //test at least one value
    assertEquals("deflate", firstEmbedded.get("Compression CompressionTypeName"));
    return allContent;
}
Use of org.apache.tika.parser.RecursiveParserWrapper in the Apache Tika project.
Class RecursiveParserWrapperFSConsumer, method processFileResource.
/**
 * Parses one file resource with a {@link RecursiveParserWrapper} and writes the
 * collected metadata list to the resource's output stream as JSON.
 * <p>
 * If the parse throws, the filtered stack trace is recorded on the first metadata
 * entry (or on the container metadata when the wrapper collected nothing) and the
 * JSON is still written before the outcome is reported.
 *
 * @param fileResource the resource to parse
 * @return {@code true} if the parse completed without throwing; {@code false} if the
 *         output or input stream could not be opened, or if the parse threw
 *         anything other than an {@link Error}
 * @throws RuntimeException if writing the JSON fails
 * @throws Error if the parse threw an {@link Error}; rethrown after the JSON is written
 */
@Override
public boolean processFileResource(FileResource fileResource) {
    Parser wrapped = parserFactory.getParser(tikaConfig);
    RecursiveParserWrapper parser = new RecursiveParserWrapper(wrapped, contentHandlerFactory);
    ParseContext context = new ParseContext();
    context.set(Parser.class, parser);

    //try to open outputstream first
    OutputStream os = getOutputStream(fsOSFactory, fileResource);
    if (os == null) {
        LOG.debug("Skipping: {}", fileResource.getMetadata().get(FSProperties.FS_REL_PATH));
        return false;
    }

    //try to open the inputstream before the parse.
    //if the parse hangs or throws a nasty exception, at least there will
    //be a zero byte file there so that the batchrunner can skip that problematic
    //file during the next run.
    InputStream is = getInputStream(fileResource);
    if (is == null) {
        IOUtils.closeQuietly(os);
        return false;
    }

    Throwable thrown = null;
    List<Metadata> metadataList = null;
    Metadata containerMetadata = fileResource.getMetadata();
    try {
        parse(fileResource.getResourceId(), parser, is, new DefaultHandler(), containerMetadata, context);
        metadataList = parser.getMetadata();
    } catch (Throwable t) {
        thrown = t;
        // Keep whatever the wrapper gathered before the failure.
        metadataList = parser.getMetadata();
        if (metadataList == null) {
            metadataList = new LinkedList<>();
        }
        // The stack trace goes on the top (container) entry; fall back to the
        // resource's own metadata when nothing was collected.
        Metadata m;
        if (metadataList.isEmpty()) {
            m = containerMetadata;
            metadataList.add(m);
        } else {
            m = metadataList.get(0);
        }
        String stackTrace = ExceptionUtils.getFilteredStackTrace(t);
        m.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime", stackTrace);
    } finally {
        IOUtils.closeQuietly(is);
    }

    Writer writer = null;
    try {
        writer = new OutputStreamWriter(os, getOutputEncoding());
        JsonMetadataList.toJson(metadataList, writer);
    } catch (Exception e) {
        //this is a stop the world kind of thing
        LOG.error("{}", getXMLifiedLogMsg(IO_OS + "json", fileResource.getResourceId(), e));
        throw new RuntimeException(e);
    } finally {
        if (writer != null) {
            flushAndClose(writer);
        } else {
            // The writer was never created (e.g. the encoding was rejected), so
            // nothing wraps the stream; close it here to avoid leaking it.
            IOUtils.closeQuietly(os);
        }
    }

    if (thrown instanceof Error) {
        throw (Error) thrown;
    }
    return thrown == null;
}
Use of org.apache.tika.parser.RecursiveParserWrapper in the Apache Tika project.
Class ParsingExample, method recursiveParserWrapperExample.
/**
 * Shows how to collect one metadata object per document when a file may hold
 * embedded documents: one entry for the container and one for each embedded
 * document. This gives direct access to both the extracted content and the
 * metadata of every embedded document.
 * <p>
 * Many formats can carry embedded documents. Besides the traditional container
 * formats (zip, tar and others), this includes common office formats such as
 * MSWord, MSExcel, MSPowerPoint, RTF, PDF, MSG and several others.
 * <p>
 * The ContentHandlerFactory decides the format of the "content", which is stored
 * under {@link org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT}.
 * <p>
 * Drawback: the RecursiveParserWrapper caches metadata and contents in memory,
 * so it should not be used on files whose contents are too big to be held
 * in memory.
 *
 * @return a list of metadata objects, one for the container file and one for each embedded file
 * @throws IOException   if reading or closing the document stream fails
 * @throws SAXException  if the parse raises a SAX error
 * @throws TikaException if the parse raises a Tika error
 */
public List<Metadata> recursiveParserWrapperExample() throws IOException, SAXException, TikaException {
    final String documentName = "test_recursive_embedded.docx";

    RecursiveParserWrapper recursive = new RecursiveParserWrapper(
            new AutoDetectParser(),
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1));

    // Seed the container metadata with the resource name before parsing.
    Metadata containerMetadata = new Metadata();
    containerMetadata.set(Metadata.RESOURCE_NAME_KEY, documentName);

    try (InputStream in = ParsingExample.class.getResourceAsStream(documentName)) {
        recursive.parse(in, new DefaultHandler(), containerMetadata, new ParseContext());
    }
    return recursive.getMetadata();
}
Use of org.apache.tika.parser.RecursiveParserWrapper in the Apache Tika project.
Class PDFParserTest, method testEmbeddedFilesInChildren.
// TIKA-1228, TIKA-1268
// Verifies that content from a child attachment shows up in the container's XML,
// and that the RecursiveParserWrapper reports each embedded file with the
// expected resource name and content type.
@Test
public void testEmbeddedFilesInChildren() throws Exception {
    //"regressiveness" exists only in Unit10.doc not in the container pdf document
    String containerXml = getXML("/testPDF_childAttachments.pdf").xml;
    assertTrue(containerXml.contains("regressiveness"));

    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(
            new AutoDetectParser(),
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));

    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setExtractUniqueInlineImagesOnly(false);

    ParseContext parseContext = new ParseContext();
    parseContext.set(org.apache.tika.parser.pdf.PDFParserConfig.class, pdfConfig);
    parseContext.set(org.apache.tika.parser.Parser.class, wrapper);

    try (TikaInputStream in = TikaInputStream.get(getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) {
        wrapper.parse(in, new BodyContentHandler(-1), new Metadata(), parseContext);
    }

    List<Metadata> collected = wrapper.getMetadata();
    assertEquals(5, collected.size());

    Metadata container = collected.get(0);
    Metadata jpeg = collected.get(1);
    Metadata tiff = collected.get(2);
    Metadata jobOptions = collected.get(3);
    Metadata wordDoc = collected.get(4);

    // resource names
    assertNull(container.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("image0.jpg", jpeg.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("Press Quality(1).joboptions", jobOptions.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("Unit10.doc", wordDoc.get(Metadata.RESOURCE_NAME_KEY));

    // content types
    assertEquals(MediaType.image("jpeg").toString(), jpeg.get(Metadata.CONTENT_TYPE));
    assertEquals(MediaType.image("tiff").toString(), tiff.get(Metadata.CONTENT_TYPE));
    assertEquals("text/plain; charset=ISO-8859-1", jobOptions.get(Metadata.CONTENT_TYPE));
    assertEquals(TYPE_DOC.toString(), wordDoc.get(Metadata.CONTENT_TYPE));
}
Aggregations