Use of org.apache.tika.extractor.ParserContainerExtractor in project Xponents by OpenSextant.
Class: EmbeddedContentConverter; method: conversionImplementation.
/**
 * Convert embedded documents in the supported types to a folder of the embedded items.
 * Trivial embedded icons and other components will not be extracted.
 *
 * <p>The parent conversion is performed first. If the file extension is not supported
 * for embedded-object extraction, the parent result is returned untouched. Otherwise the
 * file is re-opened as a Tika stream, embedded objects are collected, and, when any raw
 * children were found, their rendered text is appended to the parent document's text
 * under an "==Embedded Objects==" separator.
 *
 * @param in  input stream handed to the parent conversion
 * @param doc file being converted; re-opened here for embedded-object extraction
 * @return the converted document, with embedded-object text appended when raw children exist
 * @throws IOException if the parent conversion fails, or wrapping any failure raised while
 *                     opening or parsing the stream ("Stream parsing problem")
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument compoundDoc = super.conversionImplementation(in, doc);
    String ext = FilenameUtils.getExtension(doc.getName());
    if (!isSupported(ext)) {
        // Not really compound by our standards here.
        return compoundDoc;
    }

    ParserContainerExtractor extractor = new ParserContainerExtractor();
    EmbeddedObjectExtractor objExtractor = new EmbeddedObjectExtractor(compoundDoc, true);

    // try-with-resources: the stream is closed only if it was actually opened, so a failure
    // in TikaInputStream.get() is reported as-is instead of being masked by a
    // NullPointerException from an unconditional close() in a finally block.
    try (TikaInputStream tikaStream = TikaInputStream.get(doc.toPath())) {
        extractor.extract(tikaStream, extractor, objExtractor);
        compoundDoc.is_converted = true;

        if (compoundDoc.hasRawChildren()) {
            // Create text buffer for this compound document here.
            // If raw children should be post-processed by some other means, that is up to caller.
            // This parent document at least contains a complete text representation of the
            // content in the original doc.
            StringBuilder completeText = new StringBuilder();
            completeText.append(compoundDoc.getText());
            completeText.append("\n==Embedded Objects==\n");
            completeText.append(renderText(compoundDoc.getRawChildren()));
            compoundDoc.setText(completeText.toString());
        }
        // With no raw children, the simple (parent) conversion result is returned as-is.
        return compoundDoc;
    } catch (Exception e) {
        throw new IOException("Stream parsing problem", e);
    }
}
Use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.
Class: FictionBookParserTest; method: testEmbedded.
/**
 * Checks that the container extractor supports the FictionBook test document
 * and reports two embedded resources for it.
 */
@Test
public void testEmbedded() throws Exception {
    // Both the raw resource stream and its TikaInputStream wrapper are managed here, so the
    // wrapper (and anything it may have buffered during extraction) is released even when
    // an assertion fails.
    try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2");
            TikaInputStream stream = TikaInputStream.get(input)) {
        ContainerExtractor extractor = new ParserContainerExtractor();
        assertEquals(true, extractor.isSupported(stream));
        // Process it
        TrackingHandler handler = new TrackingHandler();
        extractor.extract(stream, null, handler);
        assertEquals(2, handler.filenames.size());
    }
}
Use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.
Class: SQLite3ParserTest; method: testInputStreamReset.
// Verifies that reading the stream twice does not
// quadruple the number of attachments.
@Test
public void testInputStreamReset() throws Exception {
    // Expected embedded documents, 8 in total:
    //   4 Word files (two docs and two docxs)
    //   4 png files (the same image embedded in each of the doc and docx)
    Metadata meta = new Metadata();
    meta.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);

    InputStreamResettingHandler resettingHandler = new InputStreamResettingHandler();
    ParserContainerExtractor containerExtractor = new ParserContainerExtractor();

    // Resources are closed in reverse order: the Tika wrapper first, then the raw stream.
    try (InputStream raw = getResourceAsStream(TEST_FILE1);
            TikaInputStream wrapped = TikaInputStream.get(raw)) {
        containerExtractor.extract(wrapped, containerExtractor, resettingHandler);
        raw.reset();
    }

    assertEquals(8, resettingHandler.bytes.size());
}
Use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.
Class: POIContainerExtractionTest; method: testEmbeddedOfficeFilesXML.
/**
 * Processes EmbeddedDocument.docx and checks that exactly two embedded
 * files are reported, one of them the embedded Excel worksheet binary.
 */
@Test
public void testEmbeddedOfficeFilesXML() throws Exception {
    final ContainerExtractor containerExtractor = new ParserContainerExtractor();
    final TrackingHandler tracked = process("EmbeddedDocument.docx", containerExtractor, false);

    assertTrue(tracked.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
    assertEquals(2, tracked.filenames.size());
}
Use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.
Class: TNEFParserTest; method: testBodyAndAttachments.
/**
 * Verifies that the RTF message body and every attachment
 * are returned with the expected file names and media types.
 */
@Test
public void testBodyAndAttachments() throws Exception {
    ContainerExtractor containerExtractor = new ParserContainerExtractor();

    // Process it with recursing:
    // the result holds the message body RTF plus the attachments.
    TrackingHandler tracked = process(file, containerExtractor, true);

    // We know the filenames (and media types) for all of them, in order.
    String[] expectedNames = {
        "message.rtf", "quick.doc", "quick.html", "quick.pdf", "quick.txt", "quick.xml"
    };
    MediaType[] expectedTypes = {
        MediaType.application("rtf"),
        MediaType.application("msword"),
        MediaType.text("html"),
        MediaType.application("pdf"),
        MediaType.text("plain"),
        MediaType.application("xml")
    };

    assertEquals(6, tracked.filenames.size());
    assertEquals(6, tracked.mediaTypes.size());

    for (int i = 0; i < expectedNames.length; i++) {
        assertEquals(expectedNames[i], tracked.filenames.get(i));
        assertEquals(expectedTypes[i], tracked.mediaTypes.get(i));
    }
}
Aggregations