Example usage of org.apache.tika.io.TikaInputStream in the Apache Tika project,
taken from the method testTruncatedFiles of the class TestContainerAwareDetector.
@Test
public void testTruncatedFiles() throws Exception {
    // Truncated OOXML (zip) data: from the bytes alone, only the generic
    // container type can be recognised.
    Metadata metadata = new Metadata();
    try (TikaInputStream truncatedXlsx = getTruncatedFile("testEXCEL.xlsx", 300)) {
        assertEquals(MediaType.application("x-tika-ooxml"), detector.detect(truncatedXlsx, metadata));
    }

    // Same truncated data plus a filename hint: the name lets the detector
    // specialise to the exact spreadsheet type.
    metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
    try (TikaInputStream truncatedXlsx = getTruncatedFile("testEXCEL.xlsx", 300)) {
        assertEquals(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"), detector.detect(truncatedXlsx, metadata));
    }

    // Truncated OLE2 data with no name: only the generic OLE2 type.
    metadata = new Metadata();
    try (TikaInputStream truncatedXls = getTruncatedFile("testEXCEL.xls", 400)) {
        assertEquals(MediaType.application("x-tika-msoffice"), detector.detect(truncatedXls, metadata));
    }

    // Truncated OLE2 data with a filename hint: specialises to Excel.
    metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xls");
    try (TikaInputStream truncatedXls = getTruncatedFile("testEXCEL.xls", 400)) {
        assertEquals(MediaType.application("vnd.ms-excel"), detector.detect(truncatedXls, metadata));
    }
}
Example usage of org.apache.tika.io.TikaInputStream in the Apache Tika project,
taken from the method assertTypeByNameAndData of the class TestContainerAwareDetector.
/**
 * Asserts that both the plain mime-magic detector and the container-aware
 * detector identify the given test document as expected.
 *
 * @param dataFile         resource name under /test-documents/
 * @param name             optional filename hint added to the metadata, or null
 * @param typeFromDetector exact type the container-aware detector must report
 * @param typeFromMagic    type the magic-only detector must report, or null to skip
 */
private void assertTypeByNameAndData(String dataFile, String name, String typeFromDetector, String typeFromMagic) throws Exception {
    try (TikaInputStream stream = TikaInputStream.get(TestContainerAwareDetector.class.getResource("/test-documents/" + dataFile))) {
        Metadata metadata = new Metadata();
        if (name != null) {
            metadata.add(Metadata.RESOURCE_NAME_KEY, name);
        }
        // The magic-only check is expected to be coarser, so it is optional.
        if (typeFromMagic != null) {
            assertEquals(MediaType.parse(typeFromMagic), mimeTypes.detect(stream, metadata));
        }
        // The container-aware detector should report the exact type.
        assertEquals(MediaType.parse(typeFromDetector), detector.detect(stream, metadata));
    }
}
Example usage of org.apache.tika.io.TikaInputStream in the Apache Tika project,
taken from the method parse of the class StringsParser.
/**
 * Runs the external "strings" tool over the input and emits the extracted
 * strings as XHTML, recording the configuration used in the metadata.
 *
 * @param stream   document bytes to scan
 * @param handler  receives the XHTML output
 * @param metadata enriched with strings:* entries describing the run
 * @param context  may supply a StringsConfig and a FileConfig
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Pull configuration from the context, falling back to the defaults.
    StringsConfig stringsConfig = context.get(StringsConfig.class, DEFAULT_STRINGS_CONFIG);
    FileConfig fileConfig = context.get(FileConfig.class, DEFAULT_FILE_CONFIG);
    if (!hasStrings(stringsConfig)) {
        // The configured "strings" tool is unavailable; nothing to do.
        return;
    }
    // Spool the stream to a file so the external tool can read it.
    // NOTE(review): when 'stream' is not already a TikaInputStream this may
    // create a temporary file that is never explicitly cleaned up here —
    // consider a TemporaryResources; confirm against the callers.
    TikaInputStream tis = TikaInputStream.get(stream);
    File input = tis.getFile();

    // Record the configuration used for this run.
    metadata.set("strings:min-len", "" + stringsConfig.getMinLength());
    metadata.set("strings:encoding", stringsConfig.toString());
    metadata.set("strings:file_output", doFile(input, fileConfig));

    // Content: run the tool and stream its output as XHTML.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    // Declared at the point of use; the old '= 0' pre-initialisation was dead.
    int totalBytes = doStrings(input, stringsConfig, xhtml);
    xhtml.endDocument();

    // Record how much strings output was produced.
    metadata.set("strings:length", "" + totalBytes);
}
Example usage of org.apache.tika.io.TikaInputStream in the Apache Tika project,
taken from the method handleEmbeddedPOIFS of the class RTFObjDataParser.
/**
 * Extracts the payload bytes from an embedded POIFS (OLE2) container.
 * <p>
 * Handles three shapes: an OOXML package wrapped in OLE2 ("Package" entry),
 * an OLE10Native record, and a CompObj with a CONTENTS stream. Anything else
 * falls back to copying the raw input and labelling it from the detected type.
 *
 * @throws IOException if the input is not actually POIFS
 * @return the embedded bytes, or null when nothing could be extracted
 */
private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException {
    byte[] ret = null;
    try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {
        DirectoryNode root = fs.getRoot();
        if (root == null) {
            return ret;
        }
        if (root.hasEntry("Package")) {
            // OOXML wrapped in OLE2: the raw zip lives in the "Package" entry.
            Entry ooxml = root.getEntry("Package");
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            // Fix: close the entry stream after copying (it was leaked before).
            try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) {
                IOUtils.copy(stream, out);
            }
            ret = out.toByteArray();
        } else {
            //try poifs
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                    ret = ole.getDataBuffer();
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // Some producers use mixed case for the entry name.
                    contentsEntry = (DocumentEntry) root.getEntry("Contents");
                }
                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    ret = new byte[contentsEntry.getSize()];
                    inp.readFully(ret);
                }
            } else {
                // Unknown type: hand back the raw bytes and synthesise a name
                // from the detected type's extension.
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                is.reset();
                IOUtils.copy(is, out);
                ret = out.toByteArray();
                metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            }
        }
    }
    return ret;
}
Example usage of org.apache.tika.io.TikaInputStream in the Apache Tika project,
taken from the method parseUsingAutoDetect of the class MyFirstTika.
/**
 * Parses the given file with Tika's AutoDetectParser and returns the
 * extracted plain-text body.
 *
 * @param filename   path of the file to parse
 * @param tikaConfig configuration used to build the parser
 * @param metadata   populated with detected metadata as a side effect
 * @return the extracted text content
 */
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    // Fix: close the stream (and any temp resources it holds) when done —
    // it was previously leaked.
    try (TikaInputStream stream = TikaInputStream.get(new File(filename), metadata)) {
        parser.parse(stream, handler, metadata, new ParseContext());
        return handler.toString();
    }
}
Aggregations