use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class NetworkParser method parse.
/**
 * Hands the document in {@code stream} to a {@code ParsingTask} that talks to the
 * remote endpoint named by {@code uri}, and parses what the endpoint sends back.
 *
 * <p>For the {@code telnet} scheme a raw socket to the URI's host and port is used;
 * every other scheme goes through a {@link URLConnection} opened for output.
 *
 * @param stream   the document handed to the {@code ParsingTask} together with the outgoing stream
 * @param handler  receives the events produced while parsing the reply
 * @param metadata metadata passed through to the parsing task
 * @param context  parse context passed through to the parsing task
 * @throws IOException   if the connection or either of its streams fails
 * @throws SAXException  if thrown while parsing the reply
 * @throws TikaException if thrown while parsing the reply
 */
private void parse(TikaInputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    if (!"telnet".equals(uri.getScheme())) {
        // Generic case: let the URL machinery pick the protocol handler.
        URLConnection conn = uri.toURL().openConnection();
        conn.setDoOutput(true);
        conn.connect();
        try (InputStream response = conn.getInputStream()) {
            ParsingTask task = new ParsingTask(stream, conn.getOutputStream());
            // The reply is wrapped in a CloseShieldInputStream (presumably so the
            // parsing task cannot close it); try-with-resources closes it here.
            task.parse(new CloseShieldInputStream(response), handler, metadata, context);
        }
        return;
    }

    try (Socket telnet = new Socket(uri.getHost(), uri.getPort())) {
        // Closing the outgoing stream only shuts down the output half of the
        // socket, so the reply can still be read from the input half.
        FilterOutputStream halfClosing = new FilterOutputStream(telnet.getOutputStream()) {
            @Override
            public void close() throws IOException {
                telnet.shutdownOutput();
            }
        };
        new ParsingTask(stream, halfClosing).parse(telnet.getInputStream(), handler, metadata, context);
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class AutoDetectParser method parse.
/**
 * Detects the media type of the given document, records it in the metadata
 * under {@code Metadata.CONTENT_TYPE}, and then delegates to {@code super.parse}.
 *
 * <p>When a handler is supplied it is wrapped in a {@code SecureContentHandler}
 * (TIKA-216, zip bomb prevention). If the context carries no
 * {@code EmbeddedDocumentExtractor}, a {@code ParsingEmbeddedDocumentExtractor}
 * is registered, and this parser is registered as the context {@code Parser}
 * unless one is already set.
 *
 * @param stream   the document to parse
 * @param handler  receives the parse events; may be {@code null}, in which case
 *                 no secure wrapper is created and {@code null} is passed on
 * @param metadata updated with the detected content type
 * @param context  parse context; may be modified as described above
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if parsing fails with a SAX error
 * @throws TikaException if parsing fails, or if disposing the temporary resources fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        // Automatically detect the MIME type of the document
        MediaType type = detector.detect(tis, metadata);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        // TIKA-216: Zip bomb prevention
        SecureContentHandler sch = handler != null ? new SecureContentHandler(handler, tis) : null;
        // Register a default embedded document extractor (and this parser, if
        // no parser is set) when the caller hasn't specified one.
        if (context.get(EmbeddedDocumentExtractor.class) == null) {
            Parser p = context.get(Parser.class);
            if (p == null) {
                context.set(Parser.class, this);
            }
            context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context));
        }
        try {
            // Parse the document
            super.parse(tis, sch, metadata, context);
        } catch (SAXException e) {
            // Give the secure handler a chance to rethrow the exception in its
            // own form (zip bomb conversion). sch is null when no handler was
            // supplied; guard it so the original SAXException is rethrown
            // instead of being masked by a NullPointerException.
            if (sch != null) {
                sch.throwIfCauseOf(e);
            }
            throw e;
        }
    } finally {
        tmp.dispose();
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class DigestingParser method parse.
/**
 * Runs the configured digester (if any) over the document and then delegates
 * the actual parsing to {@code super.parse}.
 *
 * <p>The incoming stream is wrapped via {@code TikaInputStream.get} with a
 * {@code TemporaryResources} instance, which is disposed once parsing has
 * finished or failed.
 *
 * @param stream   the document to digest and parse
 * @param handler  passed through to the wrapped parse call
 * @param metadata passed to the digester and to the wrapped parse call
 * @param context  passed to the digester and to the wrapped parse call
 * @throws IOException   if reading the document fails
 * @throws SAXException  if thrown by the wrapped parse call
 * @throws TikaException if digesting, parsing, or disposing the temporary resources fails
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final TemporaryResources resources = new TemporaryResources();
    final TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
    try {
        // Digesting is optional: skipped when no digester is configured.
        if (null != digester) {
            digester.digest(tikaStream, metadata, context);
        }
        super.parse(tikaStream, handler, metadata, context);
    } finally {
        resources.dispose();
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class ZipContainerDetector method detect.
/**
 * Detects the media type of a container document by peeking at its first bytes.
 *
 * <p>Up to 1024 bytes are peeked and handed to {@code detectArchiveFormat}.
 * If the result is a zip archive and the caller passed in a
 * {@code TikaInputStream}, {@code detectZipFormat} decides the type. Otherwise
 * the archive type is returned unless it is {@code MediaType.OCTET_STREAM},
 * in which case {@code detectCompressorFormat} is consulted on the same prefix.
 *
 * @param input    the document stream, or {@code null}
 * @param metadata not used by this method
 * @return the detected type; {@code MediaType.OCTET_STREAM} when {@code input} is {@code null}
 * @throws IOException if the stream cannot be read
 */
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
    // Without a document there is nothing to inspect.
    if (input == null) {
        return MediaType.OCTET_STREAM;
    }

    TemporaryResources resources = new TemporaryResources();
    try {
        TikaInputStream tikaStream = TikaInputStream.get(input, resources);

        // 1024 bytes is enough for all known formats
        byte[] header = new byte[1024];
        int headerLength = tikaStream.peek(header);

        MediaType archiveType = detectArchiveFormat(header, headerLength);

        // The zip-specific detection only runs when the caller supplied a
        // TikaInputStream to begin with.
        boolean zipInTikaStream = PackageParser.isZipArchive(archiveType)
                && TikaInputStream.isTikaInputStream(input);
        if (zipInTikaStream) {
            return detectZipFormat(tikaStream);
        }
        if (archiveType.equals(MediaType.OCTET_STREAM)) {
            // Not a recognised archive: fall back to compressor detection.
            return detectCompressorFormat(header, headerLength);
        }
        return archiveType;
    } finally {
        try {
            resources.dispose();
        } catch (TikaException ignored) {
            // A failure to dispose the temporary resources is deliberately
            // not propagated; the detection result (or the original
            // exception) takes precedence.
        }
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class TikaDetectorConfigTest method testPSTDetectionWithoutZipDetector.
/**
 * TIKA-1708 - If the Zip detector is disabled, either explicitly,
 * or via giving a list of detectors that it isn't part of, ensure
 * that detection of PST files still works.
 *
 * <p>Two configurations are checked: one that excludes the zip detector
 * from the defaults, and one that lists its detectors explicitly. The same
 * stream is then passed to both detectors in turn, and is closed when the
 * test finishes.
 */
@Test
public void testPSTDetectionWithoutZipDetector() throws Exception {
    // Check the one with an exclude
    TikaConfig configWX = getConfig("TIKA-1708-detector-default.xml");
    assertNotNull(configWX.getParser());
    assertNotNull(configWX.getDetector());
    CompositeDetector detectorWX = (CompositeDetector) configWX.getDetector();
    // Check it has the POIFS one, but not the zip one
    assertDetectors(detectorWX, true, false);

    // Check the one with an explicit list
    TikaConfig configCL = getConfig("TIKA-1708-detector-composite.xml");
    assertNotNull(configCL.getParser());
    assertNotNull(configCL.getDetector());
    CompositeDetector detectorCL = (CompositeDetector) configCL.getDetector();
    assertEquals(2, detectorCL.getDetectors().size());
    // Check it also has the POIFS one, but not the zip one
    assertDetectors(detectorCL, true, false);

    // Check that both configs have a media type registry with entries
    int typeCountWX = configWX.getMediaTypeRegistry().getTypes().size();
    int typeCountCL = configCL.getMediaTypeRegistry().getTypes().size();
    assertTrue("Not enough mime types: " + typeCountWX, typeCountWX > 100);
    assertTrue("Not enough mime types: " + typeCountCL, typeCountCL > 100);

    // Now check they detect PST files correctly. The stream is opened in a
    // try-with-resources so it is closed even when an assertion fails.
    try (TikaInputStream stream = TikaInputStream.get(getResourceAsFile("/test-documents/testPST.pst"))) {
        assertEquals(OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE, detectorWX.detect(stream, new Metadata()));
        assertEquals(OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE, detectorCL.detect(stream, new Metadata()));
    }
}
Aggregations