Use of org.apache.tika.io.CloseShieldInputStream in the Apache Tika project.
Class NetworkParser, method parse.
/**
 * Hands the given document stream to a {@code ParsingTask} that talks to the
 * remote endpoint identified by the {@code uri} field.
 *
 * <p>Two transports are handled: a raw socket when the URI scheme is
 * {@code telnet}, and a {@link URLConnection} opened for output otherwise.
 *
 * @param stream   the document stream passed to the {@code ParsingTask} constructor
 * @param handler  receives the parse events
 * @param metadata document metadata forwarded to the task
 * @param context  parse context forwarded to the task
 * @throws IOException   if the connection or any stream operation fails
 * @throws SAXException  if raised while parsing
 * @throws TikaException if raised while parsing
 */
private void parse(TikaInputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final boolean overTelnet = "telnet".equals(uri.getScheme());

    if (!overTelnet) {
        URLConnection conn = uri.toURL().openConnection();
        conn.setDoOutput(true);
        conn.connect();
        // The response stream is owned by this try block; the task only sees a
        // close-shielded view of it (presumably so it cannot close it early).
        try (InputStream response = conn.getInputStream()) {
            ParsingTask task = new ParsingTask(stream, conn.getOutputStream());
            task.parse(new CloseShieldInputStream(response), handler, metadata, context);
        }
        return;
    }

    try (Socket socket = new Socket(uri.getHost(), uri.getPort())) {
        // close() on this wrapper only shuts down the socket's output side
        // instead of closing the wrapped stream; the socket itself is closed
        // by the surrounding try-with-resources.
        FilterOutputStream requestSink = new FilterOutputStream(socket.getOutputStream()) {
            @Override
            public void close() throws IOException {
                socket.shutdownOutput();
            }
        };
        ParsingTask task = new ParsingTask(stream, requestSink);
        task.parse(socket.getInputStream(), handler, metadata, context);
    }
}
Use of org.apache.tika.io.CloseShieldInputStream in the Apache Tika project.
Class ParsingEmbeddedDocumentExtractor, method parseEmbedded.
/**
 * Parses an embedded document with the delegating parser and forwards its
 * content to {@code handler}.
 *
 * <p>When {@code outputHtml} is set, the output is wrapped in a
 * {@code <div class="package-entry">} element, and a non-empty resource name
 * from the metadata is emitted as an {@code <h1>} heading first.
 *
 * <p>A {@code TikaException} (including {@code EncryptedDocumentException})
 * raised by the delegate parser is swallowed, so the entry's content is
 * skipped rather than failing the caller.
 *
 * @param stream     the embedded document; it is wrapped in a
 *                   {@code CloseShieldInputStream} before being handed on
 * @param handler    receives the generated SAX events
 * @param metadata   metadata of the embedded document
 * @param outputHtml whether to emit the wrapping markup
 * @throws SAXException if the handler or the delegate parser raises it
 * @throws IOException  if reading the stream fails
 */
public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
    if (outputHtml) {
        AttributesImpl wrapperAttrs = new AttributesImpl();
        wrapperAttrs.addAttribute("", "class", "class", "CDATA", "package-entry");
        handler.startElement(XHTML, "div", "div", wrapperAttrs);
    }

    String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    boolean hasName = resourceName != null && resourceName.length() > 0;
    if (outputHtml && hasName) {
        handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
        char[] title = resourceName.toCharArray();
        handler.characters(title, 0, title.length);
        handler.endElement(XHTML, "h1", "h1");
    }

    // Hand the entry to the delegate parser; temporary resources are released
    // when the try block exits.
    try (TemporaryResources tmp = new TemporaryResources()) {
        final TikaInputStream shielded = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);

        // Carry over an already-open container from the caller's stream, if any.
        final Object openContainer = (stream instanceof TikaInputStream)
                ? ((TikaInputStream) stream).getOpenContainer()
                : null;
        if (openContainer != null) {
            shielded.setOpenContainer(openContainer);
        }

        ContentHandler embeddedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));
        DELEGATING_PARSER.parse(shielded, embeddedHandler, metadata, context);
    } catch (EncryptedDocumentException ede) {
        // TODO: log a warning that the password is missing.
        // Until then the encrypted content is simply skipped.
    } catch (TikaException e) {
        // TODO: log a warning.
        // The entry could not be parsed, so its content is skipped.
    }

    if (outputHtml) {
        handler.endElement(XHTML, "div", "div");
    }
}
Use of org.apache.tika.io.CloseShieldInputStream in the Apache Tika project.
Class HSLFExtractor, method handleSlideEmbeddedResources.
/**
 * Walks the shapes of a slide and processes the object data of every
 * {@code OLEShape} as an embedded document.
 *
 * <p>For each OLE shape with data, an empty
 * {@code <div class="embedded" id="objectId"/>} marker is written so consumers
 * can see where the embedded document occurred in the main text. The data is
 * then passed either to {@code handleEmbeddedOfficeDoc} (for the comp_obj
 * media type) or to {@code handleEmbeddedResource}.
 *
 * <p>Failures while obtaining shapes, object data or the data stream, and
 * {@code IOException}s while processing a stream, are recorded against
 * {@code parentMetadata} instead of being propagated.
 *
 * @param slide the slide whose shapes are inspected
 * @param xhtml the handler receiving the marker elements and embedded content
 * @throws TikaException if raised while handling an embedded document
 * @throws SAXException  if the handler raises it
 * @throws IOException   if raised outside the per-stream processing block
 */
private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException {
    final String compObjType = "application/x-tika-msoffice-embedded; format=comp_obj";

    List<HSLFShape> slideShapes;
    try {
        slideShapes = slide.getShapes();
    } catch (NullPointerException e) {
        // HSLF occasionally fails here; such cases are worth reporting to POI.
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
        return;
    }

    for (HSLFShape candidate : slideShapes) {
        if (!(candidate instanceof OLEShape)) {
            continue;
        }
        OLEShape ole = (OLEShape) candidate;

        HSLFObjectData objectData;
        try {
            objectData = ole.getObjectData();
        } catch (NullPointerException e) {
            // getObjectData is known to throw an NPE now and then.
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
            continue;
        }
        if (objectData == null) {
            continue;
        }

        String objectId = Integer.toString(ole.getObjectID());

        // Marker element: <div class="embedded" id="X"/> shows where in the
        // main text this embedded document occurred.
        AttributesImpl markerAttrs = new AttributesImpl();
        markerAttrs.addAttribute("", "class", "class", "CDATA", "embedded");
        markerAttrs.addAttribute("", "id", "id", "CDATA", objectId);
        xhtml.startElement("div", markerAttrs);
        xhtml.endElement("div");

        InputStream rawData;
        try {
            rawData = objectData.getData();
        } catch (Exception e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
            continue;
        }

        try (TikaInputStream tis = TikaInputStream.get(rawData)) {
            // Excel charts are identified by ProgID; everything else goes
            // through the configured detector.
            String mediaType = "Excel.Chart.8".equals(ole.getProgID())
                    ? "application/vnd.ms-excel"
                    : getTikaConfig().getDetector().detect(tis, new Metadata()).toString();

            if (!mediaType.equals(compObjType)) {
                handleEmbeddedResource(tis, objectId, objectId, mediaType, xhtml, false);
            } else {
                // The file system is given a close-shielded view of the stream;
                // the outer try-with-resources closes the stream itself.
                try (NPOIFSFileSystem fileSystem = new NPOIFSFileSystem(new CloseShieldInputStream(tis))) {
                    handleEmbeddedOfficeDoc(fileSystem.getRoot(), objectId, xhtml);
                }
            }
        } catch (IOException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
        }
    }
}
Use of org.apache.tika.io.CloseShieldInputStream in the Apache Tika project.
Class XmlRootExtractor, method extractRootElement.
/**
 * Runs a namespace-aware, non-validating SAX parse over the stream and
 * returns the root element recorded by the {@code ExtractorHandler}.
 *
 * <p>The stream is wrapped in a {@code CloseShieldInputStream} before it is
 * given to the parser. Any exception raised while configuring the parser or
 * parsing is swallowed; the method then returns whatever
 * {@code rootElement} the handler holds at that point.
 *
 * @param stream the XML input to inspect
 * @return the handler's {@code rootElement} value
 * @since Apache Tika 0.9
 */
public QName extractRootElement(InputStream stream) {
    ExtractorHandler rootCapture = new ExtractorHandler();
    try {
        SAXParserFactory saxFactory = SAXParserFactory.newInstance();
        saxFactory.setNamespaceAware(true);
        saxFactory.setValidating(false);
        try {
            saxFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        } catch (SAXNotRecognizedException e) {
            // TIKA-271 and TIKA-1000: not every XML parser supports the
            // secure-processing feature, although JAXP in Java 5 requires it.
            // Ignoring the failure is acceptable: deployments lacking the
            // feature are inherently exposed to XML denial-of-service attacks.
        }
        saxFactory.newSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(rootCapture));
    } catch (Exception ignore) {
    }
    return rootCapture.rootElement;
}
Use of org.apache.tika.io.CloseShieldInputStream in the Jasig uPortal project.
Class JaxbPortalDataHandlerService, method importDataArchive.
/**
 * Extracts the archive resource into a temporary directory and then runs the
 * batch-import process on that directory.
 *
 * <p>Only non-directory entries are written; parent directories are created as
 * needed. An entry whose name resolves outside the temporary directory (for
 * example via {@code ../} segments) is rejected, so the archive cannot write
 * to arbitrary locations. The temporary directory is always deleted afterwards.
 *
 * @param resource       the archive's origin; used only in the error message
 * @param resourceStream the archive stream to read entries from; shielded
 *                       against being closed while each entry is copied
 * @param options        options forwarded to {@code importDataDirectory}
 * @throws RuntimeException wrapping the {@link IOException} if extraction
 *                          fails or an entry escapes the temporary directory
 */
private void importDataArchive(final Resource resource, final ArchiveInputStream resourceStream, BatchImportOptions options) {
    final File tempDir = Files.createTempDir();
    try {
        // Canonical prefix that every extracted file must live under.
        final String tempDirPrefix = tempDir.getCanonicalPath() + File.separator;

        ArchiveEntry archiveEntry;
        while ((archiveEntry = resourceStream.getNextEntry()) != null) {
            if (archiveEntry.isDirectory()) {
                continue;
            }

            final File entryFile = new File(tempDir, archiveEntry.getName());
            // Entry names come from the archive and may contain "../" segments.
            if (!entryFile.getCanonicalPath().startsWith(tempDirPrefix)) {
                throw new IOException("Archive entry '" + archiveEntry.getName() + "' resolves outside of the extraction directory '" + tempDir + "'");
            }

            entryFile.getParentFile().mkdirs();

            // Close each entry's output stream explicitly so file handles are
            // not leaked and the temp directory can be deleted afterwards.
            final FileOutputStream entryOut = new FileOutputStream(entryFile);
            try {
                IOUtils.copy(new CloseShieldInputStream(resourceStream), entryOut);
            } finally {
                entryOut.close();
            }
        }
        importDataDirectory(tempDir, null, options);
    } catch (IOException e) {
        throw new RuntimeException("Failed to extract data from '" + resource + "' to '" + tempDir + "' for batch import.", e);
    } finally {
        FileUtils.deleteQuietly(tempDir);
    }
}
Aggregations