Use of org.apache.commons.io.input.CloseShieldInputStream in the Apache Tika project.
Class PDFParser, method parse.
/**
 * Parses a PDF document with PDFBox, populating {@code metadata} and, when a handler is
 * supplied, emitting the document content to it.
 *
 * <p>The document is loaded from the backing file when {@code stream} is a
 * {@code TikaInputStream} that already has one; otherwise it is loaded from the stream itself,
 * wrapped in a {@code CloseShieldInputStream}. After loading, the encryption flag and content
 * type are recorded, metadata is extracted, and the configured access checker is run. Content
 * is then produced by one of three paths: XFA-only handling, OCR-only, or regular text
 * extraction (optionally combined with OCR).
 *
 * @param stream   the PDF input
 * @param handler  receives the extracted content; when {@code null} only metadata is processed
 * @param metadata read for the password lookup and updated with extracted values
 * @param context  supplies an optional {@code PDFParserConfig} override and the password lookup
 * @throws IOException   propagated from loading, processing, or closing the document
 * @throws SAXException  propagated from the content handler
 * @throws TikaException on parse failure; an {@code EncryptedDocumentException} is thrown when
 *                       PDFBox reports an invalid password
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    PDFParserConfig config = context.get(PDFParserConfig.class, defaultConfig);
    PDDocument document = null;
    try {
        // PDFBox can work fully in memory or spill unpacked/processed resources to a temp file.
        // Choose based on whether the input is already backed by a file.
        //TODO: make this configurable via MemoryUsageSetting
        TikaInputStream tikaStream = TikaInputStream.cast(stream);
        String password = getPassword(metadata, context);
        boolean fileBacked = tikaStream != null && tikaStream.hasFile();
        if (fileBacked) {
            // Hand the file straight to PDFBox.
            document = PDDocument.load(tikaStream.getPath().toFile(), password);
        } else {
            document = PDDocument.load(new CloseShieldInputStream(stream), password);
        }

        metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(document.isEncrypted()));
        metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
        extractMetadata(document, metadata, context);
        config.getAccessChecker().check(metadata);

        // Metadata-only request: nothing to emit.
        if (handler == null) {
            return;
        }
        if (shouldHandleXFAOnly(document, config)) {
            handleXFAOnly(document, handler, metadata, context);
            return;
        }
        if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
            metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
            OCR2XHTML.process(document, handler, context, metadata, config);
            return;
        }
        if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
            metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
        }
        PDF2XHTML.process(document, handler, context, metadata, config);
    } catch (InvalidPasswordException e) {
        // Loading failed on the password, so the document is necessarily encrypted.
        metadata.set(PDF.IS_ENCRYPTED, "true");
        throw new EncryptedDocumentException(e);
    } finally {
        if (document != null) {
            document.close();
        }
    }
}
Use of org.apache.commons.io.input.CloseShieldInputStream in the Apache Tika project.
Class TXTParser, method parse.
/**
 * Parses plain text: detects the character encoding, records the resulting content type and
 * encoding in {@code metadata}, and emits the whole text inside a single {@code <p>} element.
 *
 * <p>If {@code metadata} already carries a parseable content type, that type is kept as the
 * base type (it may be a subtype of text/plain) and only the detected charset is attached to
 * it; otherwise {@code text/plain} is used.
 *
 * @param stream   the text input; it is read through a {@code CloseShieldInputStream}
 * @param handler  receives the XHTML events
 * @param metadata read for an incoming content type; updated with content type and encoding
 * @param context  used to obtain the encoding detector
 * @throws IOException   propagated from reading the stream
 * @throws SAXException  propagated from the content handler
 * @throws TikaException propagated from encoding detection
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // The reader detects the character encoding automatically.
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        // Prefer an already-detected content type; it could be a subclass of text/plain
        // such as vcal, etc.
        MediaType baseType = MediaType.TEXT_PLAIN;
        String declaredMime = metadata.get(Metadata.CONTENT_TYPE);
        if (declaredMime != null) {
            MediaType parsedType = MediaType.parse(declaredMime);
            if (parsedType != null) {
                baseType = parsedType;
            }
        }

        Charset detectedCharset = reader.getCharset();
        metadata.set(Metadata.CONTENT_TYPE, new MediaType(baseType, detectedCharset).toString());
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detectedCharset.name());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        char[] chunk = new char[4096];
        for (int count = reader.read(chunk); count != -1; count = reader.read(chunk)) {
            xhtml.characters(chunk, 0, count);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
Use of org.apache.commons.io.input.CloseShieldInputStream in the Apache Tika project.
Class AppleSingleFileParser, method parse.
/**
 * Parses an AppleSingle container: reads the entry table, processes the field entries into
 * metadata for the embedded document, then skips forward to the content field (if any) and
 * hands the remaining stream to the embedded document extractor.
 *
 * <p>A running count of consumed bytes is kept so the distance to the content field can be
 * computed from its recorded offset. If that offset lies before the current position the file
 * is malformed; a {@link TikaException} is thrown instead of passing a negative count to
 * {@code IOUtils.skipFully}.
 *
 * @param stream   the AppleSingle input; passed to the extractor through a
 *                 {@code CloseShieldInputStream}
 * @param handler  receives the XHTML events
 * @param metadata metadata of the container document
 * @param context  used to look up the embedded document extractor
 * @throws IOException   propagated from reading or skipping the stream
 * @throws SAXException  propagated from the content handler
 * @throws TikaException if the content field offset precedes the bytes already consumed, or
 *                       propagated from the helpers
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    short numEntries = readThroughNumEntries(stream);
    // NOTE(review): 26 is presumably the fixed header size consumed by readThroughNumEntries,
    // and 12 the size of each entry descriptor — confirm against those helpers.
    long bytesRead = 26;
    List<FieldInfo> fieldInfoList = getSortedFieldInfoList(stream, numEntries);
    bytesRead += 12 * numEntries;
    Metadata embeddedMetadata = new Metadata();
    bytesRead = processFieldEntries(stream, fieldInfoList, embeddedMetadata, bytesRead);
    FieldInfo contentFieldInfo = getContentFieldInfo(fieldInfoList);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    if (contentFieldInfo != null) {
        long diff = contentFieldInfo.offset - bytesRead;
        if (diff < 0) {
            // A content offset behind the current position means the entry table is corrupt;
            // report it as a parse failure rather than letting skipFully reject the negative
            // count with an unchecked exception.
            throw new TikaException("AppleSingle content offset (" + contentFieldInfo.offset
                    + ") is before the current stream position (" + bytesRead + ")");
        }
        IOUtils.skipFully(stream, diff);
        if (ex.shouldParseEmbedded(embeddedMetadata)) {
            // TODO: we should probably add a readlimiting wrapper around this
            // stream to ensure that not more than contentFieldInfo.length bytes
            // are read
            ex.parseEmbedded(new CloseShieldInputStream(stream), xhtml, embeddedMetadata, false);
        }
    }
    xhtml.endDocument();
}
Use of org.apache.commons.io.input.CloseShieldInputStream in the Gradle project.
Class TarTaskOutputPacker, method unpack.
/**
 * Unpacks a cached task output archive entry by entry.
 *
 * <p>The entry whose name equals {@code METADATA_PATH} is passed to {@code readOriginAction}
 * to obtain the origin metadata. Every other entry name must match {@code PROPERTY_PATH} and
 * name a property present in {@code propertySpecs}; such entries are unpacked via
 * {@code unpackPropertyEntry}, which also receives the snapshot builder.
 *
 * @param propertySpecs    the output properties, indexed here by property name
 * @param tarInput         the archive to read; passed to the origin reader through a
 *                         {@code CloseShieldInputStream}
 * @param readOriginAction reads the origin metadata from the metadata entry
 * @return the origin metadata, the total number of entries seen, and the collected snapshots
 * @throws IOException           propagated from reading the archive
 * @throws IllegalStateException if an entry name is not recognised, refers to an unregistered
 *                               property, or the archive contains no origin metadata
 */
private UnpackResult unpack(SortedSet<ResolvedTaskOutputFilePropertySpec> propertySpecs, TarArchiveInputStream tarInput, TaskOutputOriginReader readOriginAction) throws IOException {
    Function<TaskFilePropertySpec, String> byPropertyName = new Function<TaskFilePropertySpec, String>() {
        @Override
        public String apply(TaskFilePropertySpec propertySpec) {
            return propertySpec.getPropertyName();
        }
    };
    Map<String, ResolvedTaskOutputFilePropertySpec> specsByName = Maps.uniqueIndex(propertySpecs, byPropertyName);

    ImmutableListMultimap.Builder<String, FileSnapshot> snapshots = ImmutableListMultimap.builder();
    OriginTaskExecutionMetadata origin = null;
    long entryCount = 0;

    for (TarArchiveEntry entry = tarInput.getNextTarEntry(); entry != null; entry = tarInput.getNextTarEntry()) {
        entryCount++;
        String entryPath = entry.getName();

        if (entryPath.equals(METADATA_PATH)) {
            // Origin metadata entry.
            origin = readOriginAction.execute(new CloseShieldInputStream(tarInput));
            continue;
        }

        // Anything else must be an output property entry.
        Matcher propertyMatch = PROPERTY_PATH.matcher(entryPath);
        if (!propertyMatch.matches()) {
            throw new IllegalStateException("Cached result format error, invalid contents: " + entryPath);
        }
        String propertyName = unescape(propertyMatch.group(2));
        ResolvedTaskOutputFilePropertySpec spec = specsByName.get(propertyName);
        if (spec == null) {
            throw new IllegalStateException(String.format("No output property '%s' registered", propertyName));
        }
        // The first capture group is present only for entries flagged as a missing output.
        boolean outputMissing = propertyMatch.group(1) != null;
        String childPath = propertyMatch.group(3);
        unpackPropertyEntry(spec, tarInput, entry, childPath, outputMissing, snapshots);
    }

    if (origin == null) {
        throw new IllegalStateException("Cached result format error, no origin metadata was found.");
    }
    return new UnpackResult(origin, entryCount, snapshots.build());
}
Aggregations