Search in sources :

Example 26 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class PDFParser method parse.

/**
 * Parses a PDF document: records its encryption state and content type in
 * {@code metadata}, extracts the document metadata, runs the configured access
 * check and, when a handler is supplied, emits the document content through it.
 *
 * <p>The document is opened in a try-with-resources statement. If closing the
 * document fails while another exception is already propagating, the close
 * failure is attached as a suppressed exception instead of replacing the
 * original one (which is what a {@code close()} call inside {@code finally}
 * would do).
 *
 * @param stream   source of the PDF; when it is a {@link TikaInputStream} that
 *                 already has a file, that file is handed to PDFBox directly,
 *                 otherwise the stream is read through a close shield
 * @param handler  receives the extracted content; when {@code null} only the
 *                 metadata steps are performed
 * @param metadata populated with the encryption flag, content type and the
 *                 extracted document metadata
 * @param context  may carry a {@code PDFParserConfig} that overrides the
 *                 parser's default configuration
 * @throws IOException   if the document cannot be loaded, processed or closed
 * @throws SAXException  declared for the content-handling steps
 * @throws TikaException declared for the parsing steps; in particular an
 *                       {@code EncryptedDocumentException} is thrown when
 *                       PDFBox reports an invalid password
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
    try {
        // PDFBox can process entirely in memory, or can use a temp file
        //  for unpacked / processed resources
        // Decide which to do based on if we're reading from a file or not already
        //TODO: make this configurable via MemoryUsageSetting
        TikaInputStream tstream = TikaInputStream.cast(stream);
        String password = getPassword(metadata, context);
        boolean fileBacked = tstream != null && tstream.hasFile();
        // File based -- send the file directly to PDFBox; otherwise read the
        // caller's stream through a close shield.
        try (PDDocument pdfDocument = fileBacked
                ? PDDocument.load(tstream.getPath().toFile(), password)
                : PDDocument.load(new CloseShieldInputStream(stream), password)) {
            metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted()));
            metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
            extractMetadata(pdfDocument, metadata, context);
            AccessChecker checker = localConfig.getAccessChecker();
            checker.check(metadata);
            if (handler != null) {
                if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
                    handleXFAOnly(pdfDocument, handler, metadata, context);
                } else if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
                    metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
                    OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
                } else {
                    if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
                        metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
                    }
                    PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
                }
            }
        }
    } catch (InvalidPasswordException e) {
        // The document could not be opened with the supplied password: flag it
        // as encrypted before reporting the failure to the caller.
        metadata.set(PDF.IS_ENCRYPTED, "true");
        throw new EncryptedDocumentException(e);
    }
}
Also used : EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) TikaInputStream(org.apache.tika.io.TikaInputStream) InvalidPasswordException(org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException) COSString(org.apache.pdfbox.cos.COSString) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 27 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class TXTParser method parse.

/**
 * Reads plain text from {@code stream} with automatic character-encoding
 * detection and emits it to {@code handler} as a single XHTML {@code <p>}
 * element.
 *
 * <p>The content type written back to {@code metadata} is the incoming content
 * type when one is present and parseable, otherwise {@code text/plain}; in both
 * cases it is combined with the detected charset.
 *
 * @param stream   text source; it is read through a close shield
 * @param handler  receives the XHTML events
 * @param metadata read for an incoming content type; updated with the resolved
 *                 content type and the charset name
 * @param context  used to look up the encoding detector
 * @throws IOException   if reading the text fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException declared for encoding detection
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Automatically detect the character encoding
    try (AutoDetectReader detectingReader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        // Prefer an already-detected content type (it could be a subclass of
        // text/plain such as vcal); fall back to text/plain otherwise.
        final String declaredType = metadata.get(Metadata.CONTENT_TYPE);
        final MediaType parsedType = (declaredType == null) ? null : MediaType.parse(declaredType);
        final MediaType baseType = (parsedType == null) ? MediaType.TEXT_PLAIN : parsedType;

        final Charset detectedCharset = detectingReader.getCharset();
        metadata.set(Metadata.CONTENT_TYPE, new MediaType(baseType, detectedCharset).toString());
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detectedCharset.name());

        final XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
        out.startDocument();
        out.startElement("p");
        // Copy the decoded text to the handler in fixed-size chunks.
        final char[] chunk = new char[4096];
        for (int count = detectingReader.read(chunk); count != -1; count = detectingReader.read(chunk)) {
            out.characters(chunk, 0, count);
        }
        out.endElement("p");
        out.endDocument();
    }
}
Also used : AutoDetectReader(org.apache.tika.detect.AutoDetectReader) MediaType(org.apache.tika.mime.MediaType) Charset(java.nio.charset.Charset) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 28 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class AppleSingleFileParser method parse.

/**
 * Reads the entry table and field entries from {@code stream}, then, if a
 * content field is present, skips forward to its offset and hands the rest of
 * the stream to the embedded document extractor.
 *
 * @param stream   source positioned at the start of the data
 * @param handler  receives the XHTML document events
 * @param metadata metadata for the outer document
 * @param context  used to look up the embedded document extractor
 * @throws IOException   if reading or skipping in the stream fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException declared for the parsing helpers
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    final short entryCount = readThroughNumEntries(stream);
    final List<FieldInfo> fields = getSortedFieldInfoList(stream, entryCount);
    // Bytes consumed so far: 26 up to and including the entry count, plus 12
    // per entry (presumably the fixed header and per-entry descriptor sizes --
    // confirm against the helpers that do the reading).
    final long headerBytes = 26L + 12 * entryCount;
    final Metadata embeddedMetadata = new Metadata();
    final long consumed = processFieldEntries(stream, fields, embeddedMetadata, headerBytes);
    final FieldInfo content = getContentFieldInfo(fields);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    if (content != null) {
        // Advance from the current position to where the content field starts.
        IOUtils.skipFully(stream, content.offset - consumed);
        if (extractor.shouldParseEmbedded(embeddedMetadata)) {
            // TODO: we should probably add a readlimiting wrapper around this
            // stream to ensure that not more than contentFieldInfo.length bytes
            // are read
            extractor.parseEmbedded(new CloseShieldInputStream(stream), xhtml, embeddedMetadata, false);
        }
    }
    xhtml.endDocument();
}
Also used : EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) Metadata(org.apache.tika.metadata.Metadata) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 29 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project gradle by gradle.

the class TarTaskOutputPacker method unpack.

/**
 * Walks every entry of {@code tarInput}. The entry whose name equals
 * {@code METADATA_PATH} is passed to {@code readOriginAction}; every other
 * entry must match {@code PROPERTY_PATH} and name a registered output property,
 * and is unpacked for that property.
 *
 * @param propertySpecs    the output properties entries may refer to, indexed
 *                         here by property name
 * @param tarInput         the archive to read
 * @param readOriginAction reads the origin metadata from the metadata entry
 * @return the origin metadata, the number of entries seen and the collected
 *         file snapshots per property
 * @throws IOException           if reading the archive fails
 * @throws IllegalStateException if an entry path is not recognised, refers to
 *                               an unregistered property, or no origin
 *                               metadata entry was found
 */
private UnpackResult unpack(SortedSet<ResolvedTaskOutputFilePropertySpec> propertySpecs, TarArchiveInputStream tarInput, TaskOutputOriginReader readOriginAction) throws IOException {
    Function<TaskFilePropertySpec, String> byPropertyName = new Function<TaskFilePropertySpec, String>() {

        @Override
        public String apply(TaskFilePropertySpec spec) {
            return spec.getPropertyName();
        }
    };
    Map<String, ResolvedTaskOutputFilePropertySpec> specsByName = Maps.uniqueIndex(propertySpecs, byPropertyName);
    ImmutableListMultimap.Builder<String, FileSnapshot> snapshots = ImmutableListMultimap.builder();
    OriginTaskExecutionMetadata origin = null;
    long entryCount = 0;
    for (TarArchiveEntry entry = tarInput.getNextTarEntry(); entry != null; entry = tarInput.getNextTarEntry()) {
        entryCount++;
        String entryPath = entry.getName();
        if (entryPath.equals(METADATA_PATH)) {
            // Origin metadata entry: the reader gets a close-shielded view of
            // the archive stream.
            origin = readOriginAction.execute(new CloseShieldInputStream(tarInput));
            continue;
        }

        // Any other entry must be an output property entry.
        Matcher pathMatcher = PROPERTY_PATH.matcher(entryPath);
        if (!pathMatcher.matches()) {
            throw new IllegalStateException("Cached result format error, invalid contents: " + entryPath);
        }
        // Group 2 carries the (escaped) property name.
        String propertyName = unescape(pathMatcher.group(2));
        ResolvedTaskOutputFilePropertySpec spec = specsByName.get(propertyName);
        if (spec == null) {
            throw new IllegalStateException(String.format("No output property '%s' registered", propertyName));
        }
        // Group 1 being present marks the output as missing; group 3 is the
        // path below the property root.
        boolean outputMissing = pathMatcher.group(1) != null;
        String childPath = pathMatcher.group(3);
        unpackPropertyEntry(spec, tarInput, entry, childPath, outputMissing, snapshots);
    }
    if (origin == null) {
        throw new IllegalStateException("Cached result format error, no origin metadata was found.");
    }
    return new UnpackResult(origin, entryCount, snapshots.build());
}
Also used : Matcher(java.util.regex.Matcher) TaskFilePropertySpec(org.gradle.api.internal.tasks.TaskFilePropertySpec) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) DirectoryFileSnapshot(org.gradle.api.internal.changedetection.state.DirectoryFileSnapshot) FileSnapshot(org.gradle.api.internal.changedetection.state.FileSnapshot) RegularFileSnapshot(org.gradle.api.internal.changedetection.state.RegularFileSnapshot) OriginTaskExecutionMetadata(org.gradle.api.internal.tasks.OriginTaskExecutionMetadata) ImmutableListMultimap(com.google.common.collect.ImmutableListMultimap) ResolvedTaskOutputFilePropertySpec(org.gradle.api.internal.tasks.ResolvedTaskOutputFilePropertySpec) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Aggregations

CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)29 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)13 TikaException (org.apache.tika.exception.TikaException)12 OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler)8 InputStream (java.io.InputStream)7 TikaInputStream (org.apache.tika.io.TikaInputStream)7 AutoDetectReader (org.apache.tika.detect.AutoDetectReader)6 MediaType (org.apache.tika.mime.MediaType)6 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)5 Charset (java.nio.charset.Charset)4 TikaConfig (org.apache.tika.config.TikaConfig)4 SAXException (org.xml.sax.SAXException)4 BufferedInputStream (java.io.BufferedInputStream)3 EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor)3 Metadata (org.apache.tika.metadata.Metadata)3 TaggedContentHandler (org.apache.tika.sax.TaggedContentHandler)3 InputSource (org.xml.sax.InputSource)3 IOException (java.io.IOException)2 SAXParser (javax.xml.parsers.SAXParser)2 ZipArchiveEntry (org.apache.commons.compress.archivers.zip.ZipArchiveEntry)2