Search in sources :

Example 1 with ToTextContentHandler

Use of org.apache.tika.sax.ToTextContentHandler in project DDF by Codice.

From the class TikaInputTransformer, method transform.

/**
 * Builds a {@link Metacard} from the supplied content by parsing it with Tika.
 *
 * <p>The input is first copied into a temporary file-backed buffer so that it can be read more
 * than once: one pass for the Tika parse and, when the detected content type starts with
 * {@code "image"}, a second pass for thumbnail creation. When content metadata extractors are
 * registered, a plain-text rendering of the document is captured alongside the XML rendering and
 * handed to each extractor.
 *
 * @param input content to transform; must not be {@code null}
 * @param id identifier passed through to metacard creation
 * @return the metacard created from the parsed content
 * @throws IOException propagated from reading the buffered copy or from the helpers called here
 * @throws CatalogTransformerException if {@code input} is {@code null} or its bytes cannot be
 *     copied into the temporary buffer
 */
@Override
public Metacard transform(InputStream input, String id) throws IOException, CatalogTransformerException {
    LOGGER.debug("Transforming input stream using Tika.");
    if (input == null) {
        throw new CatalogTransformerException("Cannot transform null input.");
    }

    try (TemporaryFileBackedOutputStream bufferedContent = new TemporaryFileBackedOutputStream()) {
        // Buffer the whole input so it can be re-opened for each consumer below.
        try {
            IOUtils.copy(input, bufferedContent);
        } catch (IOException e) {
            throw new CatalogTransformerException("Could not copy bytes of content message.", e);
        }

        Parser tikaParser = new AutoDetectParser();
        ToXMLContentHandler xmlHandler = new ToXMLContentHandler();

        // Plain text is only collected when at least one extractor will consume it.
        ToTextContentHandler textHandler =
            contentMetadataExtractors.isEmpty() ? null : new ToTextContentHandler();
        ContentHandler parseTarget =
            textHandler == null ? xmlHandler : new TeeContentHandler(xmlHandler, textHandler);

        TikaMetadataExtractor extractor = new TikaMetadataExtractor(tikaParser, parseTarget);
        Metadata metadata;
        try (InputStream parseStream = bufferedContent.asByteSource().openStream()) {
            metadata = extractor.parseMetadata(parseStream, new ParseContext());
        }

        String rawXml = xmlHandler.toString();
        String metadataText = templates == null ? rawXml : transformToXml(rawXml);

        String metacardContentType = metadata.get(Metadata.CONTENT_TYPE);
        MetacardType detectedType = getMetacardTypeFromMimeType(metacardContentType);
        MetacardType metacardType = detectedType != null ? detectedType : commonTikaMetacardType;

        Metacard result;
        if (textHandler == null) {
            result = MetacardCreator.createMetacard(metadata, id, metadataText, metacardType);
        } else {
            String plainText = textHandler.toString();

            // Extend the base type with every attribute the registered extractors may populate.
            Set<AttributeDescriptor> extractorAttributes =
                contentMetadataExtractors.values().stream()
                    .map(ContentMetadataExtractor::getMetacardAttributes)
                    .flatMap(Collection::stream)
                    .collect(Collectors.toSet());
            MetacardTypeImpl extendedType =
                new MetacardTypeImpl(metacardType.getName(), metacardType, extractorAttributes);

            result = MetacardCreator.createMetacard(metadata, id, metadataText, extendedType);
            for (ContentMetadataExtractor each : contentMetadataExtractors.values()) {
                each.process(plainText, result);
            }
        }

        if (StringUtils.isNotBlank(metacardContentType)) {
            result.setAttribute(new AttributeImpl(Core.DATATYPE, getDatatype(metacardContentType)));
        }

        if (StringUtils.startsWith(metacardContentType, "image")) {
            // Thumbnail creation needs its own fresh read of the buffered bytes.
            try (InputStream thumbnailStream = bufferedContent.asByteSource().openStream()) {
                createThumbnail(thumbnailStream, result);
            }
        }

        LOGGER.debug("Finished transforming input stream using Tika.");
        return result;
    }
}
Also used : ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) TemporaryFileBackedOutputStream(org.codice.ddf.platform.util.TemporaryFileBackedOutputStream) CloseShieldInputStream(org.apache.tika.io.CloseShieldInputStream) InputStream(java.io.InputStream) AttributeImpl(ddf.catalog.data.impl.AttributeImpl) Metadata(org.apache.tika.metadata.Metadata) AttributeDescriptor(ddf.catalog.data.AttributeDescriptor) CatalogTransformerException(ddf.catalog.transform.CatalogTransformerException) MetacardTypeImpl(ddf.catalog.data.impl.MetacardTypeImpl) IOException(java.io.IOException) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ToTextContentHandler(org.apache.tika.sax.ToTextContentHandler) ContentHandler(org.xml.sax.ContentHandler) MetacardType(ddf.catalog.data.MetacardType) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ToTextContentHandler(org.apache.tika.sax.ToTextContentHandler) TikaMetadataExtractor(ddf.catalog.transformer.common.tika.TikaMetadataExtractor) Metacard(ddf.catalog.data.Metacard) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ContentMetadataExtractor(ddf.catalog.content.operation.ContentMetadataExtractor) TeeContentHandler(org.apache.tika.sax.TeeContentHandler)

Example 2 with ToTextContentHandler

Use of org.apache.tika.sax.ToTextContentHandler in project DDF by Codice.

From the class PdfInputTransformer, method transformWithExtractors.

/**
 * Transforms PDF content into a {@link Metacard}, first attempting to pull plain text out of the
 * document with Tika.
 *
 * <p>The input is copied into a temporary file-backed buffer and read twice: once by Tika for
 * text extraction and once to load the {@code PDDocument}. A {@link CatalogTransformerException}
 * during text extraction is logged and the transform continues with {@code null} text. An
 * {@link InvalidPasswordException} while loading or transforming the document yields the metacard
 * returned by {@code initializeMetacard(id)} instead.
 *
 * @param input content to transform
 * @param id identifier passed through to metacard creation
 * @return the metacard produced from the PDF, or the initialized metacard for an encrypted PDF
 * @throws IOException propagated from reading the buffered copy or from the helpers called here
 * @throws CatalogTransformerException if the input bytes cannot be copied into the temporary
 *     buffer
 */
private Metacard transformWithExtractors(InputStream input, String id) throws IOException, CatalogTransformerException {
    try (TemporaryFileBackedOutputStream bufferedContent = new TemporaryFileBackedOutputStream()) {
        // Buffer the whole input so it can be opened once per consumer below.
        try {
            IOUtils.copy(input, bufferedContent);
        } catch (IOException e) {
            throw new CatalogTransformerException("Could not copy bytes of content message.", e);
        }

        // Stays null when text extraction fails; transformPdf receives it as-is.
        String extractedText = null;
        try (InputStream textStream = bufferedContent.asByteSource().openStream()) {
            Parser tikaParser = new AutoDetectParser();
            ToTextContentHandler textHandler = new ToTextContentHandler();
            TikaMetadataExtractor extractor = new TikaMetadataExtractor(tikaParser, textHandler);
            extractor.parseMetadata(textStream, new ParseContext());
            extractedText = textHandler.toString();
        } catch (CatalogTransformerException e) {
            LOGGER.warn("Cannot extract metadata from pdf", e);
        }

        try (InputStream pdfStream = bufferedContent.asByteSource().openStream();
            PDDocument document = pdDocumentGenerator.apply(pdfStream)) {
            return transformPdf(id, document, extractedText);
        } catch (InvalidPasswordException e) {
            LOGGER.debug("Cannot transform encrypted pdf", e);
            return initializeMetacard(id);
        }
    }
}
Also used : TemporaryFileBackedOutputStream(org.codice.ddf.platform.util.TemporaryFileBackedOutputStream) InputStream(java.io.InputStream) CatalogTransformerException(ddf.catalog.transform.CatalogTransformerException) IOException(java.io.IOException) ContentHandler(org.xml.sax.ContentHandler) ToTextContentHandler(org.apache.tika.sax.ToTextContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ToTextContentHandler(org.apache.tika.sax.ToTextContentHandler) TikaMetadataExtractor(ddf.catalog.transformer.common.tika.TikaMetadataExtractor) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) InvalidPasswordException(org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException)

Aggregations

CatalogTransformerException (ddf.catalog.transform.CatalogTransformerException)2 TikaMetadataExtractor (ddf.catalog.transformer.common.tika.TikaMetadataExtractor)2 IOException (java.io.IOException)2 InputStream (java.io.InputStream)2 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)2 ParseContext (org.apache.tika.parser.ParseContext)2 Parser (org.apache.tika.parser.Parser)2 ToTextContentHandler (org.apache.tika.sax.ToTextContentHandler)2 TemporaryFileBackedOutputStream (org.codice.ddf.platform.util.TemporaryFileBackedOutputStream)2 ContentHandler (org.xml.sax.ContentHandler)2 ContentMetadataExtractor (ddf.catalog.content.operation.ContentMetadataExtractor)1 AttributeDescriptor (ddf.catalog.data.AttributeDescriptor)1 Metacard (ddf.catalog.data.Metacard)1 MetacardType (ddf.catalog.data.MetacardType)1 AttributeImpl (ddf.catalog.data.impl.AttributeImpl)1 MetacardTypeImpl (ddf.catalog.data.impl.MetacardTypeImpl)1 PDDocument (org.apache.pdfbox.pdmodel.PDDocument)1 InvalidPasswordException (org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException)1 CloseShieldInputStream (org.apache.tika.io.CloseShieldInputStream)1 Metadata (org.apache.tika.metadata.Metadata)1