Use of org.apache.tika.sax.ToTextContentHandler in project ddf by codice.
From the class TikaInputTransformer, method transform:
/**
 * Builds a {@link Metacard} from the given stream by parsing it with Tika.
 *
 * <p>The stream is first copied into a {@link TemporaryFileBackedOutputStream} so that it can be
 * re-opened more than once: once for the Tika parse and, for content types starting with
 * {@code "image"}, once more for thumbnail creation. The XML rendering of the content is always
 * captured; a plain-text rendering is captured only when at least one content metadata extractor
 * is registered, and is then handed to every extractor.
 *
 * @param input the content to transform; must not be {@code null}
 * @param id the identifier passed through to {@code MetacardCreator.createMetacard}
 * @return the populated metacard
 * @throws IOException propagated from re-opening or closing the buffered copy of the input
 * @throws CatalogTransformerException if {@code input} is {@code null} or its bytes cannot be
 *     copied; may also be propagated from the helpers invoked here
 */
@Override
public Metacard transform(InputStream input, String id) throws IOException, CatalogTransformerException {
  LOGGER.debug("Transforming input stream using Tika.");

  if (input == null) {
    throw new CatalogTransformerException("Cannot transform null input.");
  }

  try (TemporaryFileBackedOutputStream bufferedInput = new TemporaryFileBackedOutputStream()) {
    // Buffer the whole stream up front so it can be read several times below.
    try {
      IOUtils.copy(input, bufferedInput);
    } catch (IOException copyFailure) {
      throw new CatalogTransformerException("Could not copy bytes of content message.", copyFailure);
    }

    final Parser autoDetectParser = new AutoDetectParser();
    final ToXMLContentHandler xmlHandler = new ToXMLContentHandler();

    // Plain text is only needed when there are extractors to consume it.
    final ToTextContentHandler textHandler =
        contentMetadataExtractors.isEmpty() ? null : new ToTextContentHandler();
    final ContentHandler sink =
        (textHandler == null) ? xmlHandler : new TeeContentHandler(xmlHandler, textHandler);

    final TikaMetadataExtractor extractor = new TikaMetadataExtractor(autoDetectParser, sink);

    final Metadata metadata;
    try (InputStream parseStream = bufferedInput.asByteSource().openStream()) {
      metadata = extractor.parseMetadata(parseStream, new ParseContext());
    }

    // Run the XML through transformToXml only when templates are configured.
    final String rawXml = xmlHandler.toString();
    final String metadataText = (templates == null) ? rawXml : transformToXml(rawXml);

    // Fall back to the common Tika metacard type when the mime type maps to nothing.
    final String contentType = metadata.get(Metadata.CONTENT_TYPE);
    final MetacardType mappedType = getMetacardTypeFromMimeType(contentType);
    final MetacardType baseType = (mappedType != null) ? mappedType : commonTikaMetacardType;

    final Metacard metacard;
    if (textHandler == null) {
      metacard = MetacardCreator.createMetacard(metadata, id, metadataText, baseType);
    } else {
      final String plainText = textHandler.toString();

      // Extend the base type with every attribute the extractors declare.
      final Set<AttributeDescriptor> extractorAttributes =
          contentMetadataExtractors.values().stream()
              .map(ContentMetadataExtractor::getMetacardAttributes)
              .flatMap(Collection::stream)
              .collect(Collectors.toSet());
      final MetacardTypeImpl extendedType =
          new MetacardTypeImpl(baseType.getName(), baseType, extractorAttributes);

      metacard = MetacardCreator.createMetacard(metadata, id, metadataText, extendedType);

      for (ContentMetadataExtractor each : contentMetadataExtractors.values()) {
        each.process(plainText, metacard);
      }
    }

    if (StringUtils.isNotBlank(contentType)) {
      metacard.setAttribute(new AttributeImpl(Core.DATATYPE, getDatatype(contentType)));
    }

    if (StringUtils.startsWith(contentType, "image")) {
      // Thumbnail creation needs its own fresh read of the buffered bytes.
      try (InputStream thumbnailStream = bufferedInput.asByteSource().openStream()) {
        createThumbnail(thumbnailStream, metacard);
      }
    }

    LOGGER.debug("Finished transforming input stream using Tika.");
    return metacard;
  }
}
Use of org.apache.tika.sax.ToTextContentHandler in project ddf by codice.
From the class PdfInputTransformer, method transformWithExtractors:
/**
 * Transforms a PDF stream into a {@link Metacard}, first attempting to pull its plain text via
 * Tika so that it can be passed along to {@code transformPdf}.
 *
 * <p>The input is copied into a {@link TemporaryFileBackedOutputStream} and re-opened twice: once
 * for the Tika text pass and once to load the PDF document. A {@link CatalogTransformerException}
 * during the text pass is logged and the transform continues with {@code null} text. When the
 * document raises an {@link InvalidPasswordException}, the result of
 * {@code initializeMetacard(id)} is returned instead.
 *
 * @param input the PDF content
 * @param id the identifier handed to {@code transformPdf} / {@code initializeMetacard}
 * @return the resulting metacard
 * @throws IOException propagated from re-opening or closing the buffered copy, or from the PDF
 *     handling calls
 * @throws CatalogTransformerException if the input bytes cannot be copied
 */
private Metacard transformWithExtractors(InputStream input, String id) throws IOException, CatalogTransformerException {
  try (TemporaryFileBackedOutputStream buffer = new TemporaryFileBackedOutputStream()) {
    try {
      IOUtils.copy(input, buffer);
    } catch (IOException copyFailure) {
      throw new CatalogTransformerException("Could not copy bytes of content message.", copyFailure);
    }

    // Text extraction is best effort: on failure the text simply stays null.
    String extractedText = null;
    try (InputStream textSource = buffer.asByteSource().openStream()) {
      final ContentHandler textHandler = new ToTextContentHandler();
      new TikaMetadataExtractor(new AutoDetectParser(), textHandler)
          .parseMetadata(textSource, new ParseContext());
      extractedText = textHandler.toString();
    } catch (CatalogTransformerException extractionFailure) {
      LOGGER.warn("Cannot extract metadata from pdf", extractionFailure);
    }

    try (InputStream pdfSource = buffer.asByteSource().openStream();
        PDDocument document = pdDocumentGenerator.apply(pdfSource)) {
      return transformPdf(id, document, extractedText);
    } catch (InvalidPasswordException encryptedPdf) {
      LOGGER.debug("Cannot transform encrypted pdf", encryptedPdf);
      return initializeMetacard(id);
    }
  }
}
Aggregations