Use of ddf.catalog.transformer.common.tika.TikaMetadataExtractor in project ddf by codice.
From class TikaInputTransformer, method transform:
/**
 * Produces a metacard for the given content by running it through Tika.
 *
 * <p>The input is first spooled into a {@code TemporaryFileBackedOutputStream} so it can be read
 * more than once. Tika extraction is best-effort: if it fails with a {@code TikaException} or a
 * {@code RuntimeException}, the failure is logged at debug level and a metacard built from
 * {@code commonTikaMetacardType} is enriched and returned instead.
 *
 * @param input the content to transform; must not be {@code null}
 * @param id the identifier passed on to {@code MetacardCreator.createMetacard}
 * @return the resulting metacard
 * @throws IOException if reading the buffered copy of the input fails
 * @throws CatalogTransformerException if {@code input} is {@code null} or cannot be copied
 */
@Override
public Metacard transform(InputStream input, String id)
    throws IOException, CatalogTransformerException {
  LOGGER.debug("Transforming input stream using Tika.");

  if (input == null) {
    throw new CatalogTransformerException("Cannot transform null input.");
  }

  try (TemporaryFileBackedOutputStream buffered = new TemporaryFileBackedOutputStream()) {
    long byteCount;
    try {
      byteCount = IOUtils.copyLarge(input, buffered);
    } catch (IOException e) {
      throw new CatalogTransformerException("Could not copy bytes of content message.", e);
    }

    // Defaults used when Tika extraction does not succeed.
    Metacard result = new MetacardImpl(commonTikaMetacardType);
    String contentType = DataType.DATASET.name();

    TikaMetadataExtractor tika = null;
    try (InputStream copy = buffered.asByteSource().openStream()) {
      tika = new TikaMetadataExtractor(copy, previewMaxLength, metadataMaxLength);
    } catch (TikaException | RuntimeException t) {
      LOGGER.debug("Unable to extract tika metadata", t);
    }

    if (tika != null) {
      String xml = getMetadataXml(tika.getMetadataXml());

      // When the extractor reports that the metadata limit was reached, the message is kept as a
      // validation warning and the metadata text itself is blanked out.
      boolean limitReached = xml.equals(TikaMetadataExtractor.METADATA_LIMIT_REACHED_MSG);
      Attribute warning =
          limitReached
              ? new AttributeImpl(Validation.VALIDATION_WARNINGS, Collections.singletonList(xml))
              : null;
      if (limitReached) {
        xml = "";
      }

      String text = tika.getBodyText();
      Metadata tikaMetadata = tika.getMetadata();
      contentType = tikaMetadata.get(Metadata.CONTENT_TYPE);

      MetacardType mergedType = mergeAttributes(getMetacardType(contentType));
      result =
          MetacardCreator.createMetacard(
              tikaMetadata, id, xml, mergedType, useResourceTitleAsTitle);

      if (StringUtils.isNotBlank(text)) {
        result.setAttribute(new AttributeImpl(Extracted.EXTRACTED_TEXT, text));
        processContentMetadataExtractors(text, result);
      }

      if (StringUtils.isNotBlank(xml)) {
        processMetadataExtractors(xml, result);
      }

      if (warning != null) {
        result.setAttribute(warning);
      }
    }

    enrichMetacard(buffered, contentType, byteCount, result);

    LOGGER.debug("Finished transforming input stream using Tika.");
    return result;
  }
}
Use of ddf.catalog.transformer.common.tika.TikaMetadataExtractor in project ddf by codice.
From class VideoInputTransformer, method transform:
/**
 * Creates a metacard from the given input using a {@code TikaMetadataExtractor} and marks its
 * datatype as {@code DataType.MOVING_IMAGE}.
 *
 * @param input the content handed to the Tika extractor
 * @param id the identifier passed on to {@code MetacardCreator.createMetacard}
 * @return the created metacard
 * @throws IOException declared by the interface being overridden
 * @throws CatalogTransformerException wrapping any {@code TikaException} raised during extraction
 */
@Override
public Metacard transform(InputStream input, String id)
    throws IOException, CatalogTransformerException {
  try {
    TikaMetadataExtractor extractor = new TikaMetadataExtractor(input);
    Metadata tikaMetadata = extractor.getMetadata();
    String xml = extractor.getMetadataXml();

    Metacard videoMetacard = MetacardCreator.createMetacard(tikaMetadata, id, xml, metacardType);
    videoMetacard.setAttribute(
        new AttributeImpl(Core.DATATYPE, DataType.MOVING_IMAGE.toString()));
    return videoMetacard;
  } catch (TikaException e) {
    throw new CatalogTransformerException(e);
  }
}
Use of ddf.catalog.transformer.common.tika.TikaMetadataExtractor in project ddf by codice.
From class PdfInputTransformer, method transformWithExtractors:
/**
 * Transforms PDF content into a metacard, first attempting a plain-text extraction pass.
 *
 * <p>The input is copied into a {@code TemporaryFileBackedOutputStream} so that it can be opened
 * twice: once for text extraction and once to build the {@code PDDocument}. A failed text
 * extraction is logged as a warning and the transformation continues with {@code null} text.
 *
 * @param input the PDF content
 * @param id the identifier for the resulting metacard
 * @return the metacard from {@code transformPdf}, or the result of {@code initializeMetacard(id)}
 *     when an {@code InvalidPasswordException} is raised
 * @throws IOException if re-opening or reading the buffered copy fails
 * @throws CatalogTransformerException if the input cannot be copied
 */
private Metacard transformWithExtractors(InputStream input, String id)
    throws IOException, CatalogTransformerException {
  try (TemporaryFileBackedOutputStream buffer = new TemporaryFileBackedOutputStream()) {
    try {
      IOUtils.copy(input, buffer);
    } catch (IOException e) {
      throw new CatalogTransformerException("Could not copy bytes of content message.", e);
    }

    // First pass over the buffered bytes: best-effort text extraction.
    String extractedText = null;
    try (InputStream textPass = buffer.asByteSource().openStream()) {
      Parser autoParser = new AutoDetectParser();
      ContentHandler textHandler = new ToTextContentHandler();
      TikaMetadataExtractor extractor = new TikaMetadataExtractor(autoParser, textHandler);
      extractor.parseMetadata(textPass, new ParseContext());
      extractedText = textHandler.toString();
    } catch (CatalogTransformerException e) {
      LOGGER.warn("Cannot extract metadata from pdf", e);
    }

    // Second pass: load the PDF document itself and build the metacard from it.
    try (InputStream pdfPass = buffer.asByteSource().openStream();
        PDDocument document = pdDocumentGenerator.apply(pdfPass)) {
      return transformPdf(id, document, extractedText);
    } catch (InvalidPasswordException e) {
      LOGGER.debug("Cannot transform encrypted pdf", e);
      return initializeMetacard(id);
    }
  }
}
Use of ddf.catalog.transformer.common.tika.TikaMetadataExtractor in project ddf by codice.
From class PdfInputTransformer, method transformPdf:
/**
 * Builds a metacard for an already-loaded PDF document.
 *
 * <p>Encrypted documents are not processed; for those the result of
 * {@code initializeMetacard(id)} is returned. Otherwise body text and metadata XML are obtained
 * from a {@code TikaMetadataExtractor} over {@code contentInput}, and the metacard is then
 * populated with PDF metadata, an optional thumbnail and an optional location.
 *
 * @param id the identifier for the resulting metacard
 * @param pdfDocument the loaded PDF document
 * @param contentInput the content stream handed to the Tika extractor
 * @return the populated metacard
 * @throws IOException declared for the PDF helpers invoked here
 * @throws CatalogTransformerException wrapping a {@code TikaException} from extractor creation
 */
private Metacard transformPdf(String id, PDDocument pdfDocument, InputStream contentInput)
    throws IOException, CatalogTransformerException {
  if (pdfDocument.isEncrypted()) {
    LOGGER.debug("Cannot transform encrypted pdf");
    return initializeMetacard(id);
  }

  TikaMetadataExtractor extractor;
  try {
    extractor = new TikaMetadataExtractor(contentInput, previewMaxLength, metadataMaxLength);
  } catch (TikaException e) {
    throw new CatalogTransformerException(e);
  }

  String xml = extractor.getMetadataXml();

  // When the extractor reports that the metadata limit was reached, keep the message as a
  // validation warning and blank out the metadata text.
  boolean limitReached = xml.equals(TikaMetadataExtractor.METADATA_LIMIT_REACHED_MSG);
  Attribute warning =
      limitReached
          ? new AttributeImpl(Validation.VALIDATION_WARNINGS, Collections.singletonList(xml))
          : null;
  if (limitReached) {
    xml = "";
  }

  String text = extractor.getBodyText();
  MetacardImpl pdfMetacard = initializeMetacard(id, text, xml);
  if (warning != null) {
    pdfMetacard.setAttribute(warning);
  }

  extractPdfMetadata(pdfDocument, pdfMetacard);
  pdfThumbnailGenerator.apply(pdfDocument).ifPresent(pdfMetacard::setThumbnail);
  Optional.ofNullable(geoParser.apply(pdfDocument)).ifPresent(pdfMetacard::setLocation);

  return pdfMetacard;
}
Aggregations