Use of ddf.catalog.content.operation.ContentMetadataExtractor in project ddf by codice.
In class TikaInputTransformer, method addContentMetadataExtractors:
/**
 * Resolves the {@link ContentMetadataExtractor} behind the given service reference and
 * stores it in {@code contentMetadataExtractors}, keyed by that reference.
 *
 * <p>Nothing is registered when the reference is {@code null}, when this transformer's
 * bundle or its bundle context is unavailable, or when the service can no longer be
 * obtained. {@code BundleContext.getService} is documented to return {@code null} for a
 * service that has been unregistered; storing such a {@code null} entry would make every
 * later iteration over the registered extractors fail with a NullPointerException.
 *
 * @param contentMetadataExtractorRef reference to the extractor service being bound; may be
 *     {@code null}, in which case the call is a no-op
 */
public void addContentMetadataExtractors(ServiceReference<ContentMetadataExtractor> contentMetadataExtractorRef) {
    if (contentMetadataExtractorRef == null) {
        return;
    }
    Bundle bundle = getBundle();
    // A bundle that is not started (or is stopping) has no valid context.
    if (bundle == null || bundle.getBundleContext() == null) {
        return;
    }
    ContentMetadataExtractor cme = bundle.getBundleContext().getService(contentMetadataExtractorRef);
    // Only track extractors that were actually resolved; never store a null value.
    if (cme != null) {
        contentMetadataExtractors.put(contentMetadataExtractorRef, cme);
    }
}
Use of ddf.catalog.content.operation.ContentMetadataExtractor in project ddf by codice.
In class TikaInputTransformer, method transform:
/**
 * Builds a {@link Metacard} from arbitrary content by running it through Tika.
 *
 * <p>The input is first copied into a {@code TemporaryFileBackedOutputStream} so that it can
 * be read more than once: once for parsing and, for content types starting with
 * {@code "image"}, once more for thumbnail creation. The XHTML produced by the parse becomes
 * the metacard's metadata text (passed through {@code transformToXml} when {@code templates}
 * is set). When at least one {@link ContentMetadataExtractor} is registered, the plain text
 * of the document is captured as well, the metacard type is extended with the extractors'
 * attribute descriptors, and each extractor is given the text and the new metacard.
 *
 * @param input the content to transform; must not be {@code null}
 * @param id the id handed to {@code MetacardCreator.createMetacard}
 * @return the populated metacard
 * @throws IOException if re-opening or closing the buffered content fails, or if one is
 *     raised by a helper invoked here
 * @throws CatalogTransformerException if {@code input} is {@code null} or its bytes cannot
 *     be copied
 */
@Override
public Metacard transform(InputStream input, String id) throws IOException, CatalogTransformerException {
    LOGGER.debug("Transforming input stream using Tika.");

    if (input == null) {
        throw new CatalogTransformerException("Cannot transform null input.");
    }

    try (TemporaryFileBackedOutputStream bufferedContent = new TemporaryFileBackedOutputStream()) {
        // Buffer the whole input so it can be re-opened for parsing and thumbnailing.
        try {
            IOUtils.copy(input, bufferedContent);
        } catch (IOException e) {
            throw new CatalogTransformerException("Could not copy bytes of content message.", e);
        }

        Parser tikaParser = new AutoDetectParser();
        ToXMLContentHandler xmlHandler = new ToXMLContentHandler();

        // Plain text is only collected when somebody is registered to consume it.
        boolean plainTextWanted = !contentMetadataExtractors.isEmpty();
        ToTextContentHandler textHandler = plainTextWanted ? new ToTextContentHandler() : null;
        ContentHandler parseHandler = plainTextWanted
                ? new TeeContentHandler(xmlHandler, textHandler)
                : xmlHandler;

        TikaMetadataExtractor extractor = new TikaMetadataExtractor(tikaParser, parseHandler);
        Metadata tikaMetadata;
        try (InputStream contentStream = bufferedContent.asByteSource().openStream()) {
            tikaMetadata = extractor.parseMetadata(contentStream, new ParseContext());
        }

        String xmlMetadata = xmlHandler.toString();
        if (templates != null) {
            xmlMetadata = transformToXml(xmlMetadata);
        }

        String mimeType = tikaMetadata.get(Metadata.CONTENT_TYPE);
        MetacardType mappedType = getMetacardTypeFromMimeType(mimeType);
        // Fall back to the common Tika type when the mime type has no dedicated type.
        MetacardType baseType = mappedType != null ? mappedType : commonTikaMetacardType;

        Metacard result;
        if (textHandler == null) {
            result = MetacardCreator.createMetacard(tikaMetadata, id, xmlMetadata, baseType);
        } else {
            String bodyText = textHandler.toString();
            Set<AttributeDescriptor> extractorDescriptors = contentMetadataExtractors.values()
                    .stream()
                    .map(ContentMetadataExtractor::getMetacardAttributes)
                    .flatMap(Collection::stream)
                    .collect(Collectors.toSet());
            MetacardTypeImpl widenedType =
                    new MetacardTypeImpl(baseType.getName(), baseType, extractorDescriptors);
            result = MetacardCreator.createMetacard(tikaMetadata, id, xmlMetadata, widenedType);
            for (ContentMetadataExtractor registered : contentMetadataExtractors.values()) {
                registered.process(bodyText, result);
            }
        }

        if (StringUtils.isNotBlank(mimeType)) {
            result.setAttribute(new AttributeImpl(Core.DATATYPE, getDatatype(mimeType)));
        }

        if (StringUtils.startsWith(mimeType, "image")) {
            try (InputStream thumbnailSource = bufferedContent.asByteSource().openStream()) {
                createThumbnail(thumbnailSource, result);
            }
        }

        LOGGER.debug("Finished transforming input stream using Tika.");
        return result;
    }
}
Use of ddf.catalog.content.operation.ContentMetadataExtractor in project ddf by codice.
In class TikaInputTransformerTest, method setup:
/**
 * Prepares the transformer under test: stubs a {@link ContentMetadataExtractor} that is
 * handed out by the mocked bundle context, installs the fallback metacard types, and then
 * populates the mime type map.
 */
@Before
public void setup() {
    // Extractor mock that reports the test's attribute descriptors.
    ContentMetadataExtractor extractorMock = mock(ContentMetadataExtractor.class);
    when(extractorMock.getMetacardAttributes()).thenReturn(attributeDescriptors);

    // Any service lookup through the mocked bundle yields that extractor.
    when(bundleMock.getBundleContext()).thenReturn(bundleCtx);
    when(bundleCtx.getService(any())).thenReturn(extractorMock);

    // Fallback metacard types, one per supported format.
    tikaInputTransformer.setFallbackExcelMetacardType(
            getMetacardType(EXCEL_METACARDTYPE_NAME));
    tikaInputTransformer.setFallbackJpegMetacardType(
            getMetacardType(JPEG_METACARDTYPE_NAME));
    tikaInputTransformer.setFallbackMp4MetacardType(
            getMetacardType(MP4_METACARDTYPE_NAME));
    tikaInputTransformer.setFallbackMpegMetacardType(
            getMetacardType(MPEG_METACARDTYPE_NAME));
    tikaInputTransformer.setFallbackOfficeDocMetacardType(
            getMetacardType(OFFICEDOC_METACARDTYPE_NAME));
    tikaInputTransformer.setFallbackPdfMetacardType(
            getMetacardType(PDF_METACARDTYPE_NAME));
    tikaInputTransformer.setFallbackPowerpointMetacardType(
            getMetacardType(POWERPOINT_METACARDTYPE_NAME));

    // Must run after the fallback types are set.
    tikaInputTransformer.populateMimeTypeMap();
}
Use of ddf.catalog.content.operation.ContentMetadataExtractor in project ddf by codice.
In class PdfInputTransformer, method initializeMetacard:
/**
 * Creates the metacard for a PDF and fills in its fixed identifying attributes.
 *
 * <p>When {@code contentInput} is not blank, the metacard type is extended with the
 * attribute descriptors of every registered {@link ContentMetadataExtractor}, and each
 * extractor is then given the text and the new metacard. Otherwise the plain
 * {@code metacardType} is used and no extractor runs. In both cases the id, content type
 * name, media type and datatype are set afterwards.
 *
 * @param id the id to assign to the metacard
 * @param contentInput text handed to the extractors; may be blank
 * @return the initialized metacard
 */
private MetacardImpl initializeMetacard(String id, String contentInput) {
    MetacardImpl pdfMetacard;

    if (!StringUtils.isNotBlank(contentInput)) {
        // No text to analyse: use the base type and skip the extractors.
        pdfMetacard = new MetacardImpl(metacardType);
    } else {
        Set<AttributeDescriptor> extractorDescriptors = contentMetadataExtractors.values()
                .stream()
                .map(ContentMetadataExtractor::getMetacardAttributes)
                .flatMap(Collection::stream)
                .collect(Collectors.toSet());
        MetacardTypeImpl widenedType =
                new MetacardTypeImpl(metacardType.getName(), metacardType, extractorDescriptors);
        pdfMetacard = new MetacardImpl(widenedType);

        for (ContentMetadataExtractor registered : contentMetadataExtractors.values()) {
            registered.process(contentInput, pdfMetacard);
        }
    }

    String pdfMediaType = MediaType.PDF.toString();
    pdfMetacard.setId(id);
    pdfMetacard.setContentTypeName(pdfMediaType);
    pdfMetacard.setAttribute(Media.TYPE, pdfMediaType);
    pdfMetacard.setAttribute(Core.DATATYPE, DataType.DOCUMENT.toString());
    return pdfMetacard;
}
Use of ddf.catalog.content.operation.ContentMetadataExtractor in project ddf by codice.
In class PdfInputTransformer, method addContentMetadataExtractors:
/**
 * Resolves the {@link ContentMetadataExtractor} behind the given service reference and
 * stores it in {@code contentMetadataExtractors}, keyed by that reference.
 *
 * <p>Nothing is registered when the reference is {@code null}, when this transformer's
 * bundle or its bundle context is unavailable, or when the service can no longer be
 * obtained. {@code BundleContext.getService} is documented to return {@code null} for a
 * service that has been unregistered; storing such a {@code null} entry would make every
 * later iteration over the registered extractors fail with a NullPointerException.
 *
 * @param contentMetadataExtractorRef reference to the extractor service being bound; may be
 *     {@code null}, in which case the call is a no-op
 */
public void addContentMetadataExtractors(ServiceReference<ContentMetadataExtractor> contentMetadataExtractorRef) {
    if (contentMetadataExtractorRef == null) {
        return;
    }
    Bundle bundle = getBundle();
    // A bundle that is not started (or is stopping) has no valid context.
    if (bundle == null || bundle.getBundleContext() == null) {
        return;
    }
    ContentMetadataExtractor cme = bundle.getBundleContext().getService(contentMetadataExtractorRef);
    // Only track extractors that were actually resolved; never store a null value.
    if (cme != null) {
        contentMetadataExtractors.put(contentMetadataExtractorRef, cme);
    }
}
Aggregations