Usage of org.apache.tika.metadata.Metadata in the elastic/elasticsearch project: the execute method of the AttachmentProcessor class.
/**
 * Extracts text and metadata from the raw attachment bytes stored in {@code field}
 * using Tika, then writes the requested properties as a map under {@code targetField}.
 *
 * <p>If the source field is absent/null: returns silently when {@code ignoreMissing}
 * is set, otherwise throws {@link IllegalArgumentException}. Any Tika failure is
 * wrapped in an {@link ElasticsearchParseException}.
 *
 * @param ingestDocument the document being ingested; mutated in place
 */
@Override
public void execute(IngestDocument ingestDocument) {
    byte[] input = ingestDocument.getFieldValueAsBytes(field, ignoreMissing);
    if (input == null) {
        if (ignoreMissing) {
            return;
        }
        throw new IllegalArgumentException("field [" + field + "] is null, cannot parse.");
    }
    Map<String, Object> additionalFields = new HashMap<>();
    try {
        Metadata metadata = new Metadata();
        String parsedContent = TikaImpl.parse(input, metadata, indexedChars);
        boolean hasParsedContent = Strings.hasLength(parsedContent);
        if (hasParsedContent && properties.contains(Property.CONTENT)) {
            // somehow tika seems to append a newline at the end automatically, lets remove that again
            additionalFields.put(Property.CONTENT.toLowerCase(), parsedContent.trim());
        }
        if (hasParsedContent && properties.contains(Property.LANGUAGE)) {
            additionalFields.put(Property.LANGUAGE.toLowerCase(),
                new LanguageIdentifier(parsedContent).getLanguage());
        }
        if (properties.contains(Property.DATE)) {
            String createdDate = metadata.get(TikaCoreProperties.CREATED);
            // DATE intentionally uses a plain null check (an empty date string is kept)
            if (createdDate != null) {
                additionalFields.put(Property.DATE.toLowerCase(), createdDate);
            }
        }
        if (properties.contains(Property.TITLE)) {
            String title = metadata.get(TikaCoreProperties.TITLE);
            if (Strings.hasLength(title)) {
                additionalFields.put(Property.TITLE.toLowerCase(), title);
            }
        }
        if (properties.contains(Property.AUTHOR)) {
            String author = metadata.get("Author");
            if (Strings.hasLength(author)) {
                additionalFields.put(Property.AUTHOR.toLowerCase(), author);
            }
        }
        if (properties.contains(Property.KEYWORDS)) {
            String keywords = metadata.get("Keywords");
            if (Strings.hasLength(keywords)) {
                additionalFields.put(Property.KEYWORDS.toLowerCase(), keywords);
            }
        }
        if (properties.contains(Property.CONTENT_TYPE)) {
            String contentType = metadata.get(Metadata.CONTENT_TYPE);
            if (Strings.hasLength(contentType)) {
                additionalFields.put(Property.CONTENT_TYPE.toLowerCase(), contentType);
            }
        }
        if (properties.contains(Property.CONTENT_LENGTH)) {
            String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
            // prefer the length Tika reports; fall back to the extracted text length
            long length = Strings.hasLength(contentLength)
                ? Long.parseLong(contentLength)
                : parsedContent.length();
            additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length);
        }
    } catch (Exception e) {
        throw new ElasticsearchParseException("Error parsing document in field [{}]", e, field);
    }
    ingestDocument.setFieldValue(targetField, additionalFields);
}
Usage of org.apache.tika.metadata.Metadata in the eclipse/che project: the accept method of the MediaTypeFilter class.
/**
 * Returns {@code true} when the file's detected media type is excluded
 * (by full media type or by top-level type), or when the file cannot be
 * read/detected at all.
 *
 * @param file the virtual file whose content is probed
 * @return {@code true} to filter the file out, {@code false} to keep it
 */
@Override
public boolean accept(VirtualFile file) {
    try (InputStream content = file.getContent()) {
        // NOTE(review): TikaConfig construction is relatively expensive; consider
        // caching a single instance in a field instead of rebuilding it per call.
        TikaConfig tikaConfig = new TikaConfig();
        MediaType mimeType = tikaConfig.getDetector().detect(content, new Metadata());
        return excludedMediaTypes.contains(mimeType) || excludedTypes.contains(mimeType.getType());
    } catch (TikaException | ForbiddenException | ServerException | IOException e) {
        // Best-effort: an unreadable or undetectable file is treated as excluded
        // rather than failing the whole filtering pass.
        return true;
    }
}
Usage of org.apache.tika.metadata.Metadata in the apache/camel project: the doParse method of the TikaProducer class.
/**
 * Parses the incoming message body with the configured Tika parser, copies all
 * extracted Tika metadata onto the exchange (via convertMetadataToHeaders), and
 * returns the parsed output stream.
 *
 * @param exchange the Camel exchange carrying the body to parse
 * @return the output stream holding the parsed content
 */
private Object doParse(Exchange exchange) throws TikaException, IOException, SAXException, TransformerConfigurationException {
    InputStream body = exchange.getIn().getBody(InputStream.class);
    OutputStream output = new ByteArrayOutputStream();
    ContentHandler handler = getContentHandler(this.tikaConfiguration, output);
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, this.parser);
    Metadata tikaMetadata = new Metadata();
    this.parser.parse(body, handler, tikaMetadata, parseContext);
    convertMetadataToHeaders(tikaMetadata, exchange);
    return output;
}
Usage of org.apache.tika.metadata.Metadata in the apache/camel project: the doDetect method of the TikaProducer class.
/**
 * Detects the media type of the incoming message body, copies the Tika metadata
 * onto the exchange headers, and returns the media type as a string.
 *
 * @param exchange the Camel exchange carrying the body to probe
 * @return the detected media type, e.g. {@code "application/pdf"}
 */
private Object doDetect(Exchange exchange) throws IOException {
    Metadata tikaMetadata = new Metadata();
    InputStream body = exchange.getIn().getBody(InputStream.class);
    MediaType detected = this.detector.detect(body, tikaMetadata);
    convertMetadataToHeaders(tikaMetadata, exchange);
    return detected.toString();
}
Usage of org.apache.tika.metadata.Metadata in the apache/camel project: the testDocumentParse method of the TikaParseTest class.
/**
 * Verifies that a Word document sent through the route is parsed to a text body
 * in the platform charset, contains the expected content, and carries the
 * msword content-type header.
 */
@Test
public void testDocumentParse() throws Exception {
    File document = new File("src/test/resources/test.doc");
    // Mock expectations must be registered BEFORE the message is sent; the
    // original code sent first, so the exchange could arrive with no
    // expectations configured yet.
    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {
        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody(String.class);
            Map<String, Object> headerMap = exchange.getIn().getHeaders();
            assertThat(body, instanceOf(String.class));
            Charset detectedCharset = null;
            try {
                // Encode with the default charset explicitly (same behavior as the
                // bare getBytes() it replaces, but no hidden platform dependency),
                // matching the defaultCharset() assertion below.
                InputStream bodyIs = new ByteArrayInputStream(((String) body).getBytes(Charset.defaultCharset()));
                UniversalEncodingDetector encodingDetector = new UniversalEncodingDetector();
                detectedCharset = encodingDetector.detect(bodyIs, new Metadata());
            } catch (IOException e1) {
                fail();
            }
            assertThat(detectedCharset.name(), startsWith(Charset.defaultCharset().name()));
            assertThat((String) body, containsString("test"));
            assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/msword"));
            return true;
        }
    });
    template.sendBody("direct:start", document);
    resultEndpoint.assertIsSatisfied();
}
Aggregations