Search in sources :

Example 1 with Metadata

use of org.apache.tika.metadata.Metadata in project elasticsearch by elastic.

The method execute of the class AttachmentProcessor.

/**
 * Parses the binary value stored in {@code field} via {@code TikaImpl.parse} and writes the
 * selected properties (content, language, date, title, author, keywords, content type,
 * content length) as a map into {@code targetField}.
 *
 * <p>Only properties contained in {@code properties} are emitted, and most are skipped when
 * the extracted value is empty.
 *
 * @param ingestDocument the document to read the source field from and to write the result to
 * @throws IllegalArgumentException if the source field is null and {@code ignoreMissing} is false
 * @throws ElasticsearchParseException if anything fails while parsing or collecting properties
 */
@Override
public void execute(IngestDocument ingestDocument) {
    final byte[] input = ingestDocument.getFieldValueAsBytes(field, ignoreMissing);
    if (input == null) {
        if (ignoreMissing) {
            // Nothing to parse and the caller asked us to tolerate that: leave the document untouched.
            return;
        }
        throw new IllegalArgumentException("field [" + field + "] is null, cannot parse.");
    }

    final Map<String, Object> extracted = new HashMap<>();
    try {
        final Metadata tikaMetadata = new Metadata();
        final String text = TikaImpl.parse(input, tikaMetadata, indexedChars);
        final boolean textPresent = Strings.hasLength(text);

        if (properties.contains(Property.CONTENT) && textPresent) {
            // Tika appears to add a trailing newline on its own; trim it away again.
            extracted.put(Property.CONTENT.toLowerCase(), text.trim());
        }

        if (properties.contains(Property.LANGUAGE) && textPresent) {
            final String detectedLanguage = new LanguageIdentifier(text).getLanguage();
            extracted.put(Property.LANGUAGE.toLowerCase(), detectedLanguage);
        }

        if (properties.contains(Property.DATE)) {
            final String created = tikaMetadata.get(TikaCoreProperties.CREATED);
            // The date is the only property checked against null rather than for emptiness.
            if (created != null) {
                extracted.put(Property.DATE.toLowerCase(), created);
            }
        }

        if (properties.contains(Property.TITLE)) {
            final String docTitle = tikaMetadata.get(TikaCoreProperties.TITLE);
            if (Strings.hasLength(docTitle)) {
                extracted.put(Property.TITLE.toLowerCase(), docTitle);
            }
        }

        if (properties.contains(Property.AUTHOR)) {
            final String docAuthor = tikaMetadata.get("Author");
            if (Strings.hasLength(docAuthor)) {
                extracted.put(Property.AUTHOR.toLowerCase(), docAuthor);
            }
        }

        if (properties.contains(Property.KEYWORDS)) {
            final String docKeywords = tikaMetadata.get("Keywords");
            if (Strings.hasLength(docKeywords)) {
                extracted.put(Property.KEYWORDS.toLowerCase(), docKeywords);
            }
        }

        if (properties.contains(Property.CONTENT_TYPE)) {
            final String mimeType = tikaMetadata.get(Metadata.CONTENT_TYPE);
            if (Strings.hasLength(mimeType)) {
                extracted.put(Property.CONTENT_TYPE.toLowerCase(), mimeType);
            }
        }

        if (properties.contains(Property.CONTENT_LENGTH)) {
            final String reportedLength = tikaMetadata.get(Metadata.CONTENT_LENGTH);
            // Prefer the length reported in the metadata; otherwise fall back to the
            // character count of the extracted text.
            final long length = Strings.hasLength(reportedLength)
                ? Long.parseLong(reportedLength)
                : text.length();
            extracted.put(Property.CONTENT_LENGTH.toLowerCase(), length);
        }
    } catch (Exception e) {
        throw new ElasticsearchParseException("Error parsing document in field [{}]", e, field);
    }

    ingestDocument.setFieldValue(targetField, extracted);
}
Also used : LanguageIdentifier(org.apache.tika.language.LanguageIdentifier) HashMap(java.util.HashMap) ElasticsearchParseException(org.elasticsearch.ElasticsearchParseException) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) ElasticsearchParseException(org.elasticsearch.ElasticsearchParseException) ConfigurationUtils.newConfigurationException(org.elasticsearch.ingest.ConfigurationUtils.newConfigurationException)

Example 2 with Metadata

use of org.apache.tika.metadata.Metadata in project che by eclipse.

The method accept of the class MediaTypeFilter.

/**
 * Detects the media type of {@code file} with a Tika detector and reports whether it is
 * one of the excluded types.
 *
 * @param file the file whose content is inspected
 * @return {@code true} when the detected media type is in {@code excludedMediaTypes}, when
 *         its {@code getType()} value is in {@code excludedTypes}, or when reading the
 *         content or detecting the type fails; {@code false} otherwise
 */
@Override
public boolean accept(VirtualFile file) {
    try (InputStream stream = file.getContent()) {
        final MediaType detected = new TikaConfig().getDetector().detect(stream, new Metadata());
        return excludedMediaTypes.contains(detected) || excludedTypes.contains(detected.getType());
    } catch (TikaException | ForbiddenException | ServerException | IOException e) {
        // Any failure to read or classify the file yields the same answer as an excluded type.
        return true;
    }
}
Also used : ForbiddenException(org.eclipse.che.api.core.ForbiddenException) TikaException(org.apache.tika.exception.TikaException) ServerException(org.eclipse.che.api.core.ServerException) TikaConfig(org.apache.tika.config.TikaConfig) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException)

Example 3 with Metadata

use of org.apache.tika.metadata.Metadata in project camel by apache.

The method doParse of the class TikaProducer.

/**
 * Runs the configured parser over the message body of {@code exchange}, passes the
 * collected metadata to {@code convertMetadataToHeaders}, and returns the stream that
 * the content handler was created with.
 *
 * @param exchange the exchange whose "in" message body is read as an {@link InputStream}
 * @return the {@link ByteArrayOutputStream} handed to {@code getContentHandler}
 */
private Object doParse(Exchange exchange) throws TikaException, IOException, SAXException, TransformerConfigurationException {
    final InputStream bodyStream = exchange.getIn().getBody(InputStream.class);
    final OutputStream sink = new ByteArrayOutputStream();
    final ContentHandler handler = getContentHandler(this.tikaConfiguration, sink);

    final Metadata extractedMetadata = new Metadata();
    final ParseContext parseContext = new ParseContext();
    // Register this producer's parser in the context under the Parser type.
    parseContext.set(Parser.class, this.parser);

    this.parser.parse(bodyStream, handler, extractedMetadata, parseContext);
    convertMetadataToHeaders(extractedMetadata, exchange);
    return sink;
}
Also used : InputStream(java.io.InputStream) ByteArrayOutputStream(java.io.ByteArrayOutputStream) OutputStream(java.io.OutputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ContentHandler(org.xml.sax.ContentHandler) ExpandedTitleContentHandler(org.apache.tika.sax.ExpandedTitleContentHandler)

Example 4 with Metadata

use of org.apache.tika.metadata.Metadata in project camel by apache.

The method doDetect of the class TikaProducer.

/**
 * Runs the configured detector over the message body of {@code exchange}, passes the
 * metadata object to {@code convertMetadataToHeaders}, and returns the detected media
 * type as a string.
 *
 * @param exchange the exchange whose "in" message body is read as an {@link InputStream}
 * @return the {@code toString()} form of the detected media type
 * @throws IOException if the detector fails to read the stream
 */
private Object doDetect(Exchange exchange) throws IOException {
    final Metadata detectionMetadata = new Metadata();
    final InputStream bodyStream = exchange.getIn().getBody(InputStream.class);
    final MediaType detected = this.detector.detect(bodyStream, detectionMetadata);
    convertMetadataToHeaders(detectionMetadata, exchange);
    return detected.toString();
}
Also used : InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType)

Example 5 with Metadata

use of org.apache.tika.metadata.Metadata in project camel by apache.

The method testDocumentParse of the class TikaParseTest.

/**
 * Sends {@code src/test/resources/test.doc} to {@code direct:start} and checks the single
 * message that reaches {@code resultEndpoint}: the body is a string containing "test", its
 * detected charset name starts with the platform default charset name, and the content-type
 * header is {@code application/msword}.
 */
@Test
public void testDocumentParse() throws Exception {
    // Configure the mock endpoint's expectations BEFORE sending, so the endpoint is fully
    // set up by the time the message arrives (the usual set-expectations / send / assert order).
    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {

        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody(String.class);
            Map<String, Object> headerMap = exchange.getIn().getHeaders();
            assertThat(body, instanceOf(String.class));
            Charset detectedCharset = null;
            try {
                // The bytes are produced with the platform default charset on purpose: the
                // assertion below compares the detected charset against that same default.
                InputStream bodyIs = new ByteArrayInputStream(((String) body).getBytes());
                UniversalEncodingDetector encodingDetector = new UniversalEncodingDetector();
                detectedCharset = encodingDetector.detect(bodyIs, new Metadata());
            } catch (IOException e1) {
                fail();
            }
            assertThat(detectedCharset.name(), startsWith(Charset.defaultCharset().name()));
            assertThat((String) body, containsString("test"));
            assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/msword"));
            return true;
        }
    });

    File document = new File("src/test/resources/test.doc");
    template.sendBody("direct:start", document);

    resultEndpoint.assertIsSatisfied();
}
Also used : UniversalEncodingDetector(org.apache.tika.parser.txt.UniversalEncodingDetector) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Charset(java.nio.charset.Charset) IOException(java.io.IOException) Predicate(org.apache.camel.Predicate) Exchange(org.apache.camel.Exchange) ByteArrayInputStream(java.io.ByteArrayInputStream) File(java.io.File) Map(java.util.Map) Test(org.junit.Test)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)643 Test (org.junit.Test)467 InputStream (java.io.InputStream)318 ParseContext (org.apache.tika.parser.ParseContext)281 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)268 TikaTest (org.apache.tika.TikaTest)257 ContentHandler (org.xml.sax.ContentHandler)228 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)151 ByteArrayInputStream (java.io.ByteArrayInputStream)141 Parser (org.apache.tika.parser.Parser)134 TikaInputStream (org.apache.tika.io.TikaInputStream)131 IOException (java.io.IOException)62 DefaultHandler (org.xml.sax.helpers.DefaultHandler)59 TikaException (org.apache.tika.exception.TikaException)46 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)36 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)36 StringWriter (java.io.StringWriter)33 Tika (org.apache.tika.Tika)28 FileInputStream (java.io.FileInputStream)27 MediaType (org.apache.tika.mime.MediaType)27