Search in sources:

Example 1 with UniversalEncodingDetector

use of org.apache.tika.parser.txt.UniversalEncodingDetector in project camel by apache.

the class TikaParseTest method testDocumentParse.

/**
 * Sends {@code src/test/resources/test.doc} to the {@code direct:start} endpoint and
 * verifies the single message received by {@code resultEndpoint}.
 *
 * <p>The predicate checks that the body is a {@link String} containing {@code "test"},
 * that the charset detected from the body's bytes starts with the platform default
 * charset name, and that the content-type header is {@code application/msword}.
 *
 * @throws Exception if sending the message or asserting the endpoint fails
 */
@Test
public void testDocumentParse() throws Exception {
    File document = new File("src/test/resources/test.doc");
    template.sendBody("direct:start", document);
    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {

        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody(String.class);
            Map<String, Object> headerMap = exchange.getIn().getHeaders();
            assertThat(body, instanceOf(String.class));
            Charset detectedCharset = null;
            // The body is deliberately encoded with the platform default charset, because the
            // assertion below compares the detected charset against that same default.
            byte[] bodyBytes = ((String) body).getBytes(Charset.defaultCharset());
            try (InputStream bodyIs = new ByteArrayInputStream(bodyBytes)) {
                UniversalEncodingDetector encodingDetector = new UniversalEncodingDetector();
                detectedCharset = encodingDetector.detect(bodyIs, new Metadata());
            } catch (IOException e1) {
                // Report the cause instead of failing silently.
                fail("Charset detection threw an exception: " + e1);
            }
            // Fail with a readable message rather than a NullPointerException on name().
            if (detectedCharset == null) {
                fail("No charset could be detected for the parsed body");
            }
            assertThat(detectedCharset.name(), startsWith(Charset.defaultCharset().name()));
            assertThat((String) body, containsString("test"));
            assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/msword"));
            return true;
        }
    });
    resultEndpoint.assertIsSatisfied();
}
Also used: UniversalEncodingDetector(org.apache.tika.parser.txt.UniversalEncodingDetector) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Charset(java.nio.charset.Charset) IOException(java.io.IOException) Predicate(org.apache.camel.Predicate) Exchange(org.apache.camel.Exchange) File(java.io.File) Map(java.util.Map) Test(org.junit.Test)

Example 2 with UniversalEncodingDetector

use of org.apache.tika.parser.txt.UniversalEncodingDetector in project tika by apache.

the class TikaEncodingDetectorTest method testDefault.

/**
 * Checks that the default Tika configuration exposes a {@link CompositeEncodingDetector}
 * holding exactly three child detectors, in the order HTML, Universal, ICU4J.
 */
@Test
public void testDefault() {
    EncodingDetector configured = TikaConfig.getDefaultConfig().getEncodingDetector();
    assertTrue(configured instanceof CompositeEncodingDetector);

    CompositeEncodingDetector composite = (CompositeEncodingDetector) configured;
    List<EncodingDetector> children = composite.getDetectors();

    // Expected child detector types, in registration order.
    Class<?>[] expectedOrder = {
        HtmlEncodingDetector.class,
        UniversalEncodingDetector.class,
        Icu4jEncodingDetector.class
    };
    assertEquals(3, children.size());
    for (int i = 0; i < expectedOrder.length; i++) {
        assertTrue(expectedOrder[i].isInstance(children.get(i)));
    }
}
Also used: Icu4jEncodingDetector(org.apache.tika.parser.txt.Icu4jEncodingDetector) NonDetectingEncodingDetector(org.apache.tika.detect.NonDetectingEncodingDetector) UniversalEncodingDetector(org.apache.tika.parser.txt.UniversalEncodingDetector) CompositeEncodingDetector(org.apache.tika.detect.CompositeEncodingDetector) EncodingDetector(org.apache.tika.detect.EncodingDetector) HtmlEncodingDetector(org.apache.tika.parser.html.HtmlEncodingDetector) Test(org.junit.Test)

Example 3 with UniversalEncodingDetector

use of org.apache.tika.parser.txt.UniversalEncodingDetector in project tika by apache.

the class TikaEncodingDetectorTest method testBlackList.

/**
 * Loads {@code TIKA-2273-blacklist-encoding-detector-default.xml} and checks the shape of
 * the resulting encoding detector tree: a top-level composite with two entries, where the
 * first is itself a composite of (Universal, ICU4J) and the second is a
 * {@link NonDetectingEncodingDetector}.
 *
 * @throws Exception if the configuration cannot be loaded
 */
@Test
public void testBlackList() throws Exception {
    TikaConfig tikaConfig = getConfig("TIKA-2273-blacklist-encoding-detector-default.xml");

    // Top level: must be a composite with exactly two entries.
    EncodingDetector root = tikaConfig.getEncodingDetector();
    assertTrue(CompositeEncodingDetector.class.isInstance(root));
    List<EncodingDetector> topLevel = ((CompositeEncodingDetector) root).getDetectors();
    assertEquals(2, topLevel.size());

    // First entry: a nested composite holding the Universal and ICU4J detectors.
    EncodingDetector firstEntry = topLevel.get(0);
    assertTrue(CompositeEncodingDetector.class.isInstance(firstEntry));
    List<EncodingDetector> nested = ((CompositeEncodingDetector) firstEntry).getDetectors();
    assertEquals(2, nested.size());
    assertTrue(UniversalEncodingDetector.class.isInstance(nested.get(0)));
    assertTrue(Icu4jEncodingDetector.class.isInstance(nested.get(1)));

    // Second entry: the non-detecting detector.
    assertTrue(NonDetectingEncodingDetector.class.isInstance(topLevel.get(1)));
}
Also used: Icu4jEncodingDetector(org.apache.tika.parser.txt.Icu4jEncodingDetector) NonDetectingEncodingDetector(org.apache.tika.detect.NonDetectingEncodingDetector) UniversalEncodingDetector(org.apache.tika.parser.txt.UniversalEncodingDetector) CompositeEncodingDetector(org.apache.tika.detect.CompositeEncodingDetector) EncodingDetector(org.apache.tika.detect.EncodingDetector) HtmlEncodingDetector(org.apache.tika.parser.html.HtmlEncodingDetector) Test(org.junit.Test)

Example 4 with UniversalEncodingDetector

use of org.apache.tika.parser.txt.UniversalEncodingDetector in project camel by apache.

the class TikaParseTest method testDocumentParseWithEncoding.

/**
 * Sends {@code src/test/resources/testOpenOffice2.odt} to the {@code direct:start4}
 * endpoint and verifies the single message received by {@code resultEndpoint}.
 *
 * <p>The predicate checks that the body is a {@link String}, that the charset detected
 * from the body's UTF-16 bytes starts with {@code UTF-16}, and that the content-type
 * header is {@code application/vnd.oasis.opendocument.text}.
 *
 * @throws Exception if sending the message or asserting the endpoint fails
 */
@Test
public void testDocumentParseWithEncoding() throws Exception {
    File document = new File("src/test/resources/testOpenOffice2.odt");
    template.sendBody("direct:start4", document);
    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {

        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody(String.class);
            Map<String, Object> headerMap = exchange.getIn().getHeaders();
            assertThat(body, instanceOf(String.class));
            Charset detectedCharset = null;
            byte[] bodyBytes = ((String) body).getBytes(StandardCharsets.UTF_16);
            try (InputStream bodyIs = new ByteArrayInputStream(bodyBytes)) {
                UniversalEncodingDetector encodingDetector = new UniversalEncodingDetector();
                detectedCharset = encodingDetector.detect(bodyIs, new Metadata());
            } catch (IOException e1) {
                // Report the cause instead of failing silently.
                fail("Charset detection threw an exception: " + e1);
            }
            // Fail with a readable message rather than a NullPointerException on name().
            if (detectedCharset == null) {
                fail("No charset could be detected for the parsed body");
            }
            assertThat(detectedCharset.name(), startsWith(StandardCharsets.UTF_16.name()));
            assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/vnd.oasis.opendocument.text"));
            return true;
        }
    });
    resultEndpoint.assertIsSatisfied();
}
Also used: UniversalEncodingDetector(org.apache.tika.parser.txt.UniversalEncodingDetector) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Charset(java.nio.charset.Charset) IOException(java.io.IOException) Predicate(org.apache.camel.Predicate) Exchange(org.apache.camel.Exchange) File(java.io.File) Map(java.util.Map) Test(org.junit.Test)

Aggregations

UniversalEncodingDetector (org.apache.tika.parser.txt.UniversalEncodingDetector)4 Test (org.junit.Test)4 ByteArrayInputStream (java.io.ByteArrayInputStream)2 File (java.io.File)2 IOException (java.io.IOException)2 InputStream (java.io.InputStream)2 Charset (java.nio.charset.Charset)2 Map (java.util.Map)2 Exchange (org.apache.camel.Exchange)2 Predicate (org.apache.camel.Predicate)2 CompositeEncodingDetector (org.apache.tika.detect.CompositeEncodingDetector)2 EncodingDetector (org.apache.tika.detect.EncodingDetector)2 NonDetectingEncodingDetector (org.apache.tika.detect.NonDetectingEncodingDetector)2 Metadata (org.apache.tika.metadata.Metadata)2 HtmlEncodingDetector (org.apache.tika.parser.html.HtmlEncodingDetector)2 Icu4jEncodingDetector (org.apache.tika.parser.txt.Icu4jEncodingDetector)2