Use of org.apache.tika.parser.txt.UniversalEncodingDetector in the Apache Camel project.
Class TikaParseTest, method testDocumentParse.
/**
 * Sends {@code src/test/resources/test.doc} to {@code direct:start} and checks
 * the single message expected at {@code resultEndpoint}:
 * <ul>
 *   <li>the body converts to a {@code String} containing {@code "test"};</li>
 *   <li>the charset detected from the body's bytes (encoded with the platform
 *       default charset) has a name starting with the platform default
 *       charset's name;</li>
 *   <li>the {@code Exchange.CONTENT_TYPE} header equals
 *       {@code application/msword}.</li>
 * </ul>
 *
 * @throws Exception if sending the document or asserting the endpoint fails
 */
@Test
public void testDocumentParse() throws Exception {
    // Register the expectations before the message is sent.
    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {
        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody(String.class);
            Map<String, Object> headerMap = exchange.getIn().getHeaders();
            assertThat(body, instanceOf(String.class));
            String text = (String) body;

            Charset detectedCharset;
            // The platform default charset is used on purpose: the assertion
            // below compares the detected charset against that same default.
            try (InputStream bodyIs = new ByteArrayInputStream(text.getBytes(Charset.defaultCharset()))) {
                detectedCharset = new UniversalEncodingDetector().detect(bodyIs, new Metadata());
            } catch (IOException e) {
                // Keep the original exception as the cause so the failure is diagnosable.
                throw new AssertionError("Charset detection failed while reading the parsed body", e);
            }
            // Guard against a null result so the failure is a readable assertion
            // rather than a NullPointerException on the next line.
            if (detectedCharset == null) {
                fail("No charset could be detected for the parsed body");
            }

            assertThat(detectedCharset.name(), startsWith(Charset.defaultCharset().name()));
            assertThat(text, containsString("test"));
            assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/msword"));
            return true;
        }
    });

    File document = new File("src/test/resources/test.doc");
    template.sendBody("direct:start", document);

    resultEndpoint.assertIsSatisfied();
}
Use of org.apache.tika.parser.txt.UniversalEncodingDetector in the Apache Tika project.
Class TikaEncodingDetectorTest, method testDefault.
/**
 * Checks the encoding detector obtained from the default Tika configuration:
 * it must be a {@code CompositeEncodingDetector} holding exactly three child
 * detectors, which are instances of {@code HtmlEncodingDetector},
 * {@code UniversalEncodingDetector} and {@code Icu4jEncodingDetector}, in
 * that order.
 */
@Test
public void testDefault() {
    EncodingDetector root = TikaConfig.getDefaultConfig().getEncodingDetector();
    assertTrue(root instanceof CompositeEncodingDetector);

    CompositeEncodingDetector composite = (CompositeEncodingDetector) root;
    List<EncodingDetector> children = composite.getDetectors();
    assertEquals(3, children.size());

    // Expected detector type at each position of the composite.
    Class<?>[] expectedOrder = {
        HtmlEncodingDetector.class,
        UniversalEncodingDetector.class,
        Icu4jEncodingDetector.class
    };
    for (int i = 0; i < expectedOrder.length; i++) {
        assertTrue(expectedOrder[i].isInstance(children.get(i)));
    }
}
Use of org.apache.tika.parser.txt.UniversalEncodingDetector in the Apache Tika project.
Class TikaEncodingDetectorTest, method testBlackList.
/**
 * Loads {@code TIKA-2273-blacklist-encoding-detector-default.xml} and checks
 * the shape of the resulting encoding detector: a
 * {@code CompositeEncodingDetector} with two children, where the first child
 * is itself a composite of a {@code UniversalEncodingDetector} followed by an
 * {@code Icu4jEncodingDetector}, and the second child is a
 * {@code NonDetectingEncodingDetector}.
 *
 * @throws Exception if the configuration cannot be loaded
 */
@Test
public void testBlackList() throws Exception {
    EncodingDetector root =
            getConfig("TIKA-2273-blacklist-encoding-detector-default.xml").getEncodingDetector();
    assertTrue(CompositeEncodingDetector.class.isInstance(root));

    List<EncodingDetector> topLevel = ((CompositeEncodingDetector) root).getDetectors();
    assertEquals(2, topLevel.size());

    // First child: a nested composite with two detectors of its own.
    EncodingDetector nestedComposite = topLevel.get(0);
    assertTrue(CompositeEncodingDetector.class.isInstance(nestedComposite));
    List<EncodingDetector> nested = ((CompositeEncodingDetector) nestedComposite).getDetectors();
    assertEquals(2, nested.size());
    assertTrue(UniversalEncodingDetector.class.isInstance(nested.get(0)));
    assertTrue(Icu4jEncodingDetector.class.isInstance(nested.get(1)));

    // Second child of the top-level composite.
    EncodingDetector second = topLevel.get(1);
    assertTrue(NonDetectingEncodingDetector.class.isInstance(second));
}
Use of org.apache.tika.parser.txt.UniversalEncodingDetector in the Apache Camel project.
Class TikaParseTest, method testDocumentParseWithEncoding.
/**
 * Sends {@code src/test/resources/testOpenOffice2.odt} to
 * {@code direct:start4} and checks the single message expected at
 * {@code resultEndpoint}:
 * <ul>
 *   <li>the body converts to a {@code String};</li>
 *   <li>the charset detected from the body's UTF-16 encoded bytes has a name
 *       starting with {@code StandardCharsets.UTF_16.name()};</li>
 *   <li>the {@code Exchange.CONTENT_TYPE} header equals
 *       {@code application/vnd.oasis.opendocument.text}.</li>
 * </ul>
 *
 * @throws Exception if sending the document or asserting the endpoint fails
 */
@Test
public void testDocumentParseWithEncoding() throws Exception {
    // Register the expectations before the message is sent.
    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {
        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody(String.class);
            Map<String, Object> headerMap = exchange.getIn().getHeaders();
            assertThat(body, instanceOf(String.class));
            String text = (String) body;

            Charset detectedCharset;
            try (InputStream bodyIs = new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_16))) {
                detectedCharset = new UniversalEncodingDetector().detect(bodyIs, new Metadata());
            } catch (IOException e) {
                // Keep the original exception as the cause so the failure is diagnosable.
                throw new AssertionError("Charset detection failed while reading the parsed body", e);
            }
            // Guard against a null result so the failure is a readable assertion
            // rather than a NullPointerException on the next line.
            if (detectedCharset == null) {
                fail("No charset could be detected for the parsed body");
            }

            assertThat(detectedCharset.name(), startsWith(StandardCharsets.UTF_16.name()));
            assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/vnd.oasis.opendocument.text"));
            return true;
        }
    });

    File document = new File("src/test/resources/testOpenOffice2.odt");
    template.sendBody("direct:start4", document);

    resultEndpoint.assertIsSatisfied();
}
Aggregations