Use of org.apache.tika.detect.EncodingDetector in the Apache Tika project.
Class TikaEncodingDetectorTest, method testParameterization.
/**
 * Loads TIKA-2273-parameterize-encoding-detector.xml and checks that the
 * resulting encoding detector is a composite of exactly two children: an
 * Icu4jEncodingDetector whose getStripMarkup() returns true, followed by a
 * NonDetectingEncodingDetector.
 *
 * @throws Exception if the configuration cannot be loaded
 */
@Test
public void testParameterization() throws Exception {
    TikaConfig config = getConfig("TIKA-2273-parameterize-encoding-detector.xml");
    EncodingDetector detector = config.getEncodingDetector();
    assertTrue(detector instanceof CompositeEncodingDetector);
    List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
    assertEquals(2, detectors.size());
    // Check the type before casting so that a wrong first detector is reported
    // as an assertion failure rather than as a ClassCastException.
    assertTrue(detectors.get(0) instanceof Icu4jEncodingDetector);
    assertTrue(((Icu4jEncodingDetector) detectors.get(0)).getStripMarkup());
    assertTrue(detectors.get(1) instanceof NonDetectingEncodingDetector);
}
Use of org.apache.tika.detect.EncodingDetector in the Apache Tika project.
Class TikaEncodingDetectorTest, method testConfigurabilityOfUserSpecified.
/**
 * Builds an AutoDetectParser from
 * TIKA-2273-encoding-detector-outside-static-init.xml and checks that each of
 * the three parsers collected by findEncodingDetectionParsers uses a composite
 * encoding detector with two children, neither of which has "cu4j" in its
 * canonical class name. It then checks that parsing english.cp500.txt with
 * this parser fails with a TikaException whose message contains
 * "Failed to detect".
 *
 * @throws Exception if the configuration cannot be loaded or parsing fails in
 *                   an unexpected way
 */
@Test
public void testConfigurabilityOfUserSpecified() throws Exception {
    TikaConfig tikaConfig = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2273-encoding-detector-outside-static-init.xml"));
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    // Make sure that all static and non-static parsers are using the same encoding detector.
    List<Parser> parsers = new ArrayList<>();
    findEncodingDetectionParsers(p, parsers);
    assertEquals(3, parsers.size());
    for (Parser encodingDetectingParser : parsers) {
        EncodingDetector encodingDetector = ((AbstractEncodingDetectorParser) encodingDetectingParser).getEncodingDetector();
        assertTrue(encodingDetector instanceof CompositeEncodingDetector);
        assertEquals(2, ((CompositeEncodingDetector) encodingDetector).getDetectors().size());
        for (EncodingDetector child : ((CompositeEncodingDetector) encodingDetector).getDetectors()) {
            assertNotContained("cu4j", child.getClass().getCanonicalName());
        }
    }
    // Also make sure that detection still fails for this file when no ICU4J detector is configured.
    try {
        // Only the exception matters here, so the parse result is not kept.
        getXML("english.cp500.txt", p);
        fail("can't detect w/out ICU");
    } catch (TikaException e) {
        assertContains("Failed to detect", e.getMessage());
    }
}
Use of org.apache.tika.detect.EncodingDetector in the Apache Tika project.
Class TikaEncodingDetectorTest, method testDefault.
/**
 * Checks that the default TikaConfig exposes a composite encoding detector
 * whose three children are, in order, an HtmlEncodingDetector, a
 * UniversalEncodingDetector and an Icu4jEncodingDetector.
 */
@Test
public void testDefault() {
    EncodingDetector defaultDetector = TikaConfig.getDefaultConfig().getEncodingDetector();
    assertTrue(defaultDetector instanceof CompositeEncodingDetector);

    CompositeEncodingDetector composite = (CompositeEncodingDetector) defaultDetector;
    List<EncodingDetector> children = composite.getDetectors();
    assertEquals(3, children.size());

    // Expected child types, listed in the order the composite should hold them.
    Class<?>[] expectedOrder = {
        HtmlEncodingDetector.class,
        UniversalEncodingDetector.class,
        Icu4jEncodingDetector.class
    };
    for (int i = 0; i < expectedOrder.length; i++) {
        assertTrue(expectedOrder[i].isInstance(children.get(i)));
    }
}
Use of org.apache.tika.detect.EncodingDetector in the Apache Tika project.
Class TikaEncodingDetectorTest, method testNonDetectingDetectorParams.
/**
 * Builds an AutoDetectParser from TIKA-2273-non-detecting-params.xml and
 * checks that three encoding-detecting parsers are found. The first of them
 * must use a composite encoding detector whose only child is a
 * NonDetectingEncodingDetector reporting UTF-16LE as its charset.
 *
 * @throws Exception if the configuration cannot be loaded
 */
@Test
public void testNonDetectingDetectorParams() throws Exception {
    TikaConfig cfg = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2273-non-detecting-params.xml"));
    AutoDetectParser autoDetectParser = new AutoDetectParser(cfg);

    List<Parser> found = new ArrayList<>();
    findEncodingDetectionParsers(autoDetectParser, found);
    assertEquals(3, found.size());

    // Only the first collected parser is inspected.
    AbstractEncodingDetectorParser firstParser = (AbstractEncodingDetectorParser) found.get(0);
    EncodingDetector topLevel = firstParser.getEncodingDetector();
    assertTrue(topLevel instanceof CompositeEncodingDetector);

    CompositeEncodingDetector composite = (CompositeEncodingDetector) topLevel;
    assertEquals(1, composite.getDetectors().size());

    EncodingDetector onlyChild = composite.getDetectors().get(0);
    assertTrue(onlyChild instanceof NonDetectingEncodingDetector);

    NonDetectingEncodingDetector nonDetecting = (NonDetectingEncodingDetector) onlyChild;
    assertEquals(StandardCharsets.UTF_16LE, nonDetecting.getCharset());
}
Use of org.apache.tika.detect.EncodingDetector in the Apache Tika project.
Class TikaEncodingDetectorTest, method testBlackList.
/**
 * Loads TIKA-2273-blacklist-encoding-detector-default.xml and checks the
 * shape of the resulting detector tree: a top-level composite with two
 * children, the first being a nested composite that holds a
 * UniversalEncodingDetector followed by an Icu4jEncodingDetector, and the
 * second being a NonDetectingEncodingDetector.
 *
 * NOTE(review): presumably the config excludes HtmlEncodingDetector from the
 * default set; the config file is not visible here, so confirm against it.
 *
 * @throws Exception if the configuration cannot be loaded
 */
@Test
public void testBlackList() throws Exception {
    TikaConfig blacklistConfig = getConfig("TIKA-2273-blacklist-encoding-detector-default.xml");
    EncodingDetector root = blacklistConfig.getEncodingDetector();
    assertTrue(root instanceof CompositeEncodingDetector);

    List<EncodingDetector> topLevel = ((CompositeEncodingDetector) root).getDetectors();
    assertEquals(2, topLevel.size());

    // The first top-level entry is itself a composite.
    EncodingDetector nested = topLevel.get(0);
    assertTrue(nested instanceof CompositeEncodingDetector);
    CompositeEncodingDetector nestedComposite = (CompositeEncodingDetector) nested;
    List<EncodingDetector> nestedChildren = nestedComposite.getDetectors();
    assertEquals(2, nestedChildren.size());
    assertTrue(nestedChildren.get(0) instanceof UniversalEncodingDetector);
    assertTrue(nestedChildren.get(1) instanceof Icu4jEncodingDetector);

    // The second top-level entry is the non-detecting detector.
    assertTrue(topLevel.get(1) instanceof NonDetectingEncodingDetector);
}
Aggregations