use of org.apache.tika.parser.txt.Icu4jEncodingDetector in project tika by apache.
the class TikaEncodingDetectorTest method testParameterization.
/**
 * Loads the TIKA-2273 "parameterize" configuration and checks that the
 * configured composite detector holds exactly two children: an
 * {@link Icu4jEncodingDetector} with markup stripping enabled, followed by a
 * {@link NonDetectingEncodingDetector}.
 *
 * @throws Exception if the test configuration cannot be loaded
 */
@Test
public void testParameterization() throws Exception {
    TikaConfig config = getConfig("TIKA-2273-parameterize-encoding-detector.xml");
    EncodingDetector detector = config.getEncodingDetector();
    assertTrue(detector instanceof CompositeEncodingDetector);

    List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
    assertEquals(2, detectors.size());

    // Assert the type before casting so that a wrong detector shows up as an
    // assertion failure instead of a ClassCastException.
    EncodingDetector first = detectors.get(0);
    assertTrue(first instanceof Icu4jEncodingDetector);
    assertTrue(((Icu4jEncodingDetector) first).getStripMarkup());

    assertTrue(detectors.get(1) instanceof NonDetectingEncodingDetector);
}
use of org.apache.tika.parser.txt.Icu4jEncodingDetector in project tika by apache.
the class TikaEncodingDetectorTest method testDefault.
/**
 * Checks that the encoding detector obtained from the default TikaConfig is a
 * composite of exactly three detectors, in this order: HtmlEncodingDetector,
 * UniversalEncodingDetector, Icu4jEncodingDetector.
 */
@Test
public void testDefault() {
    EncodingDetector defaultDetector = TikaConfig.getDefaultConfig().getEncodingDetector();
    assertTrue(defaultDetector instanceof CompositeEncodingDetector);

    CompositeEncodingDetector composite = (CompositeEncodingDetector) defaultDetector;
    List<EncodingDetector> children = composite.getDetectors();
    assertEquals(3, children.size());

    // Expected concrete type at each position of the composite.
    Class<?>[] expectedTypes = {
        HtmlEncodingDetector.class,
        UniversalEncodingDetector.class,
        Icu4jEncodingDetector.class
    };
    for (int i = 0; i < expectedTypes.length; i++) {
        assertTrue(expectedTypes[i].isInstance(children.get(i)));
    }
}
use of org.apache.tika.parser.txt.Icu4jEncodingDetector in project tika by apache.
the class TikaEncodingDetectorTest method testBlackList.
/**
 * Loads the TIKA-2273 "blacklist" configuration and checks the shape of the
 * resulting detector tree: a top-level composite with two children, where the
 * first child is itself a composite of UniversalEncodingDetector and
 * Icu4jEncodingDetector, and the second child is a NonDetectingEncodingDetector.
 *
 * @throws Exception if the test configuration cannot be loaded
 */
@Test
public void testBlackList() throws Exception {
    EncodingDetector root =
            getConfig("TIKA-2273-blacklist-encoding-detector-default.xml").getEncodingDetector();
    assertTrue(root instanceof CompositeEncodingDetector);

    List<EncodingDetector> topLevel = ((CompositeEncodingDetector) root).getDetectors();
    assertEquals(2, topLevel.size());

    // First top-level entry: a nested composite.
    EncodingDetector nestedRoot = topLevel.get(0);
    assertTrue(nestedRoot instanceof CompositeEncodingDetector);

    List<EncodingDetector> nested = ((CompositeEncodingDetector) nestedRoot).getDetectors();
    assertEquals(2, nested.size());
    assertTrue(nested.get(0) instanceof UniversalEncodingDetector);
    assertTrue(nested.get(1) instanceof Icu4jEncodingDetector);

    // Second top-level entry: the explicitly configured non-detecting detector.
    assertTrue(topLevel.get(1) instanceof NonDetectingEncodingDetector);
}
use of org.apache.tika.parser.txt.Icu4jEncodingDetector in project tika by apache.
the class DBFParser method getCharset.
/**
 * Guesses the charset of the table's text by running ICU4J charset detection
 * over the raw bytes of the character-typed ({@code ColType.C}) cells in the
 * given sample rows.
 *
 * <p>Bytes are collected until the buffer exceeds
 * {@code MAX_CHARS_FOR_CHARSET_DETECTION}. If 20 bytes or fewer were collected,
 * or the detector reports no match, {@code DEFAULT_CHARSET} is returned.
 *
 * @param firstRows sample rows whose character cells feed the detector
 * @param header    file header; currently unused (see TODO below)
 * @return the detected charset, or {@code DEFAULT_CHARSET} when detection is
 *         skipped or inconclusive
 * @throws IOException   if reading the sample bytes fails
 * @throws TikaException declared for callers; not thrown by the current body
 */
private Charset getCharset(List<DBFRow> firstRows, DBFFileHeader header) throws IOException, TikaException {
    //TODO: potentially use codepage info in the header
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    // Labeled break: once enough bytes are gathered, stop scanning ALL rows,
    // not just the current row's cells.
    collect:
    for (DBFRow row : firstRows) {
        for (DBFCell cell : row.cells) {
            if (cell.getColType().equals(DBFColumnHeader.ColType.C)) {
                bos.write(cell.getBytes());
                if (bos.size() > MAX_CHARS_FOR_CHARSET_DETECTION) {
                    break collect;
                }
            }
        }
    }

    byte[] bytes = bos.toByteArray();
    // Presumably too little text for a meaningful guess; keep the default.
    if (bytes.length <= 20) {
        return DEFAULT_CHARSET;
    }

    // Single detection pass over an in-memory stream (nothing to close).
    EncodingDetector detector = new Icu4jEncodingDetector();
    Charset detected = detector.detect(new ByteArrayInputStream(bytes), new Metadata());
    // The EncodingDetector contract allows a null result when nothing matches.
    return detected != null ? detected : DEFAULT_CHARSET;
}
Aggregations