Use of org.apache.tika.parser.AutoDetectParser in project tika by Apache.
The class TikaEncodingDetectorTest, method testConfigurabilityOfUserSpecified.
/**
 * Loads the TIKA-2273 "encoding-detector-outside-static-init" config and checks the
 * encoding detector carried by every encoding-detecting parser reachable from an
 * {@link AutoDetectParser} built on that config.
 *
 * <p>Each of the three parsers found must expose a {@link CompositeEncodingDetector}
 * with exactly two children, and no child's canonical class name may contain
 * {@code "cu4j"} (presumably this rules out the ICU4J-based detector; confirm against
 * the config file). Finally, parsing {@code english.cp500.txt} must fail with a
 * {@link TikaException} whose message contains {@code "Failed to detect"}.
 *
 * @throws Exception if the config cannot be loaded or an unexpected parse error occurs
 */
@Test
public void testConfigurabilityOfUserSpecified() throws Exception {
    TikaConfig tikaConfig;
    // Close the config stream as soon as the config is built so the test does not leak it.
    try (java.io.InputStream configStream = getResourceAsStream(
            "/org/apache/tika/config/TIKA-2273-encoding-detector-outside-static-init.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    //make sure that all static and non-static parsers are using the same encoding detector!
    List<Parser> parsers = new ArrayList<>();
    findEncodingDetectionParsers(p, parsers);
    assertEquals(3, parsers.size());
    for (Parser encodingDetectingParser : parsers) {
        EncodingDetector encodingDetector =
                ((AbstractEncodingDetectorParser) encodingDetectingParser).getEncodingDetector();
        assertTrue(encodingDetector instanceof CompositeEncodingDetector);
        // Cast once; the instanceof assertion above guarantees it is safe.
        CompositeEncodingDetector composite = (CompositeEncodingDetector) encodingDetector;
        assertEquals(2, composite.getDetectors().size());
        for (EncodingDetector child : composite.getDetectors()) {
            assertNotContained("cu4j", child.getClass().getCanonicalName());
        }
    }
    //also just make sure this is still true
    try {
        // The result is irrelevant: reaching the next line at all means detection succeeded.
        getXML("english.cp500.txt", p);
        fail("can't detect w/out ICU");
    } catch (TikaException e) {
        assertContains("Failed to detect", e.getMessage());
    }
}
Use of org.apache.tika.parser.AutoDetectParser in project tika by Apache.
The class TikaEncodingDetectorTest, method testNonDetectingDetectorParams.
/**
 * Loads the TIKA-2273 "non-detecting-params" config and checks that the first
 * encoding-detecting parser reachable from an {@link AutoDetectParser} uses a
 * {@link CompositeEncodingDetector} whose only child is a
 * {@link NonDetectingEncodingDetector} reporting {@code UTF-16LE}.
 *
 * <p>Three encoding-detecting parsers are expected in total; only the first one's
 * detector is inspected.
 *
 * @throws Exception if the config cannot be loaded
 */
@Test
public void testNonDetectingDetectorParams() throws Exception {
    TikaConfig tikaConfig;
    // Close the config stream as soon as the config is built so the test does not leak it.
    try (java.io.InputStream configStream = getResourceAsStream(
            "/org/apache/tika/config/TIKA-2273-non-detecting-params.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    List<Parser> parsers = new ArrayList<>();
    findEncodingDetectionParsers(p, parsers);
    assertEquals(3, parsers.size());
    EncodingDetector encodingDetector =
            ((AbstractEncodingDetectorParser) parsers.get(0)).getEncodingDetector();
    assertTrue(encodingDetector instanceof CompositeEncodingDetector);
    // Cast once; the instanceof assertion above guarantees it is safe.
    CompositeEncodingDetector composite = (CompositeEncodingDetector) encodingDetector;
    assertEquals(1, composite.getDetectors().size());
    EncodingDetector child = composite.getDetectors().get(0);
    assertTrue(child instanceof NonDetectingEncodingDetector);
    assertEquals(StandardCharsets.UTF_16LE, ((NonDetectingEncodingDetector) child).getCharset());
}
Use of org.apache.tika.parser.AutoDetectParser in project tika by Apache.
The class TikaEncodingDetectorTest, method testEncodingDetectorConfigurability.
/**
 * Loads the TIKA-2273 "no-icu4j-encoding-detector" config and checks that
 * {@code english.cp500.txt} cannot be parsed with it, both through an
 * {@link AutoDetectParser} and through the {@link Tika} facade.
 *
 * <p>In both cases a {@link TikaException} whose message contains
 * {@code "Failed to detect"} is expected; a successful parse fails the test.
 *
 * @throws Exception if the config cannot be loaded or an unexpected parse error occurs
 */
@Test
public void testEncodingDetectorConfigurability() throws Exception {
    TikaConfig tikaConfig;
    // Close the config stream as soon as the config is built so the test does not leak it.
    try (java.io.InputStream configStream = getResourceAsStream(
            "/org/apache/tika/config/TIKA-2273-no-icu4j-encoding-detector.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    try {
        // The result is irrelevant: reaching the next line at all means detection succeeded.
        getXML("english.cp500.txt", p);
        fail("can't detect w/out ICU");
    } catch (TikaException e) {
        assertContains("Failed to detect", e.getMessage());
    }
    // Same expectation when going through the Tika facade instead of the parser directly.
    Tika tika = new Tika(tikaConfig);
    try {
        tika.parseToString(getResourceAsFile("/test-documents/english.cp500.txt"));
        fail("can't detect w/out ICU");
    } catch (TikaException e) {
        assertContains("Failed to detect", e.getMessage());
    }
}
Use of org.apache.tika.parser.AutoDetectParser in project tika by Apache.
The class MyFirstTika, method parseUsingAutoDetect.
/**
 * Parses the given file with an {@link AutoDetectParser} built from the supplied
 * configuration and returns the text collected by a {@link BodyContentHandler}.
 *
 * <p>A progress line naming the file is printed to standard output before parsing.
 *
 * @param filename   path of the file to parse
 * @param tikaConfig configuration used to construct the parser
 * @param metadata   metadata object handed to both the input stream factory and the parser
 * @return the handler's string form after parsing
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    // try-with-resources guarantees the underlying file is released even when parsing throws.
    try (TikaInputStream stream = TikaInputStream.get(new File(filename), metadata)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    return handler.toString();
}
Use of org.apache.tika.parser.AutoDetectParser in project tika by Apache.
The class ParsingExample, method parseEmbeddedExample.
/**
 * Demonstrates extracting text from an outer document together with the text of
 * every document embedded in it. What makes this work is registering a
 * {@link Parser} in the {@link ParseContext} before parsing.
 *
 * <p>The input is the {@code test_recursive_embedded.docx} resource, looked up
 * relative to {@link ParsingExample}.
 *
 * @return the text gathered by the content handler, embedded documents included
 * @throws IOException   declared by the parse call and by closing the resource stream
 * @throws SAXException  declared by the parse call
 * @throws TikaException declared by the parse call
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
    final AutoDetectParser autoDetect = new AutoDetectParser();

    // Register the parser in the context so it is available for embedded documents too.
    final ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, autoDetect);

    final Metadata docMetadata = new Metadata();
    final BodyContentHandler textCollector = new BodyContentHandler();

    try (InputStream docx =
            ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        autoDetect.parse(docx, textCollector, docMetadata, parseContext);
    }
    return textCollector.toString();
}
Aggregations