Search in sources :

Example 6 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class TikaEncodingDetectorTest method testConfigurabilityOfUserSpecified.

/**
 * Loads the TIKA-2273 "encoding detector outside static init" configuration and
 * checks that every encoding-detecting parser found under the resulting
 * {@link AutoDetectParser} carries a two-element {@link CompositeEncodingDetector}
 * whose children have no "cu4j" in their class names, then confirms that parsing
 * the cp500 sample fails with a "Failed to detect" {@link TikaException}.
 */
@Test
public void testConfigurabilityOfUserSpecified() throws Exception {
    TikaConfig userConfig = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2273-encoding-detector-outside-static-init.xml"));
    AutoDetectParser autoDetectParser = new AutoDetectParser(userConfig);

    // Static and non-static parsers alike must all be using the same encoding detector.
    List<Parser> detectingParsers = new ArrayList<>();
    findEncodingDetectionParsers(autoDetectParser, detectingParsers);
    assertEquals(3, detectingParsers.size());

    for (Parser candidate : detectingParsers) {
        AbstractEncodingDetectorParser detectingParser = (AbstractEncodingDetectorParser) candidate;
        EncodingDetector detector = detectingParser.getEncodingDetector();
        assertTrue(detector instanceof CompositeEncodingDetector);

        CompositeEncodingDetector composite = (CompositeEncodingDetector) detector;
        assertEquals(2, composite.getDetectors().size());
        for (EncodingDetector member : composite.getDetectors()) {
            String memberClassName = member.getClass().getCanonicalName();
            assertNotContained("cu4j", memberClassName);
        }
    }

    // Sanity check: with this configuration the cp500 file must still be undetectable.
    try {
        // The value is never read; the assignment only exists to trigger the parse.
        Metadata unused = getXML("english.cp500.txt", autoDetectParser).metadata;
        fail("can't detect w/out ICU");
    } catch (TikaException expected) {
        assertContains("Failed to detect", expected.getMessage());
    }
}
Also used : Icu4jEncodingDetector(org.apache.tika.parser.txt.Icu4jEncodingDetector) NonDetectingEncodingDetector(org.apache.tika.detect.NonDetectingEncodingDetector) UniversalEncodingDetector(org.apache.tika.parser.txt.UniversalEncodingDetector) CompositeEncodingDetector(org.apache.tika.detect.CompositeEncodingDetector) EncodingDetector(org.apache.tika.detect.EncodingDetector) HtmlEncodingDetector(org.apache.tika.parser.html.HtmlEncodingDetector) CompositeEncodingDetector(org.apache.tika.detect.CompositeEncodingDetector) TikaException(org.apache.tika.exception.TikaException) ArrayList(java.util.ArrayList) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TXTParser(org.apache.tika.parser.txt.TXTParser) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) Test(org.junit.Test)

Example 7 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class TikaEncodingDetectorTest method testNonDetectingDetectorParams.

/**
 * Loads the TIKA-2273 "non-detecting params" configuration and checks that the
 * first encoding-detecting parser uses a {@link CompositeEncodingDetector} with a
 * single {@link NonDetectingEncodingDetector} child whose charset is UTF-16LE.
 */
@Test
public void testNonDetectingDetectorParams() throws Exception {
    TikaConfig paramsConfig = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2273-non-detecting-params.xml"));
    AutoDetectParser autoDetectParser = new AutoDetectParser(paramsConfig);

    List<Parser> detectingParsers = new ArrayList<>();
    findEncodingDetectionParsers(autoDetectParser, detectingParsers);
    assertEquals(3, detectingParsers.size());

    // Only the first collected parser is inspected.
    AbstractEncodingDetectorParser firstParser = (AbstractEncodingDetectorParser) detectingParsers.get(0);
    EncodingDetector detector = firstParser.getEncodingDetector();
    assertTrue(detector instanceof CompositeEncodingDetector);

    CompositeEncodingDetector composite = (CompositeEncodingDetector) detector;
    assertEquals(1, composite.getDetectors().size());

    EncodingDetector onlyChild = composite.getDetectors().get(0);
    assertTrue(onlyChild instanceof NonDetectingEncodingDetector);

    NonDetectingEncodingDetector fixedCharsetDetector = (NonDetectingEncodingDetector) onlyChild;
    assertEquals(StandardCharsets.UTF_16LE, fixedCharsetDetector.getCharset());
}
Also used : Icu4jEncodingDetector(org.apache.tika.parser.txt.Icu4jEncodingDetector) NonDetectingEncodingDetector(org.apache.tika.detect.NonDetectingEncodingDetector) UniversalEncodingDetector(org.apache.tika.parser.txt.UniversalEncodingDetector) CompositeEncodingDetector(org.apache.tika.detect.CompositeEncodingDetector) EncodingDetector(org.apache.tika.detect.EncodingDetector) HtmlEncodingDetector(org.apache.tika.parser.html.HtmlEncodingDetector) CompositeEncodingDetector(org.apache.tika.detect.CompositeEncodingDetector) ArrayList(java.util.ArrayList) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) NonDetectingEncodingDetector(org.apache.tika.detect.NonDetectingEncodingDetector) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TXTParser(org.apache.tika.parser.txt.TXTParser) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) Test(org.junit.Test)

Example 8 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class TikaEncodingDetectorTest method testEncodingDetectorConfigurability.

/**
 * Loads the TIKA-2273 "no icu4j encoding detector" configuration and checks that
 * the cp500 sample cannot be parsed through either entry point built from it:
 * an {@link AutoDetectParser} and the {@link Tika} facade. Both attempts must
 * fail with a {@link TikaException} whose message contains "Failed to detect".
 */
@Test
public void testEncodingDetectorConfigurability() throws Exception {
    TikaConfig noIcuConfig = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2273-no-icu4j-encoding-detector.xml"));

    // First entry point: AutoDetectParser.
    AutoDetectParser autoDetectParser = new AutoDetectParser(noIcuConfig);
    try {
        // The value is never read; the assignment only exists to trigger the parse.
        Metadata unusedMetadata = getXML("english.cp500.txt", autoDetectParser).metadata;
        fail("can't detect w/out ICU");
    } catch (TikaException expected) {
        assertContains("Failed to detect", expected.getMessage());
    }

    // Second entry point: the Tika facade built from the same configuration.
    Tika facade = new Tika(noIcuConfig);
    try {
        // The value is never read; the assignment only exists to trigger the parse.
        String unusedText = facade.parseToString(getResourceAsFile("/test-documents/english.cp500.txt"));
        fail("can't detect w/out ICU");
    } catch (TikaException expected) {
        assertContains("Failed to detect", expected.getMessage());
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Tika(org.apache.tika.Tika) Test(org.junit.Test)

Example 9 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class MyFirstTika method parseUsingAutoDetect.

/**
 * Parses the named file with an {@link AutoDetectParser} built from the given
 * configuration and returns the text collected by a {@link BodyContentHandler}.
 *
 * @param filename   path of the file to parse
 * @param tikaConfig configuration used to construct the parser
 * @param metadata   metadata object handed to both the input stream factory and the parser
 * @return the string form of the content handler after parsing
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    // The stream is opened here, so it must be closed here: try-with-resources
    // releases the underlying file handle even when parse() throws.
    try (TikaInputStream stream = TikaInputStream.get(new File(filename), metadata)) {
        parser.parse(stream, handler, metadata, new ParseContext());
        return handler.toString();
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TikaInputStream(org.apache.tika.io.TikaInputStream) File(java.io.File) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 10 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class ParsingExample method parseEmbeddedExample.

/**
 * Demonstrates extracting text from a container document together with every
 * document embedded inside it. The essential step is registering a
 * {@link Parser} in the {@link ParseContext} before parsing.
 *
 * @return the extracted content, embedded documents included
 * @throws IOException if the example resource cannot be read or closed
 * @throws SAXException if the content handler reports a SAX error
 * @throws TikaException if the document cannot be parsed
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
    AutoDetectParser autoDetectParser = new AutoDetectParser();

    // Register the parser in the context; this is the step the example is about.
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, autoDetectParser);

    Metadata docMetadata = new Metadata();
    BodyContentHandler textHandler = new BodyContentHandler();
    try (InputStream docStream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        autoDetectParser.parse(docStream, textHandler, docMetadata, parseContext);
        return textHandler.toString();
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Aggregations

AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 167; Metadata (org.apache.tika.metadata.Metadata): 139; Test (org.junit.Test): 122; InputStream (java.io.InputStream): 117; Parser (org.apache.tika.parser.Parser): 112; ParseContext (org.apache.tika.parser.ParseContext): 104; BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 97; ContentHandler (org.xml.sax.ContentHandler): 91; TikaTest (org.apache.tika.TikaTest): 82; TikaInputStream (org.apache.tika.io.TikaInputStream): 63; ByteArrayInputStream (java.io.ByteArrayInputStream): 34; CompositeParser (org.apache.tika.parser.CompositeParser): 28; TikaConfig (org.apache.tika.config.TikaConfig): 18; ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 17; WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 17; TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser): 15; TikaException (org.apache.tika.exception.TikaException): 13; EmptyParser (org.apache.tika.parser.EmptyParser): 13; XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 13; DefaultHandler (org.xml.sax.helpers.DefaultHandler): 12