Search in sources :

Example 36 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class OOXMLParserTest method testProtectedExcelFile.

/**
 * Parses a password-protected Excel workbook (see TIKA-437) and checks
 * that it is reported as an OOXML spreadsheet, that the metadata marks
 * it as protected, and that the extracted text mentions "Office".
 */
@Test
public void testProtectedExcelFile() throws Exception {
    final String expectedType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    Parser autoDetect = new AutoDetectParser();
    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();
    ParseContext ctx = new ParseContext();
    try (InputStream in = getTestDocument("protectedFile.xlsx")) {
        autoDetect.parse(in, body, meta, ctx);
        // Detection must still identify the container type.
        assertEquals(expectedType, meta.get(Metadata.CONTENT_TYPE));
        // The protected flag is exposed as the string "true".
        assertEquals("true", meta.get(TikaMetadataKeys.PROTECTED));
        assertContains("Office", body.toString());
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 37 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class TesseractOCRParserTest method runOCR.

/**
 * Parses a classpath resource through a {@link RecursiveParserWrapper}
 * around an {@link AutoDetectParser}, with the supplied Tesseract output
 * type and with PDF inline-image extraction switched on, then verifies
 * the collected metadata and returns the concatenated content.
 *
 * <p>Note that the method unconditionally inspects the first two metadata
 * entries, so it can only pass when at least two are produced.
 *
 * @param resource       classpath location of the document to parse
 * @param nonOCRContains strings that must all appear in the concatenated content
 * @param numMetadatas   exact number of metadata objects the parse must yield
 * @param handlerType    handler type handed to the {@link BasicContentHandlerFactory}
 * @param outputType     output type set on the {@link TesseractOCRConfig}
 * @return the {@code TIKA_CONTENT} values of all metadata objects, concatenated in order
 * @throws Exception if parsing fails
 */
private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, BasicContentHandlerFactory.HANDLER_TYPE handlerType, TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception {
    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    ocrConfig.setOutputType(outputType);

    // Keep the wrapper in a typed local so no downcast is needed to read its metadata.
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser(), new BasicContentHandlerFactory(handlerType, -1));

    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);

    ParseContext ctx = new ParseContext();
    ctx.set(TesseractOCRConfig.class, ocrConfig);
    ctx.set(Parser.class, wrapper);
    ctx.set(PDFParserConfig.class, pdfConfig);

    try (InputStream in = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
        wrapper.parse(in, new DefaultHandler(), new Metadata(), ctx);
    }

    List<Metadata> collected = wrapper.getMetadata();
    assertEquals(numMetadatas, collected.size());

    StringBuilder buffer = new StringBuilder();
    for (Metadata entry : collected) {
        buffer.append(entry.get(RecursiveParserWrapper.TIKA_CONTENT));
    }
    String allContent = buffer.toString();

    for (String expected : nonOCRContains) {
        assertContains(expected, allContent);
    }

    Metadata first = collected.get(0);
    Metadata second = collected.get(1);
    assertTrue(first.names().length > 10);
    assertTrue(second.names().length > 10);
    // Spot-check one concrete metadata value.
    assertEquals("deflate", second.get("Compression CompressionTypeName"));
    return allContent;
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) ExternalParser(org.apache.tika.parser.external.ExternalParser) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ImageParser(org.apache.tika.parser.image.ImageParser) DefaultParser(org.apache.tika.parser.DefaultParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) PDFParserConfig(org.apache.tika.parser.pdf.PDFParserConfig)

Example 38 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class ArParserTest method testEmbedded.

/**
 * Verifies that the parser registered in the tracking ParseContext is
 * invoked for the embedded entries of two ar archives: one wrapping a
 * text file and one wrapping an audio file. For each archive exactly one
 * entry must be tracked, carrying the expected file name, a modification
 * date starting with "201", and no media type or creation date.
 */
@Test
public void testEmbedded() throws Exception {
    // No explicit parser choice: the format must be auto-detected.
    Parser autoDetect = new AutoDetectParser();
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();

    // First archive: a single text file.
    try (InputStream textArchive = ArParserTest.class.getResourceAsStream("/test-documents/testARofText.ar")) {
        autoDetect.parse(textArchive, body, meta, trackingContext);
    }
    assertEquals(1, tracker.filenames.size());
    assertEquals(1, tracker.mediatypes.size());
    assertEquals(1, tracker.modifiedAts.size());
    assertEquals("testTXT.txt", tracker.filenames.get(0));
    String textModified = tracker.modifiedAts.get(0);
    assertTrue("Modified at " + textModified, textModified.startsWith("201"));
    for (String mediaType : tracker.mediatypes) {
        assertNull(mediaType);
    }
    for (String createdAt : tracker.createdAts) {
        assertNull(createdAt);
    }

    // Clear the tracker before the second archive.
    tracker.reset();

    // Second archive: a single audio file.
    try (InputStream audioArchive = ArParserTest.class.getResourceAsStream("/test-documents/testARofSND.ar")) {
        autoDetect.parse(audioArchive, body, meta, trackingContext);
    }
    assertEquals(1, tracker.filenames.size());
    assertEquals(1, tracker.mediatypes.size());
    assertEquals(1, tracker.modifiedAts.size());
    assertEquals("testAU.au", tracker.filenames.get(0));
    String audioModified = tracker.modifiedAts.get(0);
    assertTrue("Modified at " + audioModified, audioModified.startsWith("201"));
    for (String mediaType : tracker.mediatypes) {
        assertNull(mediaType);
    }
    for (String createdAt : tracker.createdAts) {
        assertNull(createdAt);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 39 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class Bzip2ParserTest method testEmbedded.

/**
 * Tests that the ParseContext parser is correctly fired for the embedded
 * entry of a bzip2-compressed tar file.
 *
 * <p>Exactly one entry is expected (the compressed tar itself). It must
 * have no file name, media type, creation date or modification date, and
 * the bytes seen by the tracker must begin with the tar's directory name.
 */
@Test
public void testEmbedded() throws Exception {
    // Should auto-detect!
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // Load through this test class, as testBzip2Parsing does, rather than
    // through the unrelated ZipParserTest; the resource name is absolute,
    // so the anchor class's package does not affect the lookup.
    try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream("/test-documents/test-documents.tbz2")) {
        parser.parse(stream, handler, metadata, trackingContext);
    }
    // Should find a single entry, for the (compressed) tar file
    assertEquals(1, tracker.filenames.size());
    assertEquals(1, tracker.mediatypes.size());
    assertEquals(1, tracker.modifiedAts.size());
    // The single tracked entry carries no descriptive metadata.
    assertEquals(null, tracker.filenames.get(0));
    assertEquals(null, tracker.mediatypes.get(0));
    assertEquals(null, tracker.createdAts.get(0));
    assertEquals(null, tracker.modifiedAts.get(0));
    // Tar file starts with the directory name
    assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 40 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class Bzip2ParserTest method testBzip2Parsing.

/**
 * Parses a bzip2-compressed tar of test documents with the recursing
 * context and checks the detected content type, then checks that the
 * extracted text contains each embedded file's name and a sample of its
 * content.
 */
@Test
public void testBzip2Parsing() throws Exception {
    // No explicit parser choice: the format must be auto-detected.
    Parser autoDetect = new AutoDetectParser();
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream archive = Bzip2ParserTest.class.getResourceAsStream("/test-documents/test-documents.tbz2")) {
        autoDetect.parse(archive, body, meta, recursingContext);
    }
    assertEquals("application/x-bzip2", meta.get(Metadata.CONTENT_TYPE));

    // Entry name followed by a snippet of that entry's text, checked in this order.
    String[] expectedSnippets = {
        "test-documents/testEXCEL.xls", "Sample Excel Worksheet",
        "test-documents/testHTML.html", "Test Indexation Html",
        "test-documents/testOpenOffice2.odt", "This is a sample Open Office document",
        "test-documents/testPDF.pdf", "Apache Tika",
        "test-documents/testPPT.ppt", "Sample Powerpoint Slide",
        "test-documents/testRTF.rtf", "indexation Word",
        "test-documents/testTXT.txt", "Test d'indexation de Txt",
        "test-documents/testWORD.doc", "This is a sample Microsoft Word Document",
        "test-documents/testXML.xml", "Rida Benjelloun"
    };
    String extracted = body.toString();
    for (String snippet : expectedSnippets) {
        assertContains(snippet, extracted);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Aggregations

AutoDetectParser (org.apache.tika.parser.AutoDetectParser)167 Metadata (org.apache.tika.metadata.Metadata)139 Test (org.junit.Test)122 InputStream (java.io.InputStream)117 Parser (org.apache.tika.parser.Parser)112 ParseContext (org.apache.tika.parser.ParseContext)104 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)97 ContentHandler (org.xml.sax.ContentHandler)91 TikaTest (org.apache.tika.TikaTest)82 TikaInputStream (org.apache.tika.io.TikaInputStream)63 ByteArrayInputStream (java.io.ByteArrayInputStream)34 CompositeParser (org.apache.tika.parser.CompositeParser)28 TikaConfig (org.apache.tika.config.TikaConfig)18 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)17 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)17 TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser)15 TikaException (org.apache.tika.exception.TikaException)13 EmptyParser (org.apache.tika.parser.EmptyParser)13 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)13 DefaultHandler (org.xml.sax.helpers.DefaultHandler)12