Search in sources:

Example 61 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class TIAParsingExample method testTeeContentHandler.

/**
 * Demonstrates fanning SAX events out to two handlers at once: the output of a
 * single parse is sent both to a {@link BodyContentHandler} that writes to the
 * given file and to a {@link LinkContentHandler}.
 *
 * <p>The parsed input here is an empty byte array.
 *
 * @param filename path of the file the body content handler writes to
 * @throws Exception if the file cannot be opened or the parse fails
 */
public static void testTeeContentHandler(String filename) throws Exception {
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    Metadata docMetadata = new Metadata();
    ParseContext parseContext = new ParseContext();
    Parser autoDetectParser = new AutoDetectParser();
    LinkContentHandler links = new LinkContentHandler();
    File target = new File(filename);
    try (OutputStream sink = new FileOutputStream(target)) {
        // One branch of the tee writes body text to the file, the other collects links.
        BodyContentHandler bodyWriter = new BodyContentHandler(sink);
        ContentHandler tee = new TeeContentHandler(bodyWriter, links);
        autoDetectParser.parse(emptyInput, tee, docMetadata, parseContext);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ByteArrayInputStream(java.io.ByteArrayInputStream) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) FileOutputStream(java.io.FileOutputStream) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) File(java.io.File)

Example 62 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class PhoneExtractingContentHandlerTest method testExtractPhoneNumbers.

/**
 * Parses the phone-number test document through a
 * {@code PhoneExtractingContentHandler} and checks that the values stored
 * under the {@code "phonenumbers"} metadata key contain the expected numbers,
 * in order.
 */
@Test
public void testExtractPhoneNumbers() throws Exception {
    // Expected numbers, listed in the order they must appear in the metadata values.
    String[] expectedNumbers = {
        "9498888888",
        "9497777777",
        "9496666666",
        "9495555555",
        "4193404645",
        "9044687081",
        "2604094811"
    };
    Parser autoDetect = new AutoDetectParser();
    Metadata metadata = new Metadata();
    // The handler inspects the characters it receives for phone numbers before
    // forwarding them to the wrapped BodyContentHandler.
    PhoneExtractingContentHandler phoneHandler =
            new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
    try (InputStream document = PhoneExtractingContentHandlerTest.class
            .getResourceAsStream("/test-documents/testPhoneNumberExtractor.odt")) {
        autoDetect.parse(document, phoneHandler, metadata, new ParseContext());
    }
    String[] extracted = metadata.getValues("phonenumbers");
    for (int i = 0; i < expectedNumbers.length; i++) {
        assertContains(expectedNumbers[i], extracted[i]);
    }
}
Also used : InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 63 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class RTFParserTest method testConfig.

/**
 * Checks that a parser built from the RTF test {@code tika-config.xml} reports
 * a {@code TikaMemoryLimitException} in the embedded-stream exception metadata
 * of {@code testBinControlWord.rtf}.
 */
@Test
public void testConfig() throws Exception {
    //test that memory allocation of the bin element is limited
    //via the config file.  Unfortunately, this test file's bin embedding contains 10 bytes
    //so we had to set the config to 0.
    final TikaConfig tikaConfig;
    // The stream is only needed to build the TikaConfig, so close it right after.
    // try-with-resources skips close() when the resource is null, which lets
    // assertNotNull report a missing config file as an assertion failure.
    try (InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/rtf/tika-config.xml")) {
        assertNotNull(is);
        tikaConfig = new TikaConfig(is);
    }
    Parser p = new AutoDetectParser(tikaConfig);
    List<Metadata> metadataList = getRecursiveMetadata("testBinControlWord.rtf", p);
    assertEquals(1, metadataList.size());
    assertContains("TikaMemoryLimitException", metadataList.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM));
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) RTFMetadata(org.apache.tika.metadata.RTFMetadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 64 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class Seven7ParserTest method test7ZParsing.

/**
 * Parses the 7z test archive with an auto-detecting parser and checks the
 * detected content type, then checks that the extracted text contains each
 * expected entry name followed by a snippet of that entry's content.
 */
@Test
public void test7ZParsing() throws Exception {
    // Entry names and content snippets, in the order they are asserted.
    String[] expectedFragments = {
        "test-documents/testEXCEL.xls",
        "Sample Excel Worksheet",
        "test-documents/testHTML.html",
        "Test Indexation Html",
        "test-documents/testOpenOffice2.odt",
        "This is a sample Open Office document",
        "test-documents/testPDF.pdf",
        "Apache Tika",
        "test-documents/testPPT.ppt",
        "Sample Powerpoint Slide",
        "test-documents/testRTF.rtf",
        "indexation Word",
        "test-documents/testTXT.txt",
        "Test d'indexation de Txt",
        "test-documents/testWORD.doc",
        "This is a sample Microsoft Word Document",
        "test-documents/testXML.xml",
        "Rida Benjelloun"
    };

    // No explicit 7z parser is named here; detection is left to AutoDetectParser.
    Parser autoDetect = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    // 7zip must be among the types the parser reports as supported.
    boolean sevenZipSupported = autoDetect.getSupportedTypes(recursingContext).contains(TYPE_7ZIP);
    assertTrue("No 7zip parser found", sevenZipSupported);

    try (InputStream archive = Seven7ParserTest.class.getResourceAsStream("/test-documents/test-documents.7z")) {
        autoDetect.parse(archive, bodyHandler, metadata, recursingContext);
    }

    assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));

    String extractedText = bodyHandler.toString();
    for (String fragment : expectedFragments) {
        assertContains(fragment, extractedText);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 65 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class Seven7ParserTest method testEmbedded.

/**
 * Checks that parsing the 7z test archive with the tracking context records
 * every embedded entry: nine filenames in the expected order, a null media
 * type for each, and a non-null modification value starting with "20".
 */
@Test
public void testEmbedded() throws Exception {
    // Embedded entry names in the order the tracker is expected to record them.
    String[] expectedNames = {
        "test-documents/testEXCEL.xls",
        "test-documents/testHTML.html",
        "test-documents/testOpenOffice2.odt",
        "test-documents/testPDF.pdf",
        "test-documents/testPPT.ppt",
        "test-documents/testRTF.rtf",
        "test-documents/testTXT.txt",
        "test-documents/testWORD.doc",
        "test-documents/testXML.xml"
    };

    // No explicit 7z parser is named here; detection is left to AutoDetectParser.
    Parser autoDetect = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream archive = Seven7ParserTest.class.getResourceAsStream("/test-documents/test-documents.7z")) {
        autoDetect.parse(archive, bodyHandler, metadata, trackingContext);
    }

    // All 9 documents should be tracked, but not the directory entry.
    assertEquals(9, tracker.filenames.size());
    assertEquals(9, tracker.mediatypes.size());
    assertEquals(9, tracker.modifiedAts.size());

    // Names are expected, content types are not: 7z does not store content types.
    for (int i = 0; i < expectedNames.length; i++) {
        assertEquals(expectedNames[i], tracker.filenames.get(i));
    }
    for (String mediaType : tracker.mediatypes) {
        assertNull(mediaType);
    }
    for (String modifiedAt : tracker.modifiedAts) {
        assertNotNull(modifiedAt);
        assertTrue("Modified at " + modifiedAt, modifiedAt.startsWith("20"));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Aggregations

AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 167, Metadata (org.apache.tika.metadata.Metadata): 139, Test (org.junit.Test): 122, InputStream (java.io.InputStream): 117, Parser (org.apache.tika.parser.Parser): 112, ParseContext (org.apache.tika.parser.ParseContext): 104, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 97, ContentHandler (org.xml.sax.ContentHandler): 91, TikaTest (org.apache.tika.TikaTest): 82, TikaInputStream (org.apache.tika.io.TikaInputStream): 63, ByteArrayInputStream (java.io.ByteArrayInputStream): 34, CompositeParser (org.apache.tika.parser.CompositeParser): 28, TikaConfig (org.apache.tika.config.TikaConfig): 18, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 17, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 17, TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser): 15, TikaException (org.apache.tika.exception.TikaException): 13, EmptyParser (org.apache.tika.parser.EmptyParser): 13, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 13, DefaultHandler (org.xml.sax.helpers.DefaultHandler): 12