Search in sources :

Example 86 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class TNEFParserTest method testMetadata.

/**
 * Parses the stream returned by {@code getTestFile(file)} with a
 * {@link TNEFParser} and checks that both the title and the legacy
 * subject metadata keys carry the message subject.
 *
 * @throws Exception if obtaining the test stream or parsing it fails
 */
@Test
public void testMetadata() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    TNEFParser tnef = new TNEFParser();
    // The test opened the stream, so the test must close it, even when
    // parse() throws; try-with-resources guarantees that.
    try (TikaInputStream stream = getTestFile(file)) {
        tnef.parse(stream, handler, metadata, new ParseContext());
    }
    assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 87 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class WordParserTest method testCustomProperties.

/**
 * Verifies that custom OLE2 (HPSF) properties are extracted,
 * together with the standard document metadata checked below.
 */
@Test
public void testCustomProperties() throws Exception {
    final Metadata props = new Metadata();
    final ContentHandler sink = new BodyContentHandler(-1);
    final ParseContext usContext = new ParseContext();
    usContext.set(Locale.class, Locale.US);

    try (InputStream doc = WordParserTest.class.getResourceAsStream("/test-documents/testWORD_custom_props.doc")) {
        new OfficeParser().parse(doc, sink, props, usContext);
    }

    // Values that are asserted under more than one metadata key
    final String lastEditor = "Etienne Jouvin";
    final String modifiedAt = "2012-01-03T22:14:00Z";
    final String createdAt = "2010-10-05T09:03:00Z";
    final String subject = "My subject";

    assertEquals("application/msword", props.get(Metadata.CONTENT_TYPE));

    // Authorship
    assertEquals("EJ04325S", props.get(TikaCoreProperties.CREATOR));
    assertEquals(lastEditor, props.get(TikaCoreProperties.MODIFIER));
    assertEquals(lastEditor, props.get(Metadata.LAST_AUTHOR));

    // Timestamps
    assertEquals(modifiedAt, props.get(TikaCoreProperties.MODIFIED));
    assertEquals(modifiedAt, props.get(Metadata.DATE));
    assertEquals(createdAt, props.get(TikaCoreProperties.CREATED));
    assertEquals(createdAt, props.get(Metadata.CREATION_DATE));

    // Application and document statistics
    assertEquals("Microsoft Office Word", props.get(OfficeOpenXMLExtended.APPLICATION));
    assertEquals("1", props.get(Office.PAGE_COUNT));
    assertEquals("2", props.get(Office.WORD_COUNT));

    // Descriptive properties
    assertEquals("My Title", props.get(TikaCoreProperties.TITLE));
    assertEquals("My Keyword", props.get(TikaCoreProperties.KEYWORDS));
    assertEquals("Normal.dotm", props.get(OfficeOpenXMLExtended.TEMPLATE));
    assertEquals("My Comments", props.get(TikaCoreProperties.COMMENTS));
    // TODO: Move to OO subject in Tika 2.0
    assertEquals(subject, props.get(Metadata.SUBJECT));
    assertEquals(subject, props.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("EDF-DIT", props.get(OfficeOpenXMLExtended.COMPANY));

    // Custom properties are exposed under the "custom:" prefix
    assertEquals("MyStringValue", props.get("custom:MyCustomString"));
    assertEquals("2010-12-30T23:00:00Z", props.get("custom:MyCustomDate"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 88 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class WordParserTest method testNoFormat.

/**
 * TIKA-1044: documents in which parts of the text have no
 * formatting or styles applied must still be handled.
 */
@Test
public void testNoFormat() throws Exception {
    final Metadata meta = new Metadata();
    final ContentHandler body = new BodyContentHandler();

    try (InputStream doc = WordParserTest.class.getResourceAsStream("/test-documents/testWORD_no_format.doc")) {
        new OfficeParser().parse(doc, body, meta, new ParseContext());
    }

    // The handler's string form is the extracted body text
    assertContains("Will generate an exception", body.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 89 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class WordParserTest method testWordParser.

/**
 * Parses the sample Word document with {@code OfficeParser} and checks
 * the content type, title and author metadata plus the extracted text.
 */
@Test
public void testWordParser() throws Exception {
    final Metadata meta = new Metadata();
    final ContentHandler body = new BodyContentHandler();

    try (InputStream doc = WordParserTest.class.getResourceAsStream("/test-documents/testWORD.doc")) {
        new OfficeParser().parse(doc, body, meta, new ParseContext());

        final String author = "Keith Bennett";
        final String title = "Sample Word Document";

        assertEquals("application/msword", meta.get(Metadata.CONTENT_TYPE));
        assertEquals(title, meta.get(TikaCoreProperties.TITLE));
        assertEquals(author, meta.get(TikaCoreProperties.CREATOR));
        assertEquals(author, meta.get(Metadata.AUTHOR));

        // The title also appears in the extracted body text
        assertContains(title, body.toString());
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 90 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class ExcelParserTest method testExcelParser.

/**
 * Parses the sample Excel workbook with {@code OfficeParser} and checks
 * its metadata and extracted text.
 */
@Test
// Checks legacy Tika-1.0 style metadata keys
@SuppressWarnings("deprecation")
public void testExcelParser() throws Exception {
    final Metadata meta = new Metadata();
    final ContentHandler body = new BodyContentHandler();
    final ParseContext usContext = new ParseContext();
    usContext.set(Locale.class, Locale.US);

    try (InputStream workbook = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xls")) {
        new OfficeParser().parse(workbook, body, meta, usContext);

        final String author = "Keith Bennett";
        // Mon Oct 01 17:13:56 BST 2007
        final String createdAt = "2007-10-01T16:13:56Z";
        // Mon Oct 01 17:31:43 BST 2007
        final String modifiedAt = "2007-10-01T16:31:43Z";

        assertEquals("application/vnd.ms-excel", meta.get(Metadata.CONTENT_TYPE));
        assertEquals("Simple Excel document", meta.get(TikaCoreProperties.TITLE));
        assertEquals(author, meta.get(TikaCoreProperties.CREATOR));
        assertEquals(author, meta.get(Metadata.AUTHOR));
        assertEquals(createdAt, meta.get(TikaCoreProperties.CREATED));
        assertEquals(createdAt, meta.get(Metadata.CREATION_DATE));
        assertEquals(modifiedAt, meta.get(TikaCoreProperties.MODIFIED));
        assertEquals(modifiedAt, meta.get(Metadata.DATE));

        final String text = body.toString();
        assertContains("Sample Excel Worksheet", text);
        assertContains("Numbers and their Squares", text);
        assertContains("\t\tNumber\tSquare", text);
        // Whole numbers must appear without a trailing ".0"
        assertContains("9", text);
        assertNotContained("9.0", text);
        assertContains("196", text);
        assertNotContained("196.0", text);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

- ContentHandler (org.xml.sax.ContentHandler): 354
- Metadata (org.apache.tika.metadata.Metadata): 229
- BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 229
- InputStream (java.io.InputStream): 210
- Test (org.junit.Test): 208
- ParseContext (org.apache.tika.parser.ParseContext): 164
- Parser (org.apache.tika.parser.Parser): 106
- TikaTest (org.apache.tika.TikaTest): 103
- AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 102
- TikaInputStream (org.apache.tika.io.TikaInputStream): 75
- ByteArrayInputStream (java.io.ByteArrayInputStream): 64
- SAXException (org.xml.sax.SAXException): 40
- IOException (java.io.IOException): 34
- TeeContentHandler (org.apache.tika.sax.TeeContentHandler): 28
- TikaException (org.apache.tika.exception.TikaException): 24
- ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 24
- WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 24
- XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 21
- AttributesImpl (org.xml.sax.helpers.AttributesImpl): 21
- InputSource (org.xml.sax.InputSource): 20