Search in sources :

Example 66 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class OOXMLParserTest method testProtectedExcelFile.

/**
 * Parses a password-protected Excel workbook through the auto-detecting
 * parser and checks the reported content type, the "protected" metadata
 * flag, and that the extracted text contains "Office".
 * See TIKA-437.
 */
@Test
public void testProtectedExcelFile() throws Exception {
    Parser autoDetect = new AutoDetectParser();
    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();
    ParseContext parseContext = new ParseContext();
    try (InputStream workbook = getTestDocument("protectedFile.xlsx")) {
        autoDetect.parse(workbook, body, meta, parseContext);

        // Metadata checks: detected type and the protection flag.
        String detectedType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", detectedType);
        String protectedFlag = meta.get(TikaMetadataKeys.PROTECTED);
        assertEquals("true", protectedFlag);

        // Text check against whatever the body handler collected.
        assertContains("Office", body.toString());
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 67 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class TNEFParserTest method testMetadata.

/**
 * Parses the test file with {@code TNEFParser} and checks that both the
 * title and the subject metadata entries equal "This is a test message".
 */
@Test
public void testMetadata() throws Exception {
    // try-with-resources so the stream is released even when parsing or
    // an assertion fails; the original never closed it.
    try (TikaInputStream stream = getTestFile(file)) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        TNEFParser tnef = new TNEFParser();
        tnef.parse(stream, handler, metadata, new ParseContext());
        assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 68 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class WordParserTest method testCustomProperties.

/**
 * Verifies that custom OLE2 (HPSF) properties, together with the standard
 * document properties, are extracted from a Word document.
 * The document is parsed with {@code Locale.US} placed in the parse context.
 */
@Test
public void testCustomProperties() throws Exception {
    Metadata props = new Metadata();
    try (InputStream docStream = WordParserTest.class.getResourceAsStream("/test-documents/testWORD_custom_props.doc")) {
        ContentHandler sink = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(docStream, sink, props, parseContext);
    }

    // Content type
    assertEquals("application/msword", props.get(Metadata.CONTENT_TYPE));

    // Authorship
    assertEquals("EJ04325S", props.get(TikaCoreProperties.CREATOR));
    assertEquals("Etienne Jouvin", props.get(TikaCoreProperties.MODIFIER));
    assertEquals("Etienne Jouvin", props.get(Metadata.LAST_AUTHOR));

    // Modification and creation timestamps
    assertEquals("2012-01-03T22:14:00Z", props.get(TikaCoreProperties.MODIFIED));
    assertEquals("2012-01-03T22:14:00Z", props.get(Metadata.DATE));
    assertEquals("2010-10-05T09:03:00Z", props.get(TikaCoreProperties.CREATED));
    assertEquals("2010-10-05T09:03:00Z", props.get(Metadata.CREATION_DATE));

    // Application and document statistics
    assertEquals("Microsoft Office Word", props.get(OfficeOpenXMLExtended.APPLICATION));
    assertEquals("1", props.get(Office.PAGE_COUNT));
    assertEquals("2", props.get(Office.WORD_COUNT));

    // Descriptive fields
    assertEquals("My Title", props.get(TikaCoreProperties.TITLE));
    assertEquals("My Keyword", props.get(TikaCoreProperties.KEYWORDS));
    assertEquals("Normal.dotm", props.get(OfficeOpenXMLExtended.TEMPLATE));
    assertEquals("My Comments", props.get(TikaCoreProperties.COMMENTS));
    // TODO: Move to OO subject in Tika 2.0
    assertEquals("My subject", props.get(Metadata.SUBJECT));
    assertEquals("My subject", props.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("EDF-DIT", props.get(OfficeOpenXMLExtended.COMPANY));

    // Custom properties, exposed under the "custom:" prefix
    assertEquals("MyStringValue", props.get("custom:MyCustomString"));
    assertEquals("2010-12-30T23:00:00Z", props.get("custom:MyCustomDate"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 69 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class WordParserTest method testNoFormat.

/**
 * TIKA-1044 - documents in which parts of the text carry no formatting
 * or styles must still be handled; the extracted text is checked for a
 * known phrase.
 */
@Test
public void testNoFormat() throws Exception {
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream docStream = WordParserTest.class.getResourceAsStream("/test-documents/testWORD_no_format.doc")) {
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(docStream, body, meta, new ParseContext());
    }
    // The handler is read only after the stream has been closed.
    assertContains("Will generate an exception", body.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 70 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class WordParserTest method testWordParser.

/**
 * Parses the sample Word document with {@code OfficeParser} and checks
 * the content type, title, creator/author metadata and the extracted text.
 */
@Test
public void testWordParser() throws Exception {
    try (InputStream docStream = WordParserTest.class.getResourceAsStream("/test-documents/testWORD.doc")) {
        ContentHandler body = new BodyContentHandler();
        Metadata meta = new Metadata();
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(docStream, body, meta, new ParseContext());

        // Metadata checks
        assertEquals("application/msword", meta.get(Metadata.CONTENT_TYPE));
        assertEquals("Sample Word Document", meta.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", meta.get(TikaCoreProperties.CREATOR));
        assertEquals("Keith Bennett", meta.get(Metadata.AUTHOR));

        // Text check
        String extracted = body.toString();
        assertContains("Sample Word Document", extracted);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

ParseContext (org.apache.tika.parser.ParseContext) 338, Metadata (org.apache.tika.metadata.Metadata) 283, Test (org.junit.Test) 260, InputStream (java.io.InputStream) 195, BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 195, TikaTest (org.apache.tika.TikaTest) 186, ContentHandler (org.xml.sax.ContentHandler) 164, AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 118, Parser (org.apache.tika.parser.Parser) 109, ByteArrayInputStream (java.io.ByteArrayInputStream) 92, TikaInputStream (org.apache.tika.io.TikaInputStream) 77, DefaultHandler (org.xml.sax.helpers.DefaultHandler) 52, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest) 31, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest) 31, TikaException (org.apache.tika.exception.TikaException) 30, StringWriter (java.io.StringWriter) 26, IOException (java.io.IOException) 25, SAXException (org.xml.sax.SAXException) 25, CompositeParser (org.apache.tika.parser.CompositeParser) 22, TeeContentHandler (org.apache.tika.sax.TeeContentHandler) 20