Search in sources:

Example 51 with ParseContext

Use of org.apache.tika.parser.ParseContext in project tika by apache.

In class SolidworksParserTest, method testDrawing2014SP0Parser.

/**
 * Runs {@link OfficeParser} over a SolidWorks drawing saved by version 2014 SP0
 * and checks the content type and the core metadata properties it reports.
 */
@Test
public void testDrawing2014SP0Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksDrawing2014SP0.SLDDRW";
    try (InputStream stream = SolidworksParserTest.class.getResourceAsStream(resource)) {
        Metadata parsed = new Metadata();
        ContentHandler body = new BodyContentHandler();
        ParseContext context = new ParseContext();
        new OfficeParser().parse(stream, body, parsed, context);

        // Content type reported for the drawing
        assertEquals("application/sldworks", parsed.get(Metadata.CONTENT_TYPE));

        // Core properties: timestamps and modifier are populated, title and
        // keywords are empty strings, the remaining ones are absent (null).
        assertEquals("2012-07-03T12:05:29Z", parsed.get(TikaCoreProperties.CREATED));
        assertEquals(null, parsed.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-11-28T12:41:49Z", parsed.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", parsed.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, parsed.get(TikaCoreProperties.RELATION));
        assertEquals(null, parsed.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, parsed.get(TikaCoreProperties.SOURCE));
        assertEquals("", parsed.get(TikaCoreProperties.TITLE));
        assertEquals("", parsed.get(TikaCoreProperties.KEYWORDS));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 52 with ParseContext

Use of org.apache.tika.parser.ParseContext in project tika by apache.

In class SolidworksParserTest, method testAssembly2013SP2Parser.

/**
 * Runs {@link OfficeParser} over a SolidWorks assembly saved by version 2013 SP2
 * and checks the content type and the core metadata properties it reports.
 */
@Test
public void testAssembly2013SP2Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksAssembly2013SP2.SLDASM";
    try (InputStream stream = SolidworksParserTest.class.getResourceAsStream(resource)) {
        Metadata parsed = new Metadata();
        ContentHandler body = new BodyContentHandler();
        ParseContext context = new ParseContext();
        new OfficeParser().parse(stream, body, parsed, context);

        // Content type reported for the assembly
        assertEquals("application/sldworks", parsed.get(Metadata.CONTENT_TYPE));

        // Core properties: timestamps and modifier are populated, title and
        // keywords are empty strings, the remaining ones are absent (null).
        assertEquals("2012-04-25T09:51:38Z", parsed.get(TikaCoreProperties.CREATED));
        assertEquals(null, parsed.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-09-06T08:11:08Z", parsed.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", parsed.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, parsed.get(TikaCoreProperties.RELATION));
        assertEquals(null, parsed.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, parsed.get(TikaCoreProperties.SOURCE));
        assertEquals("", parsed.get(TikaCoreProperties.TITLE));
        assertEquals("", parsed.get(TikaCoreProperties.KEYWORDS));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 53 with ParseContext

Use of org.apache.tika.parser.ParseContext in project tika by apache.

In class Latin1StringsParserTest, method testParse.

/**
 * Builds a byte buffer holding the same accented sentence encoded as ISO-8859-1,
 * UTF-8 and UTF-16, separated by zero bytes and control bytes and followed by a
 * two-character string, then checks that {@link Latin1StringsParser} emits exactly
 * the sentence three times, each copy terminated by a newline.
 */
@Test
public void testParse() throws Exception {
    String sentence = "These are Latin1 accented scripts: Â Ã É Ü â ã é ü";
    byte[] zeroPadding = new byte[10];
    byte[] controlBytes = { 0x00, 0x01, 0x02, 0x03, 0x1E, 0x1F, (byte) 0xFF };

    // Assemble the input: three encodings of the sentence interleaved with
    // bytes that the expected output below contains none of.
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    buffer.write(sentence.getBytes(ISO_8859_1));
    buffer.write(zeroPadding);
    buffer.write(sentence.getBytes(UTF_8));
    buffer.write(controlBytes);
    buffer.write(sentence.getBytes(UTF_16));
    buffer.write(zeroPadding);
    buffer.write("ab".getBytes(ISO_8859_1));

    ContentHandler handler = new BodyContentHandler();
    Parser parser = new Latin1StringsParser();
    try (InputStream stream = new ByteArrayInputStream(buffer.toByteArray())) {
        parser.parse(stream, handler, new Metadata(), new ParseContext());
    }

    // Expected text: the sentence three times, one copy per line.
    StringBuilder expected = new StringBuilder();
    for (int copy = 0; copy < 3; copy++) {
        expected.append(sentence).append("\n");
    }
    String result = handler.toString();
    assertTrue(result.equals(expected.toString()));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test)

Example 54 with ParseContext

Use of org.apache.tika.parser.ParseContext in project tika by apache.

In class EmptyAndDuplicateElementsXMLParserTest, method testEmptiesAndRepeats.

/**
 * Parses {@code testXML3.xml} with {@link AllowEmptiesAndDuplicatesCustomXMLTestParser}
 * and checks that four first-name and four last-name values are recorded in
 * order, including a repeated last name and an empty last name for the third entry.
 */
@Test
public void testEmptiesAndRepeats() throws Exception {
    final String resource = "/test-documents/testXML3.xml";
    try (InputStream stream = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(resource)) {
        ContentHandler body = new BodyContentHandler();
        Metadata parsed = new Metadata();
        new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(stream, body, parsed, new ParseContext());

        String[] firstNames = parsed.getValues(FIRST_NAME);
        String[] lastNames = parsed.getValues(LAST_NAME);

        // Both properties must carry one value per entry.
        assertEquals(4, firstNames.length);
        assertEquals(4, lastNames.length);

        // Entry-by-entry comparison, first name then last name.
        assertEquals("John", firstNames[0]);
        assertEquals("Smith", lastNames[0]);
        assertEquals("Jane", firstNames[1]);
        assertEquals("Doe", lastNames[1]);
        assertEquals("Bob", firstNames[2]);
        assertEquals("", lastNames[2]);
        assertEquals("Kate", firstNames[3]);
        assertEquals("Smith", lastNames[3]);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 55 with ParseContext

Use of org.apache.tika.parser.ParseContext in project tika by apache.

In class PhoneExtractingContentHandlerTest, method testExtractPhoneNumbers.

/**
 * Parses {@code testPhoneNumberExtractor.odt} through a
 * {@link PhoneExtractingContentHandler} and checks that the first seven
 * {@code "phonenumbers"} metadata values contain the expected numbers, in order.
 */
@Test
public void testExtractPhoneNumbers() throws Exception {
    Metadata metadata = new Metadata();
    // The handler inspects the characters it receives for phone numbers before
    // forwarding them to the wrapped BodyContentHandler.
    PhoneExtractingContentHandler extractor =
            new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
    Parser parser = new AutoDetectParser();
    final String resource = "/test-documents/testPhoneNumberExtractor.odt";
    try (InputStream in = PhoneExtractingContentHandlerTest.class.getResourceAsStream(resource)) {
        parser.parse(in, extractor, metadata, new ParseContext());
    }

    String[] expectedNumbers = {
        "9498888888",
        "9497777777",
        "9496666666",
        "9495555555",
        "4193404645",
        "9044687081",
        "2604094811"
    };
    String[] found = metadata.getValues("phonenumbers");
    for (int i = 0; i < expectedNumbers.length; i++) {
        assertContains(expectedNumbers[i], found[i]);
    }
}
Also used : InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Aggregations

ParseContext (org.apache.tika.parser.ParseContext): 336; Metadata (org.apache.tika.metadata.Metadata): 281; Test (org.junit.Test): 260; InputStream (java.io.InputStream): 195; BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 195; TikaTest (org.apache.tika.TikaTest): 186; ContentHandler (org.xml.sax.ContentHandler): 163; AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 117; Parser (org.apache.tika.parser.Parser): 107; ByteArrayInputStream (java.io.ByteArrayInputStream): 91; TikaInputStream (org.apache.tika.io.TikaInputStream): 77; DefaultHandler (org.xml.sax.helpers.DefaultHandler): 52; ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 31; WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 31; TikaException (org.apache.tika.exception.TikaException): 29; StringWriter (java.io.StringWriter): 26; IOException (java.io.IOException): 24; SAXException (org.xml.sax.SAXException): 24; CompositeParser (org.apache.tika.parser.CompositeParser): 22; FileInputStream (java.io.FileInputStream): 19