Search in sources:

Example 21 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class EnviHeaderParserTest method testParseGlobalMetadata.

/**
 * Parses the bundled ENVI header fixture and checks that the generated XHTML contains the
 * expected header fields and the ENVI content type.
 *
 * @throws Exception if the fixture cannot be read or the parser fails
 */
@Test
public void testParseGlobalMetadata() throws Exception {
    // No Java 1.5 guard here: this method uses try-with-resources, which requires Java 7+,
    // so an early return for "1.5" runtimes could never be taken and would only risk turning
    // the test into a silent no-op.
    Parser parser = new EnviHeaderParser();
    ToXMLContentHandler handler = new ToXMLContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = EnviHeaderParser.class.getResourceAsStream("/test-documents/envi_test_header.hdr")) {
        // getResourceAsStream returns null for a missing resource; fail with a clear message.
        assertNotNull("Test ENVI file not found", stream);
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    // Check content of test file
    String content = handler.toString();
    assertContains("<body><p>ENVI</p>", content);
    assertContains("<p>samples = 2400</p>", content);
    assertContains("<p>lines   = 2400</p>", content);
    assertContains("<p>map info = {Sinusoidal, 1.5000, 1.5000, -10007091.3643, 5559289.2856, 4.6331271653e+02, 4.6331271653e+02, , units=Meters}</p>", content);
    assertContains("content=\"application/envi.hdr\"", content);
    assertContains("projection info = {16, 6371007.2, 0.000000, 0.0, 0.0, Sinusoidal, units=Meters}", content);
}
Also used : ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test)

Example 22 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class SourceCodeParserTest method testSupportTypes.

/**
 * Checks the media types advertised by the source code parser: the Java, Groovy and C++
 * source types must be present, and text/html must be absent.
 */
@Test
public void testSupportTypes() throws Exception {
    Set<MediaType> advertised = sourceCodeParser.getSupportedTypes(new ParseContext());
    for (String subtype : new String[] {"x-java-source", "x-groovy", "x-c++src"}) {
        assertTrue(advertised.contains(new MediaType("text", subtype)));
    }
    // Ask the parser again with a fresh context rather than reusing the set above.
    Set<MediaType> requeried = sourceCodeParser.getSupportedTypes(new ParseContext());
    MediaType html = new MediaType("text", "html");
    assertFalse(requeried.contains(html));
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) MediaType(org.apache.tika.mime.MediaType) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 23 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class HtmlParserTest method testDetectOfCharset.

/**
 * Test case for TIKA-334: an HTML document that declares no charset is fed to the parser as
 * UTF-8 bytes, and its non-ASCII title must come back unchanged in the metadata.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
 */
@Test
public void testDetectOfCharset() throws Exception {
    // U+017D (LATIN CAPITAL LETTER Z WITH CARON). Written as a Unicode escape so the test
    // does not depend on the encoding used to read this source file; its UTF-8 bytes
    // (C5 BD) show up as the two characters "Å½" when mis-decoded as a single-byte charset.
    String title = "\u017d";
    String test = "<html><head><title>" + title + "</title></head><body></body></html>";
    Metadata metadata = new Metadata();
    new HtmlParser().parse(new ByteArrayInputStream(test.getBytes(UTF_8)), new BodyContentHandler(), metadata, new ParseContext());
    assertEquals(title, metadata.get(TikaCoreProperties.TITLE));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 24 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class HtmlParserTest method testCustomHtmlSchema.

/**
 * TIKA-1193: a {@link Schema} placed in the {@link ParseContext} replaces the default HTML
 * schema. The same markup is parsed twice, first with an empty context and then with a schema
 * whose "a" element type has been redefined, and the extracted anchor text is compared.
 */
@Test
public void testCustomHtmlSchema() throws Exception {
    String test = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
    Metadata metadata = new Metadata();

    // First pass: the default schema does not allow tables inside anchors,
    // so the link is expected to carry no text.
    LinkContentHandler defaultLinks = new LinkContentHandler();
    ByteArrayInputStream firstInput = new ByteArrayInputStream(test.getBytes(ISO_8859_1));
    new HtmlParser().parse(firstInput, defaultLinks, metadata, new ParseContext());
    assertEquals("", defaultLinks.getLinks().get(0).getText());

    // Second pass: change the schema to allow tables inside anchors and hand it to the
    // parser through the context. The same Metadata instance is reused on purpose.
    Schema permissive = new HTMLSchema();
    permissive.elementType("a", HTMLSchema.M_ANY, 65535, 0);
    ParseContext customContext = new ParseContext();
    customContext.set(Schema.class, permissive);

    LinkContentHandler customLinks = new LinkContentHandler();
    ByteArrayInputStream secondInput = new ByteArrayInputStream(test.getBytes(ISO_8859_1));
    new HtmlParser().parse(secondInput, customLinks, metadata, customContext);
    // Now the table content is expected as the anchor text.
    assertEquals("\ttext\n\n", customLinks.getLinks().get(0).getText());
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) HTMLSchema(org.ccil.cowan.tagsoup.HTMLSchema) Schema(org.ccil.cowan.tagsoup.Schema) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 25 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class HtmlParserTest method testParseEmpty.

/**
 * Parsing a zero-length input must succeed and leave the body handler's text empty.
 */
@Test
public void testParseEmpty() throws Exception {
    byte[] noBytes = new byte[0];
    ContentHandler bodyText = new BodyContentHandler();
    HtmlParser parser = new HtmlParser();
    parser.parse(new ByteArrayInputStream(noBytes), bodyText, new Metadata(), new ParseContext());
    assertEquals("", bodyText.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

ParseContext (org.apache.tika.parser.ParseContext): 336; Metadata (org.apache.tika.metadata.Metadata): 281; Test (org.junit.Test): 260; InputStream (java.io.InputStream): 195; BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 195; TikaTest (org.apache.tika.TikaTest): 186; ContentHandler (org.xml.sax.ContentHandler): 163; AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 117; Parser (org.apache.tika.parser.Parser): 107; ByteArrayInputStream (java.io.ByteArrayInputStream): 91; TikaInputStream (org.apache.tika.io.TikaInputStream): 77; DefaultHandler (org.xml.sax.helpers.DefaultHandler): 52; ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 31; WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 31; TikaException (org.apache.tika.exception.TikaException): 29; StringWriter (java.io.StringWriter): 26; IOException (java.io.IOException): 24; SAXException (org.xml.sax.SAXException): 24; CompositeParser (org.apache.tika.parser.CompositeParser): 22; FileInputStream (java.io.FileInputStream): 19