Search in sources :

Example 26 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class HtmlParserTest method assertRelativeLink.

/**
 * Builds a minimal HTML document whose {@code <base href>} is {@code base} and whose
 * only anchor has {@code relative} as its {@code href}, parses it with {@link HtmlParser},
 * and asserts that exactly one anchor href was reported to the handler and that it
 * equals {@code url}.
 *
 * @param url      the href value expected to be reported for the anchor
 * @param base     the value placed in the document's {@code <base href>}
 * @param relative the value placed in the anchor's {@code href}
 */
private void assertRelativeLink(String url, String base, String relative) throws Exception {
    String html = "<html><head><base href=\"" + base + "\"></head>" + "<body><a href=\"" + relative + "\">test</a></body></html>";
    final List<String> hrefs = new ArrayList<String>();
    // Collects the href of every <a> start tag the parser emits.
    DefaultHandler anchorCollector = new DefaultHandler() {

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) {
            if (!qName.equals("a")) {
                return;
            }
            String href = attributes.getValue("", "href");
            if (href != null) {
                hrefs.add(href);
            }
        }
    };
    ByteArrayInputStream document = new ByteArrayInputStream(html.getBytes(UTF_8));
    new HtmlParser().parse(document, anchorCollector, new Metadata(), new ParseContext());
    assertEquals(1, hrefs.size());
    assertEquals(url, hrefs.get(0));
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) ArrayList(java.util.ArrayList) Attributes(org.xml.sax.Attributes) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 27 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class HtmlParserTest method testBoilerplateWithMarkup.

/**
 * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
 * <p>
 * Parses {@code /test-documents/boilerplate.html} through a
 * {@link BoilerpipeContentHandler} with markup output enabled and checks the
 * serialized result for the expected elements and for the absence of the
 * strings "boilerplate" and "footer".
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
 */
@Test
public void testBoilerplateWithMarkup() throws Exception {
    String path = "/test-documents/boilerplate.html";
    Metadata metadata = new Metadata();
    StringWriter sw = new StringWriter();
    ContentHandler ch = makeHtmlTransformer(sw);
    BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
    bpch.setIncludeMarkup(true);
    // The caller owns the resource stream, so close it here rather than leaking it.
    // Fully qualified so this method does not depend on an extra import.
    try (java.io.InputStream input = HtmlParserTest.class.getResourceAsStream(path)) {
        new HtmlParser().parse(input, bpch, metadata, new ParseContext());
    }
    String content = sw.toString();
    assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
    assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
    assertTrue("Has real content", content.contains("<p>This is the real meat"));
    assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>"));
    assertFalse(content.contains("boilerplate"));
    assertFalse(content.contains("footer"));
}
Also used : StringWriter(java.io.StringWriter) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 28 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class HtmlParserTest method testUsingCharsetInContentTypeHeader.

/**
 * Test case for TIKA-341.
 * <p>
 * Parses the same HTML text twice: first as UTF-8 bytes with empty metadata, then as
 * ISO-8859-1 bytes with a {@code Content-Type} metadata entry naming that charset.
 * After each parse the {@code Content-Encoding} metadata value is checked.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
 */
@Test
public void testUsingCharsetInContentTypeHeader() throws Exception {
    final String html = "<html><head><title>the name is ándre</title></head><body></body></html>";

    // First pass: no charset hint supplied in the metadata.
    Metadata withoutHint = new Metadata();
    ByteArrayInputStream utf8Bytes = new ByteArrayInputStream(html.getBytes(UTF_8));
    new HtmlParser().parse(utf8Bytes, new BodyContentHandler(), withoutHint, new ParseContext());
    assertEquals("UTF-8", withoutHint.get(Metadata.CONTENT_ENCODING));

    // Second pass: charset declared up front through the Content-Type metadata entry.
    Metadata withHint = new Metadata();
    withHint.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
    ByteArrayInputStream latin1Bytes = new ByteArrayInputStream(html.getBytes(ISO_8859_1));
    new HtmlParser().parse(latin1Bytes, new BodyContentHandler(), withHint, new ParseContext());
    assertEquals("ISO-8859-1", withHint.get(Metadata.CONTENT_ENCODING));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 29 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class Pkcs7ParserTest method testDetachedSignature.

/**
 * Feeds {@code /test-documents/testDetached.p7s} to {@link Pkcs7Parser}.
 * <p>
 * A {@link NullPointerException} fails the test. A {@link TikaException} is accepted
 * only if its string form mentions "cannot parse detached pkcs7 signature".
 * Completing without any exception also passes.
 */
public void testDetachedSignature() throws Exception {
    try (InputStream p7s = Pkcs7ParserTest.class.getResourceAsStream("/test-documents/testDetached.p7s")) {
        new Pkcs7Parser().parse(p7s, new BodyContentHandler(), new Metadata(), new ParseContext());
    } catch (TikaException expected) {
        assertTrue(expected.toString().contains("cannot parse detached pkcs7 signature"));
    } catch (NullPointerException unexpected) {
        fail("should not get NPE");
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 30 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class ForkParserIntegrationTest method testParsingErrorInForkedParserShouldBeReported.

/**
 * TIKA-831 Parsers throwing errors should be caught and
 *  properly reported.
 * <p>
 * The first half runs a {@code BrokenParser} through a {@link ForkParser} and expects a
 * {@link TikaException} whose cause equals {@code brokenParser.err}. The second half
 * (non-serializable error) is set up, but its assertions are currently disabled; see
 * the TODO below.
 */
@Test
public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
    BrokenParser brokenParser = new BrokenParser();
    ForkParser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
    // The test owns the resource stream, so release it when the test finishes.
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt")) {
        // With a serializable error, we'll get that back
        try {
            ContentHandler output = new BodyContentHandler();
            ParseContext context = new ParseContext();
            parser.parse(stream, output, new Metadata(), context);
            fail("Expected TikaException caused by Error");
        } catch (TikaException e) {
            assertEquals(brokenParser.err, e.getCause());
        } finally {
            parser.close();
        }
        // With a non serializable one, we'll get something else
        // TODO Fix this test
        brokenParser = new BrokenParser();
        brokenParser.re = new WontBeSerializedError("Can't Serialize");
        parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
        try {
            // Disabled assertions, kept for when the TODO above is addressed:
//            try {
//               ContentHandler output = new BodyContentHandler();
//               ParseContext context = new ParseContext();
//               parser.parse(stream, output, new Metadata(), context);
//               fail("Expected TikaException caused by Error");
//           } catch (TikaException e) {
//               assertEquals(TikaException.class, e.getCause().getClass());
//               assertEquals("Bang!", e.getCause().getMessage());
//           }
        } finally {
            // Close the second parser too, mirroring the first half of the test.
            parser.close();
        }
    }
}
Also used : ForkParser(org.apache.tika.fork.ForkParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Aggregations

ParseContext (org.apache.tika.parser.ParseContext) 336, Metadata (org.apache.tika.metadata.Metadata) 281, Test (org.junit.Test) 260, InputStream (java.io.InputStream) 195, BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 195, TikaTest (org.apache.tika.TikaTest) 186, ContentHandler (org.xml.sax.ContentHandler) 163, AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 117, Parser (org.apache.tika.parser.Parser) 107, ByteArrayInputStream (java.io.ByteArrayInputStream) 91, TikaInputStream (org.apache.tika.io.TikaInputStream) 77, DefaultHandler (org.xml.sax.helpers.DefaultHandler) 52, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest) 31, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest) 31, TikaException (org.apache.tika.exception.TikaException) 29, StringWriter (java.io.StringWriter) 26, IOException (java.io.IOException) 24, SAXException (org.xml.sax.SAXException) 24, CompositeParser (org.apache.tika.parser.CompositeParser) 22, FileInputStream (java.io.FileInputStream) 19