Search in sources :

Example 66 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class HtmlParserTest method XtestParseUTF8.

/**
 * Extracts text from the {@code testXHTML_utf8.html} test document through the
 * {@link Tika} facade and checks that each expected snippet, including the ones
 * with non-ASCII characters, is present in the result.
 *
 * Disabled via {@code @Ignore} because the test document is not available.
 */
@Test
@Ignore("The file 'testXHTML_utf8.html' is not available for testing")
public void XtestParseUTF8() throws IOException, SAXException, TikaException {
    final String resource = "/test-documents/testXHTML_utf8.html";
    final String failurePrefix = "Did not contain expected text:";
    // Snippets that must all appear in the extracted text, checked in this order.
    final String[] expectedSnippets = {
        "Title : Tilte with UTF-8 chars öäå",
        "Content with UTF-8 chars",
        "åäö"
    };

    final Metadata md = new Metadata();
    final String extracted = new Tika().parseToString(HtmlParserTest.class.getResourceAsStream(resource), md);

    for (String snippet : expectedSnippets) {
        assertTrue(failurePrefix + snippet, extracted.contains(snippet));
    }
}
Also used : Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) Ignore(org.junit.Ignore) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 67 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class HtmlParserTest method testParseEmpty.

/**
 * Feeds a zero-length byte stream to {@link HtmlParser} and checks that the
 * body content handler ends up with an empty string.
 */
@Test
public void testParseEmpty() throws Exception {
    final byte[] noBytes = new byte[0];
    final ContentHandler bodyText = new BodyContentHandler();
    final HtmlParser parser = new HtmlParser();

    parser.parse(new ByteArrayInputStream(noBytes), bodyText, new Metadata(), new ParseContext());

    assertEquals("", bodyText.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 68 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class HtmlParserTest method assertRelativeLink.

/**
 * Builds a small HTML document whose {@code <base href>} is {@code base} and whose
 * single anchor points at {@code relative}, parses it with {@link HtmlParser}, and
 * asserts that exactly one anchor {@code href} is reported and that it equals {@code url}.
 *
 * @param url      the link value expected to be reported by the parser
 * @param base     value written into the {@code <base href>} attribute
 * @param relative value written into the anchor's {@code href} attribute
 * @throws Exception if parsing fails
 */
private void assertRelativeLink(String url, String base, String relative) throws Exception {
    final String head = "<html><head><base href=\"" + base + "\"></head>";
    final String body = "<body><a href=\"" + relative + "\">test</a></body></html>";
    final String document = head + body;

    final List<String> collectedHrefs = new ArrayList<>();

    // SAX handler that records the href of every <a> start tag that has one.
    class HrefCollector extends DefaultHandler {

        @Override
        public void startElement(String u, String l, String name, Attributes atts) {
            if (!name.equals("a")) {
                return;
            }
            final String href = atts.getValue("", "href");
            if (href != null) {
                collectedHrefs.add(href);
            }
        }
    }

    final ByteArrayInputStream input = new ByteArrayInputStream(document.getBytes(UTF_8));
    new HtmlParser().parse(input, new HrefCollector(), new Metadata(), new ParseContext());

    assertEquals(1, collectedHrefs.size());
    assertEquals(url, collectedHrefs.get(0));
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) ArrayList(java.util.ArrayList) Attributes(org.xml.sax.Attributes) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 69 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class HtmlParserTest method testBoilerplateWithMarkup.

/**
 * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
 *
 * Parses {@code boilerplate.html} through a {@link BoilerpipeContentHandler} with
 * markup output enabled and checks that the serialized result keeps the expected
 * structural elements and real content while the words "boilerplate" and "footer"
 * do not appear.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
 */
@Test
public void testBoilerplateWithMarkup() throws Exception {
    String path = "/test-documents/boilerplate.html";
    Metadata metadata = new Metadata();
    StringWriter sw = new StringWriter();
    ContentHandler ch = makeHtmlTransformer(sw);
    BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
    bpch.setIncludeMarkup(true);
    // Parser.parse() consumes the stream but leaves closing it to the caller,
    // so the resource stream is closed here to avoid leaking it.
    try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
        new HtmlParser().parse(stream, bpch, metadata, new ParseContext());
    }
    String content = sw.toString();
    assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
    assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
    assertTrue("Has real content", content.contains("<p>This is the real meat"));
    assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>"));
    assertFalse(content.contains("boilerplate"));
    assertFalse(content.contains("footer"));
}
Also used : StringWriter(java.io.StringWriter) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 70 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class HtmlParserTest method testUsingCharsetInContentTypeHeader.

/**
 * Test case for TIKA-341.
 *
 * Parses the same HTML snippet twice: first as UTF-8 bytes with empty metadata,
 * expecting the content encoding to be reported as UTF-8; then as ISO-8859-1 bytes
 * with a {@code Content-Type} carrying {@code charset=ISO-8859-1} preset in the
 * metadata, expecting the content encoding to be reported as ISO-8859-1.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
 */
@Test
public void testUsingCharsetInContentTypeHeader() throws Exception {
    final String test = "<html><head><title>the name is ándre</title></head>" + "<body></body></html>";

    // First pass: UTF-8 bytes, no charset hint in the metadata.
    final byte[] utf8Bytes = test.getBytes(UTF_8);
    final Metadata withoutHint = new Metadata();
    new HtmlParser().parse(new ByteArrayInputStream(utf8Bytes), new BodyContentHandler(), withoutHint, new ParseContext());
    assertEquals("UTF-8", withoutHint.get(Metadata.CONTENT_ENCODING));

    // Second pass: ISO-8859-1 bytes, charset supplied through the Content-Type metadata.
    final byte[] latin1Bytes = test.getBytes(ISO_8859_1);
    final Metadata withHint = new Metadata();
    withHint.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
    new HtmlParser().parse(new ByteArrayInputStream(latin1Bytes), new BodyContentHandler(), withHint, new ParseContext());
    assertEquals("ISO-8859-1", withHint.get(Metadata.CONTENT_ENCODING));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)651 Test (org.junit.Test)467 InputStream (java.io.InputStream)320 ParseContext (org.apache.tika.parser.ParseContext)283 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)269 TikaTest (org.apache.tika.TikaTest)257 ContentHandler (org.xml.sax.ContentHandler)229 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)154 ByteArrayInputStream (java.io.ByteArrayInputStream)143 Parser (org.apache.tika.parser.Parser)136 TikaInputStream (org.apache.tika.io.TikaInputStream)133 IOException (java.io.IOException)66 DefaultHandler (org.xml.sax.helpers.DefaultHandler)59 TikaException (org.apache.tika.exception.TikaException)48 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)36 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)36 StringWriter (java.io.StringWriter)33 Tika (org.apache.tika.Tika)29 MediaType (org.apache.tika.mime.MediaType)29 SAXException (org.xml.sax.SAXException)29