Search in sources :

Example 31 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project robovm by robovm.

the class SaxTest method testYesPrefixesYesNamespaces.

/**
 * Checks the values delivered to {@code startElement} when {@code parse} is
 * called with both boolean flags set to {@code true} (presumably "prefixes"
 * and "namespaces", judging by the test name), first for an unprefixed
 * element and then for a prefixed element that declares {@code xmlns:a}.
 *
 * <p>Android's Expat-based SAX parser fails this test because Expat doesn't
 * supply us with our much desired {@code xmlns="http://..."} attributes.
 */
public void testYesPrefixesYesNamespaces() throws Exception {
    // Case 1: element and attribute without any prefix or namespace.
    DefaultHandler unprefixedChecker = new DefaultHandler() {

        @Override
        public void startElement(String namespaceUri, String local, String qualified, Attributes attrs) {
            assertEquals("", namespaceUri);
            assertEquals("foo", local);
            assertEquals("foo", qualified);
            // Only the "bar" attribute is expected.
            assertEquals(1, attrs.getLength());
            assertEquals("", attrs.getURI(0));
            assertEquals("bar", attrs.getLocalName(0));
            assertEquals("bar", attrs.getQName(0));
        }
    };
    parse(true, true, "<foo bar=\"baz\"/>", unprefixedChecker);

    // Case 2: prefixed element and attribute plus the xmlns:a declaration.
    DefaultHandler prefixedChecker = new DefaultHandler() {

        @Override
        public void startElement(String namespaceUri, String local, String qualified, Attributes attrs) {
            assertEquals("http://quux", namespaceUri);
            assertEquals("foo", local);
            assertEquals("a:foo", qualified);
            // Two attributes are expected: a:bar and the xmlns:a declaration.
            assertEquals(2, attrs.getLength());
            // Index 0: the namespaced a:bar attribute.
            assertEquals("http://quux", attrs.getURI(0));
            assertEquals("bar", attrs.getLocalName(0));
            assertEquals("a:bar", attrs.getQName(0));
            // Index 1: the declaration itself, with empty URI and local name.
            assertEquals("", attrs.getURI(1));
            assertEquals("", attrs.getLocalName(1));
            assertEquals("xmlns:a", attrs.getQName(1));
        }
    };
    parse(true, true, "<a:foo a:bar=\"baz\" xmlns:a=\"http://quux\"/>", prefixedChecker);
}
Also used : Attributes(org.xml.sax.Attributes) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 32 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class TikaTest method getRecursiveMetadata.

/**
 * Parses the resource found under {@code /test-documents/} with a
 * {@link RecursiveParserWrapper} built around an {@link AutoDetectParser}
 * and returns the metadata list the wrapper exposes afterwards.
 *
 * @param filePath resource name, appended to {@code "/test-documents/"}
 * @param context  parse context handed through to the wrapper
 * @return the result of {@code wrapper.getMetadata()} after parsing
 * @throws Exception if opening the resource or parsing fails
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    Parser delegate = new AutoDetectParser();
    // NOTE(review): -1 presumably means "no write limit" -- confirm against BasicContentHandlerFactory.
    BasicContentHandlerFactory xmlHandlers =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapper recursive = new RecursiveParserWrapper(delegate, xmlHandlers);

    String resource = "/test-documents/" + filePath;
    try (InputStream in = getResourceAsStream(resource)) {
        // The SAX handler is a no-op; only the wrapper's collected metadata is used.
        recursive.parse(in, new DefaultHandler(), new Metadata(), context);
    }
    return recursive.getMetadata();
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) Parser(org.apache.tika.parser.Parser) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 33 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class TestParsers method testEXCELExtraction.

/**
 * Extracts text from {@code testEXCEL.xls} via {@code tika.parseToString}
 * and checks it contains the expected heading, then parses the same file
 * with the parser from {@code tika.getParser()} and checks the title metadata.
 */
@Test
public void testEXCELExtraction() throws Exception {
    final String expected = "Numbers and their Squares";
    File workbook = getResourceAsFile("/test-documents/testEXCEL.xls");

    // Part 1: plain-text extraction.
    String extracted = tika.parseToString(workbook);
    assertTrue("Text does not contain '" + expected + "'", extracted.contains(expected));

    // Part 2: metadata extraction; the SAX output goes to a no-op handler.
    Parser excelParser = tika.getParser();
    Metadata meta = new Metadata();
    try (InputStream in = new FileInputStream(workbook)) {
        excelParser.parse(in, new DefaultHandler(), meta, new ParseContext());
    }
    String title = meta.get(TikaCoreProperties.TITLE);
    assertEquals("Simple Excel document", title);
}
Also used : FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) File(java.io.File) Parser(org.apache.tika.parser.Parser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 34 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class TestParsers method testWORDxtraction.

/**
 * Parses {@code testWORD.doc} with the parser from {@code tika.getParser()}
 * and checks the title recorded in the resulting metadata.
 */
@Test
public void testWORDxtraction() throws Exception {
    File document = getResourceAsFile("/test-documents/testWORD.doc");
    Parser wordParser = tika.getParser();
    Metadata meta = new Metadata();

    // SAX events are discarded by the no-op handler; only the metadata is inspected.
    try (InputStream in = new FileInputStream(document)) {
        wordParser.parse(in, new DefaultHandler(), meta, new ParseContext());
    }

    String title = meta.get(TikaCoreProperties.TITLE);
    assertEquals("Sample Word Document", title);
}
Also used : FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) File(java.io.File) Parser(org.apache.tika.parser.Parser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 35 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class HtmlParserTest method assertRelativeLink.

/**
 * Builds a small HTML page with the given {@code <base href>} and one
 * anchor pointing at {@code relative}, parses it with {@link HtmlParser},
 * and asserts that exactly one {@code href} was reported for {@code <a>}
 * elements and that it equals {@code url}.
 *
 * @param url      the href value expected from the parser
 * @param base     value placed in the {@code <base href="...">} element
 * @param relative value placed in the {@code <a href="...">} element
 * @throws Exception if parsing fails
 */
private void assertRelativeLink(String url, String base, String relative) throws Exception {
    String html = "<html><head><base href=\"" + base + "\"></head>"
            + "<body><a href=\"" + relative + "\">test</a></body></html>";
    final List<String> hrefs = new ArrayList<>();

    // Records the href of every <a> start tag that carries one.
    DefaultHandler anchorCollector = new DefaultHandler() {

        @Override
        public void startElement(String uri, String localName, String name, Attributes atts) {
            if (!name.equals("a")) {
                return;
            }
            String href = atts.getValue("", "href");
            if (href != null) {
                hrefs.add(href);
            }
        }
    };

    ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
    new HtmlParser().parse(input, anchorCollector, new Metadata(), new ParseContext());

    assertEquals(1, hrefs.size());
    assertEquals(url, hrefs.get(0));
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) ArrayList(java.util.ArrayList) Attributes(org.xml.sax.Attributes) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Aggregations

DefaultHandler (org.xml.sax.helpers.DefaultHandler) 148; InputStream (java.io.InputStream) 65; Metadata (org.apache.tika.metadata.Metadata) 59; ParseContext (org.apache.tika.parser.ParseContext) 52; Test (org.junit.Test) 44; Attributes (org.xml.sax.Attributes) 41; SAXParser (javax.xml.parsers.SAXParser) 40; SAXException (org.xml.sax.SAXException) 39; ByteArrayInputStream (java.io.ByteArrayInputStream) 32; SAXParserFactory (javax.xml.parsers.SAXParserFactory) 29; IOException (java.io.IOException) 26; InputSource (org.xml.sax.InputSource) 23; ParserConfigurationException (javax.xml.parsers.ParserConfigurationException) 22; Parser (org.apache.tika.parser.Parser) 22; TikaInputStream (org.apache.tika.io.TikaInputStream) 20; ContentHandler (org.xml.sax.ContentHandler) 20; File (java.io.File) 19; AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 17; BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 16; FileInputStream (java.io.FileInputStream) 15