Search in sources :

Example 46 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class TIAParsingExample method testHtmlMapper.

/**
 * Parses an empty in-memory stream with an {@link AutoDetectParser} after
 * registering an {@link IdentityHtmlMapper} in the parse context under
 * {@code HtmlMapper.class}.
 *
 * @throws Exception if the parse call fails
 */
public static void testHtmlMapper() throws Exception {
    // Set up the context first so the mapper is registered before parsing.
    ParseContext parseContext = new ParseContext();
    parseContext.set(HtmlMapper.class, new IdentityHtmlMapper());

    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    ContentHandler sink = new DefaultHandler();
    Metadata meta = new Metadata();

    Parser autoDetect = new AutoDetectParser();
    autoDetect.parse(emptyInput, sink, meta, parseContext);
}
Also used : IdentityHtmlMapper(org.apache.tika.parser.html.IdentityHtmlMapper) ByteArrayInputStream(java.io.ByteArrayInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Example 47 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class TIAParsingExample method useHtmlParser.

/**
 * Parses an empty in-memory stream with an explicitly constructed
 * {@link HtmlParser}, passing a fresh handler, metadata and parse context.
 *
 * @throws Exception if the parse call fails
 */
public static void useHtmlParser() throws Exception {
    Parser htmlParser = new HtmlParser();
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    // Handler, metadata and context are not read back afterwards, so they are created inline.
    htmlParser.parse(emptyInput, new DefaultHandler(), new Metadata(), new ParseContext());
}
Also used : HtmlParser(org.apache.tika.parser.html.HtmlParser) ByteArrayInputStream(java.io.ByteArrayInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Example 48 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class TIAParsingExample method parseURLStream.

/**
 * Opens the resource at {@code address}, wraps it in a
 * {@link GZIPInputStream} and parses the decompressed stream with an
 * {@link AutoDetectParser}.
 *
 * @param address URL of the gzip-compressed resource to parse
 * @throws Exception if the URL is malformed, the stream cannot be opened or
 *         decompressed, or the parse call fails
 */
public static void parseURLStream(String address) throws Exception {
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new DefaultHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    // The raw URL stream is its own resource: if the GZIPInputStream
    // constructor throws (e.g. the payload is not gzip data), the
    // already-open connection stream is still closed instead of leaking.
    try (InputStream raw = new URL(address).openStream();
            InputStream stream = new GZIPInputStream(raw)) {
        parser.parse(stream, handler, metadata, context);
    }
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) URL(java.net.URL) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 49 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class TIAParsingExample method useAutoDetectParser.

/**
 * Parses an empty in-memory stream with an {@link AutoDetectParser},
 * using a fresh handler, metadata object and parse context.
 *
 * @throws Exception if the parse call fails
 */
public static void useAutoDetectParser() throws Exception {
    Parser detectingParser = new AutoDetectParser();

    byte[] noBytes = new byte[0];
    InputStream input = new ByteArrayInputStream(noBytes);
    ContentHandler sink = new DefaultHandler();

    detectingParser.parse(input, sink, new Metadata(), new ParseContext());
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Example 50 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

the class TXTParserTest method testLatinDetectionHeuristics.

/**
 * Exercises the fallback heuristics used to pick an eight-bit character
 * encoding for mostly ASCII input. When no more specific match is found,
 * text containing a CR(LF) is expected to be reported as windows-1252 and
 * other text as ISO-8859-1, unless it contains the currency/euro symbol
 * (byte 0xa4), in which case ISO-8859-15 is expected.
 */
@Test
public void testLatinDetectionHeuristics() throws Exception {
    // Each row: { input text, expected Content-Type after parsing }.
    String[][] cases = {
        { "test\r\n", "text/plain; charset=windows-1252" },
        { "test\n", "text/plain; charset=ISO-8859-1" },
        { "test €\n", "text/plain; charset=ISO-8859-15" },
    };
    for (String[] testCase : cases) {
        // All inputs are encoded as ISO-8859-15 before being handed to the parser.
        byte[] encoded = testCase[0].getBytes("ISO-8859-15");
        Metadata metadata = new Metadata();
        parser.parse(new ByteArrayInputStream(encoded), new DefaultHandler(), metadata, new ParseContext());
        assertEquals(testCase[1], metadata.get(Metadata.CONTENT_TYPE));
    }
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

DefaultHandler (org.xml.sax.helpers.DefaultHandler) 148, InputStream (java.io.InputStream) 65, Metadata (org.apache.tika.metadata.Metadata) 59, ParseContext (org.apache.tika.parser.ParseContext) 52, Test (org.junit.Test) 44, Attributes (org.xml.sax.Attributes) 41, SAXParser (javax.xml.parsers.SAXParser) 40, SAXException (org.xml.sax.SAXException) 39, ByteArrayInputStream (java.io.ByteArrayInputStream) 32, SAXParserFactory (javax.xml.parsers.SAXParserFactory) 29, IOException (java.io.IOException) 26, InputSource (org.xml.sax.InputSource) 23, ParserConfigurationException (javax.xml.parsers.ParserConfigurationException) 22, Parser (org.apache.tika.parser.Parser) 22, TikaInputStream (org.apache.tika.io.TikaInputStream) 20, ContentHandler (org.xml.sax.ContentHandler) 20, File (java.io.File) 19, AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 17, BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 16, FileInputStream (java.io.FileInputStream) 15