Use of org.xml.sax.helpers.DefaultHandler in the Apache Tika project.
Class TIAParsingExample, method testHtmlMapper.
/**
 * Parses an empty in-memory document with an {@link AutoDetectParser} after
 * registering an {@link IdentityHtmlMapper} under {@code HtmlMapper.class}
 * in the parse context. SAX events are sent to a plain {@link DefaultHandler}.
 *
 * @throws Exception if {@code parse} or closing the stream throws
 */
public static void testHtmlMapper() throws Exception {
    ContentHandler handler = new DefaultHandler();
    Metadata metadata = new Metadata();
    Parser parser = new AutoDetectParser();
    ParseContext context = new ParseContext();
    context.set(HtmlMapper.class, new IdentityHtmlMapper());
    // Scope the stream with try-with-resources so it is closed even when
    // parse throws; closing is this method's job, not the parser's.
    try (InputStream stream = new ByteArrayInputStream(new byte[0])) {
        parser.parse(stream, handler, metadata, context);
    }
}
Use of org.xml.sax.helpers.DefaultHandler in the Apache Tika project.
Class TIAParsingExample, method useHtmlParser.
/**
 * Parses an empty in-memory document directly with an {@link HtmlParser},
 * sending SAX events to a plain {@link DefaultHandler} and collecting
 * metadata into a fresh {@link Metadata} instance.
 *
 * @throws Exception if {@code parse} or closing the stream throws
 */
public static void useHtmlParser() throws Exception {
    ContentHandler handler = new DefaultHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    Parser parser = new HtmlParser();
    // Scope the stream with try-with-resources so it is closed even when
    // parse throws; closing is this method's job, not the parser's.
    try (InputStream stream = new ByteArrayInputStream(new byte[0])) {
        parser.parse(stream, handler, metadata, context);
    }
}
Use of org.xml.sax.helpers.DefaultHandler in the Apache Tika project.
Class TIAParsingExample, method parseURLStream.
/**
 * Opens the resource at {@code address}, decompresses it as gzip, and parses
 * the result with an {@link AutoDetectParser}. SAX events are sent to a plain
 * {@link DefaultHandler}.
 *
 * @param address URL string of the gzip-compressed resource to parse
 * @throws Exception if the URL is malformed, the stream cannot be opened or
 *         is not valid gzip, or {@code parse} throws
 */
public static void parseURLStream(String address) throws Exception {
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new DefaultHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    // Declare the raw URL stream as its own resource: the GZIPInputStream
    // constructor reads the gzip header and can throw, and in that case a
    // nested "new GZIPInputStream(url.openStream())" would leave the
    // underlying connection open.
    try (InputStream raw = new URL(address).openStream();
         InputStream stream = new GZIPInputStream(raw)) {
        parser.parse(stream, handler, metadata, context);
    }
}
Use of org.xml.sax.helpers.DefaultHandler in the Apache Tika project.
Class TIAParsingExample, method useAutoDetectParser.
/**
 * Parses an empty in-memory document with an {@link AutoDetectParser},
 * sending SAX events to a plain {@link DefaultHandler} and collecting
 * metadata into a fresh {@link Metadata} instance.
 *
 * @throws Exception if {@code parse} or closing the stream throws
 */
public static void useAutoDetectParser() throws Exception {
    ContentHandler handler = new DefaultHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    Parser parser = new AutoDetectParser();
    // Scope the stream with try-with-resources so it is closed even when
    // parse throws; closing is this method's job, not the parser's.
    try (InputStream stream = new ByteArrayInputStream(new byte[0])) {
        parser.parse(stream, handler, metadata, context);
    }
}
Use of org.xml.sax.helpers.DefaultHandler in the Apache Tika project.
Class TXTParserTest, method testLatinDetectionHeuristics.
/**
 * Exercises the fallback rules used to pick an eight-bit character encoding
 * for text that is mostly ASCII. When nothing more specific matches:
 * <ul>
 *   <li>text containing a CR(LF) is reported as windows-1252,</li>
 *   <li>text without one is reported as ISO-8859-1,</li>
 *   <li>text containing the currency/euro symbol (byte 0xa4) is reported
 *       as ISO-8859-15.</li>
 * </ul>
 * Every sample is encoded as ISO-8859-15 before being parsed.
 */
@Test
public void testLatinDetectionHeuristics() throws Exception {
    // Each row: { sample text, Content-Type expected in the metadata after parsing }.
    final String[][] cases = {
        {"test\r\n", "text/plain; charset=windows-1252"},
        {"test\n", "text/plain; charset=ISO-8859-1"},
        {"test €\n", "text/plain; charset=ISO-8859-15"},
    };
    for (String[] row : cases) {
        byte[] encoded = row[0].getBytes("ISO-8859-15");
        // Fresh metadata per sample so one result cannot leak into the next.
        Metadata detected = new Metadata();
        parser.parse(new ByteArrayInputStream(encoded), new DefaultHandler(), detected, new ParseContext());
        assertEquals(row[1], detected.get(Metadata.CONTENT_TYPE));
    }
}
Aggregations