use of org.apache.tika.detect.AutoDetectReader in project tika by apache.
the class TXTParser method parse.
/**
 * Reads the stream as text using an auto-detected character encoding and
 * writes all of it to the handler as the content of a single {@code p} element.
 *
 * <p>The {@code Content-Type} metadata entry is rewritten as the incoming
 * media type (or {@code text/plain} when none is present or it cannot be
 * parsed) combined with the detected charset.</p>
 *
 * @param stream   the text to parse; handed to the reader wrapped in a
 *                 {@code CloseShieldInputStream}
 * @param handler  receiver of the XHTML SAX events
 * @param metadata read for the incoming content type; updated with the
 *                 content type and content encoding
 * @param context  parse context used to look up the encoding detector
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException declared by the parser contract
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        // The incoming type may be a subtype of text/plain (vcal, etc.);
        // keep it when it parses, otherwise fall back to text/plain.
        String declared = metadata.get(Metadata.CONTENT_TYPE);
        MediaType parsed = (declared == null) ? null : MediaType.parse(declared);
        MediaType base = (parsed == null) ? MediaType.TEXT_PLAIN : parsed;

        Charset detected = reader.getCharset();
        metadata.set(Metadata.CONTENT_TYPE, new MediaType(base, detected).toString());
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detected.name());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        // Copy the decoded text through in 4096-char chunks until end of stream.
        char[] chunk = new char[4096];
        for (int count = reader.read(chunk); count != -1; count = reader.read(chunk)) {
            xhtml.characters(chunk, 0, count);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
use of org.apache.tika.detect.AutoDetectReader in project tika by apache.
the class HtmlParser method parse.
/**
 * Parses an HTML document with TagSoup, using an auto-detected character
 * encoding, and forwards the mapped events to the given handler.
 *
 * <p>When the incoming {@code Content-Type} is absent or starts with one of
 * the recognised HTML-family types, it is rewritten as that type combined
 * with the detected charset; any other incoming value is left untouched.</p>
 *
 * @param stream   the HTML to parse; handed to the reader wrapped in a
 *                 {@code CloseShieldInputStream}
 * @param handler  receiver of the resulting SAX events
 * @param metadata read for the incoming content type; updated with the
 *                 content type (when recognised) and content encoding
 * @param context  source of the encoding detector and of optional
 *                 {@code HtmlMapper} and {@code Schema} overrides
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if parsing or event delivery fails
 * @throws TikaException declared by the parser contract
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();

        // Choose the base type to re-emit; null means "leave Content-Type alone".
        String incoming = metadata.get(Metadata.CONTENT_TYPE);
        MediaType base = null;
        if (incoming == null || incoming.startsWith("text/html")) {
            base = MediaType.TEXT_HTML;
        } else if (incoming.startsWith("application/xhtml+xml")) {
            base = XHTML;
        } else if (incoming.startsWith("application/vnd.wap.xhtml+xml")) {
            base = WAP_XHTML;
        } else if (incoming.startsWith("application/x-asp")) {
            base = X_ASP;
        }
        if (base != null) {
            metadata.set(Metadata.CONTENT_TYPE, new MediaType(base, charset).toString());
        }
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        // Mapper and schema come from the parse context, with defaults.
        HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper());
        Schema schema = context.get(Schema.class, HTML_SCHEMA);

        org.ccil.cowan.tagsoup.Parser tagSoup = new org.ccil.cowan.tagsoup.Parser();
        // TIKA-528: Reuse shared schema to avoid heavy instantiation
        tagSoup.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
        // TIKA-599: Shared schema is thread-safe only if bogons are ignored
        tagSoup.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

        ContentHandler mapped = new HtmlHandler(mapper, handler, metadata);
        tagSoup.setContentHandler(new XHTMLDowngradeHandler(mapped));
        tagSoup.parse(reader.asInputSource());
    }
}
use of org.apache.tika.detect.AutoDetectReader in project tika by apache.
the class SourceCodeParser method parse.
/**
 * Reads a source-code file with an auto-detected character encoding, records
 * author and line-count metadata, renders the code as highlighted HTML and
 * feeds that HTML through TagSoup to the given handler.
 *
 * <p>Nothing is emitted unless both the {@code Content-Type} and the
 * resource name are present in the metadata.</p>
 *
 * @param stream   the source code to parse; handed to the reader wrapped in a
 *                 {@code CloseShieldInputStream}
 * @param handler  receiver of the SAX events produced from the rendered HTML
 * @param metadata read for content type and resource name; updated with
 *                 content type, content encoding, creators and {@code LoC}
 * @param context  source of the encoding detector and an optional
 *                 {@code Schema} override
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if parsing the rendered HTML or event delivery fails
 * @throws TikaException if the incoming content type cannot be parsed
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();
        String mediaType = metadata.get(Metadata.CONTENT_TYPE);
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (mediaType == null || name == null) {
            return;
        }

        // MediaType.parse yields null for an unparseable string; report that
        // as a parse failure instead of failing with a NullPointerException.
        MediaType type = MediaType.parse(mediaType);
        if (type == null) {
            throw new TikaException("Unparseable content type: " + mediaType);
        }
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        // Look the separator up once rather than on every line.
        String lineSeparator = System.lineSeparator();
        StringBuilder out = new StringBuilder();
        String line;
        int nbLines = 0;
        while ((line = reader.readLine()) != null) {
            out.append(line).append(lineSeparator);
            String author = parserAuthor(line);
            if (author != null) {
                metadata.add(TikaCoreProperties.CREATOR, author);
            }
            nbLines++;
        }
        metadata.set("LoC", String.valueOf(nbLines));

        Renderer renderer = getRenderer(type.toString());
        String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);

        Schema schema = context.get(Schema.class, HTML_SCHEMA);
        org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
        // NOTE(review): if HTML_SCHEMA is shared across threads, TIKA-599 suggests
        // ignoreBogonsFeature may need to be enabled here as well - confirm.
        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
        parser.setContentHandler(handler);
        parser.parse(new InputSource(new StringReader(codeAsHtml)));
    }
}
use of org.apache.tika.detect.AutoDetectReader in project tika by apache.
the class EnviHeaderParser method parse.
/**
 * Parses an ENVI header as text with an auto-detected character encoding,
 * emitting every line as its own {@code p} element.
 *
 * <p>The {@code Content-Type} metadata entry is set to
 * {@code ENVI_MIME_TYPE} before the stream is read.</p>
 *
 * @param stream   the header text to parse; handed to the reader wrapped in a
 *                 {@code CloseShieldInputStream}
 * @param handler  receiver of the XHTML SAX events
 * @param metadata updated with the content type and content encoding
 * @param context  parse context used to look up the encoding detector
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException declared by the parser contract
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only outputting the MIME type as metadata
    metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);

    // Automatically detect the character encoding
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        // text contents of the xhtml: one paragraph per input line
        String line;
        while ((line = reader.readLine()) != null) {
            xhtml.startElement("p");
            xhtml.characters(line);
            xhtml.endElement("p");
        }
        xhtml.endDocument();
    }
}
use of org.apache.tika.detect.AutoDetectReader in project tika by apache.
the class ISATabUtils method parseStudy.
/**
 * Reads a tab-delimited study stream with an auto-detected character encoding
 * and writes it to the handler as a {@code table}: the first record becomes
 * {@code th} cells inside {@code thead}, and each remaining record becomes a
 * {@code tr} of {@code td} cells inside {@code tbody}.
 *
 * @param stream   the tab-delimited content to parse
 * @param xhtml    handler that receives the table events; the document is
 *                 neither started nor ended here
 * @param metadata passed to the encoding-detecting reader
 * @param context  consulted for a {@code TikaConfig}; the default
 *                 configuration is used when none is present
 * @throws IOException   if reading the stream fails
 * @throws TikaException if encoding detection or configuration fails
 * @throws SAXException  if the handler rejects an event
 */
public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream tis = TikaInputStream.get(stream);

    // The encoding detector comes from the context's config, or the default one.
    TikaConfig config = context.get(TikaConfig.class);
    if (config == null) {
        config = TikaConfig.getDefaultConfig();
    }

    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(tis), metadata, config.getEncodingDetector());
         CSVParser csv = new CSVParser(reader, CSVFormat.TDF)) {
        Iterator<CSVRecord> rows = csv.iterator();

        xhtml.startElement("table");

        // Header: the first record, if any, written as th cells.
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            for (String heading : rows.next()) {
                xhtml.startElement("th");
                xhtml.characters(heading);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Body: every remaining record as one tr of td cells.
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            xhtml.startElement("tr");
            for (String cell : rows.next()) {
                xhtml.startElement("td");
                xhtml.characters(cell);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}
Aggregations