use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.
the class HtmlParser method parse.
/**
 * Parses an HTML document: detects its character encoding, records the
 * resulting content type and encoding in the metadata, and runs the
 * content through TagSoup into the given handler.
 *
 * <p>The content type is rewritten (with the detected charset attached)
 * only when the incoming value is absent or starts with one of the four
 * recognised prefixes; any other value is left as it was.
 *
 * @param stream   the document bytes; wrapped in a CloseShieldInputStream
 *                 before being handed to the encoding-detecting reader
 * @param handler  receives the SAX events, via HtmlHandler and
 *                 XHTMLDowngradeHandler
 * @param metadata read for the incoming content type; updated with the
 *                 content type and content encoding
 * @param context  consulted for the encoding detector and for optional
 *                 HtmlMapper and Schema overrides
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Automatically detect the character encoding
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset detected = reader.getCharset();

        // Pick the base media type from the declared content type, if it is one we recognise.
        String declared = metadata.get(Metadata.CONTENT_TYPE);
        MediaType baseType = null;
        if (declared == null || declared.startsWith("text/html")) {
            baseType = MediaType.TEXT_HTML;
        } else if (declared.startsWith("application/xhtml+xml")) {
            baseType = XHTML;
        } else if (declared.startsWith("application/vnd.wap.xhtml+xml")) {
            baseType = WAP_XHTML;
        } else if (declared.startsWith("application/x-asp")) {
            baseType = X_ASP;
        }
        if (baseType != null) {
            metadata.set(Metadata.CONTENT_TYPE, new MediaType(baseType, detected).toString());
        }

        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detected.name());

        // Mapper comes from the parse context, falling back to a fresh HtmlParserMapper
        HtmlMapper htmlMapper = context.get(HtmlMapper.class, new HtmlParserMapper());

        org.ccil.cowan.tagsoup.Parser tagSoup = new org.ccil.cowan.tagsoup.Parser();

        // Schema comes from the parse context, falling back to HTML_SCHEMA
        Schema htmlSchema = context.get(Schema.class, HTML_SCHEMA);
        // TIKA-528: Reuse share schema to avoid heavy instantiation
        tagSoup.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, htmlSchema);
        // TIKA-599: Shared schema is thread-safe only if bogons are ignored
        tagSoup.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

        HtmlHandler htmlHandler = new HtmlHandler(htmlMapper, handler, metadata);
        tagSoup.setContentHandler(new XHTMLDowngradeHandler(htmlHandler));

        // Parse the HTML document
        tagSoup.parse(reader.asInputSource());
    }
}
use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.
the class SourceCodeParser method parse.
/**
 * Parses a source-code file: reads it line by line, records author and
 * line-count metadata, renders the text as highlighted HTML and feeds that
 * HTML through TagSoup into the given handler.
 *
 * <p>When the metadata carries no content type or no resource name, the
 * encoding-detecting reader is still opened and closed, but this method
 * emits nothing to the handler and makes no further metadata changes.
 *
 * @param stream   the source file bytes; wrapped in a CloseShieldInputStream
 *                 before being handed to the encoding-detecting reader
 * @param handler  receives the SAX events of the highlighted HTML
 * @param metadata read for content type and resource name; updated with the
 *                 content type, content encoding, one creator entry per
 *                 line for which {@code parserAuthor} returns a value, and
 *                 a {@code "LoC"} line count
 * @param context  consulted for the encoding detector and an optional
 *                 Schema override
 * @throws TikaException if the content type in the metadata cannot be
 *                       parsed as a media type
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();
        String mediaType = metadata.get(Metadata.CONTENT_TYPE);
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (mediaType == null || name == null) {
            // Both values are needed to choose a renderer and highlight the file.
            return;
        }

        MediaType type = MediaType.parse(mediaType);
        if (type == null) {
            // MediaType.parse yields null for an unparsable string; fail with a
            // parser exception instead of a NullPointerException further down.
            throw new TikaException("Unable to parse content type: " + mediaType);
        }
        String typeName = type.toString();
        metadata.set(Metadata.CONTENT_TYPE, typeName);
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        // Looked up once: the separator does not change between lines.
        final String lineSeparator = System.lineSeparator();
        StringBuilder out = new StringBuilder();
        int nbLines = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            // Chained appends avoid building a temporary String per line.
            out.append(line).append(lineSeparator);
            String author = parserAuthor(line);
            if (author != null) {
                metadata.add(TikaCoreProperties.CREATOR, author);
            }
            nbLines++;
        }
        metadata.set("LoC", String.valueOf(nbLines));

        Renderer renderer = getRenderer(typeName);
        String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);

        // Run the generated HTML through TagSoup so the handler receives SAX events.
        Schema schema = context.get(Schema.class, HTML_SCHEMA);
        org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
        parser.setContentHandler(handler);
        parser.parse(new InputSource(new StringReader(codeAsHtml)));
    }
}
use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.
the class Pkcs7Parser method parse.
/**
 * Unwraps PKCS#7 signed data and hands the signed content to the parser
 * found in the parse context (EmptyParser.INSTANCE when none is set).
 *
 * @param stream   the PKCS#7 bytes; wrapped in a CloseShieldInputStream
 *                 before being handed to the CMS parser
 * @param handler  passed through to the delegate parser
 * @param metadata passed through to the delegate parser
 * @param context  supplies the delegate parser and is passed on to it
 * @throws TikaException if there is no signed content (detached
 *                       signature), if the digest calculator provider
 *                       cannot be created, or if the CMS data cannot be
 *                       parsed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try {
        DigestCalculatorProvider digests = new JcaDigestCalculatorProviderBuilder().setProvider("BC").build();
        CMSSignedDataParser signedData = new CMSSignedDataParser(digests, new CloseShieldInputStream(stream));
        try {
            CMSTypedStream signedContent = signedData.getSignedContent();
            if (signedContent == null) {
                throw new TikaException("cannot parse detached pkcs7 signature (no signed data to parse)");
            }
            try (InputStream payload = signedContent.getContentStream()) {
                // Delegate lookup happens after the content stream is opened.
                context.get(Parser.class, EmptyParser.INSTANCE).parse(payload, handler, metadata, context);
            }
        } finally {
            // Always release the CMS parser, whether or not delegation succeeded.
            signedData.close();
        }
    } catch (OperatorCreationException providerFailure) {
        throw new TikaException("Unable to create DigestCalculatorProvider", providerFailure);
    } catch (CMSException cmsFailure) {
        throw new TikaException("Unable to parse pkcs7 signed data", cmsFailure);
    }
}
use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.
the class FeedParser method parse.
/**
 * Parses a syndication feed and emits it as XHTML: the feed title as an
 * {@code h1}, the description as a {@code p}, and a {@code ul} with one
 * {@code li} per entry that has a link.
 *
 * @param stream   the feed bytes; wrapped in a CloseShieldInputStream
 *                 before being handed to the feed builder
 * @param handler  receives the generated XHTML events
 * @param metadata updated with the feed title and description
 * @param context  not used by this method
 * @throws TikaException if the feed builder reports a FeedException
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // set the encoding?
    try {
        InputSource source = new InputSource(new CloseShieldInputStream(stream));
        SyndFeed feed = new SyndFeedInput().build(source);

        String title = stripTags(feed.getTitleEx());
        String description = stripTags(feed.getDescriptionEx());
        metadata.set(TikaCoreProperties.TITLE, title);
        metadata.set(TikaCoreProperties.DESCRIPTION, description);

        // store the other fields in the metadata
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.element("h1", title);
        xhtml.element("p", description);

        xhtml.startElement("ul");
        for (Object item : feed.getEntries()) {
            SyndEntry entry = (SyndEntry) item;
            String link = entry.getLink();
            if (link == null) {
                // Entries without a link produce no list item.
                continue;
            }
            xhtml.startElement("li");
            xhtml.startElement("a", "href", link);
            xhtml.characters(stripTags(entry.getTitleEx()));
            xhtml.endElement("a");

            SyndContent summary = entry.getDescription();
            if (summary != null) {
                xhtml.newline();
                xhtml.characters(stripTags(summary));
            }
            xhtml.endElement("li");
        }
        xhtml.endElement("ul");

        xhtml.endDocument();
    } catch (FeedException feedFailure) {
        throw new TikaException("RSS parse error", feedFailure);
    }
}
use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.
the class ImageParser method parse.
/**
 * Extracts image dimensions and ImageIO metadata for the content type
 * named in the metadata, then emits an empty XHTML document.
 *
 * <p>When the metadata has no content type, only the empty document is
 * emitted. When no ImageIO reader is registered for the type, the stream is
 * not read, but the metadata-tag translation below still runs.
 *
 * @param stream   the image bytes; wrapped in a CloseShieldInputStream
 *                 before being handed to ImageIO
 * @param handler  receives the (empty) XHTML document
 * @param metadata read for the content type; updated with width, height and
 *                 whatever {@code loadMetadata} and {@code setIfPresent}
 *                 add
 * @param context  not used by this method
 * @throws TikaException if ImageIO raises an IIOException, except for the
 *                       one GIF-specific failure that is ignored below
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    String type = metadata.get(Metadata.CONTENT_TYPE);
    if (type != null) {
        // fix it up to the new one, so Java is happy
        if (OLD_BMP_TYPE.toString().equals(type)) {
            type = MAIN_BMP_TYPE.toString();
        }
        try {
            Iterator<ImageReader> readers = ImageIO.getImageReadersByMIMEType(type);
            if (readers.hasNext()) {
                ImageReader imageReader = readers.next();
                // The image stream is closed first, then the reader is disposed.
                try (ImageInputStream imageInput = ImageIO.createImageInputStream(new CloseShieldInputStream(stream))) {
                    imageReader.setInput(imageInput);
                    metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(imageReader.getWidth(0)));
                    metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(imageReader.getHeight(0)));
                    metadata.set("height", Integer.toString(imageReader.getHeight(0)));
                    metadata.set("width", Integer.toString(imageReader.getWidth(0)));
                    loadMetadata(imageReader.getImageMetadata(0), metadata);
                } finally {
                    imageReader.dispose();
                }
            }
            // Translate certain Metadata tags from the ImageIO
            // specific namespace into the general Tika one
            setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS);
            setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS);
            setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
        } catch (IIOException e) {
            // An "Unexpected block type 0!" failure on an image/gif is deliberately
            // ignored; every other IIOException is rethrown as a TikaException.
            boolean ignorableGifFailure = "Unexpected block type 0!".equals(e.getMessage()) && type.equals("image/gif");
            if (!ignorableGifFailure) {
                throw new TikaException(type + " parse error", e);
            }
        }
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Aggregations