Use of org.apache.commons.io.input.CloseShieldInputStream in the Apache Tika project.
Class DIFParser, method parse.
/**
 * Parses the stream as XML using the SAX parser supplied by the parse context.
 * <p>
 * An XHTML document holding a single {@code <p>} element is opened on
 * {@code handler} before parsing starts and is always closed afterwards,
 * including when parsing fails.
 *
 * @param stream   the input to parse; it is wrapped in a close shield before
 *                 being handed to the SAX parser
 * @param handler  receives the XHTML wrapper events and is the handler the
 *                 tagged handler wraps
 * @param metadata passed to the XHTML handler and to {@code getContentHandler}
 * @param context  supplies the SAX parser and is passed to {@code getContentHandler}
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the tagged handler rethrows the parse failure, or if
 *                       writing the XHTML wrapper fails
 * @throws TikaException with the message "XML parse error" for a SAX failure
 *                       that the tagged handler does not rethrow
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.startElement("p");

    final TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
    try {
        // Obtain the parser first, then build its arguments, so that the
        // order of side effects matches a single chained call.
        final javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
        final InputStream shieldedStream = new CloseShieldInputStream(stream);
        final OfflineContentHandler offlineHandler = new OfflineContentHandler(
                new EmbeddedContentHandler(getContentHandler(taggedHandler, metadata, context)));
        saxParser.parse(shieldedStream, offlineHandler);
    } catch (SAXException parseFailure) {
        // Let the tagged handler rethrow the failure if it recognises it;
        // anything else is reported as a Tika-level parse error.
        taggedHandler.throwIfCauseOf(parseFailure);
        throw new TikaException("XML parse error", parseFailure);
    } finally {
        // Close the wrapper elements whether or not parsing succeeded.
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
Use of org.apache.commons.io.input.CloseShieldInputStream in the Apache Tika project.
Class EnviHeaderParser, method parse.
/**
 * Emits the text of an ENVI header as XHTML, one {@code <p>} element per line.
 * <p>
 * The content type is set to {@code ENVI_MIME_TYPE}, and the character set
 * reported by the auto-detecting reader is recorded as the content encoding.
 *
 * @param stream   the header to read; it is wrapped in a close shield, so
 *                 closing the reader does not close it
 * @param handler  receives the generated XHTML events
 * @param metadata updated with the content type and content encoding, and
 *                 passed to the reader and the XHTML handler
 * @param context  used to look up the encoding detector
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the handler rejects an XHTML event
 * @throws TikaException if the reader cannot be created
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only outputting the MIME type as metadata
    metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);

    // Automatically detect the character encoding (approach taken from the TXTParser).
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        // Text contents of the XHTML: every input line, including empty ones,
        // becomes its own paragraph.
        String line;
        while ((line = reader.readLine()) != null) {
            xhtml.startElement("p");
            xhtml.characters(line);
            xhtml.endElement("p");
        }
        xhtml.endDocument();
    }
}
Use of org.apache.commons.io.input.CloseShieldInputStream in the Apache Tika project.
Class EpubContentParser, method parse.
/**
 * Parses the stream as XML with the SAX parser from the parse context,
 * sending the events to {@code handler} through an {@code OfflineContentHandler}.
 *
 * @param stream   the input to parse; it is wrapped in a close shield before
 *                 being handed to the SAX parser
 * @param handler  receives the SAX events
 * @param metadata not used by this method
 * @param context  supplies the SAX parser
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the XML cannot be parsed or the handler fails
 * @throws TikaException if the SAX parser cannot be obtained
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    context.getSAXParser().parse(
            new CloseShieldInputStream(stream),
            new OfflineContentHandler(handler));
}
Use of org.apache.commons.io.input.CloseShieldInputStream in the Apache Tika project.
Class TikaCLI, method process.
/**
 * Handles a single command-line argument.
 * <p>
 * Exact-match options are dispatched by the {@code switch}. Options that
 * carry a value ({@code --name=value} or a short flag immediately followed by
 * its value) are matched by prefix in the {@code default} branch. Anything
 * else is treated as a positional argument: a port number when server mode
 * is on, {@code -} for standard input, or otherwise a file path or URL to
 * process.
 *
 * @param arg the argument to handle
 * @throws Exception if handling the argument fails
 */
public void process(String arg) throws Exception {
    switch (arg) {
        case "-?":
        case "--help":
            pipeMode = false;
            usage();
            break;
        case "-V":
        case "--version":
            pipeMode = false;
            version();
            break;
        case "-v":
        case "--verbose":
            org.apache.log4j.Logger.getRootLogger().setLevel(Level.DEBUG);
            break;
        case "-g":
        case "--gui":
            pipeMode = false;
            TikaGUI.main(configFilePath != null ? new String[] { configFilePath } : new String[0]);
            break;
        case "--list-parser":
        case "--list-parsers":
            pipeMode = false;
            displayParsers(false, false);
            break;
        case "--list-detector":
        case "--list-detectors":
            pipeMode = false;
            displayDetectors();
            break;
        case "--list-parser-detail":
        case "--list-parser-details":
            pipeMode = false;
            displayParsers(true, false);
            break;
        case "--list-parser-detail-apt":
        case "--list-parser-details-apt":
            pipeMode = false;
            displayParsers(true, true);
            break;
        case "--list-met-models":
            pipeMode = false;
            displayMetModels();
            break;
        case "--list-supported-types":
            pipeMode = false;
            displaySupportedTypes();
            break;
        case "--dump-minimal-config":
            pipeMode = false;
            dumpConfig(TikaConfigSerializer.Mode.MINIMAL);
            break;
        case "--dump-current-config":
            pipeMode = false;
            dumpConfig(TikaConfigSerializer.Mode.CURRENT);
            break;
        case "--dump-static-config":
            pipeMode = false;
            dumpConfig(TikaConfigSerializer.Mode.STATIC);
            break;
        case "--dump-static-full-config":
            pipeMode = false;
            dumpConfig(TikaConfigSerializer.Mode.STATIC_FULL);
            break;
        case "--container-aware":
        case "--container-aware-detector":
            // ignore, as container-aware detectors are now always used
            break;
        case "-f":
        case "--fork":
            fork = true;
            break;
        case "-j":
        case "--json":
            type = JSON;
            break;
        case "-J":
        case "--jsonRecursive":
            recursiveJSON = true;
            break;
        case "-y":
        case "--xmp":
            type = XMP;
            break;
        case "-x":
        case "--xml":
            type = XML;
            break;
        case "-h":
        case "--html":
            type = HTML;
            break;
        case "-t":
        case "--text":
            type = TEXT;
            break;
        case "-T":
        case "--text-main":
            type = TEXT_MAIN;
            break;
        case "-m":
        case "--metadata":
            type = METADATA;
            break;
        case "-l":
        case "--language":
            type = LANGUAGE;
            break;
        case "-d":
        case "--detect":
            type = DETECT;
            break;
        case "-z":
        case "--extract":
            type = NO_OUTPUT;
            context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
            break;
        case "-r":
        case "--pretty-print":
            prettyPrint = true;
            break;
        case "-p":
        case "--port":
        case "-s":
        case "--server":
            serverMode = true;
            pipeMode = false;
            break;
        default:
            // Value-carrying options, tested in a fixed order, followed by
            // positional arguments.
            if (arg.startsWith("--compare-file-magic=")) {
                pipeMode = false;
                compareFileMagic(arg.substring(arg.indexOf('=') + 1));
            } else if (arg.startsWith("--config=")) {
                configure(arg.substring("--config=".length()));
            } else if (arg.startsWith("--digest=")) {
                digester = new CommonsDigester(MAX_MARK, CommonsDigester.parse(arg.substring("--digest=".length())));
                parser = new DigestingParser(parser, digester);
            } else if (arg.startsWith("-e")) {
                encoding = arg.substring("-e".length());
            } else if (arg.startsWith("--encoding=")) {
                encoding = arg.substring("--encoding=".length());
            } else if (arg.startsWith("-p") && !arg.equals("-p")) {
                // A bare "-p" selects server mode; "-p<value>" is a password.
                password = arg.substring("-p".length());
            } else if (arg.startsWith("--password=")) {
                password = arg.substring("--password=".length());
            } else if (arg.startsWith("--extract-dir=")) {
                extractDir = new File(arg.substring("--extract-dir=".length()));
            } else if (arg.startsWith("-c")) {
                parser = new NetworkParser(new URI(arg.substring("-c".length())));
            } else if (arg.startsWith("--client=")) {
                parser = new NetworkParser(new URI(arg.substring("--client=".length())));
            } else {
                pipeMode = false;
                if (serverMode) {
                    // In server mode the positional argument is the port number.
                    new TikaServer(Integer.parseInt(arg)).start();
                } else if (arg.equals("-")) {
                    // Read from standard input; the close shield wraps System.in
                    // before it is handed to the TikaInputStream.
                    try (InputStream stdin = TikaInputStream.get(new CloseShieldInputStream(System.in))) {
                        type.process(stdin, System.out, new Metadata());
                    }
                } else {
                    // An existing file is addressed by its file URL; anything
                    // else must itself be a URL.
                    File file = new File(arg);
                    URL url = file.isFile() ? file.toURI().toURL() : new URL(arg);
                    if (recursiveJSON) {
                        handleRecursiveJson(url, System.out);
                    } else {
                        Metadata metadata = new Metadata();
                        try (InputStream input = TikaInputStream.get(url, metadata)) {
                            type.process(input, System.out, metadata);
                        } finally {
                            System.out.flush();
                        }
                    }
                }
            }
            break;
    }
}
Use of org.apache.commons.io.input.CloseShieldInputStream in the Apache Tika project.
Class OfficeParser, method parse.
/**
 * Extracts properties and text from an MS Document input stream.
 * <p>
 * The root directory node is taken from, in order of preference: a container
 * already opened on the {@code TikaInputStream}, a new file system opened on
 * the stream's backing file, or a new file system read from the stream
 * itself. A file system created for a plain (non-Tika) stream is closed
 * before this method returns; one created for a {@code TikaInputStream} is
 * registered on that stream as its open container instead.
 *
 * @param stream   the document to parse
 * @param handler  receives the generated XHTML events
 * @param metadata passed on to the XHTML handler and the document parser
 * @param context  parse configuration; consulted for the
 *                 {@code OfficeParserConfig} that controls macro extraction
 * @throws IOException   if reading the document fails
 * @throws SAXException  if the handler rejects an XHTML event
 * @throws TikaException if the document cannot be parsed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    configure(context);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    final TikaInputStream tikaStream = TikaInputStream.cast(stream);
    // Set only when this method itself owns the file system and must close it.
    NPOIFSFileSystem ownedFs = null;
    try {
        final DirectoryNode root;
        if (tikaStream != null) {
            final Object openContainer = tikaStream.getOpenContainer();
            if (openContainer instanceof NPOIFSFileSystem) {
                root = ((NPOIFSFileSystem) openContainer).getRoot();
            } else if (openContainer instanceof DirectoryNode) {
                root = (DirectoryNode) openContainer;
            } else {
                final NPOIFSFileSystem streamFs = tikaStream.hasFile()
                        ? new NPOIFSFileSystem(tikaStream.getFile(), true)
                        : new NPOIFSFileSystem(new CloseShieldInputStream(tikaStream));
                // tikaStream will close the file system, so it is not closed below
                tikaStream.setOpenContainer(streamFs);
                root = streamFs.getRoot();
            }
        } else {
            ownedFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
            root = ownedFs.getRoot();
        }

        parse(root, context, metadata, xhtml);

        if (context.get(OfficeParserConfig.class).getExtractMacros()) {
            // now try to get macros
            extractMacros(root.getNFileSystem(), xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
        }
    } finally {
        IOUtils.closeQuietly(ownedFs);
    }
    xhtml.endDocument();
}
Aggregations