Use of org.apache.tika.io.TikaInputStream in project tika by apache.
The class OldExcelParserTest, method testPlainText.
/**
 * Check we can get the plain text properly
 */
@Test
public void testPlainText() throws Exception {
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (TikaInputStream stream = getTestFile(file)) {
        new OldExcelParser().parse(stream, handler, metadata, new ParseContext());
    }

    String text = handler.toString();

    // Check we find a few words we expect in there
    assertContains("Size", text);
    assertContains("Returns", text);

    // Check we find a few numbers we expect in there
    assertContains("11", text);
    assertContains("784", text);
}
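Here getTestFile and file are helpers/fields of the surrounding test class. A minimal stand-alone sketch of the same flow, assuming a hypothetical local legacy.xls and using TikaInputStream.get directly:

import java.nio.file.Paths;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OldExcelParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

public class OldExcelPlainText {
    public static void main(String[] args) throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        // try-with-resources closes the TikaInputStream and its underlying file
        try (TikaInputStream stream = TikaInputStream.get(Paths.get("legacy.xls"))) {
            new OldExcelParser().parse(stream, handler, metadata, new ParseContext());
        }
        // BodyContentHandler has accumulated the plain-text body
        System.out.println(handler.toString());
    }
}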
Use of org.apache.tika.io.TikaInputStream in project tika by apache.
The class AbstractPOIContainerExtractionTest, method process.
protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
    try (TikaInputStream stream = getTestFile(filename)) {
        assertEquals(true, extractor.isSupported(stream));

        // Process it
        TrackingHandler handler = new TrackingHandler();
        if (recurse) {
            extractor.extract(stream, extractor, handler);
        } else {
            extractor.extract(stream, null, handler);
        }

        // So they can check what happened
        return handler;
    }
}
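In the test, TrackingHandler simply records the name and media type of every embedded resource it is offered. A stand-alone sketch of the same mechanics with ParserContainerExtractor and a plain EmbeddedResourceHandler, assuming a hypothetical word-with-embedded-excel.doc:

import java.nio.file.Paths;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;

public class ListEmbeddedResources {
    public static void main(String[] args) throws Exception {
        ContainerExtractor extractor = new ParserContainerExtractor();
        EmbeddedResourceHandler handler = (filename, mediaType, stream) ->
                System.out.println(filename + " -> " + mediaType);

        try (TikaInputStream stream = TikaInputStream.get(Paths.get("word-with-embedded-excel.doc"))) {
            if (extractor.isSupported(stream)) {
                // Passing the extractor itself as the second argument recurses into
                // nested containers; passing null stops at the top-level children.
                extractor.extract(stream, extractor, handler);
            }
        }
    }
}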
Use of org.apache.tika.io.TikaInputStream in project tika by apache.
The class FSBatchProcessCLI, method execute.
private void execute(String[] args) throws Exception {
    CommandLineParser cliParser = new DefaultParser();
    CommandLine line = cliParser.parse(options, args);

    if (line.hasOption("help")) {
        usage();
        System.exit(BatchProcessDriverCLI.PROCESS_NO_RESTART_EXIT_CODE);
    }

    Map<String, String> mapArgs = new HashMap<String, String>();
    for (Option option : line.getOptions()) {
        String v = option.getValue();
        if (v == null || v.equals("")) {
            v = "true";
        }
        mapArgs.put(option.getOpt(), v);
    }

    BatchProcessBuilder b = new BatchProcessBuilder();
    TikaInputStream is = null;
    BatchProcess process = null;
    try {
        is = getConfigInputStream(args, false);
        process = b.build(is, mapArgs);
    } finally {
        IOUtils.closeQuietly(is);
    }

    final Thread mainThread = Thread.currentThread();
    ExecutorService executor = Executors.newSingleThreadExecutor();
    Future<ParallelFileProcessingResult> futureResult = executor.submit(process);
    ParallelFileProcessingResult result = futureResult.get();

    System.out.println(FINISHED_STRING);
    System.out.println("\n");
    System.out.println(result.toString());
    System.exit(result.getExitStatus());
}
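The loop above flattens the commons-cli options into the String map that BatchProcessBuilder takes alongside the config stream; an option given without a value is recorded as "true". A self-contained sketch of that flattening, with hypothetical option names:

import java.util.HashMap;
import java.util.Map;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;

public class ArgsToMap {
    public static void main(String[] args) throws Exception {
        Options options = new Options();
        options.addOption("inputDir", true, "input directory");
        options.addOption("numConsumers", true, "number of consumer threads");
        options.addOption("help", false, "print help");

        CommandLine line = new DefaultParser().parse(options,
                new String[]{"-inputDir", "docs", "-help"});

        Map<String, String> mapArgs = new HashMap<String, String>();
        for (Option option : line.getOptions()) {
            String v = option.getValue();
            if (v == null || v.equals("")) {
                v = "true";   // valueless flags are recorded as "true"
            }
            mapArgs.put(option.getOpt(), v);
        }
        // Prints the flattened map, e.g. {inputDir=docs, help=true}
        System.out.println(mapArgs);
    }
}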
Use of org.apache.tika.io.TikaInputStream in project tika by apache.
The class TikaGUI, method handleStream.
private void handleStream(InputStream input, Metadata md) throws Exception {
    StringWriter htmlBuffer = new StringWriter();
    StringWriter textBuffer = new StringWriter();
    StringWriter textMainBuffer = new StringWriter();
    StringWriter xmlBuffer = new StringWriter();
    StringBuilder metadataBuffer = new StringBuilder();

    ContentHandler handler = new TeeContentHandler(getHtmlHandler(htmlBuffer),
            getTextContentHandler(textBuffer), getTextMainContentHandler(textMainBuffer),
            getXmlContentHandler(xmlBuffer));

    context.set(DocumentSelector.class, new ImageDocumentSelector());
    input = TikaInputStream.get(new ProgressMonitorInputStream(this, "Parsing stream", input));

    if (input.markSupported()) {
        int mark = -1;
        if (input instanceof TikaInputStream) {
            if (((TikaInputStream) input).hasFile()) {
                mark = (int) ((TikaInputStream) input).getLength();
            }
        }
        if (mark == -1) {
            mark = MAX_MARK;
        }
        input.mark(mark);
    }

    parser.parse(input, handler, md, context);

    String[] names = md.names();
    Arrays.sort(names);
    for (String name : names) {
        for (String val : md.getValues(name)) {
            metadataBuffer.append(name);
            metadataBuffer.append(": ");
            metadataBuffer.append(val);
            metadataBuffer.append("\n");
        }
    }

    String name = md.get(Metadata.RESOURCE_NAME_KEY);
    if (name != null && name.length() > 0) {
        setTitle("Apache Tika: " + name);
    } else {
        setTitle("Apache Tika: unnamed document");
    }

    setText(metadata, metadataBuffer.toString());
    setText(xml, xmlBuffer.toString());
    setText(text, textBuffer.toString());
    setText(textMain, textMainBuffer.toString());
    setText(html, htmlBuffer.toString());

    if (!input.markSupported()) {
        setText(json, "InputStream does not support mark/reset for Recursive Parsing");
        layout.show(cards, "metadata");
        return;
    }

    boolean isReset = false;
    try {
        input.reset();
        isReset = true;
    } catch (IOException e) {
        setText(json, "Error during stream reset.\n" +
                "There's a limit of " + MAX_MARK + " bytes for this type of processing in the GUI.\n" +
                "Try the app with command line argument of -J.");
    }

    if (isReset) {
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
        wrapper.parse(input, null, new Metadata(), new ParseContext());
        StringWriter jsonBuffer = new StringWriter();
        JsonMetadataList.setPrettyPrinting(true);
        JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);
        setText(json, jsonBuffer.toString());
    }

    layout.show(cards, "metadata");
}
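The mark/reset logic above is what lets the GUI parse the same stream twice: once for the HTML/text/XML views and again, via RecursiveParserWrapper, for the JSON view. A stand-alone sketch of that pattern, assuming a hypothetical sample.pdf and a hypothetical MAX_MARK cap (the real constant lives elsewhere in TikaGUI):

import java.io.InputStream;
import java.nio.file.Paths;
import org.apache.tika.io.TikaInputStream;

public class MarkResetSketch {
    private static final int MAX_MARK = 20 * 1024 * 1024;   // hypothetical cap for non-file streams

    public static void main(String[] args) throws Exception {
        try (TikaInputStream input = TikaInputStream.get(Paths.get("sample.pdf"))) {
            // A file-backed TikaInputStream knows its full length, so the whole
            // stream can be marked and later rewound for a second parse pass.
            int mark = input.hasFile() ? (int) input.getLength() : MAX_MARK;
            input.mark(mark);

            consume(input);   // first pass (stand-in for parser.parse above)

            input.reset();    // rewind for a second pass, e.g. RecursiveParserWrapper
            consume(input);
        }
    }

    private static void consume(InputStream in) throws Exception {
        byte[] buffer = new byte[8192];
        while (in.read(buffer) != -1) {
            // stand-in for real parsing work
        }
    }
}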
Use of org.apache.tika.io.TikaInputStream in project tika by apache.
The class OfficeParser, method parse.
/**
 * Extracts properties and text from an MS Document input stream
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    configure(context);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    final DirectoryNode root;
    TikaInputStream tstream = TikaInputStream.cast(stream);
    NPOIFSFileSystem mustCloseFs = null;
    try {
        if (tstream == null) {
            mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
            root = mustCloseFs.getRoot();
        } else {
            final Object container = tstream.getOpenContainer();
            if (container instanceof NPOIFSFileSystem) {
                root = ((NPOIFSFileSystem) container).getRoot();
            } else if (container instanceof DirectoryNode) {
                root = (DirectoryNode) container;
            } else {
                NPOIFSFileSystem fs = null;
                if (tstream.hasFile()) {
                    fs = new NPOIFSFileSystem(tstream.getFile(), true);
                } else {
                    fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
                }
                //tstream will close the fs, no need to close this below
                tstream.setOpenContainer(fs);
                root = fs.getRoot();
            }
        }
        parse(root, context, metadata, xhtml);

        OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class);
        if (officeParserConfig.getExtractMacros()) {
            //now try to get macros
            extractMacros(root.getNFileSystem(), xhtml,
                    EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
        }
    } finally {
        IOUtils.closeQuietly(mustCloseFs);
    }
    xhtml.endDocument();
}
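The branch on getOpenContainer() lets a caller that has already opened the OLE2 filesystem (a container extractor, for example) hand it to the parser so the document is not opened twice. A minimal sketch of that hand-off, assuming a hypothetical report.doc; note that once setOpenContainer(fs) is called, closing the TikaInputStream also closes the filesystem, as the comment in the parser points out:

import java.io.File;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.sax.BodyContentHandler;

public class ReuseOpenContainer {
    public static void main(String[] args) throws Exception {
        File f = new File("report.doc");                       // hypothetical OLE2 document
        NPOIFSFileSystem fs = new NPOIFSFileSystem(f, true);   // open once, read-only

        ParseContext context = new ParseContext();
        OfficeParserConfig config = new OfficeParserConfig();
        config.setExtractMacros(true);                         // exercise the macro branch above
        context.set(OfficeParserConfig.class, config);

        BodyContentHandler handler = new BodyContentHandler();
        try (TikaInputStream stream = TikaInputStream.get(f.toPath())) {
            // The parser's getOpenContainer() check reuses this instead of reopening the file;
            // closing the TikaInputStream also closes the filesystem it was handed.
            stream.setOpenContainer(fs);
            new OfficeParser().parse(stream, handler, new Metadata(), context);
        }
        System.out.println(handler.toString());
    }
}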