use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class RTFParserTest method testBinControlWord.
// TIKA-782
@Test
public void testBinControlWord() throws Exception {
    // Handler whose "bytes" list is inspected once extraction has finished.
    ByteCopyingHandler copier = new ByteCopyingHandler();
    try (TikaInputStream rtf = TikaInputStream.get(
            getResourceAsStream("/test-documents/testBinControlWord.rtf"))) {
        ContainerExtractor extractor = new ParserContainerExtractor();
        assertEquals(true, extractor.isSupported(rtf));
        extractor.extract(rtf, extractor, copier);
    }

    // Exactly one embedded payload is expected, and it must be ten bytes long.
    assertEquals(1, copier.bytes.size());
    byte[] payload = copier.bytes.get(0);
    assertEquals(10, payload.length);

    // 125 is the ASCII code of '}'
    assertEquals(125, (int) payload[4]);
    // make sure that at least the last value is correct
    assertEquals(-1, (int) payload[9]);
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class ExternalEmbedder method embed.
/**
 * Executes the configured external command against the given document and
 * copies the command's result to the given output stream.
 * <p>
 * The command line is built from the configured {@code command} segments:
 * <ul>
 * <li>a segment containing {@link ExternalParser#INPUT_FILE_TOKEN} has the
 *     token replaced by the path of a file holding the input, and nothing
 *     is written to the process' standard input;</li>
 * <li>a segment containing {@link ExternalParser#OUTPUT_FILE_TOKEN} has the
 *     token replaced by the path of a temporary file, whose content is
 *     copied to {@code outputStream} once the process has finished;
 *     otherwise the process' standard output is used;</li>
 * <li>metadata arguments are only added if
 *     {@link #setMetadataCommandArguments(Map)} has been called to set
 *     arguments. They replace the metadata-arguments token, are serialized
 *     into the serialized-metadata token, or are appended to the end of
 *     the command when neither token is present.</li>
 * </ul>
 * The given {@code outputStream} is closed before this method returns.
 *
 * @param metadata     metadata from which the command's metadata arguments are built
 * @param inputStream  the document to hand to the external command
 * @param outputStream receives the output of the external command
 * @param context      parse context; not used by this method
 * @throws IOException   if starting the process or copying data fails
 * @throws TikaException if the command exits with a non-zero status
 */
public void embed(final Metadata metadata, final InputStream inputStream, final OutputStream outputStream, final ParseContext context) throws IOException, TikaException {
    boolean inputToStdIn = true;
    boolean outputFromStdOut = true;
    boolean hasMetadataCommandArguments = (metadataCommandArguments != null && !metadataCommandArguments.isEmpty());
    boolean serializeMetadataCommandArgumentsToken = false;
    boolean replacedMetadataCommandArgumentsToken = false;
    TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
    File tempOutputFile = null;
    List<String> commandMetadataSegments = null;
    if (hasMetadataCommandArguments) {
        commandMetadataSegments = getCommandMetadataSegments(metadata);
    }
    // Build our command
    List<String> origCmd = Arrays.asList(command);
    List<String> cmd = new ArrayList<String>();
    for (String commandSegment : origCmd) {
        if (commandSegment.indexOf(ExternalParser.INPUT_FILE_TOKEN) != -1) {
            commandSegment = commandSegment.replace(ExternalParser.INPUT_FILE_TOKEN, tikaInputStream.getFile().toString());
            inputToStdIn = false;
        }
        if (commandSegment.indexOf(ExternalParser.OUTPUT_FILE_TOKEN) != -1) {
            tempOutputFile = tmp.createTemporaryFile();
            commandSegment = commandSegment.replace(ExternalParser.OUTPUT_FILE_TOKEN, tempOutputFile.toString());
            outputFromStdOut = false;
        }
        if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
            serializeMetadataCommandArgumentsToken = true;
        }
        if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_TOKEN) != -1) {
            // The token segment itself is dropped; the metadata arguments take its place.
            if (hasMetadataCommandArguments) {
                cmd.addAll(commandMetadataSegments);
            }
            replacedMetadataCommandArgumentsToken = true;
        } else {
            cmd.add(commandSegment);
        }
    }
    if (hasMetadataCommandArguments) {
        if (serializeMetadataCommandArgumentsToken) {
            // Find all metadata tokens and replace with encapsulated metadata
            int i = 0;
            for (String commandSegment : cmd) {
                if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
                    commandSegment = commandSegment.replace(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, serializeMetadata(commandMetadataSegments));
                    // set() is not a structural modification, so it is safe during iteration
                    cmd.set(i, commandSegment);
                }
                i++;
            }
        } else if (!replacedMetadataCommandArgumentsToken) {
            // Tack metadata onto the end of the cmd as arguments
            cmd.addAll(commandMetadataSegments);
        }
    }
    // Execute
    Process process;
    if (cmd.size() == 1) {
        // A single segment goes through exec(String), which tokenizes it on whitespace.
        process = Runtime.getRuntime().exec(cmd.get(0));
    } else {
        process = Runtime.getRuntime().exec(cmd.toArray(new String[] {}));
    }
    ByteArrayOutputStream stdErrOutputStream = new ByteArrayOutputStream();
    // Remembered rather than restored immediately, so that the remaining
    // copy/cleanup I/O below is not disturbed by a pending interrupt.
    boolean interrupted = false;
    try {
        sendStdErrToOutputStream(process, stdErrOutputStream);
        if (inputToStdIn) {
            sendInputStreamToStdIn(inputStream, process);
        } else {
            // We're not writing to std in this case so close
            process.getOutputStream().close();
        }
        if (outputFromStdOut) {
            sendStdOutToOutputStream(process, outputStream);
        } else {
            // NOTE(review): disposing tmp before the process has finished looks
            // odd, since tmp created tempOutputFile above - kept as-is; confirm intent.
            tmp.dispose();
            try {
                process.waitFor();
            } catch (InterruptedException e) {
                interrupted = true;
            }
            // The command is finished, read the output file into the given output stream.
            // try-with-resources makes sure the file handle is released even if the copy fails.
            try (InputStream tempOutputFileInputStream = TikaInputStream.get(tempOutputFile)) {
                IOUtils.copy(tempOutputFileInputStream, outputStream);
            }
        }
    } finally {
        if (outputFromStdOut) {
            try {
                process.waitFor();
            } catch (InterruptedException e) {
                interrupted = true;
            }
        } else {
            try {
                // Clean up temp output files
                tempOutputFile.delete();
            } catch (Exception e) {
                // best-effort cleanup; a failure to delete must not mask the real outcome
            }
        }
        if (!inputToStdIn) {
            // Close input file (and delete if created by up TemporaryResources.createTemporaryFile)
            IOUtils.closeQuietly(tikaInputStream);
        }
        IOUtils.closeQuietly(outputStream);
        IOUtils.closeQuietly(stdErrOutputStream);
        if (interrupted) {
            // Preserve the interrupt for callers instead of silently swallowing it.
            Thread.currentThread().interrupt();
        }
        if (process.exitValue() != 0) {
            throw new TikaException("There was an error executing the command line" + "\nExecutable Command:\n\n" + cmd + "\nExecutable Error:\n\n" + stdErrOutputStream.toString(UTF_8.name()));
        }
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class ParserDecorator method withFallbacks.
/**
 * Combines the given parsers into a single virtual parser. On each parse
 * call the parsers are attempted in iteration order, and the first one that
 * completes without throwing ends the call.
 * TODO Is this the right name?
 * TODO Is this the right place to put this? Should it be in CompositeParser? Elsewhere?
 * TODO Should we reset the Metadata if we try another parser?
 * TODO Should we reset the ContentHandler if we try another parser?
 * TODO Should we log/report failures anywhere?
 *
 * @param parsers the parsers to try, in preference order
 * @param types   the media types the returned parser reports as supported
 * @return a decorator around the first parser (or the empty parser when
 *         {@code parsers} is empty) that falls back through {@code parsers}
 * @deprecated Do not use until the TODOs are resolved, see TIKA-1509
 */
public static final Parser withFallbacks(final Collection<? extends Parser> parsers, final Set<MediaType> types) {
    // The decorator needs a wrapped parser: the first candidate if there is one.
    final Parser wrapped = parsers.isEmpty() ? EmptyParser.INSTANCE : parsers.iterator().next();

    return new ParserDecorator(wrapped) {
        private static final long serialVersionUID = 1625187131782069683L;

        @Override
        public String getDecorationName() {
            return "With Fallback";
        }

        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            return types;
        }

        @Override
        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
            // A TikaInputStream is required so the input can be re-read after a failed attempt.
            // Temporary resources are only created (and later disposed) here when the
            // caller did not already supply a TikaInputStream.
            final boolean ownsResources = !TikaInputStream.isTikaInputStream(stream);
            final TemporaryResources resources = ownsResources ? new TemporaryResources() : null;
            try {
                final TikaInputStream reusable = TikaInputStream.get(stream, resources);
                reusable.getFile();

                // Attempt every parser in order until one succeeds
                for (Parser candidate : parsers) {
                    reusable.mark(-1);
                    try {
                        candidate.parse(reusable, handler, metadata, context);
                        return;
                    } catch (Exception ignored) {
                        // TODO How to log / record this failure?
                    }
                    // Rewind so the next parser, if any, starts from the beginning
                    reusable.reset();
                }
            } finally {
                if (ownsResources) {
                    resources.dispose();
                }
            }
        }
    };
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class CompositeParser method parse.
/**
 * Hands the parse request over to the component parser selected for the
 * given metadata and context.
 * <p>
 * Any {@link RuntimeException} thrown by that parser, as well as any
 * {@link IOException} or {@link SAXException} that did not originate from
 * the given input stream or content handler, is rethrown wrapped in a
 * {@link TikaException} to better honor the {@link Parser} contract.
 * The name of the parser class used is added to the metadata under
 * {@code X-Parsed-By}.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final Parser delegate = getParser(metadata, context);
    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream input = TikaInputStream.get(stream, resources);
        final TaggedContentHandler output = (handler == null) ? null : new TaggedContentHandler(handler);

        // For a decorated parser, report the class of the parser it wraps.
        final Parser reported = (delegate instanceof ParserDecorator)
                ? ((ParserDecorator) delegate).getWrappedParser()
                : delegate;
        metadata.add("X-Parsed-By", reported.getClass().getName());

        try {
            delegate.parse(input, output, metadata, context);
        } catch (RuntimeException failure) {
            throw new TikaException("Unexpected RuntimeException from " + delegate, failure);
        } catch (IOException failure) {
            // Rethrows as-is when the tagged input stream was the cause
            input.throwIfCauseOf(failure);
            throw new TikaException("TIKA-198: Illegal IOException from " + delegate, failure);
        } catch (SAXException failure) {
            // Rethrows as-is when the tagged content handler was the cause
            if (output != null) {
                output.throwIfCauseOf(failure);
            }
            throw new TikaException("TIKA-237: Illegal SAXException from " + delegate, failure);
        }
    } finally {
        resources.dispose();
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class NetworkParser method parse.
/**
 * Wraps the given stream in a {@link TikaInputStream} tied to a fresh set of
 * temporary resources and forwards the call to {@code parse} with that
 * wrapped stream. The temporary resources are disposed once the call
 * returns or throws.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final TemporaryResources resources = new TemporaryResources();
    try {
        // The argument's static type is TikaInputStream, exactly as with a local variable.
        parse(TikaInputStream.get(stream, resources), handler, metadata, context);
    } finally {
        resources.dispose();
    }
}
Aggregations