use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class OpenDocumentParser method parse.
/**
 * Parses a zip-based document, reading it through a {@link ZipFile} when
 * one is available and through a {@link ZipInputStream} otherwise.
 * <p>
 * A {@code ZipFile} is used when the stream is a {@link TikaInputStream}
 * that either already exposes one as its open container or is backed by a
 * file. Whichever zip resource is used is closed before this method
 * returns, including a container obtained from the stream.
 * <p>
 * The end-of-document event is held back while the zip is processed and
 * only forwarded at the very end, and only if it was requested.
 *
 * @param stream      the document stream
 * @param baseHandler receiver of the XHTML SAX events
 * @param metadata    document metadata
 * @param context     parse context handed on to the zip handling code
 * @throws IOException   declared by the signature
 * @throws SAXException  declared by the signature
 * @throws TikaException declared by the signature
 */
public void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Prefer a ZipFile: reuse one the stream already has open, or open
    // one over the stream's backing file.
    ZipFile archive = null;
    if (stream instanceof TikaInputStream) {
        TikaInputStream tikaStream = (TikaInputStream) stream;
        Object openContainer = tikaStream.getOpenContainer();
        if (openContainer instanceof ZipFile) {
            archive = (ZipFile) openContainer;
        } else if (tikaStream.hasFile()) {
            archive = new ZipFile(tikaStream.getFile());
        }
    }
    // No ZipFile could be obtained: read the zip entries from the stream itself
    ZipInputStream sequential = (archive == null) ? new ZipInputStream(stream) : null;

    // We cannot know whether metadata or content comes first in the zip,
    // so the endDocument call is intercepted until everything is processed.
    EndDocumentShieldingContentHandler shielded =
            new EndDocumentShieldingContentHandler(new XHTMLContentHandler(baseHandler, metadata));

    if (archive == null) {
        try {
            handleZipStream(sequential, metadata, context, shielded);
        } finally {
            // An exception from close() is not swallowed here
            sequential.close();
        }
    } else {
        try {
            handleZipFile(archive, metadata, context, shielded);
        } finally {
            // An exception from close() is not swallowed here
            archive.close();
        }
    }

    // Forward the deferred end-of-document event, if one was requested
    if (shielded.getEndDocumentWasCalled()) {
        shielded.reallyEndDocument();
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class PooledTimeSeriesParser method parse.
/**
 * Runs the pooled time series computation on the document and reports the
 * results as XHTML SAX events and metadata.
 * <p>
 * If the tool is not available, a warning is logged and the method returns
 * without emitting any events. Otherwise the stream is made available as a
 * file, {@code computePoT} is run on it, and the two result files
 * {@code <input>.of.txt} and {@code <input>.hog.txt} next to that file are
 * read: first their headers (under the {@code "of"} and {@code "og"}
 * prefixes), then their contents as the "Histogram of Optical Flows (HOF)"
 * and "Histogram of Oriented Gradients (HOG)" sections.
 * <p>
 * Afterwards the document is also passed to the regular video metadata
 * parser. Temporary resources created here are disposed before returning.
 *
 * @param stream the document stream (input)
 * @param handler handler for the XHTML SAX events (output)
 * @param metadata document metadata (input and output)
 * @param context parse context
 * @throws IOException if the document stream could not be read
 * @throws SAXException if the SAX events could not be processed
 * @throws TikaException if the document could not be parsed
 * @since Apache Tika 0.5
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    if (!isAvailable) {
        LOG.warn("PooledTimeSeries not installed!");
        return;
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
        File input = tikaStream.getFile();
        // Called for its effect only: the result files are read below,
        // the returned command output is not needed.
        computePoT(input);
        String outputBase = input.getAbsolutePath();
        try (InputStream ofStream = new FileInputStream(new File(outputBase + ".of.txt"));
             InputStream ogStream = new FileInputStream(new File(outputBase + ".hog.txt"))) {
            // Headers are consumed first; the same streams are then
            // handed to doExtract for the remaining content.
            extractHeaderOutput(ofStream, metadata, "of");
            extractHeaderOutput(ogStream, metadata, "og");
            xhtml.startDocument();
            // NOTE(review): the *_frames / *_vecSize keys are presumably set
            // by extractHeaderOutput above - confirm against that helper.
            doExtract(ofStream, xhtml, "Histogram of Optical Flows (HOF)", metadata.get("of_frames"), metadata.get("of_vecSize"));
            doExtract(ogStream, xhtml, "Histogram of Oriented Gradients (HOG)", metadata.get("og_frames"), metadata.get("og_vecSize"));
            xhtml.endDocument();
        }
        // Temporary workaround for TIKA-1445 - until we can specify
        // composite parsers with strategies (eg Composite, Try In Turn),
        // always send the image onwards to the regular parser to have
        // the metadata for them extracted as well
        _TMP_VIDEO_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
    } finally {
        tmp.dispose();
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class TIAParsingExample method tikaInputStreamGetFile.
/**
 * Demonstrates obtaining a {@link File} from a {@link TikaInputStream}.
 * <p>
 * The named file is opened as a stream, that stream is wrapped via
 * {@code TikaInputStream.get(InputStream)}, and the file reported by the
 * wrapper is returned. The opened stream is closed before returning.
 *
 * @param filename path of the file to open
 * @return the file reported by the {@code TikaInputStream}
 * @throws Exception declared by the signature
 */
public static File tikaInputStreamGetFile(String filename) throws Exception {
    File source = new File(filename);
    try (InputStream opened = TikaInputStream.get(source)) {
        return TikaInputStream.get(opened).getFile();
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class FontParsersTest method testAdobeFontMetricParsing.
/**
 * Parses the sample AFM file with an {@link AutoDetectParser} and checks
 * the reported content type, the font metadata and the extracted comments.
 */
@Test
public void testAdobeFontMetricParsing() throws Exception {
    // No parser is chosen explicitly - detection has to pick the right one
    Parser autoDetect = new AutoDetectParser();
    Metadata fontMetadata = new Metadata();
    ContentHandler body = new BodyContentHandler();

    try (TikaInputStream afm = TikaInputStream.get(FontParsersTest.class.getResource("/test-documents/testAFM.afm"))) {
        autoDetect.parse(afm, body, fontMetadata, new ParseContext());
    }

    // Detected type and general document properties
    assertEquals("application/x-font-adobe-metric", fontMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals("TestFullName", fontMetadata.get(TikaCoreProperties.TITLE));
    assertEquals("Fri Jul 15 17:50:51 2011", fontMetadata.get(Metadata.CREATION_DATE));

    // Font specific properties
    assertEquals("TestFontName", fontMetadata.get(MET_FONT_NAME));
    assertEquals("TestFullName", fontMetadata.get(MET_FONT_FULL_NAME));
    assertEquals("TestSymbol", fontMetadata.get(MET_FONT_FAMILY_NAME));
    assertEquals("Medium", fontMetadata.get(MET_FONT_WEIGHT));
    assertEquals("001.008", fontMetadata.get(MET_FONT_VERSION));

    // The comments of the file must show up in the text output
    String extractedText = body.toString();
    assertContains("Comments", extractedText);
    assertContains("This is a comment in a sample file", extractedText);
    assertContains("UniqueID 12345", extractedText);
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class ODFParserTest method testFromFile.
/**
 * Parses the sample ODT document through a {@link TikaInputStream} that
 * reports having a file, then checks the content type and extracted text.
 */
@Test
public void testFromFile() throws Exception {
    try (TikaInputStream odt = TikaInputStream.get(this.getClass().getResource("/test-documents/testODFwithOOo3.odt"))) {
        // Precondition for this test: the stream must report a file
        assertEquals(true, odt.hasFile());

        Metadata odtMetadata = new Metadata();
        ContentHandler body = new BodyContentHandler();
        new OpenDocumentParser().parse(odt, body, odtMetadata, new ParseContext());

        assertEquals("application/vnd.oasis.opendocument.text", odtMetadata.get(Metadata.CONTENT_TYPE));
        String extractedText = body.toString();
        assertContains("Tika is part of the Lucene project.", extractedText);
    }
}
Aggregations