use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class ParsingExample method extractEmbeddedDocumentsExample.
/**
 * Extracts the documents embedded in the bundled
 * {@code test_recursive_embedded.docx} resource into the given directory
 * and then lists that directory.
 * <p>
 * Note that the returned list is a plain listing of {@code outputPath}
 * taken after extraction, so it contains every entry present in that
 * directory, not only entries written by this call.
 *
 * @param outputPath directory the embedded files are written to
 * @return the entries found in {@code outputPath} after extraction
 * @throws IOException   if reading the resource or listing the directory fails
 * @throws SAXException  declared for errors propagated from the extraction step
 * @throws TikaException declared for errors propagated from the extraction step
 */
public List<Path> extractEmbeddedDocumentsExample(Path outputPath) throws IOException, SAXException, TikaException {
    List<Path> extractedFiles = new ArrayList<>();
    ExtractEmbeddedFiles extractor = new ExtractEmbeddedFiles();
    try (TikaInputStream docx = TikaInputStream.get(ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx"))) {
        extractor.extract(docx, outputPath);
        // The listing is taken while the source document is still open, exactly
        // after extraction has finished.
        try (DirectoryStream<Path> listing = Files.newDirectoryStream(outputPath)) {
            listing.forEach(extractedFiles::add);
        }
    }
    return extractedFiles;
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class FontParsersTest method testTTFParsing.
/**
 * Parses a TrueType font through {@code AutoDetectParser} and checks the
 * metadata that is reported, the font properties that are left unset, and
 * that no text content is emitted.
 */
@Test
public void testTTFParsing() throws Exception {
    // No explicit font parser is configured: detection should pick the right one.
    Parser autoDetecting = new AutoDetectParser();
    ContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext parseContext = new ParseContext();
    try (TikaInputStream ttf = TikaInputStream.get(FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) {
        autoDetecting.parse(ttf, body, metadata, parseContext);
    }

    // Detected type and the general document properties
    assertEquals("application/x-font-ttf", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("Open Sans Bold", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("2010-12-30T11:04:00Z", metadata.get(Metadata.CREATION_DATE));
    assertEquals("2010-12-30T11:04:00Z", metadata.get(TikaCoreProperties.CREATED));
    assertEquals("2011-05-05T12:37:53Z", metadata.get(TikaCoreProperties.MODIFIED));

    // Font-specific properties
    assertEquals("Open Sans Bold", metadata.get(MET_FONT_NAME));
    assertEquals("Open Sans", metadata.get(MET_FONT_FAMILY_NAME));
    assertEquals("Bold", metadata.get(MET_FONT_SUB_FAMILY_NAME));
    assertEquals("OpenSans-Bold", metadata.get(MET_PS_NAME));

    // Only the first nine characters of these free-text fields are compared
    String copyright = metadata.get("Copyright");
    assertEquals("Digitized", copyright.substring(0, 9));
    String trademark = metadata.get("Trademark");
    assertEquals("Open Sans", trademark.substring(0, 9));

    // Properties that are not extracted for this font
    assertEquals(null, metadata.get(MET_FONT_FULL_NAME));
    assertEquals(null, metadata.get(MET_FONT_WEIGHT));
    assertEquals(null, metadata.get(MET_FONT_VERSION));

    // The parser currently produces no text content
    assertEquals("", body.toString());
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class OldExcelParserTest method testMetadata.
// Disabled, until we can get the POI code to tell us the version
/**
 * Parses the test file with {@code OldExcelParser} and checks that only the
 * content type is reported as metadata.
 */
@Test
@Ignore
public void testMetadata() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    OldExcelParser parser = new OldExcelParser();
    // Close the stream even when parsing throws, so the test does not leak
    // the underlying resource.
    try (TikaInputStream stream = getTestFile(file)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    // We can get the content type
    assertEquals("application/vnd.ms-excel.sheet.4", metadata.get(Metadata.CONTENT_TYPE));
    // But no other metadata
    assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
    assertEquals(null, metadata.get(Metadata.SUBJECT));
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class TNEFParserTest method testMetadata.
/**
 * Parses the test file with {@code TNEFParser} and checks that the message
 * subject is reported both as the title and as the subject.
 */
@Test
public void testMetadata() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    TNEFParser tnef = new TNEFParser();
    // Close the stream even when parsing throws, so the test does not leak
    // the underlying resource.
    try (TikaInputStream stream = getTestFile(file)) {
        tnef.parse(stream, handler, metadata, new ParseContext());
    }
    assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class FSBatchProcessCLI method getConfigInputStream.
/**
 * Opens the batch configuration to use.
 * <p>
 * Resolution order: the config file obtained from {@code args} via
 * {@code getConfigFile}; otherwise the classpath resource
 * {@code /tika-app-batch-config.xml} if present; otherwise the resource
 * {@code default-tika-batch-config.xml} resolved relative to this class.
 *
 * @param args       command line arguments that may name a config file
 * @param logDefault whether to log that no config file was specified and a
 *                   default is being used
 * @return a stream over the selected configuration; the caller is
 *         responsible for closing it
 * @throws IOException if the selected configuration cannot be opened
 */
private TikaInputStream getConfigInputStream(String[] args, boolean logDefault) throws IOException {
    Path requestedConfig = getConfigFile(args);
    if (requestedConfig != null) {
        // An explicitly specified file that cannot be found is meant to fail
        // with an IOException rather than silently fall back to a default.
        return TikaInputStream.get(requestedConfig);
    }

    if (logDefault) {
        LOG.info("No config file set via -bc, relying on tika-app-batch-config.xml or default-tika-batch-config.xml");
    }

    // Prefer an app-level config on the classpath; otherwise use the bundled default.
    URL appConfig = FSBatchProcessCLI.class.getResource("/tika-app-batch-config.xml");
    String resourceName = (appConfig != null)
            ? "/tika-app-batch-config.xml"
            : "default-tika-batch-config.xml";
    return TikaInputStream.get(FSBatchProcessCLI.class.getResourceAsStream(resourceName));
}
Aggregations