Search in sources :

Example 16 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class ParsingExample method extractEmbeddedDocumentsExample.

/**
 * Runs {@code ExtractEmbeddedFiles} over the bundled
 * {@code test_recursive_embedded.docx} resource and reports what ended up
 * in the output directory.
 *
 * @param outputPath directory handed to the extractor as its output location
 * @return every entry found directly inside {@code outputPath} once extraction has finished
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public List<Path> extractEmbeddedDocumentsExample(Path outputPath) throws IOException, SAXException, TikaException {
    List<Path> extracted = new ArrayList<>();
    ExtractEmbeddedFiles extractor = new ExtractEmbeddedFiles();
    try (TikaInputStream docx = TikaInputStream.get(ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx"))) {
        extractor.extract(docx, outputPath);
        // List the directory only after extraction has run, while the source stream is still open.
        try (DirectoryStream<Path> listing = Files.newDirectoryStream(outputPath)) {
            for (Path child : listing) {
                extracted.add(child);
            }
        }
    }
    return extracted;
}
Also used : Path(java.nio.file.Path) ArrayList(java.util.ArrayList) TikaInputStream(org.apache.tika.io.TikaInputStream)

Example 17 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class FontParsersTest method testTTFParsing.

/**
 * Parses the TrueType test font through {@code AutoDetectParser} and checks
 * the content type, the title/date metadata, the font-specific metadata
 * fields, and that no body text is produced.
 */
@Test
public void testTTFParsing() throws Exception {
    Metadata fontMetadata = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    // Should auto-detect!
    Parser autoDetect = new AutoDetectParser();
    try (TikaInputStream ttf = TikaInputStream.get(FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) {
        autoDetect.parse(ttf, bodyHandler, fontMetadata, new ParseContext());
    }

    // Type, title and dates
    assertEquals("application/x-font-ttf", fontMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals("Open Sans Bold", fontMetadata.get(TikaCoreProperties.TITLE));
    assertEquals("2010-12-30T11:04:00Z", fontMetadata.get(Metadata.CREATION_DATE));
    assertEquals("2010-12-30T11:04:00Z", fontMetadata.get(TikaCoreProperties.CREATED));
    assertEquals("2011-05-05T12:37:53Z", fontMetadata.get(TikaCoreProperties.MODIFIED));

    // Font naming fields
    assertEquals("Open Sans Bold", fontMetadata.get(MET_FONT_NAME));
    assertEquals("Open Sans", fontMetadata.get(MET_FONT_FAMILY_NAME));
    assertEquals("Bold", fontMetadata.get(MET_FONT_SUB_FAMILY_NAME));
    assertEquals("OpenSans-Bold", fontMetadata.get(MET_PS_NAME));

    // Only the leading nine characters of these free-text fields are compared
    assertEquals("Digitized", fontMetadata.get("Copyright").substring(0, 9));
    assertEquals("Open Sans", fontMetadata.get("Trademark").substring(0, 9));

    // Not extracted
    assertEquals(null, fontMetadata.get(MET_FONT_FULL_NAME));
    assertEquals(null, fontMetadata.get(MET_FONT_WEIGHT));
    assertEquals(null, fontMetadata.get(MET_FONT_VERSION));

    // Currently, the parser doesn't extract any contents
    assertEquals("", bodyHandler.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) AdobeFontMetricParser(org.apache.tika.parser.font.AdobeFontMetricParser) Test(org.junit.Test)

Example 18 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class OldExcelParserTest method testMetadata.

// Disabled, until we can get the POI code to tell us the version
/**
 * Parses the old-format Excel test file with {@code OldExcelParser} and
 * checks that only the content type is reported as metadata.
 */
@Test
@Ignore
public void testMetadata() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    OldExcelParser parser = new OldExcelParser();
    // try-with-resources so the test stream is closed even when parse() throws
    try (TikaInputStream stream = getTestFile(file)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    // We can get the content type
    assertEquals("application/vnd.ms-excel.sheet.4", metadata.get(Metadata.CONTENT_TYPE));
    // But no other metadata
    assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
    assertEquals(null, metadata.get(Metadata.SUBJECT));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Ignore(org.junit.Ignore) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 19 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class TNEFParserTest method testMetadata.

/**
 * Parses the TNEF test file with {@code TNEFParser} and checks that the
 * message subject is exposed as both the title and the subject metadata.
 */
@Test
public void testMetadata() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    TNEFParser tnef = new TNEFParser();
    // try-with-resources so the test stream is closed even when parse() throws
    try (TikaInputStream stream = getTestFile(file)) {
        tnef.parse(stream, handler, metadata, new ParseContext());
    }
    assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 20 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class FSBatchProcessCLI method getConfigInputStream.

/**
 * Opens the batch configuration to use for this run.
 * <p>
 * Lookup order: the config file resolved from {@code args} by
 * {@code getConfigFile}; otherwise the {@code /tika-app-batch-config.xml}
 * classpath resource if present; otherwise the
 * {@code default-tika-batch-config.xml} resource.
 *
 * @param args command-line arguments passed on to {@code getConfigFile}
 * @param logDefault whether to log that no explicit config file was supplied
 * @return a stream over the selected configuration; the caller owns it
 * @throws IOException if the stream cannot be opened
 */
private TikaInputStream getConfigInputStream(String[] args, boolean logDefault) throws IOException {
    Path explicitConfig = getConfigFile(args);
    if (explicitConfig != null) {
        //this will throw IOException if it can't find a specified config file
        //better to throw an exception than silently back off to default.
        return TikaInputStream.get(explicitConfig);
    }

    if (logDefault) {
        LOG.info("No config file set via -bc, relying on tika-app-batch-config.xml or default-tika-batch-config.xml");
    }

    //test to see if there's a tika-app-batch-config.xml on the path
    URL appConfig = FSBatchProcessCLI.class.getResource("/tika-app-batch-config.xml");
    String resourceName = (appConfig != null)
            ? "/tika-app-batch-config.xml"
            : "default-tika-batch-config.xml";
    return TikaInputStream.get(FSBatchProcessCLI.class.getResourceAsStream(resourceName));
}
Also used : Path(java.nio.file.Path) TikaInputStream(org.apache.tika.io.TikaInputStream) URL(java.net.URL)

Aggregations

TikaInputStream (org.apache.tika.io.TikaInputStream)100 Metadata (org.apache.tika.metadata.Metadata)40 TemporaryResources (org.apache.tika.io.TemporaryResources)28 IOException (java.io.IOException)27 TikaException (org.apache.tika.exception.TikaException)24 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)23 Test (org.junit.Test)20 InputStream (java.io.InputStream)19 File (java.io.File)15 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)15 ContentHandler (org.xml.sax.ContentHandler)14 TikaTest (org.apache.tika.TikaTest)13 MediaType (org.apache.tika.mime.MediaType)13 SAXException (org.xml.sax.SAXException)13 ParseContext (org.apache.tika.parser.ParseContext)12 ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor)8 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)6 NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem)6 EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException)6 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)6