Search in sources:

Example 51 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class PackageParser method parse.

/**
 * Opens {@code stream} as an archive and passes every non-directory entry to
 * {@code parseEntry}, writing the surrounding XHTML document to {@code handler}.
 *
 * <p>The caller's stream is never closed here (it is wrapped in a
 * {@link CloseShieldInputStream}); the archive stream and any temporary
 * resources created for the 7z fallback are always released, including when
 * opening the archive or starting the document fails.
 *
 * @param stream   the document stream; wrapped in a {@link BufferedInputStream}
 *                 when it does not support mark/reset
 * @param handler  receives the XHTML SAX events
 * @param metadata passed to the password provider, {@code updateMediaType} and
 *                 {@code parseEntry}
 * @param context  optionally supplies a {@link TikaConfig}, an
 *                 {@link ArchiveStreamFactory} and a {@link PasswordProvider}
 * @throws IOException   if reading the stream or closing resources fails
 * @throws SAXException  if the content handler rejects an event
 * @throws TikaException if the archive cannot be unpacked, uses an unknown
 *                       non-streaming format or an unsupported zip feature;
 *                       an {@link EncryptedDocumentException} is thrown when
 *                       the archive is encrypted and cannot be read
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //lazily load the MediaTypeRegistry at parse time
    //only want to call getDefaultConfig() once, and can't
    //load statically because of the ForkParser
    TikaConfig config = context.get(TikaConfig.class);
    MediaTypeRegistry mediaTypeRegistry;
    if (config != null) {
        mediaTypeRegistry = config.getMediaTypeRegistry();
    } else {
        if (bufferedMediaTypeRegistry == null) {
            //buffer this for next time.
            synchronized (lock) {
                //now that we're locked, check again
                if (bufferedMediaTypeRegistry == null) {
                    bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
                }
            }
        }
        mediaTypeRegistry = bufferedMediaTypeRegistry;
    }
    // Ensure that the stream supports the mark feature
    if (!stream.markSupported()) {
        stream = new BufferedInputStream(stream);
    }
    TemporaryResources tmp = new TemporaryResources();
    ArchiveInputStream ais = null;
    XHTMLContentHandler xhtml;
    // The outer try/finally owns all cleanup, so a failure at any point after
    // this line (opening the archive, the 7z file fallback, media type update,
    // startDocument, entry parsing) still releases ais and tmp.
    try {
        try {
            ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
            // At the end we want to close the archive stream to release
            // any associated resources, but the underlying document stream
            // should not be closed
            ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
        } catch (StreamingNotSupportedException sne) {
            // Most archive formats work on streams, but a few need files
            if (!sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
                throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
            }
            // Rework as a file, and wrap
            stream.reset();
            TikaInputStream tstream = TikaInputStream.get(stream, tmp);
            // Seven Zip supports passwords, was one given?
            String password = null;
            PasswordProvider provider = context.get(PasswordProvider.class);
            if (provider != null) {
                password = provider.getPassword(metadata);
            }
            SevenZFile sevenz;
            if (password == null) {
                sevenz = new SevenZFile(tstream.getFile());
            } else {
                sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
            }
            // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
            ais = new SevenZWrapper(sevenz);
        } catch (ArchiveException e) {
            throw new TikaException("Unable to unpack document stream", e);
        }
        updateMediaType(ais, mediaTypeRegistry, metadata);
        // Use the delegate parser to parse the contained document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        try {
            ArchiveEntry entry = ais.getNextEntry();
            while (entry != null) {
                if (!entry.isDirectory()) {
                    parseEntry(ais, entry, extractor, metadata, xhtml);
                }
                entry = ais.getNextEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            // If it's an encrypted document of unknown password, report as such
            if (zfe.getFeature() == Feature.ENCRYPTION) {
                throw new EncryptedDocumentException(zfe);
            }
            // Otherwise throw the exception
            throw new TikaException("UnsupportedZipFeature", zfe);
        } catch (PasswordRequiredException pre) {
            throw new EncryptedDocumentException(pre);
        }
    } finally {
        // ais is still null when opening the archive failed; tmp must be
        // released even if closing the archive stream itself throws.
        try {
            if (ais != null) {
                ais.close();
            }
        } finally {
            tmp.close();
        }
    }
    // As before, the document is only ended after the archive has been released.
    xhtml.endDocument();
}
Also used : StreamingNotSupportedException(org.apache.commons.compress.archivers.StreamingNotSupportedException) TikaException(org.apache.tika.exception.TikaException) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) TikaConfig(org.apache.tika.config.TikaConfig) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) MediaTypeRegistry(org.apache.tika.mime.MediaTypeRegistry) ZipArchiveEntry(org.apache.commons.compress.archivers.zip.ZipArchiveEntry) ArchiveEntry(org.apache.commons.compress.archivers.ArchiveEntry) PasswordRequiredException(org.apache.commons.compress.PasswordRequiredException) ArchiveException(org.apache.commons.compress.archivers.ArchiveException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) PasswordProvider(org.apache.tika.parser.PasswordProvider) UnsupportedZipFeatureException(org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException) ArchiveStreamFactory(org.apache.commons.compress.archivers.ArchiveStreamFactory) ArArchiveInputStream(org.apache.commons.compress.archivers.ar.ArArchiveInputStream) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) JarArchiveInputStream(org.apache.commons.compress.archivers.jar.JarArchiveInputStream) ArchiveInputStream(org.apache.commons.compress.archivers.ArchiveInputStream) CpioArchiveInputStream(org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream) ZipArchiveInputStream(org.apache.commons.compress.archivers.zip.ZipArchiveInputStream) DumpArchiveInputStream(org.apache.commons.compress.archivers.dump.DumpArchiveInputStream) SevenZFile(org.apache.commons.compress.archivers.sevenz.SevenZFile) BufferedInputStream(java.io.BufferedInputStream) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 52 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TensorflowRESTVideoRecogniser method getApiUri.

/**
 * Returns the API URI with an {@code ext} query parameter holding the file
 * extension registered for the metadata's {@code Content-Type}.
 *
 * <p>If the content type cannot be resolved to a MIME type, the failure is
 * logged together with its cause and the plain {@code apiUri} is returned.
 *
 * @param metadata metadata whose {@code Content-Type} entry is looked up
 * @return {@code apiUri} with the {@code ext} parameter added, or the
 *         unmodified {@code apiUri} when the lookup fails
 */
@Override
protected URI getApiUri(Metadata metadata) {
    TikaConfig config = TikaConfig.getDefaultConfig();
    //Find extension for video. It's required for OpenCv in InceptionAPI to decode video
    try {
        MimeType mimeType = config.getMimeRepository().forName(metadata.get("Content-Type"));
        String ext = mimeType.getExtension();
        return UriBuilder.fromUri(apiUri).queryParam("ext", ext).build();
    } catch (MimeTypeException e) {
        // Pass the exception along so the log shows why the lookup failed
        LOG.error("Can't find extension from metadata", e);
        return apiUri;
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) MimeTypeException(org.apache.tika.mime.MimeTypeException) MimeType(org.apache.tika.mime.MimeType)

Example 53 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TikaConfigTest method parserWithChildParsers.

/**
 * TIKA-1653 If one parser has child parsers, those child parsers shouldn't
 * show up at the top level as well
 */
@Test
public void parserWithChildParsers() throws Exception {
    // No catch-and-fail wrapper: the method already declares "throws Exception",
    // so an unexpected TikaException fails the test with its full stack trace.
    TikaConfig config = getConfig("TIKA-1653-norepeat.xml");
    CompositeParser cp = (CompositeParser) config.getParser();
    List<Parser> parsers = cp.getAllComponentParsers();
    Parser p;
    // Just 2 top level parsers
    assertEquals(2, parsers.size());
    // Should have a CompositeParser with 2 child ones,
    //  and a wrapped empty parser
    p = parsers.get(0);
    assertTrue(p.toString(), p instanceof CompositeParser);
    assertEquals(2, ((CompositeParser) p).getAllComponentParsers().size());
    p = parsers.get(1);
    assertTrue(p.toString(), p instanceof ParserDecorator);
    assertEquals(EmptyParser.class, ((ParserDecorator) p).getWrappedParser().getClass());
    assertEquals("hello/world", p.getSupportedTypes(null).iterator().next().toString());
}
Also used : TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) ErrorParser(org.apache.tika.parser.ErrorParser) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Example 54 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TikaConfigTest method testTikaExecutorServiceFromConfig.

/**
 * Loads the TIKA-1762 executor configuration and checks that the configured
 * executor class and its core/maximum pool sizes are the ones exposed by
 * {@link TikaConfig#getExecutorService()}.
 */
@Test
public void testTikaExecutorServiceFromConfig() throws Exception {
    // The configuration file is resolved relative to the test class
    TikaConfig tikaConfig = new TikaConfig(TikaConfigTest.class.getResource("TIKA-1762-executors.xml"));
    ThreadPoolExecutor pool = (ThreadPoolExecutor) tikaConfig.getExecutorService();

    boolean isDummyExecutor = pool instanceof DummyExecutor;
    assertTrue("Should use Dummy Executor", isDummyExecutor);

    // Pool sizes checked against the literal values 3 and 10
    assertEquals("Should have configured Core Threads", 3, pool.getCorePoolSize());
    assertEquals("Should have configured Max Threads", 10, pool.getMaximumPoolSize());
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) DummyExecutor(org.apache.tika.config.DummyExecutor) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) URL(java.net.URL) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Example 55 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TikaConfigTest method ensureClassLoaderUsedEverywhere.

/**
 * TIKA-1145 A ClassLoader set on the TikaConfig should be the one used
 * for loading the mimetypes and for discovering services.
 */
@Test
public void ensureClassLoaderUsedEverywhere() throws Exception {
    ResourceLoggingClassLoader trackingLoader = new ResourceLoggingClassLoader(getClass().getClassLoader());

    // A config built without the tracking loader must not touch it
    TikaConfig defaultConfig = new TikaConfig();
    defaultConfig.getMediaTypeRegistry();
    defaultConfig.getParser();
    assertEquals(0, trackingLoader.getLoadedResources().size());

    // A config built with the tracking loader should load its resources through it
    TikaConfig customConfig = new TikaConfig(trackingLoader);
    customConfig.getMediaTypeRegistry();
    customConfig.getParser();
    Map<String, List<URL>> loaded = trackingLoader.getLoadedResources();
    int loadedCount = loaded.size();
    assertTrue("Not enough things used the classloader, found only " + loadedCount, loadedCount > 3);

    // Resources that must have been requested from the tracking loader:
    // parsers, detectors, built-in mimetypes and custom mimetypes
    String[] expectedResources = {
        "META-INF/services/org.apache.tika.parser.Parser",
        "META-INF/services/org.apache.tika.detect.Detector",
        "org/apache/tika/mime/tika-mimetypes.xml",
        "org/apache/tika/mime/custom-mimetypes.xml"
    };
    for (String resourceName : expectedResources) {
        assertNotNull(loaded.get(resourceName));
    }
}
Also used : ResourceLoggingClassLoader(org.apache.tika.ResourceLoggingClassLoader) TikaConfig(org.apache.tika.config.TikaConfig) List(java.util.List) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Aggregations

TikaConfig (org.apache.tika.config.TikaConfig)62 Test (org.junit.Test)32 Metadata (org.apache.tika.metadata.Metadata)26 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)20 TikaTest (org.apache.tika.TikaTest)16 InputStream (java.io.InputStream)12 Tika (org.apache.tika.Tika)12 IOException (java.io.IOException)10 URL (java.net.URL)10 TikaException (org.apache.tika.exception.TikaException)9 TikaInputStream (org.apache.tika.io.TikaInputStream)9 ParseContext (org.apache.tika.parser.ParseContext)9 Parser (org.apache.tika.parser.Parser)9 MediaType (org.apache.tika.mime.MediaType)8 CompositeParser (org.apache.tika.parser.CompositeParser)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 File (java.io.File)6 TikaConfigTest (org.apache.tika.config.TikaConfigTest)6 HashSet (java.util.HashSet)5 SAXException (org.xml.sax.SAXException)5