Use of org.apache.tika.config.TikaConfig in the Apache Tika project.
Class PackageParser, method parse.
/**
 * Opens the given stream as an archive and passes every non-directory entry
 * to {@code parseEntry}, writing the result through an XHTML content handler.
 * <p>
 * Most formats are read directly from the stream. When the archive factory
 * reports 7z as a format that cannot be streamed, the stream is reset and
 * read via a file obtained from {@link TikaInputStream}, using a password
 * from the {@link PasswordProvider} in the context when one is present.
 * <p>
 * The archive stream and any temporary resources are released on every exit
 * path; the caller's {@code stream} itself is shielded from being closed.
 *
 * @param stream   the archive document stream; wrapped in a
 *                 {@link BufferedInputStream} if it does not support mark
 * @param handler  receives the XHTML SAX events
 * @param metadata document metadata, passed to {@code updateMediaType},
 *                 the password provider and {@code parseEntry}
 * @param context  parse context consulted for {@link TikaConfig},
 *                 {@link ArchiveStreamFactory}, {@link PasswordProvider}
 *                 and the embedded document extractor
 * @throws IOException   if reading the stream or releasing resources fails
 * @throws SAXException  if the content handler fails
 * @throws TikaException if the archive cannot be unpacked, uses an
 *                       unsupported feature, or (as
 *                       {@link EncryptedDocumentException}) is encrypted
 *                       and cannot be read
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Lazily load the MediaTypeRegistry at parse time: we only want to call
    // getDefaultConfig() once, and can't load statically because of the ForkParser.
    TikaConfig config = context.get(TikaConfig.class);
    MediaTypeRegistry mediaTypeRegistry = null;
    if (config != null) {
        mediaTypeRegistry = config.getMediaTypeRegistry();
    } else {
        // NOTE(review): check / lock / re-check on a shared field; safe
        // publication depends on how bufferedMediaTypeRegistry is declared
        // (e.g. volatile) -- confirm at the field declaration.
        if (bufferedMediaTypeRegistry == null) {
            // Buffer this for next time.
            synchronized (lock) {
                // Now that we're locked, check again
                if (bufferedMediaTypeRegistry == null) {
                    bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
                }
            }
        }
        mediaTypeRegistry = bufferedMediaTypeRegistry;
    }

    // Ensure that the stream supports the mark feature
    if (!stream.markSupported()) {
        stream = new BufferedInputStream(stream);
    }

    TemporaryResources tmp = new TemporaryResources();
    ArchiveInputStream ais = null;
    XHTMLContentHandler xhtml;
    // A single try/finally covers everything from opening the archive onward,
    // so that neither the archive stream nor the temporary file leaks when
    // opening the archive or starting the document fails.
    try {
        try {
            ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
            // At the end we want to close the archive stream to release
            // any associated resources, but the underlying document stream
            // should not be closed
            ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
        } catch (StreamingNotSupportedException sne) {
            // Most archive formats work on streams, but a few need files
            if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
                // Rework as a file, and wrap
                stream.reset();
                TikaInputStream tstream = TikaInputStream.get(stream, tmp);
                // Seven Zip supports passwords, was one given?
                String password = null;
                PasswordProvider provider = context.get(PasswordProvider.class);
                if (provider != null) {
                    password = provider.getPassword(metadata);
                }
                SevenZFile sevenz;
                if (password == null) {
                    sevenz = new SevenZFile(tstream.getFile());
                } else {
                    // "UnicodeLittleUnmarked" is the JDK's historical name for
                    // little-endian UTF-16 without a byte-order mark
                    sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
                }
                // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
                ais = new SevenZWrapper(sevenz);
            } else {
                throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
            }
        } catch (ArchiveException e) {
            throw new TikaException("Unable to unpack document stream", e);
        }

        updateMediaType(ais, mediaTypeRegistry, metadata);

        // Use the delegate parser to parse the contained document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        try {
            ArchiveEntry entry = ais.getNextEntry();
            while (entry != null) {
                if (!entry.isDirectory()) {
                    parseEntry(ais, entry, extractor, metadata, xhtml);
                }
                entry = ais.getNextEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            // If it's an encrypted document of unknown password, report as such
            if (zfe.getFeature() == Feature.ENCRYPTION) {
                throw new EncryptedDocumentException(zfe);
            }
            // Otherwise throw the exception
            throw new TikaException("UnsupportedZipFeature", zfe);
        } catch (PasswordRequiredException pre) {
            throw new EncryptedDocumentException(pre);
        }
    } finally {
        // Release the temporary resources even if closing the archive fails
        try {
            if (ais != null) {
                ais.close();
            }
        } finally {
            tmp.close();
        }
    }
    // As before, the document is only ended once the archive has been
    // read and closed without error.
    xhtml.endDocument();
}
Use of org.apache.tika.config.TikaConfig in the Apache Tika project.
Class TensorflowRESTVideoRecogniser, method getApiUri.
/**
 * Builds the API URI for a video by adding an {@code ext} query parameter
 * holding the file extension of the MIME type named in the metadata's
 * {@code Content-Type} entry.
 *
 * @param metadata metadata whose {@code Content-Type} value is looked up
 *                 in the default config's MIME repository
 * @return {@code apiUri} with the {@code ext} query parameter added, or
 *         the unmodified {@code apiUri} when the MIME type lookup throws
 *         a {@link MimeTypeException}
 */
@Override
protected URI getApiUri(Metadata metadata) {
    // Find extension for video. It's required for OpenCv in InceptionAPI to decode video
    try {
        MimeType mimeType = TikaConfig.getDefaultConfig().getMimeRepository().forName(metadata.get("Content-Type"));
        return UriBuilder.fromUri(apiUri).queryParam("ext", mimeType.getExtension()).build();
    } catch (MimeTypeException e) {
        // Pass the exception along so the log shows why the lookup failed
        LOG.error("Can't find extension from metadata", e);
        return apiUri;
    }
}
Use of org.apache.tika.config.TikaConfig in the Apache Tika project.
Class TikaConfigTest, method parserWithChildParsers.
/**
 * TIKA-1653: when a parser declares child parsers, those children must
 * only appear beneath it and not again at the top level.
 */
@Test
public void parserWithChildParsers() throws Exception {
    try {
        TikaConfig config = getConfig("TIKA-1653-norepeat.xml");
        List<Parser> topLevel = ((CompositeParser) config.getParser()).getAllComponentParsers();

        // Exactly two parsers are expected at the top level
        assertEquals(2, topLevel.size());

        // First: a CompositeParser holding two children
        Parser composite = topLevel.get(0);
        assertTrue(composite.toString(), composite instanceof CompositeParser);
        assertEquals(2, ((CompositeParser) composite).getAllComponentParsers().size());

        // Second: a decorator wrapping an EmptyParser
        Parser decorated = topLevel.get(1);
        assertTrue(decorated.toString(), decorated instanceof ParserDecorator);
        assertEquals(EmptyParser.class, ((ParserDecorator) decorated).getWrappedParser().getClass());
        assertEquals("hello/world", decorated.getSupportedTypes(null).iterator().next().toString());
    } catch (TikaException e) {
        fail("Unexpected TikaException: " + e);
    }
}
Use of org.apache.tika.config.TikaConfig in the Apache Tika project.
Class TikaConfigTest, method testTikaExecutorServiceFromConfig.
/**
 * Loads TIKA-1762-executors.xml and checks the type and pool sizes of the
 * executor service returned by the resulting config.
 */
@Test
public void testTikaExecutorServiceFromConfig() throws Exception {
    TikaConfig config = new TikaConfig(TikaConfigTest.class.getResource("TIKA-1762-executors.xml"));
    ThreadPoolExecutor pool = (ThreadPoolExecutor) config.getExecutorService();

    assertTrue("Should use Dummy Executor", pool instanceof DummyExecutor);
    assertEquals("Should have configured Core Threads", 3, pool.getCorePoolSize());
    assertEquals("Should have configured Max Threads", 10, pool.getMaximumPoolSize());
}
Use of org.apache.tika.config.TikaConfig in the Apache Tika project.
Class TikaConfigTest, method ensureClassLoaderUsedEverywhere.
/**
 * TIKA-1145: a ClassLoader supplied to the TikaConfig must be the one used
 * both for loading the mimetypes and for service discovery.
 */
@Test
public void ensureClassLoaderUsedEverywhere() throws Exception {
    ResourceLoggingClassLoader loggingLoader = new ResourceLoggingClassLoader(getClass().getClassLoader());

    // No loader supplied: nothing should be requested through the logging one
    TikaConfig defaultConfig = new TikaConfig();
    defaultConfig.getMediaTypeRegistry();
    defaultConfig.getParser();
    assertEquals(0, loggingLoader.getLoadedResources().size());

    // Loader supplied: resource lookups should now be routed through it
    TikaConfig customConfig = new TikaConfig(loggingLoader);
    customConfig.getMediaTypeRegistry();
    customConfig.getParser();

    Map<String, List<URL>> loaded = loggingLoader.getLoadedResources();
    int loadedCount = loaded.size();
    assertTrue("Not enough things used the classloader, found only " + loadedCount, loadedCount > 3);

    // Everything that should have gone through the loader, in order:
    // parsers, detectors, built-in mimetypes, custom mimetypes
    String[] expectedResources = {
        "META-INF/services/org.apache.tika.parser.Parser",
        "META-INF/services/org.apache.tika.detect.Detector",
        "org/apache/tika/mime/tika-mimetypes.xml",
        "org/apache/tika/mime/custom-mimetypes.xml"
    };
    for (String resourceName : expectedResources) {
        assertNotNull(loaded.get(resourceName));
    }
}
Aggregations