Use of org.apache.tika.config.TikaConfig in project jackrabbit-oak by Apache.
Class BinaryTextExtractor, method initializeTikaConfig.
/**
 * Resolves the Tika configuration to use for the given index definition.
 *
 * <p>A custom config supplied by the definition takes precedence; otherwise the
 * {@code tika-config.xml} resource looked up via {@code LuceneIndexEditorContext}
 * is used. When neither yields a stream, or loading fails, the result wraps
 * {@code TikaConfig.getDefaultConfig()}.
 *
 * @param definition index definition to consult; may be {@code null}
 * @return holder carrying the resolved config together with a label for its source
 */
private static TikaConfigHolder initializeTikaConfig(@Nullable IndexDefinition definition) {
    final Thread thread = Thread.currentThread();
    final ClassLoader previousLoader = thread.getContextClassLoader();
    String source = null;
    InputStream stream = null;
    try {
        // Context class loader is swapped only while loading; the finally block puts it back.
        thread.setContextClassLoader(LuceneIndexEditorContext.class.getClassLoader());
        final boolean customConfigured = definition != null && definition.hasCustomTikaConfig();
        if (!customConfigured) {
            final URL bundled = LuceneIndexEditorContext.class.getResource("tika-config.xml");
            if (bundled != null) {
                source = bundled.toString();
                stream = bundled.openStream();
            }
        } else {
            log.debug("[{}] Using custom tika config", definition.getIndexName());
            // Source label is set before the stream is fetched so a failure is logged with it.
            source = "Custom config at " + definition.getIndexPath();
            stream = definition.getTikaConfig();
        }
        if (stream != null) {
            return new TikaConfigHolder(new TikaConfig(stream), source);
        }
    } catch (TikaException | IOException | SAXException e) {
        // Loading problems are logged and the default config below is used instead.
        log.warn("Tika configuration not available : " + source, e);
    } finally {
        IOUtils.closeQuietly(stream);
        thread.setContextClassLoader(previousLoader);
    }
    return new TikaConfigHolder(TikaConfig.getDefaultConfig(), "Default Config");
}
Use of org.apache.tika.config.TikaConfig in project gate-core by GateNLP.
Class TikaFormat, method unpackMarkup.
/**
 * Parses the content behind the document's source URL with a Tika parser and
 * routes the resulting events into an {@code XmlDocumentHandler} built for the document.
 *
 * @param doc the document to unpack; it and its source URL must be non-null
 * @param repInfo repositioning info handed to the handler
 * @param ampCodingInfo ampersand coding positions handed to the handler
 * @throws DocumentFormatException if {@code doc} or its source URL is null, or if
 *         reading or parsing fails with an I/O, SAX or Tika exception
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    if (doc == null || doc.getSourceUrl() == null) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }

    // Relays handler status messages through fireStatusChanged,
    // which is implemented in DocumentFormat.java and inherited here.
    final StatusListener relay = new StatusListener() {
        @Override
        public void statusChanged(String text) {
            fireStatusChanged(text);
        }
    };

    final XmlDocumentHandler handler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
    final Metadata parserTips = extractParserTips(doc);
    handler.addStatusListener(relay);
    handler.setRepositioningInfo(repInfo);
    // Hand over the object holding the ampersand coding positions.
    handler.setAmpCodingInfo(ampCodingInfo);

    InputStream sourceStream = null;
    try {
        // The parser is obtained before the stream is opened.
        final Parser parser = new TikaConfig().getParser();
        sourceStream = doc.getSourceUrl().openStream();
        parser.parse(sourceStream, handler, parserTips, new ParseContext());
        setDocumentFeatures(parserTips, doc);
    } catch (IOException | SAXException | TikaException e) {
        throw new DocumentFormatException(e);
    } finally {
        // closeQuietly tolerates a null stream.
        IOUtils.closeQuietly(sourceStream);
        handler.removeStatusListener(relay);
    }

    if (doc instanceof DocumentImpl) {
        final DocumentImpl impl = (DocumentImpl) doc;
        impl.setNextAnnotationId(handler.getCustomObjectsId());
    }
}
Use of org.apache.tika.config.TikaConfig in project camel by Apache.
Class TikaComponent, method createEndpoint.
/**
 * Creates a {@code TikaEndpoint} for the given URI.
 *
 * <p>A fresh {@code TikaConfiguration} is populated from {@code parameters}; a
 * {@code TikaConfig} referenced under {@code TIKA_CONFIG} is applied when one
 * resolves, and the host part of the URI is set as the operation.
 *
 * @param uri the endpoint URI
 * @param remaining not used by this method
 * @param parameters endpoint parameters
 * @return the new endpoint
 * @throws Exception if configuring the endpoint or parsing the URI fails
 */
@Override
protected Endpoint createEndpoint(String uri, String remaining, Map<String, Object> parameters) throws Exception {
    final TikaConfiguration endpointConfig = new TikaConfiguration();
    setProperties(endpointConfig, parameters);

    final TikaConfig referencedConfig =
            resolveAndRemoveReferenceParameter(parameters, TIKA_CONFIG, TikaConfig.class);
    if (referencedConfig != null) {
        endpointConfig.setTikaConfig(referencedConfig);
    }

    // The host component of the URI names the operation.
    final String operation = new URI(uri).getHost();
    endpointConfig.setOperation(operation);

    return new TikaEndpoint(uri, this, endpointConfig);
}
Use of org.apache.tika.config.TikaConfig in project camel by Apache.
Class TikaConfiguration, method setTikaConfigUri.
/**
 * Tika Config Uri: The URI of tika-config.xml.
 *
 * <p>The value is passed unchanged to the {@code TikaConfig(String)} constructor.
 * The configuration is built before either field is assigned, so if loading
 * fails the previously stored URI and configuration are both left untouched
 * rather than ending up out of sync.
 *
 * @param tikaConfigUri location of the tika-config.xml to load
 * @throws TikaException propagated from the {@code TikaConfig} constructor
 * @throws IOException propagated from the {@code TikaConfig} constructor
 * @throws SAXException propagated from the {@code TikaConfig} constructor
 */
public void setTikaConfigUri(String tikaConfigUri) throws TikaException, IOException, SAXException {
    // Load first: a throwing constructor must not leave a new URI paired with a stale config.
    TikaConfig loadedConfig = new TikaConfig(tikaConfigUri);
    this.tikaConfigUri = tikaConfigUri;
    this.tikaConfig = loadedConfig;
}
Use of org.apache.tika.config.TikaConfig in project jackrabbit-oak by Apache.
Class TikaHelper, method getTikaConfig.
/**
 * Chooses the Tika configuration to use.
 *
 * <p>An explicitly supplied file wins. Without one, the resource named by
 * {@code DEFAULT_TIKA_CONFIG} (resolved via {@code TextExtractor}) is loaded if
 * present; otherwise {@code TikaConfig.getDefaultConfig()} is returned.
 *
 * @param tikaConfig external config file, or {@code null} to use the defaults
 * @return the selected configuration
 * @throws TikaException propagated from the {@code TikaConfig} constructors
 * @throws IOException propagated from the {@code TikaConfig} constructors
 * @throws SAXException propagated from the {@code TikaConfig} constructors
 */
private static TikaConfig getTikaConfig(File tikaConfig) throws TikaException, IOException, SAXException {
    if (tikaConfig != null) {
        log.info("Loading external Tika config from {}", tikaConfig);
        return new TikaConfig(tikaConfig);
    }

    final URL bundledConfig = TextExtractor.class.getResource(DEFAULT_TIKA_CONFIG);
    if (bundledConfig == null) {
        log.info("Using default Tika config");
        return TikaConfig.getDefaultConfig();
    }

    log.info("Loading default Tika config from {}", bundledConfig);
    return new TikaConfig(bundledConfig);
}
Aggregations