use of org.opensextant.solrtexttagger.TaggerFstCorpus in project stanbol by apache.
the class CorpusCreationTask method call.
/**
 * Builds the FST corpus described by {@code fstInfo} from the current
 * searcher of the SolrCore of {@code indexConfig}.
 * <p>
 * The built corpus is only set on {@code fstInfo} if the index configuration
 * is still active after the build finished. It is returned in both cases.
 * Every failure detected by this method is also recorded on {@code fstInfo}
 * via {@code setError(..)}.
 *
 * @return the newly built corpus
 * @throws IllegalStateException if the index configuration is not active,
 * the SolrCore is closed, or reading from the index fails with an
 * {@link IOException} (set as cause)
 */
@Override
public TaggerFstCorpus call() {
    if (!indexConfig.isActive()) {
        String msg = "Index Configuration already deactivated";
        fstInfo.setError(msg);
        throw new IllegalStateException(msg);
    }
    SolrCore core = indexConfig.getIndex();
    if (core.isClosed()) {
        String msg = "Unable to build " + fstInfo + " because SolrCore " + core.getName() + " is closed!";
        fstInfo.setError(msg);
        throw new IllegalStateException(msg);
    }
    final TaggerFstCorpus corpus;
    RefCounted<SolrIndexSearcher> searcherRef = core.getSearcher();
    try {
        //STANBOL-1177: create FST models in AccessController.doPrivileged(..)
        final SolrIndexSearcher searcher = searcherRef.get();
        //we do get the AtomicReader, because TaggerFstCorpus will need it
        //anyways. This prevents to create another SlowCompositeReaderWrapper.
        final IndexReader reader = searcher.getAtomicReader();
        log.info(" ... build FST corpus for {}", fstInfo);
        corpus = AccessController.doPrivileged(new PrivilegedExceptionAction<TaggerFstCorpus>() {
            public TaggerFstCorpus run() throws IOException {
                return new TaggerFstCorpus(reader, searcher.getIndexReader().getVersion(), null,
                        fstInfo.indexedField, fstInfo.storedField, fstInfo.analyzer,
                        fstInfo.partialMatches, 1, 100);
            }
        });
        if (indexConfig.isActive()) {
            //set the created corpus to the FST Info
            fstInfo.setCorpus(corpus);
        } else {
            //index configuration no longer active ... ignore the built FST
            log.warn("Index Config for {} was deactivated while building FST. "
                    + "Built FST will be ignored.", fstInfo);
        }
        return corpus;
    } catch (PrivilegedActionException pae) {
        Exception e = pae.getException();
        if (e instanceof IOException) {
            //IO Exception while reading from the index: record the message on
            //the FST info like the other failure paths of this method do
            String msg = "Unable to read Information to build " + fstInfo + " from SolrIndex '" + core.getName() + "'!";
            fstInfo.setError(msg);
            throw new IllegalStateException(msg, e);
        } else {
            //the privileged action only declares IOException, so anything
            //else wrapped here is a RuntimeException
            throw RuntimeException.class.cast(e);
        }
    } finally {
        //ensure that we dereference the searcher
        searcherRef.decref();
    }
}
use of org.opensextant.solrtexttagger.TaggerFstCorpus in project stanbol by apache.
the class CorpusCreationTask method run.
/**
 * Builds the FST corpus described by {@code corpusInfo} from the current
 * searcher of {@code core} and stores it in the file {@code corpusInfo.fst}.
 * <p>
 * An already existing FST file is deleted first. An empty corpus (no
 * phrases) is not stored. A failure to store the corpus is only logged.
 *
 * @throws IllegalStateException if reading from the Solr index fails with
 * an {@link IOException} (set as cause)
 */
@Override
public void run() {
    final TaggerFstCorpus built;
    final RefCounted<SolrIndexSearcher> ref = core.getSearcher();
    try {
        final SolrIndexSearcher indexSearcher = ref.get();
        // TaggerFstCorpus needs an AtomicReader anyway; handing over the one of
        // the searcher avoids creating another SlowCompositeReaderWrapper.
        final IndexReader atomicReader = indexSearcher.getAtomicReader();
        log.info(" ... build {}", corpusInfo);
        built = new TaggerFstCorpus(atomicReader, indexSearcher.getIndexReader().getVersion(), null,
                corpusInfo.indexedField, corpusInfo.storedField, corpusInfo.analyzer,
                corpusInfo.partialMatches, 1, 200);
    } catch (IOException e) {
        throw new IllegalStateException("Unable to read Information to build " + corpusInfo + " from SolrIndex '" + core.getName() + "'!", e);
    } finally {
        // always release the reference to the searcher
        ref.decref();
    }
    // remove the FST file of a previous build (if any)
    if (corpusInfo.fst.exists() && !FileUtils.deleteQuietly(corpusInfo.fst)) {
        log.warn("Unable to delete existing FST fiel for {}", corpusInfo);
    }
    if (built.getPhrases() == null) {
        // NOTE: saving an empty corpus results in a NPE, so skip it
        log.info("FST for {} is empty ... no FST will be stored", corpusInfo);
        return;
    }
    try {
        built.save(corpusInfo.fst);
    } catch (IOException e) {
        log.warn("Unable to store FST corpus " + corpusInfo + " to " + corpusInfo.fst.getAbsolutePath() + "!", e);
    }
}
use of org.opensextant.solrtexttagger.TaggerFstCorpus in project stanbol by apache.
the class TaggingSession method obtainFstCorpus.
/**
 * Obtains the FST corpus for the parsed CorpusInfo.
 * <p>
 * If no corpus is available, or the available one does not match the parsed
 * index version, the creation of a new corpus is enqueued (if allowed) and
 * this method waits a limited time for it (30sec without, 10sec with a
 * previous corpus). If the new corpus does not become available while
 * waiting, the previous corpus is returned when there is one.
 *
 * @param indexVersion the current version of the index or <code>null</code>
 * to skip the up-to-date check
 * @param fstInfo the info about the corpus
 * @return the TaggerFstCorpus (never <code>null</code>)
 * @throws CorpusException if the requested corpus is currently not available
 */
private TaggerFstCorpus obtainFstCorpus(Long indexVersion, CorpusInfo fstInfo) throws CorpusException {
    TaggerFstCorpus fstCorpus = fstInfo.getCorpus();
    Future<TaggerFstCorpus> enqueuedCorpus = null;
    if (fstCorpus == null) {
        if (!fstInfo.allowCreation && fstInfo.isFstCreationError()) {
            throw new CorpusException(fstInfo.getErrorMessage(), null);
        }
        enqueuedCorpus = lookupEnqueuedCorpus(fstInfo);
        if (enqueuedCorpus == null && fstInfo.allowCreation) {
            //not enqueued
            log.debug(" - enqueue creation of {}", fstInfo);
            enqueuedCorpus = enqueue(fstInfo);
        }
        if (enqueuedCorpus == null) {
            throw new CorpusException("Unable to obtain Fst Corpus for " + fstInfo
                    + " (message: " + fstInfo.getErrorMessage() + ")!", null);
        }
    } else if (indexVersion != null && indexVersion.longValue() != fstCorpus.getIndexVersion()) {
        //the current FST corpus is not up to date with the Solr index
        log.debug(" - FST corpus for language '{}' is outdated", fstInfo.language);
        enqueuedCorpus = lookupEnqueuedCorpus(fstInfo);
        if (enqueuedCorpus != null) {
            //a rebuild is already running: just wait for it below. This is
            //NOT a configuration problem, so no warning must be logged here
            log.debug(" - creation of {} is already enqueued", fstInfo);
        } else if (fstInfo.allowCreation && config.getExecutorService() != null) {
            log.debug(" - enqueue creation of {}", fstInfo);
            enqueuedCorpus = enqueue(fstInfo);
        } else {
            log.warn("Unable to update outdated FST corpus for language '{}' "
                    + "because runtimeCreation is {} and ExecutorService is {}!",
                    new Object[] { fstInfo.language,
                            fstInfo.allowCreation ? "enabled" : "disabled",
                            config.getExecutorService() == null ? "not available" : "available" });
            log.warn(" ... please adapt the Engine configuration for up " + "to date FST corpora!");
        }
    } else {
        //FST corpus is up to date with the current Solr index version
        log.debug("FST corpus for language '{}' is up to date", fstInfo.language);
    }
    //TODO: maybe make this configurable
    int waitTime = fstCorpus == null ? 30 : 10;
    if (enqueuedCorpus != null) {
        //we needed to build a new corpus
        try {
            log.debug(" - will wait max {}sec for creation of {}", waitTime, fstInfo);
            fstCorpus = enqueuedCorpus.get(waitTime, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            //recover interrupted state
            Thread.currentThread().interrupt();
            if (fstCorpus == null) {
                //no previous version to fall back to: do not return null
                throw new CorpusException("Interrupted while waiting for the creation of FST Corpus "
                        + fstInfo + "!", e);
            }
        } catch (ExecutionException e) {
            if (fstCorpus != null) {
                //fall back to the previous version
                log.warn("Unable to update outdated FST corpus " + fstInfo
                        + " (message: " + fstInfo.getErrorMessage() + ")", e);
            } else {
                //no previous version to fall back to: do not return null
                throw new CorpusException("Unable to build Fst Corpus for " + fstInfo
                        + " (message: " + fstInfo.getErrorMessage() + ")!", e);
            }
        } catch (TimeoutException e) {
            if (fstCorpus != null) {
                log.debug("unable to build FST corpus for {} in time ({}sec). Will use "
                        + "previous version ", fstInfo, waitTime);
            } else {
                throw new CorpusException("Unable to build Fst Corpus for " + fstInfo
                        + " within " + waitTime + "sec! Try again later.", null);
            }
        } catch (CancellationException e) {
            if (fstCorpus != null) {
                log.debug("building of FST corpus for {} was cancelled. Will use "
                        + "previous version.", fstInfo);
            } else {
                throw new CorpusException("Building of FST Corpus " + fstInfo + " was cancelled!", null);
            }
        }
    }
    return fstCorpus;
}

/**
 * Reads the currently enqueued corpus creation of the parsed CorpusInfo
 * while holding the read lock of its corpus lock.
 * @param fstInfo the info about the corpus
 * @return the value of {@code fstInfo.getEnqueued()}
 */
private Future<TaggerFstCorpus> lookupEnqueuedCorpus(CorpusInfo fstInfo) {
    fstInfo.corpusLock.readLock().lock();
    try {
        return fstInfo.getEnqueued();
    } finally {
        fstInfo.corpusLock.readLock().unlock();
    }
}
use of org.opensextant.solrtexttagger.TaggerFstCorpus in project stanbol by apache.
the class FstLinkingEngineComponent method updateEngineRegistration.
/**
 * (Re-)creates the index configuration and the registered
 * EnhancementEngine. This will be called on each <ul>
 * <li>update to the Component configuration (activate, deactivate)
 * <li>update on the SolrCore
 * </ul>
 * On every call that passes the initial checks the previously registered
 * engine, index configuration and SolrCore are handed to
 * {@link #unregisterEngine} in the finally block, regardless of whether
 * a new engine could be registered or not.<p>
 * This also initialises the FST configuration and enqueues the creation of
 * missing FST files for all corpora that allow creation.
 * @param reference the ServiceReference for the SolrServer or <code>null</code>
 * in case the service is no longer available.
 * @param server the SolrServer (or <code>null</code>). If <code>null</code>
 * and a reference is parsed, the server is looked up via the
 * solrServerTracker.
 * @throws RuntimeException if activating the new index configuration fails
 * with an IOException (set as cause)
 */
protected void updateEngineRegistration(ServiceReference reference, SolrServer server) {
log.info(" ... updateEngineRegistration for {}: {}", getClass().getSimpleName(), engineName);
if (reference != null && server == null) {
//only the reference was parsed: look up the service via the tracker
server = solrServerTracker.getService(reference);
}
if (reference == null && this.indexReference == null) {
//no reference parsed and the indexReference field is not set:
//nothing to update, so return without touching the current registration
return;
}
//NOTE(review): the bundleContext and the three "old" values below are read
// before entering the synchronized block - confirm that concurrent calls
// can not capture the same old registration.
BundleContext bundleContext = this.bundleContext;
//We need to keep the old configuration vars for unregistering the
//current engine (see #unregisterEngine(..) method)
final ServiceRegistration<?> oldEngineRegistration = this.engineRegistration;
final SolrCore oldSolrCore = this.solrCore;
final IndexConfiguration oldIndexConfig = this.indexConfig;
SolrCore core;
// the indexConfig build by this call
IndexConfiguration indexConfig;
synchronized (this) {
//init one after the other in case of multiple calls
try {
//try to init - finally unregisterEngine
//reset the old field values; they are only set again further down
//if the new configuration could be created
this.engineRegistration = null;
this.indexConfig = null;
this.solrCore = null;
//now we can update the engines configuration
if (bundleContext == null) {
//no bundle context: nothing can be registered
//NOTE: unregistering is done in finally block
return;
}
core = getSolrCore(server);
if (core == null) {
//no SolrCore
log.info(" - SolrCore {} present", oldSolrCore == null ? "not yet" : "no longer");
//NOTE: unregistering is done in finally block
return;
} else {
//- we do have a SolrCore
log.info(" - solrCore (name: {} | indexDir: {}", core.getName(), core.getIndexDir());
}
//File fstDir = new File(dataDir,"fst");
//now collect the FST configuration
indexConfig = new IndexConfiguration(fstConfig, core, fieldEncoding, entityLinkerConfig.getDefaultLanguage());
indexConfig.setTypeField(solrTypeField);
indexConfig.setRankingField(solrRankingField);
//set fields parsed in the activate method
indexConfig.setExecutorService(fstCreatorService);
//TODO add support (redirect field is currently always null)
indexConfig.setRedirectField(null);
indexConfig.setOrigin(origin);
//NOTE: the FST config is processed even if the SolrCore has not changed
// because there might be config changes and/or new FST files in the
// FST directory of the SolrCore.
indexConfig.setFstDirectory(getFstDirectory(core, fstFolder));
//set the DocumentCacheFactory
if (entityCacheSize > 0) {
indexConfig.setEntityCacheManager(new FastLRUCacheManager(entityCacheSize));
}
//else no entityCache is used
if (skipAltTokensConfig != null) {
indexConfig.setSkipAltTokens(skipAltTokensConfig);
}
//activate the index configuration
try {
//this will init the FST directory if necessary so we might run
//into IOExceptions
indexConfig.activate();
} catch (IOException e) {
throw new RuntimeException("Unable to activate Index for FST Linking Engine '" + engineName + "' (solrCore: " + core.getName() + ", instanceDir: " + core.getCoreDescriptor().getInstanceDir() + ")!", e);
}
if (log.isInfoEnabled()) {
//log the initialised languages (sorted, case insensitive)
Set<String> langSet = new HashSet<String>(indexConfig.getCorpusLanguages());
if (langSet.remove(null)) {
//replace the null for the default language
//with an empty string (null can not be sorted)
langSet.add("");
}
String[] langArray = langSet.toArray(new String[langSet.size()]);
Arrays.sort(langArray, String.CASE_INSENSITIVE_ORDER);
log.info(" ... initialised FST corpora for languages {}", Arrays.toString(langArray));
}
//check if we need to create some FST files
for (CorpusInfo fstInfo : indexConfig.getCorpora()) {
//check if the fst does not exist and the fstInfo allows creation
if (!fstInfo.fst.exists() && fstInfo.allowCreation) {
//create a task on the FST corpus creation service. Submitting the
//task and marking the corpus as enqueued both happen under the
//write lock of the corpus.
//NOTE(review): fstCreatorService is used without a null check here -
// confirm it is always set when this point is reached.
fstInfo.corpusLock.writeLock().lock();
try {
Future<TaggerFstCorpus> enqueued = fstCreatorService.submit(new CorpusCreationTask(indexConfig, fstInfo));
fstInfo.enqueued(enqueued);
} finally {
fstInfo.corpusLock.writeLock().unlock();
}
}
}
//set the newly configured instances to the fields
//NOTE(review): the early check above reads this.indexReference while
// this.solrServerReference is assigned here - verify this is intended.
this.indexConfig = indexConfig;
this.solrServerReference = reference;
this.solrCore = core;
//create the new FST linking engine instance
FstLinkingEngine engine = new FstLinkingEngine(engineName, linkingMode, indexConfig, textProcessingConfig, entityLinkerConfig, nerTypeMappings);
//register it as a service
String[] services = new String[] { EnhancementEngine.class.getName(), ServiceProperties.class.getName() };
log.info(" ... register {}: {}", engine.getClass().getSimpleName(), engineName);
this.engineRegistration = bundleContext.registerService(services, engine, engineMetadata);
} finally {
//in any case (even an Exception) ensure that the current
//engine registration is unregistered and the currently used
//SolrCore is unregistered!
unregisterEngine(oldEngineRegistration, oldIndexConfig, oldSolrCore);
}
}
}
use of org.opensextant.solrtexttagger.TaggerFstCorpus in project stanbol by apache.
the class CorpusInfo method getCorpus.
/**
 * Getter for the FST corpus. If the corpus is not referenced in-memory
 * this tries to load it from the {@code fst} file.
 * <p>
 * The reference to the corpus is only read while holding the read lock.
 * All changes to the reference (converting a WeakReference to a
 * SoftReference on first usage, dropping a cleared reference, setting a
 * loaded corpus) are done while holding the write lock.
 * <p>
 * An IOException while loading is not thrown. It is logged, stored as error
 * message and marks the FST file as erroneous so that it is only loaded
 * again if the file is newer than the one that failed.
 *
 * @return the corpus or <code>null</code> if it is neither in-memory nor
 * could be loaded from the FST file
 */
public TaggerFstCorpus getCorpus() {
    TaggerFstCorpus corpus;
    boolean weaklyReferenced;
    corpusLock.readLock().lock();
    try {
        //read only! Modifying the reference under the (shared) read lock
        //would allow concurrent readers to overwrite each other
        corpus = taggerCorpusRef == null ? null : taggerCorpusRef.get();
        weaklyReferenced = corpus != null && taggerCorpusRef instanceof WeakReference<?>;
    } finally {
        corpusLock.readLock().unlock();
    }
    if (corpus != null) {
        //on first usage replace a WeakReference with a SoftReference. This is
        //best effort: if the write lock is not free the conversion is left
        //to a later call, so that plain readers are never blocked here.
        if (weaklyReferenced && corpusLock.writeLock().tryLock()) {
            try {
                //re-check as the reference might have changed in the meantime
                if (taggerCorpusRef instanceof WeakReference<?> && taggerCorpusRef.get() == corpus) {
                    log.debug(" ... convert Weak to Soft Reference for Corpus {}", fst);
                    taggerCorpusRef.clear();
                    taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
                }
            } finally {
                corpusLock.writeLock().unlock();
            }
        }
        return corpus;
    }
    log.info(" ... load FST corpus {}", fst);
    corpusLock.writeLock().lock();
    try {
        //STANBOL-1177: load FST models in AccessController.doPrivileged(..)
        corpus = taggerCorpusRef == null ? null : taggerCorpusRef.get();
        if (corpus == null) {
            //corpus not loaded while waiting for the write lock
            //reset to null as the referent (if any) was taken
            taggerCorpusRef = null;
            corpus = AccessController.doPrivileged(new PrivilegedExceptionAction<TaggerFstCorpus>() {
                public TaggerFstCorpus run() throws IOException {
                    //if the file exists AND the file was not yet failing to load
                    //OR the file is newer as the last version failing to load
                    if (fst.exists() && (!fstFileError || FileUtils.isFileNewer(fst, fstDate))) {
                        TaggerFstCorpus loaded = TaggerFstCorpus.load(fst);
                        if (loaded != null) {
                            //I need to set fstDate here, because I can not
                            //access lastModified() outside doPrivileged
                            fstDate = new Date(fst.lastModified());
                            if (log.isInfoEnabled()) {
                                log.info(" ... loaded FST (date: {})", SimpleDateFormat.getDateTimeInstance().format(fstDate));
                            }
                        } else {
                            log.warn(" ... no corpus loaded from {}", fst);
                        }
                        return loaded;
                    } else {
                        log.warn(" ... unable to load FST from {} (exists: {}, fileError {})", new Object[] { fst, fst.exists(), fstFileError });
                        return null;
                    }
                }
            });
            if (corpus != null) {
                fstFileError = false;
                taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
            }
            //else not loaded from file
        }
        //else corpus was loaded while waiting for the write lock
    } catch (PrivilegedActionException pae) {
        Exception e = pae.getException();
        if (e instanceof IOException) {
            //IO Exception while loading the file
            this.errorMessage = "Unable to load FST corpus from FST file: '" + fst.getAbsolutePath()
                    + "' (Message: " + e.getMessage() + ")!";
            log.warn(errorMessage, e);
            fstFileError = true;
        } else {
            //the privileged action only declares IOException, so anything
            //else wrapped here is a RuntimeException
            throw RuntimeException.class.cast(e);
        }
    } finally {
        corpusLock.writeLock().unlock();
    }
    return corpus;
}
Aggregations