use of org.apache.clerezza.commons.rdf.RDFTerm in project stanbol by apache.
the class FstLinkingEngineComponent method applyConfig.
/**
* Called by {@link #activate(ComponentContext)},
* {@link PlainFstLinkingComponnet#activate(ComponentContext)} and
* {@link NamedEntityFstLinkingComponnet#activate(ComponentContext)} to
* apply the passed {@link ComponentContext#getProperties()}. The
* {@link LinkingModeEnum linking mode} is passed separately, as OSGi does
* not allow modifying the passed configuration and sub-classes need to
* override the linking mode.
* @param linkingMode the linking mode
* @param properties the configuration properties to apply
* @param prefixService the service used to resolve namespace prefixes
* @throws ConfigurationException if a required property is missing or invalid
*/
protected void applyConfig(LinkingModeEnum linkingMode, Dictionary<String, Object> properties, NamespacePrefixService prefixService) throws ConfigurationException {
//(0) The name for the Enhancement Engine and the basic metadata
Object value = properties.get(PROPERTY_NAME);
if (value == null || value.toString().isEmpty()) {
throw new ConfigurationException(PROPERTY_NAME, "The EnhancementEngine name MUST BE configured!");
} else {
this.engineName = value.toString();
}
log.info(" - engine name: {}", engineName);
engineMetadata = new Hashtable<String, Object>();
engineMetadata.put(PROPERTY_NAME, this.engineName);
value = properties.get(Constants.SERVICE_RANKING);
engineMetadata.put(Constants.SERVICE_RANKING, value == null ? Integer.valueOf(0) : value);
//(1) set the linking mode
this.linkingMode = linkingMode;
log.info(" - linking mode: {}", linkingMode);
//(2) parse the TextProcessing configuration
//TODO: decide if we should use the TextProcessingConfig for this engine
textProcessingConfig = TextProcessingConfig.createInstance(properties);
//change default for EntityLinkerConfig.MIN_FOUND_TOKENS
value = properties.get(EntityLinkerConfig.MIN_FOUND_TOKENS);
entityLinkerConfig = EntityLinkerConfig.createInstance(properties, prefixService);
if (value == null) {
//no MIN_FOUND_TOKENS config present
//manually set the default to the value used by this engine
entityLinkerConfig.setMinFoundTokens(FST_DEFAULT_MIN_FOUND_TOKENS);
}
//(3) parse the configured IndexReference, field encoding and SkipAltTokens
value = properties.get(SOLR_CORE);
if (value == null) {
throw new ConfigurationException(SOLR_CORE, "Missing required configuration of the SolrCore");
} else {
indexReference = IndexReference.parse(value.toString());
}
value = properties.get(IndexConfiguration.FIELD_ENCODING);
if (value == null) {
throw new ConfigurationException(IndexConfiguration.FIELD_ENCODING, "Missing required configuration of the Solr Field Encoding");
} else {
try {
fieldEncoding = FieldEncodingEnum.valueOf(value.toString().trim());
} catch (IllegalArgumentException e) {
throw new ConfigurationException(IndexConfiguration.FIELD_ENCODING, "The configured " + "FieldEncoding MUST BE a member of " + Arrays.toString(FieldEncodingEnum.values()), e);
}
}
value = properties.get(IndexConfiguration.SKIP_ALT_TOKENS);
if (value instanceof Boolean) {
skipAltTokensConfig = ((Boolean) value);
} else if (value != null) {
skipAltTokensConfig = Boolean.valueOf(value.toString());
}
// else no config -> will use the default
//(4) parse Origin information
value = properties.get(ORIGIN);
if (value instanceof RDFTerm) {
origin = (RDFTerm) value;
} else if (value instanceof String) {
try {
URI originUri = new URI((String) value);
if (originUri.isAbsolute()) {
origin = new IRI((String) value);
} else {
origin = new PlainLiteralImpl((String) value);
}
} catch (URISyntaxException e) {
origin = new PlainLiteralImpl((String) value);
}
log.info(" - origin: {}", origin);
} else if (value != null) {
log.warn("Values of the {} property MUST BE of type RDFTerm or String " + "(parsed: {} (type:{}))", new Object[] { ORIGIN, value, value.getClass() });
}
//else no ORIGIN information provided
//(5) init the FST configuration
//We can create the default configuration only here, as it depends on the
//name of the solrIndex
String defaultConfig = "*;" + IndexConfiguration.PARAM_FST + "=" + indexReference.getIndex() + ";" + IndexConfiguration.PARAM_FIELD + "=" + IndexConfiguration.DEFAULT_FIELD;
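//e.g. for an index named "dbpedia" the default could render to something
//like "*;fst=dbpedia;field=rdfs:label" - the parameter names and default
//field shown in this comment are illustrative; the actual values come
//from the IndexConfiguration constants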
fstConfig = new LanguageConfiguration(IndexConfiguration.FST_CONFIG, new String[] { defaultConfig });
//now set the actual configuration parsed to the engine
value = properties.get(IndexConfiguration.FST_CONFIG);
if (value != null && !StringUtils.isBlank(value.toString())) {
fstConfig.setConfiguration(properties);
}
//else keep the default
value = properties.get(IndexConfiguration.FST_FOLDER);
if (value instanceof String) {
this.fstFolder = ((String) value).trim();
if (this.fstFolder.isEmpty()) {
this.fstFolder = null;
}
} else if (value == null) {
this.fstFolder = null;
} else {
throw new ConfigurationException(IndexConfiguration.FST_FOLDER, "Values MUST BE of type String " + "(found: " + value.getClass().getName() + ")!");
}
//(6) Create the ThreadPool used for the runtime creation of FST models
value = properties.get(FST_THREAD_POOL_SIZE);
int tpSize;
if (value instanceof Number) {
tpSize = ((Number) value).intValue();
} else if (value != null) {
try {
tpSize = Integer.parseInt(value.toString());
} catch (NumberFormatException e) {
throw new ConfigurationException(FST_THREAD_POOL_SIZE, "Unable to parse the integer FST thread pool size from the " + "configured " + value.getClass().getSimpleName() + " '" + value + "'!", e);
}
} else {
tpSize = -1;
}
if (tpSize <= 0) {
//if configured value <= 0 we use the default
tpSize = DEFAULT_FST_THREAD_POOL_SIZE;
}
//build a ThreadFactoryBuilder for low priority daemon threads
//that use a meaningful name
ThreadFactoryBuilder tfBuilder = new ThreadFactoryBuilder();
//should be stopped if the VM closes
tfBuilder.setDaemon(true);
//low priority
tfBuilder.setPriority(Thread.MIN_PRIORITY);
tfBuilder.setNameFormat(engineName + "-FstRuntimeCreation-thread-%d");
if (fstCreatorService != null && !fstCreatorService.isTerminated()) {
//NOTE: We cannot call shutdownNow(), as interrupting threads
// here would also close FileChannels used by the SolrCore
// and produce java.nio.channels.ClosedByInterruptException
// exceptions followed by java.nio.channels.ClosedChannelException
// on subsequent calls to affected files of the SolrIndex.
//Because of that we just log a warning and let uncompleted tasks
//complete!
log.warn("Some tasks in a previous FST Runtime Creation ThreadPool have " + "still not finished!");
}
fstCreatorService = Executors.newFixedThreadPool(tpSize, tfBuilder.build());
//(7) Parse the EntityCache config
int entityCacheSize;
value = properties.get(ENTITY_CACHE_SIZE);
if (value instanceof Number) {
entityCacheSize = ((Number) value).intValue();
} else if (value != null) {
try {
entityCacheSize = Integer.parseInt(value.toString());
} catch (NumberFormatException e) {
throw new ConfigurationException(ENTITY_CACHE_SIZE, "Unable to parse the integer EntityCacheSize from the " + "configured " + value.getClass().getSimpleName() + " '" + value + "'!", e);
}
} else {
entityCacheSize = -1;
}
if (entityCacheSize == 0) {
log.info(" ... EntityCache deactivated");
this.entityCacheSize = entityCacheSize;
} else {
this.entityCacheSize = entityCacheSize < 0 ? DEFAULT_ENTITY_CACHE_SIZE : entityCacheSize;
log.info(" ... EntityCache enabled (size: {})", this.entityCacheSize);
}
//(8) parse the Entity type field
value = properties.get(IndexConfiguration.SOLR_TYPE_FIELD);
if (value == null || StringUtils.isBlank(value.toString())) {
solrTypeField = null;
} else {
solrTypeField = value.toString().trim();
}
//(9) parse the Entity Ranking field
value = properties.get(IndexConfiguration.SOLR_RANKING_FIELD);
if (value == null) {
solrRankingField = null;
} else {
solrRankingField = value.toString().trim();
}
//(10) parse the NamedEntity type mappings (if linkingMode = NER)
if (linkingMode == LinkingModeEnum.NER) {
nerTypeMappings = new HashMap<String, Set<String>>();
value = properties.get(NAMED_ENTITY_TYPE_MAPPINGS);
if (value instanceof String[]) {
//support array
value = Arrays.asList((String[]) value);
} else if (value instanceof String) {
//single value
value = Collections.singleton(value);
}
if (value instanceof Collection<?>) {
//and collection
log.info(" - process Named Entity Type Mappings (used by LinkingMode: {})", linkingMode);
configs: for (Object o : (Iterable<?>) value) {
if (o != null) {
StringBuilder usage = new StringBuilder("usage: ");
usage.append("'{namedEntity-tag-or-uri} > {entityType-1}[;{entityType-n}]'");
String[] config = o.toString().split(">");
String namedEntityType = config[0].trim();
if (namedEntityType.isEmpty()) {
log.warn("Invalid Type Mapping Config '{}': Missing namedEntityType ({}) -> ignore this config", o, usage);
continue configs;
}
if (NamespaceMappingUtils.getPrefix(namedEntityType) != null) {
namedEntityType = NamespaceMappingUtils.getConfiguredUri(prefixService, NAMED_ENTITY_TYPE_MAPPINGS, namedEntityType);
}
if (config.length < 2 || config[1].isEmpty()) {
log.warn("Invalid Type Mapping Config '{}': Missing dc:type URI '{}' ({}) -> ignore this config", o, usage);
continue configs;
}
String entityTypes = config[1].trim();
if (config.length > 2) {
log.warn("Configuration after 2nd '>' gets ignored. Will use mapping '{} > {}' from config {}", new Object[] { namedEntityType, entityTypes, o });
}
Set<String> types = nerTypeMappings.get(namedEntityType);
if (types == null) {
//add new element to the mapping
types = new HashSet<String>();
nerTypeMappings.put(namedEntityType, types);
}
for (String entityType : entityTypes.split(";")) {
entityType = entityType.trim();
if (!entityType.isEmpty()) {
String typeUri;
if ("*".equals(entityType)) {
//null is used as wildcard
typeUri = null;
} else {
typeUri = NamespaceMappingUtils.getConfiguredUri(prefixService, NAMED_ENTITY_TYPE_MAPPINGS, entityType);
}
log.info(" - add {} > {}", namedEntityType, typeUri);
types.add(typeUri);
}
//else ignore empty mapping
}
}
}
} else {
//no mappings defined ... set wildcard mapping
log.info(" - No Named Entity type mappings configured. Will use wildcard mappings");
nerTypeMappings = Collections.singletonMap(null, Collections.<String>singleton(null));
}
}
//(11) start tracking the SolrCore
try {
solrServerTracker = new RegisteredSolrServerTracker(bundleContext, indexReference, null) {
@Override
public void removedService(ServiceReference reference, Object service) {
log.info(" ... SolrCore for {} was removed!", reference);
//try to get another ServiceReference from the tracker
if (reference.equals(FstLinkingEngineComponent.this.solrServerReference)) {
updateEngineRegistration(solrServerTracker.getServiceReference(), null);
} else {
log.info(" - removed SolrCore was not used for FST linking");
}
super.removedService(reference, service);
}
@Override
public void modifiedService(ServiceReference reference, Object service) {
log.info(" ... SolrCore for {} was updated!", indexReference);
updateEngineRegistration(solrServerTracker.getServiceReference(), null);
super.modifiedService(reference, service);
}
@Override
public SolrServer addingService(ServiceReference reference) {
SolrServer server = super.addingService(reference);
if (solrCore != null) {
log.info("Multiple SolrCores for name {}! Will update engine " + "with the newly added {}!", new Object[] { solrCore.getName(), indexReference, reference });
}
updateEngineRegistration(reference, server);
return server;
}
};
} catch (InvalidSyntaxException e) {
throw new ConfigurationException(SOLR_CORE, "parsed SolrCore name '" + value.toString() + "' is invalid (expected: '[{server-name}:]{indexname}'");
}
try {
solrServerTracker.open();
} catch (RuntimeException e) {
//FIX for STANBOL-1416 (see https://issues.apache.org/jira/browse/STANBOL-1416)
//If an available SolrCore can not be correctly initialized we will
//get the exception here. In this case we want this component to be
//activated and waiting for further service events. Because of that
//we catch here the exception.
log.debug("Error while processing existing SolrCore Service during " + "opening SolrServiceTracker ... waiting for further service" + "Events", e);
}
}
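For context, a minimal sketch of how a sub-class could invoke applyConfig from its own activate method to force a fixed linking mode, as the Javadoc above describes. The class name is hypothetical and the sketch assumes an injected NamespacePrefixService field named prefixService.
public class MyNerFstLinkingComponent extends FstLinkingEngineComponent {

    @Override
    protected void activate(ComponentContext ctx) throws ConfigurationException {
        //copy the properties, as OSGi does not allow modifying the passed config
        Dictionary<String, Object> config = new Hashtable<String, Object>();
        Enumeration<String> keys = ctx.getProperties().keys();
        while (keys.hasMoreElements()) {
            String key = keys.nextElement();
            config.put(key, ctx.getProperties().get(key));
        }
        //sub-classes override the linking mode by passing it explicitly
        applyConfig(LinkingModeEnum.NER, config, prefixService);
    }
}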
use of org.apache.clerezza.commons.rdf.RDFTerm in project stanbol by apache.
the class KeywordLinkingEngineTest method testEngine.
/**
* This tests if the Enhancements created by the Engine conform to the
* rules defined for the Stanbol Enhancement Structure.
* @throws IOException
* @throws EngineException
*/
@Test
public void testEngine() throws IOException, EngineException {
EntityLinkerConfig linkerConfig = new EntityLinkerConfig();
linkerConfig.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
KeywordLinkingEngine engine = KeywordLinkingEngine.createInstance(openNLP, searcher, new TextAnalyzerConfig(), linkerConfig);
engine.referencedSiteName = TEST_REFERENCED_SITE_NAME;
ContentItem ci = ciFactory.createContentItem(new StringSource(TEST_TEXT));
//tells the engine that this is an English text
ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("en")));
//compute the enhancements
engine.computeEnhancements(ci);
//validate the enhancement results
Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
expectedValues.put(ENHANCER_EXTRACTED_FROM, ci.getUri());
expectedValues.put(DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(engine.getClass().getName()));
//adding null as expected for confidence makes it a required property
expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
//validate created fise:TextAnnotations
int numTextAnnotations = validateAllTextAnnotations(ci.getMetadata(), TEST_TEXT, expectedValues);
assertEquals("Four fise:TextAnnotations are expected by this Test", 4, numTextAnnotations);
//validate created fise:EntityAnnotations
int numEntityAnnotations = validateAllEntityAnnotations(ci, expectedValues);
assertEquals("Five fise:EntityAnnotations are expected by this Test", 5, numEntityAnnotations);
}
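The expectedValues map uses the convention noted in the comment above: a null value marks a property as required without constraining its value. A check consistent with that convention could look like the following sketch, which relies on the wildcard semantics of the Clerezza Graph.filter method (this is not the actual validation helper used by the test):
//a null object in Graph.filter acts as a wildcard, so a null expected
//value only asserts that the property is present
static void assertExpectedValues(Graph metadata, IRI enhancement, Map<IRI, RDFTerm> expectedValues) {
    for (Map.Entry<IRI, RDFTerm> expected : expectedValues.entrySet()) {
        assertTrue("missing " + expected.getKey() + " on " + enhancement,
            metadata.filter(enhancement, expected.getKey(), expected.getValue()).hasNext());
    }
}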
use of org.apache.clerezza.commons.rdf.RDFTerm in project stanbol by apache.
the class ClerezzaRDFUtils method urifyBlankNodes.
public static void urifyBlankNodes(Graph model) {
HashMap<BlankNode, IRI> blankNodeMap = new HashMap<BlankNode, IRI>();
Graph remove = new SimpleGraph();
Graph add = new SimpleGraph();
for (Triple t : model) {
BlankNodeOrIRI subj = t.getSubject();
RDFTerm obj = t.getObject();
IRI pred = t.getPredicate();
boolean match = false;
if (subj instanceof BlankNode) {
match = true;
IRI ru = blankNodeMap.get(subj);
if (ru == null) {
ru = createRandomUri();
blankNodeMap.put((BlankNode) subj, ru);
}
subj = ru;
}
if (obj instanceof BlankNode) {
match = true;
IRI ru = blankNodeMap.get(obj);
if (ru == null) {
ru = createRandomUri();
blankNodeMap.put((BlankNode) obj, ru);
}
obj = ru;
}
if (match) {
remove.add(t);
add.add(new TripleImpl(subj, pred, obj));
}
}
model.removeAll(remove);
model.addAll(add);
}
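A minimal usage sketch, assuming the Clerezza commons-rdf implementation classes (SimpleGraph, TripleImpl, PlainLiteralImpl) and a directly instantiable BlankNode:
Graph model = new SimpleGraph();
//a triple with a BlankNode subject
model.add(new TripleImpl(new BlankNode(),
        new IRI("http://example.org/prop"), new PlainLiteralImpl("value")));
ClerezzaRDFUtils.urifyBlankNodes(model);
//the BlankNode subject is now replaced by a random IRI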
use of org.apache.clerezza.commons.rdf.RDFTerm in project stanbol by apache.
the class RdfSerializingWriter method getObjectExpansionProperties.
private Set<IRI> getObjectExpansionProperties(GraphNode recipe) {
final MultivaluedMap<String, String> queryParams = uriInfo.getQueryParameters(true);
final List<String> paramValues = queryParams.get(OBJ_EXP_PARAM);
final Set<IRI> result = new HashSet<IRI>();
if (paramValues != null) {
for (String uriString : paramValues) {
result.add(new IRI(uriString));
}
}
if (recipe != null) {
Iterator<GraphNode> ingredients = recipe.getObjectNodes(RECIPES.ingredient);
while (ingredients.hasNext()) {
Iterator<RDFTerm> properties = ingredients.next().getObjects(RECIPES.ingredientProperty);
while (properties.hasNext()) {
result.add((IRI) properties.next());
}
}
}
return result;
}
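Note that the inner loop casts every RECIPES.ingredientProperty value to IRI and would fail with a ClassCastException on literal or blank node values. A defensive variant of that loop might look like this sketch (not the shipped code):
while (properties.hasNext()) {
    RDFTerm property = properties.next();
    //ingredientProperty values are expected to be IRIs; skip anything else
    if (property instanceof IRI) {
        result.add((IRI) property);
    }
}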
use of org.apache.clerezza.commons.rdf.RDFTerm in project stanbol by apache.
the class SuggestionFunction method processAnnotations.
/**
* Suggestions are selected for all annotations returned by the
* {@link #annotationSelector}.
* @param backend the RDF backend used to resolve values
* @param annotations suggestions are selected for the union of the passed
* annotations - the {limit} most linked entities for the passed
* list of annotations.
* @param limit the maximum number of suggestions for the passed collection
* of annotations.
* @param missingConfidenceMode how to handle suggestions without a confidence value
* @param result results are added to this list.
*/
private void processAnnotations(final RDFBackend<RDFTerm> backend, Collection<RDFTerm> annotations, Integer limit, final int missingConfidenceMode, List<RDFTerm> result) {
List<Entry<Double, RDFTerm>> suggestions = new ArrayList<Entry<Double, RDFTerm>>();
for (RDFTerm annotation : annotations) {
//NOTE: no Path Tracking support possible for selectors wrapped in functions
for (RDFTerm suggestion : suggestionSelector.select(backend, annotation, null, null)) {
Collection<RDFTerm> cs = confidenceSelector.select(backend, suggestion, null, null);
Double confidence;
if (!cs.isEmpty()) {
confidence = backend.doubleValue(cs.iterator().next());
} else if (missingConfidenceMode == MISSING_CONFIDENCE_FILTER) {
//no confidence value -> filter this suggestion
confidence = null;
} else {
//no confidence value -> sort it first (MAX) or last (MIN)
confidence = missingConfidenceMode == MISSING_CONFIDENCE_FIRST ? MAX : MIN;
}
if (confidence != null) {
suggestions.add(singletonMap(confidence, suggestion).entrySet().iterator().next());
}
}
}
Collections.sort(suggestions, SUGGESTION_COMPARATOR);
int resultSize = limit != null ? Math.min(limit, suggestions.size()) : suggestions.size();
for (Entry<Double, RDFTerm> suggestion : suggestions.subList(0, resultSize)) {
if (resultSelector == null) {
result.add(suggestion.getValue());
} else {
result.addAll(resultSelector.select(backend, suggestion.getValue(), null, null));
}
}
}
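The SUGGESTION_COMPARATOR constant is not shown in this snippet. For suggestions.subList(0, resultSize) to keep the best suggestions, it has to order entries by descending confidence, along the lines of the following sketch (an assumption based on the surrounding code, not the shipped definition):
private static final Comparator<Entry<Double, RDFTerm>> SUGGESTION_COMPARATOR =
        new Comparator<Entry<Double, RDFTerm>>() {
            @Override
            public int compare(Entry<Double, RDFTerm> e1, Entry<Double, RDFTerm> e2) {
                //sort by descending confidence value
                return e2.getKey().compareTo(e1.getKey());
            }
        };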