Usage of org.apache.clerezza.commons.rdf.IRI in the Apache Stanbol project.
Class FstLinkingEngine, method getDcTypes:
/**
 * Retrieves all {@link EntitySearcher#getEncodedTypeField()} values of the parsed
 * {@link Suggestion}s and than lookup the {@link NamespaceEnum#dcTerms dc}:type
 * values for the {@link LinkedEntity#getTypes()} by using the configured
 * {@link EntityLinkerConfig#getTypeMappings() types mappings} (and if
 * no mapping is found the {@link EntityLinkerConfig#getDefaultDcType()
 * default} type).
 * @param matches the suggestions for a tag, expected to be sorted by
 * descending score; MAY be <code>null</code> or empty
 * @return the types values for the {@link LinkedEntity} (never <code>null</code>)
 */
private Set<IRI> getDcTypes(List<Match> matches) {
    if (matches == null || matches.isEmpty()) {
        return Collections.emptySet();
    }
    // only consider types of the best ranked Entities: collect types of all
    // Matches sharing the top score and stop at the first lower score
    Collection<IRI> conceptTypes = new HashSet<IRI>();
    double score = -1;
    for (Match match : matches) {
        double actScore = match.getScore();
        if (actScore < score) {
            break;
        }
        score = actScore;
        for (IRI type : match.getTypes()) {
            conceptTypes.add(type);
        }
    }
    // map the collected concept types to dc:type values via the configured mappings
    Map<IRI, IRI> typeMappings = elConfig.getTypeMappings();
    Set<IRI> dcTypes = new HashSet<IRI>();
    for (IRI conceptType : conceptTypes) {
        IRI dcType = typeMappings.get(conceptType);
        if (dcType != null) {
            dcTypes.add(dcType);
        }
    }
    // fall back to the configured default dc:type if no mapping applied
    if (dcTypes.isEmpty() && elConfig.getDefaultDcType() != null) {
        dcTypes.add(elConfig.getDefaultDcType());
    }
    return dcTypes;
}
Usage of org.apache.clerezza.commons.rdf.IRI in the Apache Stanbol project.
Class FstLinkingEngine, method writeEnhancements:
/**
* Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
* extracted from the parsed ContentItem
* @param ci
* @param tags
* @param language
*/
private void writeEnhancements(ContentItem ci, String text, Collection<Tag> tags, String language, boolean writeRankings) {
Language languageObject = null;
if (language != null && !language.isEmpty()) {
languageObject = new Language(language);
}
Graph metadata = ci.getMetadata();
for (Tag tag : tags) {
Collection<IRI> textAnnotations = new ArrayList<IRI>(tags.size());
// first create the TextAnnotations for the Occurrences
Literal startLiteral = literalFactory.createTypedLiteral(tag.getStart());
Literal endLiteral = literalFactory.createTypedLiteral(tag.getEnd());
// search for existing text annotation
Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
IRI textAnnotation = null;
while (it.hasNext()) {
Triple t = it.next();
if (metadata.filter(t.getSubject(), ENHANCER_END, endLiteral).hasNext() && metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
textAnnotation = (IRI) t.getSubject();
break;
}
}
if (textAnnotation == null) {
// not found ... create a new one
textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(text, tag.getAnchor(), tag.getStart()), languageObject)));
metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(tag.getAnchor(), languageObject)));
metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(tag.getScore())));
} else {
// if existing add this engine as contributor
metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
}
// add dc:types (even to existing)
for (IRI dcType : getDcTypes(tag.getSuggestions())) {
metadata.add(new TripleImpl(textAnnotation, Properties.DC_TYPE, dcType));
}
textAnnotations.add(textAnnotation);
// now the EntityAnnotations for the Suggestions
for (Match match : tag.getSuggestions()) {
IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
// should we use the label used for the match, or search the
// representation for the best label ... currently its the matched one
metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_LABEL, match.getMatchLabel()));
metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, new IRI(match.getUri())));
for (IRI type : match.getTypes()) {
metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_TYPE, type));
}
metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(match.getScore())));
// add the relation to the fise:TextAnnotation (the tag)
metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation));
// write origin information
if (indexConfig.getOrigin() != null) {
metadata.add(new TripleImpl(entityAnnotation, FISE_ORIGIN, indexConfig.getOrigin()));
}
// }
if (writeRankings) {
Double ranking = match.getRanking();
if (ranking != null) {
metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_RANKING, literalFactory.createTypedLiteral(ranking)));
}
}
// TODO: dereferencing
// if(linkerConfig.isDereferenceEntitiesEnabled() &&
// dereferencedEntitis.add(entity.getUri())){ //not yet dereferenced
// //add all outgoing triples for this entity
// //NOTE: do not add all triples as there might be other data in the graph
// for(Iterator<Triple> triples = entity.getData().filter(entity.getUri(), null, null);
// triples.hasNext();metadata.add(triples.next()));
// }
}
}
}
Usage of org.apache.clerezza.commons.rdf.IRI in the Apache Stanbol project.
Class LanguageDetectionEnhancementEngine, method computeEnhancements:
/**
 * Detects the language(s) of the plain text content of the parsed ContentItem
 * and writes a fise:TextAnnotation (dc:language + confidence) for each of the
 * top {@code maxSuggestedLanguages} hypotheses.
 * @param ci the content item to enhance
 * @throws EngineException if the language identifier fails with an error code
 * other than "no text" (0) or "can't detect" (5)
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    String text = "";
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    // do not call trim() on long texts to check if the text is empty
    if (text.length() < 50 && text.trim().length() == 0) {
        log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
        return;
    }
    // truncate text to some piece from the middle if probeLength > 0
    int checkLength = probeLength;
    if (checkLength > 0 && text.length() > checkLength) {
        text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
    }
    List<Language> languages = null;
    try {
        languages = languageIdentifier.getLanguages(text);
        log.debug("language identified: {}", languages);
    } catch (LangDetectException e) {
        Enum<?> errorCode = e.getCode();
        // ignore " 0 - NoTextError" and "5 - CantDetectError"
        if (errorCode.ordinal() != 0 && errorCode.ordinal() != 5) {
            StringBuilder msg = new StringBuilder("Could not identify language of text: ");
            if (text.length() < 200) {
                msg.append(text);
            } else {
                msg.append(text.subSequence(0, 199)).append("...");
            }
            msg.append(" (Error Code: ").append(errorCode.ordinal()).append(" - ").append(errorCode.name()).append(")");
            throw new EngineException(this, ci, msg.toString(), e);
        } else {
            log.debug("No text to detect the language from present in ContentItem {}", ci);
        }
    }
    // add language to metadata (requires a write lock on the ContentItem)
    if (languages != null) {
        Graph g = ci.getMetadata();
        ci.getLock().writeLock().lock();
        try {
            for (int i = 0; i < maxSuggestedLanguages && i < languages.size(); i++) {
                // add a hypothesis
                Language hypothesis = languages.get(i);
                IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(hypothesis.lang)));
                // NOTE: the confidence triple was previously added twice per
                // hypothesis - write it only once
                g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
                g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
Usage of org.apache.clerezza.commons.rdf.IRI in the Apache Stanbol project.
Class KeywordLinkingEngine, method activateEntityLinkerConfig:
/**
 * Configures the parsed {@link EntityLinkerConfig} with the values of the
 * following properties:<ul>
 * <li>{@link #NAME_FIELD}
 * <li>{@link #TYPE_FIELD}
 * <li>{@link #REDIRECT_FIELD}
 * <li>{@link #REDIRECT_PROCESSING_MODE}
 * <li>{@link #MAX_SUGGESTIONS}
 * <li>{@link #MIN_SEARCH_TOKEN_LENGTH}
 * <li>{@link #MIN_FOUND_TOKENS}
 * <li>{@link #MIN_TOKEN_MATCH_FACTOR}
 * </ul>
 * This Method creates a new {@link EntityLinkerConfig} instance only if
 * <code>{@link #linkerConfig} == null</code>. If the instance is already
 * initialised all current values for keys missing in the parsed configuration
 * are preserved.
 * @param configuration the configuration
 * @throws ConfigurationException In case of an illegal value in the parsed configuration.
 * Note that all configurations are assumed as optional, therefore missing values will not
 * cause a ConfigurationException.
 */
protected void activateEntityLinkerConfig(Dictionary<String, Object> configuration) throws ConfigurationException {
    if (linkerConfig == null) {
        this.linkerConfig = new EntityLinkerConfig();
    }
    Object value;
    // init NAME_FIELD
    value = configuration.get(NAME_FIELD);
    if (value != null) {
        if (value.toString().isEmpty()) {
            throw new ConfigurationException(NAME_FIELD, "The configured name field MUST NOT be empty");
        }
        linkerConfig.setNameField(NamespaceMappingUtils.getConfiguredUri(nsPrefixService, NAME_FIELD, value.toString()));
    }
    // init case sensitivity (if NULL or empty the default is used)
    value = configuration.get(CASE_SENSITIVE);
    if (value instanceof Boolean) {
        linkerConfig.setCaseSensitiveMatchingState((Boolean) value);
    } else if (value != null && !value.toString().isEmpty()) {
        linkerConfig.setCaseSensitiveMatchingState(Boolean.valueOf(value.toString()));
    }
    // init TYPE_FIELD
    value = configuration.get(TYPE_FIELD);
    if (value != null) {
        if (value.toString().isEmpty()) {
            // message corrected: this validates the type field (was "name field")
            throw new ConfigurationException(TYPE_FIELD, "The configured type field MUST NOT be empty");
        }
        linkerConfig.setTypeField(NamespaceMappingUtils.getConfiguredUri(nsPrefixService, TYPE_FIELD, value.toString()));
    }
    // init REDIRECT_FIELD
    value = configuration.get(REDIRECT_FIELD);
    if (value != null) {
        if (value.toString().isEmpty()) {
            // copy/paste error corrected: report REDIRECT_FIELD (was NAME_FIELD)
            throw new ConfigurationException(REDIRECT_FIELD, "The configured redirect field MUST NOT be empty");
        }
        linkerConfig.setRedirectField(NamespaceMappingUtils.getConfiguredUri(nsPrefixService, REDIRECT_FIELD, value.toString()));
    }
    // init MAX_SUGGESTIONS, MIN_FOUND_TOKENS and MIN_SEARCH_TOKEN_LENGTH
    // (all three are optional positive Integer values with identical parsing rules)
    Integer maxSuggestions = parsePositiveInteger(MAX_SUGGESTIONS, configuration.get(MAX_SUGGESTIONS));
    if (maxSuggestions != null) {
        linkerConfig.setMaxSuggestions(maxSuggestions);
    }
    Integer minFoundTokens = parsePositiveInteger(MIN_FOUND_TOKENS, configuration.get(MIN_FOUND_TOKENS));
    if (minFoundTokens != null) {
        linkerConfig.setMinFoundTokens(minFoundTokens);
    }
    Integer minSearchTokenLength = parsePositiveInteger(MIN_SEARCH_TOKEN_LENGTH, configuration.get(MIN_SEARCH_TOKEN_LENGTH));
    if (minSearchTokenLength != null) {
        linkerConfig.setMinSearchTokenLength(minSearchTokenLength);
    }
    // init the REDIRECT_PROCESSING_MODE
    value = configuration.get(REDIRECT_PROCESSING_MODE);
    if (value != null) {
        try {
            linkerConfig.setRedirectProcessingMode(RedirectProcessingMode.valueOf(value.toString()));
        } catch (IllegalArgumentException e) {
            throw new ConfigurationException(REDIRECT_PROCESSING_MODE, "Values MUST be one of " + Arrays.toString(RedirectProcessingMode.values()));
        }
    }
    // init the DEFAULT_LANGUAGE
    value = configuration.get(DEFAULT_MATCHING_LANGUAGE);
    if (value != null) {
        String defaultLang = value.toString().trim();
        if (defaultLang.isEmpty()) {
            linkerConfig.setDefaultLanguage(null);
        } else if (defaultLang.length() == 1) {
            throw new ConfigurationException(DEFAULT_MATCHING_LANGUAGE, "Illegal language code '" + defaultLang + "'! Language Codes MUST BE at least 2 chars long.");
        } else {
            linkerConfig.setDefaultLanguage(defaultLang);
        }
    }
    // init MIN_TOKEN_MATCH_FACTOR (negative values select the default)
    value = configuration.get(MIN_TOKEN_MATCH_FACTOR);
    float minTokenMatchFactor;
    if (value instanceof Number) {
        minTokenMatchFactor = ((Number) value).floatValue();
    } else if (value != null) {
        try {
            minTokenMatchFactor = Float.valueOf(value.toString());
        } catch (NumberFormatException e) {
            throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR, "Unable to parse the minimum token match factor from the parsed value " + value, e);
        }
        if (minTokenMatchFactor < 0) {
            minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR;
        }
    } else {
        minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR;
    }
    if (minTokenMatchFactor == 0 || minTokenMatchFactor > 1) {
        throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR, "The minimum token match factor MUST be > 0 and <= 1 (negative values for the default)");
    }
    linkerConfig.setMinTokenMatchFactor(minTokenMatchFactor);
    // init type mappings
    value = configuration.get(TYPE_MAPPINGS);
    if (value instanceof String[]) {
        // support array
        value = Arrays.asList((String[]) value);
    } else if (value instanceof String) {
        // single value
        value = Collections.singleton(value);
    }
    if (value instanceof Collection<?>) {
        // and collection
        log.info("Init Type Mappings");
        configs: for (Object o : (Iterable<?>) value) {
            if (o != null) {
                StringBuilder usage = new StringBuilder("useages: ");
                usage.append("a: '{uri}' short for {uri} > {uri} | ");
                usage.append("b: '{source1};{source2};..;{sourceN} > {target}'");
                String[] config = o.toString().split(">");
                if (config[0].isEmpty()) {
                    log.warn("Invalid Type Mapping Config '{}': Missing Source Type ({}) -> ignore this config", o, usage);
                    continue configs;
                }
                String[] sourceTypes = config[0].split(";");
                if (sourceTypes.length > 1 && (config.length < 2 || config[1].isEmpty())) {
                    // placeholder count corrected (was 3 placeholders for 2 arguments)
                    log.warn("Invalid Type Mapping Config '{}': Missing Target Type ({}) -> ignore this config", o, usage);
                    continue configs;
                }
                String targetType = config.length < 2 ? sourceTypes[0] : config[1];
                targetType = NamespaceMappingUtils.getConfiguredUri(nsPrefixService, TYPE_MAPPINGS, // support for ns:localName
                targetType.trim());
                try {
                    // validate
                    new URI(targetType);
                } catch (URISyntaxException e) {
                    // log the invalid target type (was wrongly logging sourceTypes[0])
                    log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this config", targetType, o);
                    continue configs;
                }
                IRI targetUri = new IRI(targetType);
                for (String sourceType : sourceTypes) {
                    if (!sourceType.isEmpty()) {
                        sourceType = NamespaceMappingUtils.getConfiguredUri(nsPrefixService, TYPE_MAPPINGS, // support for ns:localName
                        sourceType.trim());
                        try {
                            // validate
                            new URI(sourceType);
                            IRI old = linkerConfig.setTypeMapping(sourceType, targetUri);
                            if (old == null) {
                                log.info(" > add type mapping {} > {}", sourceType, targetType);
                            } else {
                                log.info(" > set type mapping {} > {} (old: {})", new Object[] { sourceType, targetType, old.getUnicodeString() });
                            }
                        } catch (URISyntaxException e) {
                            // log the failing source type (was always logging sourceTypes[0])
                            log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this source type", sourceType, o);
                        }
                    }
                }
            }
        }
    } else {
        log.debug("No Type mappings configured");
    }
}

/**
 * Parses an optional positive Integer configuration value.
 * @param property the configuration property key (used for error reporting)
 * @param value the raw configured value or <code>null</code> if not configured
 * @return the parsed Integer or <code>null</code> if <code>value</code> was
 * <code>null</code>
 * @throws ConfigurationException if the value is not a valid Integer &gt; 0
 */
private static Integer parsePositiveInteger(String property, Object value) throws ConfigurationException {
    if (value == null) {
        return null;
    }
    Integer parsed;
    if (value instanceof Integer) {
        parsed = (Integer) value;
    } else {
        try {
            parsed = Integer.valueOf(value.toString());
        } catch (NumberFormatException e) {
            throw new ConfigurationException(property, "Values MUST be valid Integer values > 0", e);
        }
    }
    if (parsed < 1) {
        throw new ConfigurationException(property, "Values MUST be valid Integer values > 0");
    }
    return parsed;
}
Usage of org.apache.clerezza.commons.rdf.IRI in the Apache Stanbol project.
Class KeywordLinkingEngine, method computeEnhancements:
/**
 * Extracts the text of the parsed ContentItem, determines its language and -
 * if the language is configured as processable - runs the {@link EntityLinker}
 * and writes the resulting enhancements (under a write lock).
 * @param ci the content item to enhance
 * @throws EngineException if entity lookup is not possible in offline mode or
 * the linking process fails
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    if (isOfflineMode() && !entitySearcher.supportsOfflineMode()) {
        throw new EngineException("Offline mode is not supported by the Component used to lookup Entities");
    }
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        // message concatenation corrected: the original fragments were joined
        // without separating spaces ("Typefound", "wasNOT")
        throw new IllegalStateException("No ContentPart with a supported Mime Type " + "found for ContentItem " + ci.getUri() + " (supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was " + "NOT called and indicates a bug in the used EnhancementJobManager!");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(String.format("Unable to extract " + "text from ContentPart %s of ContentItem %s!", contentPart.getKey(), ci.getUri()), e);
    }
    if (text.trim().length() == 0) {
        // TODO: make the length of the data a field of the ContentItem
        // interface to be able to filter out empty items in the canEnhance
        // method
        // placeholder added for the second argument (was 1 placeholder, 2 args)
        log.warn("ContentPart {} of ContentItem {} does not contain any Text to extract knowledge from", contentPart.getKey(), ci);
        return;
    }
    // Determine the language (requires a read lock on the ContentItem)
    String language;
    ci.getLock().readLock().lock();
    try {
        language = extractLanguage(ci);
    } finally {
        ci.getLock().readLock().unlock();
    }
    if (isProcessableLanguages(language)) {
        log.debug("computeEnhancements for ContentItem {} language {} text={}", new Object[] { ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(text, 100) });
        EntityLinker entityLinker = new EntityLinker(analysedContentFactory.create(text, language), entitySearcher, linkerConfig);
        // process
        entityLinker.process();
        // write results (requires a write lock)
        ci.getLock().writeLock().lock();
        try {
            writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language);
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } else {
        // missing space between the concatenated fragments corrected ("tobe")
        log.debug("ignore ContentItem {} because language '{}' is not configured to " + "be processed by this engine.", ci.getUri().getUnicodeString(), language);
    }
}
Aggregations