use of org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig in project stanbol by apache.
the class KeywordLinkingEngine method activateEntityLinkerConfig.
/**
 * Applies the entity linking related values of the parsed configuration to
 * the {@link EntityLinkerConfig} of this engine. The following properties
 * are read:<ul>
 * <li>{@link #NAME_FIELD}
 * <li>{@code CASE_SENSITIVE}
 * <li>{@link #TYPE_FIELD}
 * <li>{@link #REDIRECT_FIELD}
 * <li>{@link #REDIRECT_PROCESSING_MODE}
 * <li>{@link #MAX_SUGGESTIONS}
 * <li>{@link #MIN_SEARCH_TOKEN_LENGTH}
 * <li>{@link #MIN_FOUND_TOKENS}
 * <li>{@code DEFAULT_MATCHING_LANGUAGE}
 * <li>{@link #MIN_TOKEN_MATCH_FACTOR}
 * <li>{@code TYPE_MAPPINGS}
 * </ul>
 * A new {@link EntityLinkerConfig} instance is only created if
 * <code>{@link #linkerConfig} == null</code>. If an instance is already
 * present, values of keys missing in the parsed configuration are left
 * untouched. The only exception is {@link #MIN_TOKEN_MATCH_FACTOR}: a missing
 * (or negative) value sets
 * {@link EntityLinkerConfig#DEFAULT_MIN_TOKEN_MATCH_FACTOR}.
 * <p>
 * Invalid type mapping entries do not cause an exception; they are logged
 * as warnings and ignored.
 * @param configuration the configuration
 * @throws ConfigurationException In case of an illegal value in the parsed configuration.
 * Note that all properties are optional, therefore missing values will not
 * cause a ConfigurationException.
 */
protected void activateEntityLinkerConfig(Dictionary<String, Object> configuration) throws ConfigurationException {
    if (linkerConfig == null) {
        this.linkerConfig = new EntityLinkerConfig();
    }
    Object value;
    //init NAME_FIELD
    value = configuration.get(NAME_FIELD);
    if (value != null) {
        if (value.toString().isEmpty()) {
            throw new ConfigurationException(NAME_FIELD, "The configured name field MUST NOT be empty");
        }
        linkerConfig.setNameField(NamespaceMappingUtils.getConfiguredUri(nsPrefixService, NAME_FIELD, value.toString()));
    }
    //init case sensitivity (NULL or empty keeps the current state)
    value = configuration.get(CASE_SENSITIVE);
    if (value instanceof Boolean) {
        linkerConfig.setCaseSensitiveMatchingState((Boolean) value);
    } else if (value != null && !value.toString().isEmpty()) {
        linkerConfig.setCaseSensitiveMatchingState(Boolean.valueOf(value.toString()));
    }
    //init TYPE_FIELD
    value = configuration.get(TYPE_FIELD);
    if (value != null) {
        if (value.toString().isEmpty()) {
            throw new ConfigurationException(TYPE_FIELD, "The configured type field MUST NOT be empty");
        }
        linkerConfig.setTypeField(NamespaceMappingUtils.getConfiguredUri(nsPrefixService, TYPE_FIELD, value.toString()));
    }
    //init REDIRECT_FIELD
    value = configuration.get(REDIRECT_FIELD);
    if (value != null) {
        if (value.toString().isEmpty()) {
            //report the property that is actually misconfigured
            throw new ConfigurationException(REDIRECT_FIELD, "The configured redirect field MUST NOT be empty");
        }
        linkerConfig.setRedirectField(NamespaceMappingUtils.getConfiguredUri(nsPrefixService, REDIRECT_FIELD, value.toString()));
    }
    //init MAX_SUGGESTIONS
    Integer maxSuggestions = parsePositiveIntegerProperty(MAX_SUGGESTIONS, configuration.get(MAX_SUGGESTIONS));
    if (maxSuggestions != null) {
        linkerConfig.setMaxSuggestions(maxSuggestions);
    }
    //init MIN_FOUND_TOKENS
    Integer minFoundTokens = parsePositiveIntegerProperty(MIN_FOUND_TOKENS, configuration.get(MIN_FOUND_TOKENS));
    if (minFoundTokens != null) {
        linkerConfig.setMinFoundTokens(minFoundTokens);
    }
    //init MIN_SEARCH_TOKEN_LENGTH
    Integer minSearchTokenLength = parsePositiveIntegerProperty(MIN_SEARCH_TOKEN_LENGTH, configuration.get(MIN_SEARCH_TOKEN_LENGTH));
    if (minSearchTokenLength != null) {
        linkerConfig.setMinSearchTokenLength(minSearchTokenLength);
    }
    //init the REDIRECT_PROCESSING_MODE
    value = configuration.get(REDIRECT_PROCESSING_MODE);
    if (value != null) {
        try {
            linkerConfig.setRedirectProcessingMode(RedirectProcessingMode.valueOf(value.toString()));
        } catch (IllegalArgumentException e) {
            //keep the cause so the offending value shows up in the stack trace
            throw new ConfigurationException(REDIRECT_PROCESSING_MODE, "Values MUST be one of " + Arrays.toString(RedirectProcessingMode.values()), e);
        }
    }
    //init the DEFAULT_LANGUAGE
    value = configuration.get(DEFAULT_MATCHING_LANGUAGE);
    if (value != null) {
        String defaultLang = value.toString().trim();
        if (defaultLang.isEmpty()) {
            linkerConfig.setDefaultLanguage(null);
        } else if (defaultLang.length() == 1) {
            throw new ConfigurationException(DEFAULT_MATCHING_LANGUAGE, "Illegal language code '" + defaultLang + "'! Language Codes MUST BE at least 2 chars long.");
        } else {
            linkerConfig.setDefaultLanguage(defaultLang);
        }
    }
    //init MIN_TOKEN_MATCH_FACTOR
    value = configuration.get(MIN_TOKEN_MATCH_FACTOR);
    float minTokenMatchFactor;
    if (value instanceof Number) {
        minTokenMatchFactor = ((Number) value).floatValue();
    } else if (value != null) {
        try {
            minTokenMatchFactor = Float.valueOf(value.toString());
        } catch (NumberFormatException e) {
            throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR, "Unable to parse the minimum token match factor from the parsed value " + value, e);
        }
    } else {
        minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR;
    }
    //negative values select the default, regardless if parsed as Number or String
    if (minTokenMatchFactor < 0) {
        minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR;
    }
    //written as negated range check so that NaN is rejected as well
    if (!(minTokenMatchFactor > 0 && minTokenMatchFactor <= 1)) {
        throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR, "The minimum token match factor MUST be > 0 and <= 1 (negative values for the default)");
    }
    linkerConfig.setMinTokenMatchFactor(minTokenMatchFactor);
    //init type mappings
    value = configuration.get(TYPE_MAPPINGS);
    if (value instanceof String[]) {
        //support array
        value = Arrays.asList((String[]) value);
    } else if (value instanceof String) {
        //single value
        value = Collections.singleton(value);
    }
    if (value instanceof Collection<?>) {
        //and collection
        log.info("Init Type Mappings");
        //the usage hint is the same for every entry -> build it only once
        String usage = "useages: " + "a: '{uri}' short for {uri} > {uri} | " + "b: '{source1};{source2};..;{sourceN} > {target}'";
        for (Object o : (Collection<?>) value) {
            if (o == null) {
                continue;
            }
            String[] config = o.toString().split(">");
            //NOTE: String#split drops trailing empty strings, so ">" results
            //in an empty array
            if (config.length == 0 || config[0].trim().isEmpty()) {
                log.warn("Invalid Type Mapping Config '{}': Missing Source Type ({}) -> ignore this config", o, usage);
                continue;
            }
            String[] sourceTypes = config[0].split(";");
            if (sourceTypes.length == 0) {
                //the source part only consists of ';' separators
                log.warn("Invalid Type Mapping Config '{}': Missing Source Type ({}) -> ignore this config", o, usage);
                continue;
            }
            String targetType;
            if (config.length >= 2) {
                targetType = config[1].trim();
            } else if (sourceTypes.length == 1) {
                //short form: '{uri}' maps the type to itself
                targetType = sourceTypes[0].trim();
            } else {
                //multiple sources require an explicit target
                targetType = "";
            }
            if (targetType.isEmpty()) {
                log.warn("Invalid Type Mapping Config '{}': Missing Target Type ({}) -> ignore this config", o, usage);
                continue;
            }
            //support for ns:localName
            targetType = NamespaceMappingUtils.getConfiguredUri(nsPrefixService, TYPE_MAPPINGS, targetType);
            try {
                //validate
                new URI(targetType);
            } catch (URISyntaxException e) {
                log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this config", targetType, o);
                continue;
            }
            IRI targetUri = new IRI(targetType);
            for (String configuredSource : sourceTypes) {
                String sourceType = configuredSource.trim();
                if (sourceType.isEmpty()) {
                    continue;
                }
                //support for ns:localName
                sourceType = NamespaceMappingUtils.getConfiguredUri(nsPrefixService, TYPE_MAPPINGS, sourceType);
                try {
                    //validate
                    new URI(sourceType);
                } catch (URISyntaxException e) {
                    log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this source type", sourceType, o);
                    continue;
                }
                IRI old = linkerConfig.setTypeMapping(sourceType, targetUri);
                if (old == null) {
                    log.info(" > add type mapping {} > {}", sourceType, targetType);
                } else {
                    log.info(" > set type mapping {} > {} (old: {})", new Object[] { sourceType, targetType, old.getUnicodeString() });
                }
            }
        }
    } else {
        log.debug("No Type mappings configured");
    }
}

/**
 * Parses an optional configuration value that must be an integer &gt; 0.
 * @param property the name of the property (used for the exception)
 * @param value the configured value or <code>null</code> if not configured
 * @return the parsed value or <code>null</code> if the parsed value was <code>null</code>
 * @throws ConfigurationException if the value is not an integer or is lower than one
 */
private static Integer parsePositiveIntegerProperty(String property, Object value) throws ConfigurationException {
    if (value == null) {
        return null;
    }
    Integer parsed;
    if (value instanceof Integer) {
        parsed = (Integer) value;
    } else {
        try {
            parsed = Integer.valueOf(value.toString());
        } catch (NumberFormatException e) {
            throw new ConfigurationException(property, "Values MUST be valid Integer values > 0", e);
        }
    }
    if (parsed < 1) {
        throw new ConfigurationException(property, "Values MUST be valid Integer values > 0");
    }
    return parsed;
}
use of org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig in project stanbol by apache.
the class KeywordLinkingEngineTest method testTaxonomyLinker.
/**
 * This tests the EntityLinker functionality: every linked entity must be
 * one of the expected selections, its suggestions must match the expected
 * ids in rank order with non-increasing scores, and every expected
 * selection must have been linked exactly once.
 * @throws Exception
 */
@Test
public void testTaxonomyLinker() throws Exception {
    OpenNlpAnalysedContentFactory acf = OpenNlpAnalysedContentFactory.getInstance(openNLP, new TextAnalyzerConfig());
    EntityLinkerConfig config = new EntityLinkerConfig();
    config.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
    EntityLinker linker = new EntityLinker(acf.create(TEST_TEXT, "en"), searcher, config);
    linker.process();
    //selected text -> ids of the expected suggestions (ordered by rank)
    Map<String, List<String>> expectedResults = new HashMap<String, List<String>>();
    expectedResults.put("Patrick Marshall", new ArrayList<String>(Arrays.asList("urn:test:PatrickMarshall")));
    expectedResults.put("geologist", new ArrayList<String>(//the redirected entity
    Arrays.asList("urn:test:redirect:Geologist")));
    expectedResults.put("New Zealand", new ArrayList<String>(Arrays.asList("urn:test:NewZealand")));
    expectedResults.put("University of Otago", new ArrayList<String>(Arrays.asList("urn:test:UniversityOfOtago", "urn:test:UniversityOfOtago_Texas")));
    for (LinkedEntity linkedEntity : linker.getLinkedEntities().values()) {
        //remove so that a selection found twice fails the assertNotNull below
        List<String> expectedSuggestions = expectedResults.remove(linkedEntity.getSelectedText());
        assertNotNull("LinkedEntity " + linkedEntity.getSelectedText() + "is not an expected Result (or was found twice)", expectedSuggestions);
        //JUnit argument order is (message, expected, actual)
        assertEquals("Number of suggestions " + linkedEntity.getSuggestions().size() + " != number of expected suggestions " + expectedSuggestions.size() + "for selection " + linkedEntity.getSelectedText(), expectedSuggestions.size(), linkedEntity.getSuggestions().size());
        //the score of a suggestion must not exceed the score of the linked
        //entity nor the score of the suggestion ranked before it
        double score = linkedEntity.getScore();
        for (int i = 0; i < expectedSuggestions.size(); i++) {
            Suggestion suggestion = linkedEntity.getSuggestions().get(i);
            assertEquals("Expecced Suggestion at Rank " + i + " expected: " + expectedSuggestions.get(i) + " suggestion: " + suggestion.getRepresentation().getId(), expectedSuggestions.get(i), suggestion.getRepresentation().getId());
            assertTrue("Score of suggestion " + i + "(" + suggestion.getScore() + " > as of the previous one (" + score + ")", score >= suggestion.getScore());
            score = suggestion.getScore();
        }
    }
    //entries still present have not been linked at all
    assertTrue("Expected entities were not linked: " + expectedResults.keySet(), expectedResults.isEmpty());
}
use of org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig in project stanbol by apache.
the class KeywordLinkingEngineTest method testEngine.
/**
 * Runs the engine on the test text and checks that the enhancements it
 * creates conform to the rules defined for the Stanbol Enhancement
 * Structure.
 * @throws IOException
 * @throws EngineException
 */
@Test
public void testEngine() throws IOException, EngineException {
    EntityLinkerConfig config = new EntityLinkerConfig();
    config.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
    KeywordLinkingEngine linkingEngine = KeywordLinkingEngine.createInstance(
        openNLP, searcher, new TextAnalyzerConfig(), config);
    linkingEngine.referencedSiteName = TEST_REFERENCED_SITE_NAME;

    ContentItem contentItem = ciFactory.createContentItem(new StringSource(TEST_TEXT));
    //mark the content as English text for the engine
    contentItem.getMetadata().add(
        new TripleImpl(contentItem.getUri(), DC_LANGUAGE, new PlainLiteralImpl("en")));

    //let the engine create the enhancements
    linkingEngine.computeEnhancements(contentItem);

    //values every created annotation is validated against
    Map<IRI, RDFTerm> requiredValues = new HashMap<IRI, RDFTerm>();
    requiredValues.put(ENHANCER_EXTRACTED_FROM, contentItem.getUri());
    requiredValues.put(DC_CREATOR,
        LiteralFactory.getInstance().createTypedLiteral(linkingEngine.getClass().getName()));
    //a null value makes the confidence a required property
    requiredValues.put(Properties.ENHANCER_CONFIDENCE, null);

    //check the fise:TextAnnotations
    assertEquals("Four fise:TextAnnotations are expected by this Test", 4,
        validateAllTextAnnotations(contentItem.getMetadata(), TEST_TEXT, requiredValues));
    //check the fise:EntityAnnotations
    assertEquals("Five fise:EntityAnnotations are expected by this Test", 5,
        validateAllEntityAnnotations(contentItem, requiredValues));
}
Aggregations