use of org.apache.stanbol.enhancer.engines.entitylinking.Entity in project stanbol by apache.
the class EntityLinkingEngineTest method setUpServices.
/**
 * Builds the fixtures shared by all tests of this class: a {@link TestSearcherImpl}
 * holding nine test entities and the two analysed texts
 * ({@code TEST_ANALYSED_TEXT} and {@code TEST_ANALYSED_TEXT_WO}).<p>
 * All entities are described in one shared {@link Graph}; the triples of an
 * entity are always added before the entity is registered with the searcher.
 * @throws IOException declared for the creation of the analysed texts
 */
@BeforeClass
public static void setUpServices() throws IOException {
    searcher = new TestSearcherImpl(TEST_REFERENCED_SITE_NAME, NAME, new SimpleLabelTokenizer());

    // a single graph holds the data of every test entity
    Graph data = new IndexedGraph();
    IRI skosConcept = new IRI(NamespaceEnum.skos + "Concept");

    // a person
    IRI patrickMarshall = new IRI("urn:test:PatrickMarshall");
    data.add(new TripleImpl(patrickMarshall, NAME, new PlainLiteralImpl("Patrick Marshall")));
    data.add(new TripleImpl(patrickMarshall, TYPE, OntologicalClasses.DBPEDIA_PERSON));
    searcher.addEntity(new Entity(patrickMarshall, data));

    // a concept that redirects to an other entity
    IRI geologistRedirect = new IRI("urn:test:redirect:Geologist");
    IRI geologist = new IRI("urn:test:Geologist");
    data.add(new TripleImpl(geologist, NAME, new PlainLiteralImpl("Geologist")));
    data.add(new TripleImpl(geologist, TYPE, skosConcept));
    data.add(new TripleImpl(geologist, REDIRECT, geologistRedirect));
    searcher.addEntity(new Entity(geologist, data));

    // the target of the redirect
    data.add(new TripleImpl(geologistRedirect, NAME, new PlainLiteralImpl("Geologe (redirect)")));
    data.add(new TripleImpl(geologistRedirect, TYPE, skosConcept));
    searcher.addEntity(new Entity(geologistRedirect, data));

    IRI newZealand = new IRI("urn:test:NewZealand");
    data.add(new TripleImpl(newZealand, NAME, new PlainLiteralImpl("New Zealand")));
    data.add(new TripleImpl(newZealand, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(newZealand, data));

    IRI universityOfOtago = new IRI("urn:test:UniversityOfOtago");
    data.add(new TripleImpl(universityOfOtago, NAME, new PlainLiteralImpl("University of Otago")));
    data.add(new TripleImpl(universityOfOtago, TYPE, OntologicalClasses.DBPEDIA_ORGANISATION));
    searcher.addEntity(new Entity(universityOfOtago, data));

    IRI university = new IRI("urn:test:University");
    data.add(new TripleImpl(university, NAME, new PlainLiteralImpl("University")));
    data.add(new TripleImpl(university, TYPE, skosConcept));
    searcher.addEntity(new Entity(university, data));

    IRI otago = new IRI("urn:test:Otago");
    data.add(new TripleImpl(otago, NAME, new PlainLiteralImpl("Otago")));
    data.add(new TripleImpl(otago, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(otago, data));

    // a 2nd Otago (place and university); the place carries two labels
    IRI otagoTexas = new IRI("urn:test:Otago_Texas");
    data.add(new TripleImpl(otagoTexas, NAME, new PlainLiteralImpl("Otago (Texas)")));
    data.add(new TripleImpl(otagoTexas, NAME, new PlainLiteralImpl("Otago")));
    data.add(new TripleImpl(otagoTexas, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(otagoTexas, data));

    IRI universityOfOtagoTexas = new IRI("urn:test:UniversityOfOtago_Texas");
    data.add(new TripleImpl(universityOfOtagoTexas, NAME, new PlainLiteralImpl("University of Otago (Texas)")));
    data.add(new TripleImpl(universityOfOtagoTexas, TYPE, OntologicalClasses.DBPEDIA_ORGANISATION));
    searcher.addEntity(new Entity(universityOfOtagoTexas, data));

    // create both analysed texts before any of them is initialised
    AnalysedTextFactory textFactory = AnalysedTextFactory.getDefaultInstance();
    TEST_ANALYSED_TEXT = textFactory.createAnalysedText(
        ciFactory.createBlob(new StringSource(TEST_TEXT)));
    TEST_ANALYSED_TEXT_WO = textFactory.createAnalysedText(
        ciFactory.createBlob(new StringSource(TEST_TEXT_WO)));

    // noun phrase "Dr. Patrick Marshall" with two proper noun tokens
    initAnalyzedText(TEST_ANALYSED_TEXT);
    TEST_ANALYSED_TEXT.addChunk(0, "Dr. Patrick Marshall".length())
        .addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
    TEST_ANALYSED_TEXT.addToken(4, 11)
        .addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    TEST_ANALYSED_TEXT.addToken(12, 20)
        .addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));

    // same phrase with the two names swapped: "Dr. Marshall Patrick"
    initAnalyzedText(TEST_ANALYSED_TEXT_WO);
    TEST_ANALYSED_TEXT_WO.addChunk(0, "Dr. Marshall Patrick".length())
        .addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
    TEST_ANALYSED_TEXT_WO.addToken(4, 12)
        .addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    TEST_ANALYSED_TEXT_WO.addToken(13, 20)
        .addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
}
use of org.apache.stanbol.enhancer.engines.entitylinking.Entity in project stanbol by apache.
the class EntityLinker method lookupEntities.
/**
 * Looks up Entities for the parsed tokens by using the {@link #entitySearcher}
 * and matches them against the current {@link #state position} in the text.<p>
 * Unprocessed query results are stored in the {@code lookupCache} under the
 * list of search strings, so that the same query is not issued twice. If a
 * query for several search strings returned results but none of them was
 * accepted as suggestion, a second query for the active token only is made.
 * @param searchTokens the list of {@link Token#getText() words} to search
 * entities for.
 * @return The sorted list with the suggestions.
 * If there are no suggestions an empty list will be returned.
 * @throws EntitySearcherException
 */
private List<Suggestion> lookupEntities(List<TokenData> searchTokens) throws EntitySearcherException {
    // languages to search in: the configured default, the language of the text
    // and - for codes with a country part such as "en-GB" - the part before the '-'
    Set<String> languages = new HashSet<String>();
    languages.add(linkerConfig.getDefaultLanguage());
    String textLanguage = state.getLanguage();
    languages.add(textLanguage);
    if (textLanguage != null) {
        int countryCodeIndex = textLanguage.indexOf('-');
        if (countryCodeIndex >= 2) {
            languages.add(textLanguage.substring(0, countryCodeIndex));
        }
    }

    List<String> searchStrings = new ArrayList<String>(searchTokens.size());
    for (TokenData token : searchTokens) {
        searchStrings.add(getSearchString(token));
    }
    String[] languageArray = languages.toArray(new String[languages.size()]);
    List<Suggestion> suggestions = new ArrayList<Suggestion>();

    List<Entity> cached = lookupCache.get(searchStrings);
    if (cached == null) {
        // not cached -> query the searcher and remember the unprocessed results
        List<Entity> queried = new ArrayList<Entity>();
        int numResults = performLookup(searchStrings, languageArray, suggestions, searchTokens, queried);
        lookupCache.put(searchStrings, queried);

        // results but no match for multiple search terms: the terms are most
        // likely unrelated, so retry with the active token only
        if (suggestions.isEmpty() && numResults > 0 && searchStrings.size() > 1) {
            log.debug(" > No match for '{}' searchStrings ... ", searchStrings);
            TokenData activeToken = state.getToken();
            List<String> fallbackStrings = Collections.singletonList(getSearchString(activeToken));
            List<TokenData> fallbackTokens = Collections.singletonList(activeToken);
            List<Entity> fallbackCached = lookupCache.get(fallbackStrings);
            if (fallbackCached == null) {
                List<Entity> fallbackQueried = new ArrayList<Entity>();
                log.debug(" ... fallback to search for active token '{}' ...", fallbackStrings);
                performLookup(fallbackStrings, languageArray, suggestions, fallbackTokens, fallbackQueried);
                // cache the results of the fall-back query
                lookupCache.put(fallbackStrings, fallbackQueried);
            } else {
                // fall-back query is cached -> only match the cached results
                cacheHits++;
                for (Entity entity : fallbackCached) {
                    processLookupResult(fallbackTokens, entity, suggestions);
                }
            }
        }
    } else {
        // query is cached -> only match the cached results
        cacheHits++;
        for (Entity entity : cached) {
            processLookupResult(searchTokens, entity, suggestions);
        }
    }

    // sorting is only needed for two or more suggestions
    if (suggestions.size() > 1) {
        Collections.sort(suggestions, Suggestion.MATCH_TYPE_SUGGESTION_COMPARATOR);
    }
    return suggestions;
}
use of org.apache.stanbol.enhancer.engines.entitylinking.Entity in project stanbol by apache.
the class EntityLinker method processRedirects.
/**
 * Processes {@link EntitySearcher#getRedirectField() redirect field} values for
 * the parsed suggestion based on the {@link RedirectProcessingMode}
 * as configured in the {@code linkerConfig}.<p>
 * <ul>
 * <li>{@code IGNORE}: returns without doing anything.
 * <li>{@code ADD_VALUES}: for every redirect that can be retrieved from the
 * {@link #entitySearcher}, the triples having the redirected entity as subject
 * are copied to the data of the suggested entity (with the suggested entity
 * as subject). The suggestion is marked as processed as soon as a non-null
 * redirect value was encountered.
 * <li>{@code FOLLOW}: {@link Suggestion#setRedirect(Entity)} is called for
 * every redirect that can be retrieved from the {@link #entitySearcher}.
 * </ul>
 * The results of this method are stored within the parsed {@link Suggestion}.
 * @param suggestion The suggestion to process.
 * @throws EntitySearcherException
 */
private void processRedirects(Suggestion suggestion) throws EntitySearcherException {
    //if mode is IGNORE -> nothing to do
    if (linkerConfig.getRedirectProcessingMode() == RedirectProcessingMode.IGNORE) {
        return;
    }
    //the suggestion keeps a small internal state telling whether its
    //redirects were already processed -> if so there is nothing to do
    if (suggestion.isRedirectedProcessed()) {
        return;
    }
    Entity result = suggestion.getResult();
    Iterator<IRI> redirects = result.getReferences(linkerConfig.getRedirectField());
    switch(linkerConfig.getRedirectProcessingMode()) {
        case ADD_VALUES:
            Graph entityData = result.getData();
            IRI entityUri = result.getUri();
            // NOTE(review): triples are added to the entity data while the
            // redirect values of the same entity are iterated - confirm that
            // getReferences(..) does not return a live view of that graph
            while (redirects.hasNext()) {
                IRI redirect = redirects.next();
                if (redirect != null) {
                    Entity redirectedEntity = entitySearcher.get(redirect, linkerConfig.getSelectedFields());
                    if (redirectedEntity != null) {
                        for (Iterator<Triple> data = redirectedEntity.getData().filter(redirectedEntity.getUri(), null, null); data.hasNext(); ) {
                            Triple t = data.next();
                            entityData.add(new TripleImpl(entityUri, t.getPredicate(), t.getObject()));
                        }
                    }
                    //set that the redirects where searched for this result
                    suggestion.setRedirectProcessed(true);
                }
            }
            //explicit break: do not fall through into the FOLLOW handling
            break;
        case FOLLOW:
            while (redirects.hasNext()) {
                IRI redirect = redirects.next();
                if (redirect != null) {
                    Entity redirectedEntity = entitySearcher.get(redirect, linkerConfig.getSelectedFields());
                    if (redirectedEntity != null) {
                        suggestion.setRedirect(redirectedEntity);
                    }
                }
            }
            break;
        default:
            //nothing to do
            break;
    }
}
use of org.apache.stanbol.enhancer.engines.entitylinking.Entity in project stanbol by apache.
the class EntityLinker method performLookup.
/**
 * Queries the {@link #entitySearcher} page by page and matches the returned
 * entities against the current position in the text.<p>
 * Requests are repeated until enough suggestions are found, the searcher
 * returns less results than requested, enough unfiltered results were
 * processed or the upper limit of requested results is reached.
 * @param searchStrings the strings to search for
 * @param languageArray the languages to search in
 * @param suggestions the list the suggestions are added to
 * @param searchTokens the tokens the results are matched against. The size
 * of this list is also used to calculate the number of results per request
 * @param queryResults the unprocessed results of the query for the parsed
 * parameters. This is used to cache results of queries. This avoids issuing
 * the same query twice for an analysed document. Entities removed by the
 * entity type filter are not added.
 * @return the number of entities returned by the searcher (including the
 * entities removed by the entity type filter)
 * @throws EntitySearcherException
 */
private int performLookup(List<String> searchStrings, String[] languageArray, List<Suggestion> suggestions, List<TokenData> searchTokens, List<Entity> queryResults) throws EntitySearcherException {
    int processedTarget = linkerConfig.getMaxSuggestions() * 3;
    int pageSize = Math.max(MIN_SEARCH_LIMIT, linkerConfig.getMaxSuggestions() * 2 * searchTokens.size());
    int resultCap = pageSize * 2;
    // number of results received so far; also the offset of the next request
    int received = 0;
    int filtered = 0;
    boolean morePages = true;
    while (morePages
            && suggestions.size() < linkerConfig.getMaxSuggestions()
            && (received - filtered) < processedTarget
            && received < resultCap) {
        log.debug(" > request entities [{}-{}] entities ...", received, (received + pageSize));
        //keep statistics
        lookupStats.begin();
        Collection<? extends Entity> page = entitySearcher.lookup(linkerConfig.getNameField(),
            linkerConfig.getSelectedFields(), searchStrings, languageArray, pageSize, received);
        lookupStats.complete();
        int pageCount = page.size();
        log.debug(" < found {} entities ...", pageCount);
        //queries might return more than the requested results
        morePages = pageCount >= pageSize;
        received += pageCount;

        matchingStats.begin();
        for (Entity candidate : page) {
            if (log.isDebugEnabled()) {
                log.debug(" > {} (ranking: {})", candidate.getId(), candidate.getEntityRanking());
            }
            numQueryResults++;
            //white/black list based entity type filtering (STANBOL-1111)
            boolean rejected = linkerConfig.isEntityTypeFilteringActive()
                && filterEntity(candidate.getReferences(linkerConfig.getTypeField()));
            if (rejected) {
                //do not process Entities with a filtered type,
                //but update the global and the local statistics
                numFilteredResults++;
                filtered++;
                continue;
            }
            //a valid query result -> remember it for caching and match it
            //against the current position in the text
            queryResults.add(candidate);
            processLookupResult(searchTokens, candidate, suggestions);
        }
        matchingStats.complete();
    }
    return received;
}
use of org.apache.stanbol.enhancer.engines.entitylinking.Entity in project stanbol by apache.
the class Suggestion method getBestLabel.
/**
 * Getter for the best label in the given language.<p>
 * Starts with the {@link Suggestion#getMatchedLabel() matched label} (or, if
 * there is none, the first label of the entity) and prefers labels whose
 * language starts with the parsed language. The search stops as soon as a
 * label in that language has the same lexical form (ignoring case) as the
 * matched label.
 * @param nameField the field used to search for labels
 * @param language the language. If <code>null</code> no language specific
 * label is selected
 * @return the best match or {@link Suggestion#getMatchedLabel()} if none is
 * found. <code>null</code> if there is no matched label and the entity has
 * no value for the parsed field
 */
public Literal getBestLabel(IRI nameField, String language) {
    Entity rep = getEntity();
    //start with the matched label -> so if we do not find a better one
    //we will use the matched!
    Literal matchedLabel = getMatchedLabel();
    Literal label = matchedLabel;
    Iterator<Literal> labels = rep.getText(nameField);
    boolean matchFound = false;
    while (labels.hasNext() && !matchFound) {
        Literal actLabel = labels.next();
        if (label == null) {
            //no matched label -> fall back to the first label of the entity
            label = actLabel;
        }
        //now we have already a label check the language
        Language actLang = actLabel.getLanguage();
        //use startWith to match also en-GB and en-US ...
        //the null check on language avoids a NullPointerException in
        //startsWith(..) for texts with an unknown language
        if (language != null && actLang != null && actLang.toString().startsWith(language)) {
            //prefer labels with the correct language
            label = actLabel;
            if (matchedLabel != null && matchedLabel.getLexicalForm().equalsIgnoreCase(label.getLexicalForm())) {
                //found label in that language that exactly matches the
                //label used to match the text
                matchFound = true;
            }
        }
    }
    return label;
}
Aggregations