
Example 6 with Entity

use of org.apache.stanbol.enhancer.engines.entitylinking.Entity in project stanbol by apache.

the class EntityLinkingEngineTest method setUpServices.

@BeforeClass
public static void setUpServices() throws IOException {
    searcher = new TestSearcherImpl(TEST_REFERENCED_SITE_NAME, NAME, new SimpleLabelTokenizer());
    //add some terms to the searcher
    Graph graph = new IndexedGraph();
    IRI uri = new IRI("urn:test:PatrickMarshall");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Patrick Marshall")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_PERSON));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:Geologist");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Geologist")));
    graph.add(new TripleImpl(uri, TYPE, new IRI(NamespaceEnum.skos + "Concept")));
    graph.add(new TripleImpl(uri, REDIRECT, new IRI("urn:test:redirect:Geologist")));
    searcher.addEntity(new Entity(uri, graph));
    //a redirect
    uri = new IRI("urn:test:redirect:Geologist");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Geologe (redirect)")));
    graph.add(new TripleImpl(uri, TYPE, new IRI(NamespaceEnum.skos + "Concept")));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:NewZealand");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("New Zealand")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:UniversityOfOtago");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("University of Otago")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_ORGANISATION));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:University");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("University")));
    graph.add(new TripleImpl(uri, TYPE, new IRI(NamespaceEnum.skos + "Concept")));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:Otago");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Otago")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(uri, graph));
    //add a 2nd Otago (Place and University)
    uri = new IRI("urn:test:Otago_Texas");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Otago (Texas)")));
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Otago")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:UniversityOfOtago_Texas");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("University of Otago (Texas)")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_ORGANISATION));
    searcher.addEntity(new Entity(uri, graph));
    TEST_ANALYSED_TEXT = AnalysedTextFactory.getDefaultInstance().createAnalysedText(ciFactory.createBlob(new StringSource(TEST_TEXT)));
    TEST_ANALYSED_TEXT_WO = AnalysedTextFactory.getDefaultInstance().createAnalysedText(ciFactory.createBlob(new StringSource(TEST_TEXT_WO)));
    initAnalyzedText(TEST_ANALYSED_TEXT);
    TEST_ANALYSED_TEXT.addChunk(0, "Dr. Patrick Marshall".length()).addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
    TEST_ANALYSED_TEXT.addToken(4, 11).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    TEST_ANALYSED_TEXT.addToken(12, 20).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    initAnalyzedText(TEST_ANALYSED_TEXT_WO);
    TEST_ANALYSED_TEXT_WO.addChunk(0, "Dr. Marshall Patrick".length()).addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
    TEST_ANALYSED_TEXT_WO.addToken(4, 12).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    TEST_ANALYSED_TEXT_WO.addToken(13, 20).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) LinkedEntity(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity) Entity(org.apache.stanbol.enhancer.engines.entitylinking.Entity) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Graph(org.apache.clerezza.commons.rdf.Graph) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) SimpleLabelTokenizer(org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.SimpleLabelTokenizer) TestSearcherImpl(org.apache.stanbol.enhancer.engines.entitylinking.impl.TestSearcherImpl) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) BeforeClass(org.junit.BeforeClass)
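
A minimal, self-contained sketch of the pattern used above: one shared IndexedGraph holds the descriptions, and each subject IRI is wrapped in an Entity. The NAME constant below is a hypothetical stand-in for the test's own NAME field (rdfs:label is assumed), and the class name is made up for illustration.

import java.util.Iterator;

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Literal;
import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.stanbol.commons.indexedgraph.IndexedGraph;
import org.apache.stanbol.enhancer.engines.entitylinking.Entity;

public class EntityGraphSketch {

    //hypothetical label field; the test above uses its own NAME constant
    private static final IRI NAME = new IRI("http://www.w3.org/2000/01/rdf-schema#label");

    public static void main(String[] args) {
        Graph graph = new IndexedGraph();
        IRI uri = new IRI("urn:test:PatrickMarshall");
        graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Patrick Marshall")));
        //an Entity is just the subject URI plus the graph describing it
        Entity entity = new Entity(uri, graph);
        //read the label back through the Entity API used in the following examples
        for (Iterator<Literal> labels = entity.getText(NAME); labels.hasNext(); ) {
            System.out.println(entity.getId() + " -> " + labels.next().getLexicalForm());
        }
    }
}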

Example 7 with Entity

use of org.apache.stanbol.enhancer.engines.entitylinking.Entity in project stanbol by apache.

the class EntityLinker method lookupEntities.

/**
     * Searches for Entities in the {@link #entitySearcher} corresponding to the
     * {@link Token#getText() words} of the current {@link #state position} in
     * the text.
     * @param searchTokens the list of {@link Token#getText() words} to search
     * entities for.
     * @return The sorted list with the suggestions.
     * If there are no suggestions an empty list will be returned.
     * @throws EntitySearcherException 
     */
private List<Suggestion> lookupEntities(List<TokenData> searchTokens) throws EntitySearcherException {
    Set<String> languages = new HashSet<String>();
    languages.add(linkerConfig.getDefaultLanguage());
    languages.add(state.getLanguage());
    int countryCodeIndex = state.getLanguage() == null ? -1 : state.getLanguage().indexOf('-');
    if (countryCodeIndex >= 2) {
        languages.add(state.getLanguage().substring(0, countryCodeIndex));
    }
    List<String> searchStrings = new ArrayList<String>(searchTokens.size());
    for (Iterator<TokenData> it = searchTokens.iterator(); it.hasNext(); ) {
        searchStrings.add(getSearchString(it.next()));
    }
    String[] languageArray = languages.toArray(new String[languages.size()]);
    List<Suggestion> suggestions = new ArrayList<Suggestion>();
    //check if we have the search strings in the cache
    List<Entity> results = lookupCache.get(searchStrings);
    if (results != null) {
        //query is cached
        cacheHits++;
        //match the cached results
        for (Entity result : results) {
            processLookupResult(searchTokens, result, suggestions);
        }
    } else {
        // we need to perform a new query
        results = new ArrayList<Entity>();
        //perform the lookup with the parsed parameter
        int numResults = performLookup(searchStrings, languageArray, suggestions, searchTokens, results);
        //cache the results
        lookupCache.put(searchStrings, results);
    //fall back to a query for the current token only
        if (suggestions.isEmpty() && numResults > 0 && searchStrings.size() > 1) {
        //there were results, but none matched ...
        //   ... most likely the used search terms are not related,
        //       so try to query for the active token only
            log.debug("   > No match for '{}' searchStrings ... ", searchStrings);
            searchStrings = Collections.singletonList(getSearchString(state.getToken()));
            searchTokens = Collections.singletonList(state.getToken());
            results = lookupCache.get(searchStrings);
            if (results != null) {
                //query is cached
                cacheHits++;
                //match the cached results
                for (Entity result : results) {
                    processLookupResult(searchTokens, result, suggestions);
                }
            } else {
                results = new ArrayList<Entity>();
                log.debug("     ... fallback to search for active token '{}' ...", searchStrings);
                performLookup(searchStrings, languageArray, suggestions, searchTokens, results);
                //cache the results of the fall-back query
                lookupCache.put(searchStrings, results);
            }
        }
    }
    //sort the suggestions
    if (suggestions.size() > 1) {
        Collections.sort(suggestions, Suggestion.MATCH_TYPE_SUGGESTION_COMPARATOR);
    }
    return suggestions;
}
Also used : Entity(org.apache.stanbol.enhancer.engines.entitylinking.Entity) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet)
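
The method above keeps a per-document cache keyed by the list of search strings, so the same query is never issued twice while linking one document. A stripped-down sketch of that caching pattern (class name and query callback are illustrative, not Stanbol API):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

final class LookupCacheSketch<E> {

    private final Map<List<String>, List<E>> cache = new HashMap<List<String>, List<E>>();
    private final Function<List<String>, List<E>> query;
    private int cacheHits = 0;

    LookupCacheSketch(Function<List<String>, List<E>> query) {
        this.query = query;
    }

    /** Returns the cached results for the search strings, or runs and caches the query. */
    List<E> lookup(List<String> searchStrings) {
        List<E> results = cache.get(searchStrings);
        if (results != null) {
            cacheHits++; //query is cached, reuse the results
            return results;
        }
        results = new ArrayList<E>(query.apply(searchStrings));
        cache.put(searchStrings, results);
        return results;
    }
}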

Example 8 with Entity

use of org.apache.stanbol.enhancer.engines.entitylinking.Entity in project stanbol by apache.

the class EntityLinker method processRedirects.

/**
     * Processes {@link EntitySearcher#getRedirectField() redirect field} values for
     * the parsed suggestions based on the {@link RedirectProcessingMode}
     * as configured in the {@link #config}.<p>
     * The results of this method are stored within the parsed {@link Suggestion}s
     * @param suggestion The suggestion to process.
     * @throws EntitySearcherException 
     */
private void processRedirects(Suggestion suggestion) throws EntitySearcherException {
    //if mode is IGNORE -> nothing to do
    if (linkerConfig.getRedirectProcessingMode() == RedirectProcessingMode.IGNORE) {
        return;
    }
    //redirects only need to be processed once per suggestion;
    //therefore there is a small internal state that stores this information
    if (suggestion.isRedirectedProcessed()) {
        //Redirects for ResultMatch are already processed ... ignore
        return;
    }
    Entity result = suggestion.getResult();
    Iterator<IRI> redirects = result.getReferences(linkerConfig.getRedirectField());
    switch(linkerConfig.getRedirectProcessingMode()) {
        case ADD_VALUES:
            Graph entityData = result.getData();
            IRI entityUri = result.getUri();
            while (redirects.hasNext()) {
                IRI redirect = redirects.next();
                if (redirect != null) {
                    Entity redirectedEntity = entitySearcher.get(redirect, linkerConfig.getSelectedFields());
                    if (redirectedEntity != null) {
                        for (Iterator<Triple> data = redirectedEntity.getData().filter(redirectedEntity.getUri(), null, null); data.hasNext(); ) {
                            Triple t = data.next();
                            entityData.add(new TripleImpl(entityUri, t.getPredicate(), t.getObject()));
                        }
                    }
                    //set that the redirects were searched for this result
                    suggestion.setRedirectProcessed(true);
                }
            }
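            //note: there is no break here, so execution falls through into FOLLOW;
            //this is harmless because the redirects iterator is already exhausted
            //at this point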
        case FOLLOW:
            while (redirects.hasNext()) {
                IRI redirect = redirects.next();
                if (redirect != null) {
                    Entity redirectedEntity = entitySearcher.get(redirect, linkerConfig.getSelectedFields());
                    if (redirectedEntity != null) {
                        suggestion.setRedirect(redirectedEntity);
                    }
                }
            }
        default:
            //nothing to do
    }
}
Also used : Triple(org.apache.clerezza.commons.rdf.Triple) Entity(org.apache.stanbol.enhancer.engines.entitylinking.Entity) IRI(org.apache.clerezza.commons.rdf.IRI) Graph(org.apache.clerezza.commons.rdf.Graph) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)
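
In the ADD_VALUES branch above, the essential operation is copying every triple that describes the redirect target onto the URI of the originally suggested entity. A small standalone sketch of just that merge step (class and method names are made up for illustration):

import java.util.Iterator;

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Triple;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;

final class RedirectMergeSketch {

    /** Copies all triples describing redirectUri in source onto entityUri in target. */
    static void addValues(Graph target, IRI entityUri, Graph source, IRI redirectUri) {
        for (Iterator<Triple> data = source.filter(redirectUri, null, null); data.hasNext(); ) {
            Triple t = data.next();
            target.add(new TripleImpl(entityUri, t.getPredicate(), t.getObject()));
        }
    }
}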

Example 9 with Entity

use of org.apache.stanbol.enhancer.engines.entitylinking.Entity in project stanbol by apache.

the class EntityLinker method performLookup.

/**
     * @param searchStrings
     * @param languageArray
     * @param suggestions
     * @param searchTokens
     * @param queryResults the unprocessed results of the query for the parsed
     * parameters. This is used to cache results of queries. This avoids issuing
     * the same query twice for an analysed document.
     * @return the number of results returned by the performed queries
     * @throws EntitySearcherException
     */
private int performLookup(List<String> searchStrings, String[] languageArray, List<Suggestion> suggestions, List<TokenData> searchTokens, List<Entity> queryResults) throws EntitySearcherException {
    int minProcessedResults = linkerConfig.getMaxSuggestions() * 3;
    int lookupLimit = Math.max(MIN_SEARCH_LIMIT, linkerConfig.getMaxSuggestions() * 2 * searchTokens.size());
    int maxResults = lookupLimit * 2;
    int offset = 0;
    int numFiltered = 0;
    boolean moreResultsAvailable = true;
    int numResults = 0;
    //keep requesting further result pages until enough suggestions are found,
    //      the limits are reached, or no more results are available for the same lookup.
    while (suggestions.size() < linkerConfig.getMaxSuggestions() && moreResultsAvailable && (numResults - numFiltered) < (minProcessedResults) && numResults < maxResults) {
        Collection<? extends Entity> results;
        log.debug("   > request entities [{}-{}] entities ...", offset, (offset + lookupLimit));
        //keep statistics
        lookupStats.begin();
        results = entitySearcher.lookup(linkerConfig.getNameField(), linkerConfig.getSelectedFields(), searchStrings, languageArray, lookupLimit, offset);
        lookupStats.complete();
        log.debug("      < found {} entities ...", results.size());
        //queries might return more than the requested number of results
        moreResultsAvailable = results.size() >= lookupLimit;
        numResults = numResults + results.size();
        offset = numResults;
        matchingStats.begin();
        for (Entity result : results) {
            if (log.isDebugEnabled()) {
                log.debug("    > {} (ranking: {})", result.getId(), result.getEntityRanking());
            }
            numQueryResults++;
            //white/black list based entity type filtering (STANBOL-1111)
            if (!linkerConfig.isEntityTypeFilteringActive() || !filterEntity(result.getReferences(linkerConfig.getTypeField()))) {
                //a valid query result
                queryResults.add(result);
                //now match the result against the current position in the text
                processLookupResult(searchTokens, result, suggestions);
            } else {
                //do not process Entities with a filtered type
                //global statistics
                numFilteredResults++;
                numFiltered++;
            }
        }
        matchingStats.complete();
    //(the suggestions are sorted by the calling lookupEntities method)
    }
    return numResults;
}
Also used : Entity(org.apache.stanbol.enhancer.engines.entitylinking.Entity)
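
performLookup pages through the EntitySearcher: it requests batches of lookupLimit entities, advances the offset by the total number of results seen so far, and stops once enough unfiltered results were processed, the hard maximum is reached, or a batch comes back short. A simplified sketch of only the paging loop (the PageFetcher interface is illustrative; the real method additionally filters by entity type and matches every result against the text):

import java.util.List;

final class PagedLookupSketch {

    /** one page request: up to 'limit' results starting at 'offset' */
    interface PageFetcher<E> {
        List<E> fetch(int offset, int limit) throws Exception;
    }

    static <E> int pageThrough(int lookupLimit, int maxResults, int minProcessedResults,
            PageFetcher<E> fetcher) throws Exception {
        int offset = 0;
        int numResults = 0;
        boolean moreResultsAvailable = true;
        while (moreResultsAvailable && numResults < minProcessedResults && numResults < maxResults) {
            List<E> batch = fetcher.fetch(offset, lookupLimit);
            //a full page suggests that more results are available
            moreResultsAvailable = batch.size() >= lookupLimit;
            numResults = numResults + batch.size();
            offset = numResults;
        }
        return numResults;
    }
}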

Example 10 with Entity

use of org.apache.stanbol.enhancer.engines.entitylinking.Entity in project stanbol by apache.

the class Suggestion method getBestLabel.

/**
     * Getter for the best label in the given language
     * @param nameField the field used to search for labels
     * @param language the language
     * @return the best match or {@link Suggestion#getMatchedLabel()} if none is found
     */
public Literal getBestLabel(IRI nameField, String language) {
    Entity rep = getEntity();
    //start with the matched label -> so if we do not find a better one
    //we will use the matched!
    Literal matchedLabel = getMatchedLabel();
    Literal label = matchedLabel;
    //check the labels of the Entity and select a single one,
    //preferring a label in the requested language
    Iterator<Literal> labels = rep.getText(nameField);
    boolean matchFound = false;
    while (labels.hasNext() && !matchFound) {
        Literal actLabel = labels.next();
        if (label == null) {
            label = actLabel;
        }
        //now we have already a label check the language
        Language actLang = actLabel.getLanguage();
        //use startsWith so that e.g. "en" also matches en-GB and en-US ...
        if (actLang != null && actLang.toString().startsWith(language)) {
            //prefer labels with the correct language
            label = actLabel;
            if (matchedLabel != null && matchedLabel.getLexicalForm().equalsIgnoreCase(label.getLexicalForm())) {
                //found label in that language that exactly matches the
                //label used to match the text
                matchFound = true;
            }
        }
    }
    return label;
}
Also used : Entity(org.apache.stanbol.enhancer.engines.entitylinking.Entity) Language(org.apache.clerezza.commons.rdf.Language) Literal(org.apache.clerezza.commons.rdf.Literal)
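
The selection rule in getBestLabel is: keep the first label as a fallback and prefer any label whose language tag starts with the requested language, so that "en" also accepts "en-GB" or "en-US". A self-contained sketch of that rule, simplified to stop at the first label in the requested language (the real method keeps scanning for one that equals the matched label); class and method names are illustrative:

import java.util.Arrays;
import java.util.List;

import org.apache.clerezza.commons.rdf.Language;
import org.apache.clerezza.commons.rdf.Literal;
import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;

final class BestLabelSketch {

    static Literal pick(List<Literal> labels, String language) {
        Literal best = null;
        for (Literal label : labels) {
            if (best == null) {
                best = label; //fall back to the first label found
            }
            Language actLang = label.getLanguage();
            //startsWith also accepts regional variants such as en-GB and en-US
            if (actLang != null && actLang.toString().startsWith(language)) {
                return label;
            }
        }
        return best;
    }

    public static void main(String[] args) {
        List<Literal> labels = Arrays.<Literal>asList(
            new PlainLiteralImpl("Geologe", new Language("de")),
            new PlainLiteralImpl("Geologist", new Language("en-GB")));
        System.out.println(pick(labels, "en").getLexicalForm()); //prints "Geologist"
    }
}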

Aggregations

Entity (org.apache.stanbol.enhancer.engines.entitylinking.Entity)13 ArrayList (java.util.ArrayList)6 HashSet (java.util.HashSet)6 Literal (org.apache.clerezza.commons.rdf.Literal)5 Graph (org.apache.clerezza.commons.rdf.Graph)4 IRI (org.apache.clerezza.commons.rdf.IRI)4 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)4 Language (org.apache.clerezza.commons.rdf.Language)3 Triple (org.apache.clerezza.commons.rdf.Triple)3 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)3 LinkedEntity (org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity)3 Collection (java.util.Collection)2 HashMap (java.util.HashMap)2 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)2 Occurrence (org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity.Occurrence)2 Suggestion (org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion)2 NlpEngineHelper.getLanguage (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage)2 Representation (org.apache.stanbol.entityhub.servicesapi.model.Representation)2 FieldQuery (org.apache.stanbol.entityhub.servicesapi.query.FieldQuery)2 Entry (java.util.Map.Entry)1