Search in sources :

Example 1 with LanguageProcessingConfig

use of org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig in project stanbol by apache.

The class EntityCoMentionEngine, method computeEnhancements.

/**
 * Computes entity co-mentions for the parsed ContentItem.
 * <p>
 * The method works in three steps:
 * <ol>
 * <li>All resources typed as TextAnnotation in the metadata of the
 * ContentItem are registered with a {@link ContentItemMentionBuilder}
 * while holding the read lock of the ContentItem.</li>
 * <li>An {@link EntityLinker} is created over the {@link AnalysedText} with
 * that mention builder (passed twice: as fourth and as last constructor
 * argument) and processed.</li>
 * <li>The linked entities are written back via
 * <code>writeComentions(..)</code> while holding the write lock.</li>
 * </ol>
 *
 * @param ci the ContentItem to process
 * @throws EngineException if no LabelTokenizer service is available or if
 * the EntityLinker fails with an {@link EntitySearcherException}
 * @throws IllegalStateException if no {@link LanguageProcessingConfig} is
 * available for the language of the ContentItem
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    //NOTE(review): the trailing 'true' presumably makes the helpers fail
    //instead of returning null - confirm against NlpEngineHelper
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
    if (languageConfig == null) {
        throw new IllegalStateException("The language '" + language + "' is not configured " + "to be processed by this Engine. As this is already checked within the " + "canEnhance(..) method this may indicate a bug in the used " + "EnhancementJobManager implementation!");
    }
    if (log.isDebugEnabled()) {
        //guarded because abbreviating the text is only needed for the debug message
        log.debug("compute co-mentions for ContentItem {} language {}  text={}", new Object[] { ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100) });
    }
    LabelTokenizer labelTokenizer = (LabelTokenizer) labelTokenizerTracker.getService();
    if (labelTokenizer == null) {
        throw new EngineException(this, ci, "No LabelTokenizer available!", null);
    }
    //create the in-memory database for the mentioned Entities
    ContentItemMentionBuilder entityMentionIndex = new ContentItemMentionBuilder(labelTokenizer, language, linkerConfig.getDefaultLanguage());
    Graph metadata = ci.getMetadata();
    Set<IRI> textAnnotations = new HashSet<IRI>();
    ci.getLock().readLock().lock();
    try {
        //iterate over all TextAnnotations (mentions of Entities)
        for (Iterator<Triple> it = metadata.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION); it.hasNext(); ) {
            //NOTE(review): the cast assumes every TextAnnotation is named by
            //an IRI; any other subject would raise a ClassCastException
            IRI ta = (IRI) it.next().getSubject();
            entityMentionIndex.registerTextAnnotation(ta, metadata);
            //store the registered text annotations
            textAnnotations.add(ta);
        }
    } finally {
        ci.getLock().readLock().unlock();
    }
    EntityLinker entityLinker = new EntityLinker(at, language, languageConfig, entityMentionIndex, linkerConfig, labelTokenizer, entityMentionIndex);
    //process
    try {
        entityLinker.process();
    } catch (EntitySearcherException e) {
        log.error("Unable to link Entities with " + entityLinker, e);
        throw new EngineException(this, ci, "Unable to link Entities with " + entityLinker, e);
    }
    //write results (requires a write lock)
    ci.getLock().writeLock().lock();
    try {
        writeComentions(ci, entityLinker.getLinkedEntities().values(), language, textAnnotations);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) LanguageProcessingConfig(org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) EntitySearcherException(org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcherException) EntityLinker(org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker) Triple(org.apache.clerezza.commons.rdf.Triple) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) LabelTokenizer(org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer) ContentItemMentionBuilder(org.apache.stanbol.enhancer.engines.entitycomention.impl.ContentItemMentionBuilder) HashSet(java.util.HashSet)

Example 2 with LanguageProcessingConfig

use of org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig in project stanbol by apache.

The class EntityLinkingEngineTest, method testEntityLinkerWithProperNouns.

/**
 * Checks that the EntityLinker links the expected Entities when linking
 * is driven by POS tags only: the linked lexical categories are set to
 * the empty set and the linked POS tags to
 * {@link LanguageProcessingConfig#DEFAULT_LINKED_POS} (the default
 * configuration for {@link Pos#ProperNoun}).
 * <p>
 * With this setup the common noun "geologist" is not expected to be linked.
 * @throws Exception on any failure while linking or validating
 */
@Test
public void testEntityLinkerWithProperNouns() throws Exception {
    //language configuration: no lexical categories, default linked POS tags
    LanguageProcessingConfig langCfg = new LanguageProcessingConfig();
    langCfg.setLinkedLexicalCategories(Collections.EMPTY_SET);
    langCfg.setLinkedPos(LanguageProcessingConfig.DEFAULT_LINKED_POS);

    //linker configuration: the expectations below assume two found tokens
    EntityLinkerConfig linkerCfg = new EntityLinkerConfig();
    linkerCfg.setMinFoundTokens(2);
    linkerCfg.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);

    EntityLinker entityLinker = new EntityLinker(TEST_ANALYSED_TEXT, "en", langCfg, searcher, linkerCfg, labelTokenizer);
    entityLinker.process();

    //mention -> URIs of the Entities expected to be suggested for it
    Map<String, List<String>> expected = new HashMap<String, List<String>>();
    List<String> patrickMarshall = new ArrayList<String>(Arrays.asList("urn:test:PatrickMarshall"));
    expected.put("Patrick Marshall", patrickMarshall);
    //"geologist" is a common noun and MUST NOT be found, so there is
    //deliberately no entry for "urn:test:redirect:Geologist" here
    List<String> newZealand = new ArrayList<String>(Arrays.asList("urn:test:NewZealand"));
    expected.put("New Zealand", newZealand);
    List<String> universityOfOtago = new ArrayList<String>(Arrays.asList("urn:test:UniversityOfOtago", "urn:test:UniversityOfOtago_Texas"));
    expected.put("University of Otago", universityOfOtago);

    validateEntityLinkerResults(entityLinker, expected);
}
Also used : LanguageProcessingConfig(org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig) EntityLinkerConfig(org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig) HashMap(java.util.HashMap) List(java.util.List) ArrayList(java.util.ArrayList) EntityLinker(org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker) Test(org.junit.Test)

Example 3 with LanguageProcessingConfig

use of org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig in project stanbol by apache.

The class EntityLinkingEngineTest, method testEntityLinkerWithNouns.

/**
 * Checks that the EntityLinker links the expected Entities when linking
 * is driven by lexical categories only: the linked lexical categories are
 * set to {@link LanguageProcessingConfig#DEFAULT_LINKED_LEXICAL_CATEGORIES}
 * (the default configuration for {@link LexicalCategory#Noun}) and the
 * linked POS tags to the empty set.
 * <p>
 * With this setup "geologist" is expected to be linked as well.
 * @throws Exception on any failure while linking or validating
 */
@Test
public void testEntityLinkerWithNouns() throws Exception {
    //language configuration: default lexical categories, no linked POS tags
    LanguageProcessingConfig langCfg = new LanguageProcessingConfig();
    langCfg.setLinkedLexicalCategories(LanguageProcessingConfig.DEFAULT_LINKED_LEXICAL_CATEGORIES);
    langCfg.setLinkedPos(Collections.EMPTY_SET);

    //linker configuration: the expectations below assume two found tokens
    EntityLinkerConfig linkerCfg = new EntityLinkerConfig();
    linkerCfg.setMinFoundTokens(2);
    linkerCfg.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);

    EntityLinker entityLinker = new EntityLinker(TEST_ANALYSED_TEXT, "en", langCfg, searcher, linkerCfg, labelTokenizer);
    entityLinker.process();

    //mention -> URIs of the Entities expected to be suggested for it
    Map<String, List<String>> expected = new HashMap<String, List<String>>();
    List<String> patrickMarshall = new ArrayList<String>(Arrays.asList("urn:test:PatrickMarshall"));
    expected.put("Patrick Marshall", patrickMarshall);
    //the redirected entity
    List<String> geologist = new ArrayList<String>(Arrays.asList("urn:test:redirect:Geologist"));
    expected.put("geologist", geologist);
    List<String> newZealand = new ArrayList<String>(Arrays.asList("urn:test:NewZealand"));
    expected.put("New Zealand", newZealand);
    List<String> universityOfOtago = new ArrayList<String>(Arrays.asList("urn:test:UniversityOfOtago", "urn:test:UniversityOfOtago_Texas"));
    expected.put("University of Otago", universityOfOtago);

    validateEntityLinkerResults(entityLinker, expected);
}
Also used : LanguageProcessingConfig(org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig) EntityLinkerConfig(org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig) HashMap(java.util.HashMap) List(java.util.List) ArrayList(java.util.ArrayList) EntityLinker(org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker) Test(org.junit.Test)

Example 4 with LanguageProcessingConfig

use of org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig in project stanbol by apache.

The class EntityLinkingEngine, method computeEnhancements.

/**
 * Links entities mentioned in the text of the parsed ContentItem and
 * writes the results to it.
 * <p>
 * An {@link EntityLinker} is created over the {@link AnalysedText} of the
 * ContentItem using the configured EntitySearcher, linker configuration and
 * LabelTokenizer. After processing, the linked entities are written via
 * <code>writeEnhancements(..)</code> while holding the write lock of the
 * ContentItem.
 *
 * @param ci the ContentItem to process
 * @throws EngineException if the engine is in offline mode but the used
 * EntitySearcher does not support it, or if the EntityLinker fails with an
 * {@link EntitySearcherException}
 * @throws IllegalStateException if no {@link LanguageProcessingConfig} is
 * available for the language of the ContentItem
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    log.trace(" enhance ci {}", ci.getUri());
    if (isOfflineMode() && !entitySearcher.supportsOfflineMode()) {
        throw new EngineException(this, ci, "Offline mode is not supported by the used EntitySearcher!", null);
    }
    //NOTE(review): the trailing 'true' presumably makes the helpers fail
    //instead of returning null - confirm against NlpEngineHelper
    AnalysedText at = getAnalysedText(this, ci, true);
    log.debug("  > AnalysedText {}", at);
    String language = getLanguage(this, ci, true);
    if (log.isDebugEnabled()) {
        //guarded because abbreviating the text is only needed for the debug message
        log.debug("computeEnhancements for ContentItem {} language {} text={}", new Object[] { ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100) });
    }
    log.debug("  > Language {}", language);
    LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
    if (languageConfig == null) {
        throw new IllegalStateException("The language '" + language + "' is not configured " + "to be processed by this Engine. As this is already checked within the " + "canEnhance(..) method this may indicate a bug in the used " + "EnhancementJobManager implementation!");
    }
    EntityLinker entityLinker = new EntityLinker(at, language, languageConfig, entitySearcher, linkerConfig, labelTokenizer);
    //process
    try {
        entityLinker.process();
    } catch (EntitySearcherException e) {
        log.error("Unable to link Entities with " + entityLinker, e);
        throw new EngineException(this, ci, "Unable to link Entities with " + entityLinker, e);
    }
    if (log.isInfoEnabled()) {
        entityLinker.logStatistics(log);
    }
    //write results (requires a write lock)
    ci.getLock().writeLock().lock();
    try {
        writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language, linkerConfig.isWriteEntityRankings());
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) LanguageProcessingConfig(org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) EntitySearcherException(org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcherException) EntityLinker(org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker)

Example 5 with LanguageProcessingConfig

use of org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig in project stanbol by apache.

The class EntityLinkingEngineTest, method testEntityLinkerWithWrongOrder.

/**
 * Checks that the EntityLinker links the expected Entities for the
 * <code>TEST_ANALYSED_TEXT_WO</code> fixture, where the mention
 * "Marshall Patrick" is expected to be linked to
 * <code>urn:test:PatrickMarshall</code>.
 * <p>
 * Linking uses
 * {@link LanguageProcessingConfig#DEFAULT_LINKED_LEXICAL_CATEGORIES} (the
 * default configuration for {@link LexicalCategory#Noun}), no linked POS
 * tags, and ignores the chunk state to emulate the behaviour before
 * STANBOL-1211.
 * @throws Exception on any failure while linking or validating
 */
@Test
public void testEntityLinkerWithWrongOrder() throws Exception {
    //language configuration: default lexical categories, no linked POS tags
    LanguageProcessingConfig langCfg = new LanguageProcessingConfig();
    langCfg.setLinkedLexicalCategories(LanguageProcessingConfig.DEFAULT_LINKED_LEXICAL_CATEGORIES);
    langCfg.setLinkedPos(Collections.EMPTY_SET);
    //to emulate pre STANBOL-1211
    langCfg.setIgnoreChunksState(true);

    //linker configuration: the expectations below assume two found tokens
    EntityLinkerConfig linkerCfg = new EntityLinkerConfig();
    linkerCfg.setMinFoundTokens(2);
    linkerCfg.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);

    EntityLinker entityLinker = new EntityLinker(TEST_ANALYSED_TEXT_WO, "en", langCfg, searcher, linkerCfg, labelTokenizer);
    entityLinker.process();

    //mention -> URIs of the Entities expected to be suggested for it
    Map<String, List<String>> expected = new HashMap<String, List<String>>();
    List<String> marshallPatrick = new ArrayList<String>(Arrays.asList("urn:test:PatrickMarshall"));
    expected.put("Marshall Patrick", marshallPatrick);
    //the redirected entity
    List<String> geologist = new ArrayList<String>(Arrays.asList("urn:test:redirect:Geologist"));
    expected.put("geologist", geologist);
    List<String> newZealand = new ArrayList<String>(Arrays.asList("urn:test:NewZealand"));
    expected.put("New Zealand", newZealand);
    List<String> universityOfOtago = new ArrayList<String>(Arrays.asList("urn:test:UniversityOfOtago", "urn:test:UniversityOfOtago_Texas"));
    expected.put("University of Otago", universityOfOtago);

    validateEntityLinkerResults(entityLinker, expected);
}
Also used : LanguageProcessingConfig(org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig) EntityLinkerConfig(org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig) HashMap(java.util.HashMap) List(java.util.List) ArrayList(java.util.ArrayList) EntityLinker(org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker) Test(org.junit.Test)

Aggregations

LanguageProcessingConfig (org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig)5 EntityLinker (org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker)5 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 List (java.util.List)3 EntityLinkerConfig (org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig)3 Test (org.junit.Test)3 EntitySearcherException (org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcherException)2 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)2 NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText)2 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)2 HashSet (java.util.HashSet)1 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)1 Graph (org.apache.clerezza.commons.rdf.Graph)1 IRI (org.apache.clerezza.commons.rdf.IRI)1 Triple (org.apache.clerezza.commons.rdf.Triple)1 ContentItemMentionBuilder (org.apache.stanbol.enhancer.engines.entitycomention.impl.ContentItemMentionBuilder)1 LabelTokenizer (org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer)1