Search in sources :

Example 1 with EntityLinker

use of org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinker in project stanbol by apache.

The method computeEnhancements of the class KeywordLinkingEngine.

/**
 * Extracts the text of the parsed ContentItem, links it against the
 * configured {@code entitySearcher} by using an {@link EntityLinker} and
 * writes the linked entities back to the ContentItem.
 * <p>
 * The language is read while holding the read lock of the ContentItem and
 * the results are written while holding its write lock. The linking itself
 * runs without holding any lock. ContentItems with blank text, or with a
 * language that is not processable by this engine, are skipped without
 * writing any enhancement.
 *
 * @param ci the ContentItem to process
 * @throws EngineException if this engine is in offline mode but the
 *         component used to lookup entities does not support it, or if the
 *         text can not be read from the content part
 * @throws IllegalStateException if the ContentItem has no content part with
 *         one of the {@code SUPPORTED_MIMETYPES}
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    if (isOfflineMode() && !entitySearcher.supportsOfflineMode()) {
        throw new EngineException("Offline mode is not supported by the Component used to lookup Entities");
    }
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with a supported Mime Type found for ContentItem "
                + ci.getUri() + " (supported: '" + SUPPORTED_MIMETYPES
                + "') -> this indicates that canEnhance was NOT called and indicates a bug"
                + " in the used EnhancementJobManager!");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(String.format(
                "Unable to extract text from ContentPart %s of ContentItem %s!",
                contentPart.getKey(), ci.getUri()), e);
    }
    if (text.trim().isEmpty()) {
        // TODO: make the length of the data a field of the ContentItem
        // interface to be able to filter out empty items in the canEnhance
        // method
        log.warn("ContentPart {} of ContentItem {} does not contain any Text to extract knowledge from",
                contentPart.getKey(), ci.getUri());
        return;
    }
    // Determine the language (reading the ContentItem requires a read lock)
    String language;
    ci.getLock().readLock().lock();
    try {
        language = extractLanguage(ci);
    } finally {
        ci.getLock().readLock().unlock();
    }
    if (!isProcessableLanguages(language)) {
        log.debug("ignore ContentItem {} because language '{}' is not configured to be processed by this engine.",
                ci.getUri().getUnicodeString(), language);
        return;
    }
    log.debug("computeEnhancements for ContentItem {} language {} text={}",
            new Object[] { ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(text, 100) });
    EntityLinker entityLinker = new EntityLinker(
            analysedContentFactory.create(text, language), entitySearcher, linkerConfig);
    // process (done outside of any lock on the ContentItem)
    entityLinker.process();
    // write results (requires a write lock)
    ci.getLock().writeLock().lock();
    try {
        writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) EntityLinker(org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinker)

Example 2 with EntityLinker

use of org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinker in project stanbol by apache.

The method testTaxonomyLinker of the class KeywordLinkingEngineTest.

/**
 * Tests the {@link EntityLinker} functionality: every {@link LinkedEntity}
 * produced for {@code TEST_TEXT} must be one of the expected selections,
 * must list exactly the expected suggestions in the expected order, and the
 * suggestion scores must not increase from one rank to the next.
 *
 * @throws Exception on any failure while analysing or linking the text
 */
@Test
public void testTaxonomyLinker() throws Exception {
    OpenNlpAnalysedContentFactory acf = OpenNlpAnalysedContentFactory.getInstance(openNLP, new TextAnalyzerConfig());
    EntityLinkerConfig config = new EntityLinkerConfig();
    config.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
    EntityLinker linker = new EntityLinker(acf.create(TEST_TEXT, "en"), searcher, config);
    linker.process();
    // selected text -> ids of the expected suggestions, ordered by rank
    Map<String, List<String>> expectedResults = new HashMap<String, List<String>>();
    expectedResults.put("Patrick Marshall", new ArrayList<String>(Arrays.asList("urn:test:PatrickMarshall")));
    // the redirected entity
    expectedResults.put("geologist", new ArrayList<String>(Arrays.asList("urn:test:redirect:Geologist")));
    expectedResults.put("New Zealand", new ArrayList<String>(Arrays.asList("urn:test:NewZealand")));
    expectedResults.put("University of Otago", new ArrayList<String>(Arrays.asList("urn:test:UniversityOfOtago", "urn:test:UniversityOfOtago_Texas")));
    for (LinkedEntity linkedEntity : linker.getLinkedEntities().values()) {
        // remove(..) so that a selection that is linked twice fails the test
        List<String> expectedSuggestions = expectedResults.remove(linkedEntity.getSelectedText());
        assertNotNull("LinkedEntity " + linkedEntity.getSelectedText()
                + " is not an expected Result (or was found twice)", expectedSuggestions);
        assertEquals("Number of suggestions " + linkedEntity.getSuggestions().size()
                + " != number of expected suggestions " + expectedSuggestions.size()
                + " for selection " + linkedEntity.getSelectedText(),
                expectedSuggestions.size(), linkedEntity.getSuggestions().size());
        // scores must be monotonically non-increasing, starting with the
        // score of the linked entity itself
        double score = linkedEntity.getScore();
        for (int i = 0; i < expectedSuggestions.size(); i++) {
            Suggestion suggestion = linkedEntity.getSuggestions().get(i);
            assertEquals("Expected Suggestion at Rank " + i + " expected: " + expectedSuggestions.get(i)
                    + " suggestion: " + suggestion.getRepresentation().getId(),
                    expectedSuggestions.get(i), suggestion.getRepresentation().getId());
            assertTrue("Score of suggestion " + i + " (" + suggestion.getScore()
                    + ") > as of the previous one (" + score + ")", score >= suggestion.getScore());
            score = suggestion.getScore();
        }
    }
    // NOTE(review): entries still left in expectedResults at this point were
    // never linked, but this is not asserted - confirm whether the test
    // should also fail when an expected entity is missing.
}
Also used : LinkedEntity(org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity) EntityLinkerConfig(org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig) OpenNlpAnalysedContentFactory(org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory) HashMap(java.util.HashMap) EntityLinker(org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinker) Suggestion(org.apache.stanbol.enhancer.engines.keywordextraction.impl.Suggestion) List(java.util.List) ArrayList(java.util.ArrayList) TextAnalyzerConfig(org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig) Test(org.junit.Test)

Aggregations

EntityLinker (org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinker)2 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 List (java.util.List)1 IRI (org.apache.clerezza.commons.rdf.IRI)1 TextAnalyzerConfig (org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig)1 EntityLinkerConfig (org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig)1 LinkedEntity (org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity)1 Suggestion (org.apache.stanbol.enhancer.engines.keywordextraction.impl.Suggestion)1 OpenNlpAnalysedContentFactory (org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory)1 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)1 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)1 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)1 Test (org.junit.Test)1