Search in sources :

Example 1 with TextAnalyzerConfig

use of org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig in project stanbol by apache.

the class KeywordLinkingEngineTest method testTaxonomyLinker.

/**
 * Tests the {@link EntityLinker} functionality: every entity linked for
 * {@code TEST_TEXT} must be an expected one, its suggestions must match the
 * expected IDs in rank order with non-increasing scores, and no expected
 * entity may be missing from the result.
 *
 * @throws Exception if analysing the text or linking the entities fails
 */
@Test
public void testTaxonomyLinker() throws Exception {
    OpenNlpAnalysedContentFactory acf = OpenNlpAnalysedContentFactory.getInstance(openNLP, new TextAnalyzerConfig());
    EntityLinkerConfig config = new EntityLinkerConfig();
    config.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
    EntityLinker linker = new EntityLinker(acf.create(TEST_TEXT, "en"), searcher, config);
    linker.process();
    // selected text -> IDs of the expected suggestions, in expected rank order
    Map<String, List<String>> expectedResults = new HashMap<String, List<String>>();
    expectedResults.put("Patrick Marshall", Arrays.asList("urn:test:PatrickMarshall"));
    // the redirected entity (redirects are followed, see config above)
    expectedResults.put("geologist", Arrays.asList("urn:test:redirect:Geologist"));
    expectedResults.put("New Zealand", Arrays.asList("urn:test:NewZealand"));
    expectedResults.put("University of Otago", Arrays.asList("urn:test:UniversityOfOtago", "urn:test:UniversityOfOtago_Texas"));
    for (LinkedEntity linkedEntity : linker.getLinkedEntities().values()) {
        String selectedText = linkedEntity.getSelectedText();
        // remove(..) instead of get(..): a second LinkedEntity for the same
        // selected text then yields null and is reported as a failure
        List<String> expectedSuggestions = expectedResults.remove(selectedText);
        assertNotNull("LinkedEntity " + selectedText + " is not an expected Result (or was found twice)", expectedSuggestions);
        int numSuggestions = linkedEntity.getSuggestions().size();
        // JUnit argument order is (message, expected, actual)
        assertEquals("Number of suggestions " + numSuggestions + " != number of expected suggestions " + expectedSuggestions.size() + " for selection " + selectedText, expectedSuggestions.size(), numSuggestions);
        // scores must not increase from the linked entity to its first
        // suggestion, nor from one suggestion to the next
        double score = linkedEntity.getScore();
        for (int i = 0; i < expectedSuggestions.size(); i++) {
            Suggestion suggestion = linkedEntity.getSuggestions().get(i);
            assertEquals("Expected Suggestion at Rank " + i + " expected: " + expectedSuggestions.get(i) + " suggestion: " + suggestion.getRepresentation().getId(), expectedSuggestions.get(i), suggestion.getRepresentation().getId());
            assertTrue("Score of suggestion " + i + " (" + suggestion.getScore() + ") > as of the previous one (" + score + ")", score >= suggestion.getScore());
            score = suggestion.getScore();
        }
    }
    // every matched entry was removed above; anything left was never linked
    assertTrue("Expected entities that were not linked: " + expectedResults.keySet(), expectedResults.isEmpty());
}
Also used : LinkedEntity(org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity) EntityLinkerConfig(org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig) OpenNlpAnalysedContentFactory(org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory) HashMap(java.util.HashMap) EntityLinker(org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinker) Suggestion(org.apache.stanbol.enhancer.engines.keywordextraction.impl.Suggestion) List(java.util.List) ArrayList(java.util.ArrayList) TextAnalyzerConfig(org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig) Test(org.junit.Test)

Example 2 with TextAnalyzerConfig

use of org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig in project stanbol by apache.

the class KeywordLinkingEngineTest method testEngine.

/**
 * Checks that the enhancements created by the {@link KeywordLinkingEngine}
 * conform to the rules defined for the Stanbol Enhancement Structure, and
 * that the expected number of text and entity annotations is produced.
 *
 * @throws IOException declared by this test; propagated from the calls made below
 * @throws EngineException declared by this test; propagated from the calls made below
 */
@Test
public void testEngine() throws IOException, EngineException {
    final EntityLinkerConfig redirectFollowingConfig = new EntityLinkerConfig();
    redirectFollowingConfig.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
    final KeywordLinkingEngine linkingEngine = KeywordLinkingEngine.createInstance(
            openNLP, searcher, new TextAnalyzerConfig(), redirectFollowingConfig);
    linkingEngine.referencedSiteName = TEST_REFERENCED_SITE_NAME;

    final ContentItem contentItem = ciFactory.createContentItem(new StringSource(TEST_TEXT));
    // declare the content as English text before the engine runs
    contentItem.getMetadata().add(
            new TripleImpl(contentItem.getUri(), DC_LANGUAGE, new PlainLiteralImpl("en")));

    // run the engine
    linkingEngine.computeEnhancements(contentItem);

    // property values handed to the validation helpers
    final String engineClassName = linkingEngine.getClass().getName();
    final Map<IRI, RDFTerm> requiredValues = new HashMap<IRI, RDFTerm>();
    requiredValues.put(ENHANCER_EXTRACTED_FROM, contentItem.getUri());
    requiredValues.put(DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(engineClassName));
    // a null value marks the confidence as a required property
    requiredValues.put(Properties.ENHANCER_CONFIDENCE, null);

    // validate the created fise:TextAnnotations
    final int textAnnotationCount = validateAllTextAnnotations(
            contentItem.getMetadata(), TEST_TEXT, requiredValues);
    assertEquals("Four fise:TextAnnotations are expected by this Test", 4, textAnnotationCount);

    // validate the created fise:EntityAnnotations
    final int entityAnnotationCount = validateAllEntityAnnotations(contentItem, requiredValues);
    assertEquals("Five fise:EntityAnnotations are expected by this Test", 5, entityAnnotationCount);
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) EntityLinkerConfig(org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) HashMap(java.util.HashMap) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) KeywordLinkingEngine(org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine) TextAnalyzerConfig(org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 3 with TextAnalyzerConfig

use of org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig in project stanbol by apache.

the class TextAnalyzerTest method testSingleSentenceChunkerConfig.

/**
 * Analyses {@code SINGLE_SENTENCE} with a configuration in which the POS-type
 * chunker is explicitly not forced, then verifies the flags reported by the
 * analyzer's configuration and the analysed sentence itself.
 */
@Test
public void testSingleSentenceChunkerConfig() {
    final TextAnalyzerConfig analyzerConfig = new TextAnalyzerConfig();
    analyzerConfig.forcePosTypeChunker(false);
    final TextAnalyzer textAnalyzer = new TextAnalyzer(openNLP, LANGUAGE, analyzerConfig);

    final AnalysedText result = textAnalyzer.analyseSentence(SINGLE_SENTENCE);
    assertNotNull(result);

    // only the "POS-type chunker forced" flag was changed above
    final TextAnalyzerConfig effectiveConfig = textAnalyzer.getConfig();
    assertFalse(effectiveConfig.isSimpleTokenizerForced());
    assertTrue(effectiveConfig.isPosTaggerEnable());
    assertTrue(effectiveConfig.isChunkerEnabled());
    assertTrue(effectiveConfig.isPosTypeChunkerEnabled());
    assertFalse(effectiveConfig.isPosTypeChunkerForced());

    // NOTE(review): the two boolean flags presumably request POS and chunk checks - confirm against checkSingleSentence
    checkSingleSentence(result, SINGLE_SENTENCE_TOKENS, true, true);
}
Also used : AnalysedText(org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText) TextAnalyzer(org.apache.stanbol.commons.opennlp.TextAnalyzer) TextAnalyzerConfig(org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig) Test(org.junit.Test)

Example 4 with TextAnalyzerConfig

use of org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig in project stanbol by apache.

the class TextAnalyzerTest method testSingleSentenceNoChunkerNoPosConfig.

/**
 * Analyses {@code SINGLE_SENTENCE} with the POS tagger disabled while the
 * chunker stays enabled, then verifies the flags reported by the analyzer's
 * configuration and the analysed sentence itself.
 */
@Test
public void testSingleSentenceNoChunkerNoPosConfig() {
    final TextAnalyzerConfig analyzerConfig = new TextAnalyzerConfig();
    analyzerConfig.enablePosTagger(false);
    // the chunker setting is expected to be ignored when no POS tags are available
    analyzerConfig.enableChunker(true);
    final TextAnalyzer textAnalyzer = new TextAnalyzer(openNLP, LANGUAGE, analyzerConfig);

    final AnalysedText result = textAnalyzer.analyseSentence(SINGLE_SENTENCE);
    assertNotNull(result);

    // the configuration must report exactly what was set above
    final TextAnalyzerConfig effectiveConfig = textAnalyzer.getConfig();
    assertFalse(effectiveConfig.isSimpleTokenizerForced());
    assertFalse(effectiveConfig.isPosTaggerEnable());
    assertTrue(effectiveConfig.isChunkerEnabled());
    assertTrue(effectiveConfig.isPosTypeChunkerEnabled());
    assertTrue(effectiveConfig.isPosTypeChunkerForced());

    // NOTE(review): the two boolean flags presumably request POS and chunk checks - confirm against checkSingleSentence
    checkSingleSentence(result, SINGLE_SENTENCE_TOKENS, false, false);
}
Also used : AnalysedText(org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText) TextAnalyzer(org.apache.stanbol.commons.opennlp.TextAnalyzer) TextAnalyzerConfig(org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig) Test(org.junit.Test)

Example 5 with TextAnalyzerConfig

use of org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig in project stanbol by apache.

the class TextAnalyzerTest method testSingleSentenceNoChunkerConfig.

/**
 * Analyses {@code SINGLE_SENTENCE} with the chunker disabled, then verifies
 * the flags reported by the analyzer's configuration and the analysed
 * sentence itself.
 */
@Test
public void testSingleSentenceNoChunkerConfig() {
    final TextAnalyzerConfig analyzerConfig = new TextAnalyzerConfig();
    analyzerConfig.enableChunker(false);
    final TextAnalyzer textAnalyzer = new TextAnalyzer(openNLP, LANGUAGE, analyzerConfig);

    final AnalysedText result = textAnalyzer.analyseSentence(SINGLE_SENTENCE);
    assertNotNull(result);

    // only the "chunker enabled" flag was changed above
    final TextAnalyzerConfig effectiveConfig = textAnalyzer.getConfig();
    assertFalse(effectiveConfig.isSimpleTokenizerForced());
    assertTrue(effectiveConfig.isPosTaggerEnable());
    assertFalse(effectiveConfig.isChunkerEnabled());
    assertTrue(effectiveConfig.isPosTypeChunkerEnabled());
    assertTrue(effectiveConfig.isPosTypeChunkerForced());

    // NOTE(review): the two boolean flags presumably request POS and chunk checks - confirm against checkSingleSentence
    checkSingleSentence(result, SINGLE_SENTENCE_TOKENS, true, false);
}
Also used : AnalysedText(org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText) TextAnalyzer(org.apache.stanbol.commons.opennlp.TextAnalyzer) TextAnalyzerConfig(org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig) Test(org.junit.Test)

Aggregations

- TextAnalyzerConfig (org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig): 6
- Test (org.junit.Test): 5
- TextAnalyzer (org.apache.stanbol.commons.opennlp.TextAnalyzer): 3
- AnalysedText (org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText): 3
- HashMap (java.util.HashMap): 2
- EntityLinkerConfig (org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig): 2
- ArrayList (java.util.ArrayList): 1
- HashSet (java.util.HashSet): 1
- List (java.util.List): 1
- IRI (org.apache.clerezza.commons.rdf.IRI): 1
- RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm): 1
- PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl): 1
- TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl): 1
- KeywordLinkingEngine (org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine): 1
- EntityLinker (org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinker): 1
- LinkedEntity (org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity): 1
- Suggestion (org.apache.stanbol.enhancer.engines.keywordextraction.impl.Suggestion): 1
- OpenNlpAnalysedContentFactory (org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory): 1
- ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem): 1
- StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource): 1