Search in sources :

Example 1 with EntityLinkerConfig

use of org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig in project stanbol by apache.

the class KeywordLinkingEngine method activateEntityLinkerConfig.

/**
 * Configures the {@link EntityLinkerConfig} of this engine with the values of
 * the following properties:<ul>
 * <li>{@link #NAME_FIELD}
 * <li>{@link #TYPE_FIELD}
 * <li>{@link #REDIRECT_FIELD}
 * <li>{@link #REDIRECT_PROCESSING_MODE}
 * <li>{@link #MAX_SUGGESTIONS}
 * <li>{@link #MIN_SEARCH_TOKEN_LENGTH}
 * <li>{@link #MIN_FOUND_TOKENS}
 * <li>{@link #MIN_TOKEN_MATCH_FACTOR}
 * </ul>
 * In addition the {@code CASE_SENSITIVE}, {@code DEFAULT_MATCHING_LANGUAGE}
 * and {@code TYPE_MAPPINGS} properties are processed.<p>
 * This method creates a new {@link EntityLinkerConfig} instance only if
 * <code>{@link #linkerConfig} == null</code>. If the instance is already
 * initialised, the current values for keys missing in the parsed configuration
 * are preserved. The one exception visible in this method is
 * {@link #MIN_TOKEN_MATCH_FACTOR}, which is reset to
 * {@link EntityLinkerConfig#DEFAULT_MIN_TOKEN_MATCH_FACTOR} when missing.
 * @param configuration the configuration
 * @throws ConfigurationException In case of an illegal value in the parsed configuration.
 * Note that all configuration values are assumed to be optional, therefore missing
 * values will not cause a ConfigurationException.
 */
protected void activateEntityLinkerConfig(Dictionary<String, Object> configuration) throws ConfigurationException {
    if (linkerConfig == null) {
        this.linkerConfig = new EntityLinkerConfig();
    }
    Object value;
    //init NAME_FIELD
    value = configuration.get(NAME_FIELD);
    if (value != null) {
        if (value.toString().isEmpty()) {
            throw new ConfigurationException(NAME_FIELD, "The configured name field MUST NOT be empty");
        }
        linkerConfig.setNameField(NamespaceMappingUtils.getConfiguredUri(nsPrefixService, NAME_FIELD, value.toString()));
    }
    //init case sensitivity (if NULL or empty the current state is kept)
    value = configuration.get(CASE_SENSITIVE);
    if (value instanceof Boolean) {
        linkerConfig.setCaseSensitiveMatchingState((Boolean) value);
    } else if (value != null && !value.toString().isEmpty()) {
        linkerConfig.setCaseSensitiveMatchingState(Boolean.valueOf(value.toString()));
    }
    //init TYPE_FIELD
    value = configuration.get(TYPE_FIELD);
    if (value != null) {
        if (value.toString().isEmpty()) {
            throw new ConfigurationException(TYPE_FIELD, "The configured type field MUST NOT be empty");
        }
        linkerConfig.setTypeField(NamespaceMappingUtils.getConfiguredUri(nsPrefixService, TYPE_FIELD, value.toString()));
    }
    //init REDIRECT_FIELD
    value = configuration.get(REDIRECT_FIELD);
    if (value != null) {
        if (value.toString().isEmpty()) {
            //report the error for the property that is actually invalid
            throw new ConfigurationException(REDIRECT_FIELD, "The configured redirect field MUST NOT be empty");
        }
        linkerConfig.setRedirectField(NamespaceMappingUtils.getConfiguredUri(nsPrefixService, REDIRECT_FIELD, value.toString()));
    }
    //init MAX_SUGGESTIONS
    Integer maxSuggestions = parsePositiveIntegerConfig(MAX_SUGGESTIONS, configuration.get(MAX_SUGGESTIONS));
    if (maxSuggestions != null) {
        linkerConfig.setMaxSuggestions(maxSuggestions);
    }
    //init MIN_FOUND_TOKENS
    Integer minFoundTokens = parsePositiveIntegerConfig(MIN_FOUND_TOKENS, configuration.get(MIN_FOUND_TOKENS));
    if (minFoundTokens != null) {
        linkerConfig.setMinFoundTokens(minFoundTokens);
    }
    // init MIN_SEARCH_TOKEN_LENGTH
    Integer minSearchTokenLength = parsePositiveIntegerConfig(MIN_SEARCH_TOKEN_LENGTH, configuration.get(MIN_SEARCH_TOKEN_LENGTH));
    if (minSearchTokenLength != null) {
        linkerConfig.setMinSearchTokenLength(minSearchTokenLength);
    }
    //init the REDIRECT_PROCESSING_MODE
    value = configuration.get(REDIRECT_PROCESSING_MODE);
    if (value != null) {
        try {
            linkerConfig.setRedirectProcessingMode(RedirectProcessingMode.valueOf(value.toString()));
        } catch (IllegalArgumentException e) {
            //keep the cause so that the illegal value shows up in the stack trace
            throw new ConfigurationException(REDIRECT_PROCESSING_MODE, "Values MUST be one of " + Arrays.toString(RedirectProcessingMode.values()), e);
        }
    }
    //init the DEFAULT_LANGUAGE
    value = configuration.get(DEFAULT_MATCHING_LANGUAGE);
    if (value != null) {
        String defaultLang = value.toString().trim();
        if (defaultLang.isEmpty()) {
            linkerConfig.setDefaultLanguage(null);
        } else if (defaultLang.length() == 1) {
            throw new ConfigurationException(DEFAULT_MATCHING_LANGUAGE, "Illegal language code '" + defaultLang + "'! Language Codes MUST BE at least 2 chars long.");
        } else {
            linkerConfig.setDefaultLanguage(defaultLang);
        }
    }
    // init MIN_TOKEN_MATCH_FACTOR
    // NOTE(review): unlike the other properties a missing value resets the
    // factor to the default instead of preserving the current value.
    value = configuration.get(MIN_TOKEN_MATCH_FACTOR);
    float minTokenMatchFactor;
    if (value instanceof Number) {
        minTokenMatchFactor = ((Number) value).floatValue();
    } else if (value != null) {
        try {
            minTokenMatchFactor = Float.valueOf(value.toString());
        } catch (NumberFormatException e) {
            throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR, "Unable to parse the minimum token match factor from the parsed value " + value, e);
        }
    } else {
        minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR;
    }
    //negative values select the default, regardless of how the value was parsed
    if (minTokenMatchFactor < 0) {
        minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR;
    }
    //NaN compares false with everything and needs an explicit check
    if (Float.isNaN(minTokenMatchFactor) || minTokenMatchFactor == 0 || minTokenMatchFactor > 1) {
        throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR, "The minimum token match factor MUST be > 0 and <= 1 (negative values for the default)");
    }
    linkerConfig.setMinTokenMatchFactor(minTokenMatchFactor);
    //init type mappings
    value = configuration.get(TYPE_MAPPINGS);
    if (value instanceof String[]) {
        //support array
        value = Arrays.asList((String[]) value);
    } else if (value instanceof String) {
        //single value
        value = Collections.singleton(value);
    }
    if (value instanceof Collection<?>) {
        //and collection
        log.info("Init Type Mappings");
        //the usage hint is the same for every entry
        String usage = "useages: a: '{uri}' short for {uri} > {uri} | " + "b: '{source1};{source2};..;{sourceN} > {target}'";
        for (Object o : (Iterable<?>) value) {
            if (o == null) {
                continue;
            }
            //NOTE: split(..) drops trailing empty strings, so ">" and ";"
            //result in empty arrays that must not be indexed
            String[] config = o.toString().split(">");
            if (config.length == 0 || config[0].trim().isEmpty()) {
                log.warn("Invalid Type Mapping Config '{}': Missing Source Type ({}) -> ignore this config", o, usage);
                continue;
            }
            String[] sourceTypes = config[0].split(";");
            if (sourceTypes.length == 0) {
                log.warn("Invalid Type Mapping Config '{}': Missing Source Type ({}) -> ignore this config", o, usage);
                continue;
            }
            //the short form '{uri}' (no explicit target) is only allowed for a
            //single source type
            String targetType;
            if (config.length >= 2) {
                targetType = config[1].trim();
            } else if (sourceTypes.length == 1) {
                targetType = sourceTypes[0].trim();
            } else {
                targetType = "";
            }
            if (targetType.isEmpty()) {
                log.warn("Invalid Type Mapping Config '{}': Missing Target Type ({}) -> ignore this config", o, usage);
                continue;
            }
            //support for ns:localName
            targetType = NamespaceMappingUtils.getConfiguredUri(nsPrefixService, TYPE_MAPPINGS, targetType);
            try {
                //validate
                new URI(targetType);
            } catch (URISyntaxException e) {
                log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this config", targetType, o);
                continue;
            }
            IRI targetUri = new IRI(targetType);
            for (String sourceType : sourceTypes) {
                sourceType = sourceType.trim();
                if (sourceType.isEmpty()) {
                    continue;
                }
                //support for ns:localName
                sourceType = NamespaceMappingUtils.getConfiguredUri(nsPrefixService, TYPE_MAPPINGS, sourceType);
                try {
                    //validate
                    new URI(sourceType);
                } catch (URISyntaxException e) {
                    log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this source type", sourceType, o);
                    continue;
                }
                IRI old = linkerConfig.setTypeMapping(sourceType, targetUri);
                if (old == null) {
                    log.info(" > add type mapping {} > {}", sourceType, targetType);
                } else {
                    log.info(" > set type mapping {} > {} (old: {})", new Object[] { sourceType, targetType, old.getUnicodeString() });
                }
            }
        }
    } else {
        log.debug("No Type mappings configured");
    }
}

/**
 * Parses an optional configuration value that is required to be an integer &gt; 0.
 * @param property the name of the configuration property (used for error reporting)
 * @param value the configured value: an {@link Integer}, any other object whose
 * {@code toString()} is parsed as integer, or <code>null</code> if not configured
 * @return the parsed value or <code>null</code> if the parsed value was <code>null</code>
 * @throws ConfigurationException if the value is not a valid integer or is lower than one
 */
private static Integer parsePositiveIntegerConfig(String property, Object value) throws ConfigurationException {
    if (value == null) {
        return null;
    }
    Integer parsed;
    if (value instanceof Integer) {
        parsed = (Integer) value;
    } else {
        try {
            parsed = Integer.valueOf(value.toString());
        } catch (NumberFormatException e) {
            throw new ConfigurationException(property, "Values MUST be valid Integer values > 0", e);
        }
    }
    if (parsed < 1) {
        throw new ConfigurationException(property, "Values MUST be valid Integer values > 0");
    }
    return parsed;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) EntityLinkerConfig(org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig) URISyntaxException(java.net.URISyntaxException) URI(java.net.URI) ConfigurationException(org.osgi.service.cm.ConfigurationException) Collection(java.util.Collection)

Example 2 with EntityLinkerConfig

use of org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig in project stanbol by apache.

the class KeywordLinkingEngineTest method testTaxonomyLinker.

/**
 * This tests the EntityLinker functionality: every linked entity must be an
 * expected one, its suggestions must match the expected ids in ranking order
 * with non-increasing scores, and every expected entity must have been linked.
 * @throws Exception
 */
@Test
public void testTaxonomyLinker() throws Exception {
    OpenNlpAnalysedContentFactory acf = OpenNlpAnalysedContentFactory.getInstance(openNLP, new TextAnalyzerConfig());
    EntityLinkerConfig config = new EntityLinkerConfig();
    config.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
    EntityLinker linker = new EntityLinker(acf.create(TEST_TEXT, "en"), searcher, config);
    linker.process();
    //selected text -> ids of the expected suggestions (in ranking order)
    Map<String, List<String>> expectedResults = new HashMap<String, List<String>>();
    expectedResults.put("Patrick Marshall", new ArrayList<String>(Arrays.asList("urn:test:PatrickMarshall")));
    expectedResults.put("geologist", new ArrayList<String>(//the redirected entity
    Arrays.asList("urn:test:redirect:Geologist")));
    expectedResults.put("New Zealand", new ArrayList<String>(Arrays.asList("urn:test:NewZealand")));
    expectedResults.put("University of Otago", new ArrayList<String>(Arrays.asList("urn:test:UniversityOfOtago", "urn:test:UniversityOfOtago_Texas")));
    for (LinkedEntity linkedEntity : linker.getLinkedEntities().values()) {
        //remove(..) so that an entity linked twice is detected as unexpected
        List<String> expectedSuggestions = expectedResults.remove(linkedEntity.getSelectedText());
        assertNotNull("LinkedEntity " + linkedEntity.getSelectedText() + " is not an expected Result (or was found twice)", expectedSuggestions);
        assertEquals("Number of suggestions " + linkedEntity.getSuggestions().size() + " != number of expected suggestions " + expectedSuggestions.size() + " for selection " + linkedEntity.getSelectedText(), expectedSuggestions.size(), linkedEntity.getSuggestions().size());
        //the score of a suggestion must not be higher than the one of the
        //linked entity nor than the one of the previous suggestion
        double score = linkedEntity.getScore();
        for (int i = 0; i < expectedSuggestions.size(); i++) {
            Suggestion suggestion = linkedEntity.getSuggestions().get(i);
            assertEquals("Expecced Suggestion at Rank " + i + " expected: " + expectedSuggestions.get(i) + " suggestion: " + suggestion.getRepresentation().getId(), expectedSuggestions.get(i), suggestion.getRepresentation().getId());
            assertTrue("Score of suggestion " + i + " (" + suggestion.getScore() + ") > as of the previous one (" + score + ")", score >= suggestion.getScore());
            score = suggestion.getScore();
        }
    }
    //without this check the test would pass even if no entity was linked
    assertTrue("Expected LinkedEntities " + expectedResults.keySet() + " were not found", expectedResults.isEmpty());
}
Also used : LinkedEntity(org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity) EntityLinkerConfig(org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig) OpenNlpAnalysedContentFactory(org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory) HashMap(java.util.HashMap) EntityLinker(org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinker) Suggestion(org.apache.stanbol.enhancer.engines.keywordextraction.impl.Suggestion) List(java.util.List) ArrayList(java.util.ArrayList) TextAnalyzerConfig(org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig) Test(org.junit.Test)

Example 3 with EntityLinkerConfig

use of org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig in project stanbol by apache.

the class KeywordLinkingEngineTest method testEngine.

/**
 * Runs the engine on the English test text and validates that the created
 * enhancements conform to the rules defined for the Stanbol Enhancement
 * Structure. Four fise:TextAnnotations and five fise:EntityAnnotations are
 * expected.
 * @throws IOException
 * @throws EngineException
 */
@Test
public void testEngine() throws IOException, EngineException {
    //engine under test, configured to follow redirects
    EntityLinkerConfig followRedirectsConfig = new EntityLinkerConfig();
    followRedirectsConfig.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
    KeywordLinkingEngine linkingEngine = KeywordLinkingEngine.createInstance(
        openNLP, searcher, new TextAnalyzerConfig(), followRedirectsConfig);
    linkingEngine.referencedSiteName = TEST_REFERENCED_SITE_NAME;

    //content item holding the test text, explicitly marked as English
    StringSource testTextSource = new StringSource(TEST_TEXT);
    ContentItem contentItem = ciFactory.createContentItem(testTextSource);
    contentItem.getMetadata().add(
        new TripleImpl(contentItem.getUri(), DC_LANGUAGE, new PlainLiteralImpl("en")));

    linkingEngine.computeEnhancements(contentItem);

    //property values every created enhancement is validated against
    Map<IRI, RDFTerm> requiredValues = new HashMap<IRI, RDFTerm>();
    requiredValues.put(ENHANCER_EXTRACTED_FROM, contentItem.getUri());
    String engineClassName = linkingEngine.getClass().getName();
    requiredValues.put(DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(engineClassName));
    //a null value marks the confidence as required without fixing its value
    requiredValues.put(Properties.ENHANCER_CONFIDENCE, null);

    //fise:TextAnnotations
    int textAnnotationCount = validateAllTextAnnotations(contentItem.getMetadata(), TEST_TEXT, requiredValues);
    assertEquals("Four fise:TextAnnotations are expected by this Test", 4, textAnnotationCount);

    //fise:EntityAnnotations
    int entityAnnotationCount = validateAllEntityAnnotations(contentItem, requiredValues);
    assertEquals("Five fise:EntityAnnotations are expected by this Test", 5, entityAnnotationCount);
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) EntityLinkerConfig(org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) HashMap(java.util.HashMap) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) KeywordLinkingEngine(org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine) TextAnalyzerConfig(org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Aggregations

EntityLinkerConfig (org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig)3 HashMap (java.util.HashMap)2 IRI (org.apache.clerezza.commons.rdf.IRI)2 TextAnalyzerConfig (org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig)2 Test (org.junit.Test)2 URI (java.net.URI)1 URISyntaxException (java.net.URISyntaxException)1 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 List (java.util.List)1 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)1 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)1 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)1 KeywordLinkingEngine (org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine)1 EntityLinker (org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinker)1 LinkedEntity (org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity)1 Suggestion (org.apache.stanbol.enhancer.engines.keywordextraction.impl.Suggestion)1 OpenNlpAnalysedContentFactory (org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory)1 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)1 StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource)1