
Example 1 with RDFTerm

Use of org.apache.clerezza.commons.rdf.RDFTerm in project stanbol by apache.

The class FstLinkingEngineComponent, method applyConfig.

/**
     * Called by {@link #activate(ComponentContext)}, 
     * {@link PlainFstLinkingComponnet#activate(ComponentContext)} and 
     * {@link NamedEntityFstLinkingComponnet#activate(ComponentContext)} to
     * apply the passed {@link ComponentContext#getProperties()}. The
     * {@link LinkingModeEnum linking mode} is passed separately, as OSGI does
     * not allow modifying the passed config and sub-classes need to override
     * the linking mode.
     * @param linkingMode the linking mode
     * @param properties the configuration properties to apply
     * @param prefixService the service used to resolve namespace prefixes in
     * configured URIs
     * @throws ConfigurationException if a required property is missing or invalid
     */
protected void applyConfig(LinkingModeEnum linkingMode, Dictionary<String, Object> properties, NamespacePrefixService prefixService) throws ConfigurationException {
    //(0) The name for the Enhancement Engine and the basic metadata
    Object value = properties.get(PROPERTY_NAME);
    if (value == null || value.toString().isEmpty()) {
        throw new ConfigurationException(PROPERTY_NAME, "The EnhancementEngine name MUST BE configured!");
    } else {
        this.engineName = value.toString();
    }
    log.info(" - engine name: {}", engineName);
    engineMetadata = new Hashtable<String, Object>();
    engineMetadata.put(PROPERTY_NAME, this.engineName);
    value = properties.get(Constants.SERVICE_RANKING);
    engineMetadata.put(Constants.SERVICE_RANKING, value == null ? Integer.valueOf(0) : value);
    //(0) set the linking mode
    this.linkingMode = linkingMode;
    log.info(" - linking mode: {}", linkingMode);
    //(1) parse the TextProcessing configuration
    //TODO: decide if we should use the TextProcessingConfig for this engine
    textProcessingConfig = TextProcessingConfig.createInstance(properties);
    //change default for EntityLinkerConfig.MIN_FOUND_TOKENS
    value = properties.get(EntityLinkerConfig.MIN_FOUND_TOKENS);
    entityLinkerConfig = EntityLinkerConfig.createInstance(properties, prefixService);
    if (value == null) {
        //no MIN_FOUND_TOKENS config present
        //manually set the default to the value used by this engine
        entityLinkerConfig.setMinFoundTokens(FST_DEFAULT_MIN_FOUND_TOKENS);
    }
    //(2) parse the configured IndexReference
    value = properties.get(SOLR_CORE);
    if (value == null) {
        throw new ConfigurationException(SOLR_CORE, "Missing required configuration of the SolrCore");
    } else {
        indexReference = IndexReference.parse(value.toString());
    }
    value = properties.get(IndexConfiguration.FIELD_ENCODING);
    if (value == null) {
        throw new ConfigurationException(IndexConfiguration.FIELD_ENCODING, "Missing required configuration of the Solr Field Encoding");
    } else {
        try {
            fieldEncoding = FieldEncodingEnum.valueOf(value.toString().trim());
        } catch (IllegalArgumentException e) {
            throw new ConfigurationException(IndexConfiguration.FIELD_ENCODING, "The configured " + "FieldEncoding MUST BE a member of " + Arrays.toString(FieldEncodingEnum.values()), e);
        }
    }
    value = properties.get(IndexConfiguration.SKIP_ALT_TOKENS);
    if (value instanceof Boolean) {
        skipAltTokensConfig = ((Boolean) value);
    } else if (value != null) {
        skipAltTokensConfig = Boolean.valueOf(value.toString());
    }
    // else no config -> will use the default
    //(4) parse Origin information
    value = properties.get(ORIGIN);
    if (value instanceof RDFTerm) {
        origin = (RDFTerm) value;
    } else if (value instanceof String) {
        try {
            URI originUri = new URI((String) value);
            if (originUri.isAbsolute()) {
                origin = new IRI((String) value);
            } else {
                origin = new PlainLiteralImpl((String) value);
            }
        } catch (URISyntaxException e) {
            origin = new PlainLiteralImpl((String) value);
        }
        log.info(" - origin: {}", origin);
    } else if (value != null) {
        log.warn("Values of the {} property MUST BE of type RDFTerm or String " + "(parsed: {} (type:{}))", new Object[] { ORIGIN, value, value.getClass() });
    }
    //else no ORIGIN information provided
    //(5) init the FST configuration
    //We can create the default configuration only here, as it depends on the
    //name of the solrIndex
    String defaultConfig = "*;" + IndexConfiguration.PARAM_FST + "=" + indexReference.getIndex() + ";" + IndexConfiguration.PARAM_FIELD + "=" + IndexConfiguration.DEFAULT_FIELD;
    fstConfig = new LanguageConfiguration(IndexConfiguration.FST_CONFIG, new String[] { defaultConfig });
    //now set the actual configuration parsed to the engine
    value = properties.get(IndexConfiguration.FST_CONFIG);
    if (value != null && !StringUtils.isBlank(value.toString())) {
        fstConfig.setConfiguration(properties);
    }
    //else keep the default
    value = properties.get(IndexConfiguration.FST_FOLDER);
    if (value instanceof String) {
        this.fstFolder = ((String) value).trim();
        if (this.fstFolder.isEmpty()) {
            this.fstFolder = null;
        }
    } else if (value == null) {
        this.fstFolder = null;
    } else {
        throw new ConfigurationException(IndexConfiguration.FST_FOLDER, "Values MUST BE of type String " + "(found: " + value.getClass().getName() + ")!");
    }
    //(6) Create the ThreadPool used for the runtime creation of FST models
    value = properties.get(FST_THREAD_POOL_SIZE);
    int tpSize;
    if (value instanceof Number) {
        tpSize = ((Number) value).intValue();
    } else if (value != null) {
        try {
            tpSize = Integer.parseInt(value.toString());
        } catch (NumberFormatException e) {
            throw new ConfigurationException(FST_THREAD_POOL_SIZE, "Unable to parse the integer FST thread pool size from the " + "configured " + value.getClass().getSimpleName() + " '" + value + "'!", e);
        }
    } else {
        tpSize = -1;
    }
    if (tpSize <= 0) {
        //if configured value <= 0 we use the default
        tpSize = DEFAULT_FST_THREAD_POOL_SIZE;
    }
    //build a ThreadFactoryBuilder for low priority daemon threads that
    //do use a meaningful name
    ThreadFactoryBuilder tfBuilder = new ThreadFactoryBuilder();
    //should be stopped if the VM closes
    tfBuilder.setDaemon(true);
    //low priority
    tfBuilder.setPriority(Thread.MIN_PRIORITY);
    tfBuilder.setNameFormat(engineName + "-FstRuntimeCreation-thread-%d");
    if (fstCreatorService != null && !fstCreatorService.isTerminated()) {
        //NOTE: We cannot call shutdownNow(), because interrupting threads
        //      here would also close FileChannels used by the SolrCore
        //      and produce java.nio.channels.ClosedByInterruptException
        //      exceptions followed by java.nio.channels.ClosedChannelException
        //      on following calls to affected files of the SolrIndex.
        //Because of that we just log a warning and let uncompleted tasks
        //complete!
        log.warn("Some items in a previous FST Runtime Creation ThreadPool have " + "still not finished!");
    }
    fstCreatorService = Executors.newFixedThreadPool(tpSize, tfBuilder.build());
    //(7) Parse the EntityCache config
    int entityCacheSize;
    value = properties.get(ENTITY_CACHE_SIZE);
    if (value instanceof Number) {
        entityCacheSize = ((Number) value).intValue();
    } else if (value != null) {
        try {
            entityCacheSize = Integer.parseInt(value.toString());
        } catch (NumberFormatException e) {
            throw new ConfigurationException(ENTITY_CACHE_SIZE, "Unable to parse the integer EntityCacheSize from the " + "configured " + value.getClass().getSimpleName() + " '" + value + "'!", e);
        }
    } else {
        entityCacheSize = -1;
    }
    if (entityCacheSize == 0) {
        log.info(" ... EntityCache deactivated");
        this.entityCacheSize = entityCacheSize;
    } else {
        this.entityCacheSize = entityCacheSize < 0 ? DEFAULT_ENTITY_CACHE_SIZE : entityCacheSize;
        log.info(" ... EntityCache enabled (size: {})", this.entityCacheSize);
    }
    //(8) parse the Entity type field
    value = properties.get(IndexConfiguration.SOLR_TYPE_FIELD);
    if (value == null || StringUtils.isBlank(value.toString())) {
        solrTypeField = null;
    } else {
        solrTypeField = value.toString().trim();
    }
    //(9) parse the Entity Ranking field
    value = properties.get(IndexConfiguration.SOLR_RANKING_FIELD);
    if (value == null) {
        solrRankingField = null;
    } else {
        solrRankingField = value.toString().trim();
    }
    //(10) parse the NamedEntity type mappings (if linkingMode = NER)
    if (linkingMode == LinkingModeEnum.NER) {
        nerTypeMappings = new HashMap<String, Set<String>>();
        value = properties.get(NAMED_ENTITY_TYPE_MAPPINGS);
        if (value instanceof String[]) {
            //support array
            value = Arrays.asList((String[]) value);
        } else if (value instanceof String) {
            //single value
            value = Collections.singleton(value);
        }
        if (value instanceof Collection<?>) {
            //and collection
            log.info(" - process Named Entity Type Mappings (used by LinkingMode: {})", linkingMode);
            configs: for (Object o : (Iterable<?>) value) {
                if (o != null) {
                    StringBuilder usage = new StringBuilder("usage: ");
                    usage.append("'{namedEntity-tag-or-uri} > {entityType-1}[,{entityType-n}]'");
                    String[] config = o.toString().split(">");
                    String namedEntityType = config[0].trim();
                    if (namedEntityType.isEmpty()) {
                        log.warn("Invalid Type Mapping Config '{}': Missing namedEntityType ({}) -> ignore this config", o, usage);
                        continue configs;
                    }
                    if (NamespaceMappingUtils.getPrefix(namedEntityType) != null) {
                        namedEntityType = NamespaceMappingUtils.getConfiguredUri(prefixService, NAMED_ENTITY_TYPE_MAPPINGS, namedEntityType);
                    }
                    if (config.length < 2 || config[1].isEmpty()) {
                        log.warn("Invalid Type Mapping Config '{}': Missing dc:type URI '{}' ({}) -> ignore this config", o, usage);
                        continue configs;
                    }
                    String entityTypes = config[1].trim();
                    if (config.length > 2) {
                        log.warn("Configuration after 2nd '>' gets ignored. Will use mapping '{} > {}' from config {}", new Object[] { namedEntityType, entityTypes, o });
                    }
                    Set<String> types = nerTypeMappings.get(namedEntityType);
                    if (types == null) {
                        //add new element to the mapping
                        types = new HashSet<String>();
                        nerTypeMappings.put(namedEntityType, types);
                    }
                    for (String entityType : entityTypes.split(";")) {
                        entityType = entityType.trim();
                        if (!entityType.isEmpty()) {
                            String typeUri;
                            if ("*".equals(entityType)) {
                                //null is used as wildcard
                                typeUri = null;
                            } else {
                                typeUri = NamespaceMappingUtils.getConfiguredUri(prefixService, NAMED_ENTITY_TYPE_MAPPINGS, entityType);
                            }
                            log.info("   - add {} > {}", namedEntityType, typeUri);
                            types.add(typeUri);
                        }
                    //else ignore empty mapping
                    }
                }
            }
        } else {
            //no mappings defined ... set wildcard mapping
            log.info(" - No Named Entity type mappings configured. Will use wildcard mappings");
            nerTypeMappings = Collections.singletonMap(null, Collections.<String>singleton(null));
        }
    }
    //(11) start tracking the SolrCore
    try {
        solrServerTracker = new RegisteredSolrServerTracker(bundleContext, indexReference, null) {

            @Override
            public void removedService(ServiceReference reference, Object service) {
                log.info(" ... SolrCore for {} was removed!", reference);
                //try to get an other serviceReference from the tracker
                if (reference.equals(FstLinkingEngineComponent.this.solrServerReference)) {
                    updateEngineRegistration(solrServerTracker.getServiceReference(), null);
                } else {
                    log.info("  - removed SolrCore was not used for FST linking");
                }
                super.removedService(reference, service);
            }

            @Override
            public void modifiedService(ServiceReference reference, Object service) {
                log.info(" ... SolrCore for {} was updated!", indexReference);
                updateEngineRegistration(solrServerTracker.getServiceReference(), null);
                super.modifiedService(reference, service);
            }

            @Override
            public SolrServer addingService(ServiceReference reference) {
                SolrServer server = super.addingService(reference);
                if (solrCore != null) {
                    log.info("Multiple SolrCores for name {}! Will update engine " + "with the newly added {}!", new Object[] { solrCore.getName(), indexReference, reference });
                }
                updateEngineRegistration(reference, server);
                return server;
            }
        };
    } catch (InvalidSyntaxException e) {
        throw new ConfigurationException(SOLR_CORE, "parsed SolrCore name '" + value.toString() + "' is invalid (expected: '[{server-name}:]{indexname}'");
    }
    try {
        solrServerTracker.open();
    } catch (RuntimeException e) {
        //FIX for STANBOL-1416 (see https://issues.apache.org/jira/browse/STANBOL-1416)
        //If an available SolrCore can not be correctly initialized we will
        //get the exception here. In this case we want this component to be
        //activated and waiting for further service events. Because of that
        //we catch here the exception.
        log.debug("Error while processing existing SolrCore Service during " + "opening SolrServiceTracker ... waiting for further service" + "Events", e);
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Set(java.util.Set) HashSet(java.util.HashSet) URISyntaxException(java.net.URISyntaxException) URI(java.net.URI) EmbeddedSolrServer(org.apache.solr.client.solrj.embedded.EmbeddedSolrServer) SolrServer(org.apache.solr.client.solrj.SolrServer) ConfigurationException(org.osgi.service.cm.ConfigurationException) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) InvalidSyntaxException(org.osgi.framework.InvalidSyntaxException) HashSet(java.util.HashSet) RegisteredSolrServerTracker(org.apache.stanbol.commons.solr.RegisteredSolrServerTracker) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) ServiceReference(org.osgi.framework.ServiceReference) Collection(java.util.Collection) LanguageConfiguration(org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration)
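For context, a minimal sketch of the property Dictionary that applyConfig(..) validates. The keys are the constants referenced in the snippet above; the concrete values ("dbpedia-fst-linking", "dbpedia:dbpedia", "SolrYard", LinkingModeEnum.PLAIN) are illustrative assumptions, not values mandated by the engine.

Dictionary<String, Object> config = new Hashtable<String, Object>();
config.put(PROPERTY_NAME, "dbpedia-fst-linking");          //required, must not be empty
config.put(SOLR_CORE, "dbpedia:dbpedia");                  //required, '[{server-name}:]{indexname}'
config.put(IndexConfiguration.FIELD_ENCODING, "SolrYard"); //required FieldEncodingEnum member (assumed value)
config.put(FST_THREAD_POOL_SIZE, 4);                       //optional, values <= 0 select the default
config.put(ENTITY_CACHE_SIZE, 65536);                      //optional, 0 deactivates the EntityCache
//applyConfig(LinkingModeEnum.PLAIN, config, prefixService);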

Example 2 with RDFTerm

Use of org.apache.clerezza.commons.rdf.RDFTerm in project stanbol by apache.

The class KeywordLinkingEngineTest, method testEngine.

/**
     * This tests if the Enhancements created by the Engine conform to the
     * rules defined for the Stanbol Enhancement Structure.
     * @throws IOException
     * @throws EngineException
     */
@Test
public void testEngine() throws IOException, EngineException {
    EntityLinkerConfig linkerConfig = new EntityLinkerConfig();
    linkerConfig.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
    KeywordLinkingEngine engine = KeywordLinkingEngine.createInstance(openNLP, searcher, new TextAnalyzerConfig(), linkerConfig);
    engine.referencedSiteName = TEST_REFERENCED_SITE_NAME;
    ContentItem ci = ciFactory.createContentItem(new StringSource(TEST_TEXT));
    //tells the engine that this is an English text
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("en")));
    //compute the enhancements
    engine.computeEnhancements(ci);
    //validate the enhancement results
    Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
    expectedValues.put(ENHANCER_EXTRACTED_FROM, ci.getUri());
    expectedValues.put(DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(engine.getClass().getName()));
    //adding null as expected for confidence makes it a required property
    expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
    //validate create fise:TextAnnotations
    int numTextAnnotations = validateAllTextAnnotations(ci.getMetadata(), TEST_TEXT, expectedValues);
    assertEquals("Four fise:TextAnnotations are expected by this Test", 4, numTextAnnotations);
    //validate create fise:EntityAnnotations
    int numEntityAnnotations = validateAllEntityAnnotations(ci, expectedValues);
    assertEquals("Five fise:EntityAnnotations are expected by this Test", 5, numEntityAnnotations);
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) EntityLinkerConfig(org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) HashMap(java.util.HashMap) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) KeywordLinkingEngine(org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine) TextAnalyzerConfig(org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)
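The expectedValues map above doubles as a requirement list: a null value (as used for fise:confidence) only asserts that the property is present, while a non-null value must also match. A minimal sketch of that convention, assuming a hypothetical getSingleValue(graph, subject, property) helper and an annotation resource to check - not the actual validateAllTextAnnotations implementation:

//sketch only: 'annotation' and 'getSingleValue' are hypothetical stand-ins
for (Map.Entry<IRI, RDFTerm> expected : expectedValues.entrySet()) {
    RDFTerm actual = getSingleValue(ci.getMetadata(), annotation, expected.getKey());
    //a null expected value still requires the property to be present
    assertNotNull("missing required property " + expected.getKey(), actual);
    if (expected.getValue() != null) {
        assertEquals("unexpected value for " + expected.getKey(), expected.getValue(), actual);
    }
}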

Example 3 with RDFTerm

Use of org.apache.clerezza.commons.rdf.RDFTerm in project stanbol by apache.

The class ClerezzaRDFUtils, method urifyBlankNodes.

public static void urifyBlankNodes(Graph model) {
    HashMap<BlankNode, IRI> blankNodeMap = new HashMap<BlankNode, IRI>();
    Graph remove = new SimpleGraph();
    Graph add = new SimpleGraph();
    for (Triple t : model) {
        BlankNodeOrIRI subj = t.getSubject();
        RDFTerm obj = t.getObject();
        IRI pred = t.getPredicate();
        boolean match = false;
        if (subj instanceof BlankNode) {
            match = true;
            IRI ru = blankNodeMap.get(subj);
            if (ru == null) {
                ru = createRandomUri();
                blankNodeMap.put((BlankNode) subj, ru);
            }
            subj = ru;
        }
        if (obj instanceof BlankNode) {
            match = true;
            IRI ru = blankNodeMap.get(obj);
            if (ru == null) {
                ru = createRandomUri();
                blankNodeMap.put((BlankNode) obj, ru);
            }
            obj = ru;
        }
        if (match) {
            remove.add(t);
            add.add(new TripleImpl(subj, pred, obj));
        }
    }
    model.removeAll(remove);
    model.addAll(add);
}
Also used : Triple(org.apache.clerezza.commons.rdf.Triple) IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) HashMap(java.util.HashMap) BlankNode(org.apache.clerezza.commons.rdf.BlankNode) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)
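A short usage sketch: after the call every BlankNode subject or object is consistently replaced by one generated IRI - the blankNodeMap cache guarantees that all occurrences of the same blank node map to the same IRI. The example.org URIs are illustrative only.

Graph model = new SimpleGraph();
BlankNode author = new BlankNode();
model.add(new TripleImpl(author, new IRI("http://xmlns.com/foaf/0.1/name"),
        new PlainLiteralImpl("Alice")));
model.add(new TripleImpl(new IRI("http://example.org/doc"),
        new IRI("http://purl.org/dc/terms/creator"), author));
ClerezzaRDFUtils.urifyBlankNodes(model);
//both triples now share the same random IRI where 'author' used to be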

Example 4 with RDFTerm

Use of org.apache.clerezza.commons.rdf.RDFTerm in project stanbol by apache.

The class RdfSerializingWriter, method getObjectExpansionProperties.

private Set<IRI> getObjectExpansionProperties(GraphNode recipe) {
    final MultivaluedMap<String, String> queryParams = uriInfo.getQueryParameters(true);
    final List<String> paramValues = queryParams.get(OBJ_EXP_PARAM);
    final Set<IRI> result = new HashSet<IRI>();
    if (paramValues != null) {
        for (String uriString : paramValues) {
            result.add(new IRI(uriString));
        }
    }
    if (recipe != null) {
        Iterator<GraphNode> ingredients = recipe.getObjectNodes(RECIPES.ingredient);
        while (ingredients.hasNext()) {
            Iterator<RDFTerm> properties = ingredients.next().getObjects(RECIPES.ingredientProperty);
            while (properties.hasNext()) {
                result.add((IRI) properties.next());
            }
        }
    }
    return result;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) GraphNode(org.apache.clerezza.rdf.utils.GraphNode) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) HashSet(java.util.HashSet)
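The expansion set is the union of explicit OBJ_EXP_PARAM query-parameter values and any RECIPES.ingredientProperty values attached to the recipe's ingredients. A sketch of the recipe side, with illustrative example.org URIs (foaf:knows is an arbitrary example property):

Graph g = new SimpleGraph();
IRI recipeUri = new IRI("http://example.org/recipe");
IRI ingredientUri = new IRI("http://example.org/ingredient");
g.add(new TripleImpl(recipeUri, RECIPES.ingredient, ingredientUri));
g.add(new TripleImpl(ingredientUri, RECIPES.ingredientProperty,
        new IRI("http://xmlns.com/foaf/0.1/knows")));
GraphNode recipe = new GraphNode(recipeUri, g);
//getObjectExpansionProperties(recipe) now also contains foaf:knows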

Example 5 with RDFTerm

Use of org.apache.clerezza.commons.rdf.RDFTerm in project stanbol by apache.

The class SuggestionFunction, method processAnnotations.

/**
     * Suggestions are selected from all Annotations returned by the passed
     * {@link #annotationSelector}.
     * @param backend the RDF backend used to resolve selectors
     * @param annotations suggestions are selected for the union of the passed
     * annotations - the {limit} most linked entities over the passed
     * list of annotations.
     * @param limit the maximum number of suggestions for the passed collection
     * of annotations.
     * @param missingConfidenceMode how suggestions without a confidence value
     * are treated (filtered, ranked first or ranked last)
     * @param result results are added to this list.
     */
private void processAnnotations(final RDFBackend<RDFTerm> backend, Collection<RDFTerm> annotations, Integer limit, final int missingConfidenceMode, List<RDFTerm> result) {
    List<Entry<Double, RDFTerm>> suggestions = new ArrayList<Entry<Double, RDFTerm>>();
    for (RDFTerm annotation : annotations) {
        //NOTE: no Path Tracking support possible for selectors wrapped in functions
        for (RDFTerm suggestion : suggestionSelector.select(backend, annotation, null, null)) {
            Collection<RDFTerm> cs = confidenceSelector.select(backend, suggestion, null, null);
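            //confidence: the backend value if present; otherwise the
            //missingConfidenceMode decides: FILTER drops the suggestion (null),
            //FIRST ranks it highest (MAX), everything else ranks it lowest (MIN)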
            Double confidence = !cs.isEmpty() ? backend.doubleValue(cs.iterator().next()) : missingConfidenceMode == MISSING_CONFIDENCE_FILTER ? null : missingConfidenceMode == MISSING_CONFIDENCE_FIRST ? MAX : MIN;
            if (confidence != null) {
                suggestions.add(singletonMap(confidence, suggestion).entrySet().iterator().next());
            }
        }
    }
    Collections.sort(suggestions, SUGGESTION_COMPARATOR);
    int resultSize = limit != null ? Math.min(limit, suggestions.size()) : suggestions.size();
    for (Entry<Double, RDFTerm> suggestion : suggestions.subList(0, resultSize)) {
        if (resultSelector == null) {
            result.add(suggestion.getValue());
        } else {
            result.addAll(resultSelector.select(backend, suggestion.getValue(), null, null));
        }
    }
}
Also used : Entry(java.util.Map.Entry) ArrayList(java.util.ArrayList) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm)
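SUGGESTION_COMPARATOR itself is not shown in the snippet. Since the subList(0, resultSize) cut must keep the highest-confidence suggestions, a plausible sketch is a descending comparison on the entry key (an assumption about the actual constant, not its verbatim source):

//hedged sketch: order Map.Entry<Double, RDFTerm> by descending confidence
static final Comparator<Entry<Double, RDFTerm>> SUGGESTION_COMPARATOR =
    new Comparator<Entry<Double, RDFTerm>>() {
        @Override
        public int compare(Entry<Double, RDFTerm> a, Entry<Double, RDFTerm> b) {
            return b.getKey().compareTo(a.getKey()); //higher confidence first
        }
    };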

Aggregations

RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm): 126
IRI (org.apache.clerezza.commons.rdf.IRI): 84
Triple (org.apache.clerezza.commons.rdf.Triple): 70
BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI): 48
Literal (org.apache.clerezza.commons.rdf.Literal): 35
Test (org.junit.Test): 35
HashSet (java.util.HashSet): 30
HashMap (java.util.HashMap): 28
TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl): 26
Graph (org.apache.clerezza.commons.rdf.Graph): 24
ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem): 18
ArrayList (java.util.ArrayList): 17
PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl): 16
EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException): 13
OWLOntologyID (org.semanticweb.owlapi.model.OWLOntologyID): 13
SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph): 12
Collection (java.util.Collection): 10
IndexedGraph (org.apache.stanbol.commons.indexedgraph.IndexedGraph): 10
Lock (java.util.concurrent.locks.Lock): 9
IOException (java.io.IOException): 5