Search in sources :

Example 1 with PlainLiteralImpl

use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.

the class FstLinkingEngine method writeEnhancements.

/**
 * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
 * extracted from the parsed ContentItem
 * @param ci
 * @param tags
 * @param language
 */
private void writeEnhancements(ContentItem ci, String text, Collection<Tag> tags, String language, boolean writeRankings) {
    Language languageObject = null;
    if (language != null && !language.isEmpty()) {
        languageObject = new Language(language);
    }
    Graph metadata = ci.getMetadata();
    for (Tag tag : tags) {
        Collection<IRI> textAnnotations = new ArrayList<IRI>(tags.size());
        // first create the TextAnnotations for the Occurrences
        Literal startLiteral = literalFactory.createTypedLiteral(tag.getStart());
        Literal endLiteral = literalFactory.createTypedLiteral(tag.getEnd());
        // search for existing text annotation
        Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
        IRI textAnnotation = null;
        while (it.hasNext()) {
            Triple t = it.next();
            if (metadata.filter(t.getSubject(), ENHANCER_END, endLiteral).hasNext() && metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
                textAnnotation = (IRI) t.getSubject();
                break;
            }
        }
        if (textAnnotation == null) {
            // not found ... create a new one
            textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(text, tag.getAnchor(), tag.getStart()), languageObject)));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(tag.getAnchor(), languageObject)));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(tag.getScore())));
        } else {
            // if existing add this engine as contributor
            metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
        }
        // add dc:types (even to existing)
        for (IRI dcType : getDcTypes(tag.getSuggestions())) {
            metadata.add(new TripleImpl(textAnnotation, Properties.DC_TYPE, dcType));
        }
        textAnnotations.add(textAnnotation);
        // now the EntityAnnotations for the Suggestions
        for (Match match : tag.getSuggestions()) {
            IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            // should we use the label used for the match, or search the
            // representation for the best label ... currently its the matched one
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_LABEL, match.getMatchLabel()));
            metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, new IRI(match.getUri())));
            for (IRI type : match.getTypes()) {
                metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_TYPE, type));
            }
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(match.getScore())));
            // add the relation to the fise:TextAnnotation (the tag)
            metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation));
            // write origin information
            if (indexConfig.getOrigin() != null) {
                metadata.add(new TripleImpl(entityAnnotation, FISE_ORIGIN, indexConfig.getOrigin()));
            }
            // }
            if (writeRankings) {
                Double ranking = match.getRanking();
                if (ranking != null) {
                    metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_RANKING, literalFactory.createTypedLiteral(ranking)));
                }
            }
        // TODO: dereferencing
        // if(linkerConfig.isDereferenceEntitiesEnabled() &&
        // dereferencedEntitis.add(entity.getUri())){ //not yet dereferenced
        // //add all outgoing triples for this entity
        // //NOTE: do not add all triples as there might be other data in the graph
        // for(Iterator<Triple> triples = entity.getData().filter(entity.getUri(), null, null);
        // triples.hasNext();metadata.add(triples.next()));
        // }
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) ArrayList(java.util.ArrayList) Triple(org.apache.clerezza.commons.rdf.Triple) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) Literal(org.apache.clerezza.commons.rdf.Literal) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 2 with PlainLiteralImpl

use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.

the class LanguageDetectionEnhancementEngine method computeEnhancements.

public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    String text = "";
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    // do not call trim() on long texts to check if the text is empty
    if (text.length() < 50 && text.trim().length() == 0) {
        log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
        return;
    }
    // truncate text to some piece from the middle if probeLength > 0
    int checkLength = probeLength;
    if (checkLength > 0 && text.length() > checkLength) {
        text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
    }
    List<Language> languages = null;
    try {
        languages = languageIdentifier.getLanguages(text);
        log.debug("language identified: {}", languages);
    } catch (LangDetectException e) {
        Enum<?> errorCode = e.getCode();
        // ignore " 0 - NoTextError" and "5 - CantDetectError"
        if (errorCode.ordinal() != 0 && errorCode.ordinal() != 5) {
            StringBuilder msg = new StringBuilder("Could not identify language of text: ");
            if (text.length() < 200) {
                msg.append(text);
            } else {
                msg.append(text.subSequence(0, 199)).append("...");
            }
            msg.append(" (Error Code: ").append(errorCode.ordinal()).append(" - ").append(errorCode.name()).append(")");
            throw new EngineException(this, ci, msg.toString(), e);
        } else {
            log.debug("No text to detect the language from present in ContentItem ", ci);
        }
    }
    // add language to metadata
    if (languages != null) {
        Graph g = ci.getMetadata();
        ci.getLock().writeLock().lock();
        try {
            for (int i = 0; i < maxSuggestedLanguages && i < languages.size(); i++) {
                // add a hypothesis
                Language hypothesis = languages.get(i);
                IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(hypothesis.lang)));
                g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
                g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
                g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) Graph(org.apache.clerezza.commons.rdf.Graph) Language(com.cybozu.labs.langdetect.Language) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) LangDetectException(com.cybozu.labs.langdetect.LangDetectException)

Example 3 with PlainLiteralImpl

use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.

the class FstLinkingEngineTest method setup.

@BeforeClass
public static void setup() throws Exception {
    // get the working directory
    // use property substitution to test this feature!
    String prefix = System.getProperty("basedir") == null ? "." : "${basedir}";
    String solrServerDir = prefix + TEST_INDEX_REL_PATH;
    log.info("Test Solr Server Directory: {}", solrServerDir);
    System.setProperty(ManagedSolrServer.MANAGED_SOLR_DIR_PROPERTY, solrServerDir);
    SolrYardConfig config = new SolrYardConfig(TEST_YARD_ID, TEST_SOLR_CORE_NAME);
    config.setAllowInitialisation(false);
    // the dbpedia default data
    config.setIndexConfigurationName(TEST_SOLR_CORE_CONFIGURATION);
    // init from datafile provider
    config.setAllowInitialisation(true);
    config.setName("DBpedia.org default data");
    config.setDescription("Data used for FstLinkingEngie tests");
    // create the Yard used for the tests
    IndexReference solrIndexRef = IndexReference.parse(config.getSolrServerLocation());
    SolrServer server = StandaloneEmbeddedSolrServerProvider.getInstance().getSolrServer(solrIndexRef, config.getIndexConfigurationName());
    Assert.assertNotNull("Unable to initialise SolrServer for testing", server);
    core = ((EmbeddedSolrServer) server).getCoreContainer().getCore(solrIndexRef.getIndex());
    Assert.assertNotNull("Unable to get SolrCore '" + config.getIndexConfigurationName() + "' from SolrServer " + server, core);
    yard = new SolrYard(server, config, null);
    // setup the index configuration
    LanguageConfiguration langConf = new LanguageConfiguration("not.used", new String[] { "en;field=dbpedia-ont:surfaceForm;generate=true" });
    fstConfig = new IndexConfiguration(langConf, core, FieldEncodingEnum.SolrYard, "");
    fstConfig.setExecutorService(Executors.newFixedThreadPool(1));
    fstConfig.setTypeField("rdf:type");
    fstConfig.setRankingField("entityhub:entityRank");
    // fstConfig.setEntityCacheManager(new FastLRUCacheManager(2048));
    fstConfig.setOrigin(new PlainLiteralImpl(TEST_ORIGIN));
    // activate the FST config
    // activate this configuration
    fstConfig.activate();
    // validate that the index contains the expected entities
    validateTestIndex();
    // now create the FST models
    List<Future<?>> creationTasks = new ArrayList<Future<?>>();
    for (CorpusInfo corpus : fstConfig.getCorpora()) {
        Assert.assertTrue("Failure in UnitTest - all FST models need to be generate=true", corpus.allowCreation);
        if (!corpus.isFstFile()) {
            // create a task on the FST corpus creation service
            creationTasks.add(fstConfig.getExecutorService().submit(new CorpusCreationTask(fstConfig, corpus)));
        }
    }
    // typical hardware
    for (Future<?> future : creationTasks) {
        try {
            future.get(FST_CREATION_WAIT_TIME, TimeUnit.SECONDS);
        } catch (TimeoutException e) {
        // we assert on future.isDone instead
        }
        Assert.assertTrue("FST Model creation not finished after " + FST_CREATION_WAIT_TIME + "seconds", future.isDone());
    }
}
Also used : PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) ArrayList(java.util.ArrayList) EmbeddedSolrServer(org.apache.solr.client.solrj.embedded.EmbeddedSolrServer) SolrServer(org.apache.solr.client.solrj.SolrServer) ManagedSolrServer(org.apache.stanbol.commons.solr.managed.ManagedSolrServer) SolrYard(org.apache.stanbol.entityhub.yard.solr.impl.SolrYard) SolrYardConfig(org.apache.stanbol.entityhub.yard.solr.impl.SolrYardConfig) IndexConfiguration(org.apache.stanbol.enhancer.engines.lucenefstlinking.IndexConfiguration) CorpusInfo(org.apache.stanbol.enhancer.engines.lucenefstlinking.CorpusInfo) Future(java.util.concurrent.Future) CorpusCreationTask(org.apache.stanbol.enhancer.engines.lucenefstlinking.CorpusCreationTask) LanguageConfiguration(org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration) IndexReference(org.apache.stanbol.commons.solr.IndexReference) EmbeddedSolrServer(org.apache.solr.client.solrj.embedded.EmbeddedSolrServer) TimeoutException(java.util.concurrent.TimeoutException) BeforeClass(org.junit.BeforeClass)

Example 4 with PlainLiteralImpl

use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.

the class LangIdEnhancementEngine method computeEnhancements.

public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    String text = "";
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().length() == 0) {
        log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
        return;
    }
    // truncate text to some piece from the middle if probeLength > 0
    int checkLength = probeLength;
    if (checkLength > 0 && text.length() > checkLength) {
        text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
    }
    LanguageIdentifier languageIdentifier = new LanguageIdentifier(text);
    String language = languageIdentifier.getLanguage();
    log.info("language identified as " + language);
    // add language to metadata
    Graph g = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
        g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(language)));
        g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) LanguageIdentifier(org.apache.tika.language.LanguageIdentifier) Graph(org.apache.clerezza.commons.rdf.Graph) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) IOException(java.io.IOException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 5 with PlainLiteralImpl

use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.

the class FstLinkingEngineComponent method applyConfig.

/**
 * Called by {@link #activate(ComponentContext)},
 * {@link PlainFstLinkingComponnet#activate(ComponentContext)} and
 * {@link NamedEntityFstLinkingComponnet#activate(ComponentContext)} to
 * apply the parsed {@link ComponentContext#getProperties()}. The
 * {@link LinkingModeEnum linking mode} is parsed separately as OSGI does not
 * allow to modify the parsed config and sup-classes do need to override
 * the linking mode.
 * @param linkingMode the linking mode
 * @param properties
 * @throws ConfigurationException
 */
protected void applyConfig(LinkingModeEnum linkingMode, Dictionary<String, Object> properties, NamespacePrefixService prefixService) throws ConfigurationException {
    // (0) The name for the Enhancement Engine and the basic metadata
    Object value = properties.get(PROPERTY_NAME);
    if (value == null || value.toString().isEmpty()) {
        throw new ConfigurationException(PROPERTY_NAME, "The EnhancementEngine name MUST BE configured!");
    } else {
        this.engineName = value.toString();
    }
    log.info(" - engine name: {}", engineName);
    engineMetadata = new Hashtable<String, Object>();
    engineMetadata.put(PROPERTY_NAME, this.engineName);
    value = properties.get(Constants.SERVICE_RANKING);
    engineMetadata.put(Constants.SERVICE_RANKING, value == null ? Integer.valueOf(0) : value);
    // (0) set the linking mode
    this.linkingMode = linkingMode;
    log.info(" - linking mode: {}", linkingMode);
    // (1) parse the TextProcessing configuration
    // TODO: decide if we should use the TextProcessingConfig for this engine
    textProcessingConfig = TextProcessingConfig.createInstance(properties);
    // change default for EntityLinkerConfig.MIN_FOUND_TOKENS
    value = properties.get(EntityLinkerConfig.MIN_FOUND_TOKENS);
    entityLinkerConfig = EntityLinkerConfig.createInstance(properties, prefixService);
    if (value == null) {
        // no MIN_FOUND_TOKENS config present
        // manually set the default to the value used by this engine
        entityLinkerConfig.setMinFoundTokens(FST_DEFAULT_MIN_FOUND_TOKENS);
    }
    // (2) parse the configured IndexReference
    value = properties.get(SOLR_CORE);
    if (value == null) {
        throw new ConfigurationException(SOLR_CORE, "Missing required configuration of the SolrCore");
    } else {
        indexReference = IndexReference.parse(value.toString());
    }
    value = properties.get(IndexConfiguration.FIELD_ENCODING);
    if (value == null) {
        throw new ConfigurationException(IndexConfiguration.FIELD_ENCODING, "Missing required configuration of the Solr Field Encoding");
    } else {
        try {
            fieldEncoding = FieldEncodingEnum.valueOf(value.toString().trim());
        } catch (IllegalArgumentException e) {
            throw new ConfigurationException(IndexConfiguration.FIELD_ENCODING, "The configured " + "FieldEncoding MUST BE a member of " + Arrays.toString(FieldEncodingEnum.values()), e);
        }
    }
    value = properties.get(IndexConfiguration.SKIP_ALT_TOKENS);
    if (value instanceof Boolean) {
        skipAltTokensConfig = ((Boolean) value);
    } else if (value != null) {
        skipAltTokensConfig = Boolean.valueOf(value.toString());
    }
    // else no config -> will use the default
    // (4) parse Origin information
    value = properties.get(ORIGIN);
    if (value instanceof RDFTerm) {
        origin = (RDFTerm) origin;
    } else if (value instanceof String) {
        try {
            URI originUri = new URI((String) value);
            if (originUri.isAbsolute()) {
                origin = new IRI((String) value);
            } else {
                origin = new PlainLiteralImpl((String) value);
            }
        } catch (URISyntaxException e) {
            origin = new PlainLiteralImpl((String) value);
        }
        log.info(" - origin: {}", origin);
    } else if (value != null) {
        log.warn("Values of the {} property MUST BE of type RDFTerm or String " + "(parsed: {} (type:{}))", new Object[] { ORIGIN, value, value.getClass() });
    }
    // else no ORIGIN information provided
    // (5) init the FST configuration
    // We can create the default configuration only here, as it depends on the
    // name of the solrIndex
    String defaultConfig = "*;" + IndexConfiguration.PARAM_FST + "=" + indexReference.getIndex() + ";" + IndexConfiguration.PARAM_FIELD + "=" + IndexConfiguration.DEFAULT_FIELD;
    fstConfig = new LanguageConfiguration(IndexConfiguration.FST_CONFIG, new String[] { defaultConfig });
    // now set the actual configuration parsed to the engine
    value = properties.get(IndexConfiguration.FST_CONFIG);
    if (value != null && !StringUtils.isBlank(value.toString())) {
        fstConfig.setConfiguration(properties);
    }
    // else keep the default
    value = properties.get(IndexConfiguration.FST_FOLDER);
    if (value instanceof String) {
        this.fstFolder = ((String) value).trim();
        if (this.fstFolder.isEmpty()) {
            this.fstFolder = null;
        }
    } else if (value == null) {
        this.fstFolder = null;
    } else {
        throw new ConfigurationException(IndexConfiguration.FST_FOLDER, "Values MUST BE of type String" + "(found: " + value.getClass().getName() + ")!");
    }
    // (6) Create the ThreadPool used for the runtime creation of FST models
    value = properties.get(FST_THREAD_POOL_SIZE);
    int tpSize;
    if (value instanceof Number) {
        tpSize = ((Number) value).intValue();
    } else if (value != null) {
        try {
            tpSize = Integer.parseInt(value.toString());
        } catch (NumberFormatException e) {
            throw new ConfigurationException(FST_THREAD_POOL_SIZE, "Unable to parse the integer FST thread pool size from the " + "configured " + value.getClass().getSimpleName() + " '" + value + "'!", e);
        }
    } else {
        tpSize = -1;
    }
    if (tpSize <= 0) {
        // if configured value <= 0 we use the default
        tpSize = DEFAULT_FST_THREAD_POOL_SIZE;
    }
    // build a ThreadFactoryBuilder for low priority daemon threads that
    // do use a meaningful name
    ThreadFactoryBuilder tfBuilder = new ThreadFactoryBuilder();
    // should be stopped if the VM closes
    tfBuilder.setDaemon(true);
    // low priority
    tfBuilder.setPriority(Thread.MIN_PRIORITY);
    tfBuilder.setNameFormat(engineName + "-FstRuntimeCreation-thread-%d");
    if (fstCreatorService != null && !fstCreatorService.isTerminated()) {
        // NOTE: We can not call terminateNow, because to interrupt threads
        // here would also close FileChannels used by the SolrCore
        // and produce java.nio.channels.ClosedByInterruptException
        // exceptions followed by java.nio.channels.ClosedChannelException
        // on following calls to affected files of the SolrIndex.
        // Because of that we just log a warning and let uncompleted tasks
        // complete!
        log.warn("some items in a previouse FST Runtime Creation Threadpool have " + "still not finished!");
    }
    fstCreatorService = Executors.newFixedThreadPool(tpSize, tfBuilder.build());
    // (7) Parse the EntityCache config
    int entityCacheSize;
    value = properties.get(ENTITY_CACHE_SIZE);
    if (value instanceof Number) {
        entityCacheSize = ((Number) value).intValue();
    } else if (value != null) {
        try {
            entityCacheSize = Integer.parseInt(value.toString());
        } catch (NumberFormatException e) {
            throw new ConfigurationException(ENTITY_CACHE_SIZE, "Unable to parse the integer EntityCacheSize from the " + "configured " + value.getClass().getSimpleName() + " '" + value + "'!", e);
        }
    } else {
        entityCacheSize = -1;
    }
    if (entityCacheSize == 0) {
        log.info(" ... EntityCache deactivated");
        this.entityCacheSize = entityCacheSize;
    } else {
        this.entityCacheSize = entityCacheSize < 0 ? DEFAULT_ENTITY_CACHE_SIZE : entityCacheSize;
        log.info(" ... EntityCache enabled (size: {})", this.entityCacheSize);
    }
    // (8) parse the Entity type field
    value = properties.get(IndexConfiguration.SOLR_TYPE_FIELD);
    if (value == null || StringUtils.isBlank(value.toString())) {
        solrTypeField = null;
    } else {
        solrTypeField = value.toString().trim();
    }
    // (9) parse the Entity Ranking field
    value = properties.get(IndexConfiguration.SOLR_RANKING_FIELD);
    if (value == null) {
        solrRankingField = null;
    } else {
        solrRankingField = value.toString().trim();
    }
    // (10) parse the NamedEntity type mappings (if linkingMode = NER)
    if (linkingMode == LinkingModeEnum.NER) {
        nerTypeMappings = new HashMap<String, Set<String>>();
        value = properties.get(NAMED_ENTITY_TYPE_MAPPINGS);
        if (value instanceof String[]) {
            // support array
            value = Arrays.asList((String[]) value);
        } else if (value instanceof String) {
            // single value
            value = Collections.singleton(value);
        }
        if (value instanceof Collection<?>) {
            // and collection
            log.info(" - process Named Entity Type Mappings (used by LinkingMode: {})", linkingMode);
            configs: for (Object o : (Iterable<?>) value) {
                if (o != null) {
                    StringBuilder usage = new StringBuilder("useage: ");
                    usage.append("'{namedEntity-tag-or-uri} > {entityType-1}[,{entityType-n}]'");
                    String[] config = o.toString().split(">");
                    String namedEntityType = config[0].trim();
                    if (namedEntityType.isEmpty()) {
                        log.warn("Invalid Type Mapping Config '{}': Missing namedEntityType ({}) -> ignore this config", o, usage);
                        continue configs;
                    }
                    if (NamespaceMappingUtils.getPrefix(namedEntityType) != null) {
                        namedEntityType = NamespaceMappingUtils.getConfiguredUri(prefixService, NAMED_ENTITY_TYPE_MAPPINGS, namedEntityType);
                    }
                    if (config.length < 2 || config[1].isEmpty()) {
                        log.warn("Invalid Type Mapping Config '{}': Missing dc:type URI '{}' ({}) -> ignore this config", o, usage);
                        continue configs;
                    }
                    String entityTypes = config[1].trim();
                    if (config.length > 2) {
                        log.warn("Configuration after 2nd '>' gets ignored. Will use mapping '{} > {}' from config {}", new Object[] { namedEntityType, entityTypes, o });
                    }
                    Set<String> types = nerTypeMappings.get(namedEntityType);
                    if (types == null) {
                        // add new element to the mapping
                        types = new HashSet<String>();
                        nerTypeMappings.put(namedEntityType, types);
                    }
                    for (String entityType : entityTypes.split(";")) {
                        entityType = entityType.trim();
                        if (!entityType.isEmpty()) {
                            String typeUri;
                            if ("*".equals(entityType)) {
                                // null is used as wildcard
                                typeUri = null;
                            } else {
                                typeUri = NamespaceMappingUtils.getConfiguredUri(prefixService, NAMED_ENTITY_TYPE_MAPPINGS, entityType);
                            }
                            log.info("   - add {} > {}", namedEntityType, typeUri);
                            types.add(typeUri);
                        }
                    // else ignore empty mapping
                    }
                }
            }
        } else {
            // no mappings defined ... set wildcard mapping
            log.info(" - No Named Entity type mappings configured. Will use wildcard mappings");
            nerTypeMappings = Collections.singletonMap(null, Collections.<String>singleton(null));
        }
    }
    // (11) start tracking the SolrCore
    try {
        solrServerTracker = new RegisteredSolrServerTracker(bundleContext, indexReference, null) {

            @Override
            public void removedService(ServiceReference reference, Object service) {
                log.info(" ... SolrCore for {} was removed!", reference);
                // try to get an other serviceReference from the tracker
                if (reference.equals(FstLinkingEngineComponent.this.solrServerReference)) {
                    updateEngineRegistration(solrServerTracker.getServiceReference(), null);
                } else {
                    log.info("  - removed SolrCore was not used for FST linking");
                }
                super.removedService(reference, service);
            }

            @Override
            public void modifiedService(ServiceReference reference, Object service) {
                log.info(" ... SolrCore for {} was updated!", indexReference);
                updateEngineRegistration(solrServerTracker.getServiceReference(), null);
                super.modifiedService(reference, service);
            }

            @Override
            public SolrServer addingService(ServiceReference reference) {
                SolrServer server = super.addingService(reference);
                if (solrCore != null) {
                    log.info("Multiple SolrCores for name {}! Will update engine " + "with the newly added {}!", new Object[] { solrCore.getName(), indexReference, reference });
                }
                updateEngineRegistration(reference, server);
                return server;
            }
        };
    } catch (InvalidSyntaxException e) {
        throw new ConfigurationException(SOLR_CORE, "parsed SolrCore name '" + value.toString() + "' is invalid (expected: '[{server-name}:]{indexname}'");
    }
    try {
        solrServerTracker.open();
    } catch (RuntimeException e) {
        // FIX for STANBOL-1416 (see https://issues.apache.org/jira/browse/STANBOL-1416)
        // If an available SolrCore can not be correctly initialized we will
        // get the exception here. In this case we want this component to be
        // activated and waiting for further service events. Because of that
        // we catch here the exception.
        log.debug("Error while processing existing SolrCore Service during " + "opening SolrServiceTracker ... waiting for further service" + "Events", e);
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Set(java.util.Set) HashSet(java.util.HashSet) URISyntaxException(java.net.URISyntaxException) URI(java.net.URI) EmbeddedSolrServer(org.apache.solr.client.solrj.embedded.EmbeddedSolrServer) SolrServer(org.apache.solr.client.solrj.SolrServer) ConfigurationException(org.osgi.service.cm.ConfigurationException) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) InvalidSyntaxException(org.osgi.framework.InvalidSyntaxException) HashSet(java.util.HashSet) RegisteredSolrServerTracker(org.apache.stanbol.commons.solr.RegisteredSolrServerTracker) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) ServiceReference(org.osgi.framework.ServiceReference) Collection(java.util.Collection) LanguageConfiguration(org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration)

Aggregations

PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)82 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)69 IRI (org.apache.clerezza.commons.rdf.IRI)58 Graph (org.apache.clerezza.commons.rdf.Graph)34 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)20 Language (org.apache.clerezza.commons.rdf.Language)19 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)18 Literal (org.apache.clerezza.commons.rdf.Literal)16 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)16 IOException (java.io.IOException)14 HashMap (java.util.HashMap)13 Triple (org.apache.clerezza.commons.rdf.Triple)12 StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource)12 ArrayList (java.util.ArrayList)11 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)11 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)10 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)10 Test (org.junit.Test)10 HashSet (java.util.HashSet)8 SOAPException (javax.xml.soap.SOAPException)6