Search in sources :

Example 1 with Text

use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.

The class TestSearcherImpl defines the method addEntity.

/**
 * Adds the parsed entity to this test searcher and indexes it by the
 * tokens of its labels.
 * <p>
 * Labels are read from the configured {@code nameField}; every label is
 * tokenized and the Representation is registered in the
 * {@code token -> entities} index used for lookups.
 * @param rep the entity representation to add
 */
public void addEntity(Representation rep) {
    entities.put(rep.getId(), rep);
    Iterator<Text> labels = rep.getText(nameField);
    while (labels.hasNext()) {
        Text label = labels.next();
        for (String token : tokenizer.tokenize(label.getText())) {
            Collection<Representation> values = data.get(token);
            if (values == null) {
                values = new ArrayList<Representation>();
                //BUGFIX: the collection is looked up with the token as key,
                //so it must also be stored under the token (and not under
                //the full label text) - otherwise token lookups miss it
                data.put(token, values);
            }
            values.add(rep);
        }
    }
}
Also used : Text(org.apache.stanbol.entityhub.servicesapi.model.Text) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation)

Example 2 with Text

use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.

The class EntityLinker defines the method matchLabels.

/**
 * Matches the labels of the parsed {@link Representation} against the
 * tokens of the processed text (beginning with the currently active
 * {@link ProcessingState#getToken() token}).<p>
 * The labels are read from the {@link EntitySearcher#getNameField()}
 * property. Labels in the language of the current sentence are preferred;
 * labels in the configured default language are only matched when no label
 * in the current language was found. If less than
 * {@link EntityLinkerConfig#getMinFoundTokens()} tokens match with a
 * label the Concept is only considered to match if the label is
 * {@link String#equalsIgnoreCase(String)} to the text covered by the
 * matched token(s). Otherwise also {@link MATCH#FULL} and {@link MATCH#PARTIAL}
 * results are allowed.
 * @param rep the representation holding at least the values of the
 * {@link EntitySearcher#getNameField()} property
 * @return the result of the matching
 */
private Suggestion matchLabels(Representation rep) {
    //language of the currently processed sentence
    String currentLang = state.getLanguage();
    //configured default language used as fallback
    String fallbackLang = config.getDefaultLanguage();
    Iterator<Text> labels = rep.getText(config.getNameField());
    Suggestion match = new Suggestion(rep);
    //labels in the default language; only matched as a fallback
    Collection<Text> fallbackLabels = new ArrayList<Text>();
    boolean currentLangMatched = false;
    while (labels.hasNext()) {
        Text label = labels.next();
        String labelLang = label.getLanguage();
        boolean inCurrentLang = (labelLang == null && currentLang == null)
                || (labelLang != null && currentLang != null && labelLang.startsWith(currentLang));
        if (inCurrentLang) {
            matchLabel(match, label);
            currentLangMatched = true;
        } else {
            boolean inFallbackLang = (labelLang == null && fallbackLang == null)
                    || (labelLang != null && fallbackLang != null && labelLang.startsWith(fallbackLang));
            if (inFallbackLang) {
                fallbackLabels.add(label);
            }
        }
    }
    //no label in the current language -> match the default-language labels
    if (!currentLangMatched) {
        for (Text fallbackLabel : fallbackLabels) {
            matchLabel(match, fallbackLabel);
        }
    }
    return match;
}
Also used : ArrayList(java.util.ArrayList) Text(org.apache.stanbol.entityhub.servicesapi.model.Text)

Example 3 with Text

use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.

The class FreebaseKeyProcessor defines the method process.

/**
 * Processes the wikipedia and musicbrainz keys of the parsed Freebase
 * entity and adds the according dbpedia and musicbrainz links via the
 * configured {@code linkProperty}.
 * @param rep the representation of the Freebase entity to process
 * @return the parsed (and possibly modified) representation
 */
@Override
public Representation process(Representation rep) {
    //wikipedia -> dbpedia links
    if (dbpediaState) {
        //we try to link only a single page. So get the English label and
        //search for the according dbpedia key 
        Text enLabel = rep.getFirst(RDFS_LABEL, "en");
        String mainKey = enLabel != null ? decodeKey(enLabel.getText()).replace(' ', '_') : null;
        Iterator<Text> wpEnKeys = rep.getText(WP_EN);
        Collection<String> keys = new ArrayList<String>();
        boolean foundMain = false;
        if (wpEnKeys.hasNext()) {
            //link to the English dbpedia
            //NOTE: short-circuit '&&' (was non-short-circuit '&')
            while (!foundMain && wpEnKeys.hasNext()) {
                String key = decodeKey(wpEnKeys.next().getText());
                if (key.equals(mainKey)) {
                    foundMain = true;
                    rep.addReference(linkProperty, linkeDbPedia(null, key));
                } else {
                    keys.add(key);
                }
            }
            if (!foundMain) {
                //no key matched the English label -> add all links
                for (String key : keys) {
                    rep.addReference(linkProperty, linkeDbPedia(null, key));
                }
            }
        } else {
            //no English wikipedia key -> search for keys in other languages
            Map<String, String> wikipediaFields = new HashMap<String, String>();
            //(1) collect the fields (field -> language)
            for (Iterator<String> fields = rep.getFieldNames(); fields.hasNext(); ) {
                String field = fields.next();
                int nsIndex = field.lastIndexOf('/') + 1;
                if (field.indexOf(WP_PREFIX, nsIndex) == nsIndex && //a key:wikipedia.* field
                field.indexOf('_', nsIndex + WP_PREFIX_LEN + 2) < 1) { //with no '_' in the property name
                    String language = field.substring(nsIndex + WP_PREFIX.length());
                    wikipediaFields.put(field, language);
                }
            // else no key:wikipedia.* field
            }
            //(2) add the values in a 2nd iteration to avoid concurrent
            //modification exceptions while iterating over the fields
            for (Entry<String, String> entry : wikipediaFields.entrySet()) {
                for (Iterator<Text> langWpKeys = rep.getText(entry.getKey()); langWpKeys.hasNext(); ) {
                    rep.addReference(linkProperty, linkeDbPedia(entry.getValue(), langWpKeys.next().getText()));
                }
            }
        }
    }
    //musicbrainz links
    if (musicbrainzState) {
        Iterator<Text> mbKeys = rep.getText(MB_KEY);
        if (mbKeys.hasNext()) {
            String key = mbKeys.next().getText();
            //the musicbrainz URI also needs the type of the entity
            Iterator<Reference> types = rep.getReferences(RDF_TYPE);
            String type = null;
            while (types.hasNext() && !MB_TYPES.contains(type)) {
                String fbType = types.next().getReference();
                //BUGFIX: use bounds-safe regionMatches instead of
                //subSequence(..).equals(..), which threw a
                //StringIndexOutOfBoundsException for short type URIs
                if (fbType.regionMatches(FB_NS_LEN, MUSIC_PROP_PREFIX, 0, MUSIC_PROP_PREFIX.length())) {
                    type = fbType.substring(FB_NS_LEN + MUSIC_PROP_PREFIX_LEN);
                }
            }
            if (type != null) {
                StringBuilder uri = new StringBuilder(MB_NS);
                uri.append(type).append('/').append(key).append("#_");
                rep.addReference(linkProperty, uri.toString());
            }
        }
    }
    return rep;
}
Also used : HashMap(java.util.HashMap) Reference(org.apache.stanbol.entityhub.servicesapi.model.Reference) ArrayList(java.util.ArrayList) Text(org.apache.stanbol.entityhub.servicesapi.model.Text)

Example 4 with Text

use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.

The class AlternateLabelProcessor defines the method process.

/**
 * Adds the alternate names (labels, postal codes, short/colloquial names
 * and wikipedia links) collected for the geonames feature with the id of
 * the parsed Representation.
 * @param source the representation to enrich; MUST provide the integer
 * feature id in the {@link GeonamesPropertyEnum#idx_id} field
 * @return the parsed (and possibly enriched) representation
 */
@Override
public Representation process(Representation source) {
    Integer id = source.getFirst(GeonamesPropertyEnum.idx_id.toString(), Integer.class);
    if (id == null) {
        log.warn("The <{}> field MUST contain the integer ID!", GeonamesPropertyEnum.idx_id);
        return source;
    }
    //use remove, because we need not need it a 2nd time!
    List<FeatureName> alternateNames = featureNames.remove(id);
    if (alternateNames != null) {
        List<Text> altList = new ArrayList<Text>(alternateNames.size());
        List<Text> officialList = new ArrayList<Text>(alternateNames.size());
        List<String> postalCodes = new ArrayList<String>();
        List<URL> wikipediaLinks = new ArrayList<URL>();
        List<Text> shortNames = new ArrayList<Text>();
        List<Text> colloquialNames = new ArrayList<Text>();
        //sort every alternate name into the according bucket(s)
        for (FeatureName name : alternateNames) {
            if (name.isNaturalLanguageLabel()) {
                Text act = vf.createText(name.getName(), name.getLang());
                if (name.isPreferred()) {
                    officialList.add(act);
                } else {
                    altList.add(act);
                }
                //short and colloquial are flags in addition to the label
                if (name.isShortName()) {
                    shortNames.add(act);
                }
                if (name.isColloquial()) {
                    colloquialNames.add(act);
                }
            } else if (name.getLabelType() == NameType.postal) {
                postalCodes.add(name.getName());
            } else if (name.getLabelType() == NameType.link) {
                if (name.getName().contains("wikipedia.org")) {
                    try {
                        wikipediaLinks.add(new URL(name.getName()));
                    } catch (MalformedURLException e) {
                        //best effort: skip unparsable links, but use
                        //parameterized logging and keep the cause
                        log.warn("Unable to parse URL for link label {}", name.getName(), e);
                    }
                }
            }
        }
        //only add non-empty value lists to the representation
        if (!altList.isEmpty()) {
            source.add(GeonamesPropertyEnum.gn_alternateName.toString(), altList);
        }
        if (!officialList.isEmpty()) {
            source.add(GeonamesPropertyEnum.gn_officialName.toString(), officialList);
        }
        if (!postalCodes.isEmpty()) {
            source.add(GeonamesPropertyEnum.gn_postalCode.toString(), postalCodes);
        }
        if (!wikipediaLinks.isEmpty()) {
            source.add(GeonamesPropertyEnum.gn_wikipediaArticle.toString(), wikipediaLinks);
        }
        if (!shortNames.isEmpty()) {
            source.add(GeonamesPropertyEnum.gn_shortName.toString(), shortNames);
        }
        if (!colloquialNames.isEmpty()) {
            source.add(GeonamesPropertyEnum.gn_colloquialName.toString(), colloquialNames);
        }
    }
    return source;
}
Also used : MalformedURLException(java.net.MalformedURLException) ArrayList(java.util.ArrayList) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) URL(java.net.URL)

Example 5 with Text

use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.

The class TopicClassificationEngine defines the method computeEnhancements.

/**
 * Classifies the text of the parsed ContentItem and writes one
 * fise:TopicAnnotation per suggested topic into the ContentItem metadata.
 * <p>
 * The method (1) extracts the plain text from the first supported Blob,
 * (2) checks the detected language against the configured accepted
 * languages, (3) asks the classifier for topic suggestions and (4) adds
 * the enhancement triples (confidence, classifier performance estimates
 * and - when resolvable via the Entityhub - the concept label) under the
 * ContentItem write lock.
 * @param ci the content item to enhance
 * @throws EngineException on classifier or entityhub failures
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with a supported Mime Type" + "found for ContentItem " + ci.getUri() + "(supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was" + "NOT called and indicates a bug in the used EnhancementJobManager!");
    }
    String language = EnhancementEngineHelper.getLanguage(ci);
    //an empty entry ("") in the accepted set means "all languages"
    if (!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) || acceptedLanguageSet.contains(""))) {
        throw new IllegalStateException("The language '" + language + "' of the ContentItem is not configured as " + " active for this Engine (active: " + acceptedLanguageSet + ").");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(String.format("Unable to extract " + " textual content from ContentPart %s of ContentItem %s!", contentPart.getKey(), ci.getUri()), e);
    }
    //nothing to classify -> no enhancements
    if (text.trim().isEmpty()) {
        log.warn("ContentPart {} of ContentItem {} does not contain any " + "text to extract topics from", contentPart.getKey(), ci.getUri());
        return;
    }
    Graph metadata = ci.getMetadata();
    List<TopicSuggestion> topics;
    try {
        topics = suggestTopics(text);
        if (topics.isEmpty()) {
            return;
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    }
    //properties for the (optional) classifier performance estimates
    IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
    IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
    IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");
    LiteralFactory lf = LiteralFactory.getInstance();
    //all metadata writes must happen under the ContentItem write lock
    ci.getLock().writeLock().lock();
    try {
        // Global text annotation to attach all the topic annotation to it.
        IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        metadata.add(new TripleImpl(textAnnotation, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE, OntologicalClasses.SKOS_CONCEPT));
        for (TopicSuggestion topic : topics) {
            IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TOPICANNOTATION));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));
            // add link to entity
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, new IRI(topic.conceptUri)));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE, OntologicalClasses.SKOS_CONCEPT));
            // add confidence information
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf.createTypedLiteral(Double.valueOf(topic.score))));
            // add performance estimates of the classifier if available
            ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
            if (perf.uptodate) {
                metadata.add(new TripleImpl(enhancement, precision, lf.createTypedLiteral(Double.valueOf(perf.precision))));
                metadata.add(new TripleImpl(enhancement, recall, lf.createTypedLiteral(Double.valueOf(perf.recall))));
                metadata.add(new TripleImpl(enhancement, f1, lf.createTypedLiteral(Double.valueOf(perf.f1))));
            }
            // fetch concept label from the entityhub or a referenced site if available
            Entity entity = entityhub.getEntity(topic.conceptUri);
            if (entity == null) {
                entity = referencedSiteManager.getEntity(topic.conceptUri);
            }
            if (entity != null) {
                Representation representation = entity.getRepresentation();
                // TODO: extract all languages based on some configuration instead of hardcoding English
                Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US", "en-GB");
                if (label == null) {
                    //fall back to rdfs:label when no skos:prefLabel exists
                    label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
                }
                if (label != null) {
                    metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(label.getText())));
                }
            }
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    } catch (IllegalArgumentException e) {
        throw new EngineException(e);
    } catch (EntityhubException e) {
        throw new EngineException(e);
    } finally {
        //always release the write lock, even on failure
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Entity(org.apache.stanbol.entityhub.servicesapi.model.Entity) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) IOException(java.io.IOException) TopicSuggestion(org.apache.stanbol.enhancer.topic.api.TopicSuggestion) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) ImmutableGraph(org.apache.clerezza.commons.rdf.ImmutableGraph) Graph(org.apache.clerezza.commons.rdf.Graph) EntityhubException(org.apache.stanbol.entityhub.servicesapi.EntityhubException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) ClassificationReport(org.apache.stanbol.enhancer.topic.api.ClassificationReport) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException)

Aggregations

Text (org.apache.stanbol.entityhub.servicesapi.model.Text)50 Representation (org.apache.stanbol.entityhub.servicesapi.model.Representation)32 Test (org.junit.Test)24 HashSet (java.util.HashSet)14 Reference (org.apache.stanbol.entityhub.servicesapi.model.Reference)12 ArrayList (java.util.ArrayList)11 IRI (org.apache.clerezza.commons.rdf.IRI)6 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)4 Entity (org.apache.stanbol.entityhub.servicesapi.model.Entity)4 ValueFactory (org.apache.stanbol.entityhub.servicesapi.model.ValueFactory)4 RepresentationTest (org.apache.stanbol.entityhub.test.model.RepresentationTest)4 Graph (org.apache.clerezza.commons.rdf.Graph)3 Language (org.apache.clerezza.commons.rdf.Language)3 Literal (org.apache.clerezza.commons.rdf.Literal)3 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)3 EntityhubException (org.apache.stanbol.entityhub.servicesapi.EntityhubException)3 FieldQuery (org.apache.stanbol.entityhub.servicesapi.query.FieldQuery)3 TextConstraint (org.apache.stanbol.entityhub.servicesapi.query.TextConstraint)3 URI (java.net.URI)2 URL (java.net.URL)2