Usage example of org.apache.stanbol.entityhub.servicesapi.model.Text in the Apache Stanbol project: class TestSearcherImpl, method addEntity.
/**
 * Registers an Entity with this test searcher: the Representation is stored
 * under its id and every token of every label of the configured name field is
 * added to the inverted index so the entity can be found by token lookups.
 * @param rep the representation of the entity to add
 */
public void addEntity(Representation rep) {
    entities.put(rep.getId(), rep);
    Iterator<Text> labels = rep.getText(nameField);
    while (labels.hasNext()) {
        Text label = labels.next();
        for (String token : tokenizer.tokenize(label.getText())) {
            Collection<Representation> values = data.get(token);
            if (values == null) {
                values = new ArrayList<Representation>();
                //BUGFIX: index under the token (the key used by data.get(token)
                //above) and not under the full label text - otherwise tokens of
                //multi-word labels could never be looked up.
                data.put(token, values);
            }
            values.add(rep);
        }
    }
}
Usage example of org.apache.stanbol.entityhub.servicesapi.model.Text in the Apache Stanbol project: class EntityLinker, method matchLabels.
/**
 * Matches the labels of the parsed {@link Representation} against the tokens
 * of the text (beginning with the currently active
 * {@link ProcessingState#getToken() token}).<p>
 * Labels are read from the field returned by
 * {@link EntitySearcher#getNameField()}. Labels in the language of the current
 * sentence are always processed; labels in the configured default language are
 * collected and only processed as a fallback when no label in the sentence
 * language was found. If fewer than
 * {@link EntityLinkerConfig#getMinFoundTokens()} tokens match a label, the
 * concept is only considered a match if the label is
 * {@link String#equalsIgnoreCase(String)} to the text covered by the matched
 * token(s); otherwise {@link MATCH#FULL} and {@link MATCH#PARTIAL} results are
 * also allowed.
 * @param rep the representation holding at least the values of the
 * {@link EntitySearcher#getNameField()} property
 * @return the result of the matching
 */
private Suggestion matchLabels(Representation rep) {
    //language of the currently processed sentence
    final String sentenceLang = state.getLanguage();
    //configured fallback language
    final String fallbackLang = config.getDefaultLanguage();
    Suggestion match = new Suggestion(rep);
    Collection<Text> fallbackLabels = new ArrayList<Text>();
    boolean sentenceLangLabelSeen = false;
    for (Iterator<Text> labelIt = rep.getText(config.getNameField()); labelIt.hasNext(); ) {
        Text label = labelIt.next();
        String labelLang = label.getLanguage();
        //a null label language only matches a null sentence/fallback language;
        //otherwise prefix matching is used (e.g. "en-GB" matches "en")
        boolean inSentenceLang = labelLang == null
                ? sentenceLang == null
                : sentenceLang != null && labelLang.startsWith(sentenceLang);
        if (inSentenceLang) {
            matchLabel(match, label);
            sentenceLangLabelSeen = true;
        } else {
            boolean inFallbackLang = labelLang == null
                    ? fallbackLang == null
                    : fallbackLang != null && labelLang.startsWith(fallbackLang);
            if (inFallbackLang) {
                fallbackLabels.add(label);
            }
        }
    }
    //no label in the sentence language -> fall back to default-language labels
    if (!sentenceLangLabelSeen) {
        for (Text fallbackLabel : fallbackLabels) {
            matchLabel(match, fallbackLabel);
        }
    }
    return match;
}
Usage example of org.apache.stanbol.entityhub.servicesapi.model.Text in the Apache Stanbol project: class FreebaseKeyProcessor, method process.
/**
 * Enriches the parsed Representation with links to dbpedia.org and/or
 * musicbrainz.org entities derived from the Freebase key:* values. For
 * dbpedia, the English wikipedia key matching the English rdfs:label is
 * preferred; if none matches, all English keys (or, lacking those, all
 * language-specific wikipedia keys) are linked. For musicbrainz, a URI is
 * built from the music-related rdf:type and the musicbrainz key.
 * @param rep the representation to process (modified in place)
 * @return the parsed representation
 */
@Override
public Representation process(Representation rep) {
    //wikipedia
    if (dbpediaState) {
        //we try to link only a single page. So get the English label and
        //search for the according dbpedia key
        Text enLabel = rep.getFirst(RDFS_LABEL, "en");
        String mainKey = enLabel != null ? decodeKey(enLabel.getText()).replace(' ', '_') : null;
        Iterator<Text> wpEnKeys = rep.getText(WP_EN);
        Collection<String> keys = new ArrayList<String>();
        boolean foundMain = false;
        if (wpEnKeys.hasNext()) {
            //link to the English dbpedia
            //FIX: use the short-circuit '&&' instead of the bitwise '&'
            while (!foundMain && wpEnKeys.hasNext()) {
                String key = decodeKey(wpEnKeys.next().getText());
                if (key.equals(mainKey)) {
                    foundMain = true;
                    rep.addReference(linkProperty, linkeDbPedia(null, key));
                } else {
                    keys.add(key);
                }
            }
            if (!foundMain) {
                //no key matched the English label -> add all links
                for (String key : keys) {
                    rep.addReference(linkProperty, linkeDbPedia(null, key));
                }
            }
        } else {
            //search for other (language specific) wikipedia keys
            Map<String, String> wikipediaFields = new HashMap<String, String>();
            //(1) collect the fields first, because values are added to rep below
            for (Iterator<String> fields = rep.getFieldNames(); fields.hasNext(); ) {
                String field = fields.next();
                int nsIndex = field.lastIndexOf('/') + 1;
                if (field.indexOf(WP_PREFIX, nsIndex) == nsIndex && //no '_' in the property name
                        field.indexOf('_', nsIndex + WP_PREFIX_LEN + 2) < 1) {
                    //the language code is the suffix of the field name
                    //(simplified: one-arg substring instead of substring(start, field.length()))
                    String language = field.substring(nsIndex + WP_PREFIX.length());
                    wikipediaFields.put(field, language);
                }
                // else no key:wikipedia.* field
            }
            //(2) add the values to avoid concurrent modification exceptions
            for (Entry<String, String> entry : wikipediaFields.entrySet()) {
                for (Iterator<Text> langWpKeys = rep.getText(entry.getKey()); langWpKeys.hasNext(); ) {
                    rep.addReference(linkProperty, linkeDbPedia(entry.getValue(), langWpKeys.next().getText()));
                }
            }
        }
    }
    if (musicbrainzState) {
        Iterator<Text> mbKeys = rep.getText(MB_KEY);
        if (mbKeys.hasNext()) {
            String key = mbKeys.next().getText();
            //we need the type to build the musicbrainz URI
            Iterator<Reference> types = rep.getReferences(RDF_TYPE);
            String type = null;
            while (types.hasNext() && !MB_TYPES.contains(type)) {
                String fbType = types.next().getReference();
                //FIX: guard against type URIs shorter than the expected prefix
                //(previously subSequence(..) could throw StringIndexOutOfBoundsException)
                if (fbType.length() >= FB_NS_LEN + MUSIC_PROP_PREFIX_LEN
                        && MUSIC_PROP_PREFIX.equals(fbType.subSequence(FB_NS_LEN, FB_NS_LEN + MUSIC_PROP_PREFIX_LEN))) {
                    type = fbType.substring(FB_NS_LEN + MUSIC_PROP_PREFIX_LEN);
                }
            }
            if (type != null) {
                StringBuilder uri = new StringBuilder(MB_NS);
                uri.append(type).append('/').append(key).append("#_");
                rep.addReference(linkProperty, uri.toString());
            }
        }
    }
    return rep;
}
Usage example of org.apache.stanbol.entityhub.servicesapi.model.Text in the Apache Stanbol project: class AlternateLabelProcessor, method process.
/**
 * Adds the alternate names of the geonames.org toponym represented by the
 * parsed Representation. Alternate names are looked up (and removed, since
 * each id is processed once) from the featureNames map by the integer id of
 * the toponym, then sorted into alternate/official/short/colloquial labels,
 * postal codes and wikipedia links.
 * @param source the representation to process (fields are added in place)
 * @return the parsed representation
 */
@Override
public Representation process(Representation source) {
    // the integer geonames id is the lookup key for the alternate names
    Integer id = source.getFirst(GeonamesPropertyEnum.idx_id.toString(), Integer.class);
    if (id == null) {
        log.warn("The <{}> field MUST contain the integer ID!", GeonamesPropertyEnum.idx_id);
        return source;
    }
    //use remove, because we need not need it a 2nd time!
    List<FeatureName> alternateNames = featureNames.remove(id);
    if (alternateNames != null) {
        // buckets for the different kinds of names found in the file
        List<Text> altList = new ArrayList<Text>(alternateNames.size());
        List<Text> officialList = new ArrayList<Text>(alternateNames.size());
        List<String> postalCodes = new ArrayList<String>();
        List<URL> wikipediaLinks = new ArrayList<URL>();
        List<Text> shortNames = new ArrayList<Text>();
        List<Text> colloquialNames = new ArrayList<Text>();
        for (FeatureName name : alternateNames) {
            if (name.isNaturalLanguageLabel()) {
                // natural-language label -> create a language-tagged Text value
                Text act = vf.createText(name.getName(), name.getLang());
                // preferred names become gn:officialName, the rest gn:alternateName
                if (name.isPreferred()) {
                    officialList.add(act);
                } else {
                    altList.add(act);
                }
                // a label may additionally be a short and/or colloquial name
                if (name.isShortName()) {
                    shortNames.add(act);
                }
                if (name.isColloquial()) {
                    colloquialNames.add(act);
                }
            } else if (name.getLabelType() == NameType.postal) {
                postalCodes.add(name.getName());
            } else if (name.getLabelType() == NameType.link) {
                // only wikipedia links are kept; other link labels are dropped
                if (name.getName().contains("wikipedia.org")) {
                    try {
                        wikipediaLinks.add(new URL(name.getName()));
                    } catch (MalformedURLException e) {
                        log.warn("Unable to parse URL for link label " + name.getName());
                        //ignore
                    }
                }
            }
        }
        // only add fields that actually have values
        if (!altList.isEmpty()) {
            source.add(GeonamesPropertyEnum.gn_alternateName.toString(), altList);
        }
        if (!officialList.isEmpty()) {
            source.add(GeonamesPropertyEnum.gn_officialName.toString(), officialList);
        }
        if (!postalCodes.isEmpty()) {
            source.add(GeonamesPropertyEnum.gn_postalCode.toString(), postalCodes);
        }
        if (!wikipediaLinks.isEmpty()) {
            source.add(GeonamesPropertyEnum.gn_wikipediaArticle.toString(), wikipediaLinks);
        }
        if (!shortNames.isEmpty()) {
            source.add(GeonamesPropertyEnum.gn_shortName.toString(), shortNames);
        }
        if (!colloquialNames.isEmpty()) {
            source.add(GeonamesPropertyEnum.gn_colloquialName.toString(), colloquialNames);
        }
    }
    return source;
}
Usage example of org.apache.stanbol.entityhub.servicesapi.model.Text in the Apache Stanbol project: class TopicClassificationEngine, method computeEnhancements.
/**
 * Classifies the textual content of the parsed ContentItem and writes one
 * fise:TopicAnnotation (plus entity reference, confidence and - if up to
 * date - classifier performance estimates) per suggested topic into the
 * ContentItem metadata, all attached to a single text annotation.
 * @param ci the content item to enhance
 * @throws EngineException on classifier or entityhub failures
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    // locate a content part with a supported mime type; its absence means
    // canEnhance() was not honoured by the job manager
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with a supported Mime Type" + "found for ContentItem " + ci.getUri() + "(supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was" + "NOT called and indicates a bug in the used EnhancementJobManager!");
    }
    // an empty acceptedLanguageSet (or a "" entry) means "all languages"
    String language = EnhancementEngineHelper.getLanguage(ci);
    if (!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) || acceptedLanguageSet.contains(""))) {
        throw new IllegalStateException("The language '" + language + "' of the ContentItem is not configured as " + " active for this Engine (active: " + acceptedLanguageSet + ").");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(String.format("Unable to extract " + " textual content from ContentPart %s of ContentItem %s!", contentPart.getKey(), ci.getUri()), e);
    }
    // nothing to classify -> nothing to add
    if (text.trim().isEmpty()) {
        log.warn("ContentPart {} of ContentItem {} does not contain any " + "text to extract topics from", contentPart.getKey(), ci.getUri());
        return;
    }
    Graph metadata = ci.getMetadata();
    List<TopicSuggestion> topics;
    try {
        topics = suggestTopics(text);
        if (topics.isEmpty()) {
            return;
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    }
    // properties used to attach classifier performance estimates
    IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
    IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
    IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");
    LiteralFactory lf = LiteralFactory.getInstance();
    // all metadata writes happen under the ContentItem write lock
    ci.getLock().writeLock().lock();
    try {
        // Global text annotation to attach all the topic annotation to it.
        IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        metadata.add(new TripleImpl(textAnnotation, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE, OntologicalClasses.SKOS_CONCEPT));
        for (TopicSuggestion topic : topics) {
            IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TOPICANNOTATION));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));
            // add link to entity
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, new IRI(topic.conceptUri)));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE, OntologicalClasses.SKOS_CONCEPT));
            // add confidence information
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf.createTypedLiteral(Double.valueOf(topic.score))));
            // add performance estimates of the classifier if available
            ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
            if (perf.uptodate) {
                metadata.add(new TripleImpl(enhancement, precision, lf.createTypedLiteral(Double.valueOf(perf.precision))));
                metadata.add(new TripleImpl(enhancement, recall, lf.createTypedLiteral(Double.valueOf(perf.recall))));
                metadata.add(new TripleImpl(enhancement, f1, lf.createTypedLiteral(Double.valueOf(perf.f1))));
            }
            // fetch concept label from the entityhub or a referenced site if available
            Entity entity = entityhub.getEntity(topic.conceptUri);
            if (entity == null) {
                entity = referencedSiteManager.getEntity(topic.conceptUri);
            }
            if (entity != null) {
                Representation representation = entity.getRepresentation();
                // TODO: extract all languages based on some configuration instead of hardcoding English
                // skos:prefLabel is preferred; rdfs:label is the fallback
                Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US", "en-GB");
                if (label == null) {
                    label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
                }
                if (label != null) {
                    metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(label.getText())));
                }
            }
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    } catch (IllegalArgumentException e) {
        throw new EngineException(e);
    } catch (EntityhubException e) {
        throw new EngineException(e);
    } finally {
        // always release the write lock, even on failure
        ci.getLock().writeLock().unlock();
    }
}
End of aggregated usage examples.