use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class DBPSpotlightAnnotateEnhancementEngine method computeEnhancements.
/**
 * Calculate the enhancements by doing a POST request to the DBpedia
 * Spotlight endpoint and processing the results
 *
 * @param ci
 *            the {@link ContentItem}
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Language language = SpotlightEngineUtils.getContentLanguage(ci);
    String text = SpotlightEngineUtils.getPlainContent(ci);
    Collection<Annotation> dbpslGraph = doPostRequest(text, ci.getUri());
    Map<SurfaceForm, IRI> surfaceForm2TextAnnotation = new HashMap<SurfaceForm, IRI>();
    if (dbpslGraph != null) {
        // Acquire a write lock on the ContentItem when adding the enhancements
        ci.getLock().writeLock().lock();
        try {
            createEnhancements(dbpslGraph, ci, text, language, surfaceForm2TextAnnotation);
            if (log.isDebugEnabled()) {
                Serializer serializer = Serializer.getInstance();
                ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
                serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
                try {
                    log.debug("DBPedia Spotlight Enhancements:\n{}", debugStream.toString("UTF-8"));
                } catch (UnsupportedEncodingException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
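The doPostRequest(..) helper is not part of this snippet. A minimal sketch of what such a call could look like, assuming the public DBpedia Spotlight /rest/annotate endpoint and an Annotation.parseAnnotations(..) parser; the endpoint URL, the form parameter and the parser call are assumptions, not the project's verified code:

private Collection<Annotation> doPostRequest(String text, IRI contentItemId) throws EngineException {
    try {
        // assumed endpoint of a locally running DBpedia Spotlight instance
        HttpURLConnection con = (HttpURLConnection) new URL(
                "http://localhost:2222/rest/annotate").openConnection();
        con.setRequestMethod("POST");
        con.setDoOutput(true);
        con.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
        con.setRequestProperty("Accept", "text/xml"); // Spotlight answers with XML annotations
        OutputStream out = con.getOutputStream();
        try {
            out.write(("text=" + URLEncoder.encode(text, "UTF-8")).getBytes("UTF-8"));
        } finally {
            out.close();
        }
        // parse the XML response into Annotation instances (parser assumed)
        return Annotation.parseAnnotations(con.getInputStream());
    } catch (IOException e) {
        throw new EngineException("DBpedia Spotlight request failed for " + contentItemId, e);
    }
}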
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class ClerezzaRDFParser method handleStatement.
private void handleStatement(RDFDataset result, Triple t, Map<BlankNode, String> bNodeMap) {
    final String subject = getResourceValue(t.getSubject(), bNodeMap);
    final String predicate = getResourceValue(t.getPredicate(), bNodeMap);
    final RDFTerm object = t.getObject();
    if (object instanceof Literal) {
        final String value = ((Literal) object).getLexicalForm();
        final String language;
        final String datatype = getResourceValue(((Literal) object).getDataType(), bNodeMap);
        Language l = ((Literal) object).getLanguage();
        if (l == null) {
            language = null;
        } else {
            language = l.toString();
        }
        result.addTriple(subject, predicate, value, datatype, language);
        count++;
    } else {
        result.addTriple(subject, predicate, getResourceValue((BlankNodeOrIRI) object, bNodeMap));
        count++;
    }
}
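As a usage illustration (the IRIs and the literal value are made up): a language-tagged literal reaches handleStatement(..) with a non-null Language whose toString() yields the language tag that is passed on to the dataset:

// Made-up input triple with a language-tagged plain literal
Triple t = new TripleImpl(
        new IRI("http://example.org/doc"),          // subject (illustrative)
        new IRI("http://purl.org/dc/terms/title"),  // predicate (illustrative)
        new PlainLiteralImpl("Hello", new Language("en")));
// handleStatement(result, t, bNodeMap) then effectively calls:
// result.addTriple("http://example.org/doc", "http://purl.org/dc/terms/title",
//         "Hello", <the literal's datatype IRI as a String>, "en");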
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class IndexedGraph method compare.
/**
 * Compares Resources to correctly sort them within the index.<p>
 * Sort criteria are:<ol>
 * <li> URIs are sorted by their {@link IRI#getUnicodeString() unicode
 * string}
 * <li> Literals
 * <ol>
 * <li> sort by the {@link Literal#getLexicalForm() lexical form}
 * <li> sort by {@link Literal#getLanguage() language}
 * (<code>null</code> value first)
 * <li> sort by {@link Literal#getDataType() type} (<code>null</code>
 * value first)
 * </ol>
 * <li> BlankNodes
 * <ol>
 * <li> sorted by their
 * {@link System#identityHashCode(Object) Object hashCode}
 * <li> on hashCode conflicts (same hashCode but not equals) a random order is
 * chosen and kept in the parsed conflictsMap
 * </ol>
 * </ol>
 * <b>NOTEs</b><ul>
 * <li> parsed {@link RDFTerm}s are not required to correctly implement
 * {@link Object#hashCode() hashCode} and
 * {@link Object#equals(Object) equals}
 * <li> parsed {@link IRI}, {@link BlankNode} and {@link Literal} instances MUST
 * NOT extend/implement any of the other classes/interfaces. This means that
 * an {@link IRI} MUST NOT implement {@link BlankNode} nor {@link Literal}
 * <li> parsed {@link Literal}s MAY implement PlainLiteral AND
 * TypedLiteral. This allows wrappers over frameworks that do not
 * distinguish between those two literal types to be used with the
 * {@link IndexedGraph}.
 * </ul>
 *
 * @param a the first resource to compare
 * @param b the second resource to compare
 * @param conflictsMap the map used to resolve BlankNodes with hashCode
 * conflicts
 * @return a negative integer, zero, or a positive integer as the first
 * resource sorts before, equal to, or after the second
 */
protected static int compare(RDFTerm a, RDFTerm b, Map<Integer, List<RDFTerm>> conflictsMap) {
    // Handle special cases for MAX and MIN values
    if (a == MIN || b == MAX) {
        return -1;
    } else if (a == MAX || b == MIN) {
        return 1;
    }
    // sort (0) IRIs < (1) Literals (PlainLiterals & TypedLiterals) < (2) BlankNodes
    int at = a instanceof IRI ? 0 : a instanceof Literal ? 1 : 2;
    int bt = b instanceof IRI ? 0 : b instanceof Literal ? 1 : 2;
    if (at == bt) {
        // same type: apply the type specific sort criteria
        if (at < 2) {
            // no BlankNode
            // sort in alphabetic order of the string representation
            String as = at == 0 ? ((IRI) a).getUnicodeString() : ((Literal) a).getLexicalForm();
            String bs = bt == 0 ? ((IRI) b).getUnicodeString() : ((Literal) b).getLexicalForm();
            int sc = as.compareTo(bs);
            if (sc == 0 && at == 1) {
                // same string value and both are Literals
                // check if the language and types are the same
                Language al = a instanceof Literal ? ((Literal) a).getLanguage() : null;
                Language bl = b instanceof Literal ? ((Literal) b).getLanguage() : null;
                // first try to sort by language
                if (al == null) {
                    sc = bl == null ? 0 : -1;
                } else if (bl == null) {
                    sc = 1;
                } else {
                    sc = al.toString().compareTo(bl.toString());
                }
                if (sc == 0) {
                    // if still equal look at the dataType
                    IRI adt = a instanceof Literal ? ((Literal) a).getDataType() : null;
                    IRI bdt = b instanceof Literal ? ((Literal) b).getDataType() : null;
                    if (adt == null) {
                        sc = bdt == null ? 0 : -1;
                    } else if (bdt == null) {
                        sc = 1;
                    } else {
                        sc = adt.getUnicodeString().compareTo(bdt.getUnicodeString());
                    }
                }
                return sc;
            } else {
                // for IRIs return the string compare
                return sc;
            }
        } else {
            // handle BlankNodes
            // sort BlankNodes based on hashCode
            int ah = a.hashCode();
            int bh = b.hashCode();
            if (ah == bh) {
                if (!a.equals(b)) {
                    // if the implementations' hash is the same, but the instances
                    // are not equal, try to sort them by identity hash code
                    int ash = System.identityHashCode(a);
                    int bsh = System.identityHashCode(b);
                    if (ash == bsh) {
                        // resolve the conflict via the conflictsMap
                        return resolveBlankNodeHashConflict(a, b, conflictsMap);
                    } else {
                        return ash < bsh ? -1 : 1;
                    }
                } else {
                    // same hash and equals
                    return 0;
                }
            } else {
                // sort by hash
                return ah < bh ? -1 : 1;
            }
        }
    } else {
        return at < bt ? -1 : 1;
    }
}
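The resulting total order, IRIs before Literals before BlankNodes, with Literals sub-sorted by lexical form, then language (null first), then datatype, can be illustrated with a small sketch (made-up terms; assumes access from within the same package, since compare(..) is protected):

Map<Integer, List<RDFTerm>> conflicts = new HashMap<Integer, List<RDFTerm>>();
RDFTerm iri    = new IRI("http://example.org/a");
RDFTerm plain  = new PlainLiteralImpl("a");                      // no language tag
RDFTerm tagged = new PlainLiteralImpl("a", new Language("en"));  // same lexical form, tagged "en"

assert compare(iri, plain, conflicts) < 0;     // IRIs sort before Literals
assert compare(plain, tagged, conflicts) < 0;  // null language sorts before "en"
assert compare(tagged, iri, conflicts) > 0;    // Literals sort after IRIs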
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class ClerezzaTripleCallback method triple.
private void triple(String s, String p, String value, String datatype, String language, String graph) {
    final BlankNodeOrIRI subject = getBlankNodeOrIRI(s);
    final IRI predicate = new IRI(p);
    RDFTerm object;
    if (language != null) {
        object = new PlainLiteralImpl(value, new Language(language));
    } else if (datatype == null || RDF_LANG_STRING.equals(datatype)) {
        object = new PlainLiteralImpl(value);
    } else {
        object = new TypedLiteralImpl(value, new IRI(datatype));
    }
    mGraph.add(new TripleImpl(subject, predicate, object));
}
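For illustration (values made up): a JSON-LD value object such as {"@value": "Bonjour", "@language": "fr"} arrives here with a non-null language, so the callback builds a language-tagged literal:

// Illustrative callback invocation, as the JSON-LD processor would issue it
triple("http://example.org/doc", "http://purl.org/dc/terms/title",
        "Bonjour", RDF_LANG_STRING, "fr", null);
// adds: <http://example.org/doc> <http://purl.org/dc/terms/title> "Bonjour"@fr
// With language == null and datatype == null (or rdf:langString) the value
// would become an untyped PlainLiteralImpl instead.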
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class KuromojiNlpEngine method computeEnhancements.
/**
 * Compute enhancements for the supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. In addition, fise:TextAnnotations
 * for the detected named entities are added to the metadata of the content item.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *             if the underlying process failed to work as expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
        throw new IllegalStateException("The detected language is NOT 'ja'! "
                + "As this is also checked within the #canEnhance(..) method this "
                + "indicates a Bug in the used EnhancementJobManager implementation. "
                + "Please report this on dev@stanbol.apache.org or create a "
                + "JIRA issue about this.");
    }
    // start with the Tokenizer
    TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
    // build the analyzing chain by adding all TokenFilters
    for (TokenFilterFactory filterFactory : filterFactories) {
        tokenStream = filterFactory.create(tokenStream);
    }
    // Try to extract sentences based on POS tags ...
    int sentStartOffset = -1;
    // NER data
    List<NerData> nerList = new ArrayList<NerData>();
    // the next index where the NerData.context needs to be set
    int nerSentIndex = 0;
    NerData ner = null;
    OffsetAttribute offset = null;
    try {
        // required with Solr 4
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            offset = tokenStream.addAttribute(OffsetAttribute.class);
            Token token = at.addToken(offset.startOffset(), offset.endOffset());
            // Get the POS attribute and init the PosTag
            PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
            PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (posTag == null) {
                posTag = adhocTags.get(posAttr.getPartOfSpeech());
                if (posTag == null) {
                    posTag = new PosTag(posAttr.getPartOfSpeech());
                    adhocTags.put(posAttr.getPartOfSpeech(), posTag);
                    log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
                }
            }
            // Sentence detection by POS tag
            if (sentStartOffset < 0) {
                // the previous token ended a sentence, so this one starts a new one
                sentStartOffset = offset.startOffset();
            }
            if (posTag.hasPos(Pos.Point)) {
                Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                // add the sentence as context to the NerData instances
                while (nerSentIndex < nerList.size()) {
                    nerList.get(nerSentIndex).context = sent.getSpan();
                    nerSentIndex++;
                }
                sentStartOffset = -1;
            }
            // POS
            token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
            // NER
            NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
                // write NER annotation
                Chunk chunk = at.addChunk(ner.start, ner.end);
                chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                // NOTE that the fise:TextAnnotations are written later based on the nerList
                // clean up
                ner = null;
            }
            if (nerTag != null) {
                if (ner == null) {
                    ner = new NerData(nerTag, offset.startOffset());
                    nerList.add(ner);
                }
                ner.end = offset.endOffset();
            }
            BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
            MorphoFeatures morpho = null;
            if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
                morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                // and add the posTag
                morpho.addPos(posTag);
            }
            InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
            inflectionAttr.getInflectionForm(); // inflection data is currently not used
            inflectionAttr.getInflectionType();
            if (morpho != null) {
                // if present add the morpho features
                token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
            }
        }
        // we still need to write the last sentence
        Sentence lastSent = null;
        if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
            lastSent = at.addSentence(sentStartOffset, offset.endOffset());
        }
        // and set the context of the remaining named entities
        while (nerSentIndex < nerList.size()) {
            if (lastSent != null) {
                nerList.get(nerSentIndex).context = lastSent.getSpan();
            } else {
                // no sentence detected
                nerList.get(nerSentIndex).context = at.getSpan();
            }
            nerSentIndex++;
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, "Exception while reading from "
                + "AnalyzedText contentpart", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            /* ignore */
        }
    }
    // finally write the NER annotations to the metadata of the ContentItem
    final Graph metadata = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        Language lang = new Language("ja");
        for (NerData nerData : nerList) {
            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT,
                    new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
            metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT,
                    new PlainLiteralImpl(nerData.context, lang)));
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
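The NerData helper used above is not shown in this snippet. Inferred from its usage, a holder along the following lines would suffice (a sketch only; the actual Stanbol class may differ):

// Sketch of the NerData holder as inferred from the usage above
class NerData {
    final NerTag tag;  // the NER type derived from the POS tag
    final int start;   // start offset of the entity in the analysed text
    int end;           // end offset, extended while the entity continues
    String context;    // the surrounding sentence, set once it is detected

    NerData(NerTag tag, int start) {
        this.tag = tag;
        this.start = start;
    }
}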