Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project.
From the class SentimentSummarizationEngine, method getSentimentContext:
/**
 * Determines the token-index span (the context) affected by the parsed
 * {@link Sentiment}, based on the POS tag of the sentiment word and the
 * surrounding verbs, coordinating conjunctions and nouns.
 * <p>
 * For predicative adjectives the closest verb (without a noun in-between)
 * is used as the center of the context; if a conjunction directly follows
 * (or precedes) that verb within {@code conjuctionContext} tokens, the next
 * verb in the same direction may be used instead. For other adjectives the
 * directly following token is included; for nouns only the word itself. In
 * all remaining cases (including a missing POS tag) an area of
 * +/- {@code nounContext} around the current index is used.
 * @param index the index of the sentiment word within the current section
 * @param sentiment the sentiment (its verb is set if a verb is used as context)
 * @param verbs index -&gt; verb tokens of the current section
 * @param conjunctions index -&gt; coordinating conjunction tokens
 * @param nouns index -&gt; noun tokens
 * @param sectionSpan the [start,end] index span of the current section
 * @return the [start,end] context span, clipped to the sectionSpan
 */
private Integer[] getSentimentContext(Integer index, Sentiment sentiment, NavigableMap<Integer, Token> verbs, NavigableMap<Integer, Token> conjunctions, NavigableMap<Integer, Token> nouns, Integer[] sectionSpan) {
    Integer[] context;
    PosTag pos = sentiment.getPosTag();
    final boolean isPredicative;
    if (pos != null && pos.getPosHierarchy().contains(Pos.PredicativeAdjective)) {
        isPredicative = true;
    } else if (pos != null && pos.hasCategory(LexicalCategory.Adjective) && //Adjectives that are not directly in front of a Noun are treated as predicative
            nouns.get(Integer.valueOf(index + 1)) == null) {
        isPredicative = true;
    } else {
        isPredicative = false;
    }
    if (isPredicative) {
        //use the closest verb (without a noun in-between) as context
        Integer floorNoun = nouns.floorKey(index);
        Entry<Integer, Token> floorVerb = verbs.floorEntry(index);
        Integer ceilingNoun = nouns.ceilingKey(index);
        Entry<Integer, Token> ceilingVerb = verbs.ceilingEntry(index);
        floorVerb = floorVerb == null || floorVerb.getKey().compareTo(sectionSpan[0]) < 0 || //do not use verbs with a noun in-between
            (floorNoun != null && floorVerb.getKey().compareTo(floorNoun) < 0) ? null : floorVerb;
        ceilingVerb = ceilingVerb == null || ceilingVerb.getKey().compareTo(sectionSpan[1]) > 0 || //do not use verbs with a noun in-between
            (ceilingNoun != null && ceilingVerb.getKey().compareTo(ceilingNoun) > 0) ? null : ceilingVerb;
        Entry<Integer, Token> verb;
        if (ceilingVerb != null && floorVerb != null) {
            //verbs in both directions ... use the closer one
            verb = (index - floorVerb.getKey()) < (ceilingVerb.getKey() - index) ? floorVerb : ceilingVerb;
        } else if (ceilingVerb != null) {
            verb = ceilingVerb;
        } else if (floorVerb != null) {
            verb = floorVerb;
        } else {
            //no verb that can be used as context ... return an area around the current pos.
            verb = null;
        }
        if (verb != null) {
            if (verb.getKey().compareTo(index) < 0) {
                //a conjunction shortly before the verb may coordinate a second
                //clause ... consider the previous verb instead
                Integer floorConjunction = conjunctions.floorKey(verb.getKey());
                if (floorConjunction != null && floorConjunction.compareTo(Integer.valueOf(Math.max(verb.getKey() - conjuctionContext, sectionSpan[0]))) >= 0) {
                    //search an other verb in the same direction (backwards)
                    floorVerb = verbs.floorEntry(floorConjunction);
                    if (floorVerb != null && floorVerb.getKey().compareTo(sectionSpan[0]) >= 0 && //do not step over a noun
                            (floorNoun == null || floorVerb.getKey().compareTo(floorNoun) >= 0)) {
                        verb = floorVerb;
                    }
                }
            } else if (verb.getKey().compareTo(index) > 0) {
                Integer ceilingConjunction = conjunctions.ceilingKey(verb.getKey());
                //FIX: the conjunction must lie WITHIN the conjuctionContext window
                //after the verb ('<= 0'); the previous '>= 0' accepted exactly the
                //opposite (conjunctions beyond the window) and was asymmetric to
                //the floor case above
                if (ceilingConjunction != null && ceilingConjunction.compareTo(Integer.valueOf(Math.min(verb.getKey() + conjuctionContext, sectionSpan[1]))) <= 0) {
                    //search an other verb in the same direction (forward)
                    //FIX: use ceilingEntry(..) - floorEntry(..) searched backwards
                    //and could even re-select the verb already chosen
                    ceilingVerb = verbs.ceilingEntry(ceilingConjunction);
                    if (ceilingVerb != null && ceilingVerb.getKey().compareTo(sectionSpan[1]) <= 0 && //do not step over a noun
                            (ceilingNoun == null || ceilingVerb.getKey().compareTo(ceilingNoun) <= 0)) {
                        verb = ceilingVerb;
                    }
                }
            }
            //center the context around the selected verb
            context = new Integer[] { Integer.valueOf(verb.getKey() - nounContext), Integer.valueOf(verb.getKey() + nounContext) };
            sentiment.setVerb(verb.getValue());
        } else {
            context = new Integer[] { Integer.valueOf(index - nounContext), Integer.valueOf(index + nounContext) };
        }
    } else if (pos != null && pos.hasCategory(LexicalCategory.Adjective)) {
        //for all other adjectives the affected noun is expected directly
        //after the adjective
        context = new Integer[] { index, Integer.valueOf(index + 1) };
    } else if (pos != null && pos.hasCategory(LexicalCategory.Noun)) {
        //a noun with a sentiment
        context = new Integer[] { index, index };
    } else {
        //else (includes pos == null) return default
        context = new Integer[] { Integer.valueOf(index - nounContext), Integer.valueOf(index + nounContext) };
    }
    //ensure the returned context does not exceed the parsed sectionSpan
    if (context[0].compareTo(sectionSpan[0]) < 0) {
        context[0] = sectionSpan[0];
    }
    if (context[1].compareTo(sectionSpan[1]) > 0) {
        context[1] = sectionSpan[1];
    }
    return context;
}
Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project.
From the class SentimentSummarizationEngine, method extractSentiments:
/**
 * Extracts {@link Sentiment}s for words with a {@link NlpAnnotations#SENTIMENT_ANNOTATION}.
 * The {@link NlpAnnotations#POS_ANNOTATION}s are used to link those words with
 * {@link LexicalCategory#Noun}s.
 * @param at the AnalyzedText to process
 * @param language the language of the processed text (used by the
 * is{Negation|Noun|Verb|...} classification helpers)
 * @return the {@link Sentiment} instances organised along {@link Sentence}s. If
 * no {@link Sentence}s are present on the parsed {@link AnalysedText}, than all
 * {@link Sentiment}s are added to the {@link AnalysedText}. Otherwise only
 * {@link Sentiment}s not contained within a {@link Sentence} are added to the
 * {@link AnalysedText} key.
 */
private List<SentimentPhrase> extractSentiments(AnalysedText at, String language) {
    //we do use Sentences (optional) and Tokens (required)
    Iterator<Span> tokenIt = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Token));
    //sentence-local state: all of the following are reset when a new
    //Sentence span is encountered
    List<Sentiment> sentimentTokens = new ArrayList<Sentiment>(32);
    NavigableMap<Integer, Token> negations = new TreeMap<Integer, Token>();
    NavigableMap<Integer, Token> nounsAndPronouns = new TreeMap<Integer, Token>();
    NavigableMap<Integer, Token> verbs = new TreeMap<Integer, Token>();
    NavigableMap<Integer, Token> conjuctions = new TreeMap<Integer, Token>();
    NavigableMap<Integer, Token> sectionBorders = new TreeMap<Integer, Token>();
    boolean firstTokenInSentence = true;
    Sentence sentence = null;
    final List<SentimentPhrase> sentimentPhrases = new ArrayList<SentimentPhrase>();
    while (tokenIt.hasNext()) {
        Span span = tokenIt.next();
        switch(span.getType()) {
            case Token:
                Token word = (Token) span;
                Integer wordIndex = sentimentTokens.size();
                Value<Double> sentimentAnnotation = span.getAnnotation(SENTIMENT_ANNOTATION);
                boolean addToList = false;
                Sentiment sentiment = null;
                if (sentimentAnnotation != null && sentimentAnnotation.value() != null && !sentimentAnnotation.value().equals(ZERO)) {
                    //only link the sentiment with the sentence if the word is
                    //fully contained within it
                    sentiment = new Sentiment(word, sentimentAnnotation.value(), sentence == null || word.getEnd() > sentence.getEnd() ? null : sentence);
                    addToList = true;
                }
                //classify the token and remember its index in the matching map
                if (isNegation((Token) span, language)) {
                    addToList = true;
                    negations.put(wordIndex, word);
                } else if (isNoun(word, firstTokenInSentence, language) || isPronoun(word, language)) {
                    addToList = true;
                    nounsAndPronouns.put(wordIndex, word);
                } else if (isSectionBorder(word, language)) {
                    addToList = true;
                    sectionBorders.put(wordIndex, word);
                } else if (isVerb(word, language)) {
                    addToList = true;
                    verbs.put(wordIndex, word);
                } else if (isCoordinatingConjuction(word, language)) {
                    addToList = true;
                    conjuctions.put(wordIndex, word);
                } else if (isCountable(word, language)) {
                    addToList = true;
                }
                if (log.isDebugEnabled()) {
                    Value<PosTag> pos = word.getAnnotation(NlpAnnotations.POS_ANNOTATION);
                    //FIX: guard against tokens without a POS annotation - the
                    //previous pos.value().getCategories() caused a NPE in that case
                    log.debug(" [{}] '{}' pos: {}, sentiment {}", new Object[] { addToList ? sentimentTokens.size() : "-", word.getSpan(), pos == null || pos.value() == null ? "none" : pos.value().getCategories(), sentiment == null ? "none" : sentiment.getValue() });
                }
                if (addToList) {
                    //add the token (null for tokens without a sentiment value)
                    sentimentTokens.add(sentiment);
                }
                firstTokenInSentence = false;
                break;
            case Sentence:
                //summarize the previous sentence and reset the sentence-local state
                sentimentPhrases.addAll(summarizeSentence(sentimentTokens, negations, nounsAndPronouns, verbs, conjuctions, sectionBorders));
                negations.clear();
                nounsAndPronouns.clear();
                sentimentTokens.clear();
                verbs.clear();
                //FIX: also clear the conjunctions - previously missing, so
                //conjunction indexes leaked into the following sentences
                conjuctions.clear();
                sectionBorders.clear();
                firstTokenInSentence = true;
                sentence = (Sentence) span;
                break;
            case TextSection:
                break;
            default:
                break;
        }
    }
    //summarize the tokens of the last sentence (or of the whole text if no
    //sentences are present)
    sentimentPhrases.addAll(summarizeSentence(sentimentTokens, negations, nounsAndPronouns, verbs, conjuctions, sectionBorders));
    return sentimentPhrases;
}
Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project.
From the class OpenNlpPosTaggingEngine, method posTag:
/**
 * POS tags the parsed tokens by using the parsed POS tagger and adds the
 * resulting tags as {@link PosTag} annotations to the tokens. Tags without
 * a mapping in the posModel are created ad-hoc and cached.
 * @param tokenList the tokens to tag
 * @param posTagger the OpenNLP POS tagger used to compute the tags
 * @param posModel the tag set used to map string tags to {@link PosTag}s
 * @param adhocTags cache for tags not covered by the posModel
 * @param language the language (used when creating ad-hoc tags)
 */
private void posTag(List<Token> tokenList, POSTagger posTagger, TagSet<PosTag> posModel, Map<String, PosTag> adhocTags, String language) {
    final int numTokens = tokenList.size();
    String[] tokenTexts = new String[numTokens];
    for (int idx = 0; idx < numTokens; idx++) {
        tokenTexts[idx] = tokenList.get(idx).getSpan();
    }
    //get the topK POS tag sequences for the parsed tokens
    Sequence[] posSequences = posTagger.topKSequences(tokenTexts);
    //NOTE: every Sequence always contains POS tags for ALL tokens. If fewer
    //      than posSequences.length alternatives exist for a token, the best
    //      match is repeated for the remaining sequences. Such duplicates
    //      are not copied over to the annotations.
    PosTag[] actPos = new PosTag[posSequences.length];
    double[] actProp = new double[posSequences.length];
    for (int idx = 0; idx < tokenTexts.length; idx++) {
        Token token = tokenList.get(idx);
        int count = 0; //number of distinct alternatives collected
        for (int alt = 0; alt < posSequences.length; alt++) {
            String tag = posSequences[alt].getOutcomes().get(idx);
            if (alt > 0 && tag.equals(actPos[0].getTag())) {
                break; //repetition of the best match ... ignore the rest
            }
            actPos[alt] = getPosTag(posModel, adhocTags, tag, language);
            actProp[alt] = posSequences[alt].getProbs()[idx];
            count++;
        }
        //annotate the token with the collected POS tags and probabilities
        token.addAnnotations(POS_ANNOTATION, Value.values(actPos, actProp, count));
    }
}
Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project.
From the class KuromojiNlpEngine, method computeEnhancements:
/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    //this engine only supports Japanese texts ("ja" or "ja-*")
    if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
        throw new IllegalStateException("The detected language is NOT 'ja'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
    }
    //start with the Tokenizer
    TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
    //build the analyzing chain by adding all TokenFilters
    for (TokenFilterFactory filterFactory : filterFactories) {
        tokenStream = filterFactory.create(tokenStream);
    }
    //Try to extract sentences based on POS tags ...
    //start offset of the currently open sentence (-1 ... no sentence open)
    int sentStartOffset = -1;
    //NER data
    List<NerData> nerList = new ArrayList<NerData>();
    //the next index where the NerData.context need to be set
    int nerSentIndex = 0;
    //the currently open named entity (null ... none open)
    NerData ner = null;
    OffsetAttribute offset = null;
    try {
        //required with Solr 4
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            //NOTE: addAttribute(..) returns the attribute instance already
            //present on the stream (it does not re-add it)
            offset = tokenStream.addAttribute(OffsetAttribute.class);
            Token token = at.addToken(offset.startOffset(), offset.endOffset());
            //Get the POS attribute and init the PosTag
            PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
            PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (posTag == null) {
                //no mapping in the tag set ... create (and cache) an ad-hoc tag
                posTag = adhocTags.get(posAttr.getPartOfSpeech());
                if (posTag == null) {
                    posTag = new PosTag(posAttr.getPartOfSpeech());
                    adhocTags.put(posAttr.getPartOfSpeech(), posTag);
                    log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
                }
            }
            //Sentence detection by POS tag
            if (sentStartOffset < 0) {
                //the last token was a sentence ending
                sentStartOffset = offset.startOffset();
            }
            if (posTag.hasPos(Pos.Point)) {
                //sentence-ending punctuation ... close the current sentence
                Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                //add the sentence as context to the NerData instances
                while (nerSentIndex < nerList.size()) {
                    nerList.get(nerSentIndex).context = sent.getSpan();
                    nerSentIndex++;
                }
                sentStartOffset = -1;
            }
            //POS
            token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
            //NER
            NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
            //the open named entity ends when the NER type changes or the
            //current token has no NER tag
            if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
                //write NER annotation
                Chunk chunk = at.addChunk(ner.start, ner.end);
                chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                //NOTE that the fise:TextAnnotation are written later based on the nerList
                //clean up
                ner = null;
            }
            if (nerTag != null) {
                if (ner == null) {
                    //start of a new named entity
                    ner = new NerData(nerTag, offset.startOffset());
                    nerList.add(ner);
                }
                //extend the open named entity to include the current token
                ner.end = offset.endOffset();
            }
            BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
            MorphoFeatures morpho = null;
            if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
                morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                //and add the posTag
                morpho.addPos(posTag);
            }
            InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
            //NOTE(review): the return values of the following two calls are
            //discarded - inflection information is currently not added to the
            //MorphoFeatures. Presumably unfinished or intentional; confirm.
            inflectionAttr.getInflectionForm();
            inflectionAttr.getInflectionType();
            if (morpho != null) {
                //if present add the morpho
                token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
            }
        }
        //we still need to write the last sentence
        Sentence lastSent = null;
        if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
            lastSent = at.addSentence(sentStartOffset, offset.endOffset());
        }
        //and set the context off remaining named entities
        while (nerSentIndex < nerList.size()) {
            if (lastSent != null) {
                nerList.get(nerSentIndex).context = lastSent.getSpan();
            } else {
                //no sentence detected ... use the whole text as context
                nerList.get(nerSentIndex).context = at.getSpan();
            }
            nerSentIndex++;
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, "Exception while reading from " + "AnalyzedText contentpart", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            /* ignore */
        }
    }
    //finally write the NER annotations to the metadata of the ContentItem
    final Graph metadata = ci.getMetadata();
    //write under the ContentItem's write lock as required by the Stanbol API
    ci.getLock().writeLock().lock();
    try {
        Language lang = new Language("ja");
        for (NerData nerData : nerList) {
            //create a fise:TextAnnotation for each detected named entity
            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
            metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(nerData.context, lang)));
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project.
From the class CeliMorphoFeatures, method featuresAsTriples:
/**
 * Converts the morphological features held by this instance to {@link Triple}s
 * describing the parsed fise:TextAnnotation.
 * @param textAnnotation the IRI of the TextAnnotation the triples are about
 * @param lang the language used for plain literals (e.g. the lemma)
 * @return the triples representing the morphological features
 */
public Collection<? extends Triple> featuresAsTriples(IRI textAnnotation, Language lang) {
    //use ArrayList instead of the legacy synchronized Vector (no concurrent
    //access within this method); fully qualified to avoid an import change
    Collection<TripleImpl> result = new java.util.ArrayList<TripleImpl>();
    //the lemma is always present
    result.add(new TripleImpl(textAnnotation, CeliLemmatizerEnhancementEngine.hasLemmaForm, new PlainLiteralImpl(getLemma(), lang)));
    //add the lexical categories of all POS tags mapped to the Stanbol tag set
    for (PosTag pos : getPosList()) {
        if (pos.isMapped()) {
            for (LexicalCategory cat : pos.getCategories()) {
                result.add(new TripleImpl(textAnnotation, RDF_TYPE, cat.getUri()));
            }
        }
    }
    //for the remaining features only values with a mapped URI are written
    for (NumberTag num : getNumberList()) {
        if (num.getNumber() != null) {
            result.add(new TripleImpl(textAnnotation, HAS_NUMBER, num.getNumber().getUri()));
        }
    }
    for (Person pers : getPersonList()) {
        result.add(new TripleImpl(textAnnotation, HAS_PERSON, pers.getUri()));
    }
    for (GenderTag gender : getGenderList()) {
        if (gender.getGender() != null) {
            result.add(new TripleImpl(textAnnotation, HAS_GENDER, gender.getGender().getUri()));
        }
    }
    for (Definitness def : getDefinitnessList()) {
        result.add(new TripleImpl(textAnnotation, HAS_DEFINITENESS, def.getUri()));
    }
    for (CaseTag caseFeat : getCaseList()) {
        if (caseFeat.getCase() != null) {
            result.add(new TripleImpl(textAnnotation, HAS_CASE, caseFeat.getCase().getUri()));
        }
    }
    for (VerbMoodTag vf : getVerbMoodList()) {
        if (vf.getVerbForm() != null) {
            result.add(new TripleImpl(textAnnotation, HAS_MOOD, vf.getVerbForm().getUri()));
        }
    }
    for (TenseTag tense : getTenseList()) {
        if (tense.getTense() != null) {
            result.add(new TripleImpl(textAnnotation, HAS_TENSE, tense.getTense().getUri()));
        }
    }
    return result;
}
Aggregations