Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.
From the class TestKuromojiNlpEngine, method testEngine.
@Test
public void testEngine() throws EngineException {
    LiteralFactory lf = LiteralFactory.getInstance();
    Assert.assertEquals(EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
    engine.computeEnhancements(contentItem);
    //assert the results
    Map<IRI, RDFTerm> expected = new HashMap<IRI, RDFTerm>();
    expected.put(Properties.DC_CREATOR, lf.createTypedLiteral(engine.getClass().getName()));
    expected.put(Properties.ENHANCER_EXTRACTED_FROM, contentItem.getUri());
    Assert.assertEquals(16, EnhancementStructureHelper.validateAllTextAnnotations(contentItem.getMetadata(), text, expected));
    AnalysedText at = AnalysedTextUtils.getAnalysedText(contentItem);
    Assert.assertNotNull(at);
    List<Sentence> sentences = AnalysedTextUtils.asList(at.getSentences());
    Assert.assertNotNull(sentences);
    Assert.assertEquals(7, sentences.size());
    //NOTE: the values in the following arrays are based on the first run of the
    // engine, so these assertions only detect changes in the results. They cannot
    // validate that the tokenization and NER detection are actually correct.
    int[] expectedChunks = new int[] { 5, 3, 1, 0, 1, 2, 4 };
    int[] expectedTokens = new int[] { 25, 25, 25, 24, 33, 17, 32 };
    int sentIndex = 0;
    for (Sentence sent : sentences) {
        List<Chunk> sentenceNer = AnalysedTextUtils.asList(sent.getChunks());
        Assert.assertEquals(expectedChunks[sentIndex], sentenceNer.size());
        for (Chunk chunk : sentenceNer) {
            Value<NerTag> nerValue = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            Assert.assertNotNull(nerValue);
            Assert.assertNotNull(nerValue.value().getType());
        }
        List<Token> tokens = AnalysedTextUtils.asList(sent.getTokens());
        Assert.assertEquals(expectedTokens[sentIndex], tokens.size());
        for (Token token : tokens) {
            Value<PosTag> posValue = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            Assert.assertNotNull(posValue);
        }
        sentIndex++;
    }
}
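For readers unfamiliar with the AnalysedText API exercised by this test, here is a minimal sketch of the same traversal outside a test: given a ContentItem that some NLP engine has already processed, it prints each sentence with its NER chunks and POS-tagged tokens. The import paths are inferred from the package named in this page's title and from the common Stanbol layout, so treat them as assumptions.

import java.util.List;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;

public class AnalysedTextWalker {

    /** Prints the sentences, NER chunks and tokens of an already enhanced ContentItem. */
    public static void dump(ContentItem contentItem) {
        AnalysedText at = AnalysedTextUtils.getAnalysedText(contentItem);
        if (at == null) {
            return; //no NLP engine has created an AnalysedText content part yet
        }
        for (Sentence sent : AnalysedTextUtils.asList(at.getSentences())) {
            System.out.println("Sentence: " + sent.getSpan());
            for (Chunk chunk : AnalysedTextUtils.asList(sent.getChunks())) {
                Value<NerTag> ner = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
                if (ner != null) { //NER annotations are attached to Chunks
                    System.out.println("  NER: " + chunk.getSpan() + " -> " + ner.value().getType());
                }
            }
            for (Token token : AnalysedTextUtils.asList(sent.getTokens())) {
                //POS annotations are attached to Tokens
                Value<PosTag> pos = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
                System.out.println("  Token: " + token.getSpan() + (pos == null ? "" : " /" + pos.value().getTag()));
            }
        }
    }
}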
Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.
From the class CorefFeatureSupportTest, method initCorefAnnotations.
private static void initCorefAnnotations() {
    Sentence sentence1 = at.addSentence(0, sentenceText1.indexOf(".") + 1);
    Token obama = sentence1.addToken(0, "Obama".length());
    Sentence sentence2 = at.addSentence(sentenceText1.indexOf(".") + 2, sentenceText2.indexOf(".") + 1);
    int heStartIdx = sentence2.getSpan().indexOf("He");
    Token he = sentence2.addToken(heStartIdx, heStartIdx + "He".length());
    //"Obama" is the representative mention of the chain; "He" co-refers to it
    Set<Span> obamaMentions = new HashSet<Span>();
    obamaMentions.add(he);
    obama.addAnnotation(NlpAnnotations.COREF_ANNOTATION, Value.value(new CorefFeature(true, obamaMentions)));
    //"He" is a non-representative mention pointing back to "Obama"
    Set<Span> heMentions = new HashSet<Span>();
    heMentions.add(obama);
    he.addAnnotation(NlpAnnotations.COREF_ANNOTATION, Value.value(new CorefFeature(false, heMentions)));
}
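Reading the annotations back is symmetric. The following lines, which could be appended to initCorefAnnotations() above, are a sketch that assumes CorefFeature exposes its constructor arguments via isRepresentative() and getMentions(); those accessor names are an assumption, not verified against the source.

//sketch: read the coref chain back from the "obama" token (accessor names assumed)
Value<CorefFeature> coref = obama.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
if (coref != null) {
    CorefFeature feature = coref.value();
    System.out.println("\"Obama\" is representative: " + feature.isRepresentative());
    for (Span mention : feature.getMentions()) {
        System.out.println("  co-referent mention: " + mention.getSpan());
    }
}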
Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.
From the class DependencyRelationSupportTest, method initDepTreeAnnotations.
private static void initDepTreeAnnotations() {
    Sentence sentence = at.addSentence(0, text.indexOf(".") + 1);
    Token obama = sentence.addToken(0, "Obama".length());
    int visitedStartIdx = sentence.getSpan().indexOf("visited");
    Token visited = sentence.addToken(visitedStartIdx, visitedStartIdx + "visited".length());
    int chinaStartIdx = sentence.getSpan().indexOf("China");
    Token china = sentence.addToken(chinaStartIdx, chinaStartIdx + "China".length());
    //"Obama" is the nominal subject (dependent) of "visited"
    GrammaticalRelationTag nSubjGrammRelTag = new GrammaticalRelationTag("nsubj", GrammaticalRelation.NominalSubject);
    obama.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION, Value.value(new DependencyRelation(nSubjGrammRelTag, true, visited)));
    //"visited" is the root of the tree and governs both "Obama" (nsubj) and "China" (dobj)
    GrammaticalRelationTag rootGrammRelTag = new GrammaticalRelationTag("root", GrammaticalRelation.Root);
    GrammaticalRelationTag dobjGrammRelTag = new GrammaticalRelationTag("dobj", GrammaticalRelation.DirectObject);
    visited.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION, Value.value(new DependencyRelation(rootGrammRelTag, true, null)));
    visited.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION, Value.value(new DependencyRelation(nSubjGrammRelTag, false, obama)));
    visited.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION, Value.value(new DependencyRelation(dobjGrammRelTag, false, china)));
    //"China" is the direct object (dependent) of "visited"
    china.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION, Value.value(new DependencyRelation(dobjGrammRelTag, true, visited)));
}
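Because DEPENDENCY_ANNOTATION is added several times to the visited token, reading it back needs a multi-value accessor. The sketch below could be appended to initDepTreeAnnotations(); it assumes getAnnotations(..) returns all values for an annotation key and that DependencyRelation exposes getGrammaticalRelationTag(), isDependent() and getPartner(), all of which are assumptions.

//sketch: enumerate the relations attached to "visited" (accessor names assumed)
for (Value<DependencyRelation> v : visited.getAnnotations(NlpAnnotations.DEPENDENCY_ANNOTATION)) {
    DependencyRelation rel = v.value();
    Span partner = rel.getPartner(); //null for the root relation
    System.out.println(rel.getGrammaticalRelationTag().getTag()
            + (rel.isDependent() ? " <- " : " -> ")
            + (partner == null ? "ROOT" : partner.getSpan()));
}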
Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.
From the class SmartcnSentenceEngine, method computeEnhancements.
/**
 * Compute enhancements for the supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager})
 * should take care of persistent storage of the enhanced
 * {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using
 * {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart}
 * from a text/plain part and stores it as a new part in the content item.
 * The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *             if the underlying process failed to work as expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
        throw new IllegalStateException("The detected language is NOT 'zh'! "
                + "As this is also checked within the #canEnhance(..) method this "
                + "indicates a bug in the used EnhancementJobManager implementation. "
                + "Please report this on dev@stanbol.apache.org or create a "
                + "JIRA issue about this.");
    }
    //first the sentences
    TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
    try {
        sentences.reset();
        while (sentences.incrementToken()) {
            OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
            Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
            if (log.isTraceEnabled()) {
                log.trace("detected {}:{}", s, s.getSpan());
            }
        }
    } catch (IOException e) {
        String message = String.format("IOException while reading from "
                + "CharSequenceReader of AnalysedText for ContentItem %s", ci.getUri());
        log.error(message, e);
        throw new EngineException(this, ci, message, e);
    }
}
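The loop above follows the standard Lucene TokenStream consumption contract: reset(), then incrementToken() until it returns false, reading positions from an OffsetAttribute. Below is a stripped-down sketch of that pattern with resource handling added; the SentenceTokenizer import path matches older Lucene smartcn releases and its constructor signature varies between Lucene versions, so both are assumptions.

import java.io.IOException;
import org.apache.commons.io.input.CharSequenceReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class SentenceDetectionSketch {

    public static void listSentences(CharSequence text) throws IOException {
        TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(text));
        //attributes should be requested once, before the consumption loop
        OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
        try {
            sentences.reset();
            while (sentences.incrementToken()) {
                //each token emitted by SentenceTokenizer covers one sentence
                System.out.println("sentence [" + offset.startOffset() + ".." + offset.endOffset() + "]: "
                        + text.subSequence(offset.startOffset(), offset.endOffset()));
            }
            sentences.end();
        } finally {
            sentences.close();
        }
    }
}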
Use of org.apache.stanbol.enhancer.nlp.model.Sentence in project stanbol by apache.
From the class SentimentSummarizationEngine, method writeSentimentEnhancements.
private void writeSentimentEnhancements(ContentItem ci, List<SentimentPhrase> sentimentPhrases, AnalysedText at, Language lang) {
    Graph metadata = ci.getMetadata();
    Sentence currentSentence = null;
    final List<SentimentPhrase> sentencePhrases = new ArrayList<SentimentPhrase>();
    for (SentimentPhrase sentPhrase : sentimentPhrases) {
        Sentence sentence = sentPhrase.getSentence();
        if (log.isDebugEnabled()) {
            //debug sentiment info
            CharSequence phraseText = at.getText().subSequence(sentPhrase.getStartIndex(), sentPhrase.getEndIndex());
            log.debug("Write SentimentPhrase for {} (sentence: {})", phraseText,
                    sentence == null ? "none"
                            : sentence.getSpan().length() > 17
                                    ? (sentence.getSpan().subSequence(0, 17) + "...")
                                    : sentence.getSpan());
            List<Sentiment> sentiments = sentPhrase.getSentiments();
            log.debug(" > {} Sentiments:", sentiments.size());
            for (int i = 0; i < sentiments.size(); i++) {
                log.debug(" {}. {}", i + 1, sentiments.get(i));
            }
        }
        if (writeSentimentPhrases) {
            IRI enh = createTextEnhancement(ci, this);
            String phraseText = at.getSpan().substring(sentPhrase.getStartIndex(), sentPhrase.getEndIndex());
            metadata.add(new TripleImpl(enh, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(phraseText, lang)));
            if (sentPhrase.getSentence() == null) {
                metadata.add(new TripleImpl(enh, ENHANCER_SELECTION_CONTEXT,
                        new PlainLiteralImpl(getSelectionContext(at.getSpan(), phraseText, sentPhrase.getStartIndex()), lang)));
            } else {
                metadata.add(new TripleImpl(enh, ENHANCER_SELECTION_CONTEXT,
                        new PlainLiteralImpl(sentPhrase.getSentence().getSpan(), lang)));
            }
            metadata.add(new TripleImpl(enh, ENHANCER_START, lf.createTypedLiteral(sentPhrase.getStartIndex())));
            metadata.add(new TripleImpl(enh, ENHANCER_END, lf.createTypedLiteral(sentPhrase.getEndIndex())));
            if (sentPhrase.getPositiveSentiment() != null) {
                metadata.add(new TripleImpl(enh, POSITIVE_SENTIMENT_PROPERTY, lf.createTypedLiteral(sentPhrase.getPositiveSentiment())));
            }
            if (sentPhrase.getNegativeSentiment() != null) {
                metadata.add(new TripleImpl(enh, NEGATIVE_SENTIMENT_PROPERTY, lf.createTypedLiteral(sentPhrase.getNegativeSentiment())));
            }
            metadata.add(new TripleImpl(enh, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentPhrase.getSentiment())));
            //add the Sentiment type as well as the type of the SSO Ontology
            metadata.add(new TripleImpl(enh, DC_TYPE, SENTIMENT_TYPE));
            IRI ssoType = NIFHelper.SPAN_TYPE_TO_SSO_TYPE.get(SpanTypeEnum.Chunk);
            if (ssoType != null) {
                metadata.add(new TripleImpl(enh, DC_TYPE, ssoType));
            }
        }
        if (writeSentencesSentimet && sentence != null) {
            if (sentence.equals(currentSentence)) {
                sentencePhrases.add(sentPhrase);
            } else {
                //sentence changed: write the sentiment of the completed sentence
                writeSentiment(ci, currentSentence, sentencePhrases);
                //reset
                currentSentence = sentence;
                sentencePhrases.clear();
                sentencePhrases.add(sentPhrase);
            }
        }
    }
    if (!sentencePhrases.isEmpty()) {
        //flush the phrases buffered for the last sentence
        writeSentiment(ci, currentSentence, sentencePhrases);
    }
    if (writeDocumentSentiment) {
        writeSentiment(ci, at, sentimentPhrases);
    }
}
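The per-sentence handling in the loop above is a group-and-flush pattern: phrases are buffered while they belong to the same Sentence and written out whenever the sentence changes, with one final flush after the loop. Here is a minimal generic sketch of that pattern in plain Java, detached from the Stanbol types; a call such as groupAndFlush(phrases, SentimentPhrase::getSentence, (sentence, group) -> writeSentiment(ci, sentence, group)) would mirror the loop above (that invocation is illustrative, not taken from the source).

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.function.BiConsumer;
import java.util.function.Function;

public class GroupAndFlush {

    /**
     * Buffers consecutive items that share the same key and hands each finished
     * group to the flush callback, mirroring how writeSentimentEnhancements
     * collects SentimentPhrases per Sentence.
     */
    public static <K, T> void groupAndFlush(List<T> items, Function<T, K> key,
            BiConsumer<K, List<T>> flush) {
        K currentKey = null;
        List<T> buffer = new ArrayList<>();
        for (T item : items) {
            K k = key.apply(item);
            if (!Objects.equals(k, currentKey) && !buffer.isEmpty()) {
                //key changed: hand a copy of the finished group to the callback
                flush.accept(currentKey, new ArrayList<>(buffer));
                buffer.clear();
            }
            currentKey = k;
            buffer.add(item);
        }
        if (!buffer.isEmpty()) {
            //flush the final group after the loop
            flush.accept(currentKey, buffer);
        }
    }
}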