Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class OpenNlpSentenceDetectionEngine, method computeEnhancements.
/**
 * Compute enhancements for the supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method detects sentences in the <code>text/plain</code> content and adds
 * them to the {@link AnalysedText} content part of the content item. The
 * metadata of the content item is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *             if the underlying process failed to work as expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, true);
    SentenceDetector sentenceDetector = getSentenceDetector(language);
    if (sentenceDetector != null) {
        //detect sentences and add them to the AnalysedText
        for (opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
            Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
            log.trace(" > add {}", sentence);
        }
    } else {
        log.warn("SentenceDetector model for language {} is no longer available. "
                + "This might happen if the model becomes unavailable during enhancement. "
                + "If this happens more often it might also indicate a bug in the used "
                + "EnhancementJobManager implementation, as the availability is also checked "
                + "in the canEnhance(..) method of this EnhancementEngine.", language);
    }
}
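For context, the getSentenceDetector(language) call above resolves an OpenNLP SentenceDetector for the given language. A minimal sketch of how such a detector can be created directly with OpenNLP, assuming a locally available model file ("en-sent.bin" is an assumed local copy of the standard English model; the actual engine resolves models through Stanbol's OpenNLP service):

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.util.Span;

public class SentenceDetectorSketch {

    public static void main(String[] args) throws IOException {
        //load the (assumed) model file and create the detector
        SentenceDetector detector;
        try (InputStream in = new FileInputStream("en-sent.bin")) {
            detector = new SentenceDetectorME(new SentenceModel(in));
        }
        String text = "Paris is the capital of France. Berlin is the capital of Germany.";
        //sentPosDetect(..) returns character offset Spans, as consumed by the engine above
        for (Span sentSpan : detector.sentPosDetect(text)) {
            System.out.println(sentSpan.getStart() + ".." + sentSpan.getEnd() + ": "
                    + sentSpan.getCoveredText(text));
        }
    }
}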
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class FstLinkingEngineTest, method setupTest.
/**
 * Initialises the {@link #ci} and {@link #content} fields for the tests.
 * It creates a ContentItem containing a '<code>text/plain</code>'
 * {@link Blob} for the {@value #TEST_TEXT_FILE} and an {@link AnalysedText}
 * filled with the NLP analysis results stored in
 * {@link #TEST_TEXT_NLP_FILE}.
 * @throws IOException on any IO related error while reading the test files
 */
@Before
public void setupTest() throws IOException {
    //create a ContentItem for the plain text used for testing
    InputStream is = FstLinkingEngineTest.class.getClassLoader().getResourceAsStream(TEST_TEXT_FILE);
    Assert.assertNotNull("Unable to load '" + TEST_TEXT_FILE + "' via classpath", is);
    ContentItem ci = cif.createContentItem(new StreamSource(is, "text/plain"));
    AnalysedText at = atf.createAnalysedText(ci, ci.getBlob());
    is.close();
    //parse the prepared NLP results and add them to the ContentItem
    is = FstLinkingEngineTest.class.getClassLoader().getResourceAsStream(TEST_TEXT_NLP_FILE);
    Assert.assertNotNull("Unable to load '" + TEST_TEXT_NLP_FILE + "' via classpath", is);
    AnalyzedTextParser.getDefaultInstance().parse(is, Charset.forName("UTF-8"), at);
    is.close();
    //set the language of the ContentItem
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, EN_LANGUAGE));
    //store the ContentItem and its plain text content in the test fields
    this.ci = ci;
    this.content = at.getText().toString();
}
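Once parsed, the NLP results are available through the AnalysedText API. A minimal sketch of reading back the restored sentences, assuming the at instance from the setup above (illustrative only, not part of the test):

import java.util.Iterator;
import org.apache.stanbol.enhancer.nlp.model.Sentence;

//iterate the sentences restored by the AnalyzedTextParser and print their offsets
Iterator<Sentence> sentences = at.getSentences();
while (sentences.hasNext()) {
    Sentence sentence = sentences.next();
    System.out.println("[" + sentence.getStart() + "," + sentence.getEnd() + "] " + sentence.getSpan());
}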
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class Nlp2RdfMetadataEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String lang = EnhancementEngineHelper.getLanguage(ci);
    Language language = lang == null ? null : new Language(lang);
    //now iterate over the AnalysedText data and create the RDF representation
    //TODO: make configurable
    boolean sentences = true;
    boolean phrases = true;
    boolean words = true;
    EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
    if (sentences) {
        activeTypes.add(SpanTypeEnum.Sentence);
    }
    if (phrases) {
        activeTypes.add(SpanTypeEnum.Chunk);
    }
    if (words) {
        activeTypes.add(SpanTypeEnum.Token);
    }
    Graph metadata = ci.getMetadata();
    IRI base = ci.getUri();
    ci.getLock().writeLock().lock();
    try {
        Iterator<Span> spans = at.getEnclosed(activeTypes);
        IRI sentence = null;
        IRI phrase = null;
        IRI word = null;
        boolean firstWordInSentence = true;
        while (spans.hasNext()) {
            Span span = spans.next();
            //TODO: filter Spans based on additional requirements
            //(1) write generic information about the span
            IRI current = writeSpan(metadata, base, at, language, span);
            //(2) add the relations between the different spans
            switch (span.getType()) {
                case Sentence:
                    if (sentence != null) {
                        metadata.add(new TripleImpl(sentence, SsoOntology.nextSentence.getUri(), current));
                    }
                    sentence = current;
                    firstWordInSentence = true;
                    break;
                case Chunk:
                    if (sentence != null) {
                        metadata.add(new TripleImpl(current, StringOntology.superString.getUri(), sentence));
                        if (word != null) {
                            metadata.add(new TripleImpl(word, SsoOntology.lastWord.getUri(), sentence));
                        }
                    }
                    phrase = current;
                    break;
                case Token:
                    if (sentence != null) {
                        metadata.add(new TripleImpl(current, SsoOntology.sentence.getUri(), sentence));
                        if (firstWordInSentence) {
                            metadata.add(new TripleImpl(current, SsoOntology.firstWord.getUri(), sentence));
                            firstWordInSentence = false;
                        }
                    }
                    if (phrase != null) {
                        metadata.add(new TripleImpl(current, SsoOntology.parent.getUri(), phrase));
                    }
                    if (word != null) {
                        metadata.add(new TripleImpl(word, SsoOntology.nextWord.getUri(), current));
                        metadata.add(new TripleImpl(current, SsoOntology.previousWord.getUri(), word));
                    }
                    word = current;
                    break;
                default:
                    break;
            }
            //(3) add specific information such as POS, chunk type ...
            writePos(metadata, span, current);
            writePhrase(metadata, span, current);
            //OLiA does not include Sentiments
            Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
            if (sentiment != null && sentiment.value() != null) {
                metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment.value())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
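The writeSpan(..) helper used in step (1) is not part of this snippet. The following is a simplified, hypothetical sketch of what such a helper might look like, assuming NIF-style fragment IRIs; BEGIN_INDEX, END_INDEX and ANCHOR_OF are placeholder properties standing in for the actual String/SSO ontology constants:

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Language;
import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Span;

public class WriteSpanSketch {

    private static final LiteralFactory lf = LiteralFactory.getInstance();
    //hypothetical placeholder properties; the real engine uses String/SSO ontology constants
    private static final IRI BEGIN_INDEX = new IRI("http://example.org/string#beginIndex");
    private static final IRI END_INDEX = new IRI("http://example.org/string#endIndex");
    private static final IRI ANCHOR_OF = new IRI("http://example.org/string#anchorOf");

    //mints a NIF-style fragment IRI for the span and writes its offsets and
    //covered text to the metadata graph
    static IRI writeSpan(Graph metadata, IRI base, AnalysedText at, Language language, Span span) {
        IRI segment = new IRI(base.getUnicodeString() + "#char=" + span.getStart() + "," + span.getEnd());
        metadata.add(new TripleImpl(segment, BEGIN_INDEX, lf.createTypedLiteral(span.getStart())));
        metadata.add(new TripleImpl(segment, END_INDEX, lf.createTypedLiteral(span.getEnd())));
        metadata.add(new TripleImpl(segment, ANCHOR_OF, new PlainLiteralImpl(span.getSpan(), language)));
        return segment;
    }
}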
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class AnalyzedTextSerializerAndParserTest, method testSerialization.
@Test
public void testSerialization() throws IOException {
    ByteArrayOutputStream bout = new ByteArrayOutputStream();
    AnalyzedTextSerializer serializer = AnalyzedTextSerializer.getDefaultInstance();
    serializer.serialize(analysedTextWithData, bout, null);
    //get the serialized String and check for some expected elements
    byte[] data = bout.toByteArray();
    String serialized = new String(data, Charset.forName("UTF-8"));
    log.info(serialized);
    Assert.assertTrue(serialized.contains("\"spans\" : [ {"));
    Assert.assertTrue(serialized.contains("\"type\" : \"Text\""));
    Assert.assertTrue(serialized.contains("\"type\" : \"Sentence\""));
    Assert.assertTrue(serialized.contains("\"type\" : \"Token\""));
    Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.pos\" : {"));
    Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.pos.PosTag\""));
    Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.ner\" : {"));
    Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.ner.NerTag\""));
    Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.morpho\" : {"));
    Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures\""));
    //deserialize
    AnalyzedTextParser parser = AnalyzedTextParser.getDefaultInstance();
    AnalysedText parsedAt = parser.parse(new ByteArrayInputStream(data), null,
            atFactory.createAnalysedText(textBlob.getValue()));
    Assert.assertEquals(analysedTextWithData, parsedAt);
    Iterator<Span> origSpanIt = analysedTextWithData.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
    Iterator<Span> parsedSpanIt = parsedAt.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
    while (origSpanIt.hasNext() && parsedSpanIt.hasNext()) {
        Span orig = origSpanIt.next();
        Span parsed = parsedSpanIt.next();
        Assert.assertEquals(orig, parsed);
        Set<String> origKeys = orig.getKeys();
        Set<String> parsedKeys = parsed.getKeys();
        Assert.assertEquals(origKeys, parsedKeys);
        for (String key : origKeys) {
            List<Value<?>> origValues = orig.getValues(key);
            List<Value<?>> parsedValues = parsed.getValues(key);
            Assert.assertEquals(origValues, parsedValues);
        }
    }
    Assert.assertFalse("Original AnalyzedText MUST NOT have additional Spans", origSpanIt.hasNext());
    Assert.assertFalse("Parsed AnalyzedText MUST NOT have additional Spans", parsedSpanIt.hasNext());
}
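For context, the annotations asserted above (for example the PosTag under "stanbol.enhancer.nlp.pos") are attached to spans when the test fixture analysedTextWithData is built. A minimal sketch of such an attachment, assuming an AnalysedText instance at; the actual fixture setup and the offsets used here are illustrative:

import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;

//add a sentence and a token, then attach a POS annotation to the token;
//the serializer writes such annotations under keys like "stanbol.enhancer.nlp.pos"
Sentence sentence = at.addSentence(0, 16);
Token token = at.addToken(0, 5);
token.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("NNP", LexicalCategory.Noun)));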
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class DependencyRelationSupportTest, method testSerializationAndParse.
@Test
public void testSerializationAndParse() throws IOException {
    String serialized = getSerializedString();
    Assert.assertTrue(serialized.contains(jsonCheckObama));
    Assert.assertTrue(serialized.contains(jsonCheckVisited));
    Assert.assertTrue(serialized.contains(jsonCheckChina));
    AnalysedText parsedAt = getParsedAnalysedText(serialized);
    assertAnalysedTextEquality(parsedAt);
}
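The getSerializedString() and getParsedAnalysedText(..) helpers are not included in this snippet. Hypothetical versions, reusing the serializer/parser pattern from the previous test (the fields at, atFactory and blob are assumed test fixtures):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import org.apache.stanbol.enhancer.nlp.json.AnalyzedTextParser;
import org.apache.stanbol.enhancer.nlp.json.AnalyzedTextSerializer;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;

//serialize the prepared AnalysedText (assumed field 'at') to a JSON string
String getSerializedString() throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    AnalyzedTextSerializer.getDefaultInstance().serialize(at, out, Charset.forName("UTF-8"));
    return new String(out.toByteArray(), Charset.forName("UTF-8"));
}

//parse the JSON string back into a fresh AnalysedText created for the same Blob
AnalysedText getParsedAnalysedText(String serialized) throws IOException {
    ByteArrayInputStream in = new ByteArrayInputStream(serialized.getBytes(Charset.forName("UTF-8")));
    return AnalyzedTextParser.getDefaultInstance().parse(in, Charset.forName("UTF-8"),
            atFactory.createAnalysedText(blob));
}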