use of org.apache.stanbol.enhancer.nlp.phrase.PhraseTag in project stanbol by apache.
the class OpenNlpChunkingEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
AnalysedText at = getAnalysedText(this, ci, true);
String language = getLanguage(this, ci, true);
isLangaugeConfigured(this, languageConfiguration, language, true);
ChunkerME chunker = initChunker(language);
if (chunker == null) {
return;
}
//init the Phrase TagSet
TagSet<PhraseTag> tagSet = tagSetRegistry.getTagSet(language);
if (tagSet == null) {
log.warn("No Phrase TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
//for now only created to avoid checks for tagSet == null
//TODO: in future we might want to automatically create posModels based
//on tagged texts. However this makes no sense as long we can not
//persist TagSets.
tagSet = new TagSet<PhraseTag>("dummy", language);
}
//holds PhraseTags created for chunk tags that were not part of the tagSet
//(will hold all PhraseTags in case no TagSet is registered)
Map<String, PhraseTag> adhocTags = languageAdhocTags.get(language);
if (adhocTags == null) {
adhocTags = new HashMap<String, PhraseTag>();
languageAdhocTags.put(language, adhocTags);
}
ci.getLock().writeLock().lock();
try {
Iterator<? extends Section> sentences = at.getSentences();
if (!sentences.hasNext()) {
//no sentences ... iterate over the whole text
sentences = Collections.singleton(at).iterator();
}
List<String> tokenTextList = new ArrayList<String>(64);
List<String> posList = new ArrayList<String>(64);
List<Token> tokenList = new ArrayList<Token>(64);
//process each sentence separately
while (sentences.hasNext()) {
// (1) get Tokens and POS information for the sentence
Section sentence = sentences.next();
Iterator<Token> tokens = sentence.getTokens();
while (tokens.hasNext()) {
Token token = tokens.next();
tokenList.add(token);
tokenTextList.add(token.getSpan());
Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
if (posValue == null) {
throw new EngineException("Missing POS value for Token '" + token.getSpan() + "' of ContentItem " + ci.getUri() + "(Sentence: '" + sentence.getSpan() + "'). This may " + "indicate that a POS tagging Engine is missing in " + "the EnhancementChain or that the used POS tagging " + "does not provide POS tags for each token!");
} else {
posList.add(posValue.value().getTag());
}
}
String[] tokenStrings = tokenTextList.toArray(new String[tokenTextList.size()]);
String[] tokenPos = posList.toArray(new String[posList.size()]);
if (log.isTraceEnabled()) {
log.trace("Tokens: {}" + Arrays.toString(tokenStrings));
}
//free memory
tokenTextList.clear();
//free memory
posList.clear();
// (2) Chunk the sentence
String[] chunkTags = chunker.chunk(tokenStrings, tokenPos);
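//ChunkerME#probs() returns the per-token probabilities of the last decoded sequence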
double[] chunkProb = chunker.probs();
if (log.isTraceEnabled()) {
log.trace("Chunks: {}" + Arrays.toString(chunkTags));
}
//free memory
tokenStrings = null;
//free memory
tokenPos = null;
// (3) Process the results and write the Annotations
double chunkProbSum = 0;
int chunkTokenCount = 0;
PhraseTag tag = null;
int i;
/*
* This assumes:
* - 'B-{tag}' ... for start of a new chunk
* - anything else (e.g. 'I-{tag}') ... continues the current chunk
* - 'O' ... no chunk (ends current chunk)
*/
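// e.g. ['B-NP','I-NP','B-VP','O'] yields a two-token NP chunk and a one-token VP chunk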
for (i = 0; i < tokenList.size(); i++) {
boolean start = chunkTags[i].charAt(0) == 'B';
boolean end = tag != null && (start || chunkTags[i].charAt(0) == 'O');
if (end) {
//add the current phrase
//add at AnalysedText level, because offsets are absolute
//NOTE we are already at the next token when we detect the end
Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProbSum / (double) chunkTokenCount));
//reset the state
tag = null;
chunkTokenCount = 0;
chunkProbSum = 0;
}
if (start) {
//create the new tag
//strip the 'B-' prefix to get the phrase tag name
tag = getPhraseTag(tagSet, adhocTags, chunkTags[i].substring(2), language);
}
if (tag != null) {
//count this token for the current chunk
chunkProbSum = chunkProbSum + chunkProb[i];
chunkTokenCount++;
}
}
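//write a chunk that extends to the very end of the token sequence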
if (tag != null) {
Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProbSum / (double) chunkTokenCount));
}
// (4) clean up
tokenList.clear();
}
} finally {
ci.getLock().writeLock().unlock();
}
if (log.isTraceEnabled()) {
logChunks(at);
}
}
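The getPhraseTag(..) helper called in step (3) above is not part of this excerpt. A minimal sketch of what it plausibly does, assuming TagSet#getTag(String) exists for the registry lookup; the adhoc-tag handling follows the code above:
private PhraseTag getPhraseTag(TagSet<PhraseTag> tagSet, Map<String, PhraseTag> adhocTags, String tag, String language) {
    //first consult the registered TagSet, then the adhoc tags created so far
    PhraseTag phraseTag = tagSet.getTag(tag);
    if (phraseTag == null) {
        phraseTag = adhocTags.get(tag);
    }
    if (phraseTag == null) {
        //unknown chunk tag: create a PhraseTag without a LexicalCategory and cache it
        LexicalCategory lc = null;
        phraseTag = new PhraseTag(tag, lc);
        adhocTags.put(tag, phraseTag);
        log.info("Encountered unregistered phrase tag '{}' for language '{}'", tag, language);
    }
    return phraseTag;
}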
use of org.apache.stanbol.enhancer.nlp.phrase.PhraseTag in project stanbol by apache.
the class AnalyzedTextSerializerAndParserTest method setup.
@BeforeClass
public static final void setup() throws IOException {
ci = ciFactory.createContentItem(new StringSource(text));
textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
analysedTextWithData = createAnalysedText();
int sentence = text.indexOf('.') + 1;
Sentence sent1 = analysedTextWithData.addSentence(0, sentence);
expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " + "cities such as Paris and people such as Bob Marley.");
Token the = sent1.addToken(0, 3);
expectedTokens.put(the, "The");
the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PREP", Pos.Preposition), 0.85));
Token stanbol = sent1.addToken(4, 11);
expectedTokens.put(stanbol, "Stanbol");
stanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
stanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(0.5));
//use index to create Tokens
int enhancerStart = sent1.getSpan().indexOf("enhancer");
Token enhancer = sent1.addToken(enhancerStart, enhancerStart + "enhancer".length());
expectedTokens.put(enhancer, "enhancer");
enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("N", LexicalCategory.Noun), 0.87));
MorphoFeatures morpho = new MorphoFeatures("enhance");
morpho.addCase(new CaseTag("test-case-1", Case.Comitative));
morpho.addCase(new CaseTag("test-case-2", Case.Abessive));
morpho.addDefinitness(Definitness.Definite);
morpho.addPerson(Person.First);
morpho.addPos(new PosTag("PN", Pos.ProperNoun));
morpho.addGender(new GenderTag("test-gender", Gender.Masculine));
morpho.addNumber(new NumberTag("test-number", NumberFeature.Plural));
morpho.addTense(new TenseTag("test-tense", Tense.Present));
morpho.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
enhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(morpho));
//create a chunk
Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
stanbolEnhancer.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("organization", DBPEDIA_ORGANISATION)));
stanbolEnhancer.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("NP", LexicalCategory.Noun), 0.98));
}
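For reference, the annotations written in this setup can be read back through getAnnotation(..), using the same accessors the serializer/parser assertions rely on; a short sketch:
//read the phrase annotation back from the chunk created above
Value<PhraseTag> phrase = stanbolEnhancer.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
Assert.assertEquals("NP", phrase.value().getTag());
Assert.assertEquals(LexicalCategory.Noun, phrase.value().getCategory());
Assert.assertEquals(0.98, phrase.probability(), 0.0);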
use of org.apache.stanbol.enhancer.nlp.phrase.PhraseTag in project stanbol by apache.
the class PhraseTagSupport method parse.
@Override
public PhraseTag parse(ObjectNode jValue, AnalysedText at) {
JsonNode tag = jValue.path("tag");
if (!tag.isTextual()) {
throw new IllegalStateException("Unable to parse PhraseTag. The value of the " + "'tag' field MUST have a textual value (json: " + jValue + ")");
}
JsonNode jCat = jValue.path("lc");
LexicalCategory lc = null;
if (jCat.isTextual()) {
try {
lc = LexicalCategory.valueOf(jCat.getTextValue());
} catch (IllegalArgumentException e) {
log.warn("Unable to parse category for PhraseTag from '" + jCat.getTextValue() + "' (will create with tag only)!", e);
}
} else if (jCat.isInt()) {
lc = LexicalCategory.values()[jCat.getIntValue()];
} else if (!jCat.isMissingNode()) {
log.warn("Unable to parse category for PhraseTag from " + jCat + "(will create with tag only)");
}
return new PhraseTag(tag.getTextValue(), lc);
}
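For the reverse direction, the JSON consumed by this parser has a textual "tag" field and an optional "lc" field holding either the LexicalCategory name or its ordinal. A sketch of the matching serialization, assuming a Jackson ObjectMapper instance named mapper:
//builds the JSON consumed by parse(..), e.g. {"tag":"NP","lc":"Noun"}
ObjectNode jTag = mapper.createObjectNode();
jTag.put("tag", phraseTag.getTag());
if (phraseTag.getCategory() != null) {
    //the parser also accepts the LexicalCategory ordinal as an int here
    jTag.put("lc", phraseTag.getCategory().name());
}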
use of org.apache.stanbol.enhancer.nlp.phrase.PhraseTag in project stanbol by apache.
the class EntityCoReferenceEngineTest method testSpatialCoref.
@Test
public void testSpatialCoref() throws EngineException, IOException {
ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
Graph graph = ci.getMetadata();
IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, engine);
graph.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl("en")));
graph.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
graph.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
AnalysedText at = atFactory.createAnalysedText(ci, textBlob.getValue());
Sentence sentence1 = at.addSentence(0, SPATIAL_SENTENCE_1.indexOf(".") + 1);
Chunk angelaMerkel = sentence1.addChunk(0, "Angela Merkel".length());
angelaMerkel.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
Sentence sentence2 = at.addSentence(SPATIAL_SENTENCE_1.indexOf(".") + 1, SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
int theStartIdx = sentence2.getSpan().indexOf("The");
int germanStartIdx = sentence2.getSpan().indexOf("German");
int chancellorStartIdx = sentence2.getSpan().indexOf("politician");
Token the = sentence2.addToken(theStartIdx, theStartIdx + "The".length());
the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
Token german = sentence2.addToken(germanStartIdx, germanStartIdx + "German".length());
german.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("German", LexicalCategory.Adjective)));
Token politician = sentence2.addToken(chancellorStartIdx, chancellorStartIdx + "politician".length());
politician.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("politician", LexicalCategory.Noun)));
Chunk theGermanChancellor = sentence2.addChunk(theStartIdx, chancellorStartIdx + "politician".length());
theGermanChancellor.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
engine.computeEnhancements(ci);
Value<CorefFeature> representativeCorefValue = angelaMerkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
Assert.assertNotNull(representativeCorefValue);
CorefFeature representativeCoref = representativeCorefValue.value();
Assert.assertTrue(representativeCoref.isRepresentative());
Assert.assertTrue(representativeCoref.getMentions().contains(theGermanChancellor));
Value<CorefFeature> subordinateCorefValue = theGermanChancellor.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
Assert.assertNotNull(subordinateCorefValue);
CorefFeature subordinateCoref = subordinateCorefValue.value();
Assert.assertTrue(!subordinateCoref.isRepresentative());
Assert.assertTrue(subordinateCoref.getMentions().contains(angelaMerkel));
}
use of org.apache.stanbol.enhancer.nlp.phrase.PhraseTag in project stanbol by apache.
the class NIFHelper method writePhrase.
/**
* Writes a {@link NlpAnnotations#PHRASE_ANNOTATION} as NIF 1.0 to the
* given RDF graph, using the segmentUri as subject.
* @param graph the graph to write the triples to
* @param annotated the annotated element (e.g. a {@link Chunk})
* @param segmentUri the URI of the resource representing the annotated
* element in the graph
*/
public static void writePhrase(Graph graph, Annotated annotated, IRI segmentUri) {
Value<PhraseTag> phraseTag = annotated.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
if (phraseTag != null) {
IRI phraseTypeUri = LEXICAL_TYPE_TO_PHRASE_TYPE.get(phraseTag.value().getCategory());
if (phraseTypeUri != null) {
//add the oliaLink for the Phrase
graph.add(new TripleImpl(segmentUri, SsoOntology.oliaLink.getUri(), phraseTypeUri));
graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE, lf.createTypedLiteral(phraseTag.probability())));
}
}
}
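A typical caller iterates over the chunks of an AnalysedText and invokes writePhrase once per chunk; a sketch, where buildSegmentUri(..) is a hypothetical stand-in for the NIF fragment URI construction:
Iterator<Chunk> chunks = at.getChunks();
while (chunks.hasNext()) {
    Chunk chunk = chunks.next();
    //buildSegmentUri(..) is a placeholder for the NIF segment URI scheme
    IRI segmentUri = buildSegmentUri(base, chunk.getStart(), chunk.getEnd());
    NIFHelper.writePhrase(graph, chunk, segmentUri);
}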