use of org.apache.stanbol.enhancer.nlp.model.Section in project stanbol by apache.
the class OpenNlpPosTaggingEngine method detectSentences.
/**
 * Detects sentences in the given text and registers each of them on it.
 * <p>
 * The sentence detector is looked up for the given language via
 * {@code getSentenceDetector(language)}. Every span it reports for
 * {@code at.getSpan()} is added to {@code at} with {@code addSentence(start, end)}
 * and collected in the returned list, in the order reported by the detector.
 *
 * @param at the analysed text to split; it is modified by adding the sentences
 * @param language the language used to look up the sentence detector
 * @return the sentences added to {@code at}, or {@code null} if no sentence
 *         detector is available for the language
 */
private List<Section> detectSentences(AnalysedText at, String language) {
    SentenceDetector detector = getSentenceDetector(language);
    if (detector == null) {
        // no detector for this language: callers get null, not an empty list
        return null;
    }
    List<Section> detected = new ArrayList<Section>();
    for (opennlp.tools.util.Span boundary : detector.sentPosDetect(at.getSpan())) {
        int from = boundary.getStart();
        int to = boundary.getEnd();
        Sentence added = at.addSentence(from, to);
        log.trace(" > add {}", added);
        detected.add(added);
    }
    return detected;
}
use of org.apache.stanbol.enhancer.nlp.model.Section in project stanbol by apache.
the class OpenNlpTokenizerEngine method computeEnhancements.
/**
 * Tokenizes the text of the supplied ContentItem.
 * <p/>
 * The {@link AnalysedText} of the content item is obtained via
 * {@code initAnalysedText(..)} and the tokenizer is looked up for the language
 * returned by {@code getLanguage(..)}. If sentences are already annotated on the
 * analysed text, each sentence is tokenized on its own; otherwise the whole
 * text is tokenized as a single section. Every token span reported by the
 * tokenizer is added to its section with {@code addToken(start, end)}.
 * <p/>
 * If no tokenizer is available for the language a warning is logged and the
 * method returns without adding any tokens.
 *
 * @param ci the content item to process
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 * if the underlying process failed to work as
 * expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, true);
    Tokenizer tokenizer = getTokenizer(language);
    if (tokenizer == null) {
        // pass the language so the '{}' placeholder in the message is actually filled
        log.warn("Tokenizer for language {} is no longer available. " + "This might happen if the model becomes unavailable during enhancement. " + "If this happens more often it might also indicate an bug in the used " + "EnhancementJobManager implementation as the availability is also checked " + "in the canEnhance(..) method of this Enhancement Engine.", language);
        return;
    }
    //Try to use sentences for tokenizing
    Iterator<? extends Section> sections = at.getSentences();
    if (!sections.hasNext()) {
        //if no sentences are annotated fall back to the whole text
        sections = Collections.singleton(at).iterator();
    }
    //for all sentences (or the whole Text - if no sentences available)
    while (sections.hasNext()) {
        Section section = sections.next();
        //Tokenize section
        opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(section.getSpan());
        for (opennlp.tools.util.Span tokenSpan : tokenSpans) {
            Token token = section.addToken(tokenSpan.getStart(), tokenSpan.getEnd());
            log.trace(" > add {}", token);
        }
    }
}
use of org.apache.stanbol.enhancer.nlp.model.Section in project stanbol by apache.
the class EntityCoReferenceEngine method extractNersAndNounPhrases.
/**
 * Collects the NER chunks and the noun phrases of the given content item.
 * <p>
 * The analysed text is walked sentence by sentence; if it has no sentences the
 * whole text is handled as one single section. Sections are numbered starting
 * at 1. Within a section every enclosed chunk carrying an NER annotation is
 * recorded as NER, and every chunk whose phrase annotation has the category
 * {@code LexicalCategory.Noun} becomes a {@link NounPhrase}. Each noun phrase is
 * then given the tokens and the NER chunks of the same section for which
 * {@code containsSpan(..)} returns true.
 *
 * @param ci the content item whose analysed text is read
 * @param ners output map from section number to the NER chunks found in that
 *        section; sections without NER chunks get no entry
 * @param nounPhrases output list to which the noun phrases of all sections
 *        are appended
 */
private void extractNersAndNounPhrases(ContentItem ci, Map<Integer, List<Span>> ners, List<NounPhrase> nounPhrases) {
    AnalysedText analysedText = NlpEngineHelper.getAnalysedText(this, ci, true);
    Iterator<? extends Section> sectionIt = analysedText.getSentences();
    if (!sectionIt.hasNext()) {
        // no sentence annotations: treat the complete text as one section
        sectionIt = Collections.singleton(analysedText).iterator();
    }
    for (int sectionNo = 1; sectionIt.hasNext(); sectionNo++) {
        Section current = sectionIt.next();
        List<NounPhrase> phrasesInSection = new ArrayList<NounPhrase>();
        List<Span> nersInSection = new ArrayList<Span>();

        // first pass: classify the chunks of this section
        for (Iterator<Span> chunkIt = current.getEnclosed(EnumSet.of(SpanTypeEnum.Chunk)); chunkIt.hasNext();) {
            Span candidate = chunkIt.next();
            Value<NerTag> nerValue = candidate.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            if (nerValue != null) {
                nersInSection.add(candidate);
            }
            Value<PhraseTag> phraseValue = candidate.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
            if (phraseValue == null || phraseValue.value().getCategory() != LexicalCategory.Noun) {
                continue;
            }
            phrasesInSection.add(new NounPhrase(candidate, sectionNo));
        }

        // second pass: attach contained tokens and NER chunks to each noun phrase
        for (NounPhrase phrase : phrasesInSection) {
            for (Iterator<Span> tokenIt = current.getEnclosed(EnumSet.of(SpanTypeEnum.Token)); tokenIt.hasNext();) {
                Span tokenSpan = tokenIt.next();
                if (phrase.containsSpan(tokenSpan)) {
                    phrase.addToken(tokenSpan);
                }
            }
            for (Span nerChunk : nersInSection) {
                if (phrase.containsSpan(nerChunk)) {
                    phrase.addNerChunk(nerChunk);
                }
            }
        }

        nounPhrases.addAll(phrasesInSection);
        if (!nersInSection.isEmpty()) {
            ners.put(sectionNo, nersInSection);
        }
    }
}
Aggregations