Search in sources :

Example 1 with MorphoFeatures

Usage of org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures in the Apache Stanbol project.

From the class CeliAnalyzedTextLemmatizerEngineTest, method testEngineDe.

@Test
public void testEngineDe() throws IOException, EngineException {
    ContentItem contentItem = ciFactory.createContentItem(new StringSource(de_text));
    Assert.assertNotNull(contentItem);
    AnalysedText analysedText = atFactory.createAnalysedText(contentItem, contentItem.getBlob());
    Assert.assertNotNull(analysedText);
    contentItem.getMetadata().add(new TripleImpl(contentItem.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
    Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(contentItem));
    // Pre-seed the AnalysedText with POS-annotated tokens so the lemmatizer
    // can reuse the existing POS annotations instead of computing its own.
    Token verbToken = analysedText.addToken(de_verbStart, de_verbStart + de_verb.length());
    verbToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb), de_verbProb));
    Token adjectiveToken = analysedText.addToken(de_adjectiveStart, de_adjectiveStart + de_adjective.length());
    adjectiveToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("ADJ", LexicalCategory.Adjective), de_adjectiveProb));
    Token nounToken = analysedText.addToken(de_nounStart, de_nounStart + de_noun.length());
    nounToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NC", LexicalCategory.Noun), de_nounProb));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
    // compute the enhancements
    try {
        engine.computeEnhancements(contentItem);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        // remote service unavailable -> skip the rest of this test
        return;
    }
    // Validate the enhancements: each of the three seeded tokens must have
    // received at least one MorphoFeatures annotation providing a lemma.
    boolean verbAnnotated = false;
    boolean adjectiveAnnotated = false;
    boolean nounAnnotated = false;
    Iterator<Token> tokenIt = analysedText.getTokens();
    while (tokenIt.hasNext()) {
        Token token = tokenIt.next();
        log.info("Token: {}", token);
        List<Value<MorphoFeatures>> morphoAnnotations = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        String span = token.getSpan();
        if (de_verb.equals(span)) {
            verbAnnotated = !morphoAnnotations.isEmpty();
            validateMorphFeatureProbability(morphoAnnotations, LexicalCategory.Verb, de_verbProb);
        } else if (de_adjective.equals(span)) {
            adjectiveAnnotated = !morphoAnnotations.isEmpty();
            validateMorphFeatureProbability(morphoAnnotations, LexicalCategory.Adjective, de_adjectiveProb);
        } else if (de_noun.equals(span)) {
            nounAnnotated = !morphoAnnotations.isEmpty();
            validateMorphFeatureProbability(morphoAnnotations, LexicalCategory.Noun, de_nounProb);
        }
        // every MorphoFeatures annotation is expected to carry a lemma
        for (Value<MorphoFeatures> morphoAnnotation : morphoAnnotations) {
            log.info("  - {}", morphoAnnotation);
            Assert.assertNotNull(morphoAnnotation.value().getLemma());
        }
    }
    Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", verbAnnotated);
    Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", adjectiveAnnotated);
    Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", nounAnnotated);
}
Also used : PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Value(org.apache.stanbol.enhancer.nlp.model.annotation.Value) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 2 with MorphoFeatures

Usage of org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures in the Apache Stanbol project.

From the class CeliAnalyzedTextLemmatizerEngine, method computeEnhancements.

/**
 * Computes lemma annotations for the parsed ContentItem by calling the remote
 * CELI morphological analysis service with the text of the AnalysedText
 * content part and adding the returned readings as {@link MorphoFeatures}
 * annotations to the affected Tokens.
 * <p>
 * Probabilities of POS annotations already present on a Token are reused to
 * score the morpho readings: a reading whose lexical category matches an
 * existing POS annotation inherits that annotation's probability.
 *
 * @param ci the content item to enhance; MUST provide an AnalysedText content
 *            part and a language supported by the engine configuration
 * @throws EngineException if the CELI service can not be called or the SOAP
 *            request/response can not be encoded/decoded
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfig, language, true);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error while encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    // Probability per LexicalCategory collected from the POS annotations of
    // the processed tokens (used to score the morpho readings below).
    // NOTE(review): this map is shared over all terms and never cleared, so
    // probabilities collected for earlier tokens also apply to later ones —
    // confirm this cross-token reuse is intended.
    Map<LexicalCategory, Double> tokenLexCats = new EnumMap<LexicalCategory, Double>(LexicalCategory.class);
    for (LexicalEntry term : terms) {
        if (term.getTermReadings().isEmpty()) {
            // ignore terms without readings
            continue;
        }
        // Add the LexicalEntry as Token to the Text. NOTE that if a
        // Token with the same start/end positions already exist this
        // Method returns the existing instance
        Token token = at.addToken(term.getFrom(), term.getTo());
        // Now try to get POS annotations for the Token
        for (Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)) {
            if (posAnno.value().isMapped()) {
                for (LexicalCategory cat : posAnno.value().getCategories()) {
                    if (!tokenLexCats.containsKey(cat)) {
                        // do not override with a lower probability (assumes
                        // annotations are sorted by descending probability)
                        tokenLexCats.put(cat, posAnno.probability());
                    }
                }
            }
        }
        for (Reading reading : term.getTermReadings()) {
            MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
            // add the readings (MorphoFeatures)
            if (mf != null) {
                // use the POS tags of the morpho analysis and compare it
                // with existing POS tags.
                double posProbability = -1;
                Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                for (PosTag mfPos : mf.getPosList()) {
                    mfCats.addAll(mfPos.getCategories());
                }
                // take the highest probability over all matching categories
                for (LexicalCategory mfCat : mfCats) {
                    Double prob = tokenLexCats.get(mfCat);
                    if (prob != null && posProbability < prob) {
                        posProbability = prob;
                    }
                }
                // add the morpho features with the posProbability; fall back
                // to UNKNOWN_PROBABILITY when no POS annotation matched
                Value<MorphoFeatures> value = Value.value(mf, posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
            }
        }
    }
}
Also used : EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) IOException(java.io.IOException) LexicalCategory(org.apache.stanbol.enhancer.nlp.pos.LexicalCategory) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) SOAPException(javax.xml.soap.SOAPException) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) CeliMorphoFeatures(org.apache.stanbol.enhancer.engines.celi.CeliMorphoFeatures) EnumMap(java.util.EnumMap)

Example 3 with MorphoFeatures

Usage of org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures in the Apache Stanbol project.

From the class AnalyzedTextSerializerAndParserTest, method setup.

@BeforeClass
public static final void setup() throws IOException {
    ci = ciFactory.createContentItem(new StringSource(text));
    textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    analysedTextWithData = createAnalysedText();
    // first sentence ends right after the first '.' of the test text
    int sentenceEnd = text.indexOf('.') + 1;
    Sentence firstSentence = analysedTextWithData.addSentence(0, sentenceEnd);
    expectedSentences.put(firstSentence, "The Stanbol enhancer can detect famous " + "cities such as Paris and people such as Bob Marley.");
    // tokens created via explicit start/end offsets
    Token tokenThe = firstSentence.addToken(0, 3);
    expectedTokens.put(tokenThe, "The");
    tokenThe.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PREP", Pos.Preposition), 0.85));
    Token tokenStanbol = firstSentence.addToken(4, 11);
    expectedTokens.put(tokenStanbol, "Stanbol");
    tokenStanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    tokenStanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(0.5));
    // token created via an index lookup into the sentence span
    int enhancerStart = firstSentence.getSpan().indexOf("enhancer");
    Token tokenEnhancer = firstSentence.addToken(enhancerStart, enhancerStart + "enhancer".length());
    expectedTokens.put(tokenEnhancer, "enhancer");
    // two competing POS annotations to exercise multi-value annotations
    tokenEnhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    tokenEnhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("N", LexicalCategory.Noun), 0.87));
    // a MorphoFeatures annotation exercising every supported feature type
    MorphoFeatures morphoFeatures = new MorphoFeatures("enhance");
    morphoFeatures.addCase(new CaseTag("test-case-1", Case.Comitative));
    morphoFeatures.addCase(new CaseTag("test-case-2", Case.Abessive));
    morphoFeatures.addDefinitness(Definitness.Definite);
    morphoFeatures.addPerson(Person.First);
    morphoFeatures.addPos(new PosTag("PN", Pos.ProperNoun));
    morphoFeatures.addGender(new GenderTag("test-gender", Gender.Masculine));
    morphoFeatures.addNumber(new NumberTag("test-number", NumberFeature.Plural));
    morphoFeatures.addTense(new TenseTag("test-tense", Tense.Present));
    morphoFeatures.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
    tokenEnhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(morphoFeatures));
    // a chunk spanning "Stanbol enhancer" with NER and phrase annotations
    Chunk chunkStanbolEnhancer = analysedTextWithData.addChunk(tokenStanbol.getStart(), tokenEnhancer.getEnd());
    expectedChunks.put(chunkStanbolEnhancer, "Stanbol enhancer");
    chunkStanbolEnhancer.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("organization", DBPEDIA_ORGANISATION)));
    chunkStanbolEnhancer.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("NP", LexicalCategory.Noun), 0.98));
}
Also used : CaseTag(org.apache.stanbol.enhancer.nlp.morpho.CaseTag) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) Token(org.apache.stanbol.enhancer.nlp.model.Token) VerbMoodTag(org.apache.stanbol.enhancer.nlp.morpho.VerbMoodTag) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) NumberTag(org.apache.stanbol.enhancer.nlp.morpho.NumberTag) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TenseTag(org.apache.stanbol.enhancer.nlp.morpho.TenseTag) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) GenderTag(org.apache.stanbol.enhancer.nlp.morpho.GenderTag) BeforeClass(org.junit.BeforeClass)

Example 4 with MorphoFeatures

Usage of org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures in the Apache Stanbol project.

From the class MorphoFeaturesSupport, method parse.

/**
 * Parses a {@link MorphoFeatures} instance from its JSON representation.
 * <p>
 * The JSON object MUST contain a textual "lemma" field. The optional fields
 * "case", "gender", "number", "pos", "tense" and "verb-mood" are expected to
 * hold arrays of objects with a textual "tag" and an optional "type" (an enum
 * name); "definitness" and "person" hold plain enum values. Array members
 * that can not be parsed are logged as warnings and skipped; only the first
 * "type" value of each member is used.
 *
 * @param jMorpho the JSON object to parse
 * @param at the AnalysedText the parsed annotation belongs to (forwarded to
 *            the PosTag parser)
 * @return the parsed MorphoFeatures (never <code>null</code>)
 * @throws IllegalStateException if the required "lemma" field is missing or
 *            not a String
 */
@Override
public MorphoFeatures parse(ObjectNode jMorpho, AnalysedText at) {
    JsonNode jLemma = jMorpho.path("lemma");
    if (!jLemma.isTextual()) {
        throw new IllegalStateException("Field 'lemma' MUST provide a String value (parsed JSON: " + jMorpho + ")");
    }
    MorphoFeatures morpho = new MorphoFeatures(jLemma.asText());
    // case tags
    JsonNode node = jMorpho.path("case");
    if (node.isArray()) {
        ArrayNode jCases = (ArrayNode) node;
        for (int i = 0; i < jCases.size(); i++) {
            JsonNode member = jCases.get(i);
            if (member.isObject()) {
                ObjectNode jCase = (ObjectNode) member;
                JsonNode tag = jCase.path("tag");
                if (tag.isTextual()) {
                    EnumSet<Case> type = JsonUtils.parseEnum(jCase, "type", Case.class);
                    if (type.isEmpty()) {
                        morpho.addCase(new CaseTag(tag.getTextValue()));
                    } else {
                        morpho.addCase(new CaseTag(tag.getTextValue(), type.iterator().next()));
                    }
                } else {
                    log.warn("Unable to parse CaseTag because 'tag' value is " + "missing or is not a String (json: " + jCase.toString() + ")");
                }
            } else {
                log.warn("Unable to parse CaseTag from " + member.toString());
            }
        }
    } else if (!node.isMissingNode()) {
        log.warn("Unable to parse CaseTags (Json Array expected as value for field 'case' but was " + node + ")");
    }
    // definitness (plain enum values)
    if (jMorpho.has("definitness")) {
        for (Definitness d : JsonUtils.parseEnum(jMorpho, "definitness", Definitness.class)) {
            morpho.addDefinitness(d);
        }
    }
    // gender tags
    node = jMorpho.path("gender");
    if (node.isArray()) {
        ArrayNode jGenders = (ArrayNode) node;
        for (int i = 0; i < jGenders.size(); i++) {
            JsonNode member = jGenders.get(i);
            if (member.isObject()) {
                ObjectNode jGender = (ObjectNode) member;
                JsonNode tag = jGender.path("tag");
                if (tag.isTextual()) {
                    EnumSet<Gender> type = JsonUtils.parseEnum(jGender, "type", Gender.class);
                    if (type.isEmpty()) {
                        morpho.addGender(new GenderTag(tag.getTextValue()));
                    } else {
                        morpho.addGender(new GenderTag(tag.getTextValue(), type.iterator().next()));
                    }
                } else {
                    log.warn("Unable to parse GenderTag because 'tag' value is " + "missing or is not a String (json: " + jGender.toString() + ")");
                }
            } else {
                log.warn("Unable to parse GenderTag from " + member.toString());
            }
        }
    } else if (!node.isMissingNode()) {
        log.warn("Unable to parse GenderTag (Json Array expected as value for field 'gender' but was " + node + ")");
    }
    // number tags
    node = jMorpho.path("number");
    if (node.isArray()) {
        ArrayNode jNumbers = (ArrayNode) node;
        for (int i = 0; i < jNumbers.size(); i++) {
            JsonNode member = jNumbers.get(i);
            if (member.isObject()) {
                ObjectNode jNumber = (ObjectNode) member;
                JsonNode tag = jNumber.path("tag");
                if (tag.isTextual()) {
                    EnumSet<NumberFeature> type = JsonUtils.parseEnum(jNumber, "type", NumberFeature.class);
                    if (type.isEmpty()) {
                        morpho.addNumber(new NumberTag(tag.getTextValue()));
                    } else {
                        morpho.addNumber(new NumberTag(tag.getTextValue(), type.iterator().next()));
                    }
                } else {
                    log.warn("Unable to parse NumberTag because 'tag' value is " + "missing or is not a String (json: " + jNumber.toString() + ")");
                }
            } else {
                log.warn("Unable to parse NumberTag from " + member.toString());
            }
        }
    } else if (!node.isMissingNode()) {
        log.warn("Unable to parse NumberTag (Json Array expected as value for field 'number' but was " + node + ")");
    }
    // person (plain enum values)
    if (jMorpho.has("person")) {
        for (Person p : JsonUtils.parseEnum(jMorpho, "person", Person.class)) {
            morpho.addPerson(p);
        }
    }
    // POS tags (delegated to the PosTag parser)
    node = jMorpho.path("pos");
    if (node.isArray()) {
        ArrayNode jPosTags = (ArrayNode) node;
        for (int i = 0; i < jPosTags.size(); i++) {
            JsonNode member = jPosTags.get(i);
            if (member.isObject()) {
                ObjectNode jPosTag = (ObjectNode) member;
                morpho.addPos(getPosTagParser().parse(jPosTag, at));
            } else {
                log.warn("Unable to parse PosTag from " + member.toString());
            }
        }
    } else if (!node.isMissingNode()) {
        log.warn("Unable to parse PosTag (Json Array expected as value for field 'pos' but was " + node + ")");
    }
    // tense tags
    node = jMorpho.path("tense");
    if (node.isArray()) {
        ArrayNode jTenses = (ArrayNode) node;
        for (int i = 0; i < jTenses.size(); i++) {
            JsonNode member = jTenses.get(i);
            if (member.isObject()) {
                ObjectNode jTense = (ObjectNode) member;
                JsonNode tag = jTense.path("tag");
                if (tag.isTextual()) {
                    EnumSet<Tense> type = JsonUtils.parseEnum(jTense, "type", Tense.class);
                    if (type.isEmpty()) {
                        morpho.addTense(new TenseTag(tag.getTextValue()));
                    } else {
                        morpho.addTense(new TenseTag(tag.getTextValue(), type.iterator().next()));
                    }
                } else {
                    log.warn("Unable to parse TenseTag because 'tag' value is " + "missing or is not a String (json: " + jTense.toString() + ")");
                }
            } else {
                log.warn("Unable to parse TenseTag from " + member.toString());
            }
        }
    } else if (!node.isMissingNode()) {
        log.warn("Unable to parse TenseTag (Json Array expected as value for field 'tense' but was " + node + ")");
    }
    // verb mood tags
    node = jMorpho.path("verb-mood");
    if (node.isArray()) {
        ArrayNode jVerbMoods = (ArrayNode) node;
        for (int i = 0; i < jVerbMoods.size(); i++) {
            JsonNode member = jVerbMoods.get(i);
            if (member.isObject()) {
                ObjectNode jVerbMood = (ObjectNode) member;
                JsonNode tag = jVerbMood.path("tag");
                if (tag.isTextual()) {
                    EnumSet<VerbMood> type = JsonUtils.parseEnum(jVerbMood, "type", VerbMood.class);
                    if (type.isEmpty()) {
                        morpho.addVerbForm(new VerbMoodTag(tag.getTextValue()));
                    } else {
                        morpho.addVerbForm(new VerbMoodTag(tag.getTextValue(), type.iterator().next()));
                    }
                } else {
                    log.warn("Unable to parse VerbMoodTag because 'tag' value is " + "missing or is not a String (json: " + jVerbMood.toString() + ")");
                }
            } else {
                log.warn("Unable to parse VerbMoodTag from " + member.toString());
            }
        }
    } else if (!node.isMissingNode()) {
        log.warn("Unable to parse VerbMoodTag (Json Array expected as value for field 'verb-mood' but was " + node + ")");
    }
    return morpho;
}
Also used : Tense(org.apache.stanbol.enhancer.nlp.morpho.Tense) CaseTag(org.apache.stanbol.enhancer.nlp.morpho.CaseTag) ObjectNode(org.codehaus.jackson.node.ObjectNode) Definitness(org.apache.stanbol.enhancer.nlp.morpho.Definitness) JsonNode(org.codehaus.jackson.JsonNode) VerbMoodTag(org.apache.stanbol.enhancer.nlp.morpho.VerbMoodTag) Gender(org.apache.stanbol.enhancer.nlp.morpho.Gender) Case(org.apache.stanbol.enhancer.nlp.morpho.Case) NumberFeature(org.apache.stanbol.enhancer.nlp.morpho.NumberFeature) NumberTag(org.apache.stanbol.enhancer.nlp.morpho.NumberTag) VerbMood(org.apache.stanbol.enhancer.nlp.morpho.VerbMood) ArrayNode(org.codehaus.jackson.node.ArrayNode) TenseTag(org.apache.stanbol.enhancer.nlp.morpho.TenseTag) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) Person(org.apache.stanbol.enhancer.nlp.morpho.Person) GenderTag(org.apache.stanbol.enhancer.nlp.morpho.GenderTag)

Example 5 with MorphoFeatures

Usage of org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures in the Apache Stanbol project.

From the class KuromojiNlpEngine, method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
        throw new IllegalStateException("The detected language is NOT 'ja'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
    }
    // start with the Tokenizer
    TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
    // build the analyzing chain by adding all TokenFilters
    for (TokenFilterFactory filterFactory : filterFactories) {
        tokenStream = filterFactory.create(tokenStream);
    }
    // Attribute instances are shared for the whole stream, so obtain them
    // once up front instead of re-adding them on every token (addAttribute
    // returns the same instance for the same attribute class).
    OffsetAttribute offset = tokenStream.addAttribute(OffsetAttribute.class);
    PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
    BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
    // NOTE(review): the original code also accessed the InflectionAttribute
    // but discarded the returned values; inflection information is currently
    // not mapped to any annotation.
    // Try to extract sentences based on POS tags ...
    int sentStartOffset = -1;
    // NER data
    List<NerData> nerList = new ArrayList<NerData>();
    // the next index where the NerData.context need to be set
    int nerSentIndex = 0;
    NerData ner = null;
    try {
        // required with Solr 4
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            Token token = at.addToken(offset.startOffset(), offset.endOffset());
            // Get the POS attribute and init the PosTag (falling back to an
            // ad-hoc tag for POS strings missing in the mapped tag set)
            PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (posTag == null) {
                posTag = adhocTags.get(posAttr.getPartOfSpeech());
                if (posTag == null) {
                    posTag = new PosTag(posAttr.getPartOfSpeech());
                    adhocTags.put(posAttr.getPartOfSpeech(), posTag);
                    log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
                }
            }
            // Sentence detection by POS tag
            if (sentStartOffset < 0) {
                // the last token was a sentence ending
                sentStartOffset = offset.startOffset();
            }
            if (posTag.hasPos(Pos.Point)) {
                Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                // add the sentence as context to the NerData instances
                while (nerSentIndex < nerList.size()) {
                    nerList.get(nerSentIndex).context = sent.getSpan();
                    nerSentIndex++;
                }
                sentStartOffset = -1;
            }
            // POS
            token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
            // NER: close the current named entity when the tag changes
            NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
                // write NER annotation
                Chunk chunk = at.addChunk(ner.start, ner.end);
                chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                // NOTE that the fise:TextAnnotation are written later based on the nerList
                // clean up
                ner = null;
            }
            if (nerTag != null) {
                if (ner == null) {
                    ner = new NerData(nerTag, offset.startOffset());
                    nerList.add(ner);
                }
                ner.end = offset.endOffset();
            }
            // Lemma: if a base form is provided use it as lemma of a
            // MorphoFeatures annotation together with the POS tag
            if (baseFormAttr.getBaseForm() != null) {
                MorphoFeatures morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                morpho.addPos(posTag);
                token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
            }
        }
        // we still need to write the last sentence (sentStartOffset >= 0
        // implies at least one token was processed)
        Sentence lastSent = null;
        if (sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
            lastSent = at.addSentence(sentStartOffset, offset.endOffset());
        }
        // and set the context off remaining named entities
        while (nerSentIndex < nerList.size()) {
            if (lastSent != null) {
                nerList.get(nerSentIndex).context = lastSent.getSpan();
            } else {
                // no sentence detected
                nerList.get(nerSentIndex).context = at.getSpan();
            }
            nerSentIndex++;
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, "Exception while reading from " + "AnalyzedText contentpart", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
        /* ignore */
        }
    }
    // finally write the NER annotations to the metadata of the ContentItem
    final Graph metadata = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        Language lang = new Language("ja");
        for (NerData nerData : nerList) {
            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
            metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(nerData.context, lang)));
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) IRI(org.apache.clerezza.commons.rdf.IRI) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) BaseFormAttribute(org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) InflectionAttribute(org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) PartOfSpeechAttribute(org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute) IOException(java.io.IOException) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) Graph(org.apache.clerezza.commons.rdf.Graph) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)

Aggregations

MorphoFeatures (org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures)5 Token (org.apache.stanbol.enhancer.nlp.model.Token)4 PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag)4 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)3 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)3 IOException (java.io.IOException)2 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)2 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)2 Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk)2 Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence)2 CaseTag (org.apache.stanbol.enhancer.nlp.morpho.CaseTag)2 GenderTag (org.apache.stanbol.enhancer.nlp.morpho.GenderTag)2 NumberTag (org.apache.stanbol.enhancer.nlp.morpho.NumberTag)2 TenseTag (org.apache.stanbol.enhancer.nlp.morpho.TenseTag)2 VerbMoodTag (org.apache.stanbol.enhancer.nlp.morpho.VerbMoodTag)2 NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag)2 StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource)2 ArrayList (java.util.ArrayList)1 EnumMap (java.util.EnumMap)1 SOAPException (javax.xml.soap.SOAPException)1