Search in sources:

Example 1 with PhraseTag

use of org.apache.stanbol.enhancer.nlp.phrase.PhraseTag in project stanbol by apache.

From the class OpenNlpChunkingEngine, method computeEnhancements.

/**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfiguration, language, true);
    ChunkerME chunker = initChunker(language);
    if (chunker == null) {
        //no chunker model available for this language -> nothing to do
        return;
    }
    //init the Phrase TagSet
    TagSet<PhraseTag> tagSet = tagSetRegistry.getTagSet(language);
    //NOTE: a duplicated, empty 'if (tagSet == null) {}' block was removed here
    if (tagSet == null) {
        log.warn("No Phrase TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
        //for now only created to avoid checks for tagSet == null
        //TODO: in future we might want to automatically create posModels based
        //on tagged texts. However this makes no sense as long we can not
        //persist TagSets.
        tagSet = new TagSet<PhraseTag>("dummy", language);
    }
    //holds PhraseTags created for chunk tags that where not part of the tagSet
    //(will hold all PhraseTags in case the adhoc "dummy" tagSet is used)
    Map<String, PhraseTag> adhocTags = languageAdhocTags.get(language);
    if (adhocTags == null) {
        adhocTags = new HashMap<String, PhraseTag>();
        languageAdhocTags.put(language, adhocTags);
    }
    ci.getLock().writeLock().lock();
    try {
        Iterator<? extends Section> sentences = at.getSentences();
        if (!sentences.hasNext()) {
            //no sentences ... iterate over the whole text
            sentences = Collections.singleton(at).iterator();
        }
        List<String> tokenTextList = new ArrayList<String>(64);
        List<String> posList = new ArrayList<String>(64);
        List<Token> tokenList = new ArrayList<Token>(64);
        //process each sentence separately
        while (sentences.hasNext()) {
            // (1) get Tokens and POS information for the sentence
            Section sentence = sentences.next();
            Iterator<Token> tokens = sentence.getTokens();
            while (tokens.hasNext()) {
                Token token = tokens.next();
                tokenList.add(token);
                tokenTextList.add(token.getSpan());
                Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
                if (posValue == null) {
                    throw new EngineException("Missing POS value for Token '" + token.getSpan() + "' of ContentItem " + ci.getUri() + "(Sentence: '" + sentence.getSpan() + "'). This may " + "indicate that a POS tagging Engine is missing in " + "the EnhancementChain or that the used POS tagging " + "does not provide POS tags for each token!");
                } else {
                    posList.add(posValue.value().getTag());
                }
            }
            String[] tokenStrings = tokenTextList.toArray(new String[tokenTextList.size()]);
            //FIX: size the POS array from posList (the original used
            //tokenTextList.size(); the sizes are equal here, but each list
            //should size its own target array)
            String[] tokenPos = posList.toArray(new String[posList.size()]);
            if (log.isTraceEnabled()) {
                //FIX: pass the value as an SLF4J argument; the original
                //concatenated it to the pattern, leaving '{}' unfilled
                log.trace("Tokens: {}", Arrays.toString(tokenStrings));
            }
            //free memory
            tokenTextList.clear();
            //free memory
            posList.clear();
            // (2) Chunk the sentence
            String[] chunkTags = chunker.chunk(tokenStrings, tokenPos);
            double[] chunkProb = chunker.probs();
            if (log.isTraceEnabled()) {
                log.trace("Chunks: {}", Arrays.toString(chunkTags));
            }
            //free memory
            tokenStrings = null;
            //free memory
            tokenPos = null;
            // (3) Process the results and write the Annotations
            //sum of the chunker probabilities for the tokens of the open chunk
            double chunkProps = 0;
            //number of tokens in the currently open chunk
            int chunkTokenCount = 0;
            //the tag of the currently open chunk (null -> no open chunk)
            PhraseTag tag = null;
            int i;
            /*
             * This assumes:
             *  - 'B-{tag}' ... for start of a new chunk
             *  - '???' ... anything other for continuing the current chunk
             *  - 'O' ... no chunk (ends current chunk)
             */
            for (i = 0; i < tokenList.size(); i++) {
                boolean start = chunkTags[i].charAt(0) == 'B';
                boolean end = tag != null && (start || chunkTags[i].charAt(0) == 'O');
                if (end) {
                    //add the current phrase
                    //add at AnalysedText level, because offsets are absolute
                    //NOTE we are already at the next token when we detect the end
                    Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
                    //probability of the chunk = average over its tokens
                    chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProps / (double) chunkTokenCount));
                    //reset the state
                    tag = null;
                    chunkTokenCount = 0;
                    chunkProps = 0;
                }
                if (start) {
                    //create the new tag
                    tag = getPhraseTag(tagSet, adhocTags, chunkTags[i].substring(2), //skip 'B-'
                    language);
                }
                if (tag != null) {
                    //count this token for the current chunk
                    chunkProps = chunkProps + chunkProb[i];
                    chunkTokenCount++;
                }
            }
            if (tag != null) {
                //write the chunk still open at the end of the token list
                Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
                chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProps / (double) chunkTokenCount));
            }
            // (4) clean up
            tokenList.clear();
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
    if (log.isTraceEnabled()) {
        logChunks(at);
    }
}
Also used : ArrayList(java.util.ArrayList) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) Section(org.apache.stanbol.enhancer.nlp.model.Section) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) ChunkerME(opennlp.tools.chunker.ChunkerME)

Example 2 with PhraseTag

use of org.apache.stanbol.enhancer.nlp.phrase.PhraseTag in project stanbol by apache.

From the class AnalyzedTextSerializerAndParserTest, method setup.

@BeforeClass
public static final void setup() throws IOException {
    // Builds the shared ContentItem / AnalysedText fixture with sentences,
    // tokens, a chunk and the annotations the serializer/parser tests expect.
    ci = ciFactory.createContentItem(new StringSource(text));
    textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    analysedTextWithData = createAnalysedText();
    // the first sentence ends at the first '.' (end offset is exclusive,
    // hence the +1 to include the dot)
    int sentence = text.indexOf('.') + 1;
    Sentence sent1 = analysedTextWithData.addSentence(0, sentence);
    expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " + "cities such as Paris and people such as Bob Marley.");
    // token "The" [0,3) with a single POS annotation
    Token the = sent1.addToken(0, 3);
    expectedTokens.put(the, "The");
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PREP", Pos.Preposition), 0.85));
    // token "Stanbol" [4,11) with POS and sentiment annotations
    Token stanbol = sent1.addToken(4, 11);
    expectedTokens.put(stanbol, "Stanbol");
    stanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    stanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(0.5));
    //use index to create Tokens
    int enhancerStart = sent1.getSpan().indexOf("enhancer");
    Token enhancer = sent1.addToken(enhancerStart, enhancerStart + "enhancer".length());
    expectedTokens.put(enhancer, "enhancer");
    // "enhancer" carries TWO POS annotations (tests multi-valued annotations)
    enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("N", LexicalCategory.Noun), 0.87));
    // ... plus a MorphoFeatures annotation exercising every morpho facet
    // (case, definitness, person, pos, gender, number, tense, verb mood)
    MorphoFeatures morpho = new MorphoFeatures("enhance");
    morpho.addCase(new CaseTag("test-case-1", Case.Comitative));
    morpho.addCase(new CaseTag("test-case-2", Case.Abessive));
    morpho.addDefinitness(Definitness.Definite);
    morpho.addPerson(Person.First);
    morpho.addPos(new PosTag("PN", Pos.ProperNoun));
    morpho.addGender(new GenderTag("test-gender", Gender.Masculine));
    morpho.addNumber(new NumberTag("test-number", NumberFeature.Plural));
    morpho.addTense(new TenseTag("test-tense", Tense.Present));
    morpho.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
    enhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(morpho));
    //create a chunk
    // chunk "Stanbol enhancer" spanning both tokens, with NER and phrase annotations
    Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
    expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
    stanbolEnhancer.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("organization", DBPEDIA_ORGANISATION)));
    stanbolEnhancer.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("NP", LexicalCategory.Noun), 0.98));
}
Also used : CaseTag(org.apache.stanbol.enhancer.nlp.morpho.CaseTag) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) Token(org.apache.stanbol.enhancer.nlp.model.Token) VerbMoodTag(org.apache.stanbol.enhancer.nlp.morpho.VerbMoodTag) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) NumberTag(org.apache.stanbol.enhancer.nlp.morpho.NumberTag) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TenseTag(org.apache.stanbol.enhancer.nlp.morpho.TenseTag) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) GenderTag(org.apache.stanbol.enhancer.nlp.morpho.GenderTag) BeforeClass(org.junit.BeforeClass)

Example 3 with PhraseTag

use of org.apache.stanbol.enhancer.nlp.phrase.PhraseTag in project stanbol by apache.

From the class PhraseTagSupport, method parse.

/**
 * Parses a {@link PhraseTag} from its JSON representation.
 * <p>
 * The mandatory {@code tag} field holds the string tag. The optional
 * {@code lc} field holds the {@link LexicalCategory} either by name
 * (textual) or by ordinal (int); unparseable categories are logged and
 * ignored so the tag is still created.
 *
 * @param jValue the JSON object holding the serialized PhraseTag
 * @param at the AnalysedText (not used for PhraseTags)
 * @return the parsed PhraseTag (never <code>null</code>)
 * @throws IllegalStateException if the 'tag' field is missing or not textual
 */
@Override
public PhraseTag parse(ObjectNode jValue, AnalysedText at) {
    JsonNode tag = jValue.path("tag");
    if (!tag.isTextual()) {
        throw new IllegalStateException("Unable to parse PhraseTag. The value of the " + "'tag' field MUST have a textual value (json: " + jValue + ")");
    }
    JsonNode jCat = jValue.path("lc");
    LexicalCategory lc = null;
    if (jCat.isTextual()) {
        try {
            lc = LexicalCategory.valueOf(jCat.getTextValue());
        } catch (IllegalArgumentException e) {
            log.warn("Unable to parse category for PhraseTag from '" + jCat.getTextValue() + "' (will create with tag only)!", e);
        }
    } else if (jCat.isInt()) {
        //FIX: guard the ordinal lookup; the original indexed values() with
        //the raw int and threw ArrayIndexOutOfBoundsException on invalid
        //data instead of falling back to "tag only" like the other branches
        int ordinal = jCat.getIntValue();
        LexicalCategory[] categories = LexicalCategory.values();
        if (ordinal >= 0 && ordinal < categories.length) {
            lc = categories[ordinal];
        } else {
            log.warn("Unable to parse category for PhraseTag from ordinal " + ordinal + " (will create with tag only)");
        }
    } else if (!jCat.isMissingNode()) {
        log.warn("Unable to parse category for PhraseTag from " + jCat + "(will create with tag only)");
    }
    return new PhraseTag(tag.getTextValue(), lc);
}
Also used : JsonNode(org.codehaus.jackson.JsonNode) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) LexicalCategory(org.apache.stanbol.enhancer.nlp.pos.LexicalCategory)

Example 4 with PhraseTag

use of org.apache.stanbol.enhancer.nlp.phrase.PhraseTag in project stanbol by apache.

From the class EntityCoReferenceEngineTest, method testSpatialCoref.

/**
 * Builds a two-sentence fixture where "Angela Merkel" (a PERSON named
 * entity in sentence 1) is referred to by the noun phrase
 * "The German politician" in sentence 2, runs the engine and asserts a
 * co-reference link in both directions.
 */
@Test
public void testSpatialCoref() throws EngineException, IOException {
    // ContentItem with text-enhancement metadata declaring the language
    ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
    Graph metadata = ci.getMetadata();
    IRI textEnh = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    metadata.add(new TripleImpl(textEnh, DC_LANGUAGE, new PlainLiteralImpl("en")));
    metadata.add(new TripleImpl(textEnh, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
    metadata.add(new TripleImpl(textEnh, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    AnalysedText at = atFactory.createAnalysedText(ci, textBlob.getValue());
    // sentence 1: chunk "Angela Merkel" annotated as a PERSON named entity
    int sentence1End = SPATIAL_SENTENCE_1.indexOf(".") + 1;
    Sentence sentence1 = at.addSentence(0, sentence1End);
    Chunk angelaMerkel = sentence1.addChunk(0, "Angela Merkel".length());
    angelaMerkel.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
    // sentence 2: POS-tagged tokens forming the phrase "The German politician"
    Sentence sentence2 = at.addSentence(sentence1End, SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
    String sent2Span = sentence2.getSpan();
    int theStart = sent2Span.indexOf("The");
    int germanStart = sent2Span.indexOf("German");
    int politicianStart = sent2Span.indexOf("politician");
    Token theToken = sentence2.addToken(theStart, theStart + "The".length());
    theToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
    Token germanToken = sentence2.addToken(germanStart, germanStart + "German".length());
    germanToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("German", LexicalCategory.Adjective)));
    Token politicianToken = sentence2.addToken(politicianStart, politicianStart + "politician".length());
    politicianToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("politician", LexicalCategory.Noun)));
    // noun phrase chunk spanning all three tokens
    Chunk theGermanChancellor = sentence2.addChunk(theStart, politicianStart + "politician".length());
    theGermanChancellor.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
    engine.computeEnhancements(ci);
    // the named entity must be the representative mention ...
    Value<CorefFeature> representativeCorefValue = angelaMerkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(representativeCorefValue);
    CorefFeature representativeCoref = representativeCorefValue.value();
    Assert.assertTrue(representativeCoref.isRepresentative());
    Assert.assertTrue(representativeCoref.getMentions().contains(theGermanChancellor));
    // ... and the noun phrase the subordinate one, linked back to it
    Value<CorefFeature> subordinateCorefValue = theGermanChancellor.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(subordinateCorefValue);
    CorefFeature subordinateCoref = subordinateCorefValue.value();
    Assert.assertFalse(subordinateCoref.isRepresentative());
    Assert.assertTrue(subordinateCoref.getMentions().contains(angelaMerkel));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) CorefFeature(org.apache.stanbol.enhancer.nlp.coref.CorefFeature) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 5 with PhraseTag

use of org.apache.stanbol.enhancer.nlp.phrase.PhraseTag in project stanbol by apache.

From the class NIFHelper, method writePhrase.

/**
     * Writes a {@link NlpAnnotations#PHRASE_ANNOTATION} as NIF 1.0 to the
     * parsed RDF graph by using the segmentUri as subject. Elements without
     * a phrase annotation, or whose phrase category has no mapped OLiA type,
     * are silently skipped.
     * @param graph the graph
     * @param annotated the annotated element (e.g. a {@link Chunk})
     * @param segmentUri the URI of the resource representing the parsed 
     * annotated element in the graph
     */
public static void writePhrase(Graph graph, Annotated annotated, IRI segmentUri) {
    Value<PhraseTag> phrase = annotated.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
    if (phrase == null) {
        //no phrase annotation present -> nothing to write
        return;
    }
    IRI oliaType = LEXICAL_TYPE_TO_PHRASE_TYPE.get(phrase.value().getCategory());
    if (oliaType == null) {
        //no OLiA type mapped for this phrase category -> nothing to write
        return;
    }
    //add the oliaLink for the Phrase together with its confidence
    graph.add(new TripleImpl(segmentUri, SsoOntology.oliaLink.getUri(), oliaType));
    graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE, lf.createTypedLiteral(phrase.probability())));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag)

Aggregations

PhraseTag (org.apache.stanbol.enhancer.nlp.phrase.PhraseTag)8 IRI (org.apache.clerezza.commons.rdf.IRI)3 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)3 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)3 Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk)3 Token (org.apache.stanbol.enhancer.nlp.model.Token)3 NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag)3 PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag)3 ArrayList (java.util.ArrayList)2 Section (org.apache.stanbol.enhancer.nlp.model.Section)2 Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence)2 StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource)2 ChunkerME (opennlp.tools.chunker.ChunkerME)1 Graph (org.apache.clerezza.commons.rdf.Graph)1 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)1 NounPhrase (org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase)1 CorefFeature (org.apache.stanbol.enhancer.nlp.coref.CorefFeature)1 Span (org.apache.stanbol.enhancer.nlp.model.Span)1 CaseTag (org.apache.stanbol.enhancer.nlp.morpho.CaseTag)1 GenderTag (org.apache.stanbol.enhancer.nlp.morpho.GenderTag)1