Search in sources :

Example 31 with POS

use of de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS in project webanno by webanno.

the class RemoveZeroSizeTokensAndSentencesRepair method repair.

/**
 * Removes sentences and tokens whose span is empty or inverted ({@code begin >= end}) from the
 * CAS indexes. For each such token, the lemma, POS and stem attached to it (if any) are removed
 * from the indexes as well. Every removal is reported as an INFO message in {@code aMessages}.
 *
 * @param aProject
 *            the project being repaired (not used by this repair).
 * @param aCas
 *            the CAS to repair.
 * @param aMessages
 *            receives one message per removed annotation, or an ERROR message if the JCas
 *            cannot be obtained from the CAS.
 */
@Override
public void repair(Project aProject, CAS aCas, List<LogMessage> aMessages) {
    try {
        // Iterate over a snapshot: removing annotations from the indexes while iterating the
        // index-backed collection returned by select() may invalidate the underlying iterator.
        for (Sentence s : new java.util.ArrayList<Sentence>(select(aCas.getJCas(), Sentence.class))) {
            if (s.getBegin() >= s.getEnd()) {
                s.removeFromIndexes();
                aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed sentence with illegal span: %s", s));
            }
        }
        // Same snapshot approach for tokens, which are also removed inside the loop.
        for (Token t : new java.util.ArrayList<Token>(select(aCas.getJCas(), Token.class))) {
            if (t.getBegin() >= t.getEnd()) {
                // Remove the annotations hanging off the token before the token itself
                Lemma lemma = t.getLemma();
                if (lemma != null) {
                    lemma.removeFromIndexes();
                    aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed lemma attached to token with illegal span: %s", t));
                }
                POS pos = t.getPos();
                if (pos != null) {
                    pos.removeFromIndexes();
                    aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed POS attached to token with illegal span: %s", t));
                }
                Stem stem = t.getStem();
                if (stem != null) {
                    stem.removeFromIndexes();
                    aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed stem attached to token with illegal span: %s", t));
                }
                t.removeFromIndexes();
                aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed token with illegal span: %s", t));
            }
        }
    } catch (CASException e) {
        log.error("Unabled to access JCas", e);
        // The format needs a placeholder, otherwise the exception message passed as the
        // format argument never makes it into the reported message.
        aMessages.add(new LogMessage(this, LogLevel.ERROR, "Unabled to access JCas: %s", e.getMessage()));
    }
}
Also used : LogMessage(de.tudarmstadt.ukp.clarin.webanno.diag.CasDoctor.LogMessage) POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) Lemma(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) CASException(org.apache.uima.cas.CASException) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) Stem(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem)

Example 32 with POS

use of de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS in project dkpro-lab by dkpro.

the class ExamplePosAnnotator method process.

/**
 * Builds one feature instance per token, sentence by sentence. In training mode the instances
 * are labelled with the token's existing POS value and handed to the data writer; otherwise the
 * instances are classified and the predicted labels are stored as the tokens' POS values,
 * creating a new POS annotation for any token that has none.
 *
 * @param jCas
 *            the document to process.
 * @throws AnalysisEngineProcessException
 *             declared by the overridden method; presumably raised by the extractors, the data
 *             writer or the classifier.
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    // Newly created POS annotations; they are indexed once, after all sentences are processed
    Collection<TOP> addToIndexes = new ArrayList<TOP>();
    // generate a list of training instances for each sentence in the document
    for (Sentence sentence : select(jCas, Sentence.class)) {
        List<Instance<String>> instances = new ArrayList<Instance<String>>();
        List<Token> tokens = selectCovered(jCas, Token.class, sentence);
        // for each token, extract all feature values and the label
        for (Token token : tokens) {
            Instance<String> instance = new Instance<String>();
            // extract all features that require only the token annotation
            for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
                instance.addAll(extractor.extract(jCas, token));
            }
            // extract all features that require the token and sentence annotations
            for (ContextExtractor<Token> extractor : this.contextFeatureExtractors) {
                instance.addAll(extractor.extractWithin(jCas, token, sentence));
            }
            // set the instance label from the token's part of speech
            // NOTE(review): a training token without a POS would fail here with an NPE
            if (this.isTraining()) {
                instance.setOutcome(token.getPos().getPosValue());
            }
            // add the instance to the list
            instances.add(instance);
        }
        if (this.isTraining()) {
            // for training, write instances to the data writer
            this.dataWriter.write(instances);
        } else {
            // for classification, set the labels as the token POS labels
            Iterator<Token> tokensIter = tokens.iterator();
            List<String> labels = classify(instances);
            for (String label : labels) {
                Token t = tokensIter.next();
                POS pos = t.getPos();
                if (pos == null) {
                    pos = new POS(jCas, t.getBegin(), t.getEnd());
                    addToIndexes.add(pos);
                    t.setPos(pos);
                }
                pos.setPosValue(label);
            }
        }
    }
    // Index the new POS annotations exactly once. Doing this inside the sentence loop would
    // call addToIndexes() again on the POS annotations of every previously processed sentence,
    // because the collection is shared across sentences and never cleared.
    for (TOP fs : addToIndexes) {
        fs.addToIndexes();
    }
}
Also used : Instance(org.cleartk.classifier.Instance) ArrayList(java.util.ArrayList) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) TOP(org.apache.uima.jcas.cas.TOP) POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) SimpleFeatureExtractor(org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 33 with POS

use of de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS in project dkpro-tc by dkpro.

the class PosNGramMC method sentenceBasedDistribution.

/**
 * Collects POS n-grams sentence by sentence: for every sentence covered by {@code focus}, the
 * POS annotations covered by that sentence are turned into a tag sequence, and each n-gram
 * produced from that sequence is joined with {@code NGRAM_GLUE} and counted.
 *
 * @param jcas
 *            the document to read annotations from.
 * @param focus
 *            the annotation whose covered sentences are considered.
 * @param useCanonical
 *            if {@code true}, the simple class name of each POS annotation is used as the tag;
 *            otherwise its POS value.
 * @param minN
 *            passed to the n-gram iterable (presumably the smallest n-gram size).
 * @param maxN
 *            passed to the n-gram iterable (presumably the largest n-gram size).
 * @return the frequency distribution of the joined n-grams.
 */
private static FrequencyDistribution<String> sentenceBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    FrequencyDistribution<String> distribution = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, focus)) {
        // Tag sequence of the current sentence only; n-grams never cross sentence boundaries
        List<String> tags = new ArrayList<String>();
        for (POS tag : selectCovered(jcas, POS.class, sentence)) {
            tags.add(useCanonical ? tag.getClass().getSimpleName() : tag.getPosValue());
        }
        String[] tagArray = tags.toArray(new String[0]);
        for (List<String> gram : new NGramStringListIterable(tagArray, minN, maxN)) {
            String joined = StringUtils.join(gram, NGRAM_GLUE);
            distribution.inc(joined);
        }
    }
    return distribution;
}
Also used : POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) ArrayList(java.util.ArrayList) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 34 with POS

use of de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS in project dkpro-tc by dkpro.

the class PosNGramMC method documentBasedDistribution.

/**
 * Collects POS n-grams over the whole focus annotation: all POS annotations covered by
 * {@code focus} form a single tag sequence, and each n-gram produced from that sequence is
 * joined with {@code NGRAM_GLUE} and counted.
 *
 * @param jcas
 *            the document to read annotations from.
 * @param focus
 *            the annotation whose covered POS annotations are considered.
 * @param useCanonical
 *            if {@code true}, the simple class name of each POS annotation is used as the tag;
 *            otherwise its POS value.
 * @param minN
 *            passed to the n-gram iterable (presumably the smallest n-gram size).
 * @param maxN
 *            passed to the n-gram iterable (presumably the largest n-gram size).
 * @return the frequency distribution of the joined n-grams.
 */
private static FrequencyDistribution<String> documentBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    // One tag sequence for everything under the focus annotation
    List<String> tags = new ArrayList<String>();
    for (POS tag : selectCovered(jcas, POS.class, focus)) {
        tags.add(useCanonical ? tag.getClass().getSimpleName() : tag.getPosValue());
    }
    String[] tagArray = tags.toArray(new String[0]);

    FrequencyDistribution<String> distribution = new FrequencyDistribution<String>();
    for (List<String> gram : new NGramStringListIterable(tagArray, minN, maxN)) {
        String joined = StringUtils.join(gram, NGRAM_GLUE);
        distribution.inc(joined);
    }
    return distribution;
}
Also used : POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) ArrayList(java.util.ArrayList) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 35 with POS

use of de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS in project dkpro-tc by dkpro.

the class ConversionAnnotator method process.

/**
 * Creates a POS annotation for every classification outcome in the document. The POS spans the
 * same offsets as the outcome and carries the outcome value, with the configured suffix
 * appended when that suffix is neither {@code null} nor empty.
 *
 * @param aJCas
 *            the document to process.
 * @throws AnalysisEngineProcessException
 *             declared by the overridden method.
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // The suffix does not change while processing, so decide once whether to append it
    final boolean appendSuffix = suffix != null && !suffix.isEmpty();
    for (TextClassificationOutcome outcome : JCasUtil.select(aJCas, TextClassificationOutcome.class)) {
        POS tag = new POS(aJCas, outcome.getBegin(), outcome.getEnd());
        String label = outcome.getOutcome();
        tag.setPosValue(appendSuffix ? label + suffix : label);
        tag.addToIndexes();
    }
}
Also used : POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome)

Aggregations

POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS): 35; Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token): 21; ArrayList (java.util.ArrayList): 15; JCas (org.apache.uima.jcas.JCas): 14; Test (org.junit.Test): 12; Lemma (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma): 11; Dependency (de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency): 9; Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence): 8; List (java.util.List): 8; Type (org.apache.uima.cas.Type): 8; AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature): 7; MorphologicalFeatures (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures): 7; LinkedHashMap (java.util.LinkedHashMap): 7; Feature (org.apache.uima.cas.Feature): 7; AnnotationFS (org.apache.uima.cas.text.AnnotationFS): 7; Stem (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem): 5; HashMap (java.util.HashMap): 5; Evaluator (de.tudarmstadt.ukp.clarin.webanno.constraints.evaluator.Evaluator): 3; PossibleValue (de.tudarmstadt.ukp.clarin.webanno.constraints.evaluator.PossibleValue): 3; ValuesGenerator (de.tudarmstadt.ukp.clarin.webanno.constraints.evaluator.ValuesGenerator): 3