Search in sources :

Example 81 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebAnnoTsv3WriterTestBase method makeJCasOneSentence.

/**
 * Creates a fresh test JCas via {@code makeJCas()} and fills it with the fixed text
 * {@code "This is a test ."} using a uimaFIT {@link TokenBuilder}.
 *
 * NOTE(review): presumably the builder yields one sentence with whitespace-separated
 * tokens (as the method name suggests) - confirm against the TokenBuilder documentation.
 *
 * @return the populated JCas
 * @throws UIMAException declared for the JCas creation performed by {@code makeJCas()}
 */
private static JCas makeJCasOneSentence() throws UIMAException {
    final String text = "This is a test .";
    JCas document = makeJCas();
    TokenBuilder<Token, Sentence> builder = new TokenBuilder<Token, Sentence>(Token.class,
            Sentence.class);
    builder.buildTokens(document, text);
    return document;
}
Also used : TokenBuilder(org.apache.uima.fit.testing.factory.TokenBuilder) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 82 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebAnnoTsv3WriterTestBase method testZeroLengthSpanBetweenAdjacentTokens.

/**
 * Writes a document whose two tokens touch at offset 4 and which carries a zero-width
 * custom span annotation exactly at that offset, then checks the writer output via
 * {@code writeAndAssertEquals}.
 */
@Test
public void testZeroLengthSpanBetweenAdjacentTokens() throws Exception {
    final String spanLayer = "webanno.custom.SimpleSpan";

    JCas document = makeJCas();
    document.setDocumentText("word.");

    // "word" and "." are adjacent tokens; a single sentence covers both.
    Token word = new Token(document, 0, 4);
    word.addToIndexes();
    Token period = new Token(document, 4, 5);
    period.addToIndexes();
    Sentence sentence = new Sentence(document, 0, 5);
    sentence.addToIndexes();

    // Zero-width span placed where the first token ends and the second one begins.
    CAS rawCas = document.getCas();
    Type spanType = rawCas.getTypeSystem().getType(spanLayer);
    AnnotationFS emptySpan = rawCas.createAnnotation(spanType, 4, 4);
    rawCas.addFsToIndexes(emptySpan);

    writeAndAssertEquals(document, WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(spanLayer));
}
Also used : AnnotationFS(org.apache.uima.cas.text.AnnotationFS) Type(org.apache.uima.cas.Type) CAS(org.apache.uima.cas.CAS) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) Test(org.junit.Test)

Example 83 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class Tsv3XSerializerTest method makeJCasOneSentence.

/**
 * Builds a JCas for the given text in which all tokens are covered by exactly one
 * sentence annotation spanning the complete document text.
 *
 * The type system is the merge of the default description and the test types loaded from
 * {@code src/test/resources/desc/type/webannoTestTypes.xml}; the document id is set to
 * {@code "doc"}.
 *
 * @param aText the text handed to the token builder
 * @return the prepared JCas
 * @throws UIMAException declared for the type system and JCas creation calls
 */
private JCas makeJCasOneSentence(String aText) throws UIMAException {
    TypeSystemDescription defaultTypes = TypeSystemDescriptionFactory.createTypeSystemDescription();
    TypeSystemDescription testTypes = TypeSystemDescriptionFactory
            .createTypeSystemDescriptionFromPath("src/test/resources/desc/type/webannoTestTypes.xml");
    JCas document = JCasFactory
            .createJCas(CasCreationUtils.mergeTypeSystems(asList(defaultTypes, testTypes)));

    DocumentMetaData metadata = DocumentMetaData.create(document);
    metadata.setDocumentId("doc");

    new TokenBuilder<Token, Sentence>(Token.class, Sentence.class).buildTokens(document, aText);

    // Drop whatever sentence annotations the builder produced ...
    for (Sentence builtSentence : select(document, Sentence.class)) {
        builtSentence.removeFromIndexes();
    }

    // ... and replace them with a single sentence spanning the entire document text.
    int textLength = document.getDocumentText().length();
    Sentence wholeText = new Sentence(document, 0, textLength);
    wholeText.addToIndexes();

    return document;
}
Also used : TokenBuilder(org.apache.uima.fit.testing.factory.TokenBuilder) TypeSystemDescription(org.apache.uima.resource.metadata.TypeSystemDescription) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 84 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class RemoveZeroSizeTokensAndSentencesRepair method repair.

/**
 * Removes sentences and tokens whose span is empty or inverted (begin &gt;= end) from the
 * CAS indexes. For each such token, the lemma, POS and stem attached to it are removed
 * from the indexes first. Every removal is reported as an INFO message; failure to obtain
 * the JCas is logged and reported as an ERROR message.
 *
 * @param aProject not used by this repair
 * @param aCas the CAS to repair
 * @param aMessages receives one message per removed annotation or per error
 */
@Override
public void repair(Project aProject, CAS aCas, List<LogMessage> aMessages) {
    try {
        // Pass 1: sentences with an illegal span.
        for (Sentence s : select(aCas.getJCas(), Sentence.class)) {
            if (s.getBegin() >= s.getEnd()) {
                s.removeFromIndexes();
                aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed sentence with illegal span: %s", s));
            }
        }
        // Pass 2: tokens with an illegal span, together with the annotations they reference.
        for (Token t : select(aCas.getJCas(), Token.class)) {
            if (t.getBegin() >= t.getEnd()) {
                Lemma lemma = t.getLemma();
                if (lemma != null) {
                    lemma.removeFromIndexes();
                    aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed lemma attached to token with illegal span: %s", t));
                }
                POS pos = t.getPos();
                if (pos != null) {
                    pos.removeFromIndexes();
                    aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed POS attached to token with illegal span: %s", t));
                }
                Stem stem = t.getStem();
                if (stem != null) {
                    stem.removeFromIndexes();
                    aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed stem attached to token with illegal span: %s", t));
                }
                t.removeFromIndexes();
                aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed token with illegal span: %s", t));
            }
        }
    } catch (CASException e) {
        log.error("Unable to access JCas", e);
        // The exception text is passed as a format argument, so the format needs a
        // placeholder for it; without one the detail never reaches the message
        // (assumes LogMessage formats its arguments like the "%s" calls above - confirm).
        aMessages.add(new LogMessage(this, LogLevel.ERROR, "Unable to access JCas: %s", e.getMessage()));
    }
}
Also used : LogMessage(de.tudarmstadt.ukp.clarin.webanno.diag.CasDoctor.LogMessage) POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) Lemma(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) CASException(org.apache.uima.cas.CASException) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) Stem(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem)

Example 85 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-lab by dkpro.

the class ExamplePosAnnotator method process.

/**
 * Processes the document sentence by sentence: one feature instance is extracted per
 * token; in training mode the instances (labelled with the token's POS value) are handed
 * to the data writer, otherwise they are classified and the predicted labels are stored
 * as the tokens' POS values.
 *
 * POS annotations that have to be created during classification are collected and added
 * to the indexes once, after all sentences have been processed. Flushing the collection
 * inside the sentence loop without clearing it would re-add the POS annotations of every
 * earlier sentence for each later sentence and would modify the indexes while the
 * sentence iteration is still in progress.
 *
 * @param jCas the document to process
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    // Newly created POS annotations; indexing is deferred until all sentences are done.
    Collection<TOP> addToIndexes = new ArrayList<TOP>();
    // generate a list of training instances for each sentence in the document
    for (Sentence sentence : select(jCas, Sentence.class)) {
        List<Instance<String>> instances = new ArrayList<Instance<String>>();
        List<Token> tokens = selectCovered(jCas, Token.class, sentence);
        // for each token, extract all feature values and the label
        for (Token token : tokens) {
            Instance<String> instance = new Instance<String>();
            // extract all features that require only the token annotation
            for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
                instance.addAll(extractor.extract(jCas, token));
            }
            // extract all features that require the token and sentence annotations
            for (ContextExtractor<Token> extractor : this.contextFeatureExtractors) {
                instance.addAll(extractor.extractWithin(jCas, token, sentence));
            }
            // set the instance label from the token's part of speech
            if (this.isTraining()) {
                instance.setOutcome(token.getPos().getPosValue());
            }
            // add the instance to the list
            instances.add(instance);
        }
        if (this.isTraining()) {
            // for training, write instances to the data writer
            this.dataWriter.write(instances);
        } else {
            // for classification, set the labels as the token POS labels
            Iterator<Token> tokensIter = tokens.iterator();
            List<String> labels = classify(instances);
            for (String label : labels) {
                Token t = tokensIter.next();
                POS pos = t.getPos();
                if (pos == null) {
                    // no POS yet: create one over the token span and remember it for indexing
                    pos = new POS(jCas, t.getBegin(), t.getEnd());
                    addToIndexes.add(pos);
                    t.setPos(pos);
                }
                pos.setPosValue(label);
            }
        }
    }
    // index every newly created POS exactly once, after the sentence iteration has finished
    for (TOP fs : addToIndexes) {
        fs.addToIndexes();
    }
}
Also used : Instance(org.cleartk.classifier.Instance) ArrayList(java.util.ArrayList) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) TOP(org.apache.uima.jcas.cas.TOP) POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) SimpleFeatureExtractor(org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Aggregations

Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)90 JCas (org.apache.uima.jcas.JCas)41 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)34 ArrayList (java.util.ArrayList)22 AnnotatorState (de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState)14 Type (org.apache.uima.cas.Type)12 AnnotationFS (org.apache.uima.cas.text.AnnotationFS)12 IOException (java.io.IOException)9 SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument)8 POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)8 Test (org.junit.Test)8 HashMap (java.util.HashMap)7 TokenBuilder (org.apache.uima.fit.testing.factory.TokenBuilder)7 AnnotationException (de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException)6 WebAnnoCasUtil.getFirstSentence (de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getFirstSentence)6 AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument)6 AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)6 FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)6 CASException (org.apache.uima.cas.CASException)6 AutomationTypeAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter)5