Search in sources :

Example 11 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class TcfWriter method writeSentence.

private void writeSentence(JCas aJCas, TextCorpus aTextCorpus, Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap) {
    // if not TCF file, add sentence layer (Sentence is required for BRAT)
    SentencesLayer sentencesLayer = aTextCorpus.getSentencesLayer();
    if (sentencesLayer != null) {
        getLogger().debug("Layer [" + TextCorpusLayerTag.SENTENCES.getXmlName() + "]: found");
        return;
    }
    sentencesLayer = aTextCorpus.createSentencesLayer();
    getLogger().debug("Layer [" + TextCorpusLayerTag.SENTENCES.getXmlName() + "]: created");
    for (Sentence sentence : select(aJCas, Sentence.class)) {
        List<eu.clarin.weblicht.wlfxb.tc.api.Token> tokens = new ArrayList<>();
        for (Token token : selectCovered(Token.class, sentence)) {
            tokens.add(aTokensBeginPositionMap.get(token.getBegin()));
        }
        sentencesLayer.addSentence(tokens);
    }
}
Also used : SentencesLayer(eu.clarin.weblicht.wlfxb.tc.api.SentencesLayer) ArrayList(java.util.ArrayList) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 12 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebannoTsv2Reader method setAnnotations.

/**
 * Iterate through lines and create span annotations accordingly. For multiple span annotation,
 * based on the position of the annotation in the line, update only the end position of the
 * annotation
 */
private void setAnnotations(JCas aJcas, InputStream aIs, String aEncoding, StringBuilder text) throws IOException {
    // getting header information
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    // token number + token columns (minimum required)
    int columns = 1;
    int tokenStart = 0, sentenceStart = 0;
    Map<Type, Set<Feature>> spanLayers = new LinkedHashMap<>();
    Map<Type, Type> relationayers = new LinkedHashMap<>();
    // an annotation for every feature in a layer
    Map<Type, Map<Integer, AnnotationFS>> annotations = new LinkedHashMap<>();
    // store if this is a Begin/Intermediate/End of an annotation
    Map<Type, Map<Integer, String>> beginEndAnno = new LinkedHashMap<>();
    // Store annotations of tokens so that it can be used later for relation
    // annotations
    Map<Type, Map<String, List<AnnotationFS>>> tokenAnnotations = new LinkedHashMap<>();
    // store target token ids used for a relation
    Map<Type, Map<String, List<String>>> relationTargets = new LinkedHashMap<>();
    // store tokens indexing with the concat of itsbegin-end so that lemma
    // and pos annotation
    // can be attached, if exists, later
    indexedTokens = new HashMap<>();
    while (lineIterator.hasNext()) {
        String line = lineIterator.next().trim();
        if (line.trim().equals("") && sentenceStart == tokenStart) {
            continue;
        }
        if (line.trim().equals("")) {
            text.replace(tokenStart - 1, tokenStart, "");
            tokenStart = tokenStart - 1;
            Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
            sentence.addToIndexes();
            tokenStart++;
            sentenceStart = tokenStart;
            text.append("\n");
            continue;
        }
        // sentence
        if (line.startsWith("#text=")) {
            continue;
        }
        if (line.startsWith("#id=")) {
            // it is a comment line
            continue;
        }
        if (line.startsWith("#")) {
            columns = getLayerAndFeature(aJcas, columns, spanLayers, relationayers, line);
            continue;
        }
        // so skip such lines
        if (!Character.isDigit(line.split(" ")[0].charAt(0))) {
            continue;
        }
        // a token number, check if it didn't in the format NUM-NUM
        if (!Character.isDigit(line.split("-")[1].charAt(0))) {
            continue;
        }
        int count = StringUtils.countMatches(line, "\t");
        if (columns != count) {
            throw new IOException(fileName + " This is not a valid TSV File. check this line: " + line);
        }
        // adding tokens and sentence
        StringTokenizer lineTk = new StringTokenizer(line, "\t");
        String tokenNumberColumn = lineTk.nextToken();
        String tokenColumn = lineTk.nextToken();
        Token token = new Token(aJcas, tokenStart, tokenStart + tokenColumn.length());
        token.addToIndexes();
        Type posType = JCasUtil.getType(aJcas, POS.class);
        Type lemmaType = JCasUtil.getType(aJcas, Lemma.class);
        if (spanLayers.containsKey(posType) || spanLayers.containsKey(lemmaType)) {
            indexedTokens.put(tokenStart + "-" + tokenStart + tokenColumn.length(), token);
        }
        // adding the annotations
        createSpanAnnotation(aJcas, tokenStart, spanLayers, relationayers, annotations, beginEndAnno, tokenAnnotations, relationTargets, lineTk, tokenColumn, tokenNumberColumn);
        tokenStart = tokenStart + tokenColumn.length() + 1;
        text.append(tokenColumn).append(" ");
    }
    if (tokenStart > sentenceStart) {
        Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
        sentence.addToIndexes();
        text.append("\n");
    }
    createRelationLayer(aJcas, relationayers, tokenAnnotations, relationTargets);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) Set(java.util.Set) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) IOException(java.io.IOException) LineIterator(org.apache.commons.io.LineIterator) LinkedHashMap(java.util.LinkedHashMap) AnnotationFS(org.apache.uima.cas.text.AnnotationFS) Type(org.apache.uima.cas.Type) CasUtil.getType(org.apache.uima.fit.util.CasUtil.getType) StringTokenizer(java.util.StringTokenizer) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 13 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebannoTsv2Writer method convertToTsv.

private void convertToTsv(JCas aJCas, OutputStream aOs, String aEncoding) throws IOException, ResourceInitializationException, CASRuntimeException, CASException {
    LowLevelCAS llCas = aJCas.getLowLevelCas();
    tokenIds = new HashMap<>();
    setTokenId(aJCas, tokenIds);
    tokenPositions = new TreeMap<>();
    setTokenPosition(aJCas, tokenPositions);
    Map<Integer, Integer> getTokensPerSentence = new TreeMap<>();
    setTokenSentenceAddress(aJCas, getTokensPerSentence);
    // list of annotation types
    Set<Type> allTypes = new LinkedHashSet<>();
    for (Annotation a : select(aJCas, Annotation.class)) {
        if (!(a instanceof Token || a instanceof Sentence || a instanceof DocumentMetaData || a instanceof TagsetDescription || a instanceof CoreferenceLink)) {
            allTypes.add(a.getType());
        }
    }
    Set<Type> relationTypes = new LinkedHashSet<>();
    // get all arc types
    for (Type type : allTypes) {
        if (type.getFeatures().size() == 0) {
            continue;
        }
        for (Feature feature : type.getFeatures()) {
            if (feature.getShortName().equals(GOVERNOR)) {
                relationTypes.add(type);
                break;
            }
        }
    }
    allTypes.removeAll(relationTypes);
    // relation annotations
    Map<Type, String> relationTypesMap = new HashMap<>();
    for (Type type : relationTypes) {
        if (type.getName().equals(Dependency.class.getName())) {
            relationTypesMap.put(type, POS.class.getName());
            continue;
        }
        for (AnnotationFS anno : CasUtil.select(aJCas.getCas(), type)) {
            for (Feature feature : type.getFeatures()) {
                if (feature.getShortName().equals(GOVERNOR)) {
                    relationTypesMap.put(type, anno.getFeatureValue(feature).getType().getName());
                }
            }
        }
    }
    // all span annotation first
    Map<Feature, Type> spanFeatures = new LinkedHashMap<>();
    allTypes: for (Type type : allTypes) {
        if (type.getFeatures().size() == 0) {
            continue;
        }
        for (Feature feature : type.getFeatures()) {
            // coreference annotation not supported
            if (feature.getShortName().equals(FIRST) || feature.getShortName().equals(NEXT)) {
                continue allTypes;
            }
        }
        IOUtils.write(" # " + type.getName(), aOs, aEncoding);
        for (Feature feature : type.getFeatures()) {
            if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end")) {
                continue;
            }
            spanFeatures.put(feature, type);
            IOUtils.write(" | " + feature.getShortName(), aOs, aEncoding);
        }
    }
    // write all relation annotation first
    Set<Feature> relationFeatures = new LinkedHashSet<>();
    for (Type type : relationTypes) {
        IOUtils.write(" # " + type.getName(), aOs, aEncoding);
        for (Feature feature : type.getFeatures()) {
            if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT)) {
                continue;
            }
            relationFeatures.add(feature);
            IOUtils.write(" | " + feature.getShortName(), aOs, aEncoding);
        }
        // Add the attach type for the realtion anotation
        IOUtils.write(" | AttachTo=" + relationTypesMap.get(type), aOs, aEncoding);
    }
    IOUtils.write("\n", aOs, aEncoding);
    Map<Feature, Map<Integer, String>> allAnnos = new HashMap<>();
    allTypes: for (Type type : allTypes) {
        for (Feature feature : type.getFeatures()) {
            // coreference annotation not supported
            if (feature.getShortName().equals(FIRST) || feature.getShortName().equals(NEXT)) {
                continue allTypes;
            }
        }
        for (Feature feature : type.getFeatures()) {
            if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end")) {
                continue;
            }
            Map<Integer, String> tokenAnnoMap = new TreeMap<>();
            setTokenAnnos(aJCas.getCas(), tokenAnnoMap, type, feature);
            allAnnos.put(feature, tokenAnnoMap);
        }
    }
    // get tokens where dependents are drown to
    Map<Feature, Map<Integer, String>> relAnnos = new HashMap<>();
    for (Type type : relationTypes) {
        for (Feature feature : type.getFeatures()) {
            if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT)) {
                continue;
            }
            Map<Integer, String> tokenAnnoMap = new HashMap<>();
            setRelationFeatureAnnos(aJCas.getCas(), tokenAnnoMap, type, feature);
            relAnnos.put(feature, tokenAnnoMap);
        }
    }
    // get tokens where dependents are drown from - the governor
    Map<Type, Map<Integer, String>> governorAnnos = new HashMap<>();
    for (Type type : relationTypes) {
        Map<Integer, String> govAnnoMap = new HashMap<>();
        setRelationGovernorPos(aJCas.getCas(), govAnnoMap, type);
        governorAnnos.put(type, govAnnoMap);
    }
    int sentId = 1;
    for (Sentence sentence : select(aJCas, Sentence.class)) {
        IOUtils.write("#id=" + sentId++ + "\n", aOs, aEncoding);
        IOUtils.write("#text=" + sentence.getCoveredText().replace("\n", "") + "\n", aOs, aEncoding);
        for (Token token : selectCovered(Token.class, sentence)) {
            IOUtils.write(tokenIds.get(llCas.ll_getFSRef(token)) + "\t" + token.getCoveredText() + "\t", aOs, aEncoding);
            // all span annotations on this token
            for (Feature feature : spanFeatures.keySet()) {
                String annos = allAnnos.get(feature).get(llCas.ll_getFSRef(token));
                if (annos == null) {
                    if (multipleSpans.contains(spanFeatures.get(feature).getName())) {
                        IOUtils.write("O\t", aOs, aEncoding);
                    } else {
                        IOUtils.write("_\t", aOs, aEncoding);
                    }
                } else {
                    IOUtils.write(annos + "\t", aOs, aEncoding);
                }
            }
            for (Type type : relationTypes) {
                for (Feature feature : type.getFeatures()) {
                    if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT)) {
                        continue;
                    }
                    String annos = relAnnos.get(feature).get(llCas.ll_getFSRef(token));
                    if (annos == null) {
                        IOUtils.write("_\t", aOs, aEncoding);
                    } else {
                        IOUtils.write(annos + "\t", aOs, aEncoding);
                    }
                }
                // the governor positions
                String govPos = governorAnnos.get(type).get(llCas.ll_getFSRef(token));
                if (govPos == null) {
                    IOUtils.write("_\t", aOs, aEncoding);
                } else {
                    IOUtils.write(governorAnnos.get(type).get(llCas.ll_getFSRef(token)) + "\t", aOs, aEncoding);
                }
            }
            IOUtils.write("\n", aOs, aEncoding);
        }
        IOUtils.write("\n", aOs, aEncoding);
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Feature(org.apache.uima.cas.Feature) TagsetDescription(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription) LinkedHashMap(java.util.LinkedHashMap) AnnotationFS(org.apache.uima.cas.text.AnnotationFS) CoreferenceLink(de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) LowLevelCAS(org.apache.uima.cas.impl.LowLevelCAS) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) Dependency(de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency) TreeMap(java.util.TreeMap) Annotation(org.apache.uima.jcas.tcas.Annotation) Type(org.apache.uima.cas.Type) POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) NavigableMap(java.util.NavigableMap) TreeMap(java.util.TreeMap)

Example 14 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebannoTsv2Writer method setTokenId.

private void setTokenId(JCas aJCas, Map<Integer, String> aTokenAddress) {
    LowLevelCAS llCas = aJCas.getLowLevelCas();
    int sentenceId = 1;
    for (Sentence sentence : select(aJCas, Sentence.class)) {
        int tokenId = 1;
        for (Token token : selectCovered(Token.class, sentence)) {
            aTokenAddress.put(llCas.ll_getFSRef(token), sentenceId + "-" + tokenId++);
        }
        sentenceId++;
    }
}
Also used : Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) LowLevelCAS(org.apache.uima.cas.impl.LowLevelCAS) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 15 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class ConstraintsGeneratorTest method makeJCasOneSentence.

private JCas makeJCasOneSentence() throws UIMAException {
    TypeSystemDescription global = TypeSystemDescriptionFactory.createTypeSystemDescription();
    TypeSystemDescription local = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath("src/test/resources/desc/types/webannoTestTypes.xml");
    TypeSystemDescription merged = CasCreationUtils.mergeTypeSystems(asList(global, local));
    JCas jcas = JCasFactory.createJCas(merged);
    DocumentMetaData.create(jcas).setDocumentId("doc");
    TokenBuilder<Token, Sentence> tb = new TokenBuilder<>(Token.class, Sentence.class);
    tb.buildTokens(jcas, "This is a test .");
    return jcas;
}
Also used : TokenBuilder(org.apache.uima.fit.testing.factory.TokenBuilder) TypeSystemDescription(org.apache.uima.resource.metadata.TypeSystemDescription) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Aggregations

Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)90 JCas (org.apache.uima.jcas.JCas)41 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)34 ArrayList (java.util.ArrayList)22 AnnotatorState (de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState)14 Type (org.apache.uima.cas.Type)12 AnnotationFS (org.apache.uima.cas.text.AnnotationFS)12 IOException (java.io.IOException)9 SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument)8 POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)8 Test (org.junit.Test)8 HashMap (java.util.HashMap)7 TokenBuilder (org.apache.uima.fit.testing.factory.TokenBuilder)7 AnnotationException (de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException)6 WebAnnoCasUtil.getFirstSentence (de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getFirstSentence)6 AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument)6 AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)6 FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)6 CASException (org.apache.uima.cas.CASException)6 AutomationTypeAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter)5