
Example 1 with TsvSubToken

Use of de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken in the project webanno by webanno.

From the class Tsv3XCasDocumentBuilder, the method of:

public static TsvDocument of(TsvSchema aSchema, JCas aJCas) {
    TsvFormatHeader format = new TsvFormatHeader("WebAnno TSV", "3.2");
    TsvDocument doc = new TsvDocument(format, aSchema, aJCas);
    // Fill document with all the sentences and tokens
    for (Sentence uimaSentence : select(aJCas, Sentence.class)) {
        TsvSentence sentence = doc.createSentence(uimaSentence);
        for (Token uimaToken : selectCovered(Token.class, uimaSentence)) {
            sentence.createToken(uimaToken);
        }
    }
    // Scan for chains
    for (Type headType : aSchema.getChainHeadTypes()) {
        for (FeatureStructure chainHead : CasUtil.selectFS(aJCas.getCas(), headType)) {
            List<AnnotationFS> elements = new ArrayList<>();
            AnnotationFS link = getFeature(chainHead, CHAIN_FIRST_FEAT, AnnotationFS.class);
            while (link != null) {
                elements.add(link);
                link = getFeature(link, CHAIN_NEXT_FEAT, AnnotationFS.class);
            }
            if (!elements.isEmpty()) {
                Type elementType = headType.getFeatureByBaseName(CHAIN_FIRST_FEAT).getRange();
                doc.createChain(headType, elementType, elements);
            }
        }
    }
    // Build indexes over the token start and end positions such that we can quickly locate
    // tokens based on their offsets.
    NavigableMap<Integer, TsvToken> tokenBeginIndex = new TreeMap<>();
    NavigableMap<Integer, TsvToken> tokenEndIndex = new TreeMap<>();
    List<TsvToken> tokens = new ArrayList<>();
    for (TsvSentence sentence : doc.getSentences()) {
        for (TsvToken token : sentence.getTokens()) {
            tokenBeginIndex.put(token.getBegin(), token);
            tokenEndIndex.put(token.getEnd(), token);
            tokens.add(token);
        }
    }
    // Scan all annotations of the types defined in the schema and map them to the token
    // or sub-token units corresponding to their offsets.
    for (Type type : aSchema.getUimaTypes()) {
        LayerType layerType = aSchema.getLayerType(type);
        boolean addDisambiguationIdIfStacked = SPAN.equals(layerType);
        for (AnnotationFS annotation : CasUtil.select(aJCas.getCas(), type)) {
            doc.activateType(annotation.getType());
            // Get the relevant begin and end offsets for the current annotation
            int begin = annotation.getBegin();
            int end = annotation.getEnd();
            // For relation layers, the begin/end offsets are taken from the relation target
            // annotation so that the relation is attached to the units of its target.
            if (RELATION.equals(layerType)) {
                AnnotationFS targetFS = getFeature(annotation, FEAT_REL_TARGET, AnnotationFS.class);
                begin = targetFS.getBegin();
                end = targetFS.getEnd();
            }
            TsvToken beginToken = tokenBeginIndex.floorEntry(begin).getValue();
            TsvToken endToken = tokenEndIndex.ceilingEntry(end).getValue();
            // Zero-width annotations must be attached to a single unit. On a token boundary,
            // the floorEntry lookup would select the following token, so we prefer the end
            // token over the value obtained from the tokenBeginIndex.
            if (begin == end) {
                beginToken = endToken;
            }
            boolean singleToken = beginToken == endToken;
            boolean zeroWidth = begin == end;
            boolean multiTokenCapable = SPAN.equals(layerType) || CHAIN.equals(layerType);
            // If the annotation aligns exactly with token boundaries, it is attached directly
            // to the begin token and, for multi-token capable layers, also to the end token -
            // no sub-tokens are required in either case.
            if (beginToken.getBegin() == begin && endToken.getEnd() == end) {
                doc.mapFS2Unit(annotation, beginToken);
                beginToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                if (multiTokenCapable) {
                    endToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                }
            } else if (zeroWidth) {
                TsvSubToken t = beginToken.createSubToken(begin, min(beginToken.getEnd(), end));
                doc.mapFS2Unit(annotation, t);
                t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
            } else {
                // If the annotation starts in the middle of a token, a sub-token is created
                // to mark the begin of the annotation.
                if (beginToken.getBegin() < begin) {
                    TsvSubToken t = beginToken.createSubToken(begin, min(beginToken.getEnd(), end));
                    doc.mapFS2Unit(annotation, t);
                    t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                } else {
                    // If no begin sub-token is needed, the begin token itself is ID-defining
                    beginToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                    doc.mapFS2Unit(annotation, beginToken);
                }
                // If the annotation ends in the middle of a token, a sub-token is created to
                // mark the end of the annotation. This is also needed when the annotation lies
                // entirely within a single token, which is why we do not guard this block by
                // checking if singleToken is true.
                if (endToken.getEnd() > end) {
                    TsvSubToken t = endToken.createSubToken(max(endToken.getBegin(), begin), end);
                    t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                    if (!singleToken) {
                        doc.mapFS2Unit(annotation, t);
                    }
                } else if (!singleToken && multiTokenCapable) {
                    endToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                }
            }
            // For annotations spanning multiple tokens, also attach the annotation to all
            // tokens between the begin token and the end token.
            if (multiTokenCapable && !singleToken) {
                ListIterator<TsvToken> i = tokens.listIterator(tokens.indexOf(beginToken));
                TsvToken t;
                while ((t = i.next()) != endToken) {
                    if (t != beginToken) {
                        t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                    }
                }
            }
            // Multi-token span annotations must get a disambiguation ID
            if (SPAN.equals(layerType) && !singleToken) {
                doc.addDisambiguationId(annotation);
            }
        }
    }
    // Scan all created units to see which columns actually contains values
    for (TsvSentence sentence : doc.getSentences()) {
        for (TsvToken token : sentence.getTokens()) {
            scanUnitForActiveColumns(token);
            scanUnitForAmbiguousSlotReferences(token);
            for (TsvSubToken subToken : token.getSubTokens()) {
                scanUnitForActiveColumns(subToken);
                scanUnitForAmbiguousSlotReferences(subToken);
            }
        }
    }
    // Activate the placeholder columns for any active types for which no other columns are
    // active.
    Set<Type> activeTypesNeedingPlaceholders = new HashSet<>(doc.getActiveTypes());
    for (TsvColumn col : doc.getActiveColumns()) {
        activeTypesNeedingPlaceholders.remove(col.uimaType);
    }
    for (TsvColumn col : doc.getSchema().getColumns()) {
        if (PLACEHOLDER.equals(col.featureType) && activeTypesNeedingPlaceholders.contains(col.uimaType)) {
            doc.activateColumn(col);
        }
    }
    return doc;
}
Also used : TsvFormatHeader(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvFormatHeader), ArrayList(java.util.ArrayList), TsvToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvToken), Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token), TsvSubToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken), TsvSentence(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSentence), TreeMap(java.util.TreeMap), FeatureStructure(org.apache.uima.cas.FeatureStructure), AnnotationFS(org.apache.uima.cas.text.AnnotationFS), LayerType(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType), Type(org.apache.uima.cas.Type), TsvColumn(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvColumn), TsvDocument(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvDocument), Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence), HashSet(java.util.HashSet)
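
The floorEntry/ceilingEntry lookups in the builder above are the core trick for mapping annotation offsets to tokens. Below is a minimal, self-contained sketch of that technique using only java.util; the tokens and offsets are made up for illustration and are not taken from the WebAnno code.

import java.util.NavigableMap;
import java.util.TreeMap;

public class OffsetIndexSketch {

    public static void main(String[] args) {
        // Token boundaries for the text "Hello brave world":
        // "Hello" [0,5), "brave" [6,11), "world" [12,17)
        NavigableMap<Integer, String> tokenBeginIndex = new TreeMap<>();
        NavigableMap<Integer, String> tokenEndIndex = new TreeMap<>();
        tokenBeginIndex.put(0, "Hello");
        tokenBeginIndex.put(6, "brave");
        tokenBeginIndex.put(12, "world");
        tokenEndIndex.put(5, "Hello");
        tokenEndIndex.put(11, "brave");
        tokenEndIndex.put(17, "world");

        // An annotation covering [8,14) starts inside "brave" and ends inside "world"
        int begin = 8;
        int end = 14;

        // floorEntry: the token starting at or before the annotation begin
        String beginToken = tokenBeginIndex.floorEntry(begin).getValue();
        // ceilingEntry: the token ending at or after the annotation end
        String endToken = tokenEndIndex.ceilingEntry(end).getValue();

        System.out.println(beginToken + " .. " + endToken); // prints: brave .. world
    }
}

The builder applies the same pattern with TsvToken values, which is why an annotation that does not align with token boundaries can still be resolved to its begin and end tokens before any sub-tokens are created.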

Example 2 with TsvSubToken

Use of de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken in the project webanno by webanno.

From the class Tsv3XDeserializer, the method readContent:

private void readContent(LineNumberReader aIn, TsvDocument aDoc) throws IOException {
    StringBuilder text = new StringBuilder();
    State prevState = State.INTER_SENTENCE_SPACE;
    State state = State.INTER_SENTENCE_SPACE;
    StringBuilder sentenceText = new StringBuilder();
    TsvSentence prevSentence = null;
    TsvSentence sentence = null;
    TsvToken token = null;
    String line = aIn.readLine();
    while (!State.END.equals(state)) {
        // These variables are only used in TOKEN and SUBTOKEN states.
        String[] fields = null;
        String id = null;
        String[] offsets = null;
        int begin = -1;
        int end = -1;
        // Determine the status of the current line
        if (startsWith(line, PREFIX_TEXT)) {
            state = State.SENTENCE;
        } else if (line == null) {
            state = State.END;
        } else if (isEmpty(line)) {
            state = State.INTER_SENTENCE_SPACE;
        } else {
            fields = splitPreserveAllTokens(line, FIELD_SEPARATOR);
            // Get token metadata
            id = fields[0];
            offsets = split(fields[1], "-");
            begin = Integer.valueOf(offsets[0]);
            end = Integer.valueOf(offsets[1]);
            // TOKEN or SUBTOKEN?
            if (id.contains(".")) {
                state = State.SUBTOKEN;
            } else {
                state = State.TOKEN;
            }
        }
        // Assert that the order of information in the file is correct
        switch(prevState) {
            case INTER_SENTENCE_SPACE:
                if (!State.SENTENCE.equals(state)) {
                    throw new IOException("Line " + aIn.getLineNumber() + ": Expected sentence header but got [" + state + "]");
                }
                break;
            case SENTENCE:
                if (!(State.SENTENCE.equals(state) || State.TOKEN.equals(state))) {
                    throw new IOException("Line " + aIn.getLineNumber() + ": Expected sentence header or token but got [" + state + "]");
                }
                break;
            case TOKEN:
            case SUBTOKEN:
                if (!(State.INTER_SENTENCE_SPACE.equals(state) || State.END.equals(state) || State.TOKEN.equals(state) || State.SUBTOKEN.equals(state))) {
                    throw new IOException("Line " + aIn.getLineNumber() + ": Expected token, sub-token or sentence break but got [" + state + "]");
                }
                break;
        }
        // Do the actual parsing
        switch(state) {
            case END:
            case INTER_SENTENCE_SPACE:
                // End of sentence action
                // The -1 here is to account for the trailing line break
                sentence.getUimaSentence().setEnd(text.length() - 1);
                sentence.getUimaSentence().addToIndexes();
                prevSentence = sentence;
                sentence = null;
                break;
            case TOKEN:
                // End of sentence header action
                if (State.SENTENCE.equals(prevState)) {
                    // The previous sentence text ended with a line break. If the first token
                    // of this sentence begins before the current end of the text buffer, that
                    // line break is not part of the document text and is removed - it only
                    // remains after the last sentence!
                    if (text.length() > begin) {
                        assert text.length() == begin + 1;
                        assert text.charAt(text.length() - 1) == LINE_BREAK;
                        text.setLength(text.length() - 1);
                    }
                    // If the first token begins after the current end of the text buffer,
                    // pad the gap with spaces.
                    if (text.length() < begin) {
                        text.append(repeat(' ', begin - text.length()));
                    }
                    assert text.length() == begin;
                    assert sentence == null;
                    Sentence uimaSentence = new Sentence(aDoc.getJCas());
                    uimaSentence.setBegin(text.length());
                    sentence = aDoc.createSentence(uimaSentence);
                    text.append(sentenceText);
                    sentenceText.setLength(0);
                }
                // Token parsing action
                Token uimaToken = new Token(aDoc.getJCas(), begin, end);
                uimaToken.addToIndexes();
                token = sentence.createToken(uimaToken);
                // Read annotations from the columns
                parseAnnotations(aDoc, sentence, token, fields);
                break;
            case SUBTOKEN:
                // Read annotations from the columns
                TsvSubToken subToken = token.createSubToken(begin, end);
                parseAnnotations(aDoc, sentence, subToken, fields);
                break;
            case SENTENCE:
                // Header parsing action
                String textFragment = substringAfter(line, "=");
                textFragment = unescapeText(aDoc.getFormatHeader(), textFragment);
                sentenceText.append(textFragment);
                sentenceText.append(LINE_BREAK);
                break;
        }
        prevState = state;
        line = aIn.readLine();
    }
    aDoc.getJCas().setDocumentText(text.toString());
    // After all data has been read, we also add the annotations with disambiguation ID to
    // the CAS indexes. This ensures we only add them after their final begin/end offsets
    // have been determined since most of these annotations are actually multi-token
    // annotations.
    CAS cas = aDoc.getJCas().getCas();
    Set<FeatureStructure> fses = new LinkedHashSet<>();
    for (TsvSentence s : aDoc.getSentences()) {
        for (TsvToken t : s.getTokens()) {
            for (Type type : t.getUimaTypes()) {
                fses.addAll(t.getUimaAnnotations(type));
            }
            for (TsvSubToken st : t.getSubTokens()) {
                for (Type type : st.getUimaTypes()) {
                    fses.addAll(st.getUimaAnnotations(type));
                }
            }
        }
    }
    fses.forEach(cas::addFsToIndexes);
}
Also used : LinkedHashSet(java.util.LinkedHashSet), TsvToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvToken), TsvSubToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken), Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token), TsvSentence(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSentence), IOException(java.io.IOException), FeatureStructure(org.apache.uima.cas.FeatureStructure), Type(org.apache.uima.cas.Type), LayerType(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType), CAS(org.apache.uima.cas.CAS), Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)
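
The first two columns of each body line drive the TOKEN/SUBTOKEN decision in readContent above. Here is a small stand-alone sketch of that field parsing, using the same Apache Commons Lang helpers the deserializer relies on; the example line and the tab field separator are assumptions for illustration only.

import static org.apache.commons.lang3.StringUtils.split;
import static org.apache.commons.lang3.StringUtils.splitPreserveAllTokens;

public class TsvLineSketch {

    public static void main(String[] args) {
        // A made-up body line: "<sentence>-<token>[.<subtoken>] TAB <begin>-<end> TAB ..."
        String line = "2-3.1\t15-18\t*";

        String[] fields = splitPreserveAllTokens(line, '\t');
        String id = fields[0];
        String[] offsets = split(fields[1], "-");
        int begin = Integer.parseInt(offsets[0]);
        int end = Integer.parseInt(offsets[1]);

        // A dot in the unit ID marks a sub-token, otherwise the line describes a token
        boolean isSubToken = id.contains(".");

        System.out.println(id + " [" + begin + "," + end + ") subToken=" + isSubToken);
    }
}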

Example 3 with TsvSubToken

Use of de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken in the project webanno by webanno.

From the class Tsv3XSerializer, the method write:

public void write(PrintWriter aOut, TsvSentence aSentence) {
    String[] lines = splitPreserveAllTokens(aSentence.getUimaSentence().getCoveredText(), LINE_BREAK);
    for (String line : lines) {
        aOut.print(PREFIX_TEXT);
        aOut.print(escapeText(line));
        aOut.print(LINE_BREAK);
    }
    for (TsvToken token : aSentence.getTokens()) {
        write(aOut, token);
        aOut.write(LINE_BREAK);
        for (TsvSubToken subToken : token.getSubTokens()) {
            write(aOut, subToken);
            aOut.write(LINE_BREAK);
        }
    }
}
Also used : TsvToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvToken), TsvSubToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken)
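
For a whole document, the sentence-level write method above would be called once per sentence. The sketch below wires that up using only calls visible in these excerpts (TsvDocument.getSentences() from Example 1 and the write overload shown here); the Tsv3XSerializer package and its no-argument constructor are assumptions, and the format/column header lines a complete export also needs are omitted.

import java.io.PrintWriter;
import java.io.StringWriter;

import de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.Tsv3XSerializer;
import de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvDocument;
import de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSentence;

public class SentenceSerializationSketch {

    // Renders only the sentence text and token/sub-token lines, not the TSV headers.
    static String renderSentences(TsvDocument doc) {
        StringWriter buffer = new StringWriter();
        try (PrintWriter out = new PrintWriter(buffer)) {
            Tsv3XSerializer serializer = new Tsv3XSerializer();
            for (TsvSentence sentence : doc.getSentences()) {
                serializer.write(out, sentence);
            }
        }
        return buffer.toString();
    }
}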

Aggregations

TsvSubToken (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken): 3 uses
TsvToken (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvToken): 3 uses
LayerType (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType): 2 uses
TsvSentence (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSentence): 2 uses
Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence): 2 uses
Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token): 2 uses
FeatureStructure (org.apache.uima.cas.FeatureStructure): 2 uses
Type (org.apache.uima.cas.Type): 2 uses
TsvColumn (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvColumn): 1 use
TsvDocument (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvDocument): 1 use
TsvFormatHeader (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvFormatHeader): 1 use
IOException (java.io.IOException): 1 use
ArrayList (java.util.ArrayList): 1 use
HashSet (java.util.HashSet): 1 use
LinkedHashSet (java.util.LinkedHashSet): 1 use
TreeMap (java.util.TreeMap): 1 use
CAS (org.apache.uima.cas.CAS): 1 use
AnnotationFS (org.apache.uima.cas.text.AnnotationFS): 1 use