Search in sources :

Example 1 with LayerType

use of de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType in project webanno by webanno.

the class Tsv3XCasDocumentBuilder method of.

/**
 * Builds a {@link TsvDocument} from the given CAS according to the given schema.
 * <p>
 * The method proceeds in these steps:
 * <ol>
 * <li>create a sentence unit per {@code Sentence} and a token unit per covered {@code Token};</li>
 * <li>follow each chain head's first/next links and register the resulting chains;</li>
 * <li>index the tokens by begin and end offset;</li>
 * <li>attach every annotation of the schema's UIMA types to the tokens or sub-tokens it
 * covers;</li>
 * <li>scan all units for active columns and ambiguous slot references;</li>
 * <li>activate placeholder columns for active types that have no other active column.</li>
 * </ol>
 *
 * @param aSchema
 *            the schema providing the chain head types, UIMA types, layer types and columns.
 * @param aJCas
 *            the CAS from which sentences, tokens and annotations are read.
 * @return the populated document.
 * @throws IllegalStateException
 *             if a relation has no target, or if no token begins at or before the start of an
 *             annotation, or no token ends at or after its end.
 */
public static TsvDocument of(TsvSchema aSchema, JCas aJCas) {
    TsvFormatHeader format = new TsvFormatHeader("WebAnno TSV", "3.2");
    TsvDocument doc = new TsvDocument(format, aSchema, aJCas);
    // Fill document with all the sentences and tokens
    for (Sentence uimaSentence : select(aJCas, Sentence.class)) {
        TsvSentence sentence = doc.createSentence(uimaSentence);
        for (Token uimaToken : selectCovered(Token.class, uimaSentence)) {
            sentence.createToken(uimaToken);
        }
    }
    // Scan for chains: walk from the head's first link along the "next" links and collect
    // the elements in order.
    for (Type headType : aSchema.getChainHeadTypes()) {
        for (FeatureStructure chainHead : CasUtil.selectFS(aJCas.getCas(), headType)) {
            List<AnnotationFS> elements = new ArrayList<>();
            AnnotationFS link = getFeature(chainHead, CHAIN_FIRST_FEAT, AnnotationFS.class);
            while (link != null) {
                elements.add(link);
                link = getFeature(link, CHAIN_NEXT_FEAT, AnnotationFS.class);
            }
            // Chains without any elements are not added to the document.
            if (!elements.isEmpty()) {
                Type elementType = headType.getFeatureByBaseName(CHAIN_FIRST_FEAT).getRange();
                doc.createChain(headType, elementType, elements);
            }
        }
    }
    // Build indexes over the token start and end positions such that we can quickly locate
    // tokens based on their offsets.
    NavigableMap<Integer, TsvToken> tokenBeginIndex = new TreeMap<>();
    NavigableMap<Integer, TsvToken> tokenEndIndex = new TreeMap<>();
    List<TsvToken> tokens = new ArrayList<>();
    for (TsvSentence sentence : doc.getSentences()) {
        for (TsvToken token : sentence.getTokens()) {
            tokenBeginIndex.put(token.getBegin(), token);
            tokenEndIndex.put(token.getEnd(), token);
            tokens.add(token);
        }
    }
    // Scan all annotations of the types defined in the schema and attach them to the token
    // or sub-token units they cover.
    for (Type type : aSchema.getUimaTypes()) {
        LayerType layerType = aSchema.getLayerType(type);
        boolean addDisambiguationIdIfStacked = SPAN.equals(layerType);
        for (AnnotationFS annotation : CasUtil.select(aJCas.getCas(), type)) {
            doc.activateType(annotation.getType());
            // Get the relevant begin and end offsets for the current annotation
            int begin = annotation.getBegin();
            int end = annotation.getEnd();
            // For relations, the offsets of the relation target are used instead of the
            // offsets of the relation annotation itself.
            if (RELATION.equals(layerType)) {
                AnnotationFS targetFS = getFeature(annotation, FEAT_REL_TARGET, AnnotationFS.class);
                if (targetFS == null) {
                    throw new IllegalStateException("Relation of type ["
                            + annotation.getType().getName() + "] at [" + begin + "-" + end
                            + "] has no target");
                }
                begin = targetFS.getBegin();
                end = targetFS.getEnd();
            }
            // floorEntry/ceilingEntry return null if the annotation starts before the first
            // token or ends after the last token. Fail with a descriptive message in that
            // case instead of a bare NullPointerException.
            java.util.Map.Entry<Integer, TsvToken> beginEntry = tokenBeginIndex.floorEntry(begin);
            if (beginEntry == null) {
                throw new IllegalStateException("Unable to find a token starting at or before ["
                        + begin + "] for annotation of type [" + annotation.getType().getName()
                        + "] at [" + begin + "-" + end + "]");
            }
            java.util.Map.Entry<Integer, TsvToken> endEntry = tokenEndIndex.ceilingEntry(end);
            if (endEntry == null) {
                throw new IllegalStateException("Unable to find a token ending at or after ["
                        + end + "] for annotation of type [" + annotation.getType().getName()
                        + "] at [" + begin + "-" + end + "]");
            }
            TsvToken beginToken = beginEntry.getValue();
            TsvToken endToken = endEntry.getValue();
            // For a zero-width annotation, the end token is also used as the begin token
            // instead of the value obtained from the tokenBeginIndex.
            if (begin == end) {
                beginToken = endToken;
            }
            boolean singleToken = beginToken == endToken;
            boolean zeroWidth = begin == end;
            boolean multiTokenCapable = SPAN.equals(layerType) || CHAIN.equals(layerType);
            // The annotation aligns exactly with the token boundaries: attach it to the
            // begin token (and, for multi-token-capable layers, to the end token) directly.
            if (beginToken.getBegin() == begin && endToken.getEnd() == end) {
                doc.mapFS2Unit(annotation, beginToken);
                beginToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                if (multiTokenCapable) {
                    endToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                }
            } else if (zeroWidth) {
                // A zero-width annotation that is not token-aligned gets a sub-token of its
                // own, which is also the unit it is mapped to.
                TsvSubToken t = beginToken.createSubToken(begin, min(beginToken.getEnd(), end));
                doc.mapFS2Unit(annotation, t);
                t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
            } else {
                // If the annotation starts inside the begin token, a sub-token is created for
                // the covered part and becomes the unit mapped to the annotation.
                if (beginToken.getBegin() < begin) {
                    TsvSubToken t = beginToken.createSubToken(begin, min(beginToken.getEnd(), end));
                    doc.mapFS2Unit(annotation, t);
                    t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                } else {
                    // If not the sub-token is ID-defining, then the begin token is ID-defining
                    beginToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                    doc.mapFS2Unit(annotation, beginToken);
                }
                // If the annotation ends inside the end token, a sub-token is created for the
                // covered part. mapFS2Unit is only called for it when the begin and end token
                // differ (singleToken is false).
                if (endToken.getEnd() > end) {
                    TsvSubToken t = endToken.createSubToken(max(endToken.getBegin(), begin), end);
                    t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                    if (!singleToken) {
                        doc.mapFS2Unit(annotation, t);
                    }
                } else if (!singleToken && multiTokenCapable) {
                    endToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                }
            }
            // For multi-token-capable layers, also attach the annotation to all tokens
            // strictly between the begin token and the end token.
            if (multiTokenCapable && !singleToken) {
                ListIterator<TsvToken> i = tokens.listIterator(tokens.indexOf(beginToken));
                TsvToken t;
                while ((t = i.next()) != endToken) {
                    if (t != beginToken) {
                        t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
                    }
                }
            }
            // Multi-token span annotations must get a disambiguation ID
            if (SPAN.equals(layerType) && !singleToken) {
                doc.addDisambiguationId(annotation);
            }
        }
    }
    // Scan all created units to see which columns actually contains values
    for (TsvSentence sentence : doc.getSentences()) {
        for (TsvToken token : sentence.getTokens()) {
            scanUnitForActiveColumns(token);
            scanUnitForAmbiguousSlotReferences(token);
            for (TsvSubToken subToken : token.getSubTokens()) {
                scanUnitForActiveColumns(subToken);
                scanUnitForAmbiguousSlotReferences(subToken);
            }
        }
    }
    // Activate the placeholder columns for any active types for which no other columns are
    // active.
    Set<Type> activeTypesNeedingPlaceholders = new HashSet<>(doc.getActiveTypes());
    for (TsvColumn col : doc.getActiveColumns()) {
        activeTypesNeedingPlaceholders.remove(col.uimaType);
    }
    for (TsvColumn col : doc.getSchema().getColumns()) {
        if (PLACEHOLDER.equals(col.featureType) && activeTypesNeedingPlaceholders.contains(col.uimaType)) {
            doc.activateColumn(col);
        }
    }
    return doc;
}
Also used : TsvFormatHeader(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvFormatHeader) ArrayList(java.util.ArrayList) TsvToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvToken) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) TsvSubToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken) TsvSentence(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSentence) TreeMap(java.util.TreeMap) TsvSubToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken) FeatureStructure(org.apache.uima.cas.FeatureStructure) AnnotationFS(org.apache.uima.cas.text.AnnotationFS) LayerType(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType) Type(org.apache.uima.cas.Type) TsvColumn(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvColumn) LayerType(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType) TsvDocument(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvDocument) TsvToken(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvToken) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) TsvSentence(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSentence) HashSet(java.util.HashSet)

Example 2 with LayerType

use of de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType in project webanno by webanno.

the class Tsv3XDeserializer method parseColumnDeclaration.

/**
 * Parses a single column declaration from a layer header into a {@link TsvColumn}.
 * <p>
 * The declaration is matched against the following forms, in this order: slot role, relation
 * reference, chain element type, chain link type, slot target and finally a primitive feature
 * of the given UIMA type.
 *
 * @param aJCas
 *            the target CAS whose type system is used to resolve type names.
 * @param aLayerType
 *            the layer type of the layer the column belongs to.
 * @param aUimaType
 *            the UIMA type of the layer the column belongs to.
 * @param aIndex
 *            the index assigned to the new column.
 * @param aColDecl
 *            the column declaration text.
 * @param aPrevCol
 *            the previously parsed column of the same layer, or {@code null} if there is none;
 *            a slot target column requires this to be a slot role column.
 * @return the column described by the declaration.
 * @throws IOException
 *             if the declaration is malformed, refers to a type or feature that does not exist,
 *             or if a slot target column does not follow a slot role column.
 */
private TsvColumn parseColumnDeclaration(JCas aJCas, LayerType aLayerType, Type aUimaType, int aIndex, String aColDecl, TsvColumn aPrevCol) throws IOException {
    TypeSystem ts = aJCas.getTypeSystem();
    TsvColumn column;
    if (SPAN.equals(aLayerType) && startsWith(aColDecl, HEADER_PREFIX_ROLE)) {
        // SLOT_ROLE - starts with "ROLE_"
        String[] subFields = splitPreserveAllTokens(aColDecl, '_');
        // The feature name is read from the second field and the target type name from the
        // third, so fewer than three fields means the declaration is malformed.
        if (subFields.length < 3) {
            throw new IOException("Malformed slot role column declaration [" + aColDecl + "]");
        }
        String featureName = substringAfter(subFields[1], ":");
        Feature feat = aUimaType.getFeatureByBaseName(featureName);
        if (feat == null) {
            throw new IOException("CAS type [" + aUimaType.getName() + "] does not have a feature called [" + featureName + "]");
        }
        column = new TsvColumn(aIndex, aUimaType, aLayerType, featureName, SLOT_ROLE);
        String typeName = subFields[2];
        Type type = ts.getType(typeName);
        if (type == null) {
            throw new IOException("CAS does not contain a type called [" + typeName + "]");
        }
        column.setTargetTypeHint(type);
    } else if (RELATION.equals(aLayerType) && startsWith(aColDecl, HEADER_PREFIX_BASE_TYPE)) {
        // RELATION_REF - starts with the base type prefix, followed by the type name
        column = new TsvColumn(aIndex, aUimaType, aLayerType, FEAT_REL_SOURCE, RELATION_REF);
        String typeName = substringAfter(aColDecl, HEADER_PREFIX_BASE_TYPE);
        Type type = ts.getType(typeName);
        if (type == null) {
            throw new IOException("CAS does not contain a type called [" + typeName + "]");
        }
        column.setTargetTypeHint(type);
    } else if (CHAIN.equals(aLayerType) && COREFERENCE_TYPE_FEATURE.equals(aColDecl)) {
        // CHAIN_ELEMENT_TYPE - "referenceType"
        column = new TsvColumn(aIndex, aUimaType, aLayerType, COREFERENCE_TYPE_FEATURE, CHAIN_ELEMENT_TYPE);
    } else if (CHAIN.equals(aLayerType) && COREFERENCE_RELATION_FEATURE.equals(aColDecl)) {
        // CHAIN_LINK_TYPE - "referenceRelation"
        column = new TsvColumn(aIndex, aUimaType, aLayerType, COREFERENCE_RELATION_FEATURE, CHAIN_LINK_TYPE);
    } else if ((SPAN.equals(aLayerType) && aColDecl.contains(".")) || ts.getType(aColDecl) != null) {
        // SLOT_TARGET - name of the link target type
        // We may have got here only because the declaration contains a dot, so check that
        // the type name really exists in the target CAS.
        Type type = ts.getType(aColDecl);
        if (type == null) {
            throw new IOException("CAS type system does not contain a type named [" + aColDecl + "]");
        }
        // The previous column must be a slot role column because the feature name is taken
        // from it.
        if (aPrevCol == null || !SLOT_ROLE.equals(aPrevCol.featureType)) {
            throw new IOException("Slot target column declaration must follow slot role column declaration");
        }
        column = new TsvColumn(aIndex, aUimaType, aLayerType, aPrevCol.uimaFeature.getShortName(), SLOT_TARGET);
        column.setTargetTypeHint(type);
    } else if (aUimaType.getFeatureByBaseName(aColDecl) != null) {
        // PRIMITIVE - feature name
        column = new TsvColumn(aIndex, aUimaType, aLayerType, aColDecl, PRIMITIVE);
    } else {
        throw new IOException("Type [" + aUimaType.getName() + "] does not contain a feature called [" + aColDecl + "]");
    }
    return column;
}
Also used : TypeSystem(org.apache.uima.cas.TypeSystem) Type(org.apache.uima.cas.Type) LayerType(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType) TsvColumn(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvColumn) IOException(java.io.IOException) FSUtil.setFeature(org.apache.uima.fit.util.FSUtil.setFeature) FSUtil.getFeature(org.apache.uima.fit.util.FSUtil.getFeature) Feature(org.apache.uima.cas.Feature)

Example 3 with LayerType

use of de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType in project webanno by webanno.

the class Tsv3XDeserializer method readSchema.

/**
 * Reads the layer declarations from the header and builds the corresponding schema.
 * <p>
 * Lines are consumed until a blank line (or the end of the input) is reached or a line does not
 * start with one of the span, relation or chain layer prefixes. Each layer declaration yields
 * one column per declared field, or a single placeholder column if only the type name is
 * declared.
 *
 * @param aIn
 *            the reader positioned at the first layer declaration line.
 * @param aJCas
 *            the target CAS whose type system is used to resolve type names.
 * @return the schema containing all columns declared in the header.
 * @throws IOException
 *             if reading fails, if a layer declaration has no type name, if the type is not
 *             part of the CAS type system, or if a column declaration cannot be parsed.
 */
private TsvSchema readSchema(LineNumberReader aIn, JCas aJCas) throws IOException {
    TsvSchema schema = new TsvSchema();
    int columnIndex = 0;
    // Read the header line by line until a blank line or the end of the input
    for (String line = aIn.readLine(); !isBlank(line); line = aIn.readLine()) {
        LayerType layerType;
        // Determine layer type
        if (startsWith(line, HEADER_PREFIX_SPAN_LAYER)) {
            layerType = SPAN;
        } else if (startsWith(line, HEADER_PREFIX_RELATION_LAYER)) {
            layerType = RELATION;
        } else if (startsWith(line, HEADER_PREFIX_CHAIN_LAYER)) {
            layerType = CHAIN;
        } else {
            // End of header
            break;
        }
        // Split up layer declaration
        String rest = substringAfter(line, HEADER_LAYER_PREFIX_SEPARATOR);
        String[] fields = split(rest, HEADER_FIELD_SEPARATOR);
        // A declaration without any field has no type name; report it as a format error
        // instead of failing with an ArrayIndexOutOfBoundsException below.
        if (fields.length == 0) {
            throw new IOException("Layer declaration in line " + aIn.getLineNumber() + " does not specify a type name: [" + line + "]");
        }
        // Get the type name and the corresponding UIMA type from the type system of the
        // target CAS
        String typeName = fields[0];
        Type uimaType = aJCas.getTypeSystem().getType(typeName);
        if (uimaType == null) {
            throw new IOException("CAS type system does not contain a type named [" + typeName + "]");
        }
        // Parse the column declarations starting at the second field (the first is the
        // type name)
        TsvColumn prevColumn = null;
        for (int i = 1; i < fields.length; i++) {
            String colDecl = fields[i];
            TsvColumn col = parseColumnDeclaration(aJCas, layerType, uimaType, columnIndex, colDecl, prevColumn);
            schema.addColumn(col);
            columnIndex++;
            prevColumn = col;
        }
        // If there is no second field, then add a placeholder column
        if (fields.length == 1) {
            schema.addColumn(new TsvColumn(columnIndex, uimaType, layerType));
            columnIndex++;
        }
    }
    return schema;
}
Also used : Type(org.apache.uima.cas.Type) LayerType(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType) TsvColumn(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvColumn) LayerType(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType) TsvSchema(de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSchema) IOException(java.io.IOException)

Aggregations

LayerType (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.LayerType)3 TsvColumn (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvColumn)3 Type (org.apache.uima.cas.Type)3 IOException (java.io.IOException)2 TsvDocument (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvDocument)1 TsvFormatHeader (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvFormatHeader)1 TsvSchema (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSchema)1 TsvSentence (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSentence)1 TsvSubToken (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken)1 TsvToken (de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvToken)1 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)1 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 TreeMap (java.util.TreeMap)1 Feature (org.apache.uima.cas.Feature)1 FeatureStructure (org.apache.uima.cas.FeatureStructure)1 TypeSystem (org.apache.uima.cas.TypeSystem)1 AnnotationFS (org.apache.uima.cas.text.AnnotationFS)1 FSUtil.getFeature (org.apache.uima.fit.util.FSUtil.getFeature)1