Search in sources :

Example 1 with AnnotationUnit

use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.

Class WebannoTsv3Reader, method createTokens.

/**
 * Creates the {@link AnnotationUnit} for one input line and registers it.
 *
 * <p>The unit is appended to {@code units} and stored in {@code token2Units} under the id
 * found in {@code lines[0]}. When that id contains no dot, a {@link Token} spanning the same
 * offsets is also created, added to the CAS indexes and linked to the unit in
 * {@code units2Tokens}. Ids containing a dot (e.g. 1-2.1, a sub-token under token 2) get a
 * unit only and must not be treated as tokens.
 *
 * @param aJCas the JCas the token is created in
 * @param lines the columns of the current line; {@code lines[0]} is the unit id
 * @param begin begin offset of the unit
 * @param end end offset of the unit
 * @return the newly created unit
 */
private AnnotationUnit createTokens(JCas aJCas, String[] lines, int begin, int end) {
    String unitId = lines[0];
    boolean dottedId = unitId.contains(".");

    // Only ids without a dot are backed by a real Token annotation.
    Token token = dottedId ? null : new Token(aJCas, begin, end);

    AnnotationUnit unit = new AnnotationUnit(begin, end, dottedId, "");
    units.add(unit);
    token2Units.put(unitId, unit);

    if (token != null) {
        token.addToIndexes();
        units2Tokens.put(unit, token);
    }
    return unit;
}
Also used : AnnotationUnit(de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)

Example 2 with AnnotationUnit

use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.

Class WebannoTsv3Reader, method addAnnotationWithNoFeature.

/**
 * Processes the first column value recorded for {@code aType} at {@code aUnit} when the layer
 * has no feature columns.
 *
 * <p>The value is split on unescaped {@code |} and, within each part, on unescaped {@code ;}.
 * A piece ending in {@code ]} carries a reference in brackets: it becomes the current
 * reference number, or 0 when the bracket content contains {@code _}. If an annotation is
 * already registered in {@code aMultiTokUnits} under the current reference, its end offset is
 * moved to {@code aEnd}; otherwise the annotation taken from {@code aAnnos} is registered
 * under the reference and added to the CAS indexes. The reference number is incremented after
 * every piece.
 *
 * @param aJCas the JCas whose indexes receive newly started annotations
 * @param aType the annotation type of the layer
 * @param aUnit the unit (line) being processed
 * @param aAnnos candidate annotations, indexed by position among the {@code |}-separated parts
 * @param aMultiTokUnits annotations already started, per unit and reference number; updated
 *        in place
 * @param aEnd end offset to assign when an already started annotation is continued
 * @param aRef the current reference number
 * @return the reference number after processing this unit
 */
private int addAnnotationWithNoFeature(JCas aJCas, Type aType, AnnotationUnit aUnit, List<AnnotationFS> aAnnos, Map<AnnotationUnit, Map<Integer, AnnotationFS>> aMultiTokUnits, int aEnd, int aRef) {
    String cell = annotationsPerPostion.get(aType).get(aUnit).get(0);
    if (cell.equals("_")) {
        return aRef;
    }

    // Separators only count when they are not preceded by a backslash.
    String stackSeparator = "(?<!\\\\)" + Pattern.quote("|");
    String slotSeparator = "(?<!\\\\)" + Pattern.quote(";");

    int stackIndex = 0;
    for (String stacked : cell.split(stackSeparator)) {
        for (String piece : stacked.split(slotSeparator)) {
            if (piece.endsWith("]")) {
                int bracketAt = piece.indexOf("[");
                String refText = piece.substring(bracketAt + 1, piece.length() - 1);
                aRef = refText.contains("_") ? 0 : Integer.parseInt(refText);
                // Label without the reference suffix; not read again below.
                piece = piece.substring(0, bracketAt);
            }

            // Look for an annotation already registered under this reference. Only the
            // inner loop stops on a hit, so the last unit holding the reference wins.
            boolean continuesExisting = false;
            AnnotationFS existing = null;
            for (Map<Integer, AnnotationFS> byRef : aMultiTokUnits.values()) {
                for (Map.Entry<Integer, AnnotationFS> registered : byRef.entrySet()) {
                    if (aRef == registered.getKey()) {
                        continuesExisting = true;
                        existing = registered.getValue();
                        break;
                    }
                }
            }

            if (continuesExisting) {
                Feature endFeature = aType.getFeatureByBaseName(CAS.FEATURE_BASE_NAME_END);
                existing.setIntValue(endFeature, aEnd);
                setAnnoRefPerUnit(aUnit, aType, aRef, existing);
            } else {
                Map<Integer, AnnotationFS> byRef = aMultiTokUnits.computeIfAbsent(aUnit, key -> new HashMap<>());
                AnnotationFS started = aAnnos.get(stackIndex);
                byRef.put(aRef, started);
                aJCas.addFsToIndexes(started);
                setAnnoRefPerUnit(aUnit, aType, aRef, started);
            }
            aRef++;
        }
        stackIndex++;
    }
    return aRef;
}
Also used : AnnotationFS(org.apache.uima.cas.text.AnnotationFS) AnnotationUnit(de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit) Feature(org.apache.uima.cas.Feature)

Example 3 with AnnotationUnit

use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.

Class WebannoTsv3Writer, method setRelationAnnotation.

/**
 * Collects the column values of all relation annotations, layer by layer.
 *
 * <p>For every entry of {@code relationLayers} except the Token type, the governor and
 * dependent of each relation annotation are resolved to their units and reference numbers and
 * handed to {@code setRelationAnnoPerFeature}. The resulting per-unit values are stored back
 * into {@code annotationsPerPostion} under the layer name if any were produced.
 *
 * @param aJCas the JCas holding the relation annotations
 */
private void setRelationAnnotation(JCas aJCas) {
    final String tokenTypeName = Token.class.getName();

    for (String layerName : relationLayers) {
        if (layerName.equals(tokenTypeName)) {
            continue;
        }

        // Continue with the values already recorded for this layer, if there are any.
        Map<AnnotationUnit, List<List<String>>> perUnit = annotationsPerPostion.get(layerName);
        if (perUnit == null) {
            perUnit = new HashMap<>();
        }

        Type relationType = getType(aJCas.getCas(), layerName);

        // Locate the two endpoint features of the relation type by their short names.
        Feature dependentFeature = null;
        Feature governorFeature = null;
        for (Feature candidate : relationType.getFeatures()) {
            String shortName = candidate.getShortName();
            if (shortName.equals(DEPENDENT)) {
                dependentFeature = candidate;
            }
            if (shortName.equals(GOVERNOR)) {
                governorFeature = candidate;
            }
        }

        for (AnnotationFS relation : CasUtil.select(aJCas.getCas(), relationType)) {
            AnnotationFS dependent = (AnnotationFS) relation.getFeatureValue(dependentFeature);
            AnnotationFS governor = (AnnotationFS) relation.getFeatureValue(governorFeature);
            Type endpointType = governor.getType();

            // Prefer the first unit of each endpoint; fall back to the endpoint's own unit
            // when the first unit has no ambiguity entry under the governor's type.
            AnnotationUnit governorUnit = getFirstUnit(getUnit(governor.getBegin(), governor.getEnd(), governor.getCoveredText()));
            if (ambigUnits.get(endpointType.getName()).get(governorUnit) == null) {
                governorUnit = getUnit(governor.getBegin(), governor.getEnd(), governor.getCoveredText());
            }
            AnnotationUnit dependentUnit = getFirstUnit(getUnit(dependent.getBegin(), dependent.getEnd(), dependent.getCoveredText()));
            if (ambigUnits.get(endpointType.getName()).get(dependentUnit) == null) {
                dependentUnit = getUnit(dependent.getBegin(), dependent.getEnd(), dependent.getCoveredText());
            }

            // For the Dependency layer the reference lookups below use the POS type.
            if (relationType.getName().equals(Dependency.class.getName())) {
                endpointType = aJCas.getCas().getTypeSystem().getType(POS.class.getName());
            }

            // The WebAnno world never processes Token as an annotation, so Token endpoints
            // keep reference 0.
            boolean endpointIsToken = endpointType.getName().equals(tokenTypeName);

            int governorRef = 0;
            if (!endpointIsToken && ambigUnits.get(endpointType.getName()).get(governorUnit).equals(true)) {
                governorRef = annotaionRefPerType.get(endpointType).get(governor);
            }
            int dependentRef = 0;
            if (!endpointIsToken && ambigUnits.get(endpointType.getName()).get(dependentUnit).equals(true)) {
                dependentRef = annotaionRefPerType.get(endpointType).get(dependent);
            }

            setRelationAnnoPerFeature(perUnit, relationType, relation, dependentUnit, governorUnit, governorRef, dependentRef, endpointType);
        }

        if (!perUnit.isEmpty()) {
            annotationsPerPostion.put(layerName, perUnit);
        }
    }
}
Also used : AnnotationFS(org.apache.uima.cas.text.AnnotationFS) Type(org.apache.uima.cas.Type) CasUtil.getType(org.apache.uima.fit.util.CasUtil.getType) AnnotationUnit(de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) ArrayList(java.util.ArrayList) List(java.util.List) Dependency(de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency) Feature(org.apache.uima.cas.Feature)

Example 4 with AnnotationUnit

use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.

Class WebannoTsv3Writer, method getSubUnits.

/**
 * Derives the {@link AnnotationUnit}s needed to represent a sub-token annotation and adds them
 * to {@code aSubUnits}.
 *
 * <p>The annotation span is compared against the entries of the {@code units} field. Each new
 * unit is passed to {@code updateUnitLists} together with a working copy of {@code units},
 * and the field is replaced by a copy of that working list before returning. When the
 * annotation extends past a unit, or starts in a gap before one, the covered part is cut off
 * and the method recurses on the remainder.
 *
 * <p>NOTE(review): a recursive call reassigns {@code units} while the caller is still
 * iterating the list it started with, and the caller then overwrites {@code units} again with
 * its own working copy. Confirm against {@code updateUnitLists} that no update is lost.
 *
 * @param aSTA the sub-token annotation; its begin and text are modified when it is split
 * @param aSubUnits accumulator receiving the created units
 * @return {@code aSubUnits}
 */
private Set<AnnotationUnit> getSubUnits(SubTokenAnno aSTA, Set<AnnotationUnit> aSubUnits) {
    AnnotationUnit prevUnit = null;
    // Work on a copy so that `units` can be iterated while the list is being updated.
    List<AnnotationUnit> tmpUnits = new ArrayList<>(units);
    // Zero-width annotation: create a unit with empty text and register it against the first
    // existing unit whose begin and end are both at or after the annotation's offset.
    if (aSTA.getBegin() == aSTA.getEnd()) {
        AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), aSTA.getEnd(), false, "");
        for (AnnotationUnit unit : units) {
            if (unit.begin >= newUnit.begin && unit.end >= newUnit.end) {
                updateUnitLists(tmpUnits, unit, newUnit);
                aSubUnits.add(newUnit);
                units = new ArrayList<>(tmpUnits);
                return aSubUnits;
            }
        }
        // No matching unit found: fall through to the general loop below.
    }
    for (AnnotationUnit unit : units) {
        // First unit that ends after the annotation: a sub unit is created only if both start
        // at the same offset, and the scan stops here either way.
        if (unit.end > aSTA.end) {
            if (unit.begin == aSTA.begin) {
                AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), aSTA.getEnd(), false, aSTA.getText());
                updateUnitLists(tmpUnits, unit, newUnit);
                aSubUnits.add(newUnit);
            }
            break;
        }
        // this is a sub-token annotation
        if (unit.begin <= aSTA.getBegin() && aSTA.getBegin() <= unit.end && aSTA.getEnd() <= unit.end) {
            AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), aSTA.getEnd(), false, aSTA.getText());
            updateUnitLists(tmpUnits, unit, newUnit);
            aSubUnits.add(newUnit);
        } else // if sub-token annotation crosses multiple tokens
        if ((unit.begin <= aSTA.getBegin() && aSTA.getBegin() < unit.end && aSTA.getEnd() > unit.end)) {
            // Cut off the part that lies inside this unit ...
            int thisSubTextLen = unit.end - aSTA.begin;
            AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), unit.end, false, aSTA.getText().substring(0, thisSubTextLen));
            aSubUnits.add(newUnit);
            updateUnitLists(tmpUnits, unit, newUnit);
            // ... then move the annotation's begin to the value from getNextUnitBegin, drop
            // the consumed text and process the remainder recursively.
            aSTA.setBegin(getNextUnitBegin(aSTA.getBegin()));
            aSTA.setText(aSTA.getText().trim().substring(thisSubTextLen));
            getSubUnits(aSTA, aSubUnits);
        } else // empty annotation between tokens
        if (aSTA.getBegin() <= unit.begin && prevUnit != null && prevUnit.end < unit.begin) {
            // The annotation starts at or before this unit and there is a gap after the
            // previously remembered unit: emit the part up to this unit's begin, registered
            // against the previous unit, then recurse on the rest starting at this unit.
            int thisSubTextLen = unit.begin - aSTA.begin;
            AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), unit.begin, false, aSTA.getText().substring(0, thisSubTextLen));
            aSubUnits.add(newUnit);
            updateUnitLists(tmpUnits, prevUnit, newUnit);
            aSTA.setBegin(unit.begin);
            aSTA.setText(aSTA.getText().trim().substring(thisSubTextLen));
            getSubUnits(aSTA, aSubUnits);
        } else {
            // Nothing to emit for this unit; remember it for the gap check above.
            prevUnit = unit;
        }
    }
    // Publish the updated working list as the new `units`.
    units = new ArrayList<>(tmpUnits);
    return aSubUnits;
}
Also used : AnnotationUnit(de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit) ArrayList(java.util.ArrayList)

Example 5 with AnnotationUnit

use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.

Class WebannoTsv3Writer, method setSpanAnnoPerFeature.

/**
 * Builds the column values of one span annotation at one unit and appends them to
 * {@code aAnnotationsPertype}.
 *
 * <p>Each feature of {@code aType} contributes values, except sofa, begin, end and the
 * features named by {@code GOVERNOR}, {@code DEPENDENT}, {@code FIRST} and {@code NEXT}.
 * A slot feature contributes two values (roles and targets, each {@code ;}-separated, or
 * {@code _} when empty or unset). Any other feature contributes its escaped string value, or
 * {@code *} when unset, followed by {@code [ref]} when the reference number is positive. The
 * column names are recorded in {@code featurePerLayer}. A layer without any contributing
 * feature gets a single {@code *} placeholder value.
 *
 * @param aAnnotationsPertype per-unit value lists of the layer; updated in place
 * @param aType the annotation type of the layer
 * @param aFs the annotation being written
 * @param aUnit the unit the values belong to
 * @param aIsMultiToken passed through to {@code setAnnoFeature}
 * @param aIsFirst passed through to {@code setAnnoFeature}
 */
private void setSpanAnnoPerFeature(Map<AnnotationUnit, List<List<String>>> aAnnotationsPertype, Type aType, AnnotationFS aFs, AnnotationUnit aUnit, boolean aIsMultiToken, boolean aIsFirst) {
    final String layerName = aType.getName();
    List<String> columnValues = new ArrayList<>();
    featurePerLayer.putIfAbsent(layerName, new LinkedHashSet<>());

    // The reference number is dropped when the deciding unit is flagged false in ambigUnits:
    // the first unit if it has an entry, this unit otherwise.
    int ref = getRefId(aType, aFs, aUnit);
    if (ambigUnits.get(layerName).get(getFirstUnit(aUnit)) != null) {
        if (ambigUnits.get(layerName).get(getFirstUnit(aUnit)).equals(false)) {
            ref = 0;
        }
    } else if (ambigUnits.get(layerName).get(aUnit).equals(false)) {
        ref = 0;
    }

    for (Feature feature : aType.getFeatures()) {
        boolean notAColumn = feature.toString().equals("uima.cas.AnnotationBase:sofa")
                || feature.toString().equals("uima.tcas.Annotation:begin")
                || feature.toString().equals("uima.tcas.Annotation:end")
                || feature.getShortName().equals(GOVERNOR)
                || feature.getShortName().equals(DEPENDENT)
                || feature.getShortName().equals(FIRST)
                || feature.getShortName().equals(NEXT);
        if (notAColumn) {
            continue;
        }

        boolean isSlotFeature = slotFeatures != null && slotFeatures.contains(feature.getName());
        if (!isSlotFeature) {
            // Plain feature: escaped value (or "*") plus the optional reference suffix.
            String value = aFs.getFeatureValueAsString(feature);
            value = (value == null) ? "*" : replaceEscapeChars(value);
            value = value + (ref > 0 ? "[" + ref + "]" : "");
            setAnnoFeature(aIsMultiToken, aIsFirst, columnValues, value);
            featurePerLayer.get(layerName).add(feature.getShortName());
            continue;
        }

        // Slot feature: one column for the roles, one for the targets.
        ArrayFS links = (ArrayFS) aFs.getFeatureValue(feature);
        if (links == null) {
            columnValues.add("_");
            columnValues.add("_");
        } else {
            StringBuilder roles = new StringBuilder();
            StringBuilder targets = new StringBuilder();
            for (FeatureStructure link : links.toArray()) {
                String role = link.getStringValue(link.getType().getFeatureByBaseName("role"));
                AnnotationFS target = (AnnotationFS) link.getFeatureValue(link.getType().getFeatureByBaseName("target"));
                Type targetType = target.getType();
                AnnotationUnit targetUnit = getFirstUnit(target);

                // NOTE(review): this overwrites the method-level ref, which is then also used
                // for later features and the placeholder below. Confirm this is intended.
                ref = getRefId(targetType, target, targetUnit);
                if (ambigUnits.get(targetType.getName()).get(targetUnit).equals(false)) {
                    ref = 0;
                }

                role = (role == null) ? "*" : replaceEscapeChars(role);

                // Separate from the previous link once something has been written.
                if (roles.length() >= 1) {
                    roles.append(";");
                    targets.append(";");
                }
                roles.append(role);

                // Record the target's layer number only when the slot target type is the
                // generic uima.tcas.Annotation.
                int targetTypeNumber = 0;
                if (slotFeatureTypes.get(feature).getName().equals(CAS.TYPE_NAME_ANNOTATION)) {
                    targetTypeNumber = layerMaps.get(targetType);
                }
                targets.append(unitsLineNumber.get(targetUnit)).append(targetTypeNumber == 0 ? "" : "-" + targetTypeNumber).append(ref > 0 ? "[" + ref + "]" : "");
            }
            columnValues.add(roles.length() == 0 ? "_" : roles.toString());
            columnValues.add(targets.length() == 0 ? "_" : targets.toString());
        }
        featurePerLayer.get(layerName).add(ROLE + feature.getName() + "_" + slotLinkTypes.get(feature.getName()));
        featurePerLayer.get(layerName).add(slotFeatureTypes.get(feature).getName());
    }

    aAnnotationsPertype.putIfAbsent(aUnit, new ArrayList<>());
    // A layer without any feature column still needs a "*" placeholder value.
    if (columnValues.isEmpty()) {
        setAnnoFeature(aIsMultiToken, aIsFirst, columnValues, "*" + (ref > 0 ? "[" + ref + "]" : ""));
    }
    aAnnotationsPertype.get(aUnit).add(columnValues);
}
Also used : ArrayList(java.util.ArrayList) Feature(org.apache.uima.cas.Feature) FeatureStructure(org.apache.uima.cas.FeatureStructure) AnnotationFS(org.apache.uima.cas.text.AnnotationFS) Type(org.apache.uima.cas.Type) CasUtil.getType(org.apache.uima.fit.util.CasUtil.getType) AnnotationUnit(de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit) ArrayFS(org.apache.uima.cas.ArrayFS)

Aggregations

AnnotationUnit (de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit)16 AnnotationFS (org.apache.uima.cas.text.AnnotationFS)9 ArrayList (java.util.ArrayList)8 Type (org.apache.uima.cas.Type)8 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)6 List (java.util.List)6 Feature (org.apache.uima.cas.Feature)6 LinkedHashSet (java.util.LinkedHashSet)5 CasUtil.getType (org.apache.uima.fit.util.CasUtil.getType)5 HashMap (java.util.HashMap)4 LinkedHashMap (java.util.LinkedHashMap)4 FeatureStructure (org.apache.uima.cas.FeatureStructure)3 IOException (java.io.IOException)2 Map (java.util.Map)2 TreeMap (java.util.TreeMap)2 MorphologicalFeatures (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures)1 POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)1 Lemma (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma)1 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)1 Stem (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem)1