Search in sources :

Example 6 with AnnotationUnit

use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.

the class WebannoTsv3Writer method addChinFeatureAnno.

/**
 * Records the feature values of one coreference chain link for the annotation unit(s) it
 * belongs to.
 * <p>
 * An entry for the type name is first ensured in {@code featurePerLayer}. If {@code aUnit} is
 * one of the known {@code units}, the link is written for that unit directly. Otherwise the
 * span of {@code aFs} is resolved into sub-units via {@code getSubUnits} and the link is
 * written once per resulting unit, flagging only the first one as "first".
 *
 * @param aAnnotationsPertype
 *            store annotations per type associated with the annotation units
 * @param aType
 *            the coreference annotation type
 * @param aFs
 *            the feature structure
 * @param aUnit
 *            the current annotation unit of the coreference chain
 * @param aLinkNo
 *            a reference to the link in a chain, starting at one for the first link and n for
 *            the last link in the chain
 * @param achainNo
 *            a reference to the chain, starting at 1 for the first chain and n for the last
 *            chain, where n is the number of coreference chains in the document
 */
private void addChinFeatureAnno(Map<AnnotationUnit, List<List<String>>> aAnnotationsPertype, Type aType, AnnotationFS aFs, AnnotationUnit aUnit, int aLinkNo, int achainNo) {
    featurePerLayer.putIfAbsent(aType.getName(), new LinkedHashSet<>());

    // The annotation coincides with a known unit (per-token annotation): write it as-is.
    if (units.contains(aUnit)) {
        setChainAnnoPerFeature(aAnnotationsPertype, aType, aFs, aUnit, aLinkNo, achainNo, false, false);
        return;
    }

    // The annotation is on a sub-token or spans multiple tokens: describe its span and
    // write the link for every unit that getSubUnits yields for it.
    SubTokenAnno span = new SubTokenAnno();
    span.setBegin(aFs.getBegin());
    span.setEnd(aFs.getEnd());
    span.setText(aFs.getCoveredText());

    boolean spansSeveralTokens = isMultiToken(aFs);
    Set<AnnotationUnit> collectedUnits = new LinkedHashSet<>();
    boolean firstUnit = true;
    for (AnnotationUnit subUnit : getSubUnits(span, collectedUnits)) {
        setChainAnnoPerFeature(aAnnotationsPertype, aType, aFs, subUnit, aLinkNo, achainNo, spansSeveralTokens, firstUnit);
        firstUnit = false;
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) AnnotationUnit(de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit)

Example 7 with AnnotationUnit

use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.

the class WebannoTsv3Writer method getFirstUnit.

/**
 * Returns the first unit that {@code getSubUnits} yields for the span covered by the given
 * feature structure.
 *
 * @param targetFs
 *            the annotation whose begin, end and covered text describe the span
 * @return the first unit produced for the span, or {@code null} if none is produced
 */
private AnnotationUnit getFirstUnit(AnnotationFS targetFs) {
    SubTokenAnno span = new SubTokenAnno();
    span.setBegin(targetFs.getBegin());
    span.setEnd(targetFs.getEnd());
    span.setText(targetFs.getCoveredText());

    Set<AnnotationUnit> collectedUnits = new LinkedHashSet<>();
    // Only the first element matters, so leave on the first iteration.
    for (AnnotationUnit candidate : getSubUnits(span, collectedUnits)) {
        return candidate;
    }
    return null;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) AnnotationUnit(de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit)

Example 8 with AnnotationUnit

use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.

the class WebannoTsv3Reader method addAnnotations.

/**
 * Importing span annotations including slot annotations.
 * <p>
 * For every type in {@code annotationsPerPostion} and every unit recorded for that type, this
 * walks the features listed for the type in {@code allLayers} and applies the corresponding
 * column value (the j-th value stored for the unit) to the pre-created annotations found in
 * {@code aAnnosPerTypePerUnit}. A column value may hold several stacked annotations separated
 * by an unescaped {@code |}, and each of those may hold several slot values separated by an
 * unescaped {@code ;}. Depending on the feature, a value is handled as a slot role, a slot
 * target, a chain relation/link, a dependency governor, or a plain feature value. Once a type
 * has been processed, the per-unit, per-reference annotations collected in
 * {@code multiTokUnits} are stored in {@code annosPerRef} under that type.
 *
 * @param aJCas
 *            the JCas in which link feature structures are created and annotations are indexed
 * @param aAnnosPerTypePerUnit
 *            the annotations, looked up per type and then per unit, that receive the feature
 *            values
 */
private void addAnnotations(JCas aJCas, Map<Type, Map<AnnotationUnit, List<AnnotationFS>>> aAnnosPerTypePerUnit) {
    for (Type type : annotationsPerPostion.keySet()) {
        // Annotations of this type, keyed by the unit they were first seen on and then by
        // their reference number; consulted below to detect continuation units.
        Map<AnnotationUnit, Map<Integer, AnnotationFS>> multiTokUnits = new HashMap<>();
        // Running reference number for annotations of this type.
        int ref = 1;
        // to see if it is on multiple token
        AnnotationFS prevAnnoFs = null;
        for (AnnotationUnit unit : annotationsPerPostion.get(type).keySet()) {
            int end = unit.end;
            List<AnnotationFS> annos = aAnnosPerTypePerUnit.get(type).get(unit);
            // Index of the current feature's value in the unit's list of column values.
            int j = 0;
            // Last role-link feature seen for this unit; passed on when slot targets are added.
            Feature linkeF = null;
            Map<AnnotationFS, List<FeatureStructure>> linkFSesPerSlotAnno = new HashMap<>();
            // A layer without any feature is handled by a dedicated helper.
            if (allLayers.get(type).size() == 0) {
                ref = addAnnotationWithNoFeature(aJCas, type, unit, annos, multiTokUnits, end, ref);
                continue;
            }
            for (Feature feat : allLayers.get(type)) {
                String anno = annotationsPerPostion.get(type).get(unit).get(j);
                // A value of "_" is skipped; it only resets prevAnnoFs (see the else branch).
                if (!anno.equals("_")) {
                    // Index into annos of the stacked annotation currently being processed.
                    int i = 0;
                    // if it is a slot annotation (multiple slots per
                    // single annotation
                    // (Target1<--role1--Base--role2-->Target2)
                    int slot = 0;
                    boolean targetAdd = false;
                    // Split on "|" unless it is preceded by a backslash.
                    String stackedAnnoRegex = "(?<!\\\\)" + Pattern.quote("|");
                    String[] stackedAnnos = anno.split(stackedAnnoRegex);
                    for (String mAnnos : stackedAnnos) {
                        // Split on ";" unless it is preceded by a backslash.
                        String multipleSlotAnno = "(?<!\\\\)" + Pattern.quote(";");
                        for (String mAnno : mAnnos.split(multipleSlotAnno)) {
                            String depRef = "";
                            String multSpliter = "(?<!\\\\)" + Pattern.quote("[");
                            // is this slot target ambiguous?
                            boolean ambigTarget = false;
                            // An unescaped "[" introduces a bracketed suffix: the text between
                            // the first "[" and the last character is kept as depRef and,
                            // unless it contains "_", parsed as the new value of ref.
                            if (mAnno.split(multSpliter).length > 1) {
                                ambigTarget = true;
                                depRef = mAnno.substring(mAnno.indexOf("[") + 1, mAnno.length() - 1);
                                ref = depRef.contains("_") ? ref : Integer.valueOf(mAnno.substring(mAnno.indexOf("[") + 1, mAnno.length() - 1));
                                mAnno = mAnno.substring(0, mAnno.indexOf("["));
                            }
                            // "*" is replaced by null before the value is used.
                            if (mAnno.equals("*")) {
                                mAnno = null;
                            }
                            boolean isMultitoken = false;
                            // Look for an annotation already registered under the same ref; if
                            // one exists, this unit is treated as its continuation.
                            if (!multiTokUnits.isEmpty() && prevAnnoFs != null && prevAnnoFs.getBegin() != unit.begin) {
                                contAnno: for (AnnotationUnit u : multiTokUnits.keySet()) {
                                    for (Integer r : multiTokUnits.get(u).keySet()) {
                                        if (ref == r) {
                                            isMultitoken = true;
                                            prevAnnoFs = multiTokUnits.get(u).get(r);
                                            break contAnno;
                                        }
                                    }
                                }
                            }
                            if (isMultitoken) {
                                // Extend the existing annotation so that it ends at this unit.
                                Feature endF = type.getFeatureByBaseName(CAS.FEATURE_BASE_NAME_END);
                                prevAnnoFs.setIntValue(endF, end);
                                mAnno = getEscapeChars(mAnno);
                                prevAnnoFs.setFeatureValueFromString(feat, mAnno);
                                if (feat.getShortName().equals(REF_LINK)) {
                                    // since REF_REL do not start with BIO,
                                    // update it...
                                    annos.set(i, prevAnnoFs);
                                }
                                setAnnoRefPerUnit(unit, type, ref, prevAnnoFs);
                            } else {
                                if (roleLinks.containsKey(feat)) {
                                    // Slot role: create a link FS, set its "role" feature and
                                    // remember it for the annotation it belongs to.
                                    linkeF = feat;
                                    FeatureStructure link = aJCas.getCas().createFS(slotLinkTypes.get(feat));
                                    Feature roleFeat = link.getType().getFeatureByBaseName("role");
                                    mAnno = getEscapeChars(mAnno);
                                    link.setStringValue(roleFeat, mAnno);
                                    linkFSesPerSlotAnno.putIfAbsent(annos.get(i), new ArrayList<>());
                                    linkFSesPerSlotAnno.get(annos.get(i)).add(link);
                                } else if (roleTargets.containsKey(feat)) {
                                    // Slot target: complete the link created for the same slot
                                    // index with the annotation it points to.
                                    FeatureStructure link = linkFSesPerSlotAnno.get(annos.get(i)).get(slot);
                                    int customTypeNumber = 0;
                                    // With more than two "-"-separated parts, the last part is
                                    // parsed as a number used to look up the target type in
                                    // layerMaps; it is then cut off the value.
                                    if (mAnno.split("-").length > 2) {
                                        customTypeNumber = Integer.valueOf(mAnno.substring(mAnno.lastIndexOf("-") + 1));
                                        mAnno = mAnno.substring(0, mAnno.lastIndexOf("-"));
                                    }
                                    AnnotationUnit targetUnit = token2Units.get(mAnno);
                                    Type tType = null;
                                    if (customTypeNumber == 0) {
                                        tType = roleTargets.get(feat);
                                    } else {
                                        tType = layerMaps.get(customTypeNumber);
                                    }
                                    AnnotationFS targetFs;
                                    // With a bracketed reference the target is looked up by ref;
                                    // otherwise the first annotation on the target unit is used.
                                    if (ambigTarget) {
                                        targetFs = annosPerRef.get(tType).get(targetUnit).get(ref);
                                    } else {
                                        targetFs = annosPerRef.get(tType).get(targetUnit).entrySet().iterator().next().getValue();
                                    }
                                    link.setFeatureValue(feat, targetFs);
                                    addSlotAnnotations(linkFSesPerSlotAnno, linkeF);
                                    targetAdd = true;
                                    slot++;
                                } else if (feat.getShortName().equals(REF_REL)) {
                                    // Chain relation: the part after "->" is parsed as
                                    // <chain number>-<link number>.
                                    int chainNo = Integer.valueOf(mAnno.split("->")[1].split("-")[0]);
                                    int LinkNo = Integer.valueOf(mAnno.split("->")[1].split("-")[1]);
                                    chainAnnosPerTyep.putIfAbsent(type, new TreeMap<>());
                                    // Skip a link that has already been recorded for this chain.
                                    if (chainAnnosPerTyep.get(type).get(chainNo) != null && chainAnnosPerTyep.get(type).get(chainNo).get(LinkNo) != null) {
                                        continue;
                                    }
                                    String refRel = mAnno.split("->")[0];
                                    refRel = getEscapeChars(refRel);
                                    if (refRel.equals("*")) {
                                        refRel = null;
                                    }
                                    annos.get(i).setFeatureValueFromString(feat, refRel);
                                    chainAnnosPerTyep.putIfAbsent(type, new TreeMap<>());
                                    chainAnnosPerTyep.get(type).putIfAbsent(chainNo, new TreeMap<>());
                                    chainAnnosPerTyep.get(type).get(chainNo).put(LinkNo, annos.get(i));
                                } else if (feat.getShortName().equals(REF_LINK)) {
                                    mAnno = getEscapeChars(mAnno);
                                    annos.get(i).setFeatureValueFromString(feat, mAnno);
                                    aJCas.addFsToIndexes(annos.get(i));
                                } else if (depFeatures.get(type) != null && depFeatures.get(type).equals(feat)) {
                                    // Dependency feature: depRef, when present, is split on "_"
                                    // into the reference numbers used for governor (g) and
                                    // dependent (d).
                                    int g = depRef.isEmpty() ? 0 : Integer.valueOf(depRef.split("_")[0]);
                                    int d = depRef.isEmpty() ? 0 : Integer.valueOf(depRef.split("_")[1]);
                                    Type depType = depTypess.get(type);
                                    AnnotationUnit govUnit = token2Units.get(mAnno);
                                    // The last column value stored for this unit is used to
                                    // look up the dependent's unit.
                                    int l = annotationsPerPostion.get(type).get(unit).size();
                                    String thisUnit = annotationsPerPostion.get(type).get(unit).get(l - 1);
                                    AnnotationUnit depUnit = token2Units.get(thisUnit);
                                    AnnotationFS govFs;
                                    AnnotationFS depFs;
                                    if (depType.getName().equals(POS.class.getName())) {
                                        // When the attached type is POS, the tokens themselves
                                        // are used as governor and dependent.
                                        depType = aJCas.getCas().getTypeSystem().getType(Token.class.getName());
                                        govFs = units2Tokens.get(govUnit);
                                        depFs = units2Tokens.get(unit);
                                    } else // in WebAnno world :)(!
                                    if (depType.getName().equals(Token.class.getName())) {
                                        govFs = units2Tokens.get(govUnit);
                                        depFs = units2Tokens.get(unit);
                                    } else if (g == 0 && d == 0) {
                                        govFs = annosPerRef.get(depType).get(govUnit).entrySet().iterator().next().getValue();
                                        depFs = annosPerRef.get(depType).get(depUnit).entrySet().iterator().next().getValue();
                                    } else if (g == 0) {
                                        govFs = annosPerRef.get(depType).get(govUnit).entrySet().iterator().next().getValue();
                                        depFs = annosPerRef.get(depType).get(depUnit).get(d);
                                    } else {
                                        govFs = annosPerRef.get(depType).get(govUnit).get(g);
                                        depFs = annosPerRef.get(depType).get(depUnit).entrySet().iterator().next().getValue();
                                    }
                                    annos.get(i).setFeatureValue(feat, depFs);
                                    annos.get(i).setFeatureValue(type.getFeatureByBaseName(GOVERNOR), govFs);
                                    // Adjust the relation's begin or end to the dependent's
                                    // offsets, depending on which side the dependent lies.
                                    if (depFs.getBegin() <= annos.get(i).getBegin()) {
                                        Feature beginF = type.getFeatureByBaseName(CAS.FEATURE_BASE_NAME_BEGIN);
                                        annos.get(i).setIntValue(beginF, depFs.getBegin());
                                    } else {
                                        Feature endF = type.getFeatureByBaseName(CAS.FEATURE_BASE_NAME_END);
                                        annos.get(i).setIntValue(endF, depFs.getEnd());
                                    }
                                    aJCas.addFsToIndexes(annos.get(i));
                                } else {
                                    // Plain feature value: register the annotation under the
                                    // current ref, set the value and add it to the indexes.
                                    mAnno = getEscapeChars(mAnno);
                                    multiTokUnits.putIfAbsent(unit, new HashMap<>());
                                    multiTokUnits.get(unit).put(ref, annos.get(i));
                                    prevAnnoFs = annos.get(i);
                                    annos.get(i).setFeatureValueFromString(feat, mAnno);
                                    aJCas.addFsToIndexes(annos.get(i));
                                    setAnnoRefPerUnit(unit, type, ref, annos.get(i));
                                }
                            }
                            // With stacked annotations, every processed value advances ref.
                            if (stackedAnnos.length > 1) {
                                ref++;
                            }
                        }
                        // For these token-attached types, also set the annotation on the
                        // token belonging to this unit.
                        if (type.getName().equals(POS.class.getName())) {
                            units2Tokens.get(unit).setPos((POS) annos.get(i));
                        }
                        if (type.getName().equals(Lemma.class.getName())) {
                            units2Tokens.get(unit).setLemma((Lemma) annos.get(i));
                        }
                        if (type.getName().equals(Stem.class.getName())) {
                            units2Tokens.get(unit).setStem((Stem) annos.get(i));
                        }
                        if (type.getName().equals(MorphologicalFeatures.class.getName())) {
                            units2Tokens.get(unit).setMorph((MorphologicalFeatures) annos.get(i));
                        }
                        i++;
                    }
                    // Start with a fresh link map once slot targets have been added.
                    if (targetAdd) {
                        linkFSesPerSlotAnno = new HashMap<>();
                    }
                } else {
                    prevAnnoFs = null;
                }
                j++;
            }
            // Advance ref for the next unit when the last feature of this unit left an
            // annotation in prevAnnoFs.
            if (prevAnnoFs != null) {
                ref++;
            }
        }
        annosPerRef.put(type, multiTokUnits);
    }
}
Also used : MorphologicalFeatures(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Feature(org.apache.uima.cas.Feature) Stem(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem) FeatureStructure(org.apache.uima.cas.FeatureStructure) AnnotationFS(org.apache.uima.cas.text.AnnotationFS) Type(org.apache.uima.cas.Type) AnnotationUnit(de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit) POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) Lemma(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma) ArrayList(java.util.ArrayList) List(java.util.List) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) TreeMap(java.util.TreeMap)

Example 9 with AnnotationUnit

use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.

the class WebannoTsv3Reader method setAnnotations.

/**
 * Reads the TSV content line by line, creating tokens and sentences and collecting the
 * annotation columns, then creates the annotations from the collected data.
 * <p>
 * Lines are dispatched by their prefix: {@code #T_} lines declare layers and features,
 * {@code #Text=} lines contribute sentence text, {@code #FORMAT=} lines select the format
 * variant, blank lines close the sentence collected so far, and every other line is treated
 * as a tab-separated token row. For a multi-token span annotation, based on the position of
 * the annotation in the line, only the end position of the annotation is updated later on.
 *
 * @param aJCas
 *            the JCas to populate
 * @param aIs
 *            the stream to read the TSV content from
 * @param aEncoding
 *            the character encoding of the stream
 * @throws IOException
 *             if a token row does not have the expected number of tab characters
 */
private void setAnnotations(JCas aJCas, InputStream aIs, String aEncoding) throws IOException {
    LineIterator rows = IOUtils.lineIterator(aIs, aEncoding);
    int sentBegin = -1;
    int sentEnd = 0;
    int prevSentEnd = 0;
    StringBuilder sentenceText = new StringBuilder();
    String lastSent = "";
    int format = -1;

    while (rows.hasNext()) {
        String line = rows.next();

        if (line.startsWith("#T_")) {
            // header information: layer and feature declaration
            setLayerAndFeature(aJCas, line);
        } else if (line.startsWith("#Text=")) {
            String text = line.substring(line.indexOf("=") + 1);
            if (format == 31) {
                text = unescapeJava(text);
            } else if (format == 32) {
                text = unEscapeSpecial(text);
            }
            // Text lines belonging to the same sentence are joined with LF.
            if (sentenceText.length() > 0) {
                sentenceText.append(LF);
            }
            sentenceText.append(text);
            lastSent = sentenceText.toString();
        } else if (line.startsWith("#FORMAT=")) {
            // An unrecognised format line leaves the current format untouched.
            switch (line) {
            case "#FORMAT=WebAnno TSV 3":
                format = 3;
                break;
            case "#FORMAT=WebAnno TSV 3.1":
                format = 31;
                break;
            case "#FORMAT=WebAnno TSV 3.2":
                format = 32;
                break;
            default:
                break;
            }
        } else if (line.trim().isEmpty()) {
            if (sentenceText.length() > 0) {
                createSentence(aJCas, sentenceText.toString(), sentBegin, sentEnd, prevSentEnd);
                prevSentEnd = sentEnd;
                // reset for next sentence begin
                sentBegin = -1;
                sentenceText = new StringBuilder();
            }
        } else {
            String row = line.trim();
            int count = StringUtils.countMatches(row, "\t");
            if (columns != count) {
                throw new IOException(fileName + " This is not a valid TSV File. check this line: " + row);
            }
            // NOTE(review): the "*" after the look-behind appears to make it optional, so
            // this presumably splits on every tab, escaped or not - confirm this is intended.
            String[] cells = row.split("(?<!\\\\)*" + Pattern.quote(TAB));
            // The second column holds the token offsets as "<begin>-<end>".
            String[] offsets = cells[1].split("-");
            int begin = Integer.parseInt(offsets[0]);
            int end = Integer.parseInt(offsets[1]);
            if (sentBegin == -1) {
                sentBegin = begin;
            }
            sentEnd = end;
            AnnotationUnit unit = createTokens(aJCas, cells, begin, end);
            setAnnosPerTypePerUnit(cells, unit, 3);
        }
    }

    // the last sentence
    if (!lastSent.isEmpty()) {
        createSentence(aJCas, lastSent, sentBegin, sentEnd, prevSentEnd);
    }

    Map<Type, Map<AnnotationUnit, List<AnnotationFS>>> annosPerTypePerUnit = new HashMap<>();
    setAnnosPerUnit(aJCas, annosPerTypePerUnit);
    addAnnotations(aJCas, annosPerTypePerUnit);
    addChainAnnotations(aJCas);
}
Also used : AnnotationFS(org.apache.uima.cas.text.AnnotationFS) Type(org.apache.uima.cas.Type) AnnotationUnit(de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) IOException(java.io.IOException) LineIterator(org.apache.commons.io.LineIterator) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) TreeMap(java.util.TreeMap)

Example 10 with AnnotationUnit

use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.

the class WebannoTsv3Reader method setAnnosPerUnit.

/**
 * Pre-creates the annotations for every type and unit recorded in
 * {@code annotationsPerPostion}.
 * <p>
 * For each unit, the number of annotations created equals the largest number of stacked
 * values (separated by an unescaped {@code |}) found in any of the unit's column values, and
 * is at least one. All of them span the unit's begin and end offsets.
 *
 * @param aJCas
 *            the JCas whose CAS is used to create the annotations
 * @param aAnnosPerTypePerUnit
 *            receives, per type, the created annotations keyed by unit
 */
private void setAnnosPerUnit(JCas aJCas, Map<Type, Map<AnnotationUnit, List<AnnotationFS>>> aAnnosPerTypePerUnit) {
    // Matches "|" unless it is preceded by a backslash.
    String stackSeparator = "(?<!\\\\)" + Pattern.quote("|");

    for (Type type : annotationsPerPostion.keySet()) {
        Map<AnnotationUnit, List<AnnotationFS>> createdPerUnit = new HashMap<>();

        for (AnnotationUnit unit : annotationsPerPostion.get(type).keySet()) {
            int spanBegin = unit.begin;
            int spanEnd = unit.end;

            // Find the deepest stack among the unit's column values.
            int stackSize = 1;
            for (String columnValue : annotationsPerPostion.get(type).get(unit)) {
                stackSize = Math.max(stackSize, columnValue.split(stackSeparator).length);
            }

            List<AnnotationFS> created = new ArrayList<>();
            while (created.size() < stackSize) {
                created.add(aJCas.getCas().createAnnotation(type, spanBegin, spanEnd));
            }
            createdPerUnit.put(unit, created);
        }

        aAnnosPerTypePerUnit.put(type, createdPerUnit);
    }
}
Also used : AnnotationFS(org.apache.uima.cas.text.AnnotationFS) Type(org.apache.uima.cas.Type) AnnotationUnit(de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) ArrayList(java.util.ArrayList) List(java.util.List)

Aggregations

AnnotationUnit (de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit)16 AnnotationFS (org.apache.uima.cas.text.AnnotationFS)9 ArrayList (java.util.ArrayList)8 Type (org.apache.uima.cas.Type)8 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)6 List (java.util.List)6 Feature (org.apache.uima.cas.Feature)6 LinkedHashSet (java.util.LinkedHashSet)5 CasUtil.getType (org.apache.uima.fit.util.CasUtil.getType)5 HashMap (java.util.HashMap)4 LinkedHashMap (java.util.LinkedHashMap)4 FeatureStructure (org.apache.uima.cas.FeatureStructure)3 IOException (java.io.IOException)2 Map (java.util.Map)2 TreeMap (java.util.TreeMap)2 MorphologicalFeatures (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures)1 POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)1 Lemma (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma)1 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)1 Stem (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem)1