Search in sources :

Example 6 with ConfigurationSet

use of de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ConfigurationSet in project webanno by webanno.

the class SuggestionBuilder method buildCurationContainer.

/**
 * Builds the {@link CurationContainer} for the document of the given annotator state.
 * <p>
 * The method collects the CASes to compare (correction/automation CASes or the CASes of
 * finished annotation documents, depending on the mode), obtains the merge CAS, records the
 * segment boundaries and then runs a diff per segment. Every segment becomes a
 * {@link SourceListView} that is flagged {@code DISAGREE} when the diff contains stacked
 * annotations or incomplete configuration sets, and {@code AGREE} otherwise.
 *
 * @param aBModel
 *            the annotator state providing document, project, mode, window offsets and
 *            annotation layers.
 * @return the container holding one curation view per segment, keyed by segment begin offset.
 * @throws UIMAException
 *             declared by this method; presumably raised while accessing the CASes.
 * @throws ClassNotFoundException
 *             declared by this method; presumably raised while loading a CAS.
 * @throws IOException
 *             declared by this method; presumably raised while reading a CAS.
 * @throws AnnotationException
 *             declared by this method; presumably raised while merging annotations.
 */
public CurationContainer buildCurationContainer(AnnotatorState aBModel) throws UIMAException, ClassNotFoundException, IOException, AnnotationException {
    CurationContainer container = new CurationContainer();

    // Book-keeping structures filled by updateSegment(...)
    SourceDocument sourceDocument = aBModel.getDocument();
    Map<Integer, Integer> segmentBeginEnd = new HashMap<>();
    Map<Integer, Integer> segmentNumber = new HashMap<>();
    Map<String, Map<Integer, Integer>> segmentAdress = new HashMap<>();

    // Only annotation documents marked as FINISHED take part in curation
    List<AnnotationDocument> finishedAnnotationDocuments = new ArrayList<>();
    for (AnnotationDocument candidate : documentService.listAnnotationDocuments(aBModel.getDocument())) {
        if (!candidate.getState().equals(AnnotationDocumentState.FINISHED)) {
            continue;
        }
        finishedAnnotationDocuments.add(candidate);
    }

    // Never assigned in this method; kept as a typed local because it is handed to helpers
    AnnotationDocument randomAnnotationDocument = null;
    Map<String, JCas> jCases;
    JCas mergeJCas;

    boolean correctionLikeMode = aBModel.getMode().equals(Mode.AUTOMATION) || aBModel.getMode().equals(Mode.CORRECTION);
    if (correctionLikeMode) {
        // Correction/automation: segments come from the first CAS in the map, limited to the
        // current window
        jCases = listJcasesforCorrection(randomAnnotationDocument, sourceDocument, aBModel.getMode());
        mergeJCas = getMergeCas(aBModel, sourceDocument, jCases, randomAnnotationDocument, false);
        String username = jCases.keySet().iterator().next();
        JCas userCas = jCases.get(username);
        updateSegment(aBModel, segmentBeginEnd, segmentNumber, segmentAdress, userCas, username, aBModel.getWindowBeginOffset(), aBModel.getWindowEndOffset());
    } else {
        // Curation: segments come from the merge CAS, from the first sentence to the end of
        // the document text
        jCases = listJcasesforCuration(finishedAnnotationDocuments, randomAnnotationDocument, aBModel.getMode());
        mergeJCas = getMergeCas(aBModel, sourceDocument, jCases, randomAnnotationDocument, false);
        int firstSentenceBegin = WebAnnoCasUtil.getFirstSentence(mergeJCas).getBegin();
        int documentLength = mergeJCas.getDocumentText().length();
        updateSegment(aBModel, segmentBeginEnd, segmentNumber, segmentAdress, mergeJCas, WebAnnoConst.CURATION_USER, firstSentenceBegin, documentLength);
    }

    // Register the sentence addresses of the merge CAS under the curation user
    Map<Integer, Integer> curatorAddresses = new HashMap<>();
    segmentAdress.put(WebAnnoConst.CURATION_USER, curatorAddresses);
    for (Sentence sentence : selectCovered(mergeJCas, Sentence.class, diffRangeBegin, diffRangeEnd)) {
        curatorAddresses.put(sentence.getBegin(), getAddr(sentence));
    }

    List<Type> entryTypes = getEntryTypes(mergeJCas, aBModel.getAnnotationLayers(), annotationService);

    // for cross-sentences annotation, update the end of the segment (only on first load)
    if (firstload) {
        long start = System.currentTimeMillis();
        log.debug("Updating cross sentence annotation list...");
        updateCrossSentAnnoList(segmentBeginEnd, segmentNumber, jCases, entryTypes);
        firstload = false;
        log.debug("Cross sentence annotation list complete in {}ms", (System.currentTimeMillis() - start));
    }

    long diffStart = System.currentTimeMillis();
    log.debug("Calculating differences...");
    int processed = 0;
    for (Map.Entry<Integer, Integer> segment : segmentBeginEnd.entrySet()) {
        Integer begin = segment.getKey();
        Integer end = segment.getValue();

        processed++;
        if (processed % 100 == 0) {
            log.debug("Processing differences: {} of {} sentences...", processed, segmentBeginEnd.size());
        }

        DiffResult diff = CasDiff2.doDiffSingle(annotationService, aBModel.getProject(), entryTypes, LinkCompareBehavior.LINK_ROLE_AS_LABEL, jCases, begin, end);

        SourceListView curationSegment = new SourceListView();
        curationSegment.setBegin(begin);
        curationSegment.setEnd(end);
        curationSegment.setSentenceNumber(segmentNumber.get(begin));

        SentenceState state = SentenceState.AGREE;
        if (diff.hasDifferences() || !diff.getIncompleteConfigurationSets().isEmpty()) {
            // A differing set counts as "stacked" when one of its configurations is not
            // shared by the same number of CAS groups as the set itself
            boolean stackedDiff = false;
            for (ConfigurationSet differing : diff.getDifferingConfigurationSets().values()) {
                for (Configuration configuration : differing.getConfigurations()) {
                    if (configuration.getCasGroupIds().size() != differing.getCasGroupIds().size()) {
                        stackedDiff = true;
                        break;
                    }
                }
                if (stackedDiff) {
                    break;
                }
            }
            // Differences that are neither stacked nor incomplete still count as agreement
            if (stackedDiff || !diff.getIncompleteConfigurationSets().isEmpty()) {
                state = SentenceState.DISAGREE;
            }
        }
        curationSegment.setSentenceState(state);

        // Copy the address of this segment for every user that has segment addresses
        for (Map.Entry<String, Map<Integer, Integer>> userAddresses : segmentAdress.entrySet()) {
            curationSegment.getSentenceAddress().put(userAddresses.getKey(), userAddresses.getValue().get(begin));
        }
        container.getCurationViewByBegin().put(begin, curationSegment);
    }
    log.debug("Difference calculation completed in {}ms", (System.currentTimeMillis() - diffStart));
    return container;
}
Also used : Configuration(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.Configuration) HashMap(java.util.HashMap) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) ArrayList(java.util.ArrayList) JCas(org.apache.uima.jcas.JCas) AnnotationDocument(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument) ConfigurationSet(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ConfigurationSet) Type(org.apache.uima.cas.Type) DiffResult(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.DiffResult) HashMap(java.util.HashMap) Map(java.util.Map) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 7 with ConfigurationSet

use of de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ConfigurationSet in project webanno by webanno.

the class AgreementUtils method makeStudy.

/**
 * Builds a {@link CodingAnnotationStudy} for one feature of one annotation type from a diff
 * result and sorts every configuration set of the diff into one of the result categories
 * (complete, irrelevant, with differences, incomplete by position, incomplete by label,
 * plurality).
 * <p>
 * Positions belonging to a different type are ignored entirely. If no CAS is available or the
 * type is not part of the type system, all positions are reported as irrelevant and the study
 * stays empty.
 *
 * @param aDiff
 *            the diff result providing positions and their configuration sets.
 * @param aUsers
 *            the raters; a sorted copy determines the order of the values in each study item.
 * @param aType
 *            name of the annotation type to calculate agreement for.
 * @param aFeature
 *            base name of the feature to calculate agreement for.
 * @param aExcludeIncomplete
 *            if {@code true}, positions with a missing annotation or a {@code null} label are
 *            left out of the study; otherwise they are added with {@code null} values.
 * @param aNullLabelsAsEmpty
 *            if {@code true}, {@code null} labels are replaced by the empty string.
 * @param aCasMap
 *            the CASes per user, used to resolve feature structures.
 * @return the agreement result wrapping the study and the categorized configuration sets.
 * @throws IllegalArgumentException
 *             if the type exists but has no feature with the given base name.
 * @throws IllegalStateException
 *             if an unknown link compare behavior is encountered.
 */
private static AgreementResult makeStudy(DiffResult aDiff, Collection<String> aUsers, String aType, String aFeature, boolean aExcludeIncomplete, boolean aNullLabelsAsEmpty, Map<String, List<JCas>> aCasMap) {
    // Work on a sorted copy so that the column order of the study is stable
    List<String> users = new ArrayList<>(aUsers);
    Collections.sort(users);
    List<ConfigurationSet> completeSets = new ArrayList<>();
    List<ConfigurationSet> setsWithDifferences = new ArrayList<>();
    List<ConfigurationSet> incompleteSetsByPosition = new ArrayList<>();
    List<ConfigurationSet> incompleteSetsByLabel = new ArrayList<>();
    List<ConfigurationSet> pluralitySets = new ArrayList<>();
    List<ConfigurationSet> irrelevantSets = new ArrayList<>();
    CodingAnnotationStudy study = new CodingAnnotationStudy(users.size());
    // Check if the feature we are looking at is a primitive feature or a link feature
    // We do this by looking it up in the first available CAS. Mind that at this point all
    // CASes should have exactly the same typesystem.
    JCas someCas = findSomeCas(aCasMap);
    // Without any CAS, or if the requested type is unknown to the type system, there is
    // nothing to calculate agreement on: all positions are irrelevant
    if (someCas == null || someCas.getTypeSystem().getType(aType) == null) {
        for (Position p : aDiff.getPositions()) {
            irrelevantSets.add(aDiff.getConfigurtionSet(p));
        }
        return new AgreementResult(aType, aFeature, aDiff, study, users, completeSets, irrelevantSets, setsWithDifferences, incompleteSetsByPosition, incompleteSetsByLabel, pluralitySets, aExcludeIncomplete);
    }
    TypeSystem ts = someCas.getTypeSystem();
    // Check that the feature really exists instead of just getting a NPE later
    if (ts.getType(aType).getFeatureByBaseName(aFeature) == null) {
        throw new IllegalArgumentException("Type [" + aType + "] has no feature called [" + aFeature + "]");
    }
    boolean isPrimitiveFeature = ts.getType(aType).getFeatureByBaseName(aFeature).getRange().isPrimitive();
    nextPosition: for (Position p : aDiff.getPositions()) {
        ConfigurationSet cfgSet = aDiff.getConfigurtionSet(p);
        // Only calculate agreement for the given layer
        if (!cfgSet.getPosition().getType().equals(aType)) {
            // We don't even consider these as irrelevant, they are just filtered out
            continue nextPosition;
        }
        // If the feature on a position is set, then it is a subposition
        boolean isSubPosition = p.getFeature() != null;
        // Primitive features are only evaluated on primary positions and non-primitive
        // (link) features only on sub-positions; any other combination is irrelevant
        if (isPrimitiveFeature == isSubPosition) {
            irrelevantSets.add(cfgSet);
            continue nextPosition;
        }
        // A sub-position is only relevant if it belongs to the requested feature
        if (isSubPosition && !aFeature.equals(cfgSet.getPosition().getFeature())) {
            irrelevantSets.add(cfgSet);
            continue nextPosition;
        }
        // If none of the current users has made any annotation at this position, then skip it
        if (users.stream().noneMatch(u -> cfgSet.getCasGroupIds().contains(u))) {
            irrelevantSets.add(cfgSet);
            continue nextPosition;
        }
        // One slot per user, in the order of the sorted user list; slots start out as null
        Object[] values = new Object[users.size()];
        for (int i = 0; i < users.size(); i++) {
            String user = users.get(i);
            // The user has no annotation in this configuration set
            if (!cfgSet.getCasGroupIds().contains(user)) {
                incompleteSetsByPosition.add(cfgSet);
                if (aExcludeIncomplete) {
                    // Record as incomplete
                    continue nextPosition;
                }
                // Record as missing value: the slot simply stays null
                continue;
            }
            // Make sure a single user didn't do multiple alternative annotations at a single
            // position. So there is currently no support for calculating agreement on stacking
            // annotations.
            List<Configuration> cfgs = cfgSet.getConfigurations(user);
            if (cfgs.size() > 1) {
                pluralitySets.add(cfgSet);
                continue nextPosition;
            }
            Configuration cfg = cfgs.get(0);
            // Check if source and/or targets of a relation are stacked
            if (cfg.getPosition() instanceof ArcPosition) {
                ArcPosition pos = (ArcPosition) cfg.getPosition();
                FeatureStructure arc = cfg.getFs(user, pos.getCasId(), aCasMap);
                ArcDiffAdapter adapter = (ArcDiffAdapter) aDiff.getDiffAdapter(pos.getType());
                // Check if the source of the relation is stacked
                AnnotationFS source = FSUtil.getFeature(arc, adapter.getSourceFeature(), AnnotationFS.class);
                List<AnnotationFS> sourceCandidates = CasUtil.selectAt(arc.getCAS(), source.getType(), source.getBegin(), source.getEnd());
                if (sourceCandidates.size() > 1) {
                    pluralitySets.add(cfgSet);
                    continue nextPosition;
                }
                // Check if the target of the relation is stacked
                AnnotationFS target = FSUtil.getFeature(arc, adapter.getTargetFeature(), AnnotationFS.class);
                List<AnnotationFS> targetCandidates = CasUtil.selectAt(arc.getCAS(), target.getType(), target.getBegin(), target.getEnd());
                if (targetCandidates.size() > 1) {
                    pluralitySets.add(cfgSet);
                    continue nextPosition;
                }
            }
            // Only calculate agreement for the given feature
            FeatureStructure fs = cfg.getFs(user, cfg.getPosition().getCasId(), aCasMap);
            // BEGIN PARANOIA
            assert fs.getType().getFeatureByBaseName(aFeature).getRange().isPrimitive() == isPrimitiveFeature;
            // Primitive feature on a sub-position was filtered out above already
            assert !isPrimitiveFeature || !isSubPosition;
            // END PARANOIA
            if (isPrimitiveFeature && !isSubPosition) {
                // Primitive feature / primary position
                values[i] = getFeature(fs, aFeature);
            } else if (!isPrimitiveFeature && isSubPosition) {
                // Link feature / sub-position
                ArrayFS links = (ArrayFS) fs.getFeatureValue(fs.getType().getFeatureByBaseName(aFeature));
                FeatureStructure link = links.get(cfg.getAID(user).index);
                switch(cfg.getPosition().getLinkCompareBehavior()) {
                    case LINK_TARGET_AS_LABEL:
                        // FIXME The target feature name should be obtained from the feature
                        // definition!
                        AnnotationFS linkTarget = (AnnotationFS) link.getFeatureValue(link.getType().getFeatureByBaseName("target"));
                        values[i] = linkTarget.getBegin() + "-" + linkTarget.getEnd() + " [" + linkTarget.getCoveredText() + "]";
                        break;
                    case LINK_ROLE_AS_LABEL:
                        // FIXME The role feature name should be obtained from the feature
                        // definition!
                        String role = link.getStringValue(link.getType().getFeatureByBaseName("role"));
                        values[i] = role;
                        break;
                    default:
                        throw new IllegalStateException("Unknown link target comparison mode [" + cfg.getPosition().getLinkCompareBehavior() + "]");
                }
            } else {
                throw new IllegalStateException("Should never get here: primitive: " + fs.getType().getFeatureByBaseName(aFeature).getRange().isPrimitive() + "; subpos: " + isSubPosition);
            }
            // Optionally treat a missing label as the empty label so that it takes part in
            // the agreement calculation. The empty label is still a valid label.
            if (aNullLabelsAsEmpty && values[i] == null) {
                values[i] = "";
            }
            // "null" cannot be used in agreement calculations. We treat these as incomplete
            if (values[i] == null) {
                incompleteSetsByLabel.add(cfgSet);
                if (aExcludeIncomplete) {
                    continue nextPosition;
                }
            }
        }
        // The raters disagree as soon as any value deviates from the first one. Comparing
        // against every slot (instead of only the first two) avoids running past the end of
        // the array for a single rater and catches disagreement among more than two raters.
        boolean hasDifference = false;
        for (int k = 1; k < values.length; k++) {
            if (ObjectUtils.notEqual(values[0], values[k])) {
                hasDifference = true;
                break;
            }
        }
        if (hasDifference) {
            setsWithDifferences.add(cfgSet);
        }
        // If the position feature is set (sub-position), it must match the feature we
        // are calculating agreement over
        assert cfgSet.getPosition().getFeature() == null || cfgSet.getPosition().getFeature().equals(aFeature);
        completeSets.add(cfgSet);
        study.addItemAsArray(values);
    }
    return new AgreementResult(aType, aFeature, aDiff, study, users, completeSets, irrelevantSets, setsWithDifferences, incompleteSetsByPosition, incompleteSetsByLabel, pluralitySets, aExcludeIncomplete);
}
Also used : WebAnnoCasUtil.getFeature(de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getFeature) NominalDistanceFunction(de.tudarmstadt.ukp.dkpro.statistics.agreement.distance.NominalDistanceFunction) AnnotationFS(org.apache.uima.cas.text.AnnotationFS) ByteArrayOutputStream(java.io.ByteArrayOutputStream) KrippendorffAlphaAgreement(de.tudarmstadt.ukp.dkpro.statistics.agreement.coding.KrippendorffAlphaAgreement) FSUtil(org.apache.uima.fit.util.FSUtil) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) CodingAnnotationStudy(de.tudarmstadt.ukp.dkpro.statistics.agreement.coding.CodingAnnotationStudy) ByteArrayInputStream(java.io.ByteArrayInputStream) CSVFormat(org.apache.commons.csv.CSVFormat) Arrays.asList(java.util.Arrays.asList) ObjectUtils(org.apache.commons.lang3.ObjectUtils) Map(java.util.Map) ArcDiffAdapter(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ArcDiffAdapter) Configuration(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.Configuration) OutputStreamWriter(java.io.OutputStreamWriter) FeatureStructure(org.apache.uima.cas.FeatureStructure) ArcPosition(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ArcPosition) PrintStream(java.io.PrintStream) JCas(org.apache.uima.jcas.JCas) ICodingAnnotationStudy(de.tudarmstadt.ukp.dkpro.statistics.agreement.coding.ICodingAnnotationStudy) TypeSystem(org.apache.uima.cas.TypeSystem) ICodingAnnotationItem(de.tudarmstadt.ukp.dkpro.statistics.agreement.coding.ICodingAnnotationItem) ArrayFS(org.apache.uima.cas.ArrayFS) CohenKappaAgreement(de.tudarmstadt.ukp.dkpro.statistics.agreement.coding.CohenKappaAgreement) Collection(java.util.Collection) Position(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.Position) IAnnotationUnit(de.tudarmstadt.ukp.dkpro.statistics.agreement.IAnnotationUnit) IOException(java.io.IOException) FleissKappaAgreement(de.tudarmstadt.ukp.dkpro.statistics.agreement.coding.FleissKappaAgreement) 
CasUtil(org.apache.uima.fit.util.CasUtil) IAgreementMeasure(de.tudarmstadt.ukp.dkpro.statistics.agreement.IAgreementMeasure) ConfigurationSet(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ConfigurationSet) List(java.util.List) Entry(java.util.Map.Entry) DiffResult(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.DiffResult) Collections(java.util.Collections) CSVPrinter(org.apache.commons.csv.CSVPrinter) InputStream(java.io.InputStream) ExceptionUtils(org.apache.commons.lang3.exception.ExceptionUtils) TypeSystem(org.apache.uima.cas.TypeSystem) Configuration(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.Configuration) ArcDiffAdapter(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ArcDiffAdapter) ArcPosition(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ArcPosition) Position(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.Position) ArrayList(java.util.ArrayList) JCas(org.apache.uima.jcas.JCas) CodingAnnotationStudy(de.tudarmstadt.ukp.dkpro.statistics.agreement.coding.CodingAnnotationStudy) ICodingAnnotationStudy(de.tudarmstadt.ukp.dkpro.statistics.agreement.coding.ICodingAnnotationStudy) FeatureStructure(org.apache.uima.cas.FeatureStructure) ConfigurationSet(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ConfigurationSet) AnnotationFS(org.apache.uima.cas.text.AnnotationFS) ArcPosition(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ArcPosition) ArrayFS(org.apache.uima.cas.ArrayFS) ArrayList(java.util.ArrayList) Arrays.asList(java.util.Arrays.asList) List(java.util.List)

Aggregations

ConfigurationSet (de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ConfigurationSet)6 Configuration (de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.Configuration)4 ArrayList (java.util.ArrayList)4 Map (java.util.Map)4 DiffResult (de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.DiffResult)3 Position (de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.Position)3 HashMap (java.util.HashMap)3 FeatureStructure (org.apache.uima.cas.FeatureStructure)3 Type (org.apache.uima.cas.Type)3 JCas (org.apache.uima.jcas.JCas)3 ArcPosition (de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ArcPosition)2 AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)2 IAnnotationUnit (de.tudarmstadt.ukp.dkpro.statistics.agreement.IAnnotationUnit)2 ICodingAnnotationItem (de.tudarmstadt.ukp.dkpro.statistics.agreement.coding.ICodingAnnotationItem)2 List (java.util.List)2 ArrayFS (org.apache.uima.cas.ArrayFS)2 AnnotationFS (org.apache.uima.cas.text.AnnotationFS)2 TypeAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.TypeAdapter)1 VID (de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.VID)1 WebAnnoCasUtil.getFeature (de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getFeature)1