Search in sources :

Example 31 with AnnotationFeature

use of de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature in project webanno by webanno.

the class AutomationUtil method otherFeatureClassifiers.

/**
 * Trains one MIRA model for every additional ("other") feature of the given template.
 * <p>
 * When additional layers are used as training features, the training document should be
 * auto-predicted with those layers. For example, if the train layer is Named Entity and the POS
 * layer is used as an additional feature, the training document should be predicted using the
 * POS layer documents for POS annotation.
 * <p>
 * For each feature returned by {@code aTemplate.getOtherFeatures()} the train file
 * {@code <feature id>.train} inside the MIRA directory of the template's train feature is used
 * as input, and the trained model is saved to the file returned by
 * {@code aAutomationService.getMiraModel(feature, true, null)}.
 *
 * @param aTemplate
 *            the template providing the train feature and the other features.
 * @param aRepository
 *            the repository (not used by this method).
 * @param aAutomationService
 *            the service used to resolve the MIRA directory, template file and model file.
 * @throws IOException
 *             propagated from the template creation, the service look-ups or the MIRA calls.
 * @throws ClassNotFoundException
 *             propagated from the calls made here (presumably MIRA model handling - TODO
 *             confirm).
 */
public static void otherFeatureClassifiers(MiraTemplate aTemplate, DocumentService aRepository, AutomationService aAutomationService) throws IOException, ClassNotFoundException {
    // Fixed training settings, identical for every feature.
    final int frequency = 2;
    final double sigma = 1;
    final int iterations = 10;
    final int beamSize = 0;
    final boolean maxPosteriors = false;
    final boolean randomInit = false;

    // NOTE(review): one Mira instance is shared by all features, so anything it keeps between
    // calls (e.g. the IOB scorer, once set) carries over to later features - confirm intended.
    Mira mira = new Mira();
    for (AnnotationFeature feature : aTemplate.getOtherFeatures()) {
        String templateName = createTemplate(feature, getMiraTemplateFile(feature, aAutomationService), 0);
        File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
        String trainName = new File(miraDir, feature.getId() + ".train").getAbsolutePath();
        String modelName = aAutomationService.getMiraModel(feature, true, null).getAbsolutePath();

        // Layers that are not locked to token offsets are scored with the IOB scorer.
        if (!feature.getLayer().isLockToTokenOffset()) {
            mira.setIobScorer();
        }
        mira.loadTemplates(templateName);
        mira.setClip(sigma);
        mira.maxPosteriors = maxPosteriors;
        mira.beamSize = beamSize;
        int numExamples = mira.count(trainName, frequency);
        // No initial model is ever loaded: the former "initial model name" was always empty,
        // so that branch could never run and has been removed.
        mira.initModel(randomInit);
        for (int iteration = 0; iteration < iterations; iteration++) {
            mira.train(trainName, iterations, numExamples, iteration);
            mira.averageWeights(iterations * numExamples);
        }
        mira.saveModel(modelName);
    }
}
Also used : File(java.io.File) Mira(edu.lium.mira.Mira) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)

Example 32 with AnnotationFeature

use of de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature in project webanno by webanno.

the class AutomationUtil method generateTrainDocument.

/**
 * Writes the MIRA training file for the train feature of the given template.
 * <p>
 * The file is only (re)generated when at least one of these holds: an unprocessed training
 * document exists for one of the other features or for the train feature, the project has any
 * source document, or an unprocessed tab-separated document exists for the train feature.
 * Otherwise the method returns without touching the file.
 * <p>
 * The training file is assembled from three sources, in this order: training documents of the
 * train feature that are not in tab-separated format, source documents whose state is
 * {@code CURATION_FINISHED}, and tab-separated documents of the train feature.
 *
 * @param aTemplate
 *            the template providing the train feature and the other features.
 * @param aRepository
 *            the service used to list the source documents of the project.
 * @param aCurationDocumentService
 *            the service used to read the curation CAS of finished source documents.
 * @param aAnnotationService
 *            the service providing the type adapter of the train feature's layer.
 * @param aAutomationService
 *            the service providing the MIRA directory, the training documents and the status.
 * @param aUserDao
 *            not used by this method.
 * @param aBase
 *            if {@code true}, lines are written without the train feature (base training
 *            document); otherwise the train feature is included and the processed flag and
 *            status counter are updated.
 * @throws IOException
 *             if the training file or a tab-separated document cannot be written or read.
 * @throws UIMAException
 *             propagated from reading the CASes.
 * @throws ClassNotFoundException
 *             propagated from the calls made here.
 * @throws AutomationException
 *             if a non-blank line of a tab-separated document does not have exactly two
 *             tab-separated columns.
 */
public static void generateTrainDocument(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, boolean aBase) throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    LOG.info("Starting to generate training document");
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    AnnotationFeature feature = aTemplate.getTrainFeature();
    boolean documentChanged = false;
    // A. training document for other train layers were changed
    for (AnnotationFeature otherFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument document : aAutomationService.listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(otherFeature)) {
                documentChanged = true;
                break;
            }
        }
    }
    // B. Training document for the main training layer were changed
    for (TrainingDocument document : aAutomationService.listTrainingDocuments(feature.getProject())) {
        if (!document.isProcessed() && (document.getFeature() != null && document.getFeature().equals(feature))) {
            documentChanged = true;
            break;
        }
    }
    // C. New Curation document arrives
    // NOTE(review): this is true whenever the project has any source document at all, so the
    // early return below is only reachable for projects without source documents - confirm.
    if (aRepository.listSourceDocuments(feature.getProject()).size() > 0) {
        documentChanged = true;
    }
    // D. tab-sep training documents
    for (TrainingDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(feature)) {
            documentChanged = true;
            break;
        }
    }
    if (!documentChanged) {
        return;
    }
    // NOTE(review): the base variant is written to "*.train.ft" and the feature variant to
    // "*.train.base"; the suffixes look swapped but are kept because other code reads them.
    File trainFile;
    if (aBase) {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.ft");
    } else {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.base");
    }
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    // try-with-resources guarantees the training file is flushed and closed on every exit path,
    // including exceptions thrown while reading any of the input documents.
    try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile))) {
        AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(feature.getLayer());
        // Training documents (Curated or webanno-compatible imported ones - read using UIMA)
        List<TrainingDocument> trainingDocuments = aAutomationService.listTrainingDocuments(feature.getProject());
        for (TrainingDocument trainingDocument : trainingDocuments) {
            if ((trainingDocument.getFeature() != null && trainingDocument.getFeature().equals(feature)) && !trainingDocument.getFormat().equals(WebAnnoConst.TAB_SEP)) {
                JCas jCas = aAutomationService.readTrainingAnnotationCas(trainingDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                }
                trainingDocument.setProcessed(!aBase);
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
        }
        // for curated documents
        List<SourceDocument> sourceDocuments = aRepository.listSourceDocuments(feature.getProject());
        int sourceDocsCount = 0;
        for (SourceDocument sourceDocument : sourceDocuments) {
            if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
                JCas jCas = aCurationDocumentService.readCurationCas(sourceDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                }
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
            sourceDocsCount++;
            // The total is the number of source documents being iterated here (the previous
            // message reported the size of the unrelated training document list).
            LOG.info("Processed source document " + sourceDocsCount + " of " + sourceDocuments.size());
        }
        // Tab-sep documents to be used as a target layer train document
        int goldStandardDocsCounter = 0;
        List<TrainingDocument> goldStandardDocs = aAutomationService.listTabSepDocuments(feature.getProject());
        for (TrainingDocument document : goldStandardDocs) {
            if (document.getFormat().equals(WebAnnoConst.TAB_SEP) && document.getFeature() != null && document.getFeature().equals(feature)) {
                File tabSepFile = new File(aAutomationService.getDocumentFolder(document), document.getName());
                // Close the reader of every tab-sep document once it has been consumed.
                try (FileReader tabSepReader = new FileReader(tabSepFile)) {
                    LineIterator it = IOUtils.lineIterator(tabSepReader);
                    while (it.hasNext()) {
                        String line = it.next();
                        if (line.trim().equals("")) {
                            // a blank line is copied as a blank line to the training file
                            trainOut.append("\n");
                        } else {
                            StringTokenizer st = new StringTokenizer(line, "\t");
                            if (st.countTokens() != 2) {
                                // trainOut is closed by the enclosing try-with-resources
                                throw new AutomationException("This is not a valid TAB-SEP document");
                            }
                            if (aBase) {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), ""));
                            } else {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken()));
                            }
                        }
                    }
                }
            }
            goldStandardDocsCounter++;
            LOG.info("Processed gold standard document " + goldStandardDocsCounter + " of " + goldStandardDocs.size());
        }
    }
    LOG.info("Completed generating training document");
}
Also used : FileWriter(java.io.FileWriter) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) JCas(org.apache.uima.jcas.JCas) LineIterator(org.apache.commons.io.LineIterator) AutomationStatus(de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus) BufferedWriter(java.io.BufferedWriter) AutomationTypeAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter) StringTokenizer(java.util.StringTokenizer) FileReader(java.io.FileReader) File(java.io.File) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) TrainingDocument(de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument)

Example 33 with AnnotationFeature

use of de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature in project webanno by webanno.

the class SuggestionViewPanel method addSuggestionColor.

/**
 * Assigns an {@link AnnotationState} (used as suggestion color) to every configuration of the
 * given {@link ConfigurationSet}s, per CAS group (user), and stores it in
 * {@code aSuggestionColors}.
 * <p>
 * Rules, evaluated in this order for each configuration of each CAS group:
 * <ol>
 * <li>if {@code aAgree} is set, the state is {@code AGREE};</li>
 * <li>if the mode is not {@code CURATION} (automation and correction projects), the state is
 * {@code DO_NOT_USE} when the set has exactly two CAS groups, otherwise {@code DISAGREE};</li>
 * <li>configurations of the curation user itself get no state;</li>
 * <li>otherwise the state is {@code USE} if the configuration is shared with the curation user,
 * {@code DISAGREE} if {@code aI} is set or the set does not contain the curation user at all,
 * and {@code DO_NOT_USE} in the remaining case.</li>
 * </ol>
 *
 * @param aProject
 *            the project used to look up the annotation layer of each feature structure.
 * @param aMode
 *            the current mode.
 * @param aCasMap
 *            the CASes by CAS group id, used to resolve the feature structures.
 * @param aSuggestionColors
 *            output map: CAS group id to (VID to state); missing inner maps are created.
 * @param aCfgSet
 *            the configuration sets to process.
 * @param aI
 *            forces {@code DISAGREE} for configurations not shared with the curation user
 *            (presumably marks incomplete sets - TODO confirm against the caller).
 * @param aAgree
 *            whether the given sets are agreeing sets.
 */
private void addSuggestionColor(Project aProject, Mode aMode, Map<String, JCas> aCasMap, Map<String, Map<VID, AnnotationState>> aSuggestionColors, Collection<ConfigurationSet> aCfgSet, boolean aI, boolean aAgree) {
    for (ConfigurationSet cs : aCfgSet) {
        for (String u : cs.getCasGroupIds()) {
            Map<VID, AnnotationState> colors = aSuggestionColors.get(u);
            if (colors == null) {
                colors = new HashMap<>();
                aSuggestionColors.put(u, colors);
            }
            for (Configuration c : cs.getConfigurations(u)) {
                FeatureStructure fs = c.getFs(u, aCasMap);
                AnnotationLayer layer = annotationService.getLayer(fs.getType().getName(), aProject);
                TypeAdapter typeAdapter = annotationService.getAdapter(layer);
                VID vid;
                // link FS
                if (c.getPosition().getFeature() != null) {
                    // The VID carries the index of the link feature within the adapter's
                    // feature list. If no feature matches, the index equals the list size.
                    int fi = 0;
                    for (AnnotationFeature f : typeAdapter.listFeatures()) {
                        if (f.getName().equals(c.getPosition().getFeature())) {
                            break;
                        }
                        fi++;
                    }
                    vid = new VID(WebAnnoCasUtil.getAddr(fs), fi, c.getAID(u).index);
                } else {
                    vid = new VID(WebAnnoCasUtil.getAddr(fs));
                }
                if (aAgree) {
                    colors.put(vid, AnnotationState.AGREE);
                    continue;
                }
                // From here on aAgree is known to be false, so it is not re-checked.
                // automation and correction projects
                if (!aMode.equals(Mode.CURATION)) {
                    if (cs.getCasGroupIds().size() == 2) {
                        colors.put(vid, AnnotationState.DO_NOT_USE);
                    } else {
                        colors.put(vid, AnnotationState.DISAGREE);
                    }
                    continue;
                }
                // this configuration agrees with the curation annotation
                boolean use = c.getCasGroupIds().contains(CURATION_USER);
                // this is the curation view itself: nothing to color
                if (u.equals(CURATION_USER)) {
                    continue;
                }
                if (use) {
                    colors.put(vid, AnnotationState.USE);
                } else if (aI) {
                    colors.put(vid, AnnotationState.DISAGREE);
                } else if (!cs.getCasGroupIds().contains(CURATION_USER)) {
                    colors.put(vid, AnnotationState.DISAGREE);
                } else {
                    colors.put(vid, AnnotationState.DO_NOT_USE);
                }
            }
        }
    }
}
Also used : Configuration(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.Configuration) AnnotationLayer(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationLayer) VID(de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.VID) FeatureStructure(org.apache.uima.cas.FeatureStructure) ConfigurationSet(de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ConfigurationSet) TypeAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.TypeAdapter) AnnotationState(de.tudarmstadt.ukp.clarin.webanno.ui.curation.component.model.AnnotationState) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)

Example 34 with AnnotationFeature

use of de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature in project webanno by webanno.

the class AutomationUtil method predict.

/**
 * Runs the trained MIRA model of the template's train feature over every source document of the
 * project and writes the predicted labels to the correction CAS of that document.
 * <p>
 * For each source document the input {@code <document id>.pred} in the MIRA directory is
 * tagged, the output is written to {@code <document id>.pred-pred}, the last space-separated
 * column of every non-blank output line is taken as the predicted label, and the labels are
 * applied to the current user's annotation CAS via
 * {@link #automate(JCas, AnnotationFeature, List)} before the CAS is stored as correction CAS.
 *
 * @param aTemplate
 *            the template providing the train feature.
 * @param aRepository
 *            the service used to list source documents and read annotation CASes.
 * @param aCorrectionDocumentService
 *            the service used to write the correction CAS.
 * @param aAutomationService
 *            the service providing the MIRA directory, the model file and the status.
 * @param aUserDao
 *            used to obtain the current user.
 * @throws UIMAException
 *             propagated from reading the CAS.
 * @throws ClassNotFoundException
 *             propagated from the calls made here.
 * @throws IOException
 *             if a MIRA file cannot be read or written, or if the annotation CAS of a document
 *             cannot be retrieved.
 * @throws AnnotationException
 *             if the predicted annotations cannot be created.
 */
public static void predict(MiraTemplate aTemplate, DocumentService aRepository, CorrectionDocumentService aCorrectionDocumentService, AutomationService aAutomationService, UserDao aUserDao) throws UIMAException, ClassNotFoundException, IOException, AnnotationException {
    AnnotationFeature layerFeature = aTemplate.getTrainFeature();
    File miraDir = aAutomationService.getMiraDir(layerFeature);
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    for (SourceDocument document : aRepository.listSourceDocuments(layerFeature.getProject())) {
        File predFile = new File(miraDir, document.getId() + ".pred");
        Mira mira = new Mira();
        int shiftColumns = 0;
        int nbest = 1;
        int beamSize = 0;
        boolean maxPosteriors = false;
        String modelName = aAutomationService.getMiraModel(layerFeature, false, null).getAbsolutePath();
        String testName = predFile.getAbsolutePath();
        File predcitedFile = new File(predFile.getAbsolutePath() + "-pred");
        // Both streams are closed before the prediction file is read back, so the output is
        // fully flushed and no file handle leaks when MIRA throws.
        try (PrintStream stream = new PrintStream(predcitedFile);
                BufferedReader input = new BufferedReader(new FileReader(testName))) {
            mira.loadModel(modelName);
            mira.setShiftColumns(shiftColumns);
            mira.nbest = nbest;
            mira.beamSize = beamSize;
            mira.maxPosteriors = maxPosteriors;
            mira.test(input, stream);
        }
        LOG.info("Prediction is wrtten to a MIRA File. To be done is writing back to the CAS");
        List<String> annotations = new ArrayList<>();
        try (FileReader predictedReader = new FileReader(predcitedFile)) {
            LineIterator it = IOUtils.lineIterator(predictedReader);
            while (it.hasNext()) {
                String line = it.next();
                if (line.trim().equals("")) {
                    continue;
                }
                // the predicted label is the last space-separated column of the line
                StringTokenizer st = new StringTokenizer(line, " ");
                String tag = "";
                while (st.hasMoreTokens()) {
                    tag = st.nextToken();
                }
                annotations.add(tag);
            }
        }
        LOG.info(annotations.size() + " Predictions found to be written to the CAS");
        User user = aUserDao.getCurrentUser();
        JCas jCas;
        try {
            AnnotationDocument annoDocument = aRepository.getAnnotationDocument(document, user);
            jCas = aRepository.readAnnotationCas(annoDocument);
        } catch (DataRetrievalFailureException e) {
            // Without a CAS there is nothing to write predictions to. The previous handler
            // went on with a null CAS and failed with a NullPointerException; fail with the
            // real cause instead.
            throw new IOException("Unable to retrieve the annotation CAS of source document [" + document.getId() + "]", e);
        }
        // Apply the predictions exactly once and account for the document exactly once.
        automate(jCas, layerFeature, annotations);
        LOG.info("Predictions found are written to the CAS");
        aCorrectionDocumentService.writeCorrectionCas(jCas, document);
        status.setAnnoDocs(status.getAnnoDocs() - 1);
    }
}
Also used : PrintStream(java.io.PrintStream) User(de.tudarmstadt.ukp.clarin.webanno.security.model.User) InputStreamReader(java.io.InputStreamReader) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) ArrayList(java.util.ArrayList) JCas(org.apache.uima.jcas.JCas) AnnotationDocument(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument) LineIterator(org.apache.commons.io.LineIterator) AutomationStatus(de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus) Mira(edu.lium.mira.Mira) StringTokenizer(java.util.StringTokenizer) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) DataRetrievalFailureException(org.springframework.dao.DataRetrievalFailureException) File(java.io.File) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)

Example 35 with AnnotationFeature

use of de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature in project webanno by webanno.

the class AutomationUtil method automate.

/**
 * Add new annotations to the CAS using the MIRA prediction. This is different from the add
 * methods in the {@link TypeAdapter}s in such a way that the begin and end offsets are always
 * exact, so there is no need to re-compute them.
 * <p>
 * All existing annotations of the feature's layer type are removed first. Labels are consumed
 * in token order, one label per token. For layers that are not locked to token offsets, or that
 * span multiple tokens, the labels are read as a B-/I-/O sequence and consecutive tokens with
 * the same label are merged into one annotation; a span that is still open after the last token
 * is written as well. For all other layers one annotation is created per token.
 *
 * @param aJcas
 *            the JCas.
 * @param aFeature
 *            the feature.
 * @param aLabelValues
 *            the predicted label of every token, in token order; must contain at least as many
 *            entries as the CAS has tokens.
 * @throws AnnotationException
 *             if the annotations could not be created/updated.
 * @throws IOException
 *             if an I/O error occurs.
 */
public static void automate(JCas aJcas, AnnotationFeature aFeature, List<String> aLabelValues) throws AnnotationException, IOException {
    String typeName = aFeature.getLayer().getName();
    String attachTypeName = aFeature.getLayer().getAttachType() == null ? null : aFeature.getLayer().getAttachType().getName();
    Type type = CasUtil.getType(aJcas.getCas(), typeName);
    Feature feature = type.getFeatureByBaseName(aFeature.getName());
    // remove existing annotations of this type, after all it is an
    // automation, no care
    clearAnnotations(aJcas, type);
    int i = 0;
    if (!aFeature.getLayer().isLockToTokenOffset() || aFeature.getLayer().isMultipleTokens()) {
        // Label of the span currently being collected, "O" when no span is open.
        String prevNe = "O";
        int begin = 0;
        int end = 0;
        for (Token token : select(aJcas, Token.class)) {
            String value = aLabelValues.get(i);
            i++;
            boolean spanOpen = !prevNe.equals("O");
            if (value.equals("O")) {
                // an outside token closes the open span, if any
                if (spanOpen) {
                    addAutomatedSpan(aJcas, type, feature, begin, end, stripIobMarkers(prevNe));
                    prevNe = "O";
                }
                continue;
            }
            // Same label without an explicit "B-" start marker: the token extends the open span.
            if (spanOpen && !value.startsWith("B-") && stripIobMarkers(value).equals(stripIobMarkers(prevNe))) {
                end = token.getEnd();
                continue;
            }
            // Otherwise a new span starts at this token; write the previous one first.
            if (spanOpen) {
                addAutomatedSpan(aJcas, type, feature, begin, end, stripIobMarkers(prevNe));
            }
            prevNe = value;
            begin = token.getBegin();
            end = token.getEnd();
        }
        // A span reaching the last token has no following token to close it.
        if (!prevNe.equals("O")) {
            addAutomatedSpan(aJcas, type, feature, begin, end, stripIobMarkers(prevNe));
        }
    } else {
        // check if annotation is on an AttachType
        Feature attachFeature = null;
        Type attachType;
        if (attachTypeName != null) {
            attachType = CasUtil.getType(aJcas.getCas(), attachTypeName);
            // NOTE(review): the attach *type* name is used as the feature base name here; this
            // looks like it should be the name of the layer's attach feature - confirm.
            attachFeature = attachType.getFeatureByBaseName(attachTypeName);
        }
        for (Token token : select(aJcas, Token.class)) {
            AnnotationFS newAnnotation = aJcas.getCas().createAnnotation(type, token.getBegin(), token.getEnd());
            newAnnotation.setFeatureValueFromString(feature, aLabelValues.get(i));
            i++;
            if (attachFeature != null) {
                token.setFeatureValue(attachFeature, newAnnotation);
            }
            aJcas.getCas().addFsToIndexes(newAnnotation);
        }
    }
}

/**
 * Creates an annotation of the given type over {@code [aBegin, aEnd)}, sets the label feature
 * and adds the annotation to the CAS indexes.
 */
private static void addAutomatedSpan(JCas aJcas, Type aType, Feature aFeature, int aBegin, int aEnd, String aLabel) {
    AnnotationFS span = aJcas.getCas().createAnnotation(aType, aBegin, aEnd);
    span.setFeatureValueFromString(aFeature, aLabel);
    aJcas.getCas().addFsToIndexes(span);
}

/**
 * Removes the "B-" and "I-" markers from a predicted tag so that only the label remains.
 */
private static String stripIobMarkers(String aTag) {
    return aTag.replace("B-", "").replace("I-", "");
}
Also used : AnnotationFS(org.apache.uima.cas.text.AnnotationFS) Type(org.apache.uima.cas.Type) CasUtil.getType(org.apache.uima.fit.util.CasUtil.getType) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Feature(org.apache.uima.cas.Feature) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)

Aggregations

AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)73 AnnotationLayer (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationLayer)34 Feature (org.apache.uima.cas.Feature)20 ArrayList (java.util.ArrayList)16 Type (org.apache.uima.cas.Type)16 AnnotationFS (org.apache.uima.cas.text.AnnotationFS)15 TagSet (de.tudarmstadt.ukp.clarin.webanno.model.TagSet)12 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)11 JCas (org.apache.uima.jcas.JCas)10 AnnotatorState (de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState)9 File (java.io.File)9 CasUtil.getType (org.apache.uima.fit.util.CasUtil.getType)8 LinkWithRoleModel (de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.LinkWithRoleModel)6 SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument)6 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)6 List (java.util.List)6 FeatureStructure (org.apache.uima.cas.FeatureStructure)6 ArcAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.ArcAdapter)5 TypeAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.TypeAdapter)5 FeatureSupportRegistry (de.tudarmstadt.ukp.clarin.webanno.api.annotation.feature.FeatureSupportRegistry)5