Search in sources :

Example 1 with AutomationTypeAdapter

use of de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter in project webanno by webanno.

the class AutomationUtil method generatePredictDocument.

// TODO: rename to predictDocument
/**
 * Writes one prediction input file per source document of the train feature's project.
 * <p>
 * For every source document a file named {@code <documentId>.pred.ft} is created in the MIRA
 * directory of the template's train feature. Each sentence of the document contributes one
 * line produced by {@code getMiraLine(sentence, null, adapter)}.
 * <p>
 * The correction CAS is used when it can be read; if reading it fails for any reason the
 * current user's annotation CAS is used instead (created on demand).
 *
 * @param aTemplate the template whose train feature determines the project, layer and MIRA directory
 * @param aRepository used to list source documents and to read the fallback annotation CAS
 * @param aCorrectionDocumentService used to read the correction CAS of each document
 * @param aAnnotationService used to obtain the type adapter of the train feature's layer
 * @param aAutomationService used to resolve the MIRA directory
 * @param aUserDao used to look up the current user for the fallback annotation document
 * @throws IOException if the MIRA directory or a prediction file cannot be written
 * @throws UIMAException declared by the CAS-reading calls
 * @throws ClassNotFoundException declared by the CAS-reading calls
 */
public static void generatePredictDocument(MiraTemplate aTemplate, DocumentService aRepository, CorrectionDocumentService aCorrectionDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws IOException, UIMAException, ClassNotFoundException {
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    User user = aUserDao.getCurrentUser();
    AnnotationFeature feature = aTemplate.getTrainFeature();
    AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(feature.getLayer());
    for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) {
        File predFile = new File(miraDir, document.getId() + ".pred.ft");
        // try-with-resources: the writer is closed even when reading the CAS or
        // building a line throws, so no file handle is leaked.
        try (BufferedWriter predOut = new BufferedWriter(new FileWriter(predFile))) {
            JCas jCas;
            try {
                jCas = aCorrectionDocumentService.readCorrectionCas(document);
            } catch (Exception e) {
                // Deliberate best-effort fallback: no readable correction CAS, so use
                // the current user's annotation CAS instead.
                AnnotationDocument annoDoc = aRepository.createOrGetAnnotationDocument(document, user);
                jCas = aRepository.readAnnotationCas(annoDoc);
            }
            for (Sentence sentence : select(jCas, Sentence.class)) {
                predOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
            }
        }
    }
}
Also used : User(de.tudarmstadt.ukp.clarin.webanno.security.model.User) FileWriter(java.io.FileWriter) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) JCas(org.apache.uima.jcas.JCas) AnnotationDocument(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument) File(java.io.File) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NoResultException(javax.persistence.NoResultException) AnnotationException(de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException) CASException(org.apache.uima.cas.CASException) UIMAException(org.apache.uima.UIMAException) DataRetrievalFailureException(org.springframework.dao.DataRetrievalFailureException) IOException(java.io.IOException) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) AutomationTypeAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter) BufferedWriter(java.io.BufferedWriter)

Example 2 with AutomationTypeAdapter

use of de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter in project webanno by webanno.

the class AutomationUtil method generateTrainDocument.

/**
 * Generates the MIRA training file for the template's train feature.
 * <p>
 * The file is only (re)written when at least one of the change checks A-D below fires;
 * otherwise the method returns without touching the file system beyond creating the MIRA
 * directory. Training lines are collected from three sources, in this order:
 * <ol>
 * <li>non-TAB-SEP training documents assigned to the train feature,</li>
 * <li>source documents whose state is {@code CURATION_FINISHED},</li>
 * <li>TAB-SEP documents assigned to the train feature (exactly two tab-separated columns per
 * non-blank line).</li>
 * </ol>
 *
 * @param aTemplate the template whose train feature (and other features) are inspected
 * @param aRepository used to list the project's source documents
 * @param aCurationDocumentService used to read the curation CAS of finished documents
 * @param aAnnotationService used to obtain the type adapter of the train feature's layer
 * @param aAutomationService used for the MIRA directory, training documents and status
 * @param aUserDao not used by this method
 * @param aBase if {@code true}, lines are generated without the train feature
 *        ({@code getMiraLine(sentence, null, adapter)}) and TAB-SEP labels are replaced by
 *        the empty string
 * @throws IOException if the training file or a TAB-SEP document cannot be accessed
 * @throws UIMAException declared by the CAS-reading calls
 * @throws ClassNotFoundException declared by the CAS-reading calls
 * @throws AutomationException if a non-blank TAB-SEP line does not have exactly two columns
 */
public static void generateTrainDocument(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, boolean aBase) throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    LOG.info("Starting to generate training document");
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    AnnotationFeature feature = aTemplate.getTrainFeature();
    boolean documentChanged = false;
    // A. training documents for other train layers were changed
    // (the break only leaves the inner loop; the flag stays set once raised)
    for (AnnotationFeature otherrFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument document : aAutomationService.listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(otherrFeature)) {
                documentChanged = true;
                break;
            }
        }
    }
    // B. training documents for the main training layer were changed
    for (TrainingDocument document : aAutomationService.listTrainingDocuments(feature.getProject())) {
        if (!document.isProcessed() && (document.getFeature() != null && document.getFeature().equals(feature))) {
            documentChanged = true;
            break;
        }
    }
    // C. a new curation document arrived
    // NOTE(review): as written this fires whenever the project has any source document at
    // all, so the early return below is only reachable for projects without source documents.
    if (aRepository.listSourceDocuments(feature.getProject()).size() > 0) {
        documentChanged = true;
    }
    // D. tab-sep training documents
    for (TrainingDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(feature)) {
            documentChanged = true;
            break;
        }
    }
    if (!documentChanged) {
        return;
    }
    // NOTE(review): the base run writes "*.train.ft" and the non-base run "*.train.base";
    // this looks inverted but is kept as-is because consumers of these files are not visible
    // here - confirm before changing.
    File trainFile;
    if (aBase) {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.ft");
    } else {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.base");
    }
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    // try-with-resources: the training file is closed on every exit path, including
    // exceptions thrown while reading a CAS or a TAB-SEP document.
    try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile))) {
        AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(feature.getLayer());
        // Training documents (curated or webanno-compatible imported ones - read using UIMA)
        List<TrainingDocument> trainingDocuments = aAutomationService.listTrainingDocuments(feature.getProject());
        for (TrainingDocument trainingDocument : trainingDocuments) {
            if ((trainingDocument.getFeature() != null && trainingDocument.getFeature().equals(feature)) && !trainingDocument.getFormat().equals(WebAnnoConst.TAB_SEP)) {
                JCas jCas = aAutomationService.readTrainingAnnotationCas(trainingDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                }
                // Only the non-base pass marks the document as processed and updates the
                // remaining-documents counter.
                trainingDocument.setProcessed(!aBase);
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
        }
        // for curated documents
        List<SourceDocument> sourceDocuments = aRepository.listSourceDocuments(feature.getProject());
        int sourceDocsCount = 0;
        for (SourceDocument sourceDocument : sourceDocuments) {
            if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
                JCas jCas = aCurationDocumentService.readCurationCas(sourceDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                }
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
            sourceDocsCount++;
            // Report progress against the list actually being iterated (source documents).
            LOG.info("Processed source document " + sourceDocsCount + " of " + sourceDocuments.size());
        }
        // Tab-sep documents to be used as a target layer train document
        int goldStandardDocsCounter = 0;
        List<TrainingDocument> goldStandardDocs = aAutomationService.listTabSepDocuments(feature.getProject());
        for (TrainingDocument document : goldStandardDocs) {
            if (document.getFormat().equals(WebAnnoConst.TAB_SEP) && document.getFeature() != null && document.getFeature().equals(feature)) {
                File tabSepFile = new File(aAutomationService.getDocumentFolder(document), document.getName());
                // Close the reader behind the line iterator once the document is consumed
                // or an error aborts the run.
                try (FileReader tabSepReader = new FileReader(tabSepFile)) {
                    LineIterator it = IOUtils.lineIterator(tabSepReader);
                    while (it.hasNext()) {
                        String line = it.next();
                        if (line.trim().equals("")) {
                            // blank line is copied through as an empty line
                            trainOut.append("\n");
                        } else {
                            StringTokenizer st = new StringTokenizer(line, "\t");
                            if (st.countTokens() != 2) {
                                // both the reader and the training file are closed by the
                                // enclosing try-with-resources statements
                                throw new AutomationException("This is not a valid TAB-SEP document");
                            }
                            if (aBase) {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), ""));
                            } else {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken()));
                            }
                        }
                    }
                }
            }
            goldStandardDocsCounter++;
            LOG.info("Processed gold standard document " + goldStandardDocsCounter + " of " + goldStandardDocs.size());
        }
    }
    LOG.info("Completed generating training document");
}
Also used : FileWriter(java.io.FileWriter) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) JCas(org.apache.uima.jcas.JCas) LineIterator(org.apache.commons.io.LineIterator) AutomationStatus(de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus) BufferedWriter(java.io.BufferedWriter) AutomationTypeAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter) StringTokenizer(java.util.StringTokenizer) FileReader(java.io.FileReader) File(java.io.File) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) TrainingDocument(de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument)

Example 3 with AutomationTypeAdapter

use of de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter in project webanno by webanno.

the class AutomationUtil method addOtherFeatureFromAnnotation.

/**
 * Reuses existing "other layer" annotations as a feature instead of training and predicting
 * them: the annotation values of {@code aFeature} are gathered sentence by sentence into one
 * list, which is then appended to {@code aPredictions}.
 * <p>
 * With a {@code null} source document (training) the values come from all training documents
 * of the feature's project; otherwise (prediction for the suggestion pane) they come from the
 * current user's annotation document for {@code aSourceDocument}.
 */
private static void addOtherFeatureFromAnnotation(AnnotationFeature aFeature, DocumentService aRepository, AutomationService aAutomationServic, AnnotationSchemaService aAnnotationService, UserDao aUserDao, List<List<String>> aPredictions, SourceDocument aSourceDocument) throws UIMAException, ClassNotFoundException, IOException {
    AutomationTypeAdapter typeAdapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(aFeature.getLayer());
    List<String> collected = new ArrayList<>();
    if (aSourceDocument != null) {
        // Prediction: a single source document, read through the current user's annotation CAS.
        User currentUser = aUserDao.getCurrentUser();
        AnnotationDocument annotationDocument = aRepository.createOrGetAnnotationDocument(aSourceDocument, currentUser);
        JCas documentCas = aRepository.readAnnotationCas(annotationDocument);
        appendOtherFeatureAnnotations(documentCas, aFeature, typeAdapter, collected);
    } else {
        // Training: every training document feeds the same list.
        for (TrainingDocument candidate : aAutomationServic.listTrainingDocuments(aFeature.getProject())) {
            JCas trainingCas = aAutomationServic.readTrainingAnnotationCas(candidate);
            appendOtherFeatureAnnotations(trainingCas, aFeature, typeAdapter, collected);
        }
    }
    aPredictions.add(collected);
}

/**
 * Appends the annotation values of {@code aFeature} for every sentence of {@code aJCas} to
 * {@code aTarget}, using the multi-token lookup of {@link SpanAdapter} when the feature's layer
 * allows multiple tokens and the adapter's plain lookup otherwise.
 */
private static void appendOtherFeatureAnnotations(JCas aJCas, AnnotationFeature aFeature, AutomationTypeAdapter aAdapter, List<String> aTarget) throws UIMAException, ClassNotFoundException, IOException {
    for (Sentence current : select(aJCas, Sentence.class)) {
        if (!aFeature.getLayer().isMultipleTokens()) {
            aTarget.addAll(aAdapter.getAnnotation(current, aFeature));
        } else {
            aTarget.addAll(((SpanAdapter) aAdapter).getMultipleAnnotation(current, aFeature).values());
        }
    }
}
Also used : User(de.tudarmstadt.ukp.clarin.webanno.security.model.User) ArrayList(java.util.ArrayList) JCas(org.apache.uima.jcas.JCas) SpanAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.SpanAdapter) AnnotationDocument(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) AutomationTypeAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter) TrainingDocument(de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument)

Example 4 with AutomationTypeAdapter

use of de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter in project webanno by webanno.

the class AutomationUtil method deleteSpanAnnotation.

/**
 * Deletes, in the correction CAS of every source document of the project, the annotation of
 * {@code aFeature} with value {@code aValue} at each case-insensitive occurrence of the text
 * selected in the current annotation document.
 * <p>
 * The selected text is taken from the current user's annotation CAS between {@code aStart} and
 * {@code aEnd}. For each document the correction CAS is loaded, every sentence is scanned for
 * the selected text, and a deletion is requested at each hit that covers at least one token.
 * The correction CAS is written back afterwards. Nothing is done when the selection is empty.
 *
 * @param aBModel the annotator state supplying document, user and project
 * @param aDocumentService used to read the annotation CAS and to list source documents
 * @param aCorrectionDocumentService used to read and write the correction CAS
 * @param aAnnotationService used to obtain the type adapter of the feature's layer
 * @param aStart begin offset of the selection in the annotation CAS
 * @param aEnd end offset of the selection in the annotation CAS
 * @param aFeature the feature whose annotation is deleted
 * @param aValue the feature value passed to the adapter's delete call
 * @throws UIMAException declared by the CAS-handling calls
 * @throws ClassNotFoundException declared by the CAS-handling calls
 * @throws IOException declared by the CAS-handling calls
 * @throws AnnotationException declared by the called annotation operations
 */
public static void deleteSpanAnnotation(AnnotatorState aBModel, DocumentService aDocumentService, CorrectionDocumentService aCorrectionDocumentService, AnnotationSchemaService aAnnotationService, int aStart, int aEnd, AnnotationFeature aFeature, String aValue) throws UIMAException, ClassNotFoundException, IOException, AnnotationException {
    AnnotationDocument annoDoc = aDocumentService.getAnnotationDocument(aBModel.getDocument(), aBModel.getUser());
    JCas annoCas = aDocumentService.readAnnotationCas(annoDoc);
    // get selected text, concatenations of tokens
    String selectedText = WebAnnoCasUtil.getSelectedText(annoCas, aStart, aEnd);
    // An empty needle matches at every index and the search would advance by zero
    // characters, i.e. loop forever on the first sentence. There is nothing to delete.
    if (selectedText == null || selectedText.isEmpty()) {
        return;
    }
    // Loop-invariant values, computed once instead of once per search step.
    String needle = selectedText.toLowerCase();
    int matchLength = selectedText.length();
    for (SourceDocument d : aDocumentService.listSourceDocuments(aBModel.getProject())) {
        loadDocument(d, aAnnotationService, aDocumentService, aCorrectionDocumentService, aBModel.getUser());
        JCas jCas = aCorrectionDocumentService.readCorrectionCas(d);
        AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(aFeature.getLayer());
        for (Sentence sentence : select(jCas, Sentence.class)) {
            String sentenceText = sentence.getCoveredText().toLowerCase();
            // Walk over all non-overlapping occurrences of the selected text in this sentence.
            for (int i = sentenceText.indexOf(needle); i != -1; i = sentenceText.indexOf(needle, i + matchLength)) {
                int begin = sentence.getBegin() + i;
                // Only act on hits that cover at least one token.
                if (selectCovered(jCas, Token.class, begin, begin + matchLength).size() > 0) {
                    // The end offset passed to the adapter is one less than the exclusive end.
                    adapter.delete(aBModel, jCas, aFeature, begin, begin + matchLength - 1, aValue);
                }
            }
        }
        aCorrectionDocumentService.writeCorrectionCas(jCas, d);
    }
}
Also used : SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) JCas(org.apache.uima.jcas.JCas) AnnotationDocument(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) AutomationTypeAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter)

Example 5 with AutomationTypeAdapter

use of de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter in project webanno by webanno.

the class AutomationUtil method addOtherFeatureTrainDocument.

/**
 * Generates, for each "other feature" of the template, the training file that is later used
 * to predict that feature on the training documents - so it can be added as an extra feature,
 * for example the POS tag as a feature for an NE classifier.
 * <p>
 * One file named {@code <featureId>.train} is written per other feature into the MIRA
 * directory of the template's train feature. A feature is skipped when its file already exists
 * and none of its training documents is unprocessed.
 *
 * @param aTemplate the template supplying the train feature and the other features
 * @param aAnnotationService used to obtain the type adapter of each feature's layer
 * @param aAutomationService used for the MIRA directory, training documents and status
 * @param aUserDao not used by this method
 * @throws IOException if the MIRA directory or a training file cannot be written
 * @throws UIMAException declared by the CAS-reading calls
 * @throws ClassNotFoundException declared by the CAS-reading calls
 */
public static void addOtherFeatureTrainDocument(MiraTemplate aTemplate, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws IOException, UIMAException, ClassNotFoundException {
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    for (AnnotationFeature feature : aTemplate.getOtherFeatures()) {
        File trainFile = new File(miraDir, feature.getId() + ".train");
        boolean documentChanged = false;
        for (TrainingDocument document : aAutomationService.listTrainingDocuments(feature.getProject())) {
            if (!document.isProcessed() && (document.getFeature() != null && document.getFeature().equals(feature))) {
                documentChanged = true;
                break;
            }
        }
        // Nothing new for this feature and the file is already there: keep it.
        if (!documentChanged && trainFile.exists()) {
            continue;
        }
        // try-with-resources: the training file is closed even when reading a CAS or
        // building a line throws.
        try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile))) {
            AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(feature.getLayer());
            for (TrainingDocument trainingDocument : aAutomationService.listTrainingDocuments(feature.getProject())) {
                if ((trainingDocument.getFeature() != null && trainingDocument.getFeature().equals(feature))) {
                    JCas jCas = aAutomationService.readTrainingAnnotationCas(trainingDocument);
                    for (Sentence sentence : select(jCas, Sentence.class)) {
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                    // NOTE(review): the document is flagged as NOT processed after being
                    // written; presumably a later step marks it processed - confirm.
                    trainingDocument.setProcessed(false);
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
        }
    }
}
Also used : FileWriter(java.io.FileWriter) JCas(org.apache.uima.jcas.JCas) File(java.io.File) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) AutomationStatus(de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) TrainingDocument(de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument) BufferedWriter(java.io.BufferedWriter) AutomationTypeAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter)

Aggregations

AutomationTypeAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter)5 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)5 JCas (org.apache.uima.jcas.JCas)5 AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument)3 AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)3 SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument)3 TrainingDocument (de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument)3 BufferedWriter (java.io.BufferedWriter)3 File (java.io.File)3 FileWriter (java.io.FileWriter)3 AutomationStatus (de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus)2 User (de.tudarmstadt.ukp.clarin.webanno.security.model.User)2 SpanAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.SpanAdapter)1 AnnotationException (de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException)1 FileReader (java.io.FileReader)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 StringTokenizer (java.util.StringTokenizer)1 NoResultException (javax.persistence.NoResultException)1 LineIterator (org.apache.commons.io.LineIterator)1