Examples with SourceDocument - de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument

Example 16 with SourceDocument

use of de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument in project webanno by webanno.

the class RemoteApiController2 method createCompatibleCas.

/**
 * Imports an uploaded annotation file as a CAS and verifies that it is compatible with the
 * given source document.
 * <p>
 * The upload is considered compatible if, ignoring a trailing line break, its text equals the
 * text of the document's initial CAS and if it contains the same number of sentences and tokens
 * as the initial CAS; the sentences and tokens are additionally passed to
 * {@code assertCompatibleOffsets}.
 *
 * @param aProjectId
 *            the ID of the project the document belongs to.
 * @param aDocumentId
 *            the ID of the source document the upload is checked against.
 * @param aFile
 *            the uploaded annotation file.
 * @param aFormat
 *            the format of the uploaded file; {@code FORMAT_DEFAULT} is used if absent.
 * @return the CAS imported from the upload, carrying the document text of the initial CAS.
 * @throws RemoteApiException
 *             declared for the project/document lookup and the compatibility checks
 *             (presumably the supertype of the unsupported-format and incompatible-document
 *             exceptions thrown below - verify against their declarations).
 * @throws ClassNotFoundException
 *             propagated from the calls made in this method.
 * @throws IOException
 *             if the temporary upload file cannot be created, written or deleted, or
 *             propagated from the import.
 * @throws UIMAException
 *             propagated from the calls made in this method.
 */
private JCas createCompatibleCas(long aProjectId, long aDocumentId, MultipartFile aFile, Optional<String> aFormat) throws RemoteApiException, ClassNotFoundException, IOException, UIMAException {
    Project project = getProject(aProjectId);
    SourceDocument document = getDocument(project, aDocumentId);
    // Check if the format is supported
    String format = aFormat.orElse(FORMAT_DEFAULT);
    Map<String, Class<CollectionReader>> readableFormats = importExportService.getReadableFormats();
    if (readableFormats.get(format) == null) {
        throw new UnsupportedFormatException("Format [%s] not supported. Acceptable formats are %s.", format, readableFormats.keySet());
    }
    // Convert the uploaded annotation document into a CAS
    File tmpFile = null;
    JCas annotationCas;
    try {
        tmpFile = File.createTempFile("upload", ".bin");
        aFile.transferTo(tmpFile);
        annotationCas = importExportService.importCasFromFile(tmpFile, project, format);
    } finally {
        // The temporary copy is only needed for the import; always remove it again
        if (tmpFile != null) {
            FileUtils.forceDelete(tmpFile);
        }
    }
    // Check if the uploaded file is compatible with the source document. They are compatible
    // if the text is the same and if all the token and sentence annotations have the same
    // offsets.
    JCas initialCas = documentService.createOrReadInitialCas(document);
    String initialText = initialCas.getDocumentText();
    String annotationText = annotationCas.getDocumentText();
    // If any of the texts contains trailing line breaks, we ignore that. We assume at the moment
    // that nobody will have created annotations over that trailing line breaks.
    initialText = StringUtils.chomp(initialText);
    annotationText = StringUtils.chomp(annotationText);
    if (ObjectUtils.notEqual(initialText, annotationText)) {
        int diffIndex = StringUtils.indexOfDifference(initialText, annotationText);
        // StringUtils.substring is null-safe and clamps the end index, so building the excerpt
        // cannot fail even if one of the two texts is null or shorter than the other
        String expected = StringUtils.substring(initialText, diffIndex, diffIndex + 20);
        String actual = StringUtils.substring(annotationText, diffIndex, diffIndex + 20);
        throw new IncompatibleDocumentException("Text of annotation document does not match text of source document at offset " + "[%d]. Expected [%s] but found [%s].", diffIndex, expected, actual);
    }
    // Just in case we really had to chomp off a trailing line break from the annotation CAS,
    // make sure we copy over the proper text from the initial CAS.
    // Do not try this at home: the sofa string is overwritten forcefully here.
    forceSetFeatureValue(annotationCas.getSofa(), CAS.FEATURE_BASE_NAME_SOFASTRING, initialCas.getDocumentText());
    FSUtil.setFeature(annotationCas.getDocumentAnnotationFs(), CAS.FEATURE_BASE_NAME_END, initialCas.getDocumentText().length());
    Collection<Sentence> annotationSentences = select(annotationCas, Sentence.class);
    Collection<Sentence> initialSentences = select(initialCas, Sentence.class);
    if (annotationSentences.size() != initialSentences.size()) {
        throw new IncompatibleDocumentException("Expected [%d] sentences, but annotation document contains [%d] sentences.", initialSentences.size(), annotationSentences.size());
    }
    assertCompatibleOffsets(initialSentences, annotationSentences);
    Collection<Token> annotationTokens = select(annotationCas, Token.class);
    Collection<Token> initialTokens = select(initialCas, Token.class);
    if (annotationTokens.size() != initialTokens.size()) {
        // Report the token counts here (not the sentence counts of the previous check)
        throw new IncompatibleDocumentException("Expected [%d] tokens, but annotation document contains [%d] tokens.", initialTokens.size(), annotationTokens.size());
    }
    assertCompatibleOffsets(initialTokens, annotationTokens);
    return annotationCas;
}

Also used : SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) IncompatibleDocumentException(de.tudarmstadt.ukp.clarin.webanno.webapp.remoteapi.v2.exception.IncompatibleDocumentException) RProject(de.tudarmstadt.ukp.clarin.webanno.webapp.remoteapi.v2.model.RProject) Project(de.tudarmstadt.ukp.clarin.webanno.model.Project) UnsupportedFormatException(de.tudarmstadt.ukp.clarin.webanno.webapp.remoteapi.v2.exception.UnsupportedFormatException) File(java.io.File) MultipartFile(org.springframework.web.multipart.MultipartFile) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 17 with SourceDocument

use of de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument in project webanno by webanno.

the class AutomationUtil method generateFinalClassifier.

/**
 * Trains and saves the final MIRA model for the train feature of the given template.
 * <p>
 * Training only takes place if at least one of the change checks below fires (an unprocessed
 * training document for the train feature or one of the other features, a source document in
 * state {@code CURATION_FINISHED}, or an unprocessed tab-sep document for the train feature).
 * Otherwise the result already stored in the template is returned.
 *
 * @param aTemplate
 *            the template.
 * @param aRepository
 *            the repository.
 * @param aCurationDocumentService
 *            passed on to {@code generateTrainDocument}.
 * @param aAnnotationService
 *            passed on to the feature/train-document generation helpers.
 * @param aAutomationService
 *            provides the MIRA directory, the model file and the training documents.
 * @param aUserDao
 *            passed on to the feature/train-document generation helpers.
 * @return the result of the last training iteration, or the template's stored result if no
 *         training was necessary.
 * @throws UIMAException
 *             propagated from the helpers called in this method.
 * @throws ClassNotFoundException
 *             propagated from the helpers called in this method.
 * @throws IOException
 *             propagated from the helpers called in this method.
 * @throws AnnotationException
 *             propagated from the helpers called in this method.
 * @throws AutomationException
 *             if an error occurs.
 */
public static String generateFinalClassifier(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws UIMAException, ClassNotFoundException, IOException, AnnotationException, AutomationException {
    // Fixed training settings
    final int frequency = 2;
    final double sigma = 1;
    final int iterations = 10;
    final int beamSize = 0;
    final boolean maxPosteriors = false;
    final boolean randomInit = false;

    AnnotationFeature trainFeature = aTemplate.getTrainFeature();
    List<List<String>> otherPredictions = new ArrayList<>();
    File miraDir = aAutomationService.getMiraDir(trainFeature);
    Mira mira = new Mira();
    File featureFile = new File(miraDir, trainFeature.getLayer().getId() + "-" + trainFeature.getId() + ".train.ft");
    File predictedFile = new File(featureFile.getAbsolutePath() + "-pred");

    boolean retrainingRequired = false;
    // A. an unprocessed training document exists for one of the other train features
    for (AnnotationFeature otherFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument candidate : aAutomationService.listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (candidate.isProcessed() || candidate.getFeature() == null) {
                continue;
            }
            if (candidate.getFeature().equals(otherFeature)) {
                retrainingRequired = true;
                break;
            }
        }
    }
    // B. an unprocessed training document exists for the main train feature
    for (TrainingDocument candidate : aAutomationService.listTrainingDocuments(trainFeature.getProject())) {
        if (candidate.isProcessed() || candidate.getFeature() == null) {
            continue;
        }
        if (candidate.getFeature().equals(trainFeature)) {
            retrainingRequired = true;
            break;
        }
    }
    // C. a source document has finished curation
    for (SourceDocument sourceDocument : aRepository.listSourceDocuments(trainFeature.getProject())) {
        if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
            retrainingRequired = true;
            break;
        }
    }
    // D. an unprocessed tab-sep training document exists for the main train feature
    for (TrainingDocument candidate : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (candidate.isProcessed() || candidate.getFeature() == null) {
            continue;
        }
        if (candidate.getFeature().equals(trainFeature)) {
            retrainingRequired = true;
            break;
        }
    }
    if (!retrainingRequired) {
        return aTemplate.getResult();
    }

    // Without predictions from other layers the base file is the train file as-is; otherwise
    // the predictions are merged into it and the template is sized accordingly.
    File baseTrainFile = new File(miraDir, trainFeature.getLayer().getId() + "-" + trainFeature.getId() + ".train.base");
    File trainFile = new File(miraDir, trainFeature.getLayer().getId() + "-" + trainFeature.getId() + ".train");
    String trainPath = trainFile.getAbsolutePath();
    String modelPath = aAutomationService.getMiraModel(trainFeature, false, null).getAbsolutePath();

    getFeatureOtherLayer(aTemplate, aRepository, aAutomationService, aAnnotationService, aUserDao, beamSize, maxPosteriors, otherPredictions, mira, featureFile, predictedFile, null);
    getFeaturesTabSep(aTemplate, aAutomationService, beamSize, maxPosteriors, trainFeature, otherPredictions, mira, featureFile, predictedFile);
    generateTrainDocument(aTemplate, aRepository, aCurationDocumentService, aAnnotationService, aAutomationService, aUserDao, false);

    String trainTemplate = createTemplate(aTemplate.getTrainFeature(), getMiraTemplateFile(trainFeature, aAutomationService), otherPredictions.size());
    if (otherPredictions.isEmpty()) {
        FileUtils.copyFile(baseTrainFile, trainFile);
    } else {
        buildTrainFile(baseTrainFile, trainFile, otherPredictions);
    }

    // Configure and train the final classifier using all generated features
    if (!trainFeature.getLayer().isLockToTokenOffset()) {
        mira.setIobScorer();
    }
    mira.loadTemplates(trainTemplate);
    mira.setClip(sigma);
    mira.maxPosteriors = maxPosteriors;
    mira.beamSize = beamSize;
    int exampleCount = mira.count(trainPath, frequency);
    mira.initModel(randomInit);
    String lastResult = "";
    for (int round = 0; round < iterations; round++) {
        lastResult = mira.train(trainPath, iterations, exampleCount, round);
        mira.averageWeights(iterations * exampleCount);
    }
    mira.saveModel(modelPath);

    // All training documents are processed by now
    aAutomationService.listTrainingDocuments(trainFeature.getProject()).forEach(processed -> processed.setProcessed(true));
    aAutomationService.listTabSepDocuments(trainFeature.getProject()).forEach(processed -> processed.setProcessed(true));
    return lastResult;
}

Also used : ArrayList(java.util.ArrayList) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) List(java.util.List) ArrayList(java.util.ArrayList) File(java.io.File) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) Mira(edu.lium.mira.Mira) TrainingDocument(de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument)

Example 18 with SourceDocument

use of de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument in project webanno by webanno.

the class AutomationUtil method addOtherFeatureToPredictDocument.

/**
 * For every source document of the train feature's project, collects the predictions of the
 * other layers and the tab-sep features and writes the document's {@code .pred} file from its
 * {@code .pred.ft} file.
 * <p>
 * If no predictions were collected, the {@code .pred.ft} file is copied unchanged; otherwise the
 * predictions are merged in via {@code buildPredictFile}.
 *
 * @param aTemplate
 *            the template.
 * @param aRepository
 *            the repository.
 * @param aAnnotationService
 *            passed on to {@code getFeatureOtherLayer}.
 * @param aAutomationService
 *            provides the MIRA directory and is passed on to the helpers.
 * @param aUserDao
 *            passed on to {@code getFeatureOtherLayer}.
 * @throws UIMAException
 *             propagated from the helpers called in this method.
 * @throws ClassNotFoundException
 *             propagated from the helpers called in this method.
 * @throws IOException
 *             propagated from the helpers called in this method.
 * @throws AnnotationException
 *             propagated from the helpers called in this method.
 * @throws AutomationException
 *             propagated from the helpers called in this method.
 */
public static void addOtherFeatureToPredictDocument(MiraTemplate aTemplate, DocumentService aRepository, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws UIMAException, ClassNotFoundException, IOException, AnnotationException, AutomationException {
    final int beamSize = 0;
    final boolean maxPosteriors = false;
    AnnotationFeature trainFeature = aTemplate.getTrainFeature();
    File miraDir = aAutomationService.getMiraDir(trainFeature);
    for (SourceDocument sourceDocument : aRepository.listSourceDocuments(trainFeature.getProject())) {
        // Fresh prediction list and classifier per document
        List<List<String>> otherPredictions = new ArrayList<>();
        File featureFile = new File(miraDir, sourceDocument.getId() + ".pred.ft");
        Mira mira = new Mira();
        File predictedFile = new File(featureFile.getAbsolutePath() + "-pred");
        getFeatureOtherLayer(aTemplate, aRepository, aAutomationService, aAnnotationService, aUserDao, beamSize, maxPosteriors, otherPredictions, mira, featureFile, predictedFile, sourceDocument);
        getFeaturesTabSep(aTemplate, aAutomationService, beamSize, maxPosteriors, trainFeature, otherPredictions, mira, featureFile, predictedFile);
        File targetFile = new File(miraDir, sourceDocument.getId() + ".pred");
        // The template is sized by the number of collected predictions (zero if there are none)
        createTemplate(aTemplate.getTrainFeature(), getMiraTemplateFile(trainFeature, aAutomationService), otherPredictions.size());
        if (otherPredictions.isEmpty()) {
            FileUtils.copyFile(featureFile, targetFile);
        } else {
            buildPredictFile(featureFile, targetFile, otherPredictions, aTemplate.getTrainFeature());
        }
    }
}

Also used : SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) File(java.io.File) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) Mira(edu.lium.mira.Mira)

Example 19 with SourceDocument

use of de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument in project webanno by webanno.

the class AutomationUtil method generatePredictDocument.

// TODO: rename to predictDocument
/**
 * Writes one {@code <document id>.pred.ft} file into the MIRA directory for every source
 * document of the train feature's project, containing one MIRA line per sentence.
 * <p>
 * The sentences are read from the document's correction CAS; if that cannot be read, the
 * annotation CAS of the current user is used instead.
 *
 * @param aTemplate
 *            the template whose train feature determines the project, layer and MIRA directory.
 * @param aRepository
 *            the repository used to list the source documents and to read the fallback
 *            annotation CAS.
 * @param aCorrectionDocumentService
 *            the service used to read the correction CAS.
 * @param aAnnotationService
 *            the service providing the type adapter for the train feature's layer.
 * @param aAutomationService
 *            the service providing the MIRA directory.
 * @param aUserDao
 *            used to determine the current user.
 * @throws IOException
 *             if the MIRA directory or a prediction file cannot be written, or propagated from
 *             reading the annotation CAS.
 * @throws UIMAException
 *             propagated from the calls made in this method.
 * @throws ClassNotFoundException
 *             propagated from the calls made in this method.
 */
public static void generatePredictDocument(MiraTemplate aTemplate, DocumentService aRepository, CorrectionDocumentService aCorrectionDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws IOException, UIMAException, ClassNotFoundException {
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    User user = aUserDao.getCurrentUser();
    AnnotationFeature feature = aTemplate.getTrainFeature();
    AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(feature.getLayer());
    for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) {
        File predFile = new File(miraDir, document.getId() + ".pred.ft");
        // try-with-resources ensures the writer is closed even if reading the CAS or writing a
        // sentence fails
        try (BufferedWriter predOut = new BufferedWriter(new FileWriter(predFile))) {
            JCas jCas;
            try {
                jCas = aCorrectionDocumentService.readCorrectionCas(document);
            } catch (Exception e) {
                // Deliberate fallback: no readable correction CAS, use the current user's
                // annotation CAS instead
                AnnotationDocument annoDoc = aRepository.createOrGetAnnotationDocument(document, user);
                jCas = aRepository.readAnnotationCas(annoDoc);
            }
            for (Sentence sentence : select(jCas, Sentence.class)) {
                predOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
            }
        }
    }
}

Also used : User(de.tudarmstadt.ukp.clarin.webanno.security.model.User) FileWriter(java.io.FileWriter) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) JCas(org.apache.uima.jcas.JCas) AnnotationDocument(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument) File(java.io.File) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NoResultException(javax.persistence.NoResultException) AnnotationException(de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException) CASException(org.apache.uima.cas.CASException) UIMAException(org.apache.uima.UIMAException) DataRetrievalFailureException(org.springframework.dao.DataRetrievalFailureException) IOException(java.io.IOException) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) AutomationTypeAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter) BufferedWriter(java.io.BufferedWriter)

Example 20 with SourceDocument

use of de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument in project webanno by webanno.

the class AutomationUtil method generateTrainDocument.

/**
 * Writes the MIRA training file for the train feature of the given template.
 * <p>
 * The file is assembled from three sources: training documents of the train feature that are
 * not in tab-sep format, source documents in state {@code CURATION_FINISHED}, and tab-sep
 * training documents of the train feature. Nothing is written if none of the change checks at
 * the beginning of the method fires.
 *
 * @param aTemplate
 *            the template whose train feature is trained.
 * @param aRepository
 *            the repository used to list the source documents.
 * @param aCurationDocumentService
 *            the service used to read the curation CAS of curated documents.
 * @param aAnnotationService
 *            the service providing the type adapter for the train feature's layer.
 * @param aAutomationService
 *            the service providing the MIRA directory, the automation status and the training
 *            documents.
 * @param aUserDao
 *            not used by this method.
 * @param aBase
 *            if {@code true}, the lines are generated without the train feature and written to
 *            the {@code .train.ft} file; if {@code false}, they are generated with the train
 *            feature and written to the {@code .train.base} file.
 * @throws IOException
 *             if the MIRA directory or the training file cannot be written or a tab-sep
 *             document cannot be read.
 * @throws UIMAException
 *             propagated from the calls made in this method.
 * @throws ClassNotFoundException
 *             propagated from the calls made in this method.
 * @throws AutomationException
 *             if a non-empty line of a tab-sep document does not consist of exactly two
 *             tab-separated columns.
 */
public static void generateTrainDocument(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, boolean aBase) throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    LOG.info("Starting to generate training document");
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    AnnotationFeature feature = aTemplate.getTrainFeature();
    boolean documentChanged = false;
    // A. training document for other train layers were changed
    for (AnnotationFeature otherFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument document : aAutomationService.listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(otherFeature)) {
                documentChanged = true;
                break;
            }
        }
    }
    // B. Training document for the main training layer were changed
    for (TrainingDocument document : aAutomationService.listTrainingDocuments(feature.getProject())) {
        if (!document.isProcessed() && (document.getFeature() != null && document.getFeature().equals(feature))) {
            documentChanged = true;
            break;
        }
    }
    // C. New Curation document arrives
    // NOTE(review): this fires for any source document, not only for curated ones - confirm
    // that this is intended.
    if (aRepository.listSourceDocuments(feature.getProject()).size() > 0) {
        documentChanged = true;
    }
    // D. tab-sep training documents
    for (TrainingDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(feature)) {
            documentChanged = true;
            break;
        }
    }
    if (!documentChanged) {
        return;
    }
    File trainFile;
    if (aBase) {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.ft");
    } else {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.base");
    }
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    // Base lines are generated without the train feature, all others with it
    AnnotationFeature lineFeature = aBase ? null : feature;
    // try-with-resources ensures the training file is closed on every exit path, including
    // exceptions thrown while reading any of the input documents
    try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile))) {
        AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(feature.getLayer());
        // Training documents (Curated or webanno-compatible imported ones - read using UIMA)
        List<TrainingDocument> trainingDocuments = aAutomationService.listTrainingDocuments(feature.getProject());
        for (TrainingDocument trainingDocument : trainingDocuments) {
            if ((trainingDocument.getFeature() != null && trainingDocument.getFeature().equals(feature)) && !trainingDocument.getFormat().equals(WebAnnoConst.TAB_SEP)) {
                JCas jCas = aAutomationService.readTrainingAnnotationCas(trainingDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    trainOut.append(getMiraLine(sentence, lineFeature, adapter).toString()).append("\n");
                }
                trainingDocument.setProcessed(!aBase);
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
        }
        // for curated documents
        List<SourceDocument> sourceDocuments = aRepository.listSourceDocuments(feature.getProject());
        int sourceDocsCounter = 0;
        for (SourceDocument sourceDocument : sourceDocuments) {
            if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
                JCas jCas = aCurationDocumentService.readCurationCas(sourceDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    trainOut.append(getMiraLine(sentence, lineFeature, adapter).toString()).append("\n");
                }
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
            sourceDocsCounter++;
            // Progress is reported against the list that is actually being iterated
            LOG.info("Processed source document " + sourceDocsCounter + " of " + sourceDocuments.size());
        }
        // Tab-sep documents to be used as a target layer train document
        int goldStandardDocsCounter = 0;
        List<TrainingDocument> goldStandardDocs = aAutomationService.listTabSepDocuments(feature.getProject());
        for (TrainingDocument document : goldStandardDocs) {
            if (document.getFormat().equals(WebAnnoConst.TAB_SEP) && document.getFeature() != null && document.getFeature().equals(feature)) {
                File tabSepFile = new File(aAutomationService.getDocumentFolder(document), document.getName());
                // Close the reader once the document has been consumed (or reading failed)
                try (FileReader tabSepReader = new FileReader(tabSepFile)) {
                    LineIterator it = IOUtils.lineIterator(tabSepReader);
                    while (it.hasNext()) {
                        String line = it.next();
                        if (line.trim().isEmpty()) {
                            trainOut.append("\n");
                        } else {
                            StringTokenizer st = new StringTokenizer(line, "\t");
                            if (st.countTokens() != 2) {
                                throw new AutomationException("This is not a valid TAB-SEP document");
                            }
                            // First column is always used; the second column only for
                            // non-base training files
                            String firstColumn = st.nextToken();
                            String secondColumn = aBase ? "" : st.nextToken();
                            trainOut.append(getMiraLineForTabSep(firstColumn, secondColumn));
                        }
                    }
                }
            }
            goldStandardDocsCounter++;
            LOG.info("Processed gold standard document " + goldStandardDocsCounter + " of " + goldStandardDocs.size());
        }
    }
    LOG.info("Completed generating training document");
}

Also used : FileWriter(java.io.FileWriter) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) JCas(org.apache.uima.jcas.JCas) LineIterator(org.apache.commons.io.LineIterator) AutomationStatus(de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus) BufferedWriter(java.io.BufferedWriter) AutomationTypeAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter) StringTokenizer(java.util.StringTokenizer) FileReader(java.io.FileReader) File(java.io.File) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) TrainingDocument(de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument)

Aggregations

SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument)59 JCas (org.apache.uima.jcas.JCas)24 AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument)22 Project (de.tudarmstadt.ukp.clarin.webanno.model.Project)22 User (de.tudarmstadt.ukp.clarin.webanno.security.model.User)19 RequestMapping (org.springframework.web.bind.annotation.RequestMapping)14 File (java.io.File)13 RProject (de.tudarmstadt.ukp.clarin.webanno.webapp.remoteapi.v2.model.RProject)11 ArrayList (java.util.ArrayList)10 HashMap (java.util.HashMap)10 ApiOperation (io.swagger.annotations.ApiOperation)9 IOException (java.io.IOException)9 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)8 Map (java.util.Map)8 AnnotatorState (de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState)7 LinkedHashMap (java.util.LinkedHashMap)7 List (java.util.List)7 DiffResult (de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.DiffResult)6 AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)6 NoResultException (javax.persistence.NoResultException)6