Search in sources :

Example 1 with TrainingDocument

use of de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument in project webanno by webanno.

the class ExportUtil method exportProjectSettings.

/**
 * Converts a project and the settings attached to it into the export data model.
 * <p>
 * The returned object carries the basic project properties, the annotation layers, the tag sets
 * with their tags, metadata of source and annotation documents and the project permissions.
 * Training documents and MIRA templates are only collected when an automation service is
 * present; otherwise empty lists are stored for both.
 *
 * @param annotationService
 *            provides the layers, tag sets and tags of the project.
 * @param automationService
 *            optional provider of training documents and MIRA templates.
 * @param documentService
 *            provides the source documents and their annotation documents.
 * @param projectService
 *            provides the project users and their permission levels.
 * @param aProject
 *            the project to export.
 * @param aProjectSettings
 *            not read by this method.
 * @param aExportTempDir
 *            not read by this method.
 * @return the populated export model of the project.
 */
public static de.tudarmstadt.ukp.clarin.webanno.export.model.Project exportProjectSettings(AnnotationSchemaService annotationService, Optional<AutomationService> automationService, DocumentService documentService, ProjectService projectService, Project aProject, File aProjectSettings, File aExportTempDir) {
    de.tudarmstadt.ukp.clarin.webanno.export.model.Project exported = new de.tudarmstadt.ukp.clarin.webanno.export.model.Project();

    // --- basic project properties ---
    exported.setDescription(aProject.getDescription());
    exported.setName(aProject.getName());
    // In older versions of WebAnno, the mode was an enum which was serialized as upper-case
    // during export but as lower-case in the database. This is compensating for this case.
    exported.setMode(StringUtils.upperCase(aProject.getMode(), Locale.US));
    exported.setScriptDirection(aProject.getScriptDirection());
    exported.setVersion(aProject.getVersion());
    exported.setDisableExport(aProject.isDisableExport());
    exported.setCreated(aProject.getCreated());
    exported.setUpdated(aProject.getUpdated());

    // --- layers ---
    // Both lookup tables are handed to ImportUtil.exportLayerDetails for every layer; they are
    // consulted afterwards to wire attach types / attach features and, further down, to
    // translate features referenced by training documents and MIRA templates.
    Map<AnnotationLayer, de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationLayer> exLayerByLayer = new HashMap<>();
    Map<AnnotationFeature, de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationFeature> exFeatureByFeature = new HashMap<>();
    List<de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationLayer> exportedLayers = new ArrayList<>();
    for (AnnotationLayer layer : annotationService.listAnnotationLayer(aProject)) {
        exportedLayers.add(ImportUtil.exportLayerDetails(exLayerByLayer, exFeatureByFeature, layer, annotationService));
    }
    // Second pass: attach type and attach feature can only be resolved once all layers are known
    for (Map.Entry<AnnotationLayer, de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationLayer> entry : exLayerByLayer.entrySet()) {
        AnnotationLayer layer = entry.getKey();
        if (layer.getAttachType() != null) {
            entry.getValue().setAttachType(exLayerByLayer.get(layer.getAttachType()));
        }
        if (layer.getAttachFeature() != null) {
            entry.getValue().setAttachFeature(exFeatureByFeature.get(layer.getAttachFeature()));
        }
    }
    exported.setLayers(exportedLayers);

    // --- tag sets and their tags ---
    List<ExportedTagSet> exportedTagSets = new ArrayList<>();
    for (TagSet tagSet : annotationService.listTagSets(aProject)) {
        ExportedTagSet exportedTagSet = new ExportedTagSet();
        exportedTagSet.setCreateTag(tagSet.isCreateTag());
        exportedTagSet.setDescription(tagSet.getDescription());
        exportedTagSet.setLanguage(tagSet.getLanguage());
        exportedTagSet.setName(tagSet.getName());
        List<ExportedTag> exportedTags = new ArrayList<>();
        for (Tag tag : annotationService.listTags(tagSet)) {
            ExportedTag exportedTag = new ExportedTag();
            exportedTag.setDescription(tag.getDescription());
            exportedTag.setName(tag.getName());
            exportedTags.add(exportedTag);
        }
        exportedTagSet.setTags(exportedTags);
        exportedTagSets.add(exportedTagSet);
    }
    exported.setTagSets(exportedTagSets);

    // --- source documents and the annotation documents hanging off them ---
    List<SourceDocument> exportedSourceDocs = new ArrayList<>();
    List<AnnotationDocument> exportedAnnotationDocs = new ArrayList<>();
    for (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument sourceDoc : documentService.listSourceDocuments(aProject)) {
        SourceDocument exportedSourceDoc = new SourceDocument();
        exportedSourceDoc.setFormat(sourceDoc.getFormat());
        exportedSourceDoc.setName(sourceDoc.getName());
        exportedSourceDoc.setState(sourceDoc.getState());
        exportedSourceDoc.setTimestamp(sourceDoc.getTimestamp());
        exportedSourceDoc.setSentenceAccessed(sourceDoc.getSentenceAccessed());
        exportedSourceDoc.setCreated(sourceDoc.getCreated());
        exportedSourceDoc.setUpdated(sourceDoc.getUpdated());
        for (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument annotationDoc : documentService.listAnnotationDocuments(sourceDoc)) {
            AnnotationDocument exportedAnnotationDoc = new AnnotationDocument();
            exportedAnnotationDoc.setName(annotationDoc.getName());
            exportedAnnotationDoc.setState(annotationDoc.getState());
            exportedAnnotationDoc.setUser(annotationDoc.getUser());
            exportedAnnotationDoc.setTimestamp(annotationDoc.getTimestamp());
            exportedAnnotationDoc.setSentenceAccessed(annotationDoc.getSentenceAccessed());
            exportedAnnotationDoc.setCreated(annotationDoc.getCreated());
            exportedAnnotationDoc.setUpdated(annotationDoc.getUpdated());
            exportedAnnotationDocs.add(exportedAnnotationDoc);
        }
        exportedSourceDocs.add(exportedSourceDoc);
    }
    exported.setSourceDocuments(exportedSourceDocs);
    exported.setAnnotationDocuments(exportedAnnotationDocs);

    // --- training documents (automation only) ---
    if (!automationService.isPresent()) {
        exported.setTrainingDocuments(new ArrayList<>());
    } else {
        List<de.tudarmstadt.ukp.clarin.webanno.export.model.TrainingDocument> exportedTrainingDocs = new ArrayList<>();
        List<TrainingDocument> trainingDocs = automationService.get().listTrainingDocuments(aProject);
        // Exported features indexed by name, used to translate the feature of a training
        // document. NOTE(review): the key is the bare feature name, so exported features that
        // share a name overwrite each other in this lookup - confirm names are unique.
        Map<String, de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationFeature> exFeatureByName = new HashMap<>();
        for (de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationFeature exFeature : exFeatureByFeature.values()) {
            exFeatureByName.put(exFeature.getName(), exFeature);
        }
        for (TrainingDocument trainingDoc : trainingDocs) {
            de.tudarmstadt.ukp.clarin.webanno.export.model.TrainingDocument exportedTrainingDoc = new de.tudarmstadt.ukp.clarin.webanno.export.model.TrainingDocument();
            exportedTrainingDoc.setFormat(trainingDoc.getFormat());
            exportedTrainingDoc.setName(trainingDoc.getName());
            exportedTrainingDoc.setState(trainingDoc.getState());
            exportedTrainingDoc.setTimestamp(trainingDoc.getTimestamp());
            exportedTrainingDoc.setSentenceAccessed(trainingDoc.getSentenceAccessed());
            if (trainingDoc.getFeature() != null) {
                exportedTrainingDoc.setFeature(exFeatureByName.get(trainingDoc.getFeature().getName()));
            }
            exportedTrainingDocs.add(exportedTrainingDoc);
        }
        exported.setTrainingDocuments(exportedTrainingDocs);
    }

    // --- project permissions: one entry per (user, permission level) pair ---
    List<ProjectPermission> exportedPermissions = new ArrayList<>();
    for (User user : projectService.listProjectUsersWithPermissions(aProject)) {
        for (de.tudarmstadt.ukp.clarin.webanno.model.ProjectPermission permission : projectService.listProjectPermissionLevel(user, aProject)) {
            ProjectPermission exportedPermission = new ProjectPermission();
            exportedPermission.setLevel(permission.getLevel());
            exportedPermission.setUser(user.getUsername());
            exportedPermissions.add(exportedPermission);
        }
    }
    exported.setProjectPermissions(exportedPermissions);

    // --- MIRA templates (automation only) ---
    if (!automationService.isPresent()) {
        exported.setMiraTemplates(new ArrayList<>());
    } else {
        List<de.tudarmstadt.ukp.clarin.webanno.export.model.MiraTemplate> exportedTemplates = new ArrayList<>();
        for (MiraTemplate template : automationService.get().listMiraTemplates(aProject)) {
            de.tudarmstadt.ukp.clarin.webanno.export.model.MiraTemplate exportedTemplate = new de.tudarmstadt.ukp.clarin.webanno.export.model.MiraTemplate();
            exportedTemplate.setAnnotateAndPredict(template.isAnnotateAndRepeat());
            exportedTemplate.setAutomationStarted(template.isAutomationStarted());
            exportedTemplate.setCurrentLayer(template.isCurrentLayer());
            exportedTemplate.setResult(template.getResult());
            exportedTemplate.setTrainFeature(exFeatureByFeature.get(template.getTrainFeature()));
            // The other-features set is only populated on the export side when there is
            // something to put into it.
            if (!template.getOtherFeatures().isEmpty()) {
                Set<de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationFeature> exportedOtherFeatures = new HashSet<>();
                for (AnnotationFeature otherFeature : template.getOtherFeatures()) {
                    exportedOtherFeatures.add(exFeatureByFeature.get(otherFeature));
                }
                exportedTemplate.setOtherFeatures(exportedOtherFeatures);
            }
            exportedTemplates.add(exportedTemplate);
        }
        exported.setMiraTemplates(exportedTemplates);
    }
    return exported;
}
Also used : Mode(de.tudarmstadt.ukp.clarin.webanno.model.Mode) User(de.tudarmstadt.ukp.clarin.webanno.security.model.User) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) AnnotationDocument(de.tudarmstadt.ukp.clarin.webanno.export.model.AnnotationDocument) AnnotationLayer(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationLayer) ExportedTagSet(de.tudarmstadt.ukp.clarin.webanno.export.model.ExportedTagSet) TagSet(de.tudarmstadt.ukp.clarin.webanno.model.TagSet) ExportedTagSet(de.tudarmstadt.ukp.clarin.webanno.export.model.ExportedTagSet) ProjectPermission(de.tudarmstadt.ukp.clarin.webanno.export.model.ProjectPermission) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) HashSet(java.util.HashSet) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.export.model.SourceDocument) Project(de.tudarmstadt.ukp.clarin.webanno.model.Project) MiraTemplate(de.tudarmstadt.ukp.clarin.webanno.automation.model.MiraTemplate) ExportedTag(de.tudarmstadt.ukp.clarin.webanno.export.model.ExportedTag) Tag(de.tudarmstadt.ukp.clarin.webanno.model.Tag) ExportedTag(de.tudarmstadt.ukp.clarin.webanno.export.model.ExportedTag) TrainingDocument(de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument)

Example 2 with TrainingDocument

use of de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument in project webanno by webanno.

the class AutomationUtil method generateFinalClassifier.

/**
 * Trains the final MIRA classifier for the train feature of the given template and stores the
 * resulting model.
 * <p>
 * Training is skipped and the result already stored in the template is returned unless at least
 * one of the following holds: an unprocessed training document exists for one of the template's
 * other features or for the train feature itself, a source document is in state
 * {@code CURATION_FINISHED}, or an unprocessed tab-separated training document exists for the
 * train feature. After training, all training documents and tab-separated documents of the
 * project are flagged as processed.
 *
 * @param aTemplate
 *            the template providing the train feature and the other features.
 * @param aRepository
 *            the document service used to list the source documents of the project.
 * @param aCurationDocumentService
 *            passed on to the generation of the training document.
 * @param aAnnotationService
 *            passed on to feature extraction and to the generation of the training document.
 * @param aAutomationService
 *            provides the MIRA directory, model file and the training documents.
 * @param aUserDao
 *            passed on to feature extraction and to the generation of the training document.
 * @return the result reported by the last training iteration, or the result stored in the
 *         template when nothing changed.
 * @throws UIMAException
 *             declared for the helpers called here; the raising call is not visible in this
 *             method.
 * @throws ClassNotFoundException
 *             declared for the helpers called here; the raising call is not visible in this
 *             method.
 * @throws IOException
 *             if reading or writing one of the training files fails.
 * @throws AnnotationException
 *             declared for the helpers called here; the raising call is not visible in this
 *             method.
 * @throws AutomationException
 *             if an error occurs.
 */
public static String generateFinalClassifier(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws UIMAException, ClassNotFoundException, IOException, AnnotationException, AutomationException {
    // Fixed settings handed to MIRA below
    int frequency = 2;
    double sigma = 1;
    int iterations = 10;
    int beamSize = 0;
    boolean maxPosteriors = false;
    boolean randomInit = false;

    AnnotationFeature trainFeature = aTemplate.getTrainFeature();
    List<List<String>> predictions = new ArrayList<>();
    File miraDir = aAutomationService.getMiraDir(trainFeature);
    Mira mira = new Mira();
    // All working files of a feature share the "<layer id>-<feature id>" prefix
    String filePrefix = trainFeature.getLayer().getId() + "-" + trainFeature.getId();
    File predFile = new File(miraDir, filePrefix + ".train.ft");
    File predictedFile = new File(predFile.getAbsolutePath() + "-pred");

    boolean retrainNeeded = false;
    // A. training document for other train layers were changed
    for (AnnotationFeature otherFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument candidate : aAutomationService.listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (candidate.isProcessed() || candidate.getFeature() == null) {
                continue;
            }
            if (candidate.getFeature().equals(otherFeature)) {
                retrainNeeded = true;
                break;
            }
        }
    }
    // B. Training document for the main training layer were changed
    for (TrainingDocument candidate : aAutomationService.listTrainingDocuments(trainFeature.getProject())) {
        if (candidate.isProcessed() || candidate.getFeature() == null) {
            continue;
        }
        if (candidate.getFeature().equals(trainFeature)) {
            retrainNeeded = true;
            break;
        }
    }
    // C. New Curation document arrives
    for (SourceDocument sourceDocument : aRepository.listSourceDocuments(trainFeature.getProject())) {
        if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
            retrainNeeded = true;
            break;
        }
    }
    // D. tab-sep training documents
    for (TrainingDocument candidate : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (candidate.isProcessed() || candidate.getFeature() == null) {
            continue;
        }
        if (candidate.getFeature().equals(trainFeature)) {
            retrainNeeded = true;
            break;
        }
    }
    if (!retrainNeeded) {
        // Nothing changed: hand back the result stored in the template
        return aTemplate.getResult();
    }

    // If no other layer is used, the base file is used as the main train document; otherwise
    // all predictions are added and the template is modified accordingly.
    File baseTrainFile = new File(miraDir, filePrefix + ".train.base");
    File trainFile = new File(miraDir, filePrefix + ".train");
    String trainName = trainFile.getAbsolutePath();
    String finalClassifierModelName = aAutomationService.getMiraModel(trainFeature, false, null).getAbsolutePath();

    // Collect predictions from the other layers and the tab-sep documents, then (re)generate
    // the training document.
    getFeatureOtherLayer(aTemplate, aRepository, aAutomationService, aAnnotationService, aUserDao, beamSize, maxPosteriors, predictions, mira, predFile, predictedFile, null);
    getFeaturesTabSep(aTemplate, aAutomationService, beamSize, maxPosteriors, trainFeature, predictions, mira, predFile, predictedFile);
    generateTrainDocument(aTemplate, aRepository, aCurationDocumentService, aAnnotationService, aAutomationService, aUserDao, false);

    // The template is created with one slot per prediction list (none when there are no
    // predictions).
    String trainTemplate = createTemplate(aTemplate.getTrainFeature(), getMiraTemplateFile(trainFeature, aAutomationService), predictions.size());
    if (predictions.isEmpty()) {
        FileUtils.copyFile(baseTrainFile, trainFile);
    } else {
        buildTrainFile(baseTrainFile, trainFile, predictions);
    }

    if (!trainFeature.getLayer().isLockToTokenOffset()) {
        mira.setIobScorer();
    }
    mira.loadTemplates(trainTemplate);
    mira.setClip(sigma);
    mira.maxPosteriors = maxPosteriors;
    mira.beamSize = beamSize;
    int numExamples = mira.count(trainName, frequency);
    mira.initModel(randomInit);

    // Generate the final classifier, using all features generated
    String trainResult = "";
    for (int round = 0; round < iterations; round++) {
        trainResult = mira.train(trainName, iterations, numExamples, round);
        mira.averageWeights(iterations * numExamples);
    }
    mira.saveModel(finalClassifierModelName);

    // All training documents are processed by now
    for (TrainingDocument processed : aAutomationService.listTrainingDocuments(trainFeature.getProject())) {
        processed.setProcessed(true);
    }
    for (TrainingDocument processed : aAutomationService.listTabSepDocuments(trainFeature.getProject())) {
        processed.setProcessed(true);
    }
    return trainResult;
}
Also used : ArrayList(java.util.ArrayList) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) List(java.util.List) ArrayList(java.util.ArrayList) File(java.io.File) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) Mira(edu.lium.mira.Mira) TrainingDocument(de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument)

Example 3 with TrainingDocument

use of de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument in project webanno by webanno.

the class AutomationUtil method generateTrainDocument.

/**
 * Writes the MIRA training file for the train feature of the given template.
 * <p>
 * The file is assembled from three sources: training documents of the feature that are not in
 * tab-separated format, source documents in state {@code CURATION_FINISHED}, and tab-separated
 * training documents of the feature. Nothing is written when no change is detected (see the
 * checks A-D in the body).
 * <p>
 * With {@code aBase == true} the output goes to {@code <layer id>-<feature id>.train.ft} and
 * lines are generated without the feature; otherwise the output goes to
 * {@code <layer id>-<feature id>.train.base}, lines carry the feature, the handled training
 * documents are flagged as processed and the train-document counter of the automation status is
 * decremented.
 *
 * @param aTemplate
 *            the template providing the train feature and the other features.
 * @param aRepository
 *            the document service used to list the source documents of the project.
 * @param aCurationDocumentService
 *            used to read the curation CAS of curated source documents.
 * @param aAnnotationService
 *            provides the adapter for the layer of the train feature.
 * @param aAutomationService
 *            provides the MIRA directory, training documents and automation status.
 * @param aUserDao
 *            not read by this method.
 * @param aBase
 *            selects output file and line format as described above.
 * @throws IOException
 *             if reading a document or writing the training file fails.
 * @throws UIMAException
 *             declared by this method; the raising call is not visible here.
 * @throws ClassNotFoundException
 *             declared by this method; the raising call is not visible here.
 * @throws AutomationException
 *             if a line of a tab-separated document does not consist of exactly two columns.
 */
public static void generateTrainDocument(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, boolean aBase) throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    LOG.info("Starting to generate training document");
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    AnnotationFeature feature = aTemplate.getTrainFeature();
    boolean documentChanged = false;
    // A. training document for other train layers were changed
    for (AnnotationFeature otherrFeature : aTemplate.getOtherFeatures()) {
        for (TrainingDocument document : aAutomationService.listTrainingDocuments(aTemplate.getTrainFeature().getProject())) {
            if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(otherrFeature)) {
                documentChanged = true;
                break;
            }
        }
    }
    // B. Training document for the main training layer were changed
    for (TrainingDocument document : aAutomationService.listTrainingDocuments(feature.getProject())) {
        if (!document.isProcessed() && (document.getFeature() != null && document.getFeature().equals(feature))) {
            documentChanged = true;
            break;
        }
    }
    // C. New Curation document arrives
    // NOTE(review): any source document at all counts as a change here; the curation state is
    // only checked further below when the documents are actually read - confirm this is intended.
    if (aRepository.listSourceDocuments(feature.getProject()).size() > 0) {
        documentChanged = true;
    }
    // D. tab-sep training documents
    for (TrainingDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(feature)) {
            documentChanged = true;
            break;
        }
    }
    if (!documentChanged) {
        return;
    }
    File trainFile;
    if (aBase) {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.ft");
    } else {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.base");
    }
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    // try-with-resources guarantees the writer is flushed and closed even when reading one of
    // the input documents fails half-way through.
    try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile))) {
        AutomationTypeAdapter adapter = (AutomationTypeAdapter) aAnnotationService.getAdapter(feature.getLayer());
        // Training documents (Curated or webanno-compatible imported ones - read using UIMA)
        List<TrainingDocument> trainingDocuments = aAutomationService.listTrainingDocuments(feature.getProject());
        for (TrainingDocument trainingDocument : trainingDocuments) {
            if ((trainingDocument.getFeature() != null && trainingDocument.getFeature().equals(feature)) && !trainingDocument.getFormat().equals(WebAnnoConst.TAB_SEP)) {
                JCas jCas = aAutomationService.readTrainingAnnotationCas(trainingDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                }
                trainingDocument.setProcessed(!aBase);
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
        }
        // for curated documents
        List<SourceDocument> sourceDocuments = aRepository.listSourceDocuments(feature.getProject());
        int sourceDocsCount = 0;
        for (SourceDocument sourceDocument : sourceDocuments) {
            if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
                JCas jCas = aCurationDocumentService.readCurationCas(sourceDocument);
                for (Sentence sentence : select(jCas, Sentence.class)) {
                    if (aBase) {
                        // base training document
                        trainOut.append(getMiraLine(sentence, null, adapter).toString()).append("\n");
                    } else {
                        // training document with other features
                        trainOut.append(getMiraLine(sentence, feature, adapter).toString()).append("\n");
                    }
                }
                if (!aBase) {
                    status.setTrainDocs(status.getTrainDocs() - 1);
                }
            }
            sourceDocsCount++;
            // The total reported here is the number of source documents being iterated
            LOG.info("Processed source document " + sourceDocsCount + " of " + sourceDocuments.size());
        }
        // Tab-sep documents to be used as a target layer train document
        int goldStandardDocsCounter = 0;
        List<TrainingDocument> goldStandardDocs = aAutomationService.listTabSepDocuments(feature.getProject());
        for (TrainingDocument document : goldStandardDocs) {
            if (document.getFormat().equals(WebAnnoConst.TAB_SEP) && document.getFeature() != null && document.getFeature().equals(feature)) {
                File tabSepFile = new File(aAutomationService.getDocumentFolder(document), document.getName());
                // Closing the reader releases the file handle that the line iterator reads from
                try (FileReader tabSepReader = new FileReader(tabSepFile)) {
                    LineIterator it = IOUtils.lineIterator(tabSepReader);
                    while (it.hasNext()) {
                        String line = it.next();
                        if (line.trim().equals("")) {
                            trainOut.append("\n");
                        } else {
                            StringTokenizer st = new StringTokenizer(line, "\t");
                            if (st.countTokens() != 2) {
                                throw new AutomationException("This is not a valid TAB-SEP document");
                            }
                            if (aBase) {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), ""));
                            } else {
                                trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken()));
                            }
                        }
                    }
                }
            }
            goldStandardDocsCounter++;
            LOG.info("Processed gold standard document " + goldStandardDocsCounter + " of " + goldStandardDocs.size());
        }
    }
    LOG.info("Completed generating training document");
}
Also used : FileWriter(java.io.FileWriter) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) JCas(org.apache.uima.jcas.JCas) LineIterator(org.apache.commons.io.LineIterator) AutomationStatus(de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus) BufferedWriter(java.io.BufferedWriter) AutomationTypeAdapter(de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter) StringTokenizer(java.util.StringTokenizer) FileReader(java.io.FileReader) File(java.io.File) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) TrainingDocument(de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument)

Example 4 with TrainingDocument

use of de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument in project webanno by webanno.

the class AutomationUtil method getFeaturesTabSep.

/**
 * Runs every available tab-sep model of the project against the prediction input file and
 * collects the predicted tags.
 * <p>
 * For each tab-separated training document whose MIRA model file exists, the model is loaded,
 * {@code predFile} is tagged into {@code predcitedFile}, and the last space-separated token of
 * every non-blank output line is added to a new list that is appended to {@code predictions}.
 * Documents without a model file are skipped.
 *
 * @param aTemplate
 *            the template whose train feature determines the project.
 * @param aAutomationService
 *            provides the tab-separated documents and the model files.
 * @param beamSize
 *            beam size to configure on {@code mira}.
 * @param maxPosteriors
 *            max-posteriors flag to configure on {@code mira}.
 * @param layerFeature
 *            the feature for which the model file is looked up.
 * @param predictions
 *            receives one list of predicted tags per model that was applied.
 * @param mira
 *            the MIRA instance used for tagging; it is reconfigured by this method.
 * @param predFile
 *            the input file to tag.
 * @param predcitedFile
 *            the file the tagging output is written to and read back from.
 * @throws IOException
 *             if one of the files cannot be opened, read or closed.
 * @throws ClassNotFoundException
 *             declared by this method; presumably raised while loading the model.
 * @throws AutomationException
 *             if tagging fails for any reason.
 */
private static void getFeaturesTabSep(MiraTemplate aTemplate, AutomationService aAutomationService, int beamSize, boolean maxPosteriors, AnnotationFeature layerFeature, List<List<String>> predictions, Mira mira, File predFile, File predcitedFile) throws IOException, ClassNotFoundException, AutomationException {
    for (TrainingDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        int shiftColumns = 0;
        int nbest = 1;
        String modelName = aAutomationService.getMiraModel(layerFeature, true, document).getAbsolutePath();
        if (!new File(modelName).exists()) {
            continue;
        }
        String testName = predFile.getAbsolutePath();
        // Both streams are closed before the prediction file is read back below. Closing the
        // PrintStream also flushes it, so the file is complete by then.
        try (PrintStream stream = new PrintStream(predcitedFile);
                BufferedReader input = new BufferedReader(new FileReader(testName))) {
            mira.loadModel(modelName);
            mira.setShiftColumns(shiftColumns);
            mira.nbest = nbest;
            mira.beamSize = beamSize;
            mira.maxPosteriors = maxPosteriors;
            try {
                mira.test(input, stream);
            } catch (Exception e) {
                // NOTE(review): the original cause is dropped here; attach it once
                // AutomationException is confirmed to offer a constructor taking a cause.
                throw new AutomationException(document.getName() + " is Invalid TAB-SEP file!");
            }
        }
        List<String> annotations = new ArrayList<>();
        try (FileReader predictedReader = new FileReader(predcitedFile)) {
            LineIterator it = IOUtils.lineIterator(predictedReader);
            while (it.hasNext()) {
                String line = it.next();
                if (line.trim().equals("")) {
                    continue;
                }
                // The predicted tag is the last space-separated column of the line
                StringTokenizer st = new StringTokenizer(line, " ");
                String tag = "";
                while (st.hasMoreTokens()) {
                    tag = st.nextToken();
                }
                annotations.add(tag);
            }
        }
        predictions.add(annotations);
    }
}
Also used : PrintStream(java.io.PrintStream) InputStreamReader(java.io.InputStreamReader) ArrayList(java.util.ArrayList) LineIterator(org.apache.commons.io.LineIterator) NoResultException(javax.persistence.NoResultException) AnnotationException(de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException) CASException(org.apache.uima.cas.CASException) UIMAException(org.apache.uima.UIMAException) DataRetrievalFailureException(org.springframework.dao.DataRetrievalFailureException) IOException(java.io.IOException) StringTokenizer(java.util.StringTokenizer) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) File(java.io.File) TrainingDocument(de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument)

Example 5 with TrainingDocument

use of de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument in project webanno by webanno.

the class AutomationUtil method addTabSepTrainDocument.

/**
 * Converts the tab-separated training documents of the project that are not bound to a feature
 * into MIRA training files.
 * <p>
 * Nothing happens when every tab-separated document of the project is already flagged as
 * processed. Otherwise one file named {@code <document id><project id>.train} is written into
 * the MIRA directory per document without a feature; documents that have a feature are skipped
 * because they are target layer train documents. For each converted document the
 * train-document counter of the automation status is decremented.
 *
 * @param aTemplate
 *            the template whose train feature determines the project and the MIRA directory.
 * @param aAutomationService
 *            provides the MIRA directory, the tab-separated documents and the automation status.
 * @throws IOException
 *             if a document cannot be read or a training file cannot be written.
 * @throws UIMAException
 *             declared by this method; the raising call is not visible here.
 * @throws ClassNotFoundException
 *             declared by this method; the raising call is not visible here.
 * @throws AutomationException
 *             if a non-blank line does not consist of exactly two tab-separated columns.
 */
public static void addTabSepTrainDocument(MiraTemplate aTemplate, AutomationService aAutomationService) throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    boolean documentChanged = false;
    for (TrainingDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed()) {
            documentChanged = true;
            break;
        }
    }
    if (!documentChanged) {
        return;
    }
    for (TrainingDocument trainingDocument : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (trainingDocument.getFeature() != null) {
            // This is a target layer train document
            continue;
        }
        File trainFile = new File(miraDir, trainingDocument.getId() + trainingDocument.getProject().getId() + ".train");
        // try-with-resources closes the writer and the reader on every exit path, including
        // I/O failures and the invalid-document exception below.
        try (BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile))) {
            File tabSepFile = new File(aAutomationService.getDocumentFolder(trainingDocument), trainingDocument.getName());
            try (FileReader tabSepReader = new FileReader(tabSepFile)) {
                LineIterator it = IOUtils.lineIterator(tabSepReader);
                while (it.hasNext()) {
                    String line = it.next();
                    if (line.trim().equals("")) {
                        // A blank line is carried over as a blank line
                        trainOut.append("\n");
                    } else {
                        StringTokenizer st = new StringTokenizer(line, "\t");
                        if (st.countTokens() != 2) {
                            throw new AutomationException("This is not a valid TAB-SEP document");
                        }
                        trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken()));
                    }
                }
            }
            // NOTE(review): the document is flagged as NOT processed after its training file
            // has been written - confirm this is intended.
            trainingDocument.setProcessed(false);
            status.setTrainDocs(status.getTrainDocs() - 1);
        }
    }
}
Also used : StringTokenizer(java.util.StringTokenizer) FileWriter(java.io.FileWriter) FileReader(java.io.FileReader) File(java.io.File) LineIterator(org.apache.commons.io.LineIterator) AutomationStatus(de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus) TrainingDocument(de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument) BufferedWriter(java.io.BufferedWriter)

Aggregations

TrainingDocument (de.tudarmstadt.ukp.clarin.webanno.model.TrainingDocument)11 File (java.io.File)7 ArrayList (java.util.ArrayList)5 AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)4 AutomationTypeAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter)3 AutomationStatus (de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus)3 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)3 BufferedWriter (java.io.BufferedWriter)3 FileReader (java.io.FileReader)3 FileWriter (java.io.FileWriter)3 StringTokenizer (java.util.StringTokenizer)3 LineIterator (org.apache.commons.io.LineIterator)3 JCas (org.apache.uima.jcas.JCas)3 MiraTemplate (de.tudarmstadt.ukp.clarin.webanno.automation.model.MiraTemplate)2 Project (de.tudarmstadt.ukp.clarin.webanno.model.Project)2 SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument)2 User (de.tudarmstadt.ukp.clarin.webanno.security.model.User)2 Mira (edu.lium.mira.Mira)2 SpanAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.SpanAdapter)1 AnnotationException (de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException)1