Search in sources :

Example 11 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project dkpro-tc by dkpro.

the class TestTaskUtils method initJCas.

/**
 * Builds a small JCas fixture consisting of six tokens, each annotated as a
 * {@link TextClassificationTarget} with a matching {@link TextClassificationOutcome},
 * and two {@link TextClassificationSequence} annotations covering the text.
 *
 * @param setUnitIdAsPartOfTheInstanceId
 *            if {@code true}, the token text is additionally set as suffix of each target
 * @return the populated JCas
 * @throws Exception
 *             if the engine or the JCas cannot be created
 */
private JCas initJCas(boolean setUnitIdAsPartOfTheInstanceId) throws Exception {
    JCas cas = AnalysisEngineFactory.createEngine(NoOpAnnotator.class).newJCas();

    JCasId casId = new JCasId(cas);
    casId.setId(4711);
    casId.addToIndexes();

    DocumentMetaData metaData = new DocumentMetaData(cas);
    metaData.setDocumentTitle("title");
    metaData.setDocumentId("4711");
    metaData.addToIndexes();

    // Each entry is { token text, outcome label }.
    String[][] wordsAndLabels = {
        { "a", "DT" },
        { "car", "NN" },
        { "drives", "VBZ" },
        { "the", "DT" },
        { "hedgehogs", "NN" },
        { "dies", "VBZ" }
    };

    // The document text is assembled while annotating so that the offsets of every
    // target/outcome pair line up with the token it describes.
    StringBuilder buffer = new StringBuilder();
    int position = 0;
    for (String[] entry : wordsAndLabels) {
        String word = entry[0];
        String label = entry[1];
        int begin = buffer.length();
        int end = begin + word.length();

        TextClassificationTarget target = new TextClassificationTarget(cas, begin, end);
        if (setUnitIdAsPartOfTheInstanceId) {
            target.setSuffix(word);
        }
        target.setId(position);
        target.addToIndexes();

        TextClassificationOutcome label4Target = new TextClassificationOutcome(cas, begin, end);
        label4Target.setOutcome(label);
        label4Target.addToIndexes();

        buffer.append(word);
        position++;
        // Single blank between tokens, none after the last one.
        if (position < wordsAndLabels.length) {
            buffer.append(" ");
        }
    }

    String documentText = buffer.toString();
    cas.setDocumentText(documentText);

    // The first sequence spans the first three tokens including the two blanks between them;
    // the second sequence starts one character later and runs to the end of the text.
    int firstSequenceEnd = String
            .join(" ", wordsAndLabels[0][0], wordsAndLabels[1][0], wordsAndLabels[2][0]).length();
    TextClassificationSequence first = new TextClassificationSequence(cas, 0, firstSequenceEnd);
    first.addToIndexes();
    TextClassificationSequence second = new TextClassificationSequence(cas, firstSequenceEnd + 1,
            documentText.length());
    second.addToIndexes();

    return cas;
}
Also used : JCasId(org.dkpro.tc.api.type.JCasId) TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) JCas(org.apache.uima.jcas.JCas) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) TextClassificationSequence(org.dkpro.tc.api.type.TextClassificationSequence) AnalysisEngine(org.apache.uima.analysis_engine.AnalysisEngine)

Example 12 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project dkpro-tc by dkpro.

the class TestFoldUtil method createNoSequenceCas.

/**
 * Creates the "no sequence" test CAS: a single sentence annotated with classification units,
 * document metadata and a JCas id, which is then written as a binary CAS into a freshly
 * created temporary folder.
 *
 * @throws Exception
 *             if the temporary folder, the CAS or the writer cannot be created or run
 */
private void createNoSequenceCas() throws Exception {
    tmpFoldNoSeq = new TemporaryFolder();
    tmpFoldNoSeq.create();

    jcasNoSequence = JCasFactory.createJCas();
    jcasNoSequence.setDocumentText("Mr. Hawksley said yesterday he would be willing to go before the city .");

    // { begin, end } offsets of the units, in document order.
    // NOTE(review): (0,2), (13,18) and (18,28) do not coincide with the token boundaries of
    // the text above and "he" (28,30) has no unit - confirm this is intended by the tests.
    int[][] unitOffsets = {
        { 0, 2 }, { 4, 12 }, { 13, 18 }, { 18, 28 }, { 31, 36 }, { 37, 39 }, { 40, 47 },
        { 48, 50 }, { 51, 53 }, { 54, 60 }, { 61, 64 }, { 65, 69 }, { 70, 71 }
    };
    for (int[] offsets : unitOffsets) {
        setUnit(jcasNoSequence, offsets[0], offsets[1]);
    }

    DocumentMetaData metaData = new DocumentMetaData(jcasNoSequence);
    metaData.setDocumentId("id");
    metaData.addToIndexes();

    createJCasIdAnnotation(jcasNoSequence);

    // Serialize the CAS into the temporary folder using binary CAS format "6+".
    AnalysisEngine binaryCasWriter = AnalysisEngineFactory.createEngine(
            BinaryCasWriter.class,
            BinaryCasWriter.PARAM_TARGET_LOCATION, tmpFoldNoSeq.getRoot(),
            BinaryCasWriter.PARAM_FORMAT, "6+");
    binaryCasWriter.process(jcasNoSequence);
}
Also used : TemporaryFolder(org.junit.rules.TemporaryFolder) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) AnalysisEngine(org.apache.uima.analysis_engine.AnalysisEngine)

Example 13 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project dkpro-tc by dkpro.

the class TestPairReader method getNext.

/**
 * Fills the given CAS via the superclass reader and then adds a {@link JCasId}, one
 * {@link TextClassificationOutcome} per outcome value, and makes the document title and URI
 * unique by appending the running file offset.
 */
@Override
public void getNext(JCas jcas) throws IOException, CollectionException {
    super.getNext(jcas);

    JCasId casId = new JCasId(jcas);
    casId.setId(jcasid++);
    casId.addToIndexes();

    for (String label : getTextClassificationOutcomes(jcas)) {
        TextClassificationOutcome annotation = new TextClassificationOutcome(jcas);
        annotation.setOutcome(label);
        annotation.addToIndexes();
    }

    // Several CASes are produced from one file; without distinct titles and URIs the
    // serialized CASes would overwrite each other.
    String offsetSuffix = "-" + fileOffset;
    DocumentMetaData metaData = DocumentMetaData.get(jcas);
    metaData.setDocumentTitle(metaData.getDocumentTitle() + offsetSuffix);
    metaData.setDocumentUri(metaData.getDocumentUri() + offsetSuffix);
    fileOffset++;
}
Also used : JCasId(org.dkpro.tc.api.type.JCasId) TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData)

Example 14 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project webanno by webanno.

the class CasStorageServiceImpl method realWriteCas.

/**
 * Analyzes the CAS with the CAS Doctor (if one is configured), writes it as
 * {@code <user>.ser} and maintains the backup/history files of that user.
 * <p>
 * While writing, the previous version is kept as {@code <user>.ser.old} and restored if the
 * write fails. If {@code backupInterval > 0}, timestamped {@code <user>.ser.<millis>.bak}
 * copies are created and pruned according to {@code backupKeepNumber} and
 * {@code backupKeepTime}.
 *
 * @param aProject
 *            project the document belongs to (used for analysis and log messages)
 * @param aDocumentName
 *            document name (log/error messages only)
 * @param aDocumentId
 *            document id (log/error messages only)
 * @param aJcas
 *            the CAS to persist; its document id is set to {@code aUserName}
 * @param aUserName
 *            user owning the annotations; determines the file names
 * @param aAnnotationFolder
 *            folder holding the current version, the temporary backup and the history files
 * @param aTargetPath
 *            folder the serialized CAS is written to
 * @throws IOException
 *             if writing the CAS or managing backup/history files fails
 * @throws DataRetrievalFailureException
 *             if the CAS Doctor reports problems or the analysis itself fails
 */
private void realWriteCas(Project aProject, String aDocumentName, long aDocumentId, JCas aJcas, String aUserName, File aAnnotationFolder, File aTargetPath) throws IOException {
    log.debug("Writing annotation document [{}]({}) for user [{}] in project [{}]({})", aDocumentName, aDocumentId, aUserName, aProject.getName(), aProject.getId());
    try {
        if (casDoctor != null) {
            casDoctor.analyze(aProject, aJcas.getCas());
        }
    } catch (CasDoctorException e) {
        // Collect all findings of the CAS Doctor into a single message
        StringBuilder detailMsg = new StringBuilder();
        detailMsg.append("CAS Doctor found problems for user [").append(aUserName).append("] in source document [").append(aDocumentName).append("] (").append(aDocumentId).append(") in project[").append(aProject.getName()).append("] (").append(aProject.getId()).append(")\n");
        e.getDetails().forEach(m -> detailMsg.append(String.format("- [%s] %s%n", m.level, m.message)));
        throw new DataRetrievalFailureException(detailMsg.toString());
    } catch (Exception e) {
        throw new DataRetrievalFailureException("Error analyzing CAS of user [" + aUserName + "] in source document [" + aDocumentName + "] (" + aDocumentId + ") in project [" + aProject.getName() + "] (" + aProject.getId() + ")", e);
    }
    synchronized (lock) {
        FileUtils.forceMkdir(aAnnotationFolder);
        final String username = aUserName;
        File currentVersion = new File(aAnnotationFolder, username + ".ser");
        File oldVersion = new File(aAnnotationFolder, username + ".ser.old");
        // Save current version
        try {
            // Make a backup of the current version of the file before overwriting
            if (currentVersion.exists()) {
                renameFile(currentVersion, oldVersion);
            }
            // Now write the new version to "<username>.ser" or CURATION_USER.ser
            DocumentMetaData md;
            try {
                md = DocumentMetaData.get(aJcas);
            } catch (IllegalArgumentException e) {
                // No metadata annotation in the CAS yet
                md = DocumentMetaData.create(aJcas);
            }
            md.setDocumentId(aUserName);
            // NOTE(review): the CAS is written below aTargetPath while backup and restore work
            // on aAnnotationFolder - presumably both denote the same folder; confirm with callers.
            CasPersistenceUtils.writeSerializedCas(aJcas, new File(aTargetPath, aUserName + ".ser"));
            try (MDC.MDCCloseable closable = MDC.putCloseable(Logging.KEY_PROJECT_ID, String.valueOf(aProject.getId()))) {
                log.info("Updated annotations for user [{}] on document [{}]({}) in project [{}]({})", aUserName, aDocumentName, aDocumentId, aProject.getName(), aProject.getId());
            }
            // If the saving was successful, we delete the old version
            if (oldVersion.exists()) {
                FileUtils.forceDelete(oldVersion);
            }
        } catch (IOException e) {
            // If we could not save the new version, restore the old one. The current version
            // may not exist at this point (it was renamed to the backup and the write failed
            // before creating a new file); forceDelete would then throw, hide the original
            // error and skip the restore, so only delete what is actually there.
            try {
                if (currentVersion.exists()) {
                    FileUtils.forceDelete(currentVersion);
                }
                // If this is the first version, there is no old version, so do not restore
                // anything
                if (oldVersion.exists()) {
                    renameFile(oldVersion, currentVersion);
                }
            } catch (IOException restoreFailure) {
                // Keep the original failure as the primary error
                e.addSuppressed(restoreFailure);
            }
            // Now abort anyway
            throw e;
        }
        // Manage history
        if (backupInterval > 0) {
            // Determine the reference point in time based on the current version
            long now = currentVersion.lastModified();
            // Get all history files for the current user
            File[] history = aAnnotationFolder.listFiles(new FileFilter() {

                private final Matcher matcher = Pattern.compile(Pattern.quote(username) + "\\.ser\\.[0-9]+\\.bak").matcher("");

                @Override
                public boolean accept(File aFile) {
                    // Check if the filename matches the pattern given above.
                    return matcher.reset(aFile.getName()).matches();
                }
            });
            // listFiles() returns null if the folder cannot be read
            if (history == null) {
                throw new IOException("Unable to list history files in [" + aAnnotationFolder + "]");
            }
            // Sort the files (oldest one first)
            Arrays.sort(history, LastModifiedFileComparator.LASTMODIFIED_COMPARATOR);
            // Check if we need to make a new history file
            boolean historyFileCreated = false;
            File historyFile = new File(aAnnotationFolder, username + ".ser." + now + ".bak");
            if (history.length == 0) {
                // If there is no history yet but we should keep history, then we create a
                // history file in any case.
                FileUtils.copyFile(currentVersion, historyFile);
                historyFileCreated = true;
            } else {
                // Check if the newest history file is significantly older than the current one
                File latestHistory = history[history.length - 1];
                if (latestHistory.lastModified() + backupInterval < now) {
                    FileUtils.copyFile(currentVersion, historyFile);
                    historyFileCreated = true;
                }
            }
            // Prune history based on number of backup
            if (historyFileCreated) {
                // The new version is not in the history, so we keep that in any case. That
                // means we need to keep one less.
                int toKeep = Math.max(backupKeepNumber - 1, 0);
                if ((backupKeepNumber > 0) && (toKeep < history.length)) {
                    // Copy the oldest files to a new array
                    File[] toRemove = new File[history.length - toKeep];
                    System.arraycopy(history, 0, toRemove, 0, toRemove.length);
                    // Restrict the history to what is left
                    File[] newHistory = new File[toKeep];
                    if (toKeep > 0) {
                        System.arraycopy(history, toRemove.length, newHistory, 0, newHistory.length);
                    }
                    history = newHistory;
                    // Remove these old files
                    for (File file : toRemove) {
                        FileUtils.forceDelete(file);
                        try (MDC.MDCCloseable closable = MDC.putCloseable(Logging.KEY_PROJECT_ID, String.valueOf(aProject.getId()))) {
                            log.info("Removed surplus history file [{}] of user [{}] for " + "document [{}]({}) in project [{}]({})", file.getName(), aUserName, aDocumentName, aDocumentId, aProject.getName(), aProject.getId());
                        }
                    }
                }
                // Prune history based on time
                if (backupKeepTime > 0) {
                    for (File file : history) {
                        if ((file.lastModified() + backupKeepTime) < now) {
                            FileUtils.forceDelete(file);
                            try (MDC.MDCCloseable closable = MDC.putCloseable(Logging.KEY_PROJECT_ID, String.valueOf(aProject.getId()))) {
                                log.info("Removed outdated history file [{}] of user [{}] for " + "document [{}]({}) in project [{}]({})", file.getName(), aUserName, aDocumentName, aDocumentId, aProject.getName(), aProject.getId());
                            }
                        }
                    }
                }
            }
        }
    }
}
Also used : Arrays(java.util.Arrays) CasDoctorException(de.tudarmstadt.ukp.clarin.webanno.diag.CasDoctorException) LoggerFactory(org.slf4j.LoggerFactory) CasStorageService(de.tudarmstadt.ukp.clarin.webanno.api.CasStorageService) CAS(org.apache.uima.cas.CAS) Autowired(org.springframework.beans.factory.annotation.Autowired) HashMap(java.util.HashMap) DataRetrievalFailureException(org.springframework.dao.DataRetrievalFailureException) InitializingBean(org.springframework.beans.factory.InitializingBean) RequestCycle(org.apache.wicket.request.cycle.RequestCycle) Value(org.springframework.beans.factory.annotation.Value) TypeSystemDescription(org.apache.uima.resource.metadata.TypeSystemDescription) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) Matcher(java.util.regex.Matcher) User(de.tudarmstadt.ukp.clarin.webanno.security.model.User) Map(java.util.Map) Project(de.tudarmstadt.ukp.clarin.webanno.model.Project) UIMAException(org.apache.uima.UIMAException) JCas(org.apache.uima.jcas.JCas) Logger(org.slf4j.Logger) PROJECT_FOLDER(de.tudarmstadt.ukp.clarin.webanno.api.ProjectService.PROJECT_FOLDER) IOException(java.io.IOException) FileUtils(org.apache.commons.io.FileUtils) AnnotationDocument(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument) ANNOTATION_FOLDER(de.tudarmstadt.ukp.clarin.webanno.api.ProjectService.ANNOTATION_FOLDER) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) CasCreationUtils(org.apache.uima.util.CasCreationUtils) Component(org.springframework.stereotype.Component) FileFilter(java.io.FileFilter) CasDoctor(de.tudarmstadt.ukp.clarin.webanno.diag.CasDoctor) AbstractRequestCycleListener(org.apache.wicket.request.cycle.AbstractRequestCycleListener) DOCUMENT_FOLDER(de.tudarmstadt.ukp.clarin.webanno.api.ProjectService.DOCUMENT_FOLDER) MetaDataKey(org.apache.wicket.MetaDataKey) MDC(org.slf4j.MDC) LastModifiedFileComparator(org.apache.commons.io.comparator.LastModifiedFileComparator) 
SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) Entry(java.util.Map.Entry) Pattern(java.util.regex.Pattern) Logging(de.tudarmstadt.ukp.clarin.webanno.support.logging.Logging) Matcher(java.util.regex.Matcher) IOException(java.io.IOException) MDC(org.slf4j.MDC) CasDoctorException(de.tudarmstadt.ukp.clarin.webanno.diag.CasDoctorException) DataRetrievalFailureException(org.springframework.dao.DataRetrievalFailureException) UIMAException(org.apache.uima.UIMAException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) CasDoctorException(de.tudarmstadt.ukp.clarin.webanno.diag.CasDoctorException) DataRetrievalFailureException(org.springframework.dao.DataRetrievalFailureException) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) FileFilter(java.io.FileFilter) File(java.io.File)

Example 15 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project webanno by webanno.

the class ImportExportServiceImpl method exportCasToFile.

/**
 * Exports the given CAS using the given writer class and returns the resulting file.
 * <p>
 * The writer runs against a fresh temporary directory so that every export is isolated; this
 * is useful as the written file can have multiple extensions based on the writer class used.
 * If the writer produced exactly one file, a copy of it placed next to the temporary directory
 * is returned; if it produced several files, they are packaged as
 * {@code <temporary directory>.zip}. The temporary directory is removed afterwards.
 *
 * @param cas
 *            the CAS to export; its document metadata and tagset names are updated first
 * @param aDocument
 *            the source document the CAS belongs to
 * @param aFileName
 *            file name to record in the document URI
 * @param aWriter
 *            the writer class used to create the engine description
 * @param aStripExtension
 *            passed to the writer as {@code PARAM_STRIP_EXTENSION}
 * @return the exported file or ZIP archive
 * @throws IOException
 *             if the writer produced no file or a file operation fails
 * @throws UIMAException
 *             if the writer cannot be created or run
 */
@Override
public File exportCasToFile(CAS cas, SourceDocument aDocument, String aFileName, @SuppressWarnings("rawtypes") Class aWriter, boolean aStripExtension) throws IOException, UIMAException {
    // Update the source file name in case it is changed for some reason. This is necessary
    // for the writers to create the files under the correct names.
    Project project = aDocument.getProject();
    File currentDocumentUri = new File(dir.getAbsolutePath() + "/" + PROJECT_FOLDER + "/" + project.getId() + "/" + DOCUMENT_FOLDER + "/" + aDocument.getId() + "/" + SOURCE_FOLDER);
    String baseUri = currentDocumentUri.toURI().toURL().toExternalForm();
    DocumentMetaData documentMetadata = DocumentMetaData.get(cas.getJCas());
    documentMetadata.setDocumentUri(new File(currentDocumentUri, aFileName).toURI().toURL().toExternalForm());
    documentMetadata.setDocumentBaseUri(baseUri);
    documentMetadata.setCollectionId(baseUri);
    // update with the correct tagset name
    List<AnnotationFeature> features = annotationService.listAnnotationFeature(project);
    for (AnnotationFeature feature : features) {
        TagSet tagSet = feature.getTagset();
        if (tagSet == null) {
            continue;
        }
        if (!feature.getLayer().getType().equals(WebAnnoConst.CHAIN_TYPE)) {
            updateCasWithTagSet(cas, feature.getLayer().getName(), tagSet.getName());
        }
    }
    // createTempFile only reserves a unique name; the file is replaced by a directory below
    File exportTempDir = File.createTempFile("webanno", "export");
    try {
        exportTempDir.delete();
        exportTempDir.mkdirs();
        AnalysisEngineDescription writer;
        if (aWriter.getName().equals("de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv3Writer")) {
            // The TSV3 writer needs to be told which layers/features are present in the CAS
            List<AnnotationLayer> layers = annotationService.listAnnotationLayer(aDocument.getProject());
            List<String> slotFeatures = new ArrayList<>();
            List<String> slotTargets = new ArrayList<>();
            List<String> linkTypes = new ArrayList<>();
            Set<String> spanLayers = new HashSet<>();
            Set<String> slotLayers = new HashSet<>();
            for (AnnotationLayer layer : layers) {
                if (layer.getType().contentEquals(WebAnnoConst.SPAN_TYPE)) {
                    // TSV will not use this
                    if (!annotationExists(cas, layer.getName())) {
                        continue;
                    }
                    boolean isslotLayer = false;
                    for (AnnotationFeature f : annotationService.listAnnotationFeature(layer)) {
                        if (MultiValueMode.ARRAY.equals(f.getMultiValueMode()) && LinkMode.WITH_ROLE.equals(f.getLinkMode())) {
                            isslotLayer = true;
                            slotFeatures.add(layer.getName() + ":" + f.getName());
                            slotTargets.add(f.getType());
                            linkTypes.add(f.getLinkTypeName());
                        }
                    }
                    if (isslotLayer) {
                        slotLayers.add(layer.getName());
                    } else {
                        spanLayers.add(layer.getName());
                    }
                }
            }
            spanLayers.addAll(slotLayers);
            List<String> chainLayers = new ArrayList<>();
            for (AnnotationLayer layer : layers) {
                if (layer.getType().contentEquals(WebAnnoConst.CHAIN_TYPE)) {
                    if (!chainAnnotationExists(cas, layer.getName() + "Chain")) {
                        continue;
                    }
                    chainLayers.add(layer.getName());
                }
            }
            List<String> relationLayers = new ArrayList<>();
            for (AnnotationLayer layer : layers) {
                if (layer.getType().contentEquals(WebAnnoConst.RELATION_TYPE)) {
                    // TSV will not use this
                    if (!annotationExists(cas, layer.getName())) {
                        continue;
                    }
                    relationLayers.add(layer.getName());
                }
            }
            writer = createEngineDescription(aWriter, JCasFileWriter_ImplBase.PARAM_TARGET_LOCATION, exportTempDir, JCasFileWriter_ImplBase.PARAM_STRIP_EXTENSION, aStripExtension, "spanLayers", spanLayers, "slotFeatures", slotFeatures, "slotTargets", slotTargets, "linkTypes", linkTypes, "chainLayers", chainLayers, "relationLayers", relationLayers);
        } else {
            writer = createEngineDescription(aWriter, JCasFileWriter_ImplBase.PARAM_TARGET_LOCATION, exportTempDir, JCasFileWriter_ImplBase.PARAM_STRIP_EXTENSION, aStripExtension);
        }
        runPipeline(cas, writer);
        // List the result once; listFiles() returns null if the directory cannot be read and
        // an empty array if the writer did not produce anything.
        File[] exportedFiles = exportTempDir.listFiles();
        if (exportedFiles == null || exportedFiles.length == 0) {
            throw new IOException("Writer [" + aWriter.getName() + "] did not produce any file in [" + exportTempDir + "]");
        }
        // If the writer produced more than one file, we package it up as a ZIP file
        File exportFile;
        if (exportedFiles.length > 1) {
            exportFile = new File(exportTempDir.getAbsolutePath() + ".zip");
            try {
                ZipUtils.zipFolder(exportTempDir, exportFile);
            } catch (Exception e) {
                // Best effort as before, but keep the cause in the log
                try (MDC.MDCCloseable closable = MDC.putCloseable(Logging.KEY_PROJECT_ID, String.valueOf(project.getId()))) {
                    log.info("Unable to create zip File", e);
                }
            }
        } else {
            exportFile = new File(exportTempDir.getParent(), exportedFiles[0].getName());
            FileUtils.copyFile(exportedFiles[0], exportFile);
        }
        return exportFile;
    } finally {
        // forceDelete throws for a missing path, which would hide the actual failure if the
        // temporary directory could not be created in the first place
        if (exportTempDir.exists()) {
            FileUtils.forceDelete(exportTempDir);
        }
    }
}
Also used : ArrayList(java.util.ArrayList) AnnotationLayer(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationLayer) UIMAException(org.apache.uima.UIMAException) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) Project(de.tudarmstadt.ukp.clarin.webanno.model.Project) TagSet(de.tudarmstadt.ukp.clarin.webanno.model.TagSet) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) File(java.io.File) AnnotationFeature(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature) HashSet(java.util.HashSet)

Aggregations

DocumentMetaData (de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData)27 JCas (org.apache.uima.jcas.JCas)7 ArrayList (java.util.ArrayList)6 IOException (java.io.IOException)5 AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine)4 TextClassificationOutcome (org.dkpro.tc.api.type.TextClassificationOutcome)4 Project (de.tudarmstadt.ukp.clarin.webanno.model.Project)3 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)3 HashMap (java.util.HashMap)3 HashSet (java.util.HashSet)3 AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException)3 CAS (org.apache.uima.cas.CAS)3 JCasId (org.dkpro.tc.api.type.JCasId)3 TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget)3 AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument)2 SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument)2 User (de.tudarmstadt.ukp.clarin.webanno.security.model.User)2 File (java.io.File)2 FileNotFoundException (java.io.FileNotFoundException)2 LinkedHashMap (java.util.LinkedHashMap)2