Search in sources:

Example 1 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project webanno by webanno.

the class WebannoTsv2Reader method convertToCas.

/**
 * Fills the given CAS from the supplied input stream.
 *
 * <p>The document title found in the CAS's {@link DocumentMetaData} is stored in
 * {@code fileName}. The stream is then handed to {@code setAnnotations} together with a text
 * buffer, and whatever that buffer holds afterwards becomes the document text of the CAS.
 *
 * @param aJCas
 *            the CAS to populate; must already carry a {@link DocumentMetaData} annotation.
 * @param aIs
 *            the input stream, passed on to {@code setAnnotations}.
 * @param aEncoding
 *            the encoding, passed on to {@code setAnnotations}.
 * @throws IOException
 *             declared by the signature; presumably raised by {@code setAnnotations} when
 *             reading fails.
 */
public void convertToCas(JCas aJCas, InputStream aIs, String aEncoding) throws IOException {
    // Remember the document title before the content is processed.
    fileName = DocumentMetaData.get(aJCas).getDocumentTitle();
    // setAnnotations appends the document text to this buffer while it reads the stream.
    StringBuilder documentText = new StringBuilder();
    setAnnotations(aJCas, aIs, aEncoding, documentText);
    aJCas.setDocumentText(documentText.toString());
}
Also used : DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData)

Example 2 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project webanno by webanno.

the class WebannoTsv2Writer method convertToTsv.

/**
 * Writes the annotations of the given CAS to the output stream as tab-separated text.
 *
 * <p>The output consists of:
 * <ol>
 * <li>a single header line listing every span type ({@code " # <type>"}) followed by its
 * features ({@code " | <feature>"}), then every relation type with its features and an
 * {@code " | AttachTo=<type>"} entry;</li>
 * <li>for each sentence a {@code #id=} line, a {@code #text=} line (line breaks removed) and one
 * row per token holding the token id, the token text, one column per span feature, one column
 * per relation feature and one governor column per relation type;</li>
 * <li>an empty line after each sentence.</li>
 * </ol>
 *
 * <p>Types having a feature named {@code FIRST} or {@code NEXT} (coreference chains) are not
 * written. Token, Sentence, DocumentMetaData, TagsetDescription and CoreferenceLink annotations
 * are never treated as annotation layers.
 *
 * @param aJCas
 *            the CAS whose annotations are written.
 * @param aOs
 *            the stream to write to; it is not closed by this method.
 * @param aEncoding
 *            the character encoding passed to every write.
 * @throws IOException
 *             if writing to the stream fails.
 * @throws ResourceInitializationException
 *             declared by the signature; not thrown directly by the statements in this method.
 * @throws CASRuntimeException
 *             declared by the signature; not thrown directly by the statements in this method.
 * @throws CASException
 *             declared by the signature; not thrown directly by the statements in this method.
 */
private void convertToTsv(JCas aJCas, OutputStream aOs, String aEncoding) throws IOException, ResourceInitializationException, CASRuntimeException, CASException {
    LowLevelCAS llCas = aJCas.getLowLevelCas();
    tokenIds = new HashMap<>();
    setTokenId(aJCas, tokenIds);
    tokenPositions = new TreeMap<>();
    setTokenPosition(aJCas, tokenPositions);
    Map<Integer, Integer> tokensPerSentence = new TreeMap<>();
    setTokenSentenceAddress(aJCas, tokensPerSentence);
    // Collect the types of all annotations that are candidates for being written.
    Set<Type> allTypes = new LinkedHashSet<>();
    for (Annotation a : select(aJCas, Annotation.class)) {
        if (!(a instanceof Token || a instanceof Sentence || a instanceof DocumentMetaData || a instanceof TagsetDescription || a instanceof CoreferenceLink)) {
            allTypes.add(a.getType());
        }
    }
    // A type is a relation (arc) type if it has a governor feature.
    Set<Type> relationTypes = new LinkedHashSet<>();
    for (Type type : allTypes) {
        if (type.getFeatures().isEmpty()) {
            continue;
        }
        for (Feature feature : type.getFeatures()) {
            if (feature.getShortName().equals(GOVERNOR)) {
                relationTypes.add(type);
                break;
            }
        }
    }
    // What remains in allTypes are the span types.
    allTypes.removeAll(relationTypes);
    // For every relation type, determine the type its governor points to ("AttachTo").
    Map<Type, String> relationTypesMap = new HashMap<>();
    for (Type type : relationTypes) {
        // Dependency relations are always reported as attached to POS.
        if (type.getName().equals(Dependency.class.getName())) {
            relationTypesMap.put(type, POS.class.getName());
            continue;
        }
        // Otherwise take the type of the governor value; the last annotation visited wins.
        for (AnnotationFS anno : CasUtil.select(aJCas.getCas(), type)) {
            for (Feature feature : type.getFeatures()) {
                if (feature.getShortName().equals(GOVERNOR)) {
                    relationTypesMap.put(type, anno.getFeatureValue(feature).getType().getName());
                }
            }
        }
    }
    // Header, part 1: all span types and their features.
    Map<Feature, Type> spanFeatures = new LinkedHashMap<>();
    nextSpanType: for (Type type : allTypes) {
        if (type.getFeatures().isEmpty()) {
            continue;
        }
        for (Feature feature : type.getFeatures()) {
            // coreference annotation not supported
            if (feature.getShortName().equals(FIRST) || feature.getShortName().equals(NEXT)) {
                continue nextSpanType;
            }
        }
        IOUtils.write(" # " + type.getName(), aOs, aEncoding);
        for (Feature feature : type.getFeatures()) {
            // sofa/begin/end are built-in features and never become columns.
            if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end")) {
                continue;
            }
            spanFeatures.put(feature, type);
            IOUtils.write(" | " + feature.getShortName(), aOs, aEncoding);
        }
    }
    // Header, part 2: all relation types and their features.
    Set<Feature> relationFeatures = new LinkedHashSet<>();
    for (Type type : relationTypes) {
        IOUtils.write(" # " + type.getName(), aOs, aEncoding);
        for (Feature feature : type.getFeatures()) {
            // Built-in features and the two arc endpoints are not feature columns.
            if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT)) {
                continue;
            }
            relationFeatures.add(feature);
            IOUtils.write(" | " + feature.getShortName(), aOs, aEncoding);
        }
        // Add the attach type for the relation annotation
        IOUtils.write(" | AttachTo=" + relationTypesMap.get(type), aOs, aEncoding);
    }
    IOUtils.write("\n", aOs, aEncoding);
    // Per span feature: token address -> value to print in that token's column.
    Map<Feature, Map<Integer, String>> allAnnos = new HashMap<>();
    nextSpanType: for (Type type : allTypes) {
        for (Feature feature : type.getFeatures()) {
            // coreference annotation not supported
            if (feature.getShortName().equals(FIRST) || feature.getShortName().equals(NEXT)) {
                continue nextSpanType;
            }
        }
        for (Feature feature : type.getFeatures()) {
            if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end")) {
                continue;
            }
            Map<Integer, String> tokenAnnoMap = new TreeMap<>();
            setTokenAnnos(aJCas.getCas(), tokenAnnoMap, type, feature);
            allAnnos.put(feature, tokenAnnoMap);
        }
    }
    // get tokens where dependents are drawn to
    Map<Feature, Map<Integer, String>> relAnnos = new HashMap<>();
    for (Type type : relationTypes) {
        for (Feature feature : type.getFeatures()) {
            if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT)) {
                continue;
            }
            Map<Integer, String> tokenAnnoMap = new HashMap<>();
            setRelationFeatureAnnos(aJCas.getCas(), tokenAnnoMap, type, feature);
            relAnnos.put(feature, tokenAnnoMap);
        }
    }
    // get tokens where dependents are drawn from - the governor
    Map<Type, Map<Integer, String>> governorAnnos = new HashMap<>();
    for (Type type : relationTypes) {
        Map<Integer, String> govAnnoMap = new HashMap<>();
        setRelationGovernorPos(aJCas.getCas(), govAnnoMap, type);
        governorAnnos.put(type, govAnnoMap);
    }
    // Body: one block per sentence, one row per token.
    int sentId = 1;
    for (Sentence sentence : select(aJCas, Sentence.class)) {
        IOUtils.write("#id=" + sentId++ + "\n", aOs, aEncoding);
        IOUtils.write("#text=" + sentence.getCoveredText().replace("\n", "") + "\n", aOs, aEncoding);
        for (Token token : selectCovered(Token.class, sentence)) {
            // All per-token maps are keyed by this address; look it up once per token.
            int tokenAddr = llCas.ll_getFSRef(token);
            IOUtils.write(tokenIds.get(tokenAddr) + "\t" + token.getCoveredText() + "\t", aOs, aEncoding);
            // all span annotations on this token
            for (Feature feature : spanFeatures.keySet()) {
                String annos = allAnnos.get(feature).get(tokenAddr);
                if (annos == null) {
                    // Types listed in multipleSpans use "O" as the empty marker, all others "_".
                    if (multipleSpans.contains(spanFeatures.get(feature).getName())) {
                        IOUtils.write("O\t", aOs, aEncoding);
                    } else {
                        IOUtils.write("_\t", aOs, aEncoding);
                    }
                } else {
                    IOUtils.write(annos + "\t", aOs, aEncoding);
                }
            }
            for (Type type : relationTypes) {
                for (Feature feature : type.getFeatures()) {
                    if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT)) {
                        continue;
                    }
                    String annos = relAnnos.get(feature).get(tokenAddr);
                    if (annos == null) {
                        IOUtils.write("_\t", aOs, aEncoding);
                    } else {
                        IOUtils.write(annos + "\t", aOs, aEncoding);
                    }
                }
                // the governor positions
                String govPos = governorAnnos.get(type).get(tokenAddr);
                if (govPos == null) {
                    IOUtils.write("_\t", aOs, aEncoding);
                } else {
                    IOUtils.write(govPos + "\t", aOs, aEncoding);
                }
            }
            IOUtils.write("\n", aOs, aEncoding);
        }
        // Empty line separates sentences.
        IOUtils.write("\n", aOs, aEncoding);
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Feature(org.apache.uima.cas.Feature) TagsetDescription(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription) LinkedHashMap(java.util.LinkedHashMap) AnnotationFS(org.apache.uima.cas.text.AnnotationFS) CoreferenceLink(de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) LowLevelCAS(org.apache.uima.cas.impl.LowLevelCAS) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) Dependency(de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency) TreeMap(java.util.TreeMap) Annotation(org.apache.uima.jcas.tcas.Annotation) Type(org.apache.uima.cas.Type) POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) NavigableMap(java.util.NavigableMap) TreeMap(java.util.TreeMap)

Example 3 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project webanno by webanno.

the class CasDiff2 method addCas.

/**
 * CASes are added to the diff one after another, building the diff iteratively. A CAS can be
 * added multiple times for different types. Make sure a CAS is not added twice with the same
 * type!
 *
 * <p>The CAS is first recorded in the list kept for its group. Afterwards every annotation of
 * the requested type (restricted to the {@code begin}/{@code end} window unless both are
 * {@code -1}) is merged into the configuration set at its position and at each of its
 * sub-positions.
 *
 * @param aCasGroupId
 *            the ID of the CAS group to add.
 * @param aCasId
 *            the index of the CAS within its group; it must match the slot the CAS occupies in
 *            the group's list (checked by an assertion).
 * @param aCas
 *            the CAS itself; may be {@code null}, in which case only the slot is recorded.
 * @param aType
 *            the type on which to calculate the diff.
 */
private void addCas(String aCasGroupId, int aCasId, CAS aCas, String aType) {
    // Remember that we have already seen this CAS: look up (or start) the list for its group.
    List<CAS> groupCases = cases.get(aCasGroupId);
    if (groupCases == null) {
        groupCases = new ArrayList<>();
        cases.put(aCasGroupId, groupCases);
    }
    // Append only when this slot does not exist yet. The decision is made by index so that it
    // also works when a group holds several null CASes.
    if (aCasId >= groupCases.size()) {
        groupCases.add(aCas);
    }
    assert (groupCases.size() - 1) == aCasId : "Expected CAS ID [" + (groupCases.size() - 1) + "] but was [" + aCasId + "]";
    // A null CAS occupies its slot in the list above, but there is nothing to diff.
    if (aCas == null) {
        log.debug("CAS group [" + aCasGroupId + "] does not contain a CAS at index [" + aCasId + "].");
        return;
    }
    if (log.isDebugEnabled()) {
        log.debug("Processing CAS group [" + aCasGroupId + "] CAS [" + aCasId + "].");
        try {
            DocumentMetaData dmd = DocumentMetaData.get(aCas);
            log.debug("User [" + dmd.getCollectionId() + "] - Document [" + dmd.getDocumentId() + "]");
        } catch (IllegalArgumentException ignored) {
            // This information is used only for debugging, so it is fine if it is missing.
        }
    }
    // Either restrict to the configured window or take every annotation of the type.
    Collection<AnnotationFS> annotations;
    if (begin != -1 || end != -1) {
        annotations = selectCovered(aCas, getType(aCas, aType), begin, end);
    } else {
        annotations = select(aCas, getType(aCas, aType));
    }
    if (annotations.isEmpty()) {
        log.debug("CAS group [" + aCasGroupId + "] CAS [" + aCasId + "] contains no annotations of type [" + aType + "]");
        return;
    }
    log.debug("CAS group [" + aCasGroupId + "] CAS [" + aCasId + "] contains [" + annotations.size() + "] annotations of type [" + aType + "]");
    int posBefore = configSets.keySet().size();
    log.debug("Positions before: [" + posBefore + "]");
    for (AnnotationFS annotation : annotations) {
        List<Position> positions = new ArrayList<>();
        // Primary position of the annotation
        positions.add(getAdapter(aType).getPosition(aCasId, annotation));
        // Secondary positions for multi-link features
        positions.addAll(getAdapter(aType).generateSubPositions(aCasId, annotation, linkCompareBehavior));
        for (Position position : positions) {
            // Get/create the configuration set at this position
            ConfigurationSet configSet = configSets.get(position);
            if (configSet == null) {
                configSet = new ConfigurationSet(position);
                configSets.put(position, configSet);
            }
            assert position.getClass() == configSet.position.getClass() : "Position type mismatch [" + position.getClass() + "] vs [" + configSet.position.getClass() + "]";
            // Merge the annotation into the set
            configSet.addConfiguration(aCasGroupId, annotation);
        }
    }
    int posAfter = configSets.keySet().size();
    log.debug("Positions after: [" + posAfter + "] (delta: " + (posAfter - posBefore) + ")");
    // NOTE: remembering the processed type (entryTypes.add(aType)) is currently disabled.
}
Also used : ArrayList(java.util.ArrayList) AnnotationFS(org.apache.uima.cas.text.AnnotationFS) CAS(org.apache.uima.cas.CAS) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData)

Example 4 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project webanno by webanno.

the class AgreementPage method getJCases.

/**
 * Get the finished CASes used to compute agreement.
 *
 * <p>The result is cached in {@code cachedCASes}, so the CASes are not reloaded when switching
 * features. The cache is only set once loading has gone through all users and documents, so a
 * runtime failure part-way through never leaves a partially filled map behind.
 *
 * @return a map from user name to that user's CASes, one list entry per source document of the
 *         selected project in listing order. An entry is {@code null} when the user has no
 *         finished annotation document for that source document, or when reading it failed.
 */
private Map<String, List<JCas>> getJCases() {
    // Avoid reloading the CASes when switching features.
    if (cachedCASes != null) {
        return cachedCASes;
    }
    Project project = projectSelectionForm.getModelObject().project;
    List<User> users = projectService.listProjectUsersWithPermissions(project, PermissionLevel.USER);
    List<SourceDocument> sourceDocuments = documentService.listSourceDocuments(project);
    // Build into a local map first; the cache field is assigned only when loading is complete.
    LinkedHashMap<String, List<JCas>> casesByUser = new LinkedHashMap<>();
    for (User user : users) {
        List<JCas> cases = new ArrayList<>();
        for (SourceDocument document : sourceDocuments) {
            JCas jCas = null;
            // Load the CAS if there is a finished one.
            if (documentService.existsAnnotationDocument(document, user)) {
                AnnotationDocument annotationDocument = documentService.getAnnotationDocument(document, user);
                // Constant-first comparison so that a missing state counts as "not finished".
                if (AnnotationDocumentState.FINISHED.equals(annotationDocument.getState())) {
                    try {
                        jCas = documentService.readAnnotationCas(annotationDocument);
                        annotationService.upgradeCas(jCas.getCas(), annotationDocument);
                        // REC: I think there is no need to write the CASes here. We would not
                        // want to interfere with currently active annotator users
                        // Set the CAS name in the DocumentMetaData so that we can pick it
                        // up in the Diff position for the purpose of debugging / transparency.
                        DocumentMetaData documentMetadata = DocumentMetaData.get(jCas);
                        documentMetadata.setDocumentId(annotationDocument.getDocument().getName());
                        documentMetadata.setCollectionId(annotationDocument.getProject().getName());
                    } catch (Exception e) {
                        // Best effort: report the problem and carry on with the other documents.
                        // NOTE(review): if the failure happens after the CAS was read, the CAS
                        // is still added to the list below - confirm that this is intended.
                        LOG.error("Unable to load data", e);
                        error("Unable to load data: " + ExceptionUtils.getRootCauseMessage(e));
                    }
                }
            }
            // The next line can enter null values into the list if a user didn't work on this
            // source document yet.
            cases.add(jCas);
        }
        casesByUser.put(user.getUsername(), cases);
    }
    cachedCASes = casesByUser;
    return cachedCASes;
}
Also used : Project(de.tudarmstadt.ukp.clarin.webanno.model.Project) User(de.tudarmstadt.ukp.clarin.webanno.security.model.User) SourceDocument(de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument) ArrayList(java.util.ArrayList) JCas(org.apache.uima.jcas.JCas) AnnotationDocument(de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) ResourceStreamNotFoundException(org.apache.wicket.util.resource.ResourceStreamNotFoundException) IOException(java.io.IOException)

Example 5 with DocumentMetaData

use of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData in project dkpro-tc by dkpro.

the class LinewiseTextReader method getNext.

/**
 * Fills the given CAS from the current line ({@code nextLine}).
 *
 * <p>The line is split at tab characters: the first field is the outcome label, the second
 * field is the document text. The text is passed through {@code checkUnescapeHtml} and
 * {@code checkUnescapeJava} before it is set on the CAS. A {@link DocumentMetaData} (empty
 * title, running instance id, configured language) and a {@link TextClassificationOutcome}
 * carrying the label are added to the indexes, and finally {@code checkSetSentence} is called.
 *
 * @param aJCas
 *            the CAS to fill.
 * @throws IOException
 *             if the current line does not contain a label and a non-empty text separated by
 *             a tab.
 * @throws CollectionException
 *             declared by the signature; not thrown directly by the statements in this method.
 */
public void getNext(JCas aJCas) throws IOException, CollectionException {
    // Validate the line before touching the CAS so that a malformed line fails with a clear
    // message instead of an ArrayIndexOutOfBoundsException on a half-initialized CAS.
    String[] split = nextLine.split("\t");
    if (split.length < 2) {
        throw new IOException("Expected a line of the form [label<TAB>text] but found [" + nextLine + "]");
    }
    String label = split[0];
    String documentText = split[1];
    DocumentMetaData md = new DocumentMetaData(aJCas);
    md.setDocumentTitle("");
    md.setDocumentId(String.valueOf(instanceId++));
    md.setLanguage(language);
    md.addToIndexes();
    documentText = checkUnescapeHtml(documentText);
    documentText = checkUnescapeJava(documentText);
    aJCas.setDocumentText(documentText);
    TextClassificationOutcome outcome = new TextClassificationOutcome(aJCas);
    outcome.setOutcome(label);
    outcome.addToIndexes();
    checkSetSentence(aJCas);
}
Also used : TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData)

Aggregations

DocumentMetaData (de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData)27 JCas (org.apache.uima.jcas.JCas)7 ArrayList (java.util.ArrayList)6 IOException (java.io.IOException)5 AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine)4 TextClassificationOutcome (org.dkpro.tc.api.type.TextClassificationOutcome)4 Project (de.tudarmstadt.ukp.clarin.webanno.model.Project)3 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)3 HashMap (java.util.HashMap)3 HashSet (java.util.HashSet)3 AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException)3 CAS (org.apache.uima.cas.CAS)3 JCasId (org.dkpro.tc.api.type.JCasId)3 TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget)3 AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument)2 SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument)2 User (de.tudarmstadt.ukp.clarin.webanno.security.model.User)2 File (java.io.File)2 FileNotFoundException (java.io.FileNotFoundException)2 LinkedHashMap (java.util.LinkedHashMap)2