Search in sources:

Example 16 with Feature

use of org.apache.uima.cas.Feature in project webanno by webanno.

In class ConllUReader, method convert.

/**
 * Reads sentences via {@code readSentence} until it returns {@code null} and, for each
 * non-empty sentence, appends the token text to the document and creates the annotations
 * (tokens, lemmas, POS tags, morphological features, surface forms, dependencies and the
 * sentence itself) in the given JCas.
 *
 * @param aJCas
 *            the JCas receiving the document text and the annotations.
 * @param aReader
 *            the reader the sentences are read from.
 * @throws IOException
 *             if the POS mapping provider cannot be configured or (presumably) if reading
 *             from {@code aReader} fails.
 */
public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
        try {
            posMappingProvider.configure(aJCas.getCas());
        } catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }
    JCasBuilder doc = new JCasBuilder(aJCas);
    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
        if (words.isEmpty()) {
            // Ignore empty sentences, e.g. when several sentence markers are following
            // each other.
            continue;
        }
        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;
        // State of the multi-word (range ID such as "3-4") entry currently being read;
        // -1 / null while no such entry is open.
        int surfaceBegin = -1;
        int surfaceEnd = -1;
        String surfaceString = null;
        // Tokens, Lemma, POS
        Int2ObjectMap<Token> tokens = new Int2ObjectOpenHashMap<>();
        for (String[] word : words) {
            if (word[ID].contains("-")) {
                // Range ID: remember the covered token IDs and the surface string; the
                // SurfaceForm annotation is created once the last covered token was read.
                String[] fragments = word[ID].split("-");
                surfaceBegin = Integer.parseInt(fragments[0]);
                surfaceEnd = Integer.parseInt(fragments[1]);
                surfaceString = word[FORM];
                continue;
            }
            // Read token
            int tokenIdx = Integer.parseInt(word[ID]);
            Token token = doc.add(word[FORM], Token.class);
            tokens.put(tokenIdx, token);
            if (!StringUtils.contains(word[MISC], "SpaceAfter=No")) {
                doc.add(" ");
            }
            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }
            // Read part-of-speech tag
            if (!UNUSED.equals(word[POSTAG]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POSTAG]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
                pos.setPosValue(word[POSTAG]);
                pos.addToIndexes();
                token.setPos(pos);
            }
            // Read morphological features
            if (!UNUSED.equals(word[FEATS]) && readMorph) {
                MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd());
                morphtag.setValue(word[FEATS]);
                morphtag.addToIndexes();
                token.setMorph(morphtag);
                // Try parsing out individual feature values. Since the DKPro Core
                // MorphologicalFeatures type is based on the definition from the UD project,
                // we can do this rather straightforwardly.
                Type morphType = morphtag.getType();
                String[] items = word[FEATS].split("\\|");
                for (String item : items) {
                    String[] keyValue = item.split("=");
                    if (keyValue.length < 2 || keyValue[0].isEmpty()) {
                        // Not of the form key=value, so it cannot be mapped onto a
                        // feature. The raw FEATS string was already stored via setValue
                        // above, so nothing is lost by skipping the item here.
                        continue;
                    }
                    // Feature base names are looked up with a lower-cased first character.
                    StringBuilder key = new StringBuilder(keyValue[0]);
                    key.setCharAt(0, Character.toLowerCase(key.charAt(0)));
                    String value = keyValue[1];
                    Feature feat = morphType.getFeatureByBaseName(key.toString());
                    if (feat != null) {
                        morphtag.setStringValue(feat, value);
                    }
                }
            }
            // Read surface form
            if (tokenIdx == surfaceEnd) {
                int begin = tokens.get(surfaceBegin).getBegin();
                int end = tokens.get(surfaceEnd).getEnd();
                SurfaceForm surfaceForm = new SurfaceForm(aJCas, begin, end);
                surfaceForm.setValue(surfaceString);
                surfaceForm.addToIndexes();
                surfaceBegin = -1;
                surfaceEnd = -1;
                surfaceString = null;
            }
            sentenceEnd = token.getEnd();
        }
        // Dependencies
        if (readDependency) {
            for (String[] word : words) {
                if (!UNUSED.equals(word[DEPREL])) {
                    int depId = Integer.parseInt(word[ID]);
                    int govId = Integer.parseInt(word[HEAD]);
                    // Model the root as a loop onto itself
                    // NOTE(review): the root handling itself is presumably implemented in
                    // makeDependency - confirm.
                    makeDependency(aJCas, govId, depId, word[DEPREL], DependencyFlavor.BASIC, tokens, word);
                }
                if (!UNUSED.equals(word[DEPS])) {
                    // list items separated by vertical bar
                    String[] items = word[DEPS].split("\\|");
                    for (String item : items) {
                        // Split only at the first colon: the relation itself may contain
                        // colons (e.g. "4:nmod:poss") and must be kept in full.
                        String[] sItem = item.split(":", 2);
                        int depId = Integer.parseInt(word[ID]);
                        int govId = Integer.parseInt(sItem[0]);
                        makeDependency(aJCas, govId, depId, sItem[1], DependencyFlavor.ENHANCED, tokens, word);
                    }
                }
            }
        }
        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();
        // One sentence per line.
        doc.add("\n");
    }
    doc.close();
}
Also used : Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) MorphologicalFeatures(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) IOException(java.io.IOException) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) Feature(org.apache.uima.cas.Feature) Type(org.apache.uima.cas.Type) POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) SurfaceForm(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm) JCasBuilder(org.apache.uima.fit.factory.JCasBuilder) Lemma(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 17 with Feature

use of org.apache.uima.cas.Feature in project webanno by webanno.

In class WebannoTsv2Writer, method setRelationFeatureAnnos.

/**
 * Records the value of {@code aFeature} of every annotation of {@code aType} in
 * {@code aRelAnnoMap}. The entry is made on the dependent side: it is keyed by the
 * low-level reference of the first token returned for the span of the annotation's
 * dependent. Several values for the same token are joined with {@code "|"}.
 *
 * @param aCas
 *            the CAS containing the annotations.
 * @param aRelAnnoMap
 *            the map (token reference to value string) that is filled.
 * @param aType
 *            the relation type whose annotations are processed.
 * @param aFeature
 *            the feature whose value is recorded.
 */
private void setRelationFeatureAnnos(CAS aCas, Map<Integer, String> aRelAnnoMap, Type aType, Feature aFeature) throws CASRuntimeException, CASException {
    LowLevelCAS lowLevelCas = aCas.getLowLevelCAS();
    // Find the feature pointing to the dependent end of the relation.
    Feature dependentFeature = null;
    for (Feature candidate : aType.getFeatures()) {
        if (candidate.getShortName().equals(DEPENDENT)) {
            dependentFeature = candidate;
        }
    }
    for (AnnotationFS relation : CasUtil.select(aCas, aType)) {
        // relation annotation will be from Governor to Dependent
        // Entry done on Dependent side
        AnnotationFS target = (AnnotationFS) relation.getFeatureValue(dependentFeature);
        boolean first = true;
        for (Token token : selectCovered(aCas.getJCas(), Token.class, target.getBegin(), target.getEnd())) {
            boolean tokenWithinTarget = target.getBegin() <= token.getBegin() && target.getEnd() >= token.getEnd();
            if (tokenWithinTarget) {
                String value = relation.getFeatureValueAsString(aFeature);
                if (value == null) {
                    // No value set: fall back to the type name followed by "_".
                    value = aType.getName() + "_";
                }
                int tokenRef = lowLevelCas.ll_getFSRef(token);
                String existing = aRelAnnoMap.get(tokenRef);
                String entry;
                if (multipleSpans.contains(aType.getName())) {
                    entry = (first ? "B-" : "I-") + value;
                    first = false;
                } else {
                    entry = value;
                }
                aRelAnnoMap.put(tokenRef, existing == null ? entry : existing + "|" + entry);
            }
            // we just need an arc to the first token.
            break;
        }
    }
}
Also used : AnnotationFS(org.apache.uima.cas.text.AnnotationFS) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) LowLevelCAS(org.apache.uima.cas.impl.LowLevelCAS) Feature(org.apache.uima.cas.Feature)

Example 18 with Feature

use of org.apache.uima.cas.Feature in project webanno by webanno.

In class WebannoTsv2Writer, method convertToTsv.

/**
 * Writes the annotations of the given JCas to the output stream in a tab-separated format:
 * first a header naming the span types/features and the relation types/features, then for
 * every sentence an {@code #id=} line, a {@code #text=} line and one line per token holding
 * the token id, the token text, the span annotation values, the relation feature values and
 * the governor positions.
 *
 * @param aJCas
 *            the JCas to export.
 * @param aOs
 *            the stream the output is written to.
 * @param aEncoding
 *            the encoding name passed to every write.
 * @throws IOException
 *             if writing to the stream fails.
 */
private void convertToTsv(JCas aJCas, OutputStream aOs, String aEncoding) throws IOException, ResourceInitializationException, CASRuntimeException, CASException {
    // Fully qualified names of the built-in features that are never exported.
    final String sofaFeature = "uima.cas.AnnotationBase:sofa";
    final String beginFeature = "uima.tcas.Annotation:begin";
    final String endFeature = "uima.tcas.Annotation:end";

    LowLevelCAS lowLevelCas = aJCas.getLowLevelCas();
    tokenIds = new HashMap<>();
    setTokenId(aJCas, tokenIds);
    tokenPositions = new TreeMap<>();
    setTokenPosition(aJCas, tokenPositions);
    Map<Integer, Integer> tokensPerSentence = new TreeMap<>();
    setTokenSentenceAddress(aJCas, tokensPerSentence);

    // Types of all annotations except the structural/metadata ones. Relation types are
    // removed from this set further below, leaving the span types.
    Set<Type> annotationTypes = new LinkedHashSet<>();
    for (Annotation annotation : select(aJCas, Annotation.class)) {
        boolean excluded = annotation instanceof Token || annotation instanceof Sentence || annotation instanceof DocumentMetaData || annotation instanceof TagsetDescription || annotation instanceof CoreferenceLink;
        if (!excluded) {
            annotationTypes.add(annotation.getType());
        }
    }

    // A type is an arc (relation) type if it has a GOVERNOR feature.
    Set<Type> relationTypes = new LinkedHashSet<>();
    for (Type type : annotationTypes) {
        for (Feature feature : type.getFeatures()) {
            if (feature.getShortName().equals(GOVERNOR)) {
                relationTypes.add(type);
                break;
            }
        }
    }
    annotationTypes.removeAll(relationTypes);

    // For every relation type, the name of the type its arcs attach to.
    Map<Type, String> attachTypeNames = new HashMap<>();
    for (Type type : relationTypes) {
        if (type.getName().equals(Dependency.class.getName())) {
            attachTypeNames.put(type, POS.class.getName());
        } else {
            for (AnnotationFS relation : CasUtil.select(aJCas.getCas(), type)) {
                for (Feature feature : type.getFeatures()) {
                    if (feature.getShortName().equals(GOVERNOR)) {
                        attachTypeNames.put(type, relation.getFeatureValue(feature).getType().getName());
                    }
                }
            }
        }
    }

    // Header part 1: span types and their features.
    Map<Feature, Type> spanFeatures = new LinkedHashMap<>();
    for (Type type : annotationTypes) {
        if (type.getFeatures().isEmpty()) {
            continue;
        }
        // coreference annotation not supported: skip types having a FIRST or NEXT feature
        boolean chainType = false;
        for (Feature feature : type.getFeatures()) {
            if (feature.getShortName().equals(FIRST) || feature.getShortName().equals(NEXT)) {
                chainType = true;
                break;
            }
        }
        if (chainType) {
            continue;
        }
        IOUtils.write(" # " + type.getName(), aOs, aEncoding);
        for (Feature feature : type.getFeatures()) {
            String qualifiedName = feature.toString();
            boolean builtIn = qualifiedName.equals(sofaFeature) || qualifiedName.equals(beginFeature) || qualifiedName.equals(endFeature);
            if (!builtIn) {
                spanFeatures.put(feature, type);
                IOUtils.write(" | " + feature.getShortName(), aOs, aEncoding);
            }
        }
    }

    // Header part 2: relation types, their features and the type they attach to.
    Set<Feature> relationFeatures = new LinkedHashSet<>();
    for (Type type : relationTypes) {
        IOUtils.write(" # " + type.getName(), aOs, aEncoding);
        for (Feature feature : type.getFeatures()) {
            String qualifiedName = feature.toString();
            boolean builtIn = qualifiedName.equals(sofaFeature) || qualifiedName.equals(beginFeature) || qualifiedName.equals(endFeature);
            boolean endpoint = feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT);
            if (!builtIn && !endpoint) {
                relationFeatures.add(feature);
                IOUtils.write(" | " + feature.getShortName(), aOs, aEncoding);
            }
        }
        // Add the attach type for the relation annotation
        IOUtils.write(" | AttachTo=" + attachTypeNames.get(type), aOs, aEncoding);
    }
    IOUtils.write("\n", aOs, aEncoding);

    // Span annotation values per feature, keyed by token reference.
    Map<Feature, Map<Integer, String>> spanAnnos = new HashMap<>();
    for (Type type : annotationTypes) {
        // coreference annotation not supported
        boolean chainType = false;
        for (Feature feature : type.getFeatures()) {
            if (feature.getShortName().equals(FIRST) || feature.getShortName().equals(NEXT)) {
                chainType = true;
                break;
            }
        }
        if (chainType) {
            continue;
        }
        for (Feature feature : type.getFeatures()) {
            String qualifiedName = feature.toString();
            boolean builtIn = qualifiedName.equals(sofaFeature) || qualifiedName.equals(beginFeature) || qualifiedName.equals(endFeature);
            if (!builtIn) {
                Map<Integer, String> valuesByToken = new TreeMap<>();
                setTokenAnnos(aJCas.getCas(), valuesByToken, type, feature);
                spanAnnos.put(feature, valuesByToken);
            }
        }
    }

    // Relation feature values, recorded on the tokens the arcs are drawn to.
    Map<Feature, Map<Integer, String>> relationAnnos = new HashMap<>();
    for (Type type : relationTypes) {
        for (Feature feature : type.getFeatures()) {
            String qualifiedName = feature.toString();
            boolean builtIn = qualifiedName.equals(sofaFeature) || qualifiedName.equals(beginFeature) || qualifiedName.equals(endFeature);
            boolean endpoint = feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT);
            if (!builtIn && !endpoint) {
                Map<Integer, String> valuesByToken = new HashMap<>();
                setRelationFeatureAnnos(aJCas.getCas(), valuesByToken, type, feature);
                relationAnnos.put(feature, valuesByToken);
            }
        }
    }

    // Positions of the tokens the arcs are drawn from - the governors.
    Map<Type, Map<Integer, String>> governorAnnos = new HashMap<>();
    for (Type type : relationTypes) {
        Map<Integer, String> governorsByToken = new HashMap<>();
        setRelationGovernorPos(aJCas.getCas(), governorsByToken, type);
        governorAnnos.put(type, governorsByToken);
    }

    // Body: one section per sentence, one line per token.
    int sentenceNumber = 1;
    for (Sentence sentence : select(aJCas, Sentence.class)) {
        IOUtils.write("#id=" + sentenceNumber + "\n", aOs, aEncoding);
        sentenceNumber++;
        IOUtils.write("#text=" + sentence.getCoveredText().replace("\n", "") + "\n", aOs, aEncoding);
        for (Token token : selectCovered(Token.class, sentence)) {
            int tokenRef = lowLevelCas.ll_getFSRef(token);
            IOUtils.write(tokenIds.get(tokenRef) + "\t" + token.getCoveredText() + "\t", aOs, aEncoding);
            // all span annotations on this token
            for (Feature feature : spanFeatures.keySet()) {
                String value = spanAnnos.get(feature).get(tokenRef);
                if (value != null) {
                    IOUtils.write(value + "\t", aOs, aEncoding);
                } else if (multipleSpans.contains(spanFeatures.get(feature).getName())) {
                    IOUtils.write("O\t", aOs, aEncoding);
                } else {
                    IOUtils.write("_\t", aOs, aEncoding);
                }
            }
            for (Type type : relationTypes) {
                for (Feature feature : type.getFeatures()) {
                    String qualifiedName = feature.toString();
                    boolean builtIn = qualifiedName.equals(sofaFeature) || qualifiedName.equals(beginFeature) || qualifiedName.equals(endFeature);
                    boolean endpoint = feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT);
                    if (builtIn || endpoint) {
                        continue;
                    }
                    String value = relationAnnos.get(feature).get(tokenRef);
                    if (value != null) {
                        IOUtils.write(value + "\t", aOs, aEncoding);
                    } else {
                        IOUtils.write("_\t", aOs, aEncoding);
                    }
                }
                // the governor positions
                String governorPosition = governorAnnos.get(type).get(tokenRef);
                if (governorPosition != null) {
                    IOUtils.write(governorPosition + "\t", aOs, aEncoding);
                } else {
                    IOUtils.write("_\t", aOs, aEncoding);
                }
            }
            IOUtils.write("\n", aOs, aEncoding);
        }
        IOUtils.write("\n", aOs, aEncoding);
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Feature(org.apache.uima.cas.Feature) TagsetDescription(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription) LinkedHashMap(java.util.LinkedHashMap) AnnotationFS(org.apache.uima.cas.text.AnnotationFS) CoreferenceLink(de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) LowLevelCAS(org.apache.uima.cas.impl.LowLevelCAS) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) Dependency(de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency) TreeMap(java.util.TreeMap) Annotation(org.apache.uima.jcas.tcas.Annotation) Type(org.apache.uima.cas.Type) POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) NavigableMap(java.util.NavigableMap) TreeMap(java.util.TreeMap)

Example 19 with Feature

use of org.apache.uima.cas.Feature in project webanno by webanno.

In class WebannoTsv2Writer, method setRelationGovernorPos.

/**
 * Records, for every annotation of {@code aType}, the id of the governor token in
 * {@code aRelationGovernorMap}. The entry is made on the dependent side: it is keyed by the
 * low-level reference of the first token returned for the span of the annotation's
 * dependent. Several governor ids for the same token are joined with {@code "|"}.
 *
 * @param aCas
 *            the CAS containing the annotations.
 * @param aRelationGovernorMap
 *            the map (token reference to governor id string) that is filled.
 * @param aType
 *            the relation type whose annotations are processed.
 */
private void setRelationGovernorPos(CAS aCas, Map<Integer, String> aRelationGovernorMap, Type aType) throws CASRuntimeException, CASException {
    // Find the features pointing to the two ends of the relation.
    Feature governorFeature = null;
    Feature dependentFeature = null;
    for (Feature candidate : aType.getFeatures()) {
        String shortName = candidate.getShortName();
        if (shortName.equals(GOVERNOR)) {
            governorFeature = candidate;
        }
        if (shortName.equals(DEPENDENT)) {
            dependentFeature = candidate;
        }
    }
    LowLevelCAS lowLevelCas = aCas.getLowLevelCAS();
    for (AnnotationFS relation : CasUtil.select(aCas, aType)) {
        // relation annotation will be from Governor to Dependent
        // Entry done on Dependent side
        AnnotationFS target = (AnnotationFS) relation.getFeatureValue(dependentFeature);
        for (Token token : selectCovered(aCas.getJCas(), Token.class, target.getBegin(), target.getEnd())) {
            boolean tokenWithinTarget = target.getBegin() <= token.getBegin() && target.getEnd() >= token.getEnd();
            if (tokenWithinTarget) {
                int tokenRef = lowLevelCas.ll_getFSRef(token);
                String existing = aRelationGovernorMap.get(tokenRef);
                AnnotationFS source = (AnnotationFS) relation.getFeatureValue(governorFeature);
                // Resolve the governor via the greatest recorded token position that is
                // not after the governor's begin offset.
                String governorId = tokenIds.get(tokenPositions.floorEntry(source.getBegin()).getValue());
                aRelationGovernorMap.put(tokenRef, existing == null ? governorId : existing + "|" + governorId);
            }
            // we just need an arc to the first token.
            break;
        }
    }
}
Also used : AnnotationFS(org.apache.uima.cas.text.AnnotationFS) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Feature(org.apache.uima.cas.Feature) LowLevelCAS(org.apache.uima.cas.impl.LowLevelCAS)

Example 20 with Feature

use of org.apache.uima.cas.Feature in project webanno by webanno.

In class ComplexTypeTest, method testCountryType.

/**
 * Builds a CAS with a continent annotation ("Asia") and a country annotation ("Russian
 * Federation") linked to that continent, parses the constraint rules from
 * {@code src/test/resources/rules/region.rules} and checks that the values generated for
 * the {@code regionType} feature of the country are exactly the single value "cold".
 */
@Test
public void testCountryType() throws Exception {
    TypeSystemDescription tsd = TypeSystemDescriptionFactory.createTypeSystemDescription("desc.types.TestTypeSystemDescriptor");
    CAS cas = CasCreationUtils.createCas(tsd, null, null);
    cas.setDocumentText("Asia is the largest continent on Earth. Asia is subdivided into 48 countries, two of them (Russia and Turkey) having part of their land in Europe. The most active place on Earth for tropical cyclone activity lies northeast of the Philippines and south of Japan. The Gobi Desert is in Mongolia and the Arabian Desert stretches across much of the Middle East. The Yangtze River in China is the longest river in the continent. The Himalayas between Nepal and China is the tallest mountain range in the world. Tropical rainforests stretch across much of southern Asia and coniferous and deciduous forests lie farther north.");
    TypeSystem ts = cas.getTypeSystem();
    // Continent annotation
    Type continentType = ts.getType("de.Continent");
    Feature continentName = continentType.getFeatureByBaseName("name");
    AnnotationFS asiaContinent = cas.createAnnotation(continentType, 0, 4);
    asiaContinent.setStringValue(continentName, "Asia");
    cas.addFsToIndexes(asiaContinent);
    // Country annotation referring to the continent
    Type countryType = ts.getType("de.Country");
    Feature countryName = countryType.getFeatureByBaseName("name");
    AnnotationFS russia = cas.createAnnotation(countryType, 56, 62);
    russia.setStringValue(countryName, "Russian Federation");
    Feature continentFeature = countryType.getFeatureByBaseName("continent");
    russia.setFeatureValue(continentFeature, asiaContinent);
    cas.addFsToIndexes(russia);
    // Parse the rules inside try-with-resources so that the rules file is closed again,
    // also when parsing fails.
    ParsedConstraints constraints;
    try (FileInputStream rules = new FileInputStream("src/test/resources/rules/region.rules")) {
        ConstraintsGrammar parser = new ConstraintsGrammar(rules);
        Parse p = parser.Parse();
        constraints = p.accept(new ParserVisitor());
    }
    Evaluator constraintsEvaluator = new ValuesGenerator();
    List<PossibleValue> possibleValues = constraintsEvaluator.generatePossibleValues(russia, "regionType", constraints);
    List<PossibleValue> expectedValues = new LinkedList<>();
    expectedValues.add(new PossibleValue("cold", true));
    // Expected value first, actual value second, so that failure messages read correctly.
    assertEquals(expectedValues, possibleValues);
}
Also used : TypeSystem(org.apache.uima.cas.TypeSystem) TypeSystemDescription(org.apache.uima.resource.metadata.TypeSystemDescription) Parse(de.tudarmstadt.ukp.clarin.webanno.constraints.grammar.syntaxtree.Parse) ParserVisitor(de.tudarmstadt.ukp.clarin.webanno.constraints.visitor.ParserVisitor) ParsedConstraints(de.tudarmstadt.ukp.clarin.webanno.constraints.model.ParsedConstraints) ValuesGenerator(de.tudarmstadt.ukp.clarin.webanno.constraints.evaluator.ValuesGenerator) Evaluator(de.tudarmstadt.ukp.clarin.webanno.constraints.evaluator.Evaluator) Feature(org.apache.uima.cas.Feature) FileInputStream(java.io.FileInputStream) LinkedList(java.util.LinkedList) AnnotationFS(org.apache.uima.cas.text.AnnotationFS) Type(org.apache.uima.cas.Type) CAS(org.apache.uima.cas.CAS) PossibleValue(de.tudarmstadt.ukp.clarin.webanno.constraints.evaluator.PossibleValue) ConstraintsGrammar(de.tudarmstadt.ukp.clarin.webanno.constraints.grammar.ConstraintsGrammar) Test(org.junit.Test)

Aggregations

Feature (org.apache.uima.cas.Feature)84 Type (org.apache.uima.cas.Type)62 AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)50 AnnotationFS (org.apache.uima.cas.text.AnnotationFS)48 ArrayList (java.util.ArrayList)23 FeatureStructure (org.apache.uima.cas.FeatureStructure)18 CasUtil.getType (org.apache.uima.fit.util.CasUtil.getType)18 JCas (org.apache.uima.jcas.JCas)18 List (java.util.List)15 Test (org.junit.Test)14 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)13 WebAnnoCasUtil.setFeature (de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.setFeature)12 POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)12 CAS (org.apache.uima.cas.CAS)10 HashSet (java.util.HashSet)8 LinkedHashMap (java.util.LinkedHashMap)8 Map (java.util.Map)8 HashMap (java.util.HashMap)7 TypeSystem (org.apache.uima.cas.TypeSystem)7 AnnotationException (de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException)6