Search in sources :

Example 1 with SurfaceForm

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm in project webanno by webanno.

The convert method of the class ConllUReader.

/**
 * Reads CoNLL-U sentences from the given reader and adds their text and annotations to the CAS.
 * <p>
 * Sentences are fetched via {@code readSentence} until it returns {@code null}. For each
 * sentence, the token forms are appended to the document text (followed by a space unless the
 * MISC column contains {@code SpaceAfter=No}), and a newline is appended after the sentence.
 * Depending on the {@code readLemma}, {@code readPos}, {@code readMorph} and
 * {@code readDependency} flags, lemma, part-of-speech, morphological feature and dependency
 * annotations are created. Range lines (ID of the form {@code begin-end}) become
 * {@link SurfaceForm} annotations spanning the tokens of that ID range.
 *
 * @param aJCas
 *            the CAS to which text and annotations are added.
 * @param aReader
 *            the source of the CoNLL-U data.
 * @throws IOException
 *             if reading fails or the part-of-speech mapping provider cannot be configured.
 */
public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
        try {
            posMappingProvider.configure(aJCas.getCas());
        } catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }
    JCasBuilder doc = new JCasBuilder(aJCas);
    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
        if (words.isEmpty()) {
            // Skip empty sentences, e.g. when several end-of-sentence
            // markers follow each other.
            continue;
        }
        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;
        // State of the surface form (range line) currently being collected; -1/null = none.
        int surfaceBegin = -1;
        int surfaceEnd = -1;
        String surfaceString = null;
        // Tokens, Lemma, POS
        Int2ObjectMap<Token> tokens = new Int2ObjectOpenHashMap<>();
        for (String[] word : words) {
            if (word[ID].contains("-")) {
                // Range line "begin-end": remember it and emit the SurfaceForm once the
                // token with the ID "end" has been created.
                String[] fragments = word[ID].split("-");
                surfaceBegin = Integer.parseInt(fragments[0]);
                surfaceEnd = Integer.parseInt(fragments[1]);
                surfaceString = word[FORM];
                continue;
            }
            // Read token
            int tokenIdx = Integer.parseInt(word[ID]);
            Token token = doc.add(word[FORM], Token.class);
            tokens.put(tokenIdx, token);
            if (!StringUtils.contains(word[MISC], "SpaceAfter=No")) {
                doc.add(" ");
            }
            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }
            // Read part-of-speech tag
            if (!UNUSED.equals(word[POSTAG]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POSTAG]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
                pos.setPosValue(word[POSTAG]);
                pos.addToIndexes();
                token.setPos(pos);
            }
            // Read morphological features
            if (!UNUSED.equals(word[FEATS]) && readMorph) {
                MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd());
                morphtag.setValue(word[FEATS]);
                morphtag.addToIndexes();
                token.setMorph(morphtag);
                // Try parsing out individual feature values. Since the DKPro Core
                // MorphologicalFeatures type is based on the definition from the UD project,
                // we can do this rather straightforwardly.
                Type morphType = morphtag.getType();
                String[] items = word[FEATS].split("\\|");
                for (String item : items) {
                    String[] keyValue = item.split("=");
                    if (keyValue.length < 2 || keyValue[0].isEmpty()) {
                        // Not a well-formed Key=Value entry. The raw FEATS string has
                        // already been stored above, so just skip the fine-grained parsing.
                        continue;
                    }
                    // Feature base names are looked up with a lower-case first letter.
                    StringBuilder key = new StringBuilder(keyValue[0]);
                    key.setCharAt(0, Character.toLowerCase(key.charAt(0)));
                    String value = keyValue[1];
                    Feature feat = morphType.getFeatureByBaseName(key.toString());
                    if (feat != null) {
                        morphtag.setStringValue(feat, value);
                    }
                }
            }
            // Read surface form
            if (tokenIdx == surfaceEnd) {
                int begin = tokens.get(surfaceBegin).getBegin();
                int end = tokens.get(surfaceEnd).getEnd();
                SurfaceForm surfaceForm = new SurfaceForm(aJCas, begin, end);
                surfaceForm.setValue(surfaceString);
                surfaceForm.addToIndexes();
                surfaceBegin = -1;
                surfaceEnd = -1;
                surfaceString = null;
            }
            sentenceEnd = token.getEnd();
        }
        // Dependencies
        if (readDependency) {
            for (String[] word : words) {
                if (!UNUSED.equals(word[DEPREL])) {
                    int depId = Integer.parseInt(word[ID]);
                    int govId = Integer.parseInt(word[HEAD]);
                    // Model the root as a loop onto itself
                    // NOTE(review): presumably handled inside makeDependency - confirm.
                    makeDependency(aJCas, govId, depId, word[DEPREL], DependencyFlavor.BASIC, tokens, word);
                }
                if (!UNUSED.equals(word[DEPS])) {
                    // list items separated by vertical bar
                    String[] items = word[DEPS].split("\\|");
                    int depId = Integer.parseInt(word[ID]);
                    for (String item : items) {
                        // Split only at the first colon: the relation itself may contain
                        // colons (subtyped relations such as "nmod:poss"), and everything
                        // after the head ID belongs to the relation label.
                        String[] sItem = item.split(":", 2);
                        int govId = Integer.parseInt(sItem[0]);
                        makeDependency(aJCas, govId, depId, sItem[1], DependencyFlavor.ENHANCED, tokens, word);
                    }
                }
            }
        }
        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();
        // One sentence per line.
        doc.add("\n");
    }
    doc.close();
}
Also used : Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) MorphologicalFeatures(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) IOException(java.io.IOException) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) Feature(org.apache.uima.cas.Feature) Type(org.apache.uima.cas.Type) POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) SurfaceForm(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm) JCasBuilder(org.apache.uima.fit.factory.JCasBuilder) Lemma(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 2 with SurfaceForm

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm in project webanno by webanno.

The convert method of the class ConllUWriter.

/**
 * Writes all sentences of the given CAS to {@code aOut} as tab-separated CoNLL-U rows.
 * <p>
 * Each token of a sentence yields one ten-column row; columns for which no value is available
 * (or whose {@code write*} flag is off) contain {@code UNUSED}. When a {@link SurfaceForm}
 * begins at the same offset as a token, an additional range row ({@code first-last}) carrying
 * the surface form value is written before that token's row. Every sentence is followed by an
 * empty line.
 *
 * @param aJCas
 *            the CAS providing sentences, tokens, dependencies and surface forms.
 * @param aOut
 *            the writer receiving the CoNLL-U output.
 */
private void convert(JCas aJCas, PrintWriter aOut) {
    // Lookups for surface forms: the tokens each one covers, and the form starting at an offset.
    Map<SurfaceForm, Collection<Token>> tokensBySurfaceForm = indexCovered(aJCas, SurfaceForm.class, Token.class);
    Int2ObjectMap<SurfaceForm> surfaceFormByBegin = new Int2ObjectOpenHashMap<>();
    for (SurfaceForm surfaceForm : select(aJCas, SurfaceForm.class)) {
        surfaceFormByBegin.put(surfaceForm.getBegin(), surfaceForm);
    }

    for (Sentence sentence : select(aJCas, Sentence.class)) {
        // Insertion-ordered so that rows are written in token order.
        Map<Token, Row> rows = new LinkedHashMap<>();

        // One row per token, numbered from 1 within the sentence.
        List<Token> sentenceTokens = selectCovered(Token.class, sentence);
        int tokenCount = sentenceTokens.size();
        for (int idx = 0; idx < tokenCount; idx++) {
            Token current = sentenceTokens.get(idx);
            boolean hasNext = idx + 1 < tokenCount;
            Row newRow = new Row();
            newRow.id = idx + 1;
            newRow.token = current;
            // No space after a token if the next token starts right where this one ends.
            newRow.noSpaceAfter = hasNext && current.getEnd() == sentenceTokens.get(idx + 1).getBegin();
            rows.put(current, newRow);
        }

        // Attach each dependency to the row of its dependent: basic (or unflavored)
        // relations fill the DEPREL slot, all others are collected for the DEPS column.
        for (Dependency dependency : selectCovered(Dependency.class, sentence)) {
            String flavor = FSUtil.getFeature(dependency, "flavor", String.class);
            boolean isBasic = StringUtils.isBlank(flavor) || DependencyFlavor.BASIC.equals(flavor);
            Row dependentRow = rows.get(dependency.getDependent());
            if (isBasic) {
                dependentRow.deprel = dependency;
            } else {
                dependentRow.deps.add(dependency);
            }
        }

        // Emit the rows of this sentence.
        for (Row row : rows.values()) {
            Token token = row.token;

            String lemma = (writeLemma && token.getLemma() != null) ? token.getLemma().getValue() : UNUSED;

            String pos = UNUSED;
            String cpos = UNUSED;
            if (writePos && token.getPos() != null) {
                POS posAnnotation = token.getPos();
                pos = posAnnotation.getPosValue();
                String mapped = dkpro2ud.get(posAnnotation.getClass());
                // Fall back to the fine-grained tag if there is no mapping for the POS class.
                cpos = StringUtils.isBlank(mapped) ? pos : mapped;
            }

            int headId = UNUSED_INT;
            String deprel = UNUSED;
            String deps = UNUSED;
            if (writeDependency) {
                Dependency basic = row.deprel;
                if (basic != null) {
                    deprel = basic.getDependencyType();
                    int governorId = rows.get(basic.getGovernor()).id;
                    // ROOT dependencies may be modeled as a loop; those get head 0.
                    headId = (governorId == row.id) ? 0 : governorId;
                }

                StringBuilder enhanced = new StringBuilder();
                for (Dependency extra : row.deps) {
                    if (enhanced.length() > 0) {
                        enhanced.append('|');
                    }
                    // A self-looping root is written with governor 0 here as well.
                    int governorId = rows.get(extra.getGovernor()).id;
                    enhanced.append(governorId == row.id ? 0 : governorId);
                    enhanced.append(':');
                    enhanced.append(extra.getDependencyType());
                }
                if (enhanced.length() > 0) {
                    deps = enhanced.toString();
                }
            }
            String head = (headId != UNUSED_INT) ? Integer.toString(headId) : UNUSED;

            String feats = (writeMorph && token.getMorph() != null) ? token.getMorph().getValue() : UNUSED;

            String misc = row.noSpaceAfter ? "SpaceAfter=No" : UNUSED;

            // A surface form starting at this token yields a range row before the token row.
            SurfaceForm surfaceForm = surfaceFormByBegin.get(token.getBegin());
            if (surfaceForm != null) {
                @SuppressWarnings({ "unchecked", "rawtypes" }) List<Token> covered = (List) tokensBySurfaceForm.get(surfaceForm);
                int firstId = rows.get(covered.get(0)).id;
                int lastId = rows.get(covered.get(covered.size() - 1)).id;
                aOut.printf("%d-%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", firstId, lastId, surfaceForm.getValue(), UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED);
            }

            aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", row.id, token.getCoveredText(), lemma, cpos, pos, feats, head, deprel, deps, misc);
        }

        // Blank line terminates the sentence.
        aOut.println();
    }
}
Also used : Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Dependency(de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency) LinkedHashMap(java.util.LinkedHashMap) SurfaceForm(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm) POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) Collection(java.util.Collection) ArrayList(java.util.ArrayList) List(java.util.List) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Aggregations

POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)2 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)2 SurfaceForm (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm)2 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)2 Int2ObjectOpenHashMap (it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap)2 MorphologicalFeatures (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures)1 Lemma (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma)1 Dependency (de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 LinkedHashMap (java.util.LinkedHashMap)1 List (java.util.List)1 AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException)1 Feature (org.apache.uima.cas.Feature)1 Type (org.apache.uima.cas.Type)1 JCasBuilder (org.apache.uima.fit.factory.JCasBuilder)1