Search in sources :

Example 76 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebannoTsv3Writer method setTokenSentenceAddress.

/**
 * Registers one {@link AnnotationUnit} per token and records its position label.
 * <p>
 * Sentences and the tokens inside each sentence are both numbered starting at 1; every unit is
 * stored in {@code unitsLineNumber} under the label {@code "<sentence>-<token>"}. The unit of the
 * first token of a sentence is additionally mapped to the sentence's covered text in
 * {@code sentenceUnits}.
 *
 * @param aJCas the CAS whose sentences and tokens are visited
 */
private void setTokenSentenceAddress(JCas aJCas) {
    int sentenceIndex = 0;
    for (Sentence currentSentence : select(aJCas, Sentence.class)) {
        sentenceIndex++;
        int tokenIndex = 0;
        for (Token currentToken : selectCovered(Token.class, currentSentence)) {
            tokenIndex++;
            int begin = currentToken.getBegin();
            int end = currentToken.getEnd();
            String coveredText = currentToken.getCoveredText();
            AnnotationUnit tokenUnit = new AnnotationUnit(begin, end, false, coveredText);
            units.add(tokenUnit);
            // Only the unit of the first token in a sentence carries the sentence text.
            if (tokenIndex == 1) {
                sentenceUnits.put(tokenUnit, currentSentence.getCoveredText());
            }
            unitsLineNumber.put(tokenUnit, sentenceIndex + "-" + tokenIndex);
        }
    }
}
Also used : AnnotationUnit(de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 77 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class ConllUWriter method convert.

/**
 * Writes every sentence of the CAS to {@code aOut} as tab-separated rows, one row per token,
 * followed by an empty line after each sentence.
 * <p>
 * Each token row has ten columns, in this order: id, covered text, lemma, cpos, pos, feats, head,
 * deprel, deps, misc. Columns for which no value is available (or whose {@code write*} flag is
 * off) contain {@code UNUSED}. If a {@link SurfaceForm} begins at the same offset as a token, an
 * additional {@code id1-id2} range row carrying the surface form value is printed immediately
 * before that token's row.
 *
 * @param aJCas the CAS to read sentences, tokens, dependencies and surface forms from
 * @param aOut the writer receiving the formatted rows
 */
private void convert(JCas aJCas, PrintWriter aOut) {
    // Tokens grouped by the surface form they belong to (as computed by indexCovered).
    Map<SurfaceForm, Collection<Token>> surfaceIdx = indexCovered(aJCas, SurfaceForm.class, Token.class);
    // Surface forms keyed by their begin offset, so a token can find the form that starts with it.
    Int2ObjectMap<SurfaceForm> surfaceBeginIdx = new Int2ObjectOpenHashMap<>();
    for (SurfaceForm sf : select(aJCas, SurfaceForm.class)) {
        surfaceBeginIdx.put(sf.getBegin(), sf);
    }
    for (Sentence sentence : select(aJCas, Sentence.class)) {
        // Insertion-ordered so rows are later written in token order.
        HashMap<Token, Row> ctokens = new LinkedHashMap<>();
        // Tokens
        List<Token> tokens = selectCovered(Token.class, sentence);
        for (int i = 0; i < tokens.size(); i++) {
            Row row = new Row();
            // Row ids are 1-based within the sentence.
            row.id = i + 1;
            row.token = tokens.get(i);
            // True when the next token starts exactly where this one ends; never true for the last token.
            row.noSpaceAfter = (i + 1 < tokens.size()) && row.token.getEnd() == tokens.get(i + 1).getBegin();
            ctokens.put(row.token, row);
        }
        // Dependencies
        // NOTE(review): ctokens.get(...) returns null for a dependent that is not one of this
        // sentence's tokens, which would raise a NullPointerException here - confirm this cannot occur.
        for (Dependency rel : selectCovered(Dependency.class, sentence)) {
            String flavor = FSUtil.getFeature(rel, "flavor", String.class);
            // Blank or BASIC flavor fills the single deprel slot; any other flavor is collected in deps.
            if (StringUtils.isBlank(flavor) || DependencyFlavor.BASIC.equals(flavor)) {
                ctokens.get(rel.getDependent()).deprel = rel;
            } else {
                ctokens.get(rel.getDependent()).deps.add(rel);
            }
        }
        // Write sentence in CONLL-U format
        for (Row row : ctokens.values()) {
            String lemma = UNUSED;
            if (writeLemma && (row.token.getLemma() != null)) {
                lemma = row.token.getLemma().getValue();
            }
            String pos = UNUSED;
            String cpos = UNUSED;
            if (writePos && (row.token.getPos() != null)) {
                POS posAnno = row.token.getPos();
                pos = posAnno.getPosValue();
                // Look up a mapped tag by the POS annotation class; fall back to the raw POS value.
                cpos = dkpro2ud.get(posAnno.getClass());
                if (StringUtils.isBlank(cpos)) {
                    cpos = pos;
                }
            }
            int headId = UNUSED_INT;
            String deprel = UNUSED;
            String deps = UNUSED;
            if (writeDependency) {
                if ((row.deprel != null)) {
                    deprel = row.deprel.getDependencyType();
                    headId = ctokens.get(row.deprel.getGovernor()).id;
                    if (headId == row.id) {
                        // ROOT dependencies may be modeled as a loop, ignore these.
                        headId = 0;
                    }
                }
                // Remaining dependencies are rendered as "governorId:type" entries joined by '|'.
                StringBuilder depsBuf = new StringBuilder();
                for (Dependency d : row.deps) {
                    if (depsBuf.length() > 0) {
                        depsBuf.append('|');
                    }
                    // Resolve self-looping root to 0-indexed root
                    int govId = ctokens.get(d.getGovernor()).id;
                    if (govId == row.id) {
                        govId = 0;
                    }
                    depsBuf.append(govId);
                    depsBuf.append(':');
                    depsBuf.append(d.getDependencyType());
                }
                if (depsBuf.length() > 0) {
                    deps = depsBuf.toString();
                }
            }
            // The head column stays UNUSED unless a head id was actually resolved above.
            String head = UNUSED;
            if (headId != UNUSED_INT) {
                head = Integer.toString(headId);
            }
            String feats = UNUSED;
            if (writeMorph && (row.token.getMorph() != null)) {
                feats = row.token.getMorph().getValue();
            }
            String misc = UNUSED;
            if (row.noSpaceAfter) {
                misc = "SpaceAfter=No";
            }
            // A surface form starting at this token gets its own range row before the token row.
            SurfaceForm sf = surfaceBeginIdx.get(row.token.getBegin());
            if (sf != null) {
                @SuppressWarnings({ "unchecked", "rawtypes" }) List<Token> covered = (List) surfaceIdx.get(sf);
                // Range runs from the id of the first covered token to the id of the last one.
                int id1 = ctokens.get(covered.get(0)).id;
                int id2 = ctokens.get(covered.get(covered.size() - 1)).id;
                aOut.printf("%d-%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", id1, id2, sf.getValue(), UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED);
            }
            aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", row.id, row.token.getCoveredText(), lemma, cpos, pos, feats, head, deprel, deps, misc);
        }
        // Blank line terminates the sentence.
        aOut.println();
    }
}
Also used : Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Dependency(de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency) LinkedHashMap(java.util.LinkedHashMap) SurfaceForm(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm) POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) Collection(java.util.Collection) ArrayList(java.util.ArrayList) List(java.util.List) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 78 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebAnnoSemanticGraphReader method convertToCas.

/**
 * Reads a line-oriented text/hypothesis file into the CAS.
 * <p>
 * Every input line is split at the first-class separators {@code "\t>\t"} or {@code "\tX\t"}
 * into a text segment and a hypothesis segment. For each line this method:
 * <ul>
 * <li>creates one {@link Token} per whitespace-delimited word of the text segment, one token for
 * the relation marker ({@code >} or {@code X}), and one token per word of the hypothesis
 * segment;</li>
 * <li>creates a {@link CoreferenceChain} whose first link (reference type {@code "text"}) spans
 * the text segment and whose next link (reference type {@code "hypothesis"}) spans the hypothesis
 * segment; the first link's relation is {@code "entails"} when the line contains
 * {@code "\t>\t"} and {@code "do not entails"} otherwise;</li>
 * <li>creates one {@link Sentence} covering the whole line.</li>
 * </ul>
 * The document text is rebuilt with tokens separated by single spaces and lines terminated by
 * {@code "\n"}, and is set on the CAS once all lines have been processed.
 * <p>
 * A line without a hypothesis segment yields no hypothesis tokens and a zero-width hypothesis
 * link instead of failing.
 *
 * @param aJCas the CAS to populate
 * @param aIs the stream to read from; it is not closed by this method
 * @param aEncoding the character encoding of {@code aIs}
 * @throws IOException if the stream cannot be read
 */
public void convertToCas(JCas aJCas, InputStream aIs, String aEncoding) throws IOException {
    StringBuilder text = new StringBuilder();
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    int tokenBeginPosition = 0;
    while (lineIterator.hasNext()) {
        String line = lineIterator.next();
        String[] contents = line.split("\t>\t|\tX\t");
        // String.split drops trailing empty strings, so the hypothesis part may be absent.
        String hypothesis = contents.length > 1 ? contents[1] : "";
        int sentenceBegin = tokenBeginPosition;

        // Text segment
        int chainBegin = tokenBeginPosition;
        tokenBeginPosition = addWhitespaceTokens(aJCas, contents[0], tokenBeginPosition, text);
        CoreferenceChain chain = new CoreferenceChain(aJCas);
        // The position is one past the trailing space, so "- 1" is the end of the last token.
        // Clamped so that an empty segment gives a zero-width link rather than end < begin.
        CoreferenceLink link = new CoreferenceLink(aJCas, chainBegin, Math.max(chainBegin, tokenBeginPosition - 1));
        link.setReferenceType("text");
        link.addToIndexes();
        chain.setFirst(link);

        // Relation marker: a single-character token, ">" for entailment and "X" otherwise.
        boolean entails = line.contains("\t>\t");
        link.setReferenceRelation(entails ? "entails" : "do not entails");
        Token relationToken = new Token(aJCas, tokenBeginPosition, tokenBeginPosition + 1);
        relationToken.addToIndexes();
        tokenBeginPosition = relationToken.getEnd() + 1;
        text.append(entails ? "> " : "X ");

        // Hypothesis segment (the part after the separator, not the text segment again).
        chainBegin = tokenBeginPosition;
        tokenBeginPosition = addWhitespaceTokens(aJCas, hypothesis, tokenBeginPosition, text);
        CoreferenceLink nextLink = new CoreferenceLink(aJCas, chainBegin, Math.max(chainBegin, tokenBeginPosition - 1));
        nextLink.setReferenceType("hypothesis");
        nextLink.addToIndexes();
        link.setNext(nextLink);
        chain.addToIndexes();

        text.append("\n");
        Sentence outSentence = new Sentence(aJCas);
        outSentence.setBegin(sentenceBegin);
        outSentence.setEnd(tokenBeginPosition);
        outSentence.addToIndexes();
        // Skip the line break that was just appended.
        tokenBeginPosition = tokenBeginPosition + 1;
    }
    aJCas.setDocumentText(text.toString());
}

/**
 * Adds one indexed {@link Token} per whitespace-delimited word of {@code aSegment}, starting at
 * {@code aBegin}, and appends each word followed by a single space to {@code aText}.
 *
 * @return the offset directly after the last appended space, or {@code aBegin} if the segment
 *         contains no words
 */
private static int addWhitespaceTokens(JCas aJCas, String aSegment, int aBegin, StringBuilder aText) {
    int position = aBegin;
    StringTokenizer words = new StringTokenizer(aSegment);
    while (words.hasMoreTokens()) {
        String word = words.nextToken();
        Token outToken = new Token(aJCas, position, position + word.length());
        outToken.addToIndexes();
        position = outToken.getEnd() + 1;
        aText.append(word).append(" ");
    }
    return position;
}
Also used : StringTokenizer(java.util.StringTokenizer) CoreferenceChain(de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain) CoreferenceLink(de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) LineIterator(org.apache.commons.io.LineIterator) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 79 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class LineOrientedTextReader method createSentence.

/**
 * Creates and indexes a {@link Sentence} for the given range after trimming it.
 * <p>
 * The offsets are first adjusted by {@code trim} against the document text. If the resulting
 * span is reported empty by {@code isEmpty}, no annotation is created.
 *
 * @param aJCas the CAS to add the sentence to
 * @param aBegin begin offset of the candidate sentence
 * @param aEnd end offset of the candidate sentence
 * @return the indexed sentence, or {@code null} if the trimmed span is empty
 */
protected Sentence createSentence(final JCas aJCas, final int aBegin, final int aEnd) {
    final int[] offsets = { aBegin, aEnd };
    trim(aJCas.getDocumentText(), offsets);
    if (isEmpty(offsets[0], offsets[1])) {
        return null;
    }
    final Sentence sentence = new Sentence(aJCas, offsets[0], offsets[1]);
    sentence.addToIndexes(aJCas);
    return sentence;
}
Also used : Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 80 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class WebannoTsv1Reader method createSentence.

/**
 * Adds the sentence layer to the CAS: one {@link Sentence} per entry of
 * {@code firstTokenInSentence}.
 * <p>
 * Tokens are looked up in {@code tokensStored} under the key {@code "t_" + number}. The sentence
 * begin is derived from the token referenced by the current entry; the sentence end is the end
 * of the token referenced by the next entry, or of token {@code "t_" + tokensStored.size()} for
 * the final entry.
 *
 * @param aJCas the CAS to add the sentences to
 * @param firstTokenInSentence token numbers marking the sentence boundaries
 * @param tokensStored previously created tokens keyed by {@code "t_" + number}
 */
private void createSentence(JCas aJCas, List<Integer> firstTokenInSentence, Map<String, Token> tokensStored) {
    final int lastIndex = firstTokenInSentence.size() - 1;
    for (int idx = 0; idx <= lastIndex; idx++) {
        final boolean isFirst = idx == 0;
        final boolean isLast = idx == lastIndex;
        Sentence sentence = new Sentence(aJCas);

        Token boundaryToken = tokensStored.get("t_" + firstTokenInSentence.get(idx));
        int begin;
        if (isFirst) {
            // First (possibly only) sentence starts where its token starts.
            begin = boundaryToken.getBegin();
        } else if (isLast) {
            // Last sentence, and not the only sentence in the document.
            begin = boundaryToken.getEnd();
        } else {
            begin = boundaryToken.getEnd() + 1;
        }
        sentence.setBegin(begin);

        // The final sentence runs to the end of the highest-numbered stored token.
        Token endToken = isLast
                ? tokensStored.get("t_" + tokensStored.size())
                : tokensStored.get("t_" + firstTokenInSentence.get(idx + 1));
        sentence.setEnd(endToken.getEnd());

        sentence.addToIndexes();
    }
}
Also used : Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Aggregations

Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)90 JCas (org.apache.uima.jcas.JCas)41 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)34 ArrayList (java.util.ArrayList)22 AnnotatorState (de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState)14 Type (org.apache.uima.cas.Type)12 AnnotationFS (org.apache.uima.cas.text.AnnotationFS)12 IOException (java.io.IOException)9 SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument)8 POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)8 Test (org.junit.Test)8 HashMap (java.util.HashMap)7 TokenBuilder (org.apache.uima.fit.testing.factory.TokenBuilder)7 AnnotationException (de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException)6 WebAnnoCasUtil.getFirstSentence (de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getFirstSentence)6 AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument)6 AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)6 FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)6 CASException (org.apache.uima.cas.CASException)6 AutomationTypeAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter)5