Search in sources:

Example 1 with JCasBuilder

Use of org.apache.uima.fit.factory.JCasBuilder in project webanno by webanno.

The class Conll2009Reader, method convert:

public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
        try {
            posMappingProvider.configure(aJCas.getCas());
        } catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }
    JCasBuilder doc = new JCasBuilder(aJCas);
    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
        if (words.isEmpty()) {
            // Ignore empty sentences; this can happen when multiple end-of-sentence markers follow each other.
            continue;
        }
        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;
        // Tokens, Lemma, POS
        Map<Integer, Token> tokens = new HashMap<Integer, Token>();
        List<SemPred> preds = new ArrayList<>();
        Iterator<String[]> wordIterator = words.iterator();
        while (wordIterator.hasNext()) {
            String[] word = wordIterator.next();
            // Read token
            Token token = doc.add(word[FORM], Token.class);
            tokens.put(Integer.valueOf(word[ID]), token);
            if (wordIterator.hasNext()) {
                doc.add(" ");
            }
            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }
            // Read part-of-speech tag
            if (!UNUSED.equals(word[POS]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POS]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
                pos.setPosValue(word[POS].intern());
                // WebAnno did not yet backport the coarse-grained POS feature from
                // DKPro Core 1.9.0
                // POSUtils.assignCoarseValue(pos);
                pos.addToIndexes();
                token.setPos(pos);
            }
            // Read morphological features
            if (!UNUSED.equals(word[FEAT]) && readMorph) {
                MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd());
                morphtag.setValue(word[FEAT]);
                morphtag.addToIndexes();
            }
            if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
                SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd());
                pred.setCategory(word[PRED]);
                pred.addToIndexes();
                preds.add(pred);
            }
            sentenceEnd = token.getEnd();
        }
        // Dependencies
        if (readDependency) {
            for (String[] word : words) {
                if (!UNUSED.equals(word[DEPREL])) {
                    int depId = Integer.valueOf(word[ID]);
                    int govId = Integer.valueOf(word[HEAD]);
                    // Model the root as a loop onto itself
                    if (govId == 0) {
                        // Not using ROOT here because WebAnno cannot deal with elevated
                        // types
                        Dependency rel = new Dependency(aJCas);
                        rel.setGovernor(tokens.get(depId));
                        rel.setDependent(tokens.get(depId));
                        rel.setDependencyType(word[DEPREL]);
                        rel.setBegin(rel.getDependent().getBegin());
                        rel.setEnd(rel.getDependent().getEnd());
                        // This is set via FSUtil because we still use the DKPro Core 1.7.0 JCas
                        // classes
                        FSUtil.setFeature(rel, "flavor", DependencyFlavor.BASIC);
                        rel.addToIndexes();
                    } else {
                        Dependency rel = new Dependency(aJCas);
                        rel.setGovernor(tokens.get(govId));
                        rel.setDependent(tokens.get(depId));
                        rel.setDependencyType(word[DEPREL]);
                        rel.setBegin(rel.getDependent().getBegin());
                        rel.setEnd(rel.getDependent().getEnd());
                        // This is set via FSUtil because we still use the DKPro Core 1.7.0 JCas
                        // classes
                        FSUtil.setFeature(rel, "flavor", DependencyFlavor.BASIC);
                        rel.addToIndexes();
                    }
                }
            }
        }
        // Semantic arguments
        if (readSemanticPredicate) {
            // Get arguments for one predicate at a time
            for (int p = 0; p < preds.size(); p++) {
                List<SemArgLink> args = new ArrayList<>();
                for (String[] word : words) {
                    if (!UNUSED.equals(word[APRED + p])) {
                        Token token = tokens.get(Integer.valueOf(word[ID]));
                        SemArg arg = new SemArg(aJCas, token.getBegin(), token.getEnd());
                        arg.addToIndexes();
                        SemArgLink link = new SemArgLink(aJCas);
                        link.setRole(word[APRED + p]);
                        link.setTarget(arg);
                        args.add(link);
                    }
                }
                SemPred pred = preds.get(p);
                pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
            }
        }
        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();
        // One sentence per line.
        doc.add("\n");
    }
    doc.close();
}
Also used: MorphologicalFeatures (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), SemArgLink (de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink), Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token), IOException (java.io.IOException), Dependency (de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency), AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException), Type (org.apache.uima.cas.Type), POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS), JCasBuilder (org.apache.uima.fit.factory.JCasBuilder), Lemma (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma), SemPred (de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred), Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence), SemArg (de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg)
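
The reader above relies on the core JCasBuilder idiom: doc.add(text, Token.class) appends text and creates a Token over it, plain doc.add(" ") appends separator text without an annotation, and doc.close() commits everything that was appended as the document text of the CAS. A minimal, self-contained sketch of that idiom, using a made-up sentence instead of CoNLL input:

import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

public class JCasBuilderSketch {
    public static void main(String[] args) throws Exception {
        JCas jcas = JCasFactory.createJCas();
        JCasBuilder doc = new JCasBuilder(jcas);
        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;
        for (String form : new String[] { "This", "is", "a", "test", "." }) {
            // Appends the token text and creates a Token annotation over it.
            Token token = doc.add(form, Token.class);
            sentenceEnd = token.getEnd();
            // Appends plain separator text without creating an annotation.
            doc.add(" ");
        }
        // Cover the token span (excluding the trailing space) with a Sentence.
        new Sentence(jcas, sentenceBegin, sentenceEnd).addToIndexes();
        doc.add("\n");
        // Commits the accumulated text as the document text of the CAS.
        doc.close();
        System.out.println(jcas.getDocumentText());
    }
}

Conceptually this is what convert() does per sentence, minus the CoNLL column handling.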

Example 2 with JCasBuilder

Use of org.apache.uima.fit.factory.JCasBuilder in project webanno by webanno.

The class ConllUReader, method convert:

public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
        try {
            posMappingProvider.configure(aJCas.getCas());
        } catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }
    JCasBuilder doc = new JCasBuilder(aJCas);
    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
        if (words.isEmpty()) {
            // Ignore empty sentences; this can happen when multiple end-of-sentence markers follow each other.
            continue;
        }
        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;
        int surfaceBegin = -1;
        int surfaceEnd = -1;
        String surfaceString = null;
        // Tokens, Lemma, POS
        Int2ObjectMap<Token> tokens = new Int2ObjectOpenHashMap<>();
        for (String[] word : words) {
            if (word[ID].contains("-")) {
                String[] fragments = word[ID].split("-");
                surfaceBegin = Integer.valueOf(fragments[0]);
                surfaceEnd = Integer.valueOf(fragments[1]);
                surfaceString = word[FORM];
                continue;
            }
            // Read token
            int tokenIdx = Integer.valueOf(word[ID]);
            Token token = doc.add(word[FORM], Token.class);
            tokens.put(tokenIdx, token);
            if (!StringUtils.contains(word[MISC], "SpaceAfter=No")) {
                doc.add(" ");
            }
            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }
            // Read part-of-speech tag
            if (!UNUSED.equals(word[POSTAG]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POSTAG]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
                pos.setPosValue(word[POSTAG]);
                pos.addToIndexes();
                token.setPos(pos);
            }
            // Read morphological features
            if (!UNUSED.equals(word[FEATS]) && readMorph) {
                MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd());
                morphtag.setValue(word[FEATS]);
                morphtag.addToIndexes();
                token.setMorph(morphtag);
                // Try parsing out individual feature values. Since the DKPro Core
                // MorphologicalFeatures type is based on the definition from the UD project,
                // we can do this rather straightforwardly.
                Type morphType = morphtag.getType();
                String[] items = word[FEATS].split("\\|");
                for (String item : items) {
                    String[] keyValue = item.split("=");
                    StringBuilder key = new StringBuilder(keyValue[0]);
                    key.setCharAt(0, Character.toLowerCase(key.charAt(0)));
                    String value = keyValue[1];
                    Feature feat = morphType.getFeatureByBaseName(key.toString());
                    if (feat != null) {
                        morphtag.setStringValue(feat, value);
                    }
                }
            }
            // Read surface form
            if (tokenIdx == surfaceEnd) {
                int begin = tokens.get(surfaceBegin).getBegin();
                int end = tokens.get(surfaceEnd).getEnd();
                SurfaceForm surfaceForm = new SurfaceForm(aJCas, begin, end);
                surfaceForm.setValue(surfaceString);
                surfaceForm.addToIndexes();
                surfaceBegin = -1;
                surfaceEnd = -1;
                surfaceString = null;
            }
            sentenceEnd = token.getEnd();
        }
        // Dependencies
        if (readDependency) {
            for (String[] word : words) {
                if (!UNUSED.equals(word[DEPREL])) {
                    int depId = Integer.valueOf(word[ID]);
                    int govId = Integer.valueOf(word[HEAD]);
                    // Model the root as a loop onto itself
                    makeDependency(aJCas, govId, depId, word[DEPREL], DependencyFlavor.BASIC, tokens, word);
                }
                if (!UNUSED.equals(word[DEPS])) {
                    // list items separated by vertical bar
                    String[] items = word[DEPS].split("\\|");
                    for (String item : items) {
                        String[] sItem = item.split(":");
                        int depId = Integer.valueOf(word[ID]);
                        int govId = Integer.valueOf(sItem[0]);
                        makeDependency(aJCas, govId, depId, sItem[1], DependencyFlavor.ENHANCED, tokens, word);
                    }
                }
            }
        }
        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();
        // One sentence per line.
        doc.add("\n");
    }
    doc.close();
}
Also used: Int2ObjectOpenHashMap (it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap), MorphologicalFeatures (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures), Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token), IOException (java.io.IOException), AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException), Feature (org.apache.uima.cas.Feature), Type (org.apache.uima.cas.Type), POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS), SurfaceForm (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm), JCasBuilder (org.apache.uima.fit.factory.JCasBuilder), Lemma (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma), Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)
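
Unlike the Conll2009Reader, the ConllUReader delegates dependency creation to a makeDependency helper whose body is not part of this snippet. Going by its two call sites above and by the inline dependency code in Example 1, a hedged sketch of such a helper might look like the following; the parameter names are assumptions, and it uses org.apache.uima.fit.util.FSUtil plus the fastutil Int2ObjectMap already seen above:

private Dependency makeDependency(JCas aJCas, int govId, int depId, String label, String flavor,
        Int2ObjectMap<Token> tokens, String[] word)
{
    Dependency rel = new Dependency(aJCas);
    if (govId == 0) {
        // Model the root as a loop onto itself, as in Example 1.
        rel.setGovernor(tokens.get(depId));
    }
    else {
        rel.setGovernor(tokens.get(govId));
    }
    rel.setDependent(tokens.get(depId));
    rel.setDependencyType(label);
    rel.setBegin(rel.getDependent().getBegin());
    rel.setEnd(rel.getDependent().getEnd());
    // Set via FSUtil because the legacy DKPro Core 1.7.0 JCas classes lack a flavor setter.
    FSUtil.setFeature(rel, "flavor", flavor);
    rel.addToIndexes();
    // The raw CoNLL-U row is passed along by the call sites but not needed in this sketch.
    return rel;
}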

Example 3 with JCasBuilder

Use of org.apache.uima.fit.factory.JCasBuilder in project dkpro-tc by dkpro.

The class NGramUtilsTest, method phoneticNgramsTest:

@Test
public void phoneticNgramsTest() throws Exception {
    String text = "This is a big house";
    JCas jcas = JCasFactory.createJCas();
    jcas.setDocumentLanguage("en");
    jcas.setDocumentText(text);
    TextClassificationTarget aTarget = new TextClassificationTarget(jcas, 0, text.length());
    aTarget.addToIndexes();
    JCasBuilder cb = new JCasBuilder(jcas);
    for (String token : text.split(" ")) {
        cb.add(token, Token.class);
    }
    cb.add(0, Sentence.class);
    FrequencyDistribution<String> ngrams = PhoneticNGramMC.getDocumentPhoneticNgrams(jcas, aTarget, 1, 3);
    assertEquals(12, ngrams.getN());
    assertTrue(ngrams.contains("I000"));
    assertTrue(ngrams.contains("T200"));
}
Also used: JCasBuilder (org.apache.uima.fit.factory.JCasBuilder), TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget), JCas (org.apache.uima.jcas.JCas), Test (org.junit.Test)

Example 4 with JCasBuilder

Use of org.apache.uima.fit.factory.JCasBuilder in project dkpro-tc by dkpro.

The class NGramUtilsTest, method characterBiGrams:

@Test
public void characterBiGrams() throws Exception {
    String text = "A house";
    JCas jcas = JCasFactory.createJCas();
    jcas.setDocumentLanguage("en");
    jcas.setDocumentText(text);
    JCasBuilder cb = new JCasBuilder(jcas);
    for (String token : text.split(" ")) {
        cb.add(token, Token.class);
    }
    TextClassificationTarget tu = new TextClassificationTarget(jcas, 2, 7);
    tu.addToIndexes();
    FrequencyDistribution<String> ngrams = CharacterNGramMC.getAnnotationCharacterNgrams(tu, false, 2, 3, '^', '$');
    for (String s : ngrams.getKeys()) {
        System.out.println(s);
    }
    assertEquals(11, ngrams.getN());
    assertTrue(ngrams.contains("^h"));
    assertTrue(ngrams.contains("ho"));
    assertTrue(ngrams.contains("ou"));
    assertTrue(ngrams.contains("us"));
    assertTrue(ngrams.contains("se"));
    assertTrue(ngrams.contains("se$"));
    assertTrue(ngrams.contains("^ho"));
    assertTrue(ngrams.contains("hou"));
    assertTrue(ngrams.contains("ous"));
    assertTrue(ngrams.contains("use"));
    assertTrue(ngrams.contains("se$"));
}
Also used: JCasBuilder (org.apache.uima.fit.factory.JCasBuilder), TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget), JCas (org.apache.uima.jcas.JCas), Test (org.junit.Test)

Example 5 with JCasBuilder

Use of org.apache.uima.fit.factory.JCasBuilder in project webanno by webanno.

The class BratAjaxCasUtilTest, method testIsSameSentence:

@Test
public void testIsSameSentence() throws Exception {
    JCas jcas = JCasFactory.createJCas();
    JCasBuilder jb = new JCasBuilder(jcas);
    Sentence s1 = jb.add("Sentence 1.", Sentence.class);
    jb.add(" ");
    Sentence s2 = jb.add("Sentence 2.", Sentence.class);
    jb.close();
    assertTrue(isSameSentence(jcas, s2.getBegin(), s2.getEnd()));
    assertTrue(isSameSentence(jcas, s2.getEnd(), s2.getBegin()));
    assertTrue(isSameSentence(jcas, s1.getBegin() + 1, s1.getEnd() - 1));
    assertTrue(isSameSentence(jcas, s1.getEnd() - 1, s1.getBegin() + 1));
    assertTrue(isSameSentence(jcas, s1.getBegin(), s1.getEnd()));
    assertTrue(isSameSentence(jcas, s1.getEnd(), s1.getBegin()));
    assertFalse(isSameSentence(jcas, s2.getBegin(), s1.getBegin()));
    assertFalse(isSameSentence(jcas, s1.getBegin(), s2.getBegin()));
    assertTrue(isSameSentence(jcas, 0, 0));
}
Also used: JCasBuilder (org.apache.uima.fit.factory.JCasBuilder), JCas (org.apache.uima.jcas.JCas), WebAnnoCasUtil.isSameSentence (de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.isSameSentence), Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence), Test (org.junit.Test)
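
The isSameSentence method is imported statically from WebAnnoCasUtil and its body is not shown here. Judging from the assertions above, it accepts the two offsets in either order and answers whether both fall within a single Sentence annotation; a hedged sketch along those lines:

public static boolean isSameSentence(JCas aJCas, int aBegin, int aEnd)
{
    // The test passes offsets in both orders, so normalise them first.
    int begin = Math.min(aBegin, aEnd);
    int end = Math.max(aBegin, aEnd);
    // True if some single Sentence annotation covers both offsets.
    for (Sentence s : org.apache.uima.fit.util.JCasUtil.select(aJCas, Sentence.class)) {
        if (s.getBegin() <= begin && end <= s.getEnd()) {
            return true;
        }
    }
    return false;
}

The actual WebAnno implementation may differ in detail; the sketch only reproduces the behaviour the assertions exercise.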

Aggregations

JCasBuilder (org.apache.uima.fit.factory.JCasBuilder): 5
Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence): 3
JCas (org.apache.uima.jcas.JCas): 3
Test (org.junit.Test): 3
MorphologicalFeatures (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures): 2
POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS): 2
Lemma (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma): 2
Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token): 2
IOException (java.io.IOException): 2
AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException): 2
Type (org.apache.uima.cas.Type): 2
TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget): 2
WebAnnoCasUtil.isSameSentence (de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.isSameSentence): 1
SurfaceForm (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm): 1
SemArg (de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg): 1
SemArgLink (de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink): 1
SemPred (de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred): 1
Dependency (de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency): 1
Int2ObjectOpenHashMap (it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap): 1
ArrayList (java.util.ArrayList): 1