Search in sources :

Example 46 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

the class NGramUtils method getDocumentNgrams.

/**
 * Counts the ngrams that occur inside a target span and returns them as a frequency
 * distribution. Works for any annotation type that extends Annotation (e.g. Lemma, Stem).
 * <p>
 * The ngrams are built separately for each {@link Sentence} covered by the target, so an ngram
 * never spans a sentence boundary. Lower-casing (if requested) is applied before the stopword
 * filter is evaluated. The tokens of an accepted ngram are joined with {@code NGRAM_GLUE} to
 * form the key that is counted.
 *
 * @param jcas
 *            the JCas holding the annotations
 * @param aTarget
 *            the annotation span whose covered sentences are processed
 * @param lowerCaseNGrams
 *            whether every ngram is lower-cased before filtering and counting
 * @param filterPartialMatches
 *            passed through to the ngram filter together with the stopwords
 * @param minN
 *            minimal ngram length
 * @param maxN
 *            maximal ngram length
 * @param stopwords
 *            set of stopwords handed to the ngram filter
 * @param annotationClass
 *            annotation type whose values form the ngram elements
 * @return frequency distribution of the joined ngram strings
 *
 * @throws TextClassificationException
 *             when an exception occurs
 */
public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas, Annotation aTarget, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords, Class<? extends Annotation> annotationClass) throws TextClassificationException {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, aTarget)) {
        List<String> values = valuesToText(jcas, sentence, annotationClass.getName());
        for (List<String> candidate : new NGramStringListIterable(values, minN, maxN)) {
            // Normalise case first so the filter sees the same form that gets counted.
            List<String> ngram = lowerCaseNGrams ? lower(candidate) : candidate;
            if (!passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                continue;
            }
            counts.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return counts;
}
Also used : Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 47 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

the class SequenceOutcomeReader method setSentence.

/**
 * Creates a {@link Sentence} annotation over the given offsets and adds it to the indexes.
 *
 * @param aJCas
 *            the JCas the sentence is created in
 * @param begin
 *            begin offset of the sentence
 * @param end
 *            end offset of the sentence
 */
protected void setSentence(JCas aJCas, int begin, int end) {
    new Sentence(aJCas, begin, end).addToIndexes();
}
Also used : Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 48 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

the class NETest method nEFeatureExtractorTest.

/**
 * Runs {@code NamedEntityPerSentenceRatio} on a target spanning offsets 0-22 that holds one
 * Location, one Person and one Organization (each at 0-5) plus two sentences (0-15 and 15-22),
 * and checks that six features with the expected values are produced.
 */
@Test
public void nEFeatureExtractorTest() throws Exception {
    AnalysisEngine noOpEngine = createEngine(NoOpAnnotator.class);
    JCas jcas = noOpEngine.newJCas();
    noOpEngine.process(jcas);

    TextClassificationTarget aTarget = new TextClassificationTarget(jcas, 0, 22);
    aTarget.addToIndexes();

    // One named entity of each kind, all over the same span.
    Location location = new Location(jcas, 0, 5);
    Person person = new Person(jcas, 0, 5);
    Organization organization = new Organization(jcas, 0, 5);

    // Two adjacent sentences that together cover the whole target.
    Sentence firstSentence = new Sentence(jcas, 0, 15);
    Sentence secondSentence = new Sentence(jcas, 15, 22);

    location.addToIndexes();
    person.addToIndexes();
    organization.addToIndexes();
    firstSentence.addToIndexes();
    secondSentence.addToIndexes();

    Set<Feature> extracted = new NamedEntityPerSentenceRatio().extract(jcas, aTarget);

    assertEquals(6, extracted.size());
    testFeatures(extracted, 1, 1, 1, 0.5f, 0.5f, 0.5f);
}
Also used : Organization(de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) JCas(org.apache.uima.jcas.JCas) Person(de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) Feature(org.dkpro.tc.api.features.Feature) AnalysisEngine(org.apache.uima.analysis_engine.AnalysisEngine) Location(de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location) Test(org.junit.Test)

Example 49 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class ImportExportServiceImpl method splitSentences.

/**
 * Segments the document text of the given JCas into sentences using the
 * {@link BreakIterator} sentence instance for {@link Locale#US} and adds one {@link Sentence}
 * annotation per non-empty segment to the indexes.
 * <p>
 * Each raw segment is passed through {@code trim} before it is checked with {@code isEmpty};
 * both helpers are defined elsewhere (presumably trim strips surrounding whitespace from the
 * span - confirm against the helper).
 *
 * @param aJCas
 *            the JCas whose document text is split; receives the new annotations
 */
public static void splitSentences(JCas aJCas) {
    BreakIterator sentenceBreaks = BreakIterator.getSentenceInstance(Locale.US);
    sentenceBreaks.setText(aJCas.getDocumentText());
    int segmentStart = sentenceBreaks.first();
    for (int segmentEnd = sentenceBreaks.next(); segmentEnd != BreakIterator.DONE; segmentEnd = sentenceBreaks.next()) {
        int[] bounds = { segmentStart, segmentEnd };
        trim(aJCas.getDocumentText(), bounds);
        if (!isEmpty(bounds[0], bounds[1])) {
            new Sentence(aJCas, bounds[0], bounds[1]).addToIndexes(aJCas);
        }
        // The next segment begins where this one ended.
        segmentStart = segmentEnd;
    }
}
Also used : Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) BreakIterator(java.text.BreakIterator)

Example 50 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.

the class ImportExportServiceImpl method tokenize.

/**
 * Splits every {@link Sentence} in the given JCas into tokens using the {@link BreakIterator}
 * word instance for {@link Locale#US} and adds one {@link Token} annotation per non-empty
 * segment to the indexes.
 * <p>
 * Break positions are relative to the sentence text, so the sentence begin offset is added to
 * obtain document offsets. Each raw segment is passed through {@code trim} before it is checked
 * with {@code isEmpty}; both helpers are defined elsewhere.
 *
 * @param aJCas
 *            the JCas whose sentences are tokenized; receives the new annotations
 */
public static void tokenize(JCas aJCas) {
    BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
    for (Sentence s : select(aJCas, Sentence.class)) {
        // The covered text and begin offset are fixed for the whole sentence; fetch them once
        // instead of re-deriving the text for every word boundary.
        String sentenceText = s.getCoveredText();
        int sentenceBegin = s.getBegin();
        bi.setText(sentenceText);
        int last = bi.first();
        int cur = bi.next();
        while (cur != BreakIterator.DONE) {
            int[] span = new int[] { last, cur };
            trim(sentenceText, span);
            if (!isEmpty(span[0], span[1])) {
                // Shift sentence-relative offsets to document offsets.
                Token seg = new Token(aJCas, span[0] + sentenceBegin, span[1] + sentenceBegin);
                seg.addToIndexes(aJCas);
            }
            last = cur;
            cur = bi.next();
        }
    }
}
Also used : Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) BreakIterator(java.text.BreakIterator)

Aggregations

Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence): 90, JCas (org.apache.uima.jcas.JCas): 41, Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token): 34, ArrayList (java.util.ArrayList): 22, AnnotatorState (de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState): 14, Type (org.apache.uima.cas.Type): 12, AnnotationFS (org.apache.uima.cas.text.AnnotationFS): 12, IOException (java.io.IOException): 9, SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument): 8, POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS): 8, Test (org.junit.Test): 8, HashMap (java.util.HashMap): 7, TokenBuilder (org.apache.uima.fit.testing.factory.TokenBuilder): 7, AnnotationException (de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException): 6, WebAnnoCasUtil.getFirstSentence (de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getFirstSentence): 6, AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument): 6, AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature): 6, FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution): 6, CASException (org.apache.uima.cas.CASException): 6, AutomationTypeAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter): 5