Search in sources:

Example 86 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

the class SequenceOutcomeAnnotator method process.

/**
 * Adds sequence-classification annotations to the given CAS: one
 * {@code TextClassificationSequence} per {@code Sentence}, and for every {@code Token}
 * covered by that sentence a {@code TextClassificationTarget} plus a
 * {@code TextClassificationOutcome} over the token's span.
 *
 * @param aJCas the CAS whose sentences and tokens are read and to which the new
 *              annotations are added
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    for (Sentence sentence : JCasUtil.select(aJCas, Sentence.class)) {
        // The sequence covers the whole sentence and is indexed before its tokens are visited.
        new TextClassificationSequence(aJCas, sentence.getBegin(), sentence.getEnd()).addToIndexes();

        for (Token tok : JCasUtil.selectCovered(aJCas, Token.class, sentence)) {
            TextClassificationTarget target = new TextClassificationTarget(aJCas, tok.getBegin(), tok.getEnd());
            // tcId is state kept outside this method; every target takes the next value.
            target.setId(tcId++);
            target.setSuffix(tok.getCoveredText());
            target.addToIndexes();

            // The outcome shares the token span; its value is derived from the new target.
            TextClassificationOutcome label = new TextClassificationOutcome(aJCas, tok.getBegin(), tok.getEnd());
            label.setOutcome(getTextClassificationOutcome(aJCas, target));
            label.addToIndexes();
        }
    }
}
Also used : TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) TextClassificationSequence(org.dkpro.tc.api.type.TextClassificationSequence) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 87 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

the class PosNGramMC method sentenceBasedDistribution.

/**
 * Counts POS n-grams inside {@code focus}, one sentence at a time.
 *
 * @param jcas         the CAS holding the sentence and POS annotations
 * @param focus        only sentences covered by this annotation are considered
 * @param useCanonical if {@code true}, each tag is represented by the simple class name of
 *                     its POS annotation; otherwise by the annotation's POS value
 * @param minN         minimum n-gram length handed to the n-gram iterable
 * @param maxN         maximum n-gram length handed to the n-gram iterable
 * @return a frequency distribution over the n-grams joined with {@code NGRAM_GLUE}
 */
private static FrequencyDistribution<String> sentenceBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    FrequencyDistribution<String> distribution = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, focus)) {
        // Collect the tag sequence of this sentence in the chosen representation.
        List<String> tags = new ArrayList<String>();
        for (POS pos : selectCovered(jcas, POS.class, sentence)) {
            tags.add(useCanonical ? pos.getClass().getSimpleName() : pos.getPosValue());
        }
        String[] tagArray = tags.toArray(new String[tags.size()]);
        // N-grams are generated per sentence, from that sentence's tags only.
        for (List<String> ngram : new NGramStringListIterable(tagArray, minN, maxN)) {
            distribution.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return distribution;
}
Also used : POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) ArrayList(java.util.ArrayList) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 88 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

the class SkipWordNGramMC method getDocumentSkipNgrams.

/**
 * Counts the skip n-grams of the tokens in every sentence covered by {@code anno}.
 *
 * @param jcas                 the CAS holding the sentence annotations
 * @param anno                 only sentences covered by this annotation are processed
 * @param lowerCaseNGrams      if {@code true}, each n-gram is passed through {@code lower}
 *                             before filtering and counting
 * @param filterPartialMatches forwarded to {@code passesNgramFilter}
 * @param minN                 minimum n-gram length handed to the skip n-gram iterable
 * @param maxN                 maximum n-gram length handed to the skip n-gram iterable
 * @param skipN                skip size handed to the skip n-gram iterable
 * @param stopwords            forwarded to {@code passesNgramFilter}
 * @return a frequency distribution over the accepted n-grams joined with {@code NGRAM_GLUE}
 */
public static FrequencyDistribution<String> getDocumentSkipNgrams(JCas jcas, Annotation anno, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, int skipN, Set<String> stopwords) {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, anno)) {
        SkipNgramStringListIterable skipNgrams =
                new SkipNgramStringListIterable(toText(selectCovered(Token.class, sentence)), minN, maxN, skipN);
        for (List<String> rawNgram : skipNgrams) {
            List<String> candidate = rawNgram;
            if (lowerCaseNGrams) {
                candidate = lower(rawNgram);
            }
            // Skip n-grams rejected by the filter; only accepted ones are counted.
            if (!passesNgramFilter(candidate, stopwords, filterPartialMatches)) {
                continue;
            }
            counts.inc(StringUtils.join(candidate, NGRAM_GLUE));
        }
    }
    return counts;
}
Also used : SkipNgramStringListIterable(org.dkpro.tc.features.ngram.util.SkipNgramStringListIterable) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 89 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

the class NGramUtils method getAnnotationNgrams.

/**
 * Counts the token n-grams found inside {@code focusAnnotation}.
 * <p>
 * If the focus annotation covers at least one sentence, n-grams are extracted sentence by
 * sentence, each from that sentence's tokens only. Otherwise they are extracted from all
 * tokens covered by the focus annotation itself.
 *
 * @param jcas                 the CAS holding the sentence annotations
 * @param focusAnnotation      the span to extract n-grams from
 * @param lowerCaseNGrams      if {@code true}, each n-gram is passed through {@code lower}
 *                             before filtering and counting
 * @param filterPartialMatches forwarded to {@code passesNgramFilter}
 * @param minN                 minimum n-gram length handed to the n-gram iterable
 * @param maxN                 maximum n-gram length handed to the n-gram iterable
 * @param stopwords            forwarded to {@code passesNgramFilter}
 * @return a frequency distribution over the accepted n-grams joined with {@code NGRAM_GLUE}
 */
public static FrequencyDistribution<String> getAnnotationNgrams(JCas jcas, Annotation focusAnnotation, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords) {
    FrequencyDistribution<String> annoNgrams = new FrequencyDistribution<String>();
    // Look the covered sentences up once instead of once for the check and again for the loop.
    List<Sentence> sentences = selectCovered(jcas, Sentence.class, focusAnnotation);
    if (sentences.isEmpty()) {
        // No sentence inside the focus annotation: extract from all of its tokens.
        countCoveredTokenNgrams(annoNgrams, focusAnnotation, lowerCaseNGrams, filterPartialMatches, minN, maxN, stopwords);
    } else {
        // Sentences are available: extract the n-grams one sentence at a time.
        for (Sentence s : sentences) {
            countCoveredTokenNgrams(annoNgrams, s, lowerCaseNGrams, filterPartialMatches, minN, maxN, stopwords);
        }
    }
    return annoNgrams;
}

/**
 * Adds to {@code counts} every n-gram over the tokens covered by {@code span} that passes
 * the n-gram filter, lower-casing each n-gram first when requested.
 */
private static void countCoveredTokenNgrams(FrequencyDistribution<String> counts, Annotation span, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords) {
    for (List<String> ngram : new NGramStringListIterable(toText(selectCovered(Token.class, span)), minN, maxN)) {
        if (lowerCaseNGrams) {
            ngram = lower(ngram);
        }
        if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
            counts.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
}
Also used : Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 90 with Sentence

use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

the class EmoticonRatioTest method emoticonRatioFeatureExtractorTest.

/**
 * Runs {@code EmoticonRatio} on a short tokenised tweet whose trailing ";-)" is annotated
 * as {@code POS_EMO}, and expects exactly one feature with a value of 0.14 (tolerance 0.01).
 */
@Test
public void emoticonRatioFeatureExtractorTest() throws Exception {
    AnalysisEngine engine = createEngine(createEngineDescription(NoOpAnnotator.class));
    JCas jcas = engine.newJCas();
    jcas.setDocumentLanguage("en");

    TokenBuilder<Token, Sentence> tokenBuilder = TokenBuilder.create(Token.class, Sentence.class);
    tokenBuilder.buildTokens(jcas, "This is a very emotional tweet ;-)");

    // Offsets 31-34 are the ";-)" at the end of the text.
    POS_EMO emoticon = new POS_EMO(jcas);
    emoticon.setBegin(31);
    emoticon.setEnd(34);
    emoticon.addToIndexes();

    engine.process(jcas);

    // The classification target spans the whole document text.
    TextClassificationTarget aTarget = new TextClassificationTarget(jcas, 0, jcas.getDocumentText().length());
    aTarget.addToIndexes();

    List<Feature> features = new ArrayList<Feature>(new EmoticonRatio().extract(jcas, aTarget));
    Assert.assertEquals(1, features.size());
    // Exactly one feature is present once the size assertion has passed.
    assertFeature(EmoticonRatio.class.getSimpleName(), 0.14, features.get(0), 0.01);
}
Also used : EmoticonRatio(org.dkpro.tc.features.twitter.EmoticonRatio) POS_EMO(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_EMO) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) ArrayList(java.util.ArrayList) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) FeatureTestUtil.assertFeature(org.dkpro.tc.testing.FeatureTestUtil.assertFeature) Feature(org.dkpro.tc.api.features.Feature) AnalysisEngine(org.apache.uima.analysis_engine.AnalysisEngine) Test(org.junit.Test)

Aggregations

Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)90 JCas (org.apache.uima.jcas.JCas)41 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)34 ArrayList (java.util.ArrayList)22 AnnotatorState (de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState)14 Type (org.apache.uima.cas.Type)12 AnnotationFS (org.apache.uima.cas.text.AnnotationFS)12 IOException (java.io.IOException)9 SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument)8 POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)8 Test (org.junit.Test)8 HashMap (java.util.HashMap)7 TokenBuilder (org.apache.uima.fit.testing.factory.TokenBuilder)7 AnnotationException (de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException)6 WebAnnoCasUtil.getFirstSentence (de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getFirstSentence)6 AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument)6 AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature)6 FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)6 CASException (org.apache.uima.cas.CASException)6 AutomationTypeAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter)5