Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.
Class SequenceOutcomeAnnotator, method process.
/**
 * Annotates every {@link Sentence} in the CAS as a {@link TextClassificationSequence} and
 * every {@link Token} covered by that sentence as a {@link TextClassificationTarget} with a
 * matching {@link TextClassificationOutcome}.
 *
 * <p>Target ids are taken from the running counter {@code tcId}, which is incremented once
 * per token. The outcome value is obtained from
 * {@code getTextClassificationOutcome(aJCas, target)}.
 *
 * @param aJCas the CAS to read sentences/tokens from and to add the new annotations to
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    for (Sentence sentence : JCasUtil.select(aJCas, Sentence.class)) {
        // One sequence per sentence, spanning exactly the sentence offsets.
        TextClassificationSequence seq =
                new TextClassificationSequence(aJCas, sentence.getBegin(), sentence.getEnd());
        seq.addToIndexes();

        for (Token tok : JCasUtil.selectCovered(aJCas, Token.class, sentence)) {
            int begin = tok.getBegin();
            int end = tok.getEnd();

            // The target mirrors the token span and carries the token text as suffix.
            TextClassificationTarget target = new TextClassificationTarget(aJCas, begin, end);
            target.setId(tcId++);
            target.setSuffix(tok.getCoveredText());
            target.addToIndexes();

            // The outcome covers the same span as the target it belongs to.
            TextClassificationOutcome label = new TextClassificationOutcome(aJCas, begin, end);
            label.setOutcome(getTextClassificationOutcome(aJCas, target));
            label.addToIndexes();
        }
    }
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.
Class PosNGramMC, method sentenceBasedDistribution.
/**
 * Counts part-of-speech n-grams sentence by sentence, so that no n-gram spans a sentence
 * boundary.
 *
 * @param jcas         the CAS holding the {@link Sentence} and {@link POS} annotations
 * @param focus        only sentences covered by this annotation are considered
 * @param useCanonical if {@code true}, the simple class name of each {@link POS} annotation
 *                     is used as the tag; otherwise its {@code getPosValue()}
 * @param minN         minimum n-gram length passed to {@code NGramStringListIterable}
 * @param maxN         maximum n-gram length passed to {@code NGramStringListIterable}
 * @return frequency distribution over the n-grams, each joined with {@code NGRAM_GLUE}
 */
private static FrequencyDistribution<String> sentenceBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();

    for (Sentence sentence : selectCovered(jcas, Sentence.class, focus)) {
        // Collect the tag sequence of this sentence in document order.
        List<String> tags = new ArrayList<String>();
        for (POS pos : selectCovered(jcas, POS.class, sentence)) {
            tags.add(useCanonical ? pos.getClass().getSimpleName() : pos.getPosValue());
        }

        String[] tagArray = tags.toArray(new String[0]);
        for (List<String> ngram : new NGramStringListIterable(tagArray, minN, maxN)) {
            String key = StringUtils.join(ngram, NGRAM_GLUE);
            counts.inc(key);
        }
    }

    return counts;
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.
Class SkipWordNGramMC, method getDocumentSkipNgrams.
/**
 * Counts token skip-n-grams sentence by sentence for all sentences covered by {@code anno}.
 *
 * @param jcas                 the CAS holding the {@link Sentence} annotations
 * @param anno                 only sentences covered by this annotation are considered
 * @param lowerCaseNGrams      if {@code true}, each n-gram is passed through {@code lower}
 *                             before filtering and counting
 * @param filterPartialMatches forwarded to {@code passesNgramFilter}
 * @param minN                 minimum n-gram length
 * @param maxN                 maximum n-gram length
 * @param skipN                skip size passed to {@code SkipNgramStringListIterable}
 * @param stopwords            stopword set forwarded to {@code passesNgramFilter}
 * @return frequency distribution over the accepted n-grams, each joined with
 *         {@code NGRAM_GLUE}
 */
public static FrequencyDistribution<String> getDocumentSkipNgrams(JCas jcas, Annotation anno, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, int skipN, Set<String> stopwords) {
    FrequencyDistribution<String> skipNgramCounts = new FrequencyDistribution<String>();

    for (Sentence sentence : selectCovered(jcas, Sentence.class, anno)) {
        SkipNgramStringListIterable candidates = new SkipNgramStringListIterable(
                toText(selectCovered(Token.class, sentence)), minN, maxN, skipN);

        for (List<String> candidate : candidates) {
            List<String> ngram = lowerCaseNGrams ? lower(candidate) : candidate;

            // Skip n-grams that the stopword filter rejects.
            if (!passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                continue;
            }
            skipNgramCounts.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }

    return skipNgramCounts;
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.
Class NGramUtils, method getAnnotationNgrams.
/**
 * Counts the token n-grams found inside {@code focusAnnotation}.
 *
 * <p>If the focus annotation covers at least one {@link Sentence}, n-grams are extracted
 * sentence by sentence. Otherwise they are extracted from all tokens covered by the focus
 * annotation itself.
 *
 * @param jcas                 the CAS holding the {@link Sentence} annotations
 * @param focusAnnotation      the annotation whose covered text is analysed
 * @param lowerCaseNGrams      if {@code true}, each n-gram is passed through {@code lower}
 *                             before filtering and counting
 * @param filterPartialMatches forwarded to {@code passesNgramFilter}
 * @param minN                 minimum n-gram length
 * @param maxN                 maximum n-gram length
 * @param stopwords            stopword set forwarded to {@code passesNgramFilter}
 * @return frequency distribution over the accepted n-grams, each joined with
 *         {@code NGRAM_GLUE}
 */
public static FrequencyDistribution<String> getAnnotationNgrams(JCas jcas, Annotation focusAnnotation, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords) {
    FrequencyDistribution<String> annoNgrams = new FrequencyDistribution<String>();

    // Look the covered sentences up once instead of once for the check and once for the loop.
    List<Sentence> sentences = selectCovered(jcas, Sentence.class, focusAnnotation);

    if (sentences.isEmpty()) {
        // No sentence annotations: extract n-grams from all tokens in the focus annotation.
        countFilteredNgrams(annoNgrams,
                new NGramStringListIterable(toText(selectCovered(Token.class, focusAnnotation)), minN, maxN),
                lowerCaseNGrams, filterPartialMatches, stopwords);
    } else {
        // Sentence annotations present: extract per sentence.
        for (Sentence s : sentences) {
            countFilteredNgrams(annoNgrams,
                    new NGramStringListIterable(toText(selectCovered(Token.class, s)), minN, maxN),
                    lowerCaseNGrams, filterPartialMatches, stopwords);
        }
    }
    return annoNgrams;
}

/**
 * Adds every n-gram from {@code ngrams} that passes the n-gram filter to {@code target},
 * optionally lower-casing it first.
 */
private static void countFilteredNgrams(FrequencyDistribution<String> target, Iterable<? extends List<String>> ngrams, boolean lowerCaseNGrams, boolean filterPartialMatches, Set<String> stopwords) {
    for (List<String> ngram : ngrams) {
        if (lowerCaseNGrams) {
            ngram = lower(ngram);
        }
        if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
            target.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.
Class EmoticonRatioTest, method emoticonRatioFeatureExtractorTest.
/**
 * Checks that {@code EmoticonRatio} yields exactly one feature with a value of about 0.14
 * for a seven-token text in which the span 31-34 (the text ";-)") is annotated as
 * {@code POS_EMO}.
 */
@Test
public void emoticonRatioFeatureExtractorTest() throws Exception {
    AnalysisEngine engine = createEngine(createEngineDescription(NoOpAnnotator.class));

    JCas jcas = engine.newJCas();
    jcas.setDocumentLanguage("en");

    TokenBuilder<Token, Sentence> tokenBuilder = TokenBuilder.create(Token.class, Sentence.class);
    tokenBuilder.buildTokens(jcas, "This is a very emotional tweet ;-)");

    // Mark the trailing ";-)" (offsets 31-34) as an emoticon.
    POS_EMO emoticon = new POS_EMO(jcas);
    emoticon.setBegin(31);
    emoticon.setEnd(34);
    emoticon.addToIndexes();

    engine.process(jcas);

    // The classification target spans the whole document text.
    int documentLength = jcas.getDocumentText().length();
    TextClassificationTarget target = new TextClassificationTarget(jcas, 0, documentLength);
    target.addToIndexes();

    EmoticonRatio ratioExtractor = new EmoticonRatio();
    List<Feature> extracted = new ArrayList<Feature>(ratioExtractor.extract(jcas, target));

    Assert.assertEquals(1, extracted.size());
    // Exactly one feature is present once the size assertion has passed.
    assertFeature(EmoticonRatio.class.getSimpleName(), 0.14, extracted.get(0), 0.01);
}
Aggregations