Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro:
class NGramUtils, method getDocumentNgrams.
/**
 * Returns document ngrams over any annotation type that extends Annotation. Intended use is
 * Lemma, Stem, etc.
 *
 * @param jcas
 *            a jcas
 * @param aTarget
 *            target annotation span; only sentences covered by it are considered
 * @param lowerCaseNGrams
 *            whether ngrams are lower-cased before filtering and counting
 * @param filterPartialMatches
 *            whether ngrams that only partially match the stopword filter are removed
 * @param minN
 *            minimal n
 * @param maxN
 *            maximal n
 * @param stopwords
 *            set of stopwords
 * @param annotationClass
 *            annotation type of the ngram
 * @return a frequency distribution of the ngram strings that pass the stopword filter
 *
 * @throws TextClassificationException
 *             when an exception occurs
 */
public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas, Annotation aTarget, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords, Class<? extends Annotation> annotationClass) throws TextClassificationException {
    FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<>();
    // Ngrams are collected per sentence, i.e. they never cross sentence boundaries.
    for (Sentence s : selectCovered(jcas, Sentence.class, aTarget)) {
        List<String> strings = valuesToText(jcas, s, annotationClass.getName());
        for (List<String> ngram : new NGramStringListIterable(strings, minN, maxN)) {
            if (lowerCaseNGrams) {
                ngram = lower(ngram);
            }
            if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                documentNgrams.inc(ngramString);
            }
        }
    }
    return documentNgrams;
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro:
class SequenceOutcomeReader, method setSentence.
/**
 * Creates a {@link Sentence} annotation covering the given offsets and adds it to the CAS
 * indexes.
 *
 * @param aJCas the CAS to annotate
 * @param begin begin offset of the sentence
 * @param end end offset of the sentence
 */
protected void setSentence(JCas aJCas, int begin, int end) {
    new Sentence(aJCas, begin, end).addToIndexes();
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro:
class NETest, method nEFeatureExtractorTest.
@Test
public void nEFeatureExtractorTest() throws Exception {
    // Build a minimal CAS via a no-op pipeline.
    AnalysisEngine engine = createEngine(NoOpAnnotator.class);
    JCas jcas = engine.newJCas();
    engine.process(jcas);

    TextClassificationTarget target = new TextClassificationTarget(jcas, 0, 22);
    target.addToIndexes();

    // One named entity of each kind on the same span, plus two sentences covering the target.
    new Location(jcas, 0, 5).addToIndexes();
    new Person(jcas, 0, 5).addToIndexes();
    new Organization(jcas, 0, 5).addToIndexes();
    new Sentence(jcas, 0, 15).addToIndexes();
    new Sentence(jcas, 15, 22).addToIndexes();

    Set<Feature> features = new NamedEntityPerSentenceRatio().extract(jcas, target);
    assertEquals(6, features.size());
    testFeatures(features, 1, 1, 1, 0.5f, 0.5f, 0.5f);
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno:
class ImportExportServiceImpl, method splitSentences.
/**
 * Segments the document text into sentences using the JDK sentence {@link BreakIterator}
 * (US locale) and adds a {@link Sentence} annotation for every segment that is non-empty
 * after trimming.
 *
 * @param aJCas the CAS whose document text is segmented
 */
public static void splitSentences(JCas aJCas) {
    String text = aJCas.getDocumentText();
    BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
    bi.setText(text);
    for (int begin = bi.first(), end = bi.next(); end != BreakIterator.DONE; begin = end, end = bi.next()) {
        // trim() adjusts the bounds in place to strip surrounding whitespace.
        int[] bounds = { begin, end };
        trim(text, bounds);
        if (!isEmpty(bounds[0], bounds[1])) {
            new Sentence(aJCas, bounds[0], bounds[1]).addToIndexes(aJCas);
        }
    }
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno:
class ImportExportServiceImpl, method tokenize.
/**
 * Segments each {@link Sentence} into tokens using the JDK word {@link BreakIterator}
 * (US locale), adding a {@link Token} annotation for every segment that is non-empty after
 * trimming. Segment offsets are sentence-relative and are shifted back into document space.
 *
 * @param aJCas the CAS whose sentences are tokenized
 */
public static void tokenize(JCas aJCas) {
    BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
    for (Sentence sentence : select(aJCas, Sentence.class)) {
        String text = sentence.getCoveredText();
        bi.setText(text);
        for (int begin = bi.first(), end = bi.next(); end != BreakIterator.DONE; begin = end, end = bi.next()) {
            // trim() adjusts the bounds in place to strip surrounding whitespace.
            int[] bounds = { begin, end };
            trim(text, bounds);
            if (!isEmpty(bounds[0], bounds[1])) {
                new Token(aJCas, bounds[0] + sentence.getBegin(), bounds[1] + sentence.getBegin()).addToIndexes(aJCas);
            }
        }
    }
}
Aggregations