use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
the class BrownCorpusReader method getNext.
/**
 * Lets the superclass fill the CAS and then adds the sequence-classification annotations:
 * one {@code TextClassificationSequence} per {@code Sentence}, and for every {@code Token}
 * covered by that sentence one {@code TextClassificationTarget} together with one
 * {@code TextClassificationOutcome} spanning the same offsets.
 *
 * @param cas
 *            the CAS to populate
 * @throws IOException
 *             declared by the overridden method; not thrown directly by this code
 * @throws CollectionException
 *             if the JCas view cannot be obtained from the CAS
 */
@Override
public void getNext(CAS cas) throws IOException, CollectionException {
    super.getNext(cas);

    final JCas document;
    try {
        document = cas.getJCas();
    } catch (CASException cause) {
        throw new CollectionException(cause);
    }

    for (Sentence sent : JCasUtil.select(document, Sentence.class)) {
        // Every sentence is one classification sequence.
        new TextClassificationSequence(document, sent.getBegin(), sent.getEnd()).addToIndexes();

        for (Token tok : JCasUtil.selectCovered(document, Token.class, sent)) {
            final int begin = tok.getBegin();
            final int end = tok.getEnd();

            TextClassificationTarget target = new TextClassificationTarget(document, begin, end);
            // The token text is attached as a suffix to the ID of this target.
            target.setSuffix(tok.getCoveredText());
            target.addToIndexes();

            // The outcome is looked up only after the target has been indexed.
            TextClassificationOutcome label = new TextClassificationOutcome(document, begin, end);
            label.setOutcome(getTextClassificationOutcome(document, target));
            label.addToIndexes();
        }
    }
}
use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
the class CosineFeatureExtractor method extract.
/**
 * Produces a single numeric feature holding the similarity between the n-gram key sets of
 * the two views, as computed by the configured {@code measure}.
 * <p>
 * The feature is named {@code "Similarity" + measure.getName()}. A {@code NaN} similarity
 * is reported as {@code 0.0}.
 *
 * @param view1
 *            first view; must contain exactly one {@code TextClassificationTarget}
 * @param view2
 *            second view; must contain exactly one {@code TextClassificationTarget}
 * @return a set containing the one similarity feature
 * @throws TextClassificationException
 *             wrapping any {@code SimilarityException} raised by the measure
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    try {
        TextClassificationTarget firstTarget = JCasUtil.selectSingle(view1,
                TextClassificationTarget.class);
        TextClassificationTarget secondTarget = JCasUtil.selectSingle(view2,
                TextClassificationTarget.class);

        // Note: getSimilarity(String, String) is *not* a convenience
        // method for getSimilarity(Collection<String>, Collection<String>),
        // so the key sets are passed as collections on purpose.
        Set<String> firstKeys = NGramUtils
                .getDocumentNgrams(view1, firstTarget, true, false, 1, 1, stopwords,
                        ngramAnnotationType)
                .getKeys();
        Set<String> secondKeys = NGramUtils
                .getDocumentNgrams(view2, secondTarget, true, false, 1, 1, stopwords,
                        ngramAnnotationType)
                .getKeys();

        double score = measure.getSimilarity(firstKeys, secondKeys);
        // Temporary fix for DKPro Similarity Issue 30
        double value = Double.isNaN(score) ? 0.0 : score;

        Feature feature = new Feature("Similarity" + measure.getName(), value,
                FeatureType.NUMERIC);
        return feature.asSet();
    } catch (SimilarityException cause) {
        throw new TextClassificationException(cause);
    }
}
use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
the class CharacterNGramMC method getNgramsFD.
/**
 * Collects the character n-grams of the complete document text.
 * <p>
 * A {@code TextClassificationTarget} spanning offset 0 to the length of the document text
 * is created as the annotation to extract from; this method does not add it to the indexes.
 *
 * @param jcas
 *            the document to extract n-grams from
 * @return the frequency distribution returned by {@code getAnnotationCharacterNgrams}
 */
@Override
protected FrequencyDistribution<String> getNgramsFD(JCas jcas) {
    int documentEnd = jcas.getDocumentText().length();
    TextClassificationTarget wholeText = new TextClassificationTarget(jcas, 0, documentEnd);
    // '^' and '$' are presumably the begin/end boundary markers -- confirm against
    // getAnnotationCharacterNgrams.
    return getAnnotationCharacterNgrams(wholeText, lowerCase, ngramMinN, ngramMaxN, '^', '$');
}
use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
the class WordNGramMC method getNgramsFD.
/**
 * Collects the {@code Token}-based n-grams of the complete document text.
 * <p>
 * A {@code TextClassificationTarget} spanning offset 0 to the length of the document text
 * is created as the extraction target; this method does not add it to the indexes.
 *
 * @param jcas
 *            the document to extract n-grams from
 * @return the frequency distribution returned by {@code NGramUtils.getDocumentNgrams}
 * @throws TextClassificationException
 *             declared by this method; presumably raised by the n-gram extraction
 */
@Override
protected FrequencyDistribution<String> getNgramsFD(JCas jcas) throws TextClassificationException {
    int documentEnd = jcas.getDocumentText().length();
    TextClassificationTarget wholeText = new TextClassificationTarget(jcas, 0, documentEnd);
    return NGramUtils.getDocumentNgrams(
            jcas,
            wholeText,
            ngramLowerCase,
            filterPartialStopwordMatches,
            ngramMinN,
            ngramMaxN,
            stopwords,
            Token.class);
}
use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
the class KeywordNGramUtils method getMultipleViewKeywordNgrams.
/**
 * Extracts keyword n-grams from each of the given views and accumulates them into one
 * combined frequency distribution.
 * <p>
 * Every view must contain exactly one {@code TextClassificationTarget}, which is passed as
 * the extraction target to {@code getDocumentKeywordNgrams}. All remaining arguments are
 * forwarded unchanged to that method.
 *
 * @param jcases
 *            the views to process
 * @param minN
 *            forwarded to {@code getDocumentKeywordNgrams}
 * @param maxN
 *            forwarded to {@code getDocumentKeywordNgrams}
 * @param markSentenceBoundary
 *            forwarded to {@code getDocumentKeywordNgrams}
 * @param markSentenceLocation
 *            forwarded to {@code getDocumentKeywordNgrams}
 * @param includeCommas
 *            forwarded to {@code getDocumentKeywordNgrams}
 * @param keywords
 *            forwarded to {@code getDocumentKeywordNgrams}
 * @return the summed n-gram counts over all views; empty if {@code jcases} is empty
 */
public static FrequencyDistribution<String> getMultipleViewKeywordNgrams(
        List<JCas> jcases,
        int minN,
        int maxN,
        boolean markSentenceBoundary,
        boolean markSentenceLocation,
        boolean includeCommas,
        Set<String> keywords) {
    FrequencyDistribution<String> combined = new FrequencyDistribution<>();

    for (JCas currentView : jcases) {
        TextClassificationTarget target = JCasUtil.selectSingle(currentView,
                TextClassificationTarget.class);
        FrequencyDistribution<String> perView = getDocumentKeywordNgrams(currentView, target,
                minN, maxN, markSentenceBoundary, markSentenceLocation, includeCommas,
                keywords);

        // This is a hack because there's no method to combine 2 FD's:
        // copy each key over together with its count.
        for (String ngram : perView.getKeys()) {
            combined.addSample(ngram, perView.getCount(ngram));
        }
    }

    return combined;
}
Aggregations