use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class ContextualityMeasureFeatureExtractor method extract.
/**
 * Extracts part-of-speech rate features and a contextuality measure for the target.
 *
 * <p>Each rate is the number of annotations of one POS type covered by the target divided by
 * the number of all {@code POS} annotations covered by the target. When the target covers no
 * {@code POS} annotation at all, every rate is reported as {@code 0.0} instead of {@code NaN}.
 *
 * @param jcas the CAS holding the POS annotations
 * @param aTarget the span whose covered annotations are counted
 * @return the seven rate features plus the contextuality measure feature
 * @throws TextClassificationException declared by the overridden method; not thrown here
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    Set<Feature> featSet = new HashSet<Feature>();
    double total = selectCovered(jcas, POS.class, aTarget).size();
    double noun = ratioOrZero(selectCovered(jcas, POS_NOUN.class, aTarget).size(), total);
    double adj = ratioOrZero(selectCovered(jcas, POS_ADJ.class, aTarget).size(), total);
    double prep = ratioOrZero(selectCovered(jcas, POS_ADP.class, aTarget).size(), total);
    // !includes determiners
    double art = ratioOrZero(selectCovered(jcas, POS_DET.class, aTarget).size(), total);
    double pro = ratioOrZero(selectCovered(jcas, POS_PRON.class, aTarget).size(), total);
    double verb = ratioOrZero(selectCovered(jcas, POS_VERB.class, aTarget).size(), total);
    double adv = ratioOrZero(selectCovered(jcas, POS_ADV.class, aTarget).size(), total);
    // noun freq + adj.freq. + prepositions freq. + article freq. - pronoun freq. - verb f. -
    // adverb - interjection + 100
    // NOTE(review): the formula quoted above also subtracts an interjection frequency, which is
    // not computed here; the +100 offset also suggests percentages while the rates above are
    // fractions -- confirm against the reference definition before changing either.
    double contextualityMeasure = 0.5 * (noun + adj + prep + art - pro - verb - adv + 100);
    featSet.add(new Feature("NounRate", noun, FeatureType.NUMERIC));
    featSet.add(new Feature("AdjectiveRate", adj, FeatureType.NUMERIC));
    featSet.add(new Feature("PrepositionRate", prep, FeatureType.NUMERIC));
    featSet.add(new Feature("ArticleRate", art, FeatureType.NUMERIC));
    featSet.add(new Feature("PronounRate", pro, FeatureType.NUMERIC));
    featSet.add(new Feature("VerbRate", verb, FeatureType.NUMERIC));
    featSet.add(new Feature("AdverbRate", adv, FeatureType.NUMERIC));
    featSet.add(new Feature(CONTEXTUALITY_MEASURE_FN, contextualityMeasure, FeatureType.NUMERIC));
    return featSet;
}

/** Returns {@code count / total}, or {@code 0.0} when {@code total} is not positive (avoids NaN). */
private static double ratioOrZero(int count, double total) {
    return total > 0 ? count / total : 0.0;
}
use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class TopicWordsFeatureExtractor method countWordHits.
/**
 * Computes the fraction of the given tokens that occur in a named word list.
 *
 * <p>The word list is looked up as a classpath resource via this class's class loader and read
 * as UTF-8, one entry per line. The resulting feature is named {@code prefix + wordListName}.
 *
 * @param wordListName resource name of the word list; also used in the feature name
 * @param tokens the token strings to look up in the word list
 * @return a single-element list holding the hit ratio feature (0 when there are no tokens)
 * @throws TextClassificationException if the word list resource does not exist or cannot be read
 */
private List<Feature> countWordHits(String wordListName, List<String> tokens) throws TextClassificationException {
    // word lists are stored in resources folder relative to feature extractor
    java.net.URL wordListUrl = TopicWordsFeatureExtractor.class.getClassLoader().getResource("./" + wordListName);
    if (wordListUrl == null) {
        // getResource signals a missing resource with null; fail with a descriptive error
        // instead of a bare NullPointerException.
        throw new TextClassificationException("Word list not found on classpath: " + wordListName);
    }
    // A hash set gives constant-time membership tests in the token loop below.
    Set<String> topicwords;
    try {
        topicwords = new HashSet<String>(FileUtils.readLines(new File(wordListUrl.getPath()), "utf-8"));
    } catch (IOException e) {
        throw new TextClassificationException(e);
    }
    int wordcount = 0;
    for (String token : tokens) {
        if (topicwords.contains(token)) {
            wordcount++;
        }
    }
    double numTokens = tokens.size();
    // name the feature same as wordlist
    return Arrays.asList(new Feature(prefix + wordListName, numTokens > 0 ? wordcount / numTokens : 0, FeatureType.NUMERIC));
}
use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class TopicWordsFeatureExtractor method extract.
/**
 * Builds one word-hit feature per line of the configured topic file.
 *
 * <p>The topic file at {@code topicFilePath} is read as UTF-8; each of its lines is passed,
 * together with the text of the tokens covered by the target, to {@code countWordHits}.
 *
 * @param jcas the CAS holding the token annotations
 * @param aTarget the span whose covered tokens are inspected
 * @return the union of the features returned for every line of the topic file
 * @throws TextClassificationException if no topic file path is set, the topic file cannot be
 *         read, or {@code countWordHits} fails
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    final boolean pathMissing = topicFilePath == null || topicFilePath.isEmpty();
    if (pathMissing) {
        throw new TextClassificationException("Path to word list must be set!");
    }

    final Set<Feature> collected = new HashSet<Feature>();
    final List<String> coveredTokens = JCasUtil.toText(JCasUtil.selectCovered(jcas, Token.class, aTarget));

    try {
        // every line of the topic file names one word list
        final List<String> wordListNames = FileUtils.readLines(new File(topicFilePath), "utf-8");
        for (String wordListName : wordListNames) {
            collected.addAll(countWordHits(wordListName, coveredTokens));
        }
    } catch (IOException ioFailure) {
        throw new TextClassificationException(ioFailure);
    }

    return collected;
}
use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class InitialCharacterUpperCase method extract.
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
String token = aTarget.getCoveredText();
boolean bool = Character.isUpperCase(token.charAt(0));
return new Feature(FEATURE_NAME, bool ? 1.0 : 0.0, bool == false, FeatureType.BOOLEAN).asSet();
}
use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class QuestionsRatioFeatureExtractor method extract.
/**
 * Computes the number of questions per sentence in the target's covered text.
 *
 * <p>A question is counted for every run of one or more consecutive question marks, including
 * a run at the very end of the text. The count is divided by the number of {@code Sentence}
 * annotations covered by the target; with no covered sentence the ratio is {@code 0.0}.
 *
 * @param jcas the CAS holding the sentence annotations
 * @param aTarget the span whose covered text and sentences are inspected
 * @return a singleton set holding the question ratio feature
 * @throws TextClassificationException declared by the overridden method; not thrown here
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    int nrOfSentences = JCasUtil.selectCovered(jcas, Sentence.class, aTarget).size();
    String text = aTarget.getCoveredText();
    // don't count multiple question marks as multiple questions: match only a '?' that is not
    // followed by another '?'. The lookahead consumes no character, so a question mark at the
    // end of the text is counted as well.
    Pattern p = Pattern.compile("\\?(?!\\?)");
    int matches = 0;
    Matcher m = p.matcher(text);
    while (m.find()) {
        matches++;
    }
    double questionRatio = 0.0;
    if (nrOfSentences > 0) {
        questionRatio = (double) matches / nrOfSentences;
    }
    return new Feature(FN_QUESTION_RATIO, questionRatio, FeatureType.NUMERIC).asSet();
}
Aggregations