use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.
the class TopicWordsFeatureExtractor method countWordHits.
/**
 * Computes the share of tokens that occur in the given word list.
 *
 * @param wordListName
 *            name of the word list resource, looked up via the class loader of this feature
 *            extractor; also used (with {@code prefix}) as the feature name
 * @param tokens
 *            token texts to look up in the word list
 * @return a single-element list holding a numeric feature whose value is the number of tokens
 *         contained in the word list divided by the total number of tokens, or 0 if there are
 *         no tokens
 * @throws TextClassificationException
 *             if the word list cannot be located, is not a plain file, or cannot be read
 */
private List<Feature> countWordHits(String wordListName, List<String> tokens) throws TextClassificationException {
    // word lists are stored in resources folder relative to feature extractor
    java.net.URL wordListUrl = TopicWordsFeatureExtractor.class.getClassLoader().getResource("./" + wordListName);
    if (wordListUrl == null) {
        // getResource() returns null for a missing resource; fail with a clear message instead of an NPE
        throw new TextClassificationException("Could not find word list [" + wordListName + "] on the classpath.");
    }
    // URL#getPath() keeps URL escapes (e.g. %20 for spaces); FileUtils.toFile decodes them
    File wordListFile = FileUtils.toFile(wordListUrl);
    if (wordListFile == null) {
        // toFile yields null for non-"file" URLs (e.g. resources packed inside a jar)
        throw new TextClassificationException("Word list [" + wordListName + "] is not a plain file: " + wordListUrl);
    }
    Set<String> topicWords;
    try {
        // a hash set makes each token lookup constant-time instead of a scan over the whole list
        topicWords = new HashSet<String>(FileUtils.readLines(wordListFile, "utf-8"));
    } catch (IOException e) {
        throw new TextClassificationException(e);
    }
    int wordCount = 0;
    for (String token : tokens) {
        if (topicWords.contains(token)) {
            wordCount++;
        }
    }
    // kept as double so that the ratio below is a floating point division
    double numTokens = tokens.size();
    // name the feature same as wordlist
    return Arrays.asList(new Feature(prefix + wordListName, numTokens > 0 ? wordCount / numTokens : 0, FeatureType.NUMERIC));
}
use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.
the class TopicWordsFeatureExtractor method extract.
/**
 * Extracts one word-list feature per entry of the topic file.
 * <p>
 * Every line of the file at {@code topicFilePath} is treated as the name of a word list and
 * handed to {@code countWordHits} together with the texts of the tokens covered by the target.
 *
 * @param jcas
 *            the CAS holding the tokens
 * @param aTarget
 *            the classification target whose covered tokens are evaluated
 * @return the features collected over all word lists named in the topic file
 * @throws TextClassificationException
 *             if {@code topicFilePath} is unset or empty, if the topic file cannot be read, or
 *             if counting hits for a word list fails
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    if (topicFilePath == null || topicFilePath.isEmpty()) {
        throw new TextClassificationException("Path to word list must be set!");
    }
    List<String> targetTokens = JCasUtil.toText(JCasUtil.selectCovered(jcas, Token.class, aTarget));

    // only the file read can raise an IOException, so the try block is limited to it
    List<String> wordListNames;
    try {
        wordListNames = FileUtils.readLines(new File(topicFilePath), "utf-8");
    } catch (IOException e) {
        throw new TextClassificationException(e);
    }

    Set<Feature> result = new HashSet<Feature>();
    for (String wordListName : wordListNames) {
        result.addAll(countWordHits(wordListName, targetTokens));
    }
    return result;
}
use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.
the class InitTask method getPreValidityCheckEngine.
/**
 * Builds the description of the {@link ValidityCheckConnector} engine, configured with the
 * learning mode, feature mode, bipartition threshold and feature extractor names of this task.
 *
 * @return the engine description for the validity check
 * @throws ResourceInitializationException
 *             if no feature extractors have been set, or if the description cannot be created
 */
private AnalysisEngineDescription getPreValidityCheckEngine() throws ResourceInitializationException {
    // check mandatory dimensions
    if (featureExtractors == null) {
        throw new ResourceInitializationException(new TextClassificationException("No feature extractors have been added to the experiment."));
    }
    // alternating parameter-name / parameter-value pairs
    Object[] configuration = {
        ValidityCheckConnector.PARAM_LEARNING_MODE, learningMode,
        ValidityCheckConnector.PARAM_FEATURE_MODE, featureMode,
        ValidityCheckConnector.PARAM_BIPARTITION_THRESHOLD, threshold,
        ValidityCheckConnector.PARAM_FEATURE_EXTRACTORS, getFeatureExtractorNames(featureExtractors)
    };
    return createEngineDescription(ValidityCheckConnector.class, configuration);
}
use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.
the class InstanceExtractor method getUnitInstances.
/**
 * Creates one {@link Instance} for every {@link TextClassificationTarget} found in the CAS.
 * <p>
 * Each instance receives the features of all configured feature extractors (preceded by the
 * instance id feature if {@code addInstanceId} is set), its outcomes, its weight, the id of the
 * CAS and the id of the target as sequence position.
 *
 * @param jcas
 *            the CAS to extract instances from
 * @param supportSparseFeatures
 *            if true the features are obtained via {@code getSparse}, otherwise via
 *            {@code getDense}
 * @return the instances in the iteration order of the selected targets
 * @throws TextClassificationException
 *             if a configured feature extractor does not implement {@link FeatureExtractor}, or
 *             if a called helper fails
 */
public List<Instance> getUnitInstances(JCas jcas, boolean supportSparseFeatures) throws TextClassificationException {
    List<Instance> result = new ArrayList<Instance>();
    int casId = JCasUtil.selectSingle(jcas, JCasId.class).getId();

    for (TextClassificationTarget target : JCasUtil.select(jcas, TextClassificationTarget.class)) {
        Instance current = new Instance();

        if (addInstanceId) {
            current.addFeature(InstanceIdFeature.retrieve(jcas, target));
        }

        for (FeatureExtractorResource_ImplBase extractor : featureExtractors) {
            if (!(extractor instanceof FeatureExtractor)) {
                throw new TextClassificationException("Feature extractor does not implement interface [" + FeatureExtractor.class.getName() + "]: " + extractor.getResourceName());
            }
            if (!supportSparseFeatures) {
                current.addFeatures(getDense(jcas, target, extractor));
            } else {
                current.addFeatures(getSparse(jcas, target, extractor));
            }
        }

        // set and write outcome label(s)
        current.setOutcomes(getOutcomes(jcas, target));
        current.setWeight(getWeight(jcas, target));
        current.setJcasId(casId);
        // no sequence id is assigned here; only the position within the sequence is recorded
        current.setSequencePosition(target.getId());

        result.add(current);
    }
    return result;
}
use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.
the class InstanceExtractor method getWeight.
/**
 * Reads the instance weight from the {@link TextClassificationOutcome} annotations.
 * <p>
 * If {@code unit} is null all outcomes of the CAS are considered, otherwise only those covered
 * by {@code unit}. When several outcomes are found, the weight of the last one iterated is
 * returned.
 *
 * @param jcas
 *            the CAS holding the outcome annotations
 * @param unit
 *            the annotation restricting the lookup, or null for the whole CAS
 * @return the weight of the last outcome found
 * @throws TextClassificationException
 *             if no outcome annotation is found
 */
private double getWeight(JCas jcas, AnnotationFS unit) throws TextClassificationException {
    Collection<TextClassificationOutcome> candidates = (unit == null)
            ? JCasUtil.select(jcas, TextClassificationOutcome.class)
            : JCasUtil.selectCovered(jcas, TextClassificationOutcome.class, unit);

    if (candidates.isEmpty()) {
        throw new TextClassificationException("No instance weight annotation present in current CAS.");
    }

    // each iteration overwrites the previous value, so the last outcome wins
    double result = -1.0;
    for (TextClassificationOutcome candidate : candidates) {
        result = candidate.getWeight();
    }
    return result;
}
Aggregations