Search in sources:

Example 6 with TextClassificationException

use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.

the class TopicWordsFeatureExtractor method countWordHits.

/**
 * Computes the fraction of the given tokens that occur in one word list.
 *
 * The word list is looked up as a classpath resource named {@code "./" + wordListName}
 * and read as UTF-8, one entry per line.
 *
 * @param wordListName
 *            name of the word list resource; also used (after {@code prefix}) as the
 *            feature name
 * @param tokens
 *            the token texts to look up in the word list
 * @return a single-element list holding a numeric feature whose value is
 *         {@code hits / tokens.size()}, or {@code 0} when {@code tokens} is empty
 * @throws TextClassificationException
 *             if the word list cannot be located, is not a plain file, or cannot be read
 */
private List<Feature> countWordHits(String wordListName, List<String> tokens) throws TextClassificationException {
    // word lists are stored in resources folder relative to feature extractor
    java.net.URL wordListUrl = TopicWordsFeatureExtractor.class.getClassLoader().getResource("./" + wordListName);
    if (wordListUrl == null) {
        // report a missing resource through the declared exception instead of a bare NPE
        throw new TextClassificationException("Word list [" + wordListName + "] was not found on the classpath.");
    }
    // URL.getPath() keeps URL escapes (e.g. %20 for a space), which breaks file access;
    // FileUtils.toFile decodes them and yields null for URLs that are not file-based
    File wordListFile = FileUtils.toFile(wordListUrl);
    if (wordListFile == null) {
        throw new TextClassificationException("Word list [" + wordListUrl + "] is not a plain file.");
    }
    Set<String> topicWords;
    try {
        // a hash set gives constant-time membership tests in the loop below
        topicWords = new HashSet<String>(FileUtils.readLines(wordListFile, "utf-8"));
    } catch (IOException e) {
        throw new TextClassificationException(e);
    }
    int wordCount = 0;
    for (String token : tokens) {
        if (topicWords.contains(token)) {
            wordCount++;
        }
    }
    // kept as double so that the ratio below is a floating-point division
    double numTokens = tokens.size();
    // name the feature same as wordlist
    return Arrays.asList(new Feature(prefix + wordListName, numTokens > 0 ? wordCount / numTokens : 0, FeatureType.NUMERIC));
}
Also used : TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) IOException(java.io.IOException) File(java.io.File) Feature(org.dkpro.tc.api.features.Feature)

Example 7 with TextClassificationException

use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.

the class TopicWordsFeatureExtractor method extract.

/**
 * Produces one word-list feature per entry of the topic file.
 *
 * The file at {@code topicFilePath} is read as UTF-8; every line is handed to
 * {@code countWordHits} as a word list name, together with the texts of the
 * {@code Token} annotations covered by the target.
 *
 * @param jcas
 *            the CAS to read tokens from
 * @param aTarget
 *            the classification target whose covered tokens are evaluated
 * @return the union of the features returned for each word list
 * @throws TextClassificationException
 *             if {@code topicFilePath} is null or empty, or if the topic file cannot be read
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    boolean pathMissing = topicFilePath == null || topicFilePath.isEmpty();
    if (pathMissing) {
        throw new TextClassificationException("Path to word list must be set!");
    }

    List<String> coveredTokens = JCasUtil.toText(JCasUtil.selectCovered(jcas, Token.class, aTarget));

    // only reading the topic file can raise an IOException
    List<String> wordListNames;
    try {
        wordListNames = FileUtils.readLines(new File(topicFilePath), "utf-8");
    } catch (IOException e) {
        throw new TextClassificationException(e);
    }

    Set<Feature> collected = new HashSet<Feature>();
    for (String wordListName : wordListNames) {
        collected.addAll(countWordHits(wordListName, coveredTokens));
    }
    return collected;
}
Also used : TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) IOException(java.io.IOException) Feature(org.dkpro.tc.api.features.Feature) File(java.io.File) HashSet(java.util.HashSet)

Example 8 with TextClassificationException

use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.

the class InitTask method getPreValidityCheckEngine.

/**
 * Builds the engine description for the {@code ValidityCheckConnector}.
 *
 * The connector is configured with the learning mode, feature mode, bipartition
 * threshold and the names of the configured feature extractors.
 *
 * @return the engine description of the validity check connector
 * @throws ResourceInitializationException
 *             if no feature extractors have been set (wrapping a
 *             {@code TextClassificationException}), or if creating the description fails
 */
private AnalysisEngineDescription getPreValidityCheckEngine() throws ResourceInitializationException {
    // check mandatory dimensions
    if (featureExtractors == null) {
        TextClassificationException cause = new TextClassificationException("No feature extractors have been added to the experiment.");
        throw new ResourceInitializationException(cause);
    }

    // alternating parameter name / parameter value pairs
    Object[] configuration = {
        ValidityCheckConnector.PARAM_LEARNING_MODE, learningMode,
        ValidityCheckConnector.PARAM_FEATURE_MODE, featureMode,
        ValidityCheckConnector.PARAM_BIPARTITION_THRESHOLD, threshold,
        ValidityCheckConnector.PARAM_FEATURE_EXTRACTORS, getFeatureExtractorNames(featureExtractors)
    };
    return createEngineDescription(ValidityCheckConnector.class, configuration);
}
Also used : TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) ArrayList(java.util.ArrayList)

Example 9 with TextClassificationException

use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.

the class InstanceExtractor method getUnitInstances.

/**
 * Creates one {@code Instance} per {@code TextClassificationTarget} found in the CAS.
 *
 * Each instance receives the features of all configured feature extractors, its
 * outcomes, its weight, the id of the CAS and the id of the target as sequence position.
 * When {@code addInstanceId} is set, the instance id feature is added first.
 *
 * @param jcas
 *            the CAS to extract instances from
 * @param supportSparseFeatures
 *            selects {@code getSparse} instead of {@code getDense} for feature extraction
 * @return the instances, in the iteration order of the selected targets
 * @throws TextClassificationException
 *             if a configured extractor does not implement {@code FeatureExtractor}
 */
public List<Instance> getUnitInstances(JCas jcas, boolean supportSparseFeatures) throws TextClassificationException {
    int casId = JCasUtil.selectSingle(jcas, JCasId.class).getId();
    Collection<TextClassificationTarget> units = JCasUtil.select(jcas, TextClassificationTarget.class);
    List<Instance> result = new ArrayList<Instance>();

    for (TextClassificationTarget unit : units) {
        Instance current = new Instance();

        if (addInstanceId) {
            current.addFeature(InstanceIdFeature.retrieve(jcas, unit));
        }

        for (FeatureExtractorResource_ImplBase extractor : featureExtractors) {
            boolean isUnitExtractor = extractor instanceof FeatureExtractor;
            if (!isUnitExtractor) {
                throw new TextClassificationException("Feature extractor does not implement interface [" + FeatureExtractor.class.getName() + "]: " + extractor.getResourceName());
            }
            if (!supportSparseFeatures) {
                current.addFeatures(getDense(jcas, unit, extractor));
            } else {
                current.addFeatures(getSparse(jcas, unit, extractor));
            }
        }

        // set and write outcome label(s)
        current.setOutcomes(getOutcomes(jcas, unit));
        current.setWeight(getWeight(jcas, unit));

        // bookkeeping: which CAS and which position the instance came from
        current.setJcasId(casId);
        current.setSequencePosition(unit.getId());

        result.add(current);
    }
    return result;
}
Also used : JCasId(org.dkpro.tc.api.type.JCasId) FeatureExtractor(org.dkpro.tc.api.features.FeatureExtractor) PairFeatureExtractor(org.dkpro.tc.api.features.PairFeatureExtractor) Instance(org.dkpro.tc.api.features.Instance) TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) ArrayList(java.util.ArrayList) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) Feature(org.dkpro.tc.api.features.Feature) InstanceIdFeature(org.dkpro.tc.core.feature.InstanceIdFeature) FeatureExtractorResource_ImplBase(org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase)

Example 10 with TextClassificationException

use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.

the class InstanceExtractor method getWeight.

/**
 * Reads the instance weight from the outcome annotations.
 *
 * Outcomes are taken from the whole CAS when {@code unit} is null, otherwise only
 * those covered by {@code unit}. If several outcomes are found, the weight of the
 * last one iterated is returned.
 *
 * @param jcas
 *            the CAS to read outcome annotations from
 * @param unit
 *            the annotation restricting the lookup, or null for the whole CAS
 * @return the weight of the last outcome annotation iterated
 * @throws TextClassificationException
 *             if no outcome annotation is found
 */
private double getWeight(JCas jcas, AnnotationFS unit) throws TextClassificationException {
    Collection<TextClassificationOutcome> found = (unit == null)
            ? JCasUtil.select(jcas, TextClassificationOutcome.class)
            : JCasUtil.selectCovered(jcas, TextClassificationOutcome.class, unit);

    if (found.isEmpty()) {
        throw new TextClassificationException("No instance weight annotation present in current CAS.");
    }

    // each iteration overwrites the previous value, so the last outcome wins
    double result = -1.0;
    for (TextClassificationOutcome annotation : found) {
        result = annotation.getWeight();
    }
    return result;
}
Also used : TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome)

Aggregations

TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException)25 ArrayList (java.util.ArrayList)10 TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget)7 AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException)6 IOException (java.io.IOException)5 Feature (org.dkpro.tc.api.features.Feature)5 File (java.io.File)4 JCas (org.apache.uima.jcas.JCas)4 ResourceInitializationException (org.apache.uima.resource.ResourceInitializationException)4 FeatureExtractorResource_ImplBase (org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase)4 JCasId (org.dkpro.tc.api.type.JCasId)4 TextClassificationOutcome (org.dkpro.tc.api.type.TextClassificationOutcome)4 CASException (org.apache.uima.cas.CASException)3 PairFeatureExtractor (org.dkpro.tc.api.features.PairFeatureExtractor)3 FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)2 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)2 SimilarityException (dkpro.similarity.algorithms.api.SimilarityException)2 HashSet (java.util.HashSet)2 FeatureExtractor (org.dkpro.tc.api.features.FeatureExtractor)2 Instance (org.dkpro.tc.api.features.Instance)2