use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class TokenRatioPerDocument method extract.
/**
 * Emits a single numeric feature: the value of {@code getRatio} applied to the
 * number of {@link Token} annotations covered by the target and the value
 * returned by {@code getMax()}.
 *
 * @param jcas
 *            the CAS the tokens are selected from
 * @param aTarget
 *            the annotation whose covered tokens are counted
 * @return a set containing exactly one feature named {@code FEATURE_NAME}
 * @throws TextClassificationException
 *             declared by the overridden signature
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    long upperBound = getMax();
    int tokenCount = JCasUtil.selectCovered(jcas, Token.class, aTarget).size();

    // Keep the result in a double so the feature value is always boxed as a Double.
    double tokenRatio = getRatio(tokenCount, upperBound);

    Feature ratioFeature = new Feature(FEATURE_NAME, tokenRatio, FeatureType.NUMERIC);
    return ratioFeature.asSet();
}
use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class KeywordNGram method extract.
/**
 * Builds one boolean feature per n-gram in {@code topKSet}, stating whether that
 * n-gram occurs among the keyword n-grams collected for the target.
 * <p>
 * N-grams that occur get the value {@code 1}; n-grams that do not occur get the
 * value {@code 0} and are created through the constructor variant that takes an
 * additional {@code true} flag.
 *
 * @param jcas
 *            the CAS the keyword n-grams are collected from
 * @param aTarget
 *            the annotation delimiting the text to inspect
 * @return one feature per entry of {@code topKSet}, named
 *         {@code getFeaturePrefix() + "_" + ngram}
 * @throws TextClassificationException
 *             declared by the overridden signature
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    Set<Feature> features = new HashSet<Feature>();

    // The lower bound must be the configured minimum n-gram length. Passing the
    // maximum for both bounds restricts the lookup to n-grams of maximal length,
    // so shorter top-k n-grams could never be reported as present.
    FrequencyDistribution<String> documentNgrams = KeywordNGramUtils.getDocumentKeywordNgrams(jcas, aTarget,
            ngramMinN, ngramMaxN, markSentenceBoundary, markSentenceLocation, includeCommas, keywords);

    for (String topNgram : topKSet.getKeys()) {
        boolean present = documentNgrams.getKeys().contains(topNgram);
        String featureName = getFeaturePrefix() + "_" + topNgram;
        if (present) {
            features.add(new Feature(featureName, 1, FeatureType.BOOLEAN));
        } else {
            features.add(new Feature(featureName, 0, true, FeatureType.BOOLEAN));
        }
    }
    return features;
}
use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class PosNGram method extract.
/**
 * Builds one boolean feature per n-gram in {@code topKSet}, stating whether that
 * n-gram occurs among the part-of-speech n-grams collected for the classification
 * unit.
 * <p>
 * N-grams that occur get the value {@code 1}; n-grams that do not occur get the
 * value {@code 0} and are created through the constructor variant that takes an
 * additional {@code true} flag.
 *
 * @param view
 *            the CAS the part-of-speech n-grams are collected from
 * @param classificationUnit
 *            the annotation delimiting the text to inspect
 * @return one feature per entry of {@code topKSet}, named
 *         {@code getFeaturePrefix() + "_" + ngram}
 * @throws TextClassificationException
 *             declared by the overridden signature
 */
@Override
public Set<Feature> extract(JCas view, TextClassificationTarget classificationUnit) throws TextClassificationException {
    FrequencyDistribution<String> posNgramsInUnit = PosNGramMC.getDocumentPosNgrams(view, classificationUnit,
            ngramMinN, ngramMaxN, useCanonicalTags);

    Set<Feature> result = new HashSet<Feature>();
    for (String candidate : topKSet.getKeys()) {
        boolean occurs = posNgramsInUnit.getKeys().contains(candidate);
        String featureName = getFeaturePrefix() + "_" + candidate;
        Feature feature = occurs
                ? new Feature(featureName, 1, FeatureType.BOOLEAN)
                : new Feature(featureName, 0, true, FeatureType.BOOLEAN);
        result.add(feature);
    }
    return result;
}
use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class NumberOfHashTagsTest method numberOfHashTagsFeatureExtractorTest.
/**
 * Runs {@link NumberOfHashTags} over a short English text containing three
 * hashtags and expects exactly one feature, named after the extractor class,
 * with the value 3.
 */
@Test
public void numberOfHashTagsFeatureExtractorTest() throws Exception {
    // A no-op engine is enough here; it is only used to obtain and process a CAS.
    AnalysisEngine engine = createEngine(createEngineDescription(NoOpAnnotator.class));

    JCas jcas = engine.newJCas();
    jcas.setDocumentLanguage("en");
    jcas.setDocumentText("This is a very #emotional tweet ;-) #icouldcry #ILoveHashTags");
    engine.process(jcas);

    // The target spans the whole document text.
    int documentLength = jcas.getDocumentText().length();
    TextClassificationTarget aTarget = new TextClassificationTarget(jcas, 0, documentLength);
    aTarget.addToIndexes();

    NumberOfHashTags extractor = new NumberOfHashTags();
    List<Feature> extracted = new ArrayList<Feature>(extractor.extract(jcas, aTarget));

    Assert.assertEquals(1, extracted.size());
    // The size assertion above guarantees exactly one element to verify.
    assertFeature(NumberOfHashTags.class.getSimpleName(), 3, extracted.get(0));
}
use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class LibsvmDataFormatLoadModelConnector method createInputFile.
/**
 * Writes the instances extracted from the given CAS to a temporary text file and
 * returns that file.
 * <p>
 * Each instance becomes one line: {@code OUTCOME_PLACEHOLDER}, followed by the
 * result of {@code injectSequenceId(instance)}, followed by one tab-separated
 * {@code mappedName:value} entry for every feature that passes
 * {@code sanityCheckValue}. The mapped name is looked up in
 * {@code featureMapping} by feature name.
 * <p>
 * The file is registered for deletion on JVM exit. The writer is closed even
 * when extraction or writing fails, so no file handle is leaked on error.
 *
 * @param jcas
 *            the CAS the instances are extracted from
 * @return the temporary file containing the written instances
 * @throws Exception
 *             if the file cannot be created or written, or if instance
 *             extraction fails
 */
private File createInputFile(JCas jcas) throws Exception {
    File tempFile = FileUtil.createTempFile("libsvm", ".txt");
    tempFile.deleteOnExit();

    // try-with-resources guarantees the writer is closed on every exit path.
    try (BufferedWriter bw = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(tempFile), "utf-8"))) {
        InstanceExtractor extractor = new InstanceExtractor(featureMode, featureExtractors, true);
        List<Instance> instances = extractor.getInstances(jcas, true);
        for (Instance instance : instances) {
            bw.write(OUTCOME_PLACEHOLDER);
            bw.write(injectSequenceId(instance));
            for (Feature f : instance.getFeatures()) {
                // Features rejected by the sanity check are left out of the line.
                if (!sanityCheckValue(f)) {
                    continue;
                }
                bw.write("\t");
                bw.write(featureMapping.get(f.getName()) + ":" + f.getValue());
            }
            bw.write("\n");
        }
    }
    return tempFile;
}
Aggregations