use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class WordNGramTest method evaluateExtractedFeatures.
/**
 * Checks the instances read back from {@code output}: there must be four instances sharing a
 * single unique outcome, and across all of them exactly three distinct feature names
 * ({@code ngram_4}, {@code ngram_5} and {@code ngram_5_5}).
 *
 * @param output
 *            the file handed to {@code readInstances}
 * @throws Exception
 *             propagated from reading the instances
 */
@Override
protected void evaluateExtractedFeatures(File output) throws Exception {
    List<Instance> loaded = readInstances(output);
    assertEquals(4, loaded.size());
    assertEquals(1, getUniqueOutcomes(loaded));

    // Gather every feature name that occurs in any instance; the set removes duplicates.
    Set<String> seenNames = new HashSet<String>();
    for (Instance instance : loaded) {
        for (Feature feature : instance.getFeatures()) {
            seenNames.add(feature.getName());
        }
    }

    assertEquals(3, seenNames.size());
    for (String expectedName : new String[] { "ngram_4", "ngram_5", "ngram_5_5" }) {
        assertTrue(seenNames.contains(expectedName));
    }
}
use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class CosineSimilarityTest method testCosSimDefaultTfIdf.
/**
 * Tests TFIDF Cosine Similarity with TF weight FREQUENCY_LOGPLUSONE, IDF weight PASSTHROUGH,
 * and normalization mode L2.
 * <p>
 * Answer 0.2 for Tokens confirmed by following equation 15.2, pg 541, in Manning and Schuetze.
 * <br>
 * Vector1 = 1,.5,0,1,.5,0 <br>
 * Vector2 = 0,.5,1,0,.5,1 <br>
 * Sum of vector products (svp) = (1x0)+(.5x.5)+(0x1)+(1x0)+(.5x.5)+(0x1) = .5 <br>
 * normVector1 = sqrt(sum(i in finalVector1, ^2)) = sqrt(1+.25+0+1+.25+0) = 1.58 <br>
 * normVector2 = sqrt(sum(i in finalVector2, ^2)) = sqrt(0+.25+1+0+.25+1) = 1.58 <br>
 * CosSim = svp/(normVector1*normVector2) = 0.5 / (1.58*1.58) = 0.2
 * <p>
 * The test expects exactly one feature name, {@code SimilarityCosineSimilarity}, and checks
 * every feature of the first instance against 0.2 within {@code epsilon}.
 *
 * @throws Exception
 *             if initializing the helper instance or running the pipeline fails
 */
@Test
public void testCosSimDefaultTfIdf() throws Exception {
    CosineSimilarityTest test = new CosineSimilarityTest();
    test.initialize();
    test.parameters = new Object[] {
            CosineFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            CosineFeatureExtractor.PARAM_SOURCE_LOCATION, test.lucenePath.toString(),
            IdfPairMetaCollector.PARAM_TARGET_LOCATION, test.lucenePath.toString() };
    test.runPipeline();

    // Expected value first, actual second: a failure then reports the real mismatch.
    assertEquals("SimilarityCosineSimilarity", test.featureNames.first());
    assertEquals(1, test.featureNames.size());

    for (Feature feat : test.instanceList.get(0).getFeatures()) {
        assertEquals(0.2, (double) feat.getValue(), epsilon);
    }
}
use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class CosineSimilarityTest method testCosSimWithStems.
/**
 * Runs the cosine-similarity pipeline with {@code PARAM_NGRAM_ANNO_TYPE} set to the DKPro
 * {@code Stem} annotation type instead of the extractor's default.
 * <p>
 * The test expects exactly one feature name, {@code SimilarityCosineSimilarity}, and checks
 * every feature of the first instance against 0.2 within {@code epsilon}.
 *
 * @throws Exception
 *             if initializing the helper instance or running the pipeline fails
 */
@Test
public void testCosSimWithStems() throws Exception {
    CosineSimilarityTest test = new CosineSimilarityTest();
    test.initialize();
    test.parameters = new Object[] {
            CosineFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            CosineFeatureExtractor.PARAM_SOURCE_LOCATION, test.lucenePath.toString(),
            IdfPairMetaCollector.PARAM_TARGET_LOCATION, test.lucenePath.toString(),
            CosineFeatureExtractor.PARAM_NGRAM_ANNO_TYPE,
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem" };
    test.runPipeline();

    // Expected value first, actual second: a failure then reports the real mismatch.
    assertEquals("SimilarityCosineSimilarity", test.featureNames.first());
    assertEquals(1, test.featureNames.size());

    for (Feature feat : test.instanceList.get(0).getFeatures()) {
        assertEquals(0.2, (double) feat.getValue(), epsilon);
    }
}
use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class SkipCharacterNGram method extract.
/**
 * Builds one boolean feature for every key of {@code topKSet}, named
 * {@code getFeaturePrefix() + "_" + key}. The value is 1 when the key occurs among the
 * character skip n-grams computed for the given target and 0 otherwise.
 *
 * @param jcas
 *            passed through to {@code SkipCharacterNGramMC.getCharacterSkipNgrams}
 * @param aTarget
 *            passed through to {@code SkipCharacterNGramMC.getCharacterSkipNgrams}
 * @return the set of created features
 * @throws TextClassificationException
 *             as declared by the overridden method
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    FrequencyDistribution<String> observed = SkipCharacterNGramMC.getCharacterSkipNgrams(
            jcas, aTarget, ngramLowerCase, ngramMinN, ngramMaxN, charSkipSize);
    Set<Feature> result = new HashSet<Feature>();

    for (String candidate : topKSet.getKeys()) {
        boolean absent = !observed.getKeys().contains(candidate);
        String featureName = getFeaturePrefix() + "_" + candidate;

        if (absent) {
            // NOTE(review): the extra 'true' argument presumably flags a default value -
            // confirm against the Feature constructor.
            result.add(new Feature(featureName, 0, true, FeatureType.BOOLEAN));
            continue;
        }
        result.add(new Feature(featureName, 1, FeatureType.BOOLEAN));
    }
    return result;
}
use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
the class WordNGram method extract.
/**
 * Builds one boolean feature for every key of {@code topKSet}, named
 * {@code getFeaturePrefix() + "_" + key}. The value is 1 when the key occurs among the
 * n-grams returned by {@code NGramUtils.getAnnotationNgrams} for the given target and 0
 * otherwise.
 *
 * @param jcas
 *            passed through to {@code NGramUtils.getAnnotationNgrams}
 * @param aTarget
 *            passed through to {@code NGramUtils.getAnnotationNgrams}
 * @return the set of created features
 * @throws TextClassificationException
 *             as declared by the overridden method
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    Set<Feature> collected = new HashSet<Feature>();
    FrequencyDistribution<String> ngramsInTarget = NGramUtils.getAnnotationNgrams(
            jcas, aTarget, ngramLowerCase, filterPartialStopwordMatches, ngramMinN, ngramMaxN,
            stopwords);

    for (String ngram : topKSet.getKeys()) {
        boolean found = ngramsInTarget.getKeys().contains(ngram);
        // NOTE(review): the 4-argument constructor's 'true' presumably flags a default
        // value - confirm against the Feature constructor.
        Feature feature = found
                ? new Feature(getFeaturePrefix() + "_" + ngram, 1, FeatureType.BOOLEAN)
                : new Feature(getFeaturePrefix() + "_" + ngram, 0, true, FeatureType.BOOLEAN);
        collected.add(feature);
    }
    return collected;
}
Aggregations