Search in sources :

Example 56 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class WordNGramTest method evaluateExtractedFeatures.

/**
 * Verifies the instances produced by the feature extraction run: four instances that share a
 * single outcome, and exactly three distinct feature names across all of them.
 *
 * @param output
 *            the file passed to {@code readInstances} to load the extracted instances
 * @throws Exception
 *             if loading or inspecting the instances raises one
 */
@Override
protected void evaluateExtractedFeatures(File output) throws Exception {
    List<Instance> extracted = readInstances(output);
    assertEquals(4, extracted.size());
    assertEquals(1, getUniqueOutcomes(extracted));

    // Gather every feature name seen on any instance; the set drops duplicates.
    Set<String> seenNames = new HashSet<>();
    for (Instance instance : extracted) {
        for (Feature feature : instance.getFeatures()) {
            seenNames.add(feature.getName());
        }
    }

    assertEquals(3, seenNames.size());
    for (String expectedName : new String[] { "ngram_4", "ngram_5", "ngram_5_5" }) {
        assertTrue(seenNames.contains(expectedName));
    }
}
Also used : Instance(org.dkpro.tc.api.features.Instance) Feature(org.dkpro.tc.api.features.Feature) HashSet(java.util.HashSet)

Example 57 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class CosineSimilarityTest method testCosSimDefaultTfIdf.

/**
 * Tests TFIDF Cosine Similarity with TF weight FREQUENCY_LOGPLUSONE, IDF weight PASSTHROUGH,
 * and normalization mode L2. <br />
 *
 * Answer 0.2 for Tokens confirmed by following equation 15.2, pg 541, in Manning and Schuetze.
 * <br />
 * Vector1 = 1,.5,0,1,.5,0 <br />
 * Vector2 = 0,.5,1,0,.5,1 <br />
 * Sum of vector products (svp) = (1x0)+(.5x.5)+(0x1)+(1x0)+(.5x.5)+(0x1) =.5 <br />
 * normVector1 = sqrt(sum(i in finalVector1, ^2)) = sqrt(1+.25+0+1+.25+0) = 1.58 <br />
 * normVector2 = sqrt(sum(i in finalVector2, ^2)) = sqrt(0+.25+1+0+.25+1) = 1.58 <br />
 * CosSim = svp/(normVector1*normVector2) = 0.5 / (1.58*1.58) = 0.2 <br />
 *
 * @throws Exception
 *             if initializing the test fixture or running the pipeline fails
 */
@Test
public void testCosSimDefaultTfIdf() throws Exception {
    CosineSimilarityTest test = new CosineSimilarityTest();
    test.initialize();
    test.parameters = new Object[] {
            CosineFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            CosineFeatureExtractor.PARAM_SOURCE_LOCATION, test.lucenePath.toString(),
            IdfPairMetaCollector.PARAM_TARGET_LOCATION, test.lucenePath.toString() };
    test.runPipeline();

    // assertEquals takes (expected, actual); keeping that order makes failure messages
    // report the observed value as "actual" instead of the other way round.
    assertEquals("SimilarityCosineSimilarity", test.featureNames.first());
    assertEquals(1, test.featureNames.size());

    // Every feature of the first instance must carry the hand-computed similarity.
    for (Feature feat : test.instanceList.get(0).getFeatures()) {
        assertEquals(0.2, (double) feat.getValue(), epsilon);
    }
}
Also used : Feature(org.dkpro.tc.api.features.Feature) Test(org.junit.Test)

Example 58 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class CosineSimilarityTest method testCosSimWithStems.

/**
 * Runs the cosine similarity pipeline with the n-gram annotation type set to
 * {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem} and expects a single
 * feature named {@code SimilarityCosineSimilarity} whose value is 0.2 (within
 * {@code epsilon}) on the first instance.
 *
 * @throws Exception
 *             if initializing the test fixture or running the pipeline fails
 */
@Test
public void testCosSimWithStems() throws Exception {
    CosineSimilarityTest test = new CosineSimilarityTest();
    test.initialize();
    test.parameters = new Object[] {
            CosineFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            CosineFeatureExtractor.PARAM_SOURCE_LOCATION, test.lucenePath.toString(),
            IdfPairMetaCollector.PARAM_TARGET_LOCATION, test.lucenePath.toString(),
            CosineFeatureExtractor.PARAM_NGRAM_ANNO_TYPE, "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem" };
    test.runPipeline();

    // assertEquals takes (expected, actual); keeping that order makes failure messages
    // report the observed value as "actual" instead of the other way round.
    assertEquals("SimilarityCosineSimilarity", test.featureNames.first());
    assertEquals(1, test.featureNames.size());

    // Every feature of the first instance must carry the expected similarity.
    for (Feature feat : test.instanceList.get(0).getFeatures()) {
        assertEquals(0.2, (double) feat.getValue(), epsilon);
    }
}
Also used : Feature(org.dkpro.tc.api.features.Feature) Test(org.junit.Test)

Example 59 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class SkipCharacterNGram method extract.

/**
 * Produces one BOOLEAN feature for each n-gram key in {@code topKSet}. The feature value is 1
 * when the n-gram is among the character skip n-grams collected for the given target, and 0
 * otherwise.
 *
 * @param jcas
 *            the CAS handed to the skip n-gram collector
 * @param aTarget
 *            the classification target handed to the skip n-gram collector
 * @return the set of created features
 * @throws TextClassificationException
 *             declared by the overridden method
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    Set<Feature> result = new HashSet<>();
    FrequencyDistribution<String> targetNgrams = SkipCharacterNGramMC.getCharacterSkipNgrams(jcas, aTarget, ngramLowerCase, ngramMinN, ngramMaxN, charSkipSize);
    for (String candidate : topKSet.getKeys()) {
        boolean present = targetNgrams.getKeys().contains(candidate);
        String featureName = getFeaturePrefix() + "_" + candidate;
        // NOTE(review): the extra 'true' on the absent branch presumably marks the value as
        // the feature's default -- confirm against the Feature constructor.
        Feature feature = present
                ? new Feature(featureName, 1, FeatureType.BOOLEAN)
                : new Feature(featureName, 0, true, FeatureType.BOOLEAN);
        result.add(feature);
    }
    return result;
}
Also used : Feature(org.dkpro.tc.api.features.Feature) HashSet(java.util.HashSet)

Example 60 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class WordNGram method extract.

/**
 * Produces one BOOLEAN feature for each n-gram key in {@code topKSet}. The feature value is 1
 * when the n-gram is among the annotation n-grams collected for the given target, and 0
 * otherwise.
 *
 * @param jcas
 *            the CAS handed to {@code NGramUtils.getAnnotationNgrams}
 * @param aTarget
 *            the classification target handed to {@code NGramUtils.getAnnotationNgrams}
 * @return the set of created features
 * @throws TextClassificationException
 *             declared by the overridden method
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    Set<Feature> result = new HashSet<>();
    FrequencyDistribution<String> targetNgrams = NGramUtils.getAnnotationNgrams(jcas, aTarget, ngramLowerCase, filterPartialStopwordMatches, ngramMinN, ngramMaxN, stopwords);
    for (String candidate : topKSet.getKeys()) {
        boolean present = targetNgrams.getKeys().contains(candidate);
        String featureName = getFeaturePrefix() + "_" + candidate;
        // NOTE(review): the extra 'true' on the absent branch presumably marks the value as
        // the feature's default -- confirm against the Feature constructor.
        Feature feature = present
                ? new Feature(featureName, 1, FeatureType.BOOLEAN)
                : new Feature(featureName, 0, true, FeatureType.BOOLEAN);
        result.add(feature);
    }
    return result;
}
Also used : Feature(org.dkpro.tc.api.features.Feature) HashSet(java.util.HashSet)

Aggregations

Feature (org.dkpro.tc.api.features.Feature): 94, Test (org.junit.Test): 48, Instance (org.dkpro.tc.api.features.Instance): 30, ArrayList (java.util.ArrayList): 29, HashSet (java.util.HashSet): 21, FeatureTestUtil.assertFeature (org.dkpro.tc.testing.FeatureTestUtil.assertFeature): 17, AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine): 16, TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget): 16, JCas (org.apache.uima.jcas.JCas): 15, AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription): 13, File (java.io.File): 8, Attribute (weka.core.Attribute): 8, Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token): 7, Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence): 6, TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException): 5, Chunk (de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk): 4, OpenNlpPosTagger (de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger): 4, BreakIteratorSegmenter (de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter): 4, Instances (weka.core.Instances): 4, IOException (java.io.IOException): 3