Search in sources :

Example 16 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class CosineSimilarityTest method testCosSimWithPosTags.

/**
 * Runs the pipeline with {@code CosineFeatureExtractor.PARAM_NGRAM_ANNO_TYPE} set to the POS
 * annotation type and verifies the outcome: exactly one feature name,
 * {@code "SimilarityCosineSimilarity"}, and a value of 0.2 (within {@code epsilon}) for every
 * feature of the first instance.
 *
 * The same {@code lucenePath} is passed as the extractor's source location and as the meta
 * collector's target location.
 */
@Test
public void testCosSimWithPosTags() throws Exception {
    CosineSimilarityTest test = new CosineSimilarityTest();
    test.initialize();
    test.parameters = new Object[] { CosineFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123", CosineFeatureExtractor.PARAM_SOURCE_LOCATION, test.lucenePath.toString(), IdfPairMetaCollector.PARAM_TARGET_LOCATION, test.lucenePath.toString(), CosineFeatureExtractor.PARAM_NGRAM_ANNO_TYPE, "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" };
    test.runPipeline();
    // Check the size before calling first(), so an empty set is reported as an assertion
    // failure. JUnit expects (expected, actual); the expected value goes first.
    assertEquals(1, test.featureNames.size());
    // assertEquals reports the actual name on failure; assertTrue(equals(..)) would not.
    assertEquals("SimilarityCosineSimilarity", test.featureNames.first());
    for (Feature feat : test.instanceList.get(0).getFeatures()) {
        assertEquals(0.2, (double) feat.getValue(), epsilon);
    }
}
Also used : Feature(org.dkpro.tc.api.features.Feature) Test(org.junit.Test)

Example 17 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class LuceneKeywordPFE method extract.

/**
 * Collects keyword n-gram features for a pair of views.
 *
 * Keyword n-grams are gathered per view (each restricted to that view's single
 * {@code TextClassificationTarget}) and for both views together via
 * {@code getViewNgrams}. Which of them are turned into features is controlled by the
 * {@code useView1NgramsAsFeatures}, {@code useView2NgramsAsFeatures},
 * {@code useViewBlindNgramsAsFeatures} and {@code markViewBlindNgramsWithLocalView} fields.
 *
 * @param view1 first view of the pair
 * @param view2 second view of the pair
 * @return the features accumulated over all enabled groups; empty if none is enabled
 * @throws TextClassificationException declared by the extractor contract
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    TextClassificationTarget target1 = JCasUtil.selectSingle(view1, TextClassificationTarget.class);
    TextClassificationTarget target2 = JCasUtil.selectSingle(view2, TextClassificationTarget.class);

    FrequencyDistribution<String> ngramsOfView1 = KeywordNGramUtils.getDocumentKeywordNgrams(view1, target1, ngramMinN1, ngramMaxN1, markSentenceBoundary, markSentenceLocation, includeCommas, keywords);
    FrequencyDistribution<String> ngramsOfView2 = KeywordNGramUtils.getDocumentKeywordNgrams(view2, target2, ngramMinN2, ngramMaxN2, markSentenceBoundary, markSentenceLocation, includeCommas, keywords);
    // Computed unconditionally, even when the view-blind features are disabled.
    FrequencyDistribution<String> ngramsOfBothViews = getViewNgrams(view1, view2);

    Set<Feature> collected = new HashSet<>();

    // The prefix field is set right before each addToFeatureArray call (presumably read
    // there to name the features), so every assignment must stay paired with its call.
    if (useView1NgramsAsFeatures) {
        prefix = "keyNG1";
        collected = addToFeatureArray(ngramsOfView1, topKSetView1, collected);
    }

    if (useView2NgramsAsFeatures) {
        prefix = "keyNG2";
        collected = addToFeatureArray(ngramsOfView2, topKSetView2, collected);
    }

    if (useViewBlindNgramsAsFeatures) {
        if (!markViewBlindNgramsWithLocalView) {
            // One shared group built from the n-grams of both views.
            prefix = "keyNG";
            collected = addToFeatureArray(ngramsOfBothViews, topKSet, collected);
        } else {
            // Same top-k set, but applied per view under view-specific prefixes.
            prefix = "keyNGall1";
            collected = addToFeatureArray(ngramsOfView1, topKSet, collected);
            prefix = "keyNGall2";
            collected = addToFeatureArray(ngramsOfView2, topKSet, collected);
        }
    }

    return collected;
}
Also used : TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) Feature(org.dkpro.tc.api.features.Feature) HashSet(java.util.HashSet)

Example 18 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class LuceneNGramPFE method extract.

/**
 * Collects token n-gram features for a pair of views.
 *
 * N-grams are gathered per view (each restricted to that view's single
 * {@code TextClassificationTarget}, using {@code Token} annotations) and for both views
 * together via {@code getViewNgrams}. The {@code useView1NgramsAsFeatures},
 * {@code useView2NgramsAsFeatures}, {@code useViewBlindNgramsAsFeatures} and
 * {@code markViewBlindNgramsWithLocalView} fields select which groups become features.
 *
 * @param view1 first view of the pair
 * @param view2 second view of the pair
 * @return the features accumulated over all enabled groups; empty if none is enabled
 * @throws TextClassificationException declared by the extractor contract
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    final TextClassificationTarget firstTarget = JCasUtil.selectSingle(view1, TextClassificationTarget.class);
    final TextClassificationTarget secondTarget = JCasUtil.selectSingle(view2, TextClassificationTarget.class);

    final FrequencyDistribution<String> firstViewNgrams = NGramUtils.getDocumentNgrams(view1, firstTarget, ngramLowerCase, filterPartialStopwordMatches, ngramMinN1, ngramMaxN1, stopwords, Token.class);
    final FrequencyDistribution<String> secondViewNgrams = NGramUtils.getDocumentNgrams(view2, secondTarget, ngramLowerCase, filterPartialStopwordMatches, ngramMinN2, ngramMaxN2, stopwords, Token.class);
    // Computed unconditionally, even when the view-blind features are disabled.
    final FrequencyDistribution<String> combinedNgrams = getViewNgrams(view1, view2);

    Set<Feature> result = new HashSet<>();

    // Each prefix assignment belongs to the addToFeatureArray call that follows it
    // (presumably the helper reads the field when naming features); do not reorder.
    if (useView1NgramsAsFeatures) {
        prefix = "view1NG";
        result = addToFeatureArray(firstViewNgrams, topKSetView1, result);
    }

    if (useView2NgramsAsFeatures) {
        prefix = "view2NG";
        result = addToFeatureArray(secondViewNgrams, topKSetView2, result);
    }

    if (useViewBlindNgramsAsFeatures) {
        if (markViewBlindNgramsWithLocalView) {
            // View-blind top-k set applied to each view separately, with per-view prefixes.
            prefix = "view1allNG";
            result = addToFeatureArray(firstViewNgrams, topKSet, result);
            prefix = "view2allNG";
            result = addToFeatureArray(secondViewNgrams, topKSet, result);
        } else {
            // Single group over the n-grams of both views.
            prefix = "allNG";
            result = addToFeatureArray(combinedNgrams, topKSet, result);
        }
    }

    return result;
}
Also used : TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Feature(org.dkpro.tc.api.features.Feature) HashSet(java.util.HashSet)

Example 19 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class SentenceRatioPerDocument method extract.

/**
 * Produces a single numeric feature named {@code FEATURE_NAME}. Its value is whatever
 * {@code getRatio} returns for the number of sentences covered by the target and the
 * value of {@code getMax()}.
 *
 * @param jcas    the CAS to read sentences from
 * @param aTarget the target annotation whose covered sentences are counted
 * @return a set containing only the ratio feature
 * @throws TextClassificationException declared by the extractor contract
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    final long upperBound = getMax();
    final int sentenceCount = JCasUtil.selectCovered(jcas, Sentence.class, aTarget).size();
    // Kept as a double local so the value handed to Feature has the same type as before.
    final double ratio = getRatio(sentenceCount, upperBound);
    Feature ratioFeature = new Feature(FEATURE_NAME, ratio, FeatureType.NUMERIC);
    return ratioFeature.asSet();
}
Also used : Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) Feature(org.dkpro.tc.api.features.Feature)

Example 20 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class SkipWordNGram method extract.

/**
 * Emits one boolean feature per key of {@code topKSet}, named
 * {@code getFeaturePrefix() + "_" + ngram}. The value is 1 when the skip n-gram occurs
 * among the keys of the document's skip n-gram distribution and 0 otherwise.
 *
 * @param jcas    the CAS to extract skip n-grams from
 * @param aTarget the target annotation the extraction is restricted to
 * @return one feature per top-k skip n-gram
 * @throws TextClassificationException declared by the extractor contract
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    Set<Feature> result = new HashSet<>();
    FrequencyDistribution<String> skipNgramsInDocument = SkipWordNGramMC.getDocumentSkipNgrams(jcas, aTarget, ngramLowerCase, filterPartialStopwordMatches, ngramMinN, ngramMaxN, skipSize, stopwords);
    for (String candidate : topKSet.getKeys()) {
        boolean present = skipNgramsInDocument.getKeys().contains(candidate);
        String featureName = getFeaturePrefix() + "_" + candidate;
        // The absent case uses the four-argument constructor with an extra `true` flag
        // (presumably marking 0 as the default value; verify against Feature).
        Feature feature = present
                ? new Feature(featureName, 1, FeatureType.BOOLEAN)
                : new Feature(featureName, 0, true, FeatureType.BOOLEAN);
        result.add(feature);
    }
    return result;
}
Also used : Feature(org.dkpro.tc.api.features.Feature) HashSet(java.util.HashSet)

Aggregations

Feature (org.dkpro.tc.api.features.Feature): 94; Test (org.junit.Test): 48; Instance (org.dkpro.tc.api.features.Instance): 30; ArrayList (java.util.ArrayList): 29; HashSet (java.util.HashSet): 21; FeatureTestUtil.assertFeature (org.dkpro.tc.testing.FeatureTestUtil.assertFeature): 17; AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine): 16; TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget): 16; JCas (org.apache.uima.jcas.JCas): 15; AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription): 13; File (java.io.File): 8; Attribute (weka.core.Attribute): 8; Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token): 7; Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence): 6; TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException): 5; Chunk (de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk): 4; OpenNlpPosTagger (de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger): 4; BreakIteratorSegmenter (de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter): 4; Instances (weka.core.Instances): 4; IOException (java.io.IOException): 3