Search in sources:

Example 66 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class CrfSuiteFeatureFormatExtractionIteratorTest method buildFeatures.

/**
 * Populates {@code fs} with five instances forming two sequences.
 * <p>
 * Sequence 0 holds instances 1-3 (positions 0, 1, 2) and sequence 1 holds instances 4-5
 * (positions 0, 1). All instances are assigned JCas id 0. Two feature lists are shared
 * between the instances: {@code features1} (instances 1, 3, 4) and {@code features2}
 * (instances 2, 5).
 *
 * @throws Exception declared for the benefit of callers; nothing in this body throws a
 *         checked exception directly
 */
private void buildFeatures() throws Exception {
    fs = new ArrayList<>();

    List<Feature> features1 = new ArrayList<Feature>();
    features1.add(new Feature("feature1", 1.0, FeatureType.NUMERIC));
    features1.add(new Feature("feature2", 0.0, FeatureType.NUMERIC));
    features1.add(new Feature("feature3", "Water", FeatureType.STRING));

    // Same feature names as features1, but feature2 is deliberately added before feature1.
    List<Feature> features2 = new ArrayList<Feature>();
    features2.add(new Feature("feature2", 0.5, FeatureType.NUMERIC));
    features2.add(new Feature("feature1", 0.5, FeatureType.NUMERIC));
    features2.add(new Feature("feature3", "Fanta", FeatureType.STRING));

    // --- sequence 0 ---
    Instance instance1 = new Instance(features1, "1");
    instance1.setJcasId(0);
    instance1.setSequenceId(0);
    instance1.setSequencePosition(0);

    // The JCas id is set on each instance itself; previously every call targeted instance1.
    Instance instance2 = new Instance(features2, "2");
    instance2.setJcasId(0);
    instance2.setSequenceId(0);
    instance2.setSequencePosition(1);

    Instance instance3 = new Instance(features1, "3");
    instance3.setJcasId(0);
    instance3.setSequenceId(0);
    instance3.setSequencePosition(2);

    // --- sequence 1 ---
    Instance instance4 = new Instance(features1, "4");
    instance4.setJcasId(0);
    instance4.setSequenceId(1);
    instance4.setSequencePosition(0);

    // NOTE(review): instance5 reuses outcome "4" (same as instance4) - confirm this is intended.
    Instance instance5 = new Instance(features2, "4");
    instance5.setJcasId(0);
    instance5.setSequenceId(1);
    instance5.setSequencePosition(1);

    fs.add(instance1);
    fs.add(instance2);
    fs.add(instance3);
    fs.add(instance4);
    fs.add(instance5);
}
Also used : Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) Feature(org.dkpro.tc.api.features.Feature)

Example 67 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class TypeTokenPairFeatureExtractorTest method testExtract.

/**
 * Checks that the pair extractor yields exactly one feature for {@code jcas1}/{@code jcas2},
 * named "DiffTypeTokenRatio" with a value of 1.33 (tolerance 0.1).
 */
@Test
public void testExtract() throws TextClassificationException {
    Set<Feature> extracted = new TypeTokenPairFeatureExtractor().extract(jcas1, jcas2);

    assertEquals(1, extracted.size());

    for (Feature ratioFeature : extracted) {
        assertFeature("DiffTypeTokenRatio", 1.33, ratioFeature, 0.1);
    }
}
Also used : TypeTokenPairFeatureExtractor(org.dkpro.tc.features.pair.core.style.TypeTokenPairFeatureExtractor) FeatureTestUtil.assertFeature(org.dkpro.tc.testing.FeatureTestUtil.assertFeature) Feature(org.dkpro.tc.api.features.Feature) Test(org.junit.Test)

Example 68 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class AdjectiveEndingFeatureExtractor method extract.

/**
 * Computes, for the given target, the percentage of adjectives ending in each of eight
 * suffixes ("able", "al", "ful", "ible", "ic", "ive", "less", "ous") and the percentage of
 * adverbs ending in "ly".
 * <p>
 * Each adjective is counted for at most one suffix (the first matching branch wins). When no
 * adjective (respectively adverb) is covered by the target, the corresponding features get the
 * value 0 and are flagged as default values.
 *
 * @param jcas    the JCas holding the POS annotations
 * @param aTarget the target whose covered annotations are counted
 * @return a set with the eight adjective-ending features and the adverb-ending feature
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) {
    // Counters are doubles so that the percentage computation below is a floating-point division.
    double able = 0;
    double al = 0;
    double ful = 0;
    double ible = 0;
    double ic = 0;
    double ive = 0;
    double less = 0;
    double ous = 0;
    double ly = 0;

    int n = 0; // number of adjectives covered by the target
    for (POS_ADJ adj : JCasUtil.selectCovered(jcas, POS_ADJ.class, aTarget)) {
        n++;
        String text = adj.getCoveredText().toLowerCase();
        if (text.endsWith("able")) {
            able++;
        } else if (text.endsWith("al")) {
            al++;
        } else if (text.endsWith("ful")) {
            ful++;
        } else if (text.endsWith("ible")) {
            ible++;
        } else if (text.endsWith("ic")) {
            ic++;
        } else if (text.endsWith("ive")) {
            ive++;
        } else if (text.endsWith("less")) {
            less++;
        } else if (text.endsWith("ous")) {
            ous++;
        }
    }

    int m = 0; // number of adverbs covered by the target
    // Restricted to the target, consistent with the adjective loop above; previously adverbs
    // were counted over the whole JCas regardless of aTarget.
    for (POS_ADV adv : JCasUtil.selectCovered(jcas, POS_ADV.class, aTarget)) {
        m++;
        String text = adv.getCoveredText().toLowerCase();
        if (text.endsWith("ly")) {
            ly++;
        }
    }

    boolean noAdjectives = n == 0;
    // The adverb feature's default flag depends on the adverb count, not the adjective count.
    boolean noAdverbs = m == 0;

    Set<Feature> featSet = new HashSet<Feature>();
    featSet.add(new Feature(ADJ_ENDING1, n > 0 ? able * 100 / n : 0, noAdjectives, FeatureType.NUMERIC));
    featSet.add(new Feature(ADJ_ENDING2, n > 0 ? al * 100 / n : 0, noAdjectives, FeatureType.NUMERIC));
    featSet.add(new Feature(ADJ_ENDING3, n > 0 ? ful * 100 / n : 0, noAdjectives, FeatureType.NUMERIC));
    featSet.add(new Feature(ADJ_ENDING4, n > 0 ? ible * 100 / n : 0, noAdjectives, FeatureType.NUMERIC));
    featSet.add(new Feature(ADJ_ENDING5, n > 0 ? less * 100 / n : 0, noAdjectives, FeatureType.NUMERIC));
    featSet.add(new Feature(ADJ_ENDING6, n > 0 ? ous * 100 / n : 0, noAdjectives, FeatureType.NUMERIC));
    featSet.add(new Feature(ADJ_ENDING7, n > 0 ? ive * 100 / n : 0, noAdjectives, FeatureType.NUMERIC));
    featSet.add(new Feature(ADJ_ENDING8, n > 0 ? ic * 100 / n : 0, noAdjectives, FeatureType.NUMERIC));
    featSet.add(new Feature(ADV_ENDING9, m > 0 ? ly * 100 / m : 0, noAdverbs, FeatureType.NUMERIC));
    return featSet;
}
Also used : POS_ADJ(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADJ) POS_ADV(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADV) Feature(org.dkpro.tc.api.features.Feature) HashSet(java.util.HashSet)

Example 69 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class POSRatioFeatureExtractor method extract.

/**
 * Computes, for the given target, the ratio of each part-of-speech class to the total number
 * of POS annotations covered by the target.
 * <p>
 * Nouns and proper nouns are summed into a single noun-ratio feature. If the target covers no
 * POS annotation at all, the counts are divided by 1 instead of 0 so that the features carry
 * finite values rather than {@code NaN}.
 *
 * @param jcas    the JCas holding the POS annotations
 * @param aTarget the target whose covered POS annotations are counted
 * @return a set with one numeric ratio feature per POS class
 * @throws TextClassificationException declared by the extractor contract
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    Set<Feature> features = new HashSet<Feature>();

    double total = selectCovered(jcas, POS.class, aTarget).size();
    // Avoid 0 / 0.0 == NaN for targets without POS annotations.
    // Presumably every class counted below is a POS subtype, so all counts are 0 in that
    // case and each ratio becomes 0.0 - verify against the type system.
    double denominator = total > 0 ? total : 1;

    double adj = selectCovered(jcas, POS_ADJ.class, aTarget).size() / denominator;
    double adv = selectCovered(jcas, POS_ADV.class, aTarget).size() / denominator;
    double art = selectCovered(jcas, POS_DET.class, aTarget).size() / denominator;
    double card = selectCovered(jcas, POS_NUM.class, aTarget).size() / denominator;
    double conj = selectCovered(jcas, POS_CONJ.class, aTarget).size() / denominator;
    double noun = selectCovered(jcas, POS_NOUN.class, aTarget).size() / denominator;
    double propNoun = selectCovered(jcas, POS_PROPN.class, aTarget).size() / denominator;
    double other = selectCovered(jcas, POS_X.class, aTarget).size() / denominator;
    double prep = selectCovered(jcas, POS_ADP.class, aTarget).size() / denominator;
    double pron = selectCovered(jcas, POS_PRON.class, aTarget).size() / denominator;
    double punc = selectCovered(jcas, POS_PUNCT.class, aTarget).size() / denominator;
    double verb = selectCovered(jcas, POS_VERB.class, aTarget).size() / denominator;

    features.add(new Feature(FN_ADJ_RATIO, adj, FeatureType.NUMERIC));
    features.add(new Feature(FN_ADV_RATIO, adv, FeatureType.NUMERIC));
    features.add(new Feature(FN_ART_RATIO, art, FeatureType.NUMERIC));
    features.add(new Feature(FN_CARD_RATIO, card, FeatureType.NUMERIC));
    features.add(new Feature(FN_CONJ_RATIO, conj, FeatureType.NUMERIC));
    // Common and proper nouns are reported together.
    features.add(new Feature(FN_N_RATIO, noun + propNoun, FeatureType.NUMERIC));
    features.add(new Feature(FN_O_RATIO, other, FeatureType.NUMERIC));
    features.add(new Feature(FN_PR_RATIO, pron, FeatureType.NUMERIC));
    features.add(new Feature(FN_PP_RATIO, prep, FeatureType.NUMERIC));
    features.add(new Feature(FN_PUNC_RATIO, punc, FeatureType.NUMERIC));
    features.add(new Feature(FN_V_RATIO, verb, FeatureType.NUMERIC));
    return features;
}
Also used : POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) Feature(org.dkpro.tc.api.features.Feature) HashSet(java.util.HashSet)

Example 70 with Feature

use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.

the class NumberOfHashTags method extract.

/**
 * Counts the matches of {@code HASHTAG_PATTERN} in the document text between the target's
 * begin and end offsets and returns the count as a single numeric feature named after this
 * class.
 *
 * @param jCas    the JCas providing the document text
 * @param aTarget the target delimiting the text span to scan
 * @return a one-element set holding the hashtag-count feature
 * @throws TextClassificationException declared by the extractor contract
 */
@Override
public Set<Feature> extract(JCas jCas, TextClassificationTarget aTarget) throws TextClassificationException {
    String targetText = jCas.getDocumentText().substring(aTarget.getBegin(), aTarget.getEnd());
    Matcher matcher = HASHTAG_PATTERN.matcher(targetText);

    int hits = 0;
    for (boolean found = matcher.find(); found; found = matcher.find()) {
        hits += 1;
    }

    String featureName = NumberOfHashTags.class.getSimpleName();
    Feature hashTagCount = new Feature(featureName, hits, FeatureType.NUMERIC);
    return hashTagCount.asSet();
}
Also used : Matcher(java.util.regex.Matcher) Feature(org.dkpro.tc.api.features.Feature)

Aggregations

Feature (org.dkpro.tc.api.features.Feature)94 Test (org.junit.Test)48 Instance (org.dkpro.tc.api.features.Instance)30 ArrayList (java.util.ArrayList)29 HashSet (java.util.HashSet)21 FeatureTestUtil.assertFeature (org.dkpro.tc.testing.FeatureTestUtil.assertFeature)17 AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine)16 TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget)16 JCas (org.apache.uima.jcas.JCas)15 AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription)13 File (java.io.File)8 Attribute (weka.core.Attribute)8 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)7 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)6 TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException)5 Chunk (de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk)4 OpenNlpPosTagger (de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger)4 BreakIteratorSegmenter (de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter)4 Instances (weka.core.Instances)4 IOException (java.io.IOException)3