Example 21 with TcFeatureSet

use of org.dkpro.tc.api.features.TcFeatureSet in project dkpro-tc by dkpro.

the class ExperimentUtil method getFeatureNamesMinusOne.

private static TcFeatureSet getFeatureNamesMinusOne(TcFeature[] names, int i) {
    TcFeatureSet nameList = new TcFeatureSet(names);
    nameList.setFeatureSetName(LEFTOUT_FE + names[i].getDiscriminatorValue());
    nameList.remove(i);
    return nameList;
}
Also used : TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet)
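
Called from getAblationTestFeatures (see Example 22 below), this helper copies the full feature array, names the copy after the feature at index i using the LEFTOUT_FE prefix, and then removes that feature. The following is a minimal illustration of the effect, not part of the indexed source; the concrete feature classes are placeholders taken from the other examples on this page, and the literal value of LEFTOUT_FE is not shown here.

// Illustration only: feature classes are placeholders; LEFTOUT_FE is a
// constant of ExperimentUtil whose value is not shown in this snippet.
TcFeature[] features = new TcFeature[] {
        TcFeatureFactory.create(TokenRatioPerDocument.class),
        TcFeatureFactory.create(WordNGram.class) };

// Drops WordNGram (index 1): the returned set contains only TokenRatioPerDocument,
// and its set name is LEFTOUT_FE followed by the WordNGram discriminator value.
TcFeatureSet minusWordNGram = getFeatureNamesMinusOne(features, 1);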

Example 22 with TcFeatureSet

use of org.dkpro.tc.api.features.TcFeatureSet in project dkpro-tc by dkpro.

the class ExperimentUtil method getAblationTestFeatures.

/**
 * Returns a pre-defined dimension with feature extractor sets configured for an ablation test.
 * For example, if you specify four feature extractors A, B, C, and D, you will get [A,B,C,D],
 * [A,B,C], [A,B,D], [A,C,D], and [B,C,D].
 *
 * @param features
 *            All the feature extractors that should be tested.
 * @return a dimension with a list of feature extractor sets; named after the feature that is
 *         left out
 */
public static Dimension<TcFeatureSet> getAblationTestFeatures(TcFeature... features) {
    TcFeatureSet[] featureSets = new TcFeatureSet[features.length + 1];
    for (int i = 0; i < features.length; i++) {
        TcFeatureSet featureNamesMinusOne = getFeatureNamesMinusOne(features, i);
        featureSets[i] = featureNamesMinusOne;
    }
    // also add the full set with all feature extractors
    featureSets[features.length] = new TcFeatureSet(features);
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(Constants.DIM_FEATURE_SET, featureSets);
    return dimFeatureSets;
}
Also used : TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet)
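
The returned dimension can stand in for a hand-built DIM_FEATURE_SET dimension, so every ablation variant is executed as its own configuration. Below is a minimal sketch, assuming dimReaders and the machine-learning config bundle (mlas) are built as in the Weka examples further down; the concrete feature extractors and n-gram parameters are illustrative only.

// Sketch only: dimReaders and mlas are assumed to be set up as in the
// getParameterSpace() methods shown in Examples 23-25.
Dimension<TcFeatureSet> dimFeatureSets = ExperimentUtil.getAblationTestFeatures(
        TcFeatureFactory.create(TokenRatioPerDocument.class),
        TcFeatureFactory.create(WordNGram.class,
                WordNGram.PARAM_NGRAM_USE_TOP_K, 50,
                WordNGram.PARAM_NGRAM_MIN_N, 1,
                WordNGram.PARAM_NGRAM_MAX_N, 3));

ParameterSpace pSpace = new ParameterSpace(
        Dimension.createBundle("readers", dimReaders),
        Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
        Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
        dimFeatureSets,
        mlas);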

Example 23 with TcFeatureSet

use of org.dkpro.tc.api.features.TcFeatureSet in project dkpro-tc by dkpro.

the class WekaDocumentPlain method getParameterSpace.

public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // configure training and test data reader dimension
    // train/test will use both, while cross-validation will only use the
    // train part
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(
            FolderwiseDataReader.class,
            FolderwiseDataReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain,
            FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE,
            FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    CollectionReaderDescription readerTest = CollectionReaderFactory.createReaderDescription(
            FolderwiseDataReader.class,
            FolderwiseDataReader.PARAM_SOURCE_LOCATION, corpusFilePathTest,
            FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE,
            FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TEST, readerTest);
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET,
            new TcFeatureSet("DummyFeatureSet",
                    TcFeatureFactory.create(TokenRatioPerDocument.class),
                    TcFeatureFactory.create(WordNGram.class,
                            WordNGram.PARAM_NGRAM_USE_TOP_K, 20,
                            WordNGram.PARAM_NGRAM_MIN_N, 1,
                            WordNGram.PARAM_NGRAM_MAX_N, 3)));
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), NaiveBayes.class.getName() });
    config.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);
    ParameterSpace pSpace = new ParameterSpace(
            Dimension.createBundle("readers", dimReaders),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
            dimFeatureSets,
            mlas);
    return pSpace;
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) NaiveBayes(weka.classifiers.bayes.NaiveBayes) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) Map(java.util.Map) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter)
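
For context, such a parameter space is usually handed to a dkpro-tc experiment and run through the dkpro-lab Lab singleton. The runner below is a sketch, not part of the indexed source: the experiment name, the output directory, the segmenter used as preprocessing, the package of ExperimentTrainTest (which has moved between dkpro-tc versions), and the assumed imports (org.dkpro.lab.Lab, uimaFIT's AnalysisEngineFactory, DKPro Core's BreakIteratorSegmenter, java.io.File) should be checked against your setup.

// Sketch only; verify the ExperimentTrainTest package against your dkpro-tc version.
public static void main(String[] args) throws Exception {
    // dkpro-lab writes all task output below DKPRO_HOME
    System.setProperty("DKPRO_HOME", new File("target/dkpro-home").getAbsolutePath());

    ParameterSpace pSpace = getParameterSpace();

    ExperimentTrainTest experiment = new ExperimentTrainTest("WekaDocumentPlainTrainTest");
    experiment.setParameterSpace(pSpace);
    // Document-mode feature extractors need tokens; a segmenter is a typical choice.
    experiment.setPreprocessing(
            AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class));

    Lab.getInstance().run(experiment);
}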

Example 24 with TcFeatureSet

use of org.dkpro.tc.api.features.TcFeatureSet in project dkpro-tc by dkpro.

the class WekaManualFoldCrossValidation method getParameterSpace.

public static ParameterSpace getParameterSpace(boolean manualFolds) throws ResourceInitializationException {
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(
            BrownCorpusReader.class,
            BrownCorpusReader.PARAM_LANGUAGE, "de",
            BrownCorpusReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain,
            BrownCorpusReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.xml");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET,
            new TcFeatureSet(TcFeatureFactory.create(CharacterNGram.class,
                    CharacterNGram.PARAM_NGRAM_MIN_N, 2,
                    CharacterNGram.PARAM_NGRAM_MAX_N, 3,
                    CharacterNGram.PARAM_NGRAM_USE_TOP_K, 750)));
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), NaiveBayes.class.getName() });
    config.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);
    ParameterSpace pSpace = new ParameterSpace(
            Dimension.createBundle("readers", dimReaders),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_UNIT),
            dimFeatureSets,
            mlas,
            /*
             * MANUAL CROSS-VALIDATION FOLDS - i.e. the CASes created by your reader will be
             * used as-is to make folds
             */
            Dimension.create(DIM_CROSS_VALIDATION_MANUAL_FOLDS, manualFolds));
    return pSpace;
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) NaiveBayes(weka.classifiers.bayes.NaiveBayes) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) Map(java.util.Map) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter)
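
DIM_CROSS_VALIDATION_MANUAL_FOLDS tells the cross-validation setup to treat each CAS produced by the reader as one fold instead of re-splitting the data. The sketch below shows how such a space is typically run via Lab, as in the previous sketch; the experiment name, the fold count, and the ExperimentCrossValidation package (which differs between dkpro-tc versions) are assumptions.

// Sketch only: with manual folds enabled, the fold count must not exceed the
// number of CASes the reader produces, since each CAS becomes one fold.
ParameterSpace pSpace = getParameterSpace(true);

ExperimentCrossValidation experiment =
        new ExperimentCrossValidation("WekaManualFoldsDemo", 2);
experiment.setParameterSpace(pSpace);
Lab.getInstance().run(experiment);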

Example 25 with TcFeatureSet

use of org.dkpro.tc.api.features.TcFeatureSet in project dkpro-tc by dkpro.

the class WekaTwentyNewsgroupsInstanceWeightingDemo method getParameterSpace.

public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // configure training and test data reader dimension
    // train/test will use both, while cross-validation will only use the train part
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(
            WeightedTwentyNewsgroupsCorpusReader.class,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_WEIGHT_FILE_LOCATION, corpusFilePathTrain + weightsFile,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_LANGUAGE, LANGUAGE_CODE,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_PATTERNS,
            Arrays.asList(WeightedTwentyNewsgroupsCorpusReader.INCLUDE_PREFIX + "*/*.txt"));
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    CollectionReaderDescription readerTest = CollectionReaderFactory.createReaderDescription(
            WeightedTwentyNewsgroupsCorpusReader.class,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_SOURCE_LOCATION, corpusFilePathTest,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_WEIGHT_FILE_LOCATION, corpusFilePathTest + weightsFile,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_LANGUAGE, LANGUAGE_CODE,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_PATTERNS,
            WeightedTwentyNewsgroupsCorpusReader.INCLUDE_PREFIX + "*/*.txt");
    dimReaders.put(DIM_READER_TEST, readerTest);
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET,
            new TcFeatureSet(TcFeatureFactory.create(TokenRatioPerDocument.class),
                    TcFeatureFactory.create(WordNGram.class,
                            WordNGram.PARAM_NGRAM_USE_TOP_K, 50,
                            WordNGram.PARAM_NGRAM_MIN_N, 2,
                            WordNGram.PARAM_NGRAM_MAX_N, 3)));
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), SMO.class.getName() });
    config.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);
    ParameterSpace pSpace = new ParameterSpace(
            Dimension.createBundle("readers", dimReaders),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
            dimFeatureSets,
            mlas,
            Dimension.create(DIM_APPLY_INSTANCE_WEIGHTING, true));
    return pSpace;
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) SMO(weka.classifiers.functions.SMO) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) Map(java.util.Map) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter)
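
Beyond the weighted reader and DIM_APPLY_INSTANCE_WEIGHTING, this setup mirrors Example 23; the classifier is chosen solely through DIM_CLASSIFICATION_ARGS, so switching from SMO to another Weka learner only touches the config bundle. A small illustrative variation follows (RandomForest also appears in the aggregation list below; the rest of the parameter space is assumed unchanged):

// Illustrative variation only: same bundle layout, different Weka classifier.
Map<String, Object> config = new HashMap<>();
config.put(DIM_CLASSIFICATION_ARGS,
        new Object[] { new WekaAdapter(), RandomForest.class.getName() });
config.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);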

Aggregations

TcFeatureSet (org.dkpro.tc.api.features.TcFeatureSet): 44
HashMap (java.util.HashMap): 42
ParameterSpace (org.dkpro.lab.task.ParameterSpace): 42
CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription): 40
Map (java.util.Map): 36
WekaAdapter (org.dkpro.tc.ml.weka.WekaAdapter): 18
LiblinearAdapter (org.dkpro.tc.ml.liblinear.LiblinearAdapter): 9
NaiveBayes (weka.classifiers.bayes.NaiveBayes): 9
LibsvmAdapter (org.dkpro.tc.ml.libsvm.LibsvmAdapter): 7
XgboostAdapter (org.dkpro.tc.ml.xgboost.XgboostAdapter): 6
List (java.util.List): 5
SMO (weka.classifiers.functions.SMO): 5
ArrayList (java.util.ArrayList): 4
MekaAdapter (org.dkpro.tc.ml.weka.MekaAdapter): 3
RandomForest (weka.classifiers.trees.RandomForest): 3
MULAN (meka.classifiers.multilabel.MULAN): 2
SvmHmmAdapter (org.dkpro.tc.ml.svmhmm.SvmHmmAdapter): 2
SMOreg (weka.classifiers.functions.SMOreg): 2
PolyKernel (weka.classifiers.functions.supportVector.PolyKernel): 2
BR (meka.classifiers.multilabel.BR): 1