use of org.dkpro.tc.api.features.TcFeatureSet in project dkpro-tc by dkpro.
the class ExperimentUtil method getFeatureNamesMinusOne.
/**
 * Creates a feature set from {@code names} and removes the entry at position {@code i} from it.
 * The set is named {@code LEFTOUT_FE} followed by the discriminator value of the feature at
 * that position.
 *
 * @param names
 *            all features the set is initialised with
 * @param i
 *            position of the feature that is left out
 * @return the feature set after the entry at position {@code i} has been removed
 */
private static TcFeatureSet getFeatureNamesMinusOne(TcFeature[] names, int i) {
    final TcFeatureSet reduced = new TcFeatureSet(names);

    // Name the set after the omitted feature before dropping it.
    final String omittedName = names[i].getDiscriminatorValue();
    reduced.setFeatureSetName(LEFTOUT_FE + omittedName);

    reduced.remove(i);
    return reduced;
}
use of org.dkpro.tc.api.features.TcFeatureSet in project dkpro-tc by dkpro.
the class ExperimentUtil method getAblationTestFeatures.
/**
 * Creates a pre-defined dimension of feature extractor sets for an ablation test. For each
 * given feature extractor one set is built that leaves out exactly that extractor; a final set
 * holds all extractors. For example, for the four feature extractors A, B, C, and D the
 * dimension holds, in this order, [B,C,D], [A,C,D], [A,B,D], [A,B,C], [A,B,C,D].
 *
 * @param features
 *            All the feature extractors that should be tested.
 * @return a dimension with the feature extractor sets; each reduced set is named after the
 *         feature that is left out
 */
public static Dimension<TcFeatureSet> getAblationTestFeatures(TcFeature... features) {
    final int total = features.length;
    final TcFeatureSet[] ablationSets = new TcFeatureSet[total + 1];

    // one reduced set per feature extractor, each missing the extractor at that position
    for (int leftOut = 0; leftOut < total; leftOut++) {
        ablationSets[leftOut] = getFeatureNamesMinusOne(features, leftOut);
    }

    // the last slot takes the complete set of feature extractors
    ablationSets[total] = new TcFeatureSet(features);

    return Dimension.create(Constants.DIM_FEATURE_SET, ablationSets);
}
use of org.dkpro.tc.api.features.TcFeatureSet in project dkpro-tc by dkpro.
the class WekaDocumentPlain method getParameterSpace.
/**
 * Assembles the parameter space of this experiment. It bundles a train and a test
 * {@link FolderwiseDataReader}, a feature set named "DummyFeatureSet" (token ratio per document
 * plus word n-grams), and a Weka configuration using {@link NaiveBayes}, together with the
 * learning mode {@code LM_SINGLE_LABEL} and the feature mode {@code FM_DOCUMENT}.
 *
 * @return the configured parameter space
 * @throws ResourceInitializationException
 *             propagated from the creation of the reader descriptions
 */
public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // Reader dimension: train/test will use both readers, while cross-validation will only
    // use the train part.
    Map<String, Object> readers = new HashMap<>();

    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(
            FolderwiseDataReader.class,
            FolderwiseDataReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain,
            FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE,
            FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    readers.put(DIM_READER_TRAIN, trainReader);

    CollectionReaderDescription testReader = CollectionReaderFactory.createReaderDescription(
            FolderwiseDataReader.class,
            FolderwiseDataReader.PARAM_SOURCE_LOCATION, corpusFilePathTest,
            FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE,
            FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    readers.put(DIM_READER_TEST, testReader);

    // Feature set dimension: a single named set with two feature extractors.
    TcFeatureSet featureSet = new TcFeatureSet("DummyFeatureSet",
            TcFeatureFactory.create(TokenRatioPerDocument.class),
            TcFeatureFactory.create(WordNGram.class,
                    WordNGram.PARAM_NGRAM_USE_TOP_K, 20,
                    WordNGram.PARAM_NGRAM_MIN_N, 1,
                    WordNGram.PARAM_NGRAM_MAX_N, 3));
    Dimension<TcFeatureSet> featureSetDim = Dimension.create(DIM_FEATURE_SET, featureSet);

    // Machine learning adapter configuration, bundled into one dimension.
    Map<String, Object> wekaConfig = new HashMap<>();
    wekaConfig.put(DIM_CLASSIFICATION_ARGS,
            new Object[] { new WekaAdapter(), NaiveBayes.class.getName() });
    wekaConfig.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    wekaConfig.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> configDim = Dimension.createBundle("config", wekaConfig);

    return new ParameterSpace(
            Dimension.createBundle("readers", readers),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
            featureSetDim,
            configDim);
}
use of org.dkpro.tc.api.features.TcFeatureSet in project dkpro-tc by dkpro.
the class WekaManualFoldCrossValidation method getParameterSpace.
/**
 * Assembles the parameter space of this experiment. It bundles a single train
 * {@link BrownCorpusReader}, a feature set with character n-grams, and a Weka configuration
 * using {@link NaiveBayes}, together with the learning mode {@code LM_SINGLE_LABEL}, the
 * feature mode {@code FM_UNIT}, and the manual-folds switch.
 *
 * @param manualFolds
 *            value stored in the {@code DIM_CROSS_VALIDATION_MANUAL_FOLDS} dimension
 * @return the configured parameter space
 * @throws ResourceInitializationException
 *             propagated from the creation of the reader description
 */
public static ParameterSpace getParameterSpace(boolean manualFolds) throws ResourceInitializationException {
    Map<String, Object> dimReaders = new HashMap<>();

    // NOTE(review): the language code is "de" although the reader is a Brown corpus reader -
    // confirm that this is intended.
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(
            BrownCorpusReader.class,
            BrownCorpusReader.PARAM_LANGUAGE, "de",
            BrownCorpusReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain,
            BrownCorpusReader.PARAM_PATTERNS, INCLUDE_PREFIX + "*.xml");
    // The reader description is the only value registered for the train reader; an earlier
    // put of the bare reader class under the same key was always overwritten and is gone.
    dimReaders.put(DIM_READER_TRAIN, readerTrain);

    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET,
            new TcFeatureSet(TcFeatureFactory.create(CharacterNGram.class,
                    CharacterNGram.PARAM_NGRAM_MIN_N, 2,
                    CharacterNGram.PARAM_NGRAM_MAX_N, 3,
                    CharacterNGram.PARAM_NGRAM_USE_TOP_K, 750)));

    // Machine learning adapter configuration, bundled into one dimension.
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), NaiveBayes.class.getName() });
    config.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);

    return new ParameterSpace(
            Dimension.createBundle("readers", dimReaders),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_UNIT),
            dimFeatureSets,
            mlas,
            /*
             * MANUAL CROSS VALIDATION FOLDS - i.e. the cas created by your reader will be used
             * as is to make folds
             */
            Dimension.create(DIM_CROSS_VALIDATION_MANUAL_FOLDS, manualFolds));
}
use of org.dkpro.tc.api.features.TcFeatureSet in project dkpro-tc by dkpro.
the class WekaTwentyNewsgroupsInstanceWeightingDemo method getParameterSpace.
/**
 * Assembles the parameter space of this experiment. It bundles a train and a test
 * {@link WeightedTwentyNewsgroupsCorpusReader} (each pointed at a weight file next to its
 * corpus path), a feature set with token ratio per document plus word n-grams, and a Weka
 * configuration using {@link SMO}, together with the learning mode {@code LM_SINGLE_LABEL},
 * the feature mode {@code FM_DOCUMENT}, and {@code DIM_APPLY_INSTANCE_WEIGHTING} set to
 * {@code true}.
 *
 * @return the configured parameter space
 * @throws ResourceInitializationException
 *             propagated from the creation of the reader descriptions
 */
public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // Reader dimension: train/test will use both readers, while cross-validation will only
    // use the train part.
    Map<String, Object> readers = new HashMap<>();

    // NOTE(review): the train reader passes its pattern wrapped in a list, the test reader
    // passes a plain string - kept as is; confirm whether the difference is intended.
    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(
            WeightedTwentyNewsgroupsCorpusReader.class,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_WEIGHT_FILE_LOCATION, corpusFilePathTrain + weightsFile,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_LANGUAGE, LANGUAGE_CODE,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_PATTERNS,
            Arrays.asList(WeightedTwentyNewsgroupsCorpusReader.INCLUDE_PREFIX + "*/*.txt"));
    readers.put(DIM_READER_TRAIN, trainReader);

    CollectionReaderDescription testReader = CollectionReaderFactory.createReaderDescription(
            WeightedTwentyNewsgroupsCorpusReader.class,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_SOURCE_LOCATION, corpusFilePathTest,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_WEIGHT_FILE_LOCATION, corpusFilePathTest + weightsFile,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_LANGUAGE, LANGUAGE_CODE,
            WeightedTwentyNewsgroupsCorpusReader.PARAM_PATTERNS,
            WeightedTwentyNewsgroupsCorpusReader.INCLUDE_PREFIX + "*/*.txt");
    readers.put(DIM_READER_TEST, testReader);

    // Feature set dimension: one set with two feature extractors.
    TcFeatureSet featureSet = new TcFeatureSet(
            TcFeatureFactory.create(TokenRatioPerDocument.class),
            TcFeatureFactory.create(WordNGram.class,
                    WordNGram.PARAM_NGRAM_USE_TOP_K, 50,
                    WordNGram.PARAM_NGRAM_MIN_N, 2,
                    WordNGram.PARAM_NGRAM_MAX_N, 3));
    Dimension<TcFeatureSet> featureSetDim = Dimension.create(DIM_FEATURE_SET, featureSet);

    // Machine learning adapter configuration, bundled into one dimension.
    Map<String, Object> wekaConfig = new HashMap<>();
    wekaConfig.put(DIM_CLASSIFICATION_ARGS,
            new Object[] { new WekaAdapter(), SMO.class.getName() });
    wekaConfig.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    wekaConfig.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> configDim = Dimension.createBundle("config", wekaConfig);

    return new ParameterSpace(
            Dimension.createBundle("readers", readers),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
            featureSetDim,
            configDim,
            Dimension.create(DIM_APPLY_INSTANCE_WEIGHTING, true));
}
Aggregations