Search in sources :

Example 1 with MekaAdapter

use of org.dkpro.tc.ml.weka.MekaAdapter in project dkpro-tc by dkpro.

the class MekaComplexConfigurationMultiDemo method getParameterSpace.

public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // configure training and test data reader dimension
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(ReutersCorpusReader.class, ReutersCorpusReader.PARAM_SOURCE_LOCATION, FILEPATH_TRAIN, ReutersCorpusReader.PARAM_GOLD_LABEL_FILE, FILEPATH_GOLD_LABELS, ReutersCorpusReader.PARAM_LANGUAGE, LANGUAGE_CODE, ReutersCorpusReader.PARAM_PATTERNS, ReutersCorpusReader.INCLUDE_PREFIX + "*.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    CollectionReaderDescription readerTest = CollectionReaderFactory.createReaderDescription(ReutersCorpusReader.class, ReutersCorpusReader.PARAM_SOURCE_LOCATION, FILEPATH_TEST, ReutersCorpusReader.PARAM_GOLD_LABEL_FILE, FILEPATH_GOLD_LABELS, ReutersCorpusReader.PARAM_LANGUAGE, LANGUAGE_CODE, ReutersCorpusReader.PARAM_PATTERNS, ReutersCorpusReader.INCLUDE_PREFIX + "*.txt");
    dimReaders.put(DIM_READER_TEST, readerTest);
    // Config 1
    Map<String, Object> config1 = new HashMap<>();
    config1.put(DIM_CLASSIFICATION_ARGS, new Object[] { new MekaAdapter(), BR.class.getName(), "-W", NaiveBayes.class.getName() });
    config1.put(DIM_DATA_WRITER, new MekaAdapter().getDataWriterClass().getName());
    config1.put(DIM_FEATURE_USE_SPARSE, new MekaAdapter().useSparseFeatures());
    Map<String, Object> config2 = new HashMap<>();
    config2.put(DIM_CLASSIFICATION_ARGS, new Object[] { new MekaAdapter(), CCq.class.getName(), "-P", "0.9" });
    config2.put(DIM_DATA_WRITER, new MekaAdapter().getDataWriterClass().getName());
    config2.put(DIM_FEATURE_USE_SPARSE, new MekaAdapter().useSparseFeatures());
    Map<String, Object> config3 = new HashMap<>();
    config3.put(DIM_CLASSIFICATION_ARGS, new Object[] { new MekaAdapter(), PSUpdateable.class.getName(), "-B", "900", "-S", "9" });
    config3.put(DIM_DATA_WRITER, new MekaAdapter().getDataWriterClass().getName());
    config3.put(DIM_FEATURE_USE_SPARSE, new MekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config1, config2, config3);
    // We configure 2 sets of feature extractors, one consisting of 2 extractors, and one with
    // only one
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET, new TcFeatureSet(TcFeatureFactory.create(TokenRatioPerDocument.class), TcFeatureFactory.create(WordNGram.class, WordNGram.PARAM_NGRAM_USE_TOP_K, 600, WordNGram.PARAM_NGRAM_MIN_N, 1, WordNGram.PARAM_NGRAM_MAX_N, 3)));
    // multi-label feature selection (Mulan specific options), reduces the feature set to 10
    Map<String, Object> dimFeatureSelection = new HashMap<String, Object>();
    dimFeatureSelection.put(DIM_LABEL_TRANSFORMATION_METHOD, "BinaryRelevanceAttributeEvaluator");
    dimFeatureSelection.put(DIM_ATTRIBUTE_EVALUATOR_ARGS, asList(new String[] { InfoGainAttributeEval.class.getName() }));
    dimFeatureSelection.put(DIM_NUM_LABELS_TO_KEEP, 10);
    dimFeatureSelection.put(DIM_APPLY_FEATURE_SELECTION, true);
    ParameterSpace pSpace = new ParameterSpace(Dimension.createBundle("readers", dimReaders), Dimension.create(DIM_LEARNING_MODE, LM_MULTI_LABEL), Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT), Dimension.create(DIM_BIPARTITION_THRESHOLD, BIPARTITION_THRESHOLD), dimFeatureSets, mlas, Dimension.createBundle("featureSelection", dimFeatureSelection));
    return pSpace;
}
Also used : HashMap(java.util.HashMap) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) BR(meka.classifiers.multilabel.BR) CCq(meka.classifiers.multilabel.CCq) NaiveBayes(weka.classifiers.bayes.NaiveBayes) MekaAdapter(org.dkpro.tc.ml.weka.MekaAdapter) ParameterSpace(org.dkpro.lab.task.ParameterSpace) PSUpdateable(meka.classifiers.multilabel.incremental.PSUpdateable) HashMap(java.util.HashMap) Map(java.util.Map)

Example 2 with MekaAdapter

use of org.dkpro.tc.ml.weka.MekaAdapter in project dkpro-tc by dkpro.

the class WekaSaveAndLoadModelDocumentMultiLabelTest method documentGetParameterSpaceMultiLabel.

private ParameterSpace documentGetParameterSpaceMultiLabel() throws ResourceInitializationException {
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(ReutersCorpusReader.class, ReutersCorpusReader.PARAM_SOURCE_LOCATION, documentTrainFolderReuters, ReutersCorpusReader.PARAM_GOLD_LABEL_FILE, documentGoldLabelsReuters, ReutersCorpusReader.PARAM_LANGUAGE, "en", ReutersCorpusReader.PARAM_PATTERNS, ReutersCorpusReader.INCLUDE_PREFIX + "*.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    Map<String, Object> wekaConfig = new HashMap<>();
    wekaConfig.put(DIM_CLASSIFICATION_ARGS, new Object[] { new MekaAdapter(), MULAN.class.getName(), "-S", "RAkEL2", "-W", RandomForest.class.getName() });
    wekaConfig.put(DIM_DATA_WRITER, new MekaAdapter().getDataWriterClass().getName());
    wekaConfig.put(DIM_FEATURE_USE_SPARSE, new MekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", wekaConfig);
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET, new TcFeatureSet(TcFeatureFactory.create(WordNGram.class, WordNGram.PARAM_NGRAM_USE_TOP_K, 50, WordNGram.PARAM_NGRAM_MIN_N, 1, WordNGram.PARAM_NGRAM_MAX_N, 3), TcFeatureFactory.create(TokenRatioPerDocument.class)));
    ParameterSpace pSpace = new ParameterSpace(Dimension.createBundle("readers", dimReaders), Dimension.create(DIM_LEARNING_MODE, LM_MULTI_LABEL), Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT), dimFeatureSets, Dimension.create(DIM_BIPARTITION_THRESHOLD, "0.5"), mlas);
    return pSpace;
}
Also used : HashMap(java.util.HashMap) RandomForest(weka.classifiers.trees.RandomForest) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) MekaAdapter(org.dkpro.tc.ml.weka.MekaAdapter) MULAN(meka.classifiers.multilabel.MULAN) ParameterSpace(org.dkpro.lab.task.ParameterSpace) HashMap(java.util.HashMap) Map(java.util.Map)

Example 3 with MekaAdapter

use of org.dkpro.tc.ml.weka.MekaAdapter in project dkpro-tc by dkpro.

the class MekaSaveAndApplyModelMultilabelDemo method getParameterSpace.

public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // configure training and test data reader dimension
    // train/test will use both, while cross-validation will only use the
    // train part
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(ReutersCorpusReader.class, ReutersCorpusReader.PARAM_SOURCE_LOCATION, FILEPATH_TRAIN, ReutersCorpusReader.PARAM_GOLD_LABEL_FILE, FILEPATH_GOLD_LABELS, ReutersCorpusReader.PARAM_LANGUAGE, LANGUAGE_CODE, ReutersCorpusReader.PARAM_PATTERNS, ReutersCorpusReader.INCLUDE_PREFIX + "*.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    CollectionReaderDescription readerTest = CollectionReaderFactory.createReaderDescription(ReutersCorpusReader.class, ReutersCorpusReader.PARAM_SOURCE_LOCATION, FILEPATH_TEST, ReutersCorpusReader.PARAM_GOLD_LABEL_FILE, FILEPATH_GOLD_LABELS, ReutersCorpusReader.PARAM_LANGUAGE, LANGUAGE_CODE, ReutersCorpusReader.PARAM_PATTERNS, ReutersCorpusReader.INCLUDE_PREFIX + "*.txt");
    dimReaders.put(DIM_READER_TEST, readerTest);
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new MekaAdapter(), MULAN.class.getName(), "-S", "RAkEL2", "-W", RandomForest.class.getName() });
    config.put(DIM_DATA_WRITER, new MekaAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new MekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET, new TcFeatureSet(TcFeatureFactory.create(TokenRatioPerDocument.class), TcFeatureFactory.create(WordNGram.class, WordNGram.PARAM_NGRAM_USE_TOP_K, 100, WordNGram.PARAM_NGRAM_MIN_N, 1, WordNGram.PARAM_NGRAM_MAX_N, 3)));
    ParameterSpace pSpace = new ParameterSpace(Dimension.createBundle("readers", dimReaders), Dimension.create(DIM_LEARNING_MODE, LM_MULTI_LABEL), Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT), Dimension.create(DIM_BIPARTITION_THRESHOLD, BIPARTITION_THRESHOLD), dimFeatureSets, mlas);
    return pSpace;
}
Also used : HashMap(java.util.HashMap) RandomForest(weka.classifiers.trees.RandomForest) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) MekaAdapter(org.dkpro.tc.ml.weka.MekaAdapter) MULAN(meka.classifiers.multilabel.MULAN) ParameterSpace(org.dkpro.lab.task.ParameterSpace) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

HashMap (java.util.HashMap)3 Map (java.util.Map)3 CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription)3 ParameterSpace (org.dkpro.lab.task.ParameterSpace)3 TcFeatureSet (org.dkpro.tc.api.features.TcFeatureSet)3 MekaAdapter (org.dkpro.tc.ml.weka.MekaAdapter)3 MULAN (meka.classifiers.multilabel.MULAN)2 RandomForest (weka.classifiers.trees.RandomForest)2 BR (meka.classifiers.multilabel.BR)1 CCq (meka.classifiers.multilabel.CCq)1 PSUpdateable (meka.classifiers.multilabel.incremental.PSUpdateable)1 NaiveBayes (weka.classifiers.bayes.NaiveBayes)1