Search in sources :

Example 1 with WekaAdapter

use of org.dkpro.tc.ml.weka.WekaAdapter in project dkpro-tc by dkpro.

the class CRFSuiteSaveAndLoadModelTest method loadModelArowParameters.

@Test
public void loadModelArowParameters() throws Exception {
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new CrfSuiteAdapter(), CrfSuiteAdapter.ALGORITHM_ADAPTIVE_REGULARIZATION_OF_WEIGHT_VECTOR, "-p", "max_iterations=2" });
    config.put(DIM_DATA_WRITER, new CrfSuiteAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);
    // create a model
    File modelFolder = folder.newFolder();
    ParameterSpace pSpace = getParameterSpace(mlas);
    executeSaveModelIntoTemporyFolder(pSpace, modelFolder);
    JCas jcas = JCasFactory.createJCas();
    jcas.setDocumentText("This is an example text. It has 2 sentences.");
    jcas.setDocumentLanguage("en");
    AnalysisEngine tokenizer = AnalysisEngineFactory.createEngine(BreakIteratorSegmenter.class);
    AnalysisEngine tcAnno = AnalysisEngineFactory.createEngine(TcAnnotator.class, TcAnnotator.PARAM_TC_MODEL_LOCATION, modelFolder.getAbsolutePath(), TcAnnotator.PARAM_NAME_SEQUENCE_ANNOTATION, Sentence.class.getName(), TcAnnotator.PARAM_NAME_UNIT_ANNOTATION, Token.class.getName());
    tokenizer.process(jcas);
    tcAnno.process(jcas);
    List<TextClassificationOutcome> outcomes = new ArrayList<>(JCasUtil.select(jcas, TextClassificationOutcome.class));
    // 9 token + 2 punctuation marks
    assertEquals(11, outcomes.size());
    for (TextClassificationOutcome o : outcomes) {
        assertTrue(postags.contains(o.getOutcome()));
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) CrfSuiteAdapter(org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter) ParameterSpace(org.dkpro.lab.task.ParameterSpace) TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome) HashMap(java.util.HashMap) Map(java.util.Map) File(java.io.File) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) AnalysisEngine(org.apache.uima.analysis_engine.AnalysisEngine) Test(org.junit.Test)

Example 2 with WekaAdapter

use of org.dkpro.tc.ml.weka.WekaAdapter in project dkpro-tc by dkpro.

the class WekaComplexConfigurationSingleDemo method getParameterSpace.

public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // configure training and test data reader dimension
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(FolderwiseDataReader.class, FolderwiseDataReader.PARAM_SOURCE_LOCATION, CORPUS_FILEPATH_TRAIN, FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE, FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    CollectionReaderDescription readerTest = CollectionReaderFactory.createReaderDescription(FolderwiseDataReader.class, FolderwiseDataReader.PARAM_SOURCE_LOCATION, COPRUS_FILEPATH_TEST, FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE, FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TEST, readerTest);
    Map<String, Object> config1 = new HashMap<>();
    config1.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), SMO.class.getName(), "-C", "1.0", "-K", PolyKernel.class.getName() + " " + "-C -1 -E 2" });
    config1.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config1.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Map<String, Object> config2 = new HashMap<>();
    config2.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), RandomForest.class.getName(), "-I", "5" });
    config2.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config2.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Map<String, Object> config3 = new HashMap<>();
    config3.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), Bagging.class.getName(), "-I", "2", "-W", J48.class.getName(), "--", "-C", "0.5", "-M", "2" });
    config3.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config3.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config1, config2, config3);
    // We configure 2 sets of feature extractors, one consisting of 3 extractors, and one with
    // only 1
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET, new TcFeatureSet(TcFeatureFactory.create(TokenRatioPerDocument.class), TcFeatureFactory.create(WordNGram.class, WordNGram.PARAM_NGRAM_USE_TOP_K, 50, WordNGram.PARAM_NGRAM_MIN_N, 1, WordNGram.PARAM_NGRAM_MAX_N, 3)), new TcFeatureSet(TcFeatureFactory.create(WordNGram.class, WordNGram.PARAM_NGRAM_USE_TOP_K, 50, WordNGram.PARAM_NGRAM_MIN_N, 1, WordNGram.PARAM_NGRAM_MAX_N, 3)));
    // single-label feature selection (Weka specific options), reduces the feature set to 10
    Map<String, Object> dimFeatureSelection = new HashMap<String, Object>();
    dimFeatureSelection.put(DIM_FEATURE_SEARCHER_ARGS, asList(new String[] { Ranker.class.getName(), "-N", "10" }));
    dimFeatureSelection.put(DIM_ATTRIBUTE_EVALUATOR_ARGS, asList(new String[] { InfoGainAttributeEval.class.getName() }));
    dimFeatureSelection.put(DIM_APPLY_FEATURE_SELECTION, true);
    ParameterSpace pSpace = new ParameterSpace(Dimension.createBundle("readers", dimReaders), Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL), Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT), dimFeatureSets, mlas, Dimension.createBundle("featureSelection", dimFeatureSelection));
    return pSpace;
}
Also used : HashMap(java.util.HashMap) RandomForest(weka.classifiers.trees.RandomForest) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter) J48(weka.classifiers.trees.J48) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) SMO(weka.classifiers.functions.SMO) ParameterSpace(org.dkpro.lab.task.ParameterSpace) PolyKernel(weka.classifiers.functions.supportVector.PolyKernel) HashMap(java.util.HashMap) Map(java.util.Map) Bagging(weka.classifiers.meta.Bagging)

Example 3 with WekaAdapter

use of org.dkpro.tc.ml.weka.WekaAdapter in project dkpro-tc by dkpro.

the class WekaUniformClassDistributionDemo method getParameterSpace.

@SuppressWarnings("unchecked")
public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // configure training and test data reader dimension
    // train/test will use both, while cross-validation will only use the train part
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(FolderwiseDataReader.class, FolderwiseDataReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain, FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE, FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    CollectionReaderDescription readerTest = CollectionReaderFactory.createReaderDescription(FolderwiseDataReader.class, FolderwiseDataReader.PARAM_SOURCE_LOCATION, corpusFilePathTest, FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE, FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TEST, readerTest);
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET, new TcFeatureSet(TcFeatureFactory.create(TokenRatioPerDocument.class), TcFeatureFactory.create(WordNGram.class, WordNGram.PARAM_NGRAM_USE_TOP_K, 50, WordNGram.PARAM_NGRAM_MIN_N, 1, WordNGram.PARAM_NGRAM_MAX_N, 3)));
    Dimension<List<String>> dimFeatureFilters = Dimension.create(DIM_FEATURE_FILTERS, Arrays.asList(new String[] { UniformClassDistributionFilter.class.getName() }));
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), NaiveBayes.class.getName() });
    config.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);
    ParameterSpace pSpace = new ParameterSpace(Dimension.createBundle("readers", dimReaders), Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL), Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT), dimFeatureSets, dimFeatureFilters, mlas);
    return pSpace;
}
Also used : HashMap(java.util.HashMap) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) NaiveBayes(weka.classifiers.bayes.NaiveBayes) ParameterSpace(org.dkpro.lab.task.ParameterSpace) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map)

Example 4 with WekaAdapter

use of org.dkpro.tc.ml.weka.WekaAdapter in project dkpro-tc by dkpro.

the class MultiSvmUsingWekaLibsvmLiblinear method getParameterSpace.

public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // configure training and test data reader dimension
    // train/test will use both, while cross-validation will only use the
    // train part
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(FolderwiseDataReader.class, FolderwiseDataReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain, FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE, FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    // 
    CollectionReaderDescription readerTest = CollectionReaderFactory.createReaderDescription(FolderwiseDataReader.class, FolderwiseDataReader.PARAM_SOURCE_LOCATION, corpusFilePathTest, FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE, FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TEST, readerTest);
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), SMO.class.getName(), "-C", "1.0", "-K", PolyKernel.class.getName() + " " + "-C -1 -E 2" });
    config.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Map<String, Object> config2 = new HashMap<>();
    config2.put(DIM_CLASSIFICATION_ARGS, new Object[] { new LiblinearAdapter(), "-s", "4", "-c", "100" });
    config2.put(DIM_DATA_WRITER, new LiblinearAdapter().getDataWriterClass().getName());
    config2.put(DIM_FEATURE_USE_SPARSE, new LiblinearAdapter().useSparseFeatures());
    Map<String, Object> config3 = new HashMap<>();
    config3.put(DIM_CLASSIFICATION_ARGS, new Object[] { new LibsvmAdapter(), "-s", "1", "-c", "1000", "-t", "3" });
    config3.put(DIM_DATA_WRITER, new LibsvmAdapter().getDataWriterClass().getName());
    config3.put(DIM_FEATURE_USE_SPARSE, new LibsvmAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config, config2, config3);
    Dimension<String> dimLearningMode = Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL);
    Dimension<String> dimFeatureMode = Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT);
    Dimension<TcFeatureSet> dimFeatureSet = Dimension.create(DIM_FEATURE_SET, getFeatureSet());
    ParameterSpace ps = new ParameterSpace(dimLearningMode, dimFeatureMode, dimFeatureMode, dimFeatureSet, mlas, Dimension.createBundle(DIM_READERS, dimReaders));
    return ps;
}
Also used : HashMap(java.util.HashMap) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) LiblinearAdapter(org.dkpro.tc.ml.liblinear.LiblinearAdapter) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) SMO(weka.classifiers.functions.SMO) ParameterSpace(org.dkpro.lab.task.ParameterSpace) LibsvmAdapter(org.dkpro.tc.ml.libsvm.LibsvmAdapter) PolyKernel(weka.classifiers.functions.supportVector.PolyKernel) HashMap(java.util.HashMap) Map(java.util.Map)

Example 5 with WekaAdapter

use of org.dkpro.tc.ml.weka.WekaAdapter in project dkpro-tc by dkpro.

the class WekaAblationDemo method getParameterSpace.

public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // configure training and test data reader dimension
    // train/test will use both, while cross-validation will only use the
    // train part
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(FolderwiseDataReader.class, FolderwiseDataReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain, FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE, FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    CollectionReaderDescription readerTest = CollectionReaderFactory.createReaderDescription(FolderwiseDataReader.class, FolderwiseDataReader.PARAM_SOURCE_LOCATION, corpusFilePathTest, FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE, FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TEST, readerTest);
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), NaiveBayes.class.getName() });
    config.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);
    Dimension<TcFeatureSet> dimFeatureSets = ExperimentUtil.getAblationTestFeatures(TcFeatureFactory.create(TokenRatioPerDocument.class), TcFeatureFactory.create(EmoticonRatio.class), TcFeatureFactory.create(NumberOfHashTags.class));
    ParameterSpace pSpace = new ParameterSpace(Dimension.createBundle("readers", dimReaders), Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL), Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT), dimFeatureSets, mlas);
    return pSpace;
}
Also used : EmoticonRatio(org.dkpro.tc.features.twitter.EmoticonRatio) HashMap(java.util.HashMap) TokenRatioPerDocument(org.dkpro.tc.features.maxnormalization.TokenRatioPerDocument) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) NaiveBayes(weka.classifiers.bayes.NaiveBayes) ParameterSpace(org.dkpro.lab.task.ParameterSpace) NumberOfHashTags(org.dkpro.tc.features.twitter.NumberOfHashTags) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

HashMap (java.util.HashMap)21 ParameterSpace (org.dkpro.lab.task.ParameterSpace)21 WekaAdapter (org.dkpro.tc.ml.weka.WekaAdapter)21 Map (java.util.Map)19 TcFeatureSet (org.dkpro.tc.api.features.TcFeatureSet)18 CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription)17 NaiveBayes (weka.classifiers.bayes.NaiveBayes)8 SMO (weka.classifiers.functions.SMO)5 ArrayList (java.util.ArrayList)4 File (java.io.File)3 List (java.util.List)3 CrfSuiteAdapter (org.dkpro.tc.ml.crfsuite.CrfSuiteAdapter)3 Test (org.junit.Test)3 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)2 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)2 AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine)2 JCas (org.apache.uima.jcas.JCas)2 TextClassificationOutcome (org.dkpro.tc.api.type.TextClassificationOutcome)2 LiblinearAdapter (org.dkpro.tc.ml.liblinear.LiblinearAdapter)2 LibsvmAdapter (org.dkpro.tc.ml.libsvm.LibsvmAdapter)2