Search in sources :

Example 26 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

In class WekaSaveAndLoadModelDocumentMultiLabelTest, method documentGetParameterSpaceMultiLabel:

/**
 * Assembles the parameter space for the multi-label, document-mode setup.
 *
 * <p>The space consists of a "readers" bundle holding one training reader
 * ({@code ReutersCorpusReader}), the learning mode {@code LM_MULTI_LABEL}, the feature mode
 * {@code FM_DOCUMENT}, a feature set made of {@code WordNGram} and {@code TokenRatioPerDocument},
 * a bipartition threshold of "0.5", and a "config" bundle whose classification arguments, data
 * writer class name and sparse-feature flag are taken from {@code MekaAdapter}.
 *
 * @return the assembled parameter space
 * @throws ResourceInitializationException propagated from the factory calls used below
 */
private ParameterSpace documentGetParameterSpaceMultiLabel() throws ResourceInitializationException {
    // Reader dimension: only a training reader is registered.
    Map<String, Object> readers = new HashMap<>();
    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(
            ReutersCorpusReader.class,
            ReutersCorpusReader.PARAM_SOURCE_LOCATION, documentTrainFolderReuters,
            ReutersCorpusReader.PARAM_GOLD_LABEL_FILE, documentGoldLabelsReuters,
            ReutersCorpusReader.PARAM_LANGUAGE, "en",
            ReutersCorpusReader.PARAM_PATTERNS, ReutersCorpusReader.INCLUDE_PREFIX + "*.txt");
    readers.put(DIM_READER_TRAIN, trainReader);

    // Learner configuration: adapter plus its arguments, data writer and sparse-feature flag.
    Map<String, Object> learnerConfig = new HashMap<>();
    Object[] classificationArgs = { new MekaAdapter(), MULAN.class.getName(), "-S", "RAkEL2", "-W", RandomForest.class.getName() };
    learnerConfig.put(DIM_CLASSIFICATION_ARGS, classificationArgs);
    learnerConfig.put(DIM_DATA_WRITER, new MekaAdapter().getDataWriterClass().getName());
    learnerConfig.put(DIM_FEATURE_USE_SPARSE, new MekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> configBundle = Dimension.createBundle("config", learnerConfig);

    // Feature set: word n-grams (n = 1..3, top 50) plus the token ratio per document.
    Dimension<TcFeatureSet> featureSets = Dimension.create(
            DIM_FEATURE_SET,
            new TcFeatureSet(
                    TcFeatureFactory.create(WordNGram.class,
                            WordNGram.PARAM_NGRAM_USE_TOP_K, 50,
                            WordNGram.PARAM_NGRAM_MIN_N, 1,
                            WordNGram.PARAM_NGRAM_MAX_N, 3),
                    TcFeatureFactory.create(TokenRatioPerDocument.class)));

    return new ParameterSpace(
            Dimension.createBundle("readers", readers),
            Dimension.create(DIM_LEARNING_MODE, LM_MULTI_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
            featureSets,
            Dimension.create(DIM_BIPARTITION_THRESHOLD, "0.5"),
            configBundle);
}
Also used : HashMap(java.util.HashMap) RandomForest(weka.classifiers.trees.RandomForest) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) MekaAdapter(org.dkpro.tc.ml.weka.MekaAdapter) MULAN(meka.classifiers.multilabel.MULAN) ParameterSpace(org.dkpro.lab.task.ParameterSpace) HashMap(java.util.HashMap) Map(java.util.Map)

Example 27 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

In class WekaSaveAndLoadModelDocumentRegression, method regressionGetParameterSpace:

/**
 * Assembles the parameter space for the regression, document-mode setup.
 *
 * <p>The space consists of a "readers" bundle holding one training reader
 * ({@code LinewiseTextOutcomeReader}, outcome at index 0 and text at index 1), the learning mode
 * {@code LM_REGRESSION}, the feature mode {@code FM_DOCUMENT}, a feature set made of
 * {@code TokenRatioPerDocument} and {@code SentenceRatioPerDocument}, and classification
 * arguments pairing a {@code WekaAdapter} with the {@code LinearRegression} class name.
 *
 * @return the assembled parameter space
 * @throws Exception propagated from the factory calls used below
 */
private ParameterSpace regressionGetParameterSpace() throws Exception {
    // Reader dimension: only a training reader is registered.
    Map<String, Object> readers = new HashMap<>();
    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(
            LinewiseTextOutcomeReader.class,
            LinewiseTextOutcomeReader.PARAM_OUTCOME_INDEX, 0,
            LinewiseTextOutcomeReader.PARAM_TEXT_INDEX, 1,
            LinewiseTextOutcomeReader.PARAM_SOURCE_LOCATION, "src/main/resources/data/essays/train/essay_train.txt",
            LinewiseTextOutcomeReader.PARAM_LANGUAGE, "en");
    readers.put(DIM_READER_TRAIN, trainReader);

    // The generic varargs call on Dimension.create triggers an unchecked warning for List<Object>.
    @SuppressWarnings("unchecked")
    Dimension<List<Object>> learnerArgs = Dimension.create(
            DIM_CLASSIFICATION_ARGS,
            Arrays.asList(new Object[] { new WekaAdapter(), LinearRegression.class.getName() }));

    Dimension<TcFeatureSet> featureSets = Dimension.create(
            DIM_FEATURE_SET,
            new TcFeatureSet(
                    TcFeatureFactory.create(TokenRatioPerDocument.class),
                    TcFeatureFactory.create(SentenceRatioPerDocument.class)));

    return new ParameterSpace(
            Dimension.createBundle("readers", readers),
            Dimension.create(DIM_LEARNING_MODE, LM_REGRESSION),
            Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
            featureSets,
            learnerArgs);
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) ArrayList(java.util.ArrayList) List(java.util.List) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter)

Example 28 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

In class XgboostSaveAndLoadModelDocumentSingleLabelTest, method documentGetParameterSpaceSingleLabel:

/**
 * Assembles the parameter space for the single-label, document-mode setup.
 *
 * <p>Both variants share the "readers" bundle (one {@code FolderwiseDataReader} training reader
 * matching "*&#47;*.txt"), the learning mode {@code LM_SINGLE_LABEL}, the feature mode
 * {@code FM_DOCUMENT} and a feature set made of {@code TokenRatioPerDocument} and
 * {@code WordNGram}.
 *
 * @param useParametrizedArgs when {@code true}, the "config" bundle uses {@code XgboostAdapter}
 *        with the argument "objective=multi:softmax" and is placed before the feature set;
 *        when {@code false}, the bundle uses {@code LiblinearAdapter} without further arguments
 *        and is placed after the feature set
 * @return the assembled parameter space
 * @throws ResourceInitializationException propagated from the factory calls used below
 */
private ParameterSpace documentGetParameterSpaceSingleLabel(boolean useParametrizedArgs) throws ResourceInitializationException {
    // Reader dimension: only a training reader is registered.
    Map<String, Object> readers = new HashMap<>();
    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(
            FolderwiseDataReader.class,
            FolderwiseDataReader.PARAM_SOURCE_LOCATION, documentTrainFolder,
            FolderwiseDataReader.PARAM_LANGUAGE, "en",
            FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    readers.put(DIM_READER_TRAIN, trainReader);

    Dimension<TcFeatureSet> featureSets = Dimension.create(
            DIM_FEATURE_SET,
            new TcFeatureSet(
                    TcFeatureFactory.create(TokenRatioPerDocument.class),
                    TcFeatureFactory.create(WordNGram.class,
                            WordNGram.PARAM_NGRAM_USE_TOP_K, 50,
                            WordNGram.PARAM_NGRAM_MIN_N, 1,
                            WordNGram.PARAM_NGRAM_MAX_N, 3)));

    Map<String, Object> learnerConfig = new HashMap<>();
    if (useParametrizedArgs) {
        learnerConfig.put(DIM_CLASSIFICATION_ARGS, new Object[] { new XgboostAdapter(), "objective=multi:softmax" });
        learnerConfig.put(DIM_DATA_WRITER, new XgboostAdapter().getDataWriterClass().getName());
        learnerConfig.put(DIM_FEATURE_USE_SPARSE, new XgboostAdapter().useSparseFeatures());
        Dimension<Map<String, Object>> configBundle = Dimension.createBundle("config", learnerConfig);
        // In this variant the config bundle comes before the feature set.
        return new ParameterSpace(
                Dimension.createBundle("readers", readers),
                Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
                Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
                configBundle,
                featureSets);
    }

    // NOTE(review): this variant configures LiblinearAdapter while the parametrized variant
    // uses XgboostAdapter - confirm that mixing the two adapters is intended.
    learnerConfig.put(DIM_CLASSIFICATION_ARGS, new Object[] { new LiblinearAdapter() });
    learnerConfig.put(DIM_DATA_WRITER, new LiblinearAdapter().getDataWriterClass().getName());
    learnerConfig.put(DIM_FEATURE_USE_SPARSE, new LiblinearAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> configBundle = Dimension.createBundle("config", learnerConfig);
    return new ParameterSpace(
            Dimension.createBundle("readers", readers),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
            featureSets,
            configBundle);
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) XgboostAdapter(org.dkpro.tc.ml.xgboost.XgboostAdapter) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) LiblinearAdapter(org.dkpro.tc.ml.liblinear.LiblinearAdapter) Map(java.util.Map) HashMap(java.util.HashMap)

Example 29 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

In class XgboostSaveAndLoadModelDocumentSingleLabelTest, method unitGetParameterSpaceSingleLabel:

/**
 * Assembles the parameter space for the single-label, unit-mode setup.
 *
 * <p>The space consists of a "readers" bundle holding one training reader
 * ({@code BrownCorpusReader}, restricted to the pattern {@code INCLUDE_PREFIX + "a01.xml"}), the
 * learning mode {@code LM_SINGLE_LABEL}, the feature mode {@code FM_UNIT}, a feature set made of
 * {@code TokenRatioPerDocument} and {@code CharacterNGram} (lower-casing switched off), and a
 * "config" bundle whose classification arguments, data writer class name and sparse-feature
 * flag are taken from {@code LiblinearAdapter}.
 *
 * @return the assembled parameter space
 * @throws ResourceInitializationException propagated from the factory calls used below
 */
public static ParameterSpace unitGetParameterSpaceSingleLabel() throws ResourceInitializationException {
    // Reader dimension: only a training reader is registered here.
    Map<String, Object> readers = new HashMap<>();
    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(
            BrownCorpusReader.class,
            BrownCorpusReader.PARAM_LANGUAGE, "en",
            BrownCorpusReader.PARAM_SOURCE_LOCATION, unitTrainFolder,
            BrownCorpusReader.PARAM_PATTERNS, new String[] { INCLUDE_PREFIX + "a01.xml" });
    readers.put(DIM_READER_TRAIN, trainReader);

    // Learner configuration: adapter, data writer and sparse-feature flag.
    Map<String, Object> learnerConfig = new HashMap<>();
    learnerConfig.put(DIM_CLASSIFICATION_ARGS, new Object[] { new LiblinearAdapter() });
    learnerConfig.put(DIM_DATA_WRITER, new LiblinearAdapter().getDataWriterClass().getName());
    learnerConfig.put(DIM_FEATURE_USE_SPARSE, new LiblinearAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> configBundle = Dimension.createBundle("config", learnerConfig);

    Dimension<TcFeatureSet> featureSets = Dimension.create(
            DIM_FEATURE_SET,
            new TcFeatureSet(
                    TcFeatureFactory.create(TokenRatioPerDocument.class),
                    TcFeatureFactory.create(CharacterNGram.class, CharacterNGram.PARAM_NGRAM_LOWER_CASE, false)));

    return new ParameterSpace(
            Dimension.createBundle("readers", readers),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_UNIT),
            featureSets,
            configBundle);
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) LiblinearAdapter(org.dkpro.tc.ml.liblinear.LiblinearAdapter) Map(java.util.Map) HashMap(java.util.HashMap)

Example 30 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

In class LiblinearSaveAndLoadModelDocumentSingleLabelTest, method documentGetParameterSpaceSingleLabel:

/**
 * Assembles the parameter space for the single-label, document-mode setup using
 * {@code LiblinearAdapter}.
 *
 * <p>The space consists of a "readers" bundle holding one training reader
 * ({@code FolderwiseDataReader} matching "*&#47;*.txt"), the learning mode
 * {@code LM_SINGLE_LABEL}, the feature mode {@code FM_DOCUMENT}, a feature set made of
 * {@code TokenRatioPerDocument} and {@code WordNGram}, and a "config" bundle whose
 * classification arguments are {@code LiblinearAdapter} followed by "-s", "6".
 *
 * @param useParametrizedArgs selects the order of the last two dimensions: when {@code true}
 *        the config bundle precedes the feature set, otherwise the feature set comes first;
 *        the learner configuration itself is the same in both cases
 * @return the assembled parameter space
 * @throws ResourceInitializationException propagated from the factory calls used below
 */
private ParameterSpace documentGetParameterSpaceSingleLabel(boolean useParametrizedArgs) throws ResourceInitializationException {
    // Reader dimension: only a training reader is registered.
    Map<String, Object> dimReaders = new HashMap<>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(
            FolderwiseDataReader.class,
            FolderwiseDataReader.PARAM_SOURCE_LOCATION, documentTrainFolder,
            FolderwiseDataReader.PARAM_LANGUAGE, "en",
            FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);

    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(
            DIM_FEATURE_SET,
            new TcFeatureSet(
                    TcFeatureFactory.create(TokenRatioPerDocument.class),
                    TcFeatureFactory.create(WordNGram.class,
                            WordNGram.PARAM_NGRAM_USE_TOP_K, 50,
                            WordNGram.PARAM_NGRAM_MIN_N, 1,
                            WordNGram.PARAM_NGRAM_MAX_N, 3)));

    // The learner configuration does not depend on useParametrizedArgs, so it is built once.
    // NOTE(review): "-s", "6" is passed even when useParametrizedArgs is false - confirm that
    // the non-parametrized variant is not meant to run without arguments.
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new LiblinearAdapter(), "-s", "6" });
    config.put(DIM_DATA_WRITER, new LiblinearAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new LiblinearAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);

    // Only the position of the config bundle relative to the feature set differs.
    if (useParametrizedArgs) {
        return new ParameterSpace(
                Dimension.createBundle("readers", dimReaders),
                Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
                Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
                mlas,
                dimFeatureSets);
    }
    return new ParameterSpace(
            Dimension.createBundle("readers", dimReaders),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
            dimFeatureSets,
            mlas);
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) LiblinearAdapter(org.dkpro.tc.ml.liblinear.LiblinearAdapter) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription)78 HashMap (java.util.HashMap)53 ParameterSpace (org.dkpro.lab.task.ParameterSpace)51 TcFeatureSet (org.dkpro.tc.api.features.TcFeatureSet)40 Map (java.util.Map)35 AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription)25 File (java.io.File)19 WekaAdapter (org.dkpro.tc.ml.weka.WekaAdapter)17 Test (org.junit.Test)14 ArrayList (java.util.ArrayList)13 LiblinearAdapter (org.dkpro.tc.ml.liblinear.LiblinearAdapter)9 NaiveBayes (weka.classifiers.bayes.NaiveBayes)9 ExternalResourceDescription (org.apache.uima.resource.ExternalResourceDescription)7 LibsvmAdapter (org.dkpro.tc.ml.libsvm.LibsvmAdapter)7 Gson (com.google.gson.Gson)6 Instance (org.dkpro.tc.api.features.Instance)6 JsonDataWriter (org.dkpro.tc.core.io.JsonDataWriter)6 XgboostAdapter (org.dkpro.tc.ml.xgboost.XgboostAdapter)6 JCasIterable (org.apache.uima.fit.pipeline.JCasIterable)5 JCas (org.apache.uima.jcas.JCas)5