Examples with CollectionReaderDescription - org.apache.uima.collection.CollectionReaderDescription

Example 51 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

the class WekaExternalResourceDemo method getParameterSpace.

public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // configure training and test data reader dimension
    // train/test will use both, while cross-validation will only use the
    // train part
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(PairTwentyNewsgroupsReader.class, PairTwentyNewsgroupsReader.PARAM_LISTFILE, listFilePathTrain, PairTwentyNewsgroupsReader.PARAM_LANGUAGE_CODE, languageCode);
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    CollectionReaderDescription readerTest = CollectionReaderFactory.createReaderDescription(PairTwentyNewsgroupsReader.class, PairTwentyNewsgroupsReader.PARAM_LISTFILE, listFilePathTest, PairTwentyNewsgroupsReader.PARAM_LANGUAGE_CODE, languageCode);
    dimReaders.put(DIM_READER_TEST, readerTest);
    // Create the External Resource here:
    ExternalResourceDescription gstResource = ExternalResourceFactory.createExternalResourceDescription(CosineSimilarityResource.class, CosineSimilarityResource.PARAM_NORMALIZATION, NormalizationMode.L2.toString());
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET, new TcFeatureSet(TcFeatureFactory.create(SimilarityPairFeatureExtractor.class, SimilarityPairFeatureExtractor.PARAM_TEXT_SIMILARITY_RESOURCE, gstResource)));
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), SMO.class.getName() });
    config.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);
    ParameterSpace pSpace = new ParameterSpace(Dimension.createBundle(DIM_READERS, dimReaders), Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL), Dimension.create(DIM_FEATURE_MODE, FM_PAIR), dimFeatureSets, mlas);
    return pSpace;
}

Also used : HashMap(java.util.HashMap) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) SMO(weka.classifiers.functions.SMO) ParameterSpace(org.dkpro.lab.task.ParameterSpace) HashMap(java.util.HashMap) Map(java.util.Map) ExternalResourceDescription(org.apache.uima.resource.ExternalResourceDescription)

Example 52 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

the class SemanticTextSimilarityDemo method getParameterSpace.

public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // configure training data reader dimension
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(STSReader.class, STSReader.PARAM_INPUT_FILE, inputFileTrain, STSReader.PARAM_GOLD_FILE, goldFileTrain);
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    CollectionReaderDescription readerTest = CollectionReaderFactory.createReaderDescription(STSReader.class, STSReader.PARAM_INPUT_FILE, inputFileTest, STSReader.PARAM_GOLD_FILE, goldFileTest);
    dimReaders.put(DIM_READER_TEST, readerTest);
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET, new TcFeatureSet(TcFeatureFactory.create(DiffNrOfTokensPairFeatureExtractor.class)));
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), SMOreg.class.getName() });
    config.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);
    ParameterSpace pSpace = new ParameterSpace(Dimension.createBundle(Constants.DIM_READER_TRAIN, dimReaders), Dimension.create(Constants.DIM_FEATURE_MODE, Constants.FM_PAIR), Dimension.create(Constants.DIM_LEARNING_MODE, Constants.LM_REGRESSION), dimFeatureSets, mlas);
    return pSpace;
}

Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) SMOreg(weka.classifiers.functions.SMOreg) HashMap(java.util.HashMap) Map(java.util.Map) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter)

Example 53 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

the class ExtractFeaturesConnectorTest method extractFeaturesConnectorRegressionTest.

@Test
public void extractFeaturesConnectorRegressionTest() throws Exception {
    File outputPath = folder.newFolder();
    // we do not need parameters here, but in case we do :)
    Object[] parameters = new Object[] { NoopFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123", UnitContextMetaCollector.PARAM_CONTEXT_FOLDER, Constants.ID_CONTEXT_KEY };
    ExternalResourceDescription featureExtractor = ExternalResourceFactory.createExternalResourceDescription(NoopFeatureExtractor.class, parameters);
    List<ExternalResourceDescription> fes = new ArrayList<>();
    fes.add(featureExtractor);
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestReaderRegression.class, TestReaderRegression.PARAM_SOURCE_LOCATION, "src/test/resources/data/*.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription featExtractorConnector = TaskUtils.getFeatureExtractorConnector(outputPath.getAbsolutePath(), JsonDataWriter.class.getName(), Constants.LM_REGRESSION, Constants.FM_DOCUMENT, false, false, false, false, Collections.emptyList(), fes, new String[] {});
    SimplePipeline.runPipeline(reader, segmenter, doc, featExtractorConnector);
    Gson gson = new Gson();
    List<String> lines = FileUtils.readLines(new File(outputPath, JsonDataWriter.JSON_FILE_NAME), "utf-8");
    List<Instance> instances = new ArrayList<>();
    for (String l : lines) {
        instances.add(gson.fromJson(l, Instance.class));
    }
    assertEquals(2, instances.size());
    assertEquals(1, getUniqueOutcomes(instances));
    assertEquals("0.45", instances.get(0).getOutcome());
    System.out.println(FileUtils.readFileToString(new File(outputPath, JsonDataWriter.JSON_FILE_NAME), "utf-8"));
}

Also used : JsonDataWriter(org.dkpro.tc.core.io.JsonDataWriter) Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) Gson(com.google.gson.Gson) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) File(java.io.File) ExternalResourceDescription(org.apache.uima.resource.ExternalResourceDescription) Test(org.junit.Test)

Example 54 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

the class MinimalWorkingExample method getParameterSpace.

public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // configure training and test data reader dimension
    // train/test will use both, while cross-validation will only use the
    // train part
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(FolderwiseDataReader.class, FolderwiseDataReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain, FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE, FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    CollectionReaderDescription readerTest = CollectionReaderFactory.createReaderDescription(FolderwiseDataReader.class, FolderwiseDataReader.PARAM_SOURCE_LOCATION, corpusFilePathTest, FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE, FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TEST, readerTest);
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET, new TcFeatureSet("DummyFeatureSet", TcFeatureFactory.create(TokenRatioPerDocument.class), TcFeatureFactory.create(WordNGram.class, WordNGram.PARAM_NGRAM_USE_TOP_K, 20, WordNGram.PARAM_NGRAM_MIN_N, 1, WordNGram.PARAM_NGRAM_MAX_N, 3)));
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), NaiveBayes.class.getName() });
    config.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);
    ParameterSpace pSpace = new ParameterSpace(Dimension.createBundle("readers", dimReaders), Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL), Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT), dimFeatureSets, mlas);
    return pSpace;
}

Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) NaiveBayes(weka.classifiers.bayes.NaiveBayes) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) HashMap(java.util.HashMap) Map(java.util.Map) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter)

Example 55 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

the class WekaPairRegressionDemo method getParameterSpace.

public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // configure training data reader dimension
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(STSReader.class, STSReader.PARAM_INPUT_FILE, inputFileTrain, STSReader.PARAM_GOLD_FILE, goldFileTrain);
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    CollectionReaderDescription readerTest = CollectionReaderFactory.createReaderDescription(STSReader.class, STSReader.PARAM_INPUT_FILE, inputFileTest, STSReader.PARAM_GOLD_FILE, goldFileTest);
    dimReaders.put(DIM_READER_TEST, readerTest);
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET, new TcFeatureSet(TcFeatureFactory.create(DiffNrOfTokensPairFeatureExtractor.class)));
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), SMOreg.class.getName() });
    config.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);
    ParameterSpace pSpace = new ParameterSpace(Dimension.createBundle(Constants.DIM_READER_TRAIN, dimReaders), Dimension.create(Constants.DIM_FEATURE_MODE, Constants.FM_PAIR), Dimension.create(Constants.DIM_LEARNING_MODE, Constants.LM_REGRESSION), dimFeatureSets, mlas);
    return pSpace;
}

Aggregations

CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription)78 HashMap (java.util.HashMap)53 ParameterSpace (org.dkpro.lab.task.ParameterSpace)51 TcFeatureSet (org.dkpro.tc.api.features.TcFeatureSet)40 Map (java.util.Map)35 AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription)25 File (java.io.File)19 WekaAdapter (org.dkpro.tc.ml.weka.WekaAdapter)17 Test (org.junit.Test)14 ArrayList (java.util.ArrayList)13 LiblinearAdapter (org.dkpro.tc.ml.liblinear.LiblinearAdapter)9 NaiveBayes (weka.classifiers.bayes.NaiveBayes)9 ExternalResourceDescription (org.apache.uima.resource.ExternalResourceDescription)7 LibsvmAdapter (org.dkpro.tc.ml.libsvm.LibsvmAdapter)7 Gson (com.google.gson.Gson)6 Instance (org.dkpro.tc.api.features.Instance)6 JsonDataWriter (org.dkpro.tc.core.io.JsonDataWriter)6 XgboostAdapter (org.dkpro.tc.ml.xgboost.XgboostAdapter)6 JCasIterable (org.apache.uima.fit.pipeline.JCasIterable)5 JCas (org.apache.uima.jcas.JCas)5