Search in sources :

Example 31 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

From the class LibsvmSaveAndLoadModelDocumentSingleLabelTest, method documentGetParameterSpaceSingleLabel.

/**
 * Builds the parameter space for a single-label document-classification experiment.
 *
 * @param useClassificationArguments
 *            if {@code true}, the LIBSVM classifier is configured with explicit
 *            arguments ({@code -c 100}); otherwise the adapter defaults are used.
 * @return the assembled {@link ParameterSpace}
 * @throws ResourceInitializationException
 *             if the collection reader description cannot be created
 */
private ParameterSpace documentGetParameterSpaceSingleLabel(boolean useClassificationArguments) throws ResourceInitializationException {
    // Reader dimension: one training reader over the document training folder.
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(
            FolderwiseDataReader.class,
            FolderwiseDataReader.PARAM_SOURCE_LOCATION, documentTrainFolder,
            FolderwiseDataReader.PARAM_LANGUAGE, "en",
            FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);

    // Machine-learning configuration. Only DIM_CLASSIFICATION_ARGS differs
    // between the two variants, so build the map once instead of duplicating
    // the data-writer and sparse-feature entries in each branch.
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS,
            useClassificationArguments
                    ? new Object[] { new LibsvmAdapter(), "-c", "100" }
                    : new Object[] { new LibsvmAdapter() });
    config.put(DIM_DATA_WRITER, new LibsvmAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new LibsvmAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);

    // Feature set: token ratio plus word 1-3-grams (top 50).
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET,
            new TcFeatureSet(
                    TcFeatureFactory.create(TokenRatioPerDocument.class),
                    TcFeatureFactory.create(WordNGram.class,
                            WordNGram.PARAM_NGRAM_USE_TOP_K, 50,
                            WordNGram.PARAM_NGRAM_MIN_N, 1,
                            WordNGram.PARAM_NGRAM_MAX_N, 3)));

    ParameterSpace pSpace;
    // NOTE: the original code passed the dimensions in a different order per
    // branch (mlas before vs. after dimFeatureSets); that ordering is preserved.
    if (useClassificationArguments) {
        pSpace = new ParameterSpace(Dimension.createBundle("readers", dimReaders), Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL), Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT), mlas, dimFeatureSets);
    } else {
        pSpace = new ParameterSpace(Dimension.createBundle("readers", dimReaders), Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL), Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT), dimFeatureSets, mlas);
    }
    return pSpace;
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) LibsvmAdapter(org.dkpro.tc.ml.libsvm.LibsvmAdapter) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) HashMap(java.util.HashMap) Map(java.util.Map)

Example 32 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

From the class WordNGramTest, method luceneNGramFeatureExtractorNonDefaultFrequencyThresholdTest.

@Test
public void luceneNGramFeatureExtractorNonDefaultFrequencyThresholdTest() throws Exception {
    // Working folders for the Lucene index and the extracted features.
    File indexDir = folder.newFolder();
    File featureOutDir = folder.newFolder();

    // Shared parameters for meta collector and feature extractor; the 0.1f
    // frequency threshold is the non-default value under test.
    Object[] parameters = new Object[] { WordNGram.PARAM_NGRAM_USE_TOP_K, "3", WordNGram.PARAM_UNIQUE_EXTRACTOR_NAME, "123", WordNGram.PARAM_SOURCE_LOCATION, indexDir.toString(), WordNGram.PARAM_NGRAM_FREQ_THRESHOLD, "0.1f", WordNGramMC.PARAM_TARGET_LOCATION, indexDir.toString() };
    List<Object> paramList = new ArrayList<Object>(Arrays.asList(parameters));

    CollectionReaderDescription reader = getMetaReader();
    AnalysisEngineDescription tokenizer =
            AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription ngramMetaCollector =
            AnalysisEngineFactory.createEngineDescription(WordNGramMC.class, paramList.toArray());

    ExternalResourceDescription ngramExtractor =
            ExternalResourceFactory.createExternalResourceDescription(WordNGram.class, parameters);
    List<ExternalResourceDescription> extractors = new ArrayList<>();
    extractors.add(ngramExtractor);

    AnalysisEngineDescription featureConnector = TaskUtils.getFeatureExtractorConnector(
            featureOutDir.getAbsolutePath(), JsonDataWriter.class.getName(),
            Constants.LM_SINGLE_LABEL, Constants.FM_DOCUMENT,
            false, false, false, false, Collections.emptyList(), extractors, new String[] {});

    // First pass: collect the n-gram metadata.
    SimplePipeline.runPipeline(reader, tokenizer, ngramMetaCollector);
    // Second pass: run the actual feature extraction.
    SimplePipeline.runPipeline(reader, tokenizer, featureConnector);

    List<Instance> extracted = readInstances(featureOutDir);
    assertEquals(4, extracted.size());
    assertEquals(1, getUniqueOutcomes(extracted));
    // With the high frequency threshold no n-gram feature should survive.
    for (Instance instance : extracted) {
        assertTrue(instance.getFeatures().isEmpty());
    }
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) JsonDataWriter(org.dkpro.tc.core.io.JsonDataWriter) Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) File(java.io.File) ExternalResourceDescription(org.apache.uima.resource.ExternalResourceDescription) Test(org.junit.Test)

Example 33 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

From the class WordNGramMetaCollectorTest, method luceneNgramMetaCollectorTest.

@Test
public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();
    // Reader over the plain-text fixture files.
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class, TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en", TextReader.PARAM_PATTERNS, "text*.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(WordNGramMC.class, WordNGramMC.PARAM_TARGET_LOCATION, tmpDir, WordNGramMC.PARAM_UNIQUE_EXTRACTOR_NAME, UNIQUE_FEATURE_NAME);
    // Drive the pipeline; the meta collector writes the n-gram index as a side effect.
    for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) {
        System.out.println(jcas.getDocumentText().length());
    }
    int i = 0;
    // try-with-resources: IndexReader is Closeable and was previously leaked.
    try (IndexReader index = DirectoryReader.open(FSDirectory.open(tmpDir))) {
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(WordNGram.LUCENE_NGRAM_FIELD + UNIQUE_FEATURE_NAME);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // Spot-check one known term's document/term frequencies.
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    // Total number of distinct n-grams collected from the fixtures.
    assertEquals(35, i);
}
Also used : JCasIterable(org.apache.uima.fit.pipeline.JCasIterable) Terms(org.apache.lucene.index.Terms) JCas(org.apache.uima.jcas.JCas) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) TermsEnum(org.apache.lucene.index.TermsEnum) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) Fields(org.apache.lucene.index.Fields) MultiFields(org.apache.lucene.index.MultiFields) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) IndexReader(org.apache.lucene.index.IndexReader) File(java.io.File) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)

Example 34 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

From the class PPipelineTestBase, method runPipeline.

/**
 * Runs the pair-mode pipeline twice — first for meta collection, then for
 * feature extraction — and loads the resulting JSON instances into the
 * test base's fields ({@code instanceList}, {@code featureNames},
 * {@code outcomeList}).
 */
protected void runPipeline() throws Exception {
    List<Object> params = new ArrayList<Object>(Arrays.asList(parameters));

    CollectionReaderDescription pairReader = CollectionReaderFactory.createReaderDescription(
            TestPairReader.class, TestPairReader.PARAM_INPUT_FILE, setTestPairsLocation());

    // Preprocessing components, applied to both views of each pair.
    AnalysisEngineDescription tokenizer = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription stemming = AnalysisEngineFactory.createEngineDescription(SnowballStemmer.class);
    AnalysisEngineDescription lemmatizing = AnalysisEngineFactory.createEngineDescription(MorphaLemmatizer.class);
    AnalysisEngineDescription tagging = AnalysisEngineFactory.createEngineDescription(OpenNlpPosTagger.class);
    AnalysisEngineDescription pairMode = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR);

    AggregateBuilder aggregate = new AggregateBuilder();
    aggregate.add(tokenizer, Constants.INITIAL_VIEW, Constants.PART_ONE);
    aggregate.add(tokenizer, Constants.INITIAL_VIEW, Constants.PART_TWO);
    aggregate.add(pairMode, Constants.INITIAL_VIEW, Constants.PART_ONE);
    aggregate.add(pairMode, Constants.INITIAL_VIEW, Constants.PART_TWO);
    aggregate.add(stemming, Constants.INITIAL_VIEW, Constants.PART_ONE);
    aggregate.add(stemming, Constants.INITIAL_VIEW, Constants.PART_TWO);
    aggregate.add(lemmatizing, Constants.INITIAL_VIEW, Constants.PART_ONE);
    aggregate.add(lemmatizing, Constants.INITIAL_VIEW, Constants.PART_TWO);
    aggregate.add(tagging, Constants.INITIAL_VIEW, Constants.PART_ONE);
    aggregate.add(tagging, Constants.INITIAL_VIEW, Constants.PART_TWO);

    // Populates the metaCollector / featExtractorConnector fields.
    getMetaCollector(params);
    getFeatureExtractorCollector(params);

    // run meta collector
    SimplePipeline.runPipeline(pairReader, aggregate.createAggregateDescription(), metaCollector);
    // run FE(s)
    SimplePipeline.runPipeline(pairReader, aggregate.createAggregateDescription(), featExtractorConnector);

    // Parse the JSON output, one instance per line.
    Gson gson = new Gson();
    List<String> lines = FileUtils.readLines(new File(outputPath, JsonDataWriter.JSON_FILE_NAME), "utf-8");
    for (String line : lines) {
        instanceList.add(gson.fromJson(line, Instance.class));
    }
    assertEquals(1, lines.size());
    assertEquals(1, getUniqueOutcomes(instanceList).size());

    featureNames = getFeatureNames(instanceList);
    for (Instance instance : instanceList) {
        outcomeList.add(instance.getOutcomes());
    }
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) AggregateBuilder(org.apache.uima.fit.factory.AggregateBuilder) Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) Gson(com.google.gson.Gson) File(java.io.File)

Example 35 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

From the class LuceneMetaCollectionBasedFeatureTestBase, method runMetaCollection.

/**
 * Executes the meta-collection phase: reads the fixture documents, segments
 * them, and feeds them to the supplied meta collector.
 *
 * @param luceneFolder
 *            target folder for the Lucene index (passed through by callers;
 *            the collector itself is already configured with it)
 * @param metaCollector
 *            the meta-collector engine to run
 */
protected void runMetaCollection(File luceneFolder, AnalysisEngineDescription metaCollector) throws Exception {
    CollectionReaderDescription metaReader = getMetaReader();
    AnalysisEngineDescription tokenizer =
            AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    SimplePipeline.runPipeline(metaReader, tokenizer, metaCollector);
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription)

Aggregations

CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription)78 HashMap (java.util.HashMap)53 ParameterSpace (org.dkpro.lab.task.ParameterSpace)51 TcFeatureSet (org.dkpro.tc.api.features.TcFeatureSet)40 Map (java.util.Map)35 AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription)25 File (java.io.File)19 WekaAdapter (org.dkpro.tc.ml.weka.WekaAdapter)17 Test (org.junit.Test)14 ArrayList (java.util.ArrayList)13 LiblinearAdapter (org.dkpro.tc.ml.liblinear.LiblinearAdapter)9 NaiveBayes (weka.classifiers.bayes.NaiveBayes)9 ExternalResourceDescription (org.apache.uima.resource.ExternalResourceDescription)7 LibsvmAdapter (org.dkpro.tc.ml.libsvm.LibsvmAdapter)7 Gson (com.google.gson.Gson)6 Instance (org.dkpro.tc.api.features.Instance)6 JsonDataWriter (org.dkpro.tc.core.io.JsonDataWriter)6 XgboostAdapter (org.dkpro.tc.ml.xgboost.XgboostAdapter)6 JCasIterable (org.apache.uima.fit.pipeline.JCasIterable)5 JCas (org.apache.uima.jcas.JCas)5