Search in sources :

Example 76 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

the class WekaSaveAndLoadModelUnitTest method unitGetParameterSpace.

/**
 * Assembles the parameter space used by the unit-mode save/load test.
 *
 * <p>The space bundles a single training reader ({@link BrownCorpusReader} over
 * {@code unitTrainFolder}, language {@code "en"}, pattern {@code "*.xml"}), a Weka
 * configuration using {@link SMO}, and one feature set made of {@link CharacterNGram}
 * with {@code PARAM_NGRAM_USE_TOP_K} set to 20. Learning mode is
 * {@code LM_SINGLE_LABEL} and feature mode is {@code FM_UNIT}.
 *
 * @return the assembled parameter space
 * @throws ResourceInitializationException
 *             if the reader description or feature description cannot be created
 */
private static ParameterSpace unitGetParameterSpace() throws ResourceInitializationException {
    // --- readers -------------------------------------------------------
    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(
            BrownCorpusReader.class,
            BrownCorpusReader.PARAM_SOURCE_LOCATION, unitTrainFolder,
            BrownCorpusReader.PARAM_LANGUAGE, "en",
            BrownCorpusReader.PARAM_PATTERNS, Arrays.asList("*.xml"));
    Map<String, Object> readers = new HashMap<>();
    readers.put(DIM_READER_TRAIN, trainReader);

    // --- machine learning adapter configuration ------------------------
    Map<String, Object> classifierSettings = new HashMap<>();
    classifierSettings.put(DIM_CLASSIFICATION_ARGS,
            new Object[] { new WekaAdapter(), SMO.class.getName() });
    classifierSettings.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    classifierSettings.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> classifierDim = Dimension.createBundle("config", classifierSettings);

    // --- feature set ---------------------------------------------------
    TcFeatureSet characterNGrams = new TcFeatureSet(
            TcFeatureFactory.create(CharacterNGram.class, CharacterNGram.PARAM_NGRAM_USE_TOP_K, 20));
    Dimension<TcFeatureSet> featureSetDim = Dimension.create(DIM_FEATURE_SET, characterNGrams);

    return new ParameterSpace(
            Dimension.createBundle("readers", readers),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_UNIT),
            featureSetDim,
            classifierDim);
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) SMO(weka.classifiers.functions.SMO) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace) TcFeatureSet(org.dkpro.tc.api.features.TcFeatureSet) Map(java.util.Map) WekaAdapter(org.dkpro.tc.ml.weka.WekaAdapter)

Example 77 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

the class WordNGramMetaCollectorTest method emptyDocumentTest.

/**
 * Runs the word n-gram meta collector over the {@code empty*.txt} files found in
 * {@code src/test/resources/empty/}. The test contains no assertions; it fails only
 * if building or iterating the pipeline throws.
 *
 * @throws Exception
 *             if the temporary folder, a component description, or the pipeline fails
 */
@SuppressWarnings("unused")
@Test
public void emptyDocumentTest() throws Exception {
    File targetDir = folder.newFolder();

    CollectionReaderDescription textReader = CollectionReaderFactory.createReaderDescription(
            TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/empty/",
            TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "empty*.txt");

    AnalysisEngineDescription tokenizer = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription documentMode = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription nGramCollector = AnalysisEngineFactory.createEngineDescription(
            WordNGramMC.class,
            WordNGramMC.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            WordNGramMC.PARAM_TARGET_LOCATION, targetDir);

    JCasIterable pipeline = new JCasIterable(textReader, tokenizer, documentMode, nGramCollector);
    for (JCas ignoredCas : pipeline) {
        // Intentionally empty: the loop exists only to step through every CAS.
    }
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) JCasIterable(org.apache.uima.fit.pipeline.JCasIterable) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) JCas(org.apache.uima.jcas.JCas) File(java.io.File) Test(org.junit.Test)

Example 78 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

the class FoldUtil method createMinimalSplit.

/**
 * Splits the available CAS into smaller CAS so that at least the requested number of
 * folds can be served for a cross-validation.
 *
 * <p>A rule-of-thumb split factor {@code ceil(numFolds / numAvailableJCas)} is computed
 * and handed to {@link FoldClassificationUnitCasMultiplier}. All {@code *.bin} files in
 * {@code inputFolder} are read, multiplied, and written by a {@link BinaryCasWriter} to
 * the sub-folder {@code output} of {@code inputFolder}. Afterwards the output folder is
 * checked against {@code numFolds}.
 *
 * @param inputFolder
 *            the folder containing the binary CAS files to split
 * @param numFolds
 *            number of folds to create
 * @param numAvailableJCas
 *            number of CAS currently available; must be positive
 * @param isSequence
 *            whether a sequence model is used (forwarded to the CAS multiplier)
 * @return the folder into which the split CAS were written
 * @throws IllegalArgumentException
 *             if {@code numAvailableJCas} is zero or negative, because no meaningful
 *             split factor can be derived from it
 * @throws Exception
 *             if the pipeline fails or the final check against {@code numFolds} fails
 *             (presumably when too few CAS were created - the check itself lives in
 *             {@code isNumberOfCasCreatedLargerEqualNumFolds})
 */
public static File createMinimalSplit(String inputFolder, int numFolds, int numAvailableJCas, boolean isSequence) throws Exception {
    // Guard the division below: a zero divisor yields Infinity/NaN, which the int cast
    // silently turns into Integer.MAX_VALUE/0, and a negative one gives a non-positive
    // split factor.
    if (numAvailableJCas <= 0) {
        throw new IllegalArgumentException("Cannot create [" + numFolds
                + "] folds: the number of available CAS must be positive but was ["
                + numAvailableJCas + "]");
    }

    File outputFolder = new File(inputFolder, "output");
    // Floating-point division so that the result is rounded up, not truncated.
    int splitNum = (int) Math.ceil(numFolds / (double) numAvailableJCas);

    CollectionReaderDescription createReader = CollectionReaderFactory.createReaderDescription(
            BinaryCasReader.class,
            BinaryCasReader.PARAM_SOURCE_LOCATION, inputFolder,
            BinaryCasReader.PARAM_PATTERNS, "*.bin",
            BinaryCasReader.PARAM_ADD_DOCUMENT_METADATA, false);
    AnalysisEngineDescription multiplier = AnalysisEngineFactory.createEngineDescription(
            FoldClassificationUnitCasMultiplier.class,
            FoldClassificationUnitCasMultiplier.PARAM_REQUESTED_SPLITS, splitNum,
            FoldClassificationUnitCasMultiplier.PARAM_USE_SEQUENCES, isSequence);
    AnalysisEngineDescription binCasWriter = AnalysisEngineFactory.createEngineDescription(
            BinaryCasWriter.class,
            BinaryCasWriter.PARAM_TARGET_LOCATION, outputFolder.getAbsolutePath(),
            BinaryCasWriter.PARAM_FORMAT, "6+");
    AnalysisEngineDescription both = AnalysisEngineFactory.createEngineDescription(multiplier, binCasWriter);

    SimplePipeline.runPipeline(createReader, both);

    // final check - do we have at least as many folds as requested by "numFolds"?
    isNumberOfCasCreatedLargerEqualNumFolds(outputFolder, numFolds);
    return outputFolder;
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) File(java.io.File)

Aggregations

CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription)78 HashMap (java.util.HashMap)53 ParameterSpace (org.dkpro.lab.task.ParameterSpace)51 TcFeatureSet (org.dkpro.tc.api.features.TcFeatureSet)40 Map (java.util.Map)35 AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription)25 File (java.io.File)19 WekaAdapter (org.dkpro.tc.ml.weka.WekaAdapter)17 Test (org.junit.Test)14 ArrayList (java.util.ArrayList)13 LiblinearAdapter (org.dkpro.tc.ml.liblinear.LiblinearAdapter)9 NaiveBayes (weka.classifiers.bayes.NaiveBayes)9 ExternalResourceDescription (org.apache.uima.resource.ExternalResourceDescription)7 LibsvmAdapter (org.dkpro.tc.ml.libsvm.LibsvmAdapter)7 Gson (com.google.gson.Gson)6 Instance (org.dkpro.tc.api.features.Instance)6 JsonDataWriter (org.dkpro.tc.core.io.JsonDataWriter)6 XgboostAdapter (org.dkpro.tc.ml.xgboost.XgboostAdapter)6 JCasIterable (org.apache.uima.fit.pipeline.JCasIterable)5 JCas (org.apache.uima.jcas.JCas)5