use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.
the class WekaSaveAndLoadModelUnitTest method unitGetParameterSpace.
/**
 * Assembles the parameter space for the unit-mode Weka save/load test.
 *
 * <p>The space consists of a "readers" bundle holding a {@link BrownCorpusReader} description
 * for {@code unitTrainFolder} (language "en", pattern "*.xml"), the single-label learning mode,
 * the unit feature mode, one feature set containing {@link CharacterNGram} with
 * {@code PARAM_NGRAM_USE_TOP_K} set to 20, and a "config" bundle that selects {@link SMO}
 * through a {@link WekaAdapter}.
 *
 * @return the assembled parameter space
 * @throws ResourceInitializationException
 *             if the reader description cannot be created
 */
private static ParameterSpace unitGetParameterSpace() throws ResourceInitializationException {
    // Reader bundle: only a training reader is registered.
    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(
            BrownCorpusReader.class,
            BrownCorpusReader.PARAM_SOURCE_LOCATION, unitTrainFolder,
            BrownCorpusReader.PARAM_LANGUAGE, "en",
            BrownCorpusReader.PARAM_PATTERNS, Arrays.asList("*.xml"));
    Map<String, Object> readers = new HashMap<>();
    readers.put(DIM_READER_TRAIN, trainReader);

    // Machine-learning bundle: classifier arguments plus the adapter-provided writer/sparsity settings.
    Map<String, Object> classifierSettings = new HashMap<>();
    classifierSettings.put(DIM_CLASSIFICATION_ARGS,
            new Object[] { new WekaAdapter(), SMO.class.getName() });
    classifierSettings.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    classifierSettings.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> classifierDimension = Dimension.createBundle("config",
            classifierSettings);

    // Feature-set dimension with a single character n-gram extractor.
    TcFeatureSet featureSet = new TcFeatureSet(
            TcFeatureFactory.create(CharacterNGram.class, CharacterNGram.PARAM_NGRAM_USE_TOP_K, 20));
    Dimension<TcFeatureSet> featureSetDimension = Dimension.create(DIM_FEATURE_SET, featureSet);

    return new ParameterSpace(
            Dimension.createBundle("readers", readers),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_UNIT),
            featureSetDimension,
            classifierDimension);
}
use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.
the class WordNGramMetaCollectorTest method emptyDocumentTest.
/**
 * Smoke test: runs the {@link WordNGramMC} meta collector over the files matching
 * "empty*.txt" in "src/test/resources/empty/". There are no assertions; the test passes when
 * the whole pipeline can be iterated without an exception being thrown.
 *
 * @throws Exception
 *             if the temporary folder or any pipeline component fails
 */
@SuppressWarnings("unused")
@Test
public void emptyDocumentTest() throws Exception {
    // Fresh temporary directory that receives the meta collector's output.
    File targetDir = folder.newFolder();

    CollectionReaderDescription textReader = CollectionReaderFactory.createReaderDescription(
            TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/empty/",
            TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "empty*.txt");

    AnalysisEngineDescription tokenizer = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription documentMode = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription nGramCollector = AnalysisEngineFactory.createEngineDescription(
            WordNGramMC.class,
            WordNGramMC.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            WordNGramMC.PARAM_TARGET_LOCATION, targetDir);

    JCasIterable pipeline = new JCasIterable(textReader, tokenizer, documentMode, nGramCollector);
    for (JCas jcas : pipeline) {
        // Intentionally empty: the loop exists only to iterate the pipeline to exhaustion.
    }
}
use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.
the class FoldUtil method createMinimalSplit.
/**
 * Splits the available CAS into additional, smaller CAS so that enough of them exist to run a
 * cross-validation with the requested number of folds.
 *
 * <p>A rule-of-thumb split factor is computed as {@code ceil(numFolds / numAvailableJCas)}.
 * The binary CAS files ("*.bin") found in the input folder are passed through a
 * {@link FoldClassificationUnitCasMultiplier} configured with that factor, and the results are
 * written in binary format "6+" to the sub-folder "output" of the input folder. At the end,
 * the number of created CAS is compared to the requested number of folds and an exception is
 * thrown if too few CAS were created.
 *
 * @param inputFolder
 *            the input folder containing the binary CAS files
 * @param numFolds
 *            number of folds to create
 * @param numAvailableJCas
 *            number of CAS currently available
 * @param isSequence
 *            whether a sequence model is used
 * @return the folder holding the newly written CAS
 * @throws Exception
 *             if not enough data is available for creating the required number of folds
 */
public static File createMinimalSplit(String inputFolder, int numFolds, int numAvailableJCas, boolean isSequence) throws Exception {
    File outputFolder = new File(inputFolder, "output");

    // How many pieces each available CAS is requested to be split into (rounded up).
    int splitsPerCas = (int) Math.ceil((double) numFolds / numAvailableJCas);

    CollectionReaderDescription binaryReader = CollectionReaderFactory.createReaderDescription(
            BinaryCasReader.class,
            BinaryCasReader.PARAM_SOURCE_LOCATION, inputFolder,
            BinaryCasReader.PARAM_PATTERNS, "*.bin",
            BinaryCasReader.PARAM_ADD_DOCUMENT_METADATA, false);

    AnalysisEngineDescription casMultiplier = AnalysisEngineFactory.createEngineDescription(
            FoldClassificationUnitCasMultiplier.class,
            FoldClassificationUnitCasMultiplier.PARAM_REQUESTED_SPLITS, splitsPerCas,
            FoldClassificationUnitCasMultiplier.PARAM_USE_SEQUENCES, isSequence);

    AnalysisEngineDescription binaryWriter = AnalysisEngineFactory.createEngineDescription(
            BinaryCasWriter.class,
            BinaryCasWriter.PARAM_TARGET_LOCATION, outputFolder.getAbsolutePath(),
            BinaryCasWriter.PARAM_FORMAT, "6+");

    // Multiplier and writer are combined into one aggregate engine before the pipeline is run.
    AnalysisEngineDescription splitAndWrite = AnalysisEngineFactory
            .createEngineDescription(casMultiplier, binaryWriter);
    SimplePipeline.runPipeline(binaryReader, splitAndWrite);

    // Final check: were at least "numFolds" CAS created?
    isNumberOfCasCreatedLargerEqualNumFolds(outputFolder, numFolds);

    return outputFolder;
}
Aggregations