Use of org.apache.uima.collection.CollectionReaderDescription in the dkpro-tc project (by dkpro).
The following is the method documentGetParameterSpaceSingleLabel of the class LibsvmSaveAndLoadModelDocumentSingleLabelTest.
/**
 * Builds a {@link ParameterSpace} for a single-label document-classification setup backed by
 * the LIBSVM adapter.
 *
 * @param useClassificationArguments
 *            if {@code true}, the classifier is configured with an explicit cost parameter
 *            ({@code -c 100}); otherwise only the bare adapter is passed and LIBSVM defaults
 *            apply
 * @return the assembled parameter space (train reader, learning mode, feature mode, classifier
 *         configuration bundle, and feature set)
 * @throws ResourceInitializationException
 *             if the train reader description cannot be created
 */
private ParameterSpace documentGetParameterSpaceSingleLabel(boolean useClassificationArguments) throws ResourceInitializationException {
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(FolderwiseDataReader.class, FolderwiseDataReader.PARAM_SOURCE_LOCATION, documentTrainFolder, FolderwiseDataReader.PARAM_LANGUAGE, "en", FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    // Only the classification arguments differ between the two variants; build them once
    // instead of constructing (and, in the else-branch, discarding) a whole config map twice
    // as the original code did.
    Object[] classificationArgs = useClassificationArguments
            ? new Object[] { new LibsvmAdapter(), "-c", "100" }
            : new Object[] { new LibsvmAdapter() };
    Map<String, Object> config = new HashMap<>();
    config.put(DIM_CLASSIFICATION_ARGS, classificationArgs);
    config.put(DIM_DATA_WRITER, new LibsvmAdapter().getDataWriterClass().getName());
    config.put(DIM_FEATURE_USE_SPARSE, new LibsvmAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", config);
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET, new TcFeatureSet(TcFeatureFactory.create(TokenRatioPerDocument.class), TcFeatureFactory.create(WordNGram.class, WordNGram.PARAM_NGRAM_USE_TOP_K, 50, WordNGram.PARAM_NGRAM_MIN_N, 1, WordNGram.PARAM_NGRAM_MAX_N, 3)));
    // NOTE(review): the original passed the mlas/dimFeatureSets dimensions in a different
    // order per branch; dimension order is assumed not to affect the explored space for
    // single-valued dimensions — confirm against dkpro-lab's ParameterSpace semantics.
    return new ParameterSpace(Dimension.createBundle("readers", dimReaders), Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL), Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT), mlas, dimFeatureSets);
}
Use of org.apache.uima.collection.CollectionReaderDescription in the dkpro-tc project (by dkpro).
The following is the method luceneNGramFeatureExtractorNonDefaultFrequencyThresholdTest of the class WordNGramTest.
@Test
public void luceneNGramFeatureExtractorNonDefaultFrequencyThresholdTest() throws Exception {
    File luceneFolder = folder.newFolder();
    File outputPath = folder.newFolder();
    // Configure the extractor with a non-default frequency threshold; with this setting no
    // n-gram feature is expected to survive filtering, so instances end up feature-less.
    Object[] ngramConfig = new Object[] { WordNGram.PARAM_NGRAM_USE_TOP_K, "3", WordNGram.PARAM_UNIQUE_EXTRACTOR_NAME, "123", WordNGram.PARAM_SOURCE_LOCATION, luceneFolder.toString(), WordNGram.PARAM_NGRAM_FREQ_THRESHOLD, "0.1f", WordNGramMC.PARAM_TARGET_LOCATION, luceneFolder.toString() };
    List<Object> ngramConfigList = new ArrayList<Object>(Arrays.asList(ngramConfig));

    CollectionReaderDescription metaReader = getMetaReader();
    AnalysisEngineDescription tokenizer = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription ngramMetaCollector = AnalysisEngineFactory.createEngineDescription(WordNGramMC.class, ngramConfigList.toArray());

    ExternalResourceDescription ngramExtractor = ExternalResourceFactory.createExternalResourceDescription(WordNGram.class, ngramConfig);
    List<ExternalResourceDescription> extractors = new ArrayList<>();
    extractors.add(ngramExtractor);
    AnalysisEngineDescription extractionConnector = TaskUtils.getFeatureExtractorConnector(outputPath.getAbsolutePath(), JsonDataWriter.class.getName(), Constants.LM_SINGLE_LABEL, Constants.FM_DOCUMENT, false, false, false, false, Collections.emptyList(), extractors, new String[] {});

    // First pass: collect the n-gram statistics (meta collection).
    SimplePipeline.runPipeline(metaReader, tokenizer, ngramMetaCollector);
    // Second pass: run the actual feature extraction against the collected statistics.
    SimplePipeline.runPipeline(metaReader, tokenizer, extractionConnector);

    List<Instance> instances = readInstances(outputPath);
    assertEquals(4, instances.size());
    assertEquals(1, getUniqueOutcomes(instances));
    // The aggressive frequency threshold must have filtered out every feature.
    instances.forEach(instance -> assertTrue(instance.getFeatures().isEmpty()));
}
Use of org.apache.uima.collection.CollectionReaderDescription in the dkpro-tc project (by dkpro).
The following is the method luceneNgramMetaCollectorTest of the class WordNGramMetaCollectorTest.
@Test
public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class, TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en", TextReader.PARAM_PATTERNS, "text*.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(WordNGramMC.class, WordNGramMC.PARAM_TARGET_LOCATION, tmpDir, WordNGramMC.PARAM_UNIQUE_EXTRACTOR_NAME, UNIQUE_FEATURE_NAME);
    // Drive the pipeline document-by-document so the meta collector writes its Lucene index.
    for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) {
        System.out.println(jcas.getDocumentText().length());
    }
    // Count the terms in the n-gram field and spot-check the frequencies of one known term.
    int i = 0;
    // try-with-resources: the original never closed the IndexReader, leaking the underlying
    // directory/file handles.
    try (IndexReader index = DirectoryReader.open(FSDirectory.open(tmpDir))) {
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(WordNGram.LUCENE_NGRAM_FIELD + UNIQUE_FEATURE_NAME);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        // Preserve the original behavior of wrapping any index-access failure.
        throw new ResourceInitializationException(e);
    }
    assertEquals(35, i);
}
Use of org.apache.uima.collection.CollectionReaderDescription in the dkpro-tc project (by dkpro).
The following is the method runPipeline of the class PPipelineTestBase.
/**
 * Runs meta collection and feature extraction over the test pair data and loads the written
 * JSON instances into {@code instanceList}/{@code outcomeList}/{@code featureNames}.
 */
protected void runPipeline() throws Exception {
    List<Object> parameterValues = new ArrayList<Object>(Arrays.asList(parameters));
    CollectionReaderDescription pairReader = CollectionReaderFactory.createReaderDescription(TestPairReader.class, TestPairReader.PARAM_INPUT_FILE, setTestPairsLocation());

    // Each preprocessing engine is applied to both views of the pair, in this exact order:
    // segmenter, pair annotator, stemmer, lemmatizer, POS tagger (PART_ONE before PART_TWO).
    AnalysisEngineDescription[] engines = new AnalysisEngineDescription[] {
            AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
            AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR),
            AnalysisEngineFactory.createEngineDescription(SnowballStemmer.class),
            AnalysisEngineFactory.createEngineDescription(MorphaLemmatizer.class),
            AnalysisEngineFactory.createEngineDescription(OpenNlpPosTagger.class) };
    AggregateBuilder builder = new AggregateBuilder();
    for (AnalysisEngineDescription engine : engines) {
        builder.add(engine, Constants.INITIAL_VIEW, Constants.PART_ONE);
        builder.add(engine, Constants.INITIAL_VIEW, Constants.PART_TWO);
    }

    // These populate the metaCollector / featExtractorConnector fields used below.
    getMetaCollector(parameterValues);
    getFeatureExtractorCollector(parameterValues);

    // First pass: meta collection.
    SimplePipeline.runPipeline(pairReader, builder.createAggregateDescription(), metaCollector);
    // Second pass: feature extraction.
    SimplePipeline.runPipeline(pairReader, builder.createAggregateDescription(), featExtractorConnector);

    // Deserialize the written instances and derive outcome/feature bookkeeping.
    Gson gson = new Gson();
    List<String> jsonLines = FileUtils.readLines(new File(outputPath, JsonDataWriter.JSON_FILE_NAME), "utf-8");
    for (String jsonLine : jsonLines) {
        instanceList.add(gson.fromJson(jsonLine, Instance.class));
    }
    assertEquals(1, jsonLines.size());
    assertEquals(1, getUniqueOutcomes(instanceList).size());
    featureNames = getFeatureNames(instanceList);
    for (Instance instance : instanceList) {
        outcomeList.add(instance.getOutcomes());
    }
}
Use of org.apache.uima.collection.CollectionReaderDescription in the dkpro-tc project (by dkpro).
The following is the method runMetaCollection of the class LuceneMetaCollectionBasedFeatureTestBase.
/**
 * Runs the given meta collector over the documents produced by {@link #getMetaReader()},
 * segmenting each document first so token annotations are available.
 * The {@code luceneFolder} parameter is kept for interface compatibility; the collector is
 * expected to be pre-configured with its target location.
 */
protected void runMetaCollection(File luceneFolder, AnalysisEngineDescription metaCollector) throws Exception {
    AnalysisEngineDescription tokenizer = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    SimplePipeline.runPipeline(getMetaReader(), tokenizer, metaCollector);
}
Aggregations