Use of org.apache.uima.collection.CollectionReaderDescription in project webanno (by webanno):
class WebAnnoTsv3ReaderWriterRoundTripTest, method runTest.
@Test
public void runTest() throws Exception {
// Round-trip test: read a reference TSV, write it back out, and compare the
// serialized output byte-for-byte against the reference file.
// Merge the globally auto-detected type system with a test-local one. A
// per-test "typesystem.xml" next to the reference data takes precedence over
// the shared fallback type system.
TypeSystemDescription global = TypeSystemDescriptionFactory.createTypeSystemDescription();
TypeSystemDescription local;
if (new File(referenceFolder, "typesystem.xml").exists()) {
local = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath(new File(referenceFolder, "typesystem.xml").toString());
} else {
local = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath("src/test/resources/desc/type/webannoTestTypes.xml");
}
TypeSystemDescription merged = CasCreationUtils.mergeTypeSystems(asList(global, local));
// Each reference folder gets its own output directory so parallel test cases
// cannot clobber each other's results.
String targetFolder = "target/test-output/WebAnnoTsv3ReaderWriterRoundTripTest/" + referenceFolder.getName();
// Reader consumes exactly the "reference.tsv" file from the reference folder.
CollectionReaderDescription reader = createReaderDescription(WebannoTsv3Reader.class, merged, WebannoTsv3Reader.PARAM_SOURCE_LOCATION, referenceFolder, WebannoTsv3Reader.PARAM_PATTERNS, "reference.tsv");
// Sanity-checks the CAS against DKPro Core conventions before writing.
AnalysisEngineDescription checker = createEngineDescription(DKProCoreConventionsChecker.class);
// WebannoTsv3Writer doesn't seem to like it if both "SimpleLinkHost" and
// "ComplexLinkHost" are declared, so I comment out "ComplexLinkHost" which has
// less tests.
// NOTE(review): the writer configuration below enumerates every layer kind
// (chain, slot, span, link, relation) that may occur in the reference data;
// the lists must stay in sync with the test type system.
AnalysisEngineDescription tsvWriter = createEngineDescription(WebannoTsv3Writer.class, merged, WebannoTsv3Writer.PARAM_TARGET_LOCATION, targetFolder, WebannoTsv3Writer.PARAM_STRIP_EXTENSION, true, WebannoTsv3Writer.PARAM_CHAIN_LAYERS, asList("webanno.custom.Simple"), WebannoTsv3Writer.PARAM_SLOT_FEATS, asList("webanno.custom.SimpleLinkHost:links"), WebannoTsv3Writer.PARAM_SPAN_LAYERS, asList(NamedEntity.class.getName(), MorphologicalFeatures.class.getName(), POS.class.getName(), Lemma.class.getName(), Stem.class.getName(), "webanno.custom.SimpleSpan", "webanno.custom.SimpleLinkHost"), WebannoTsv3Writer.PARAM_LINK_TYPES, asList("webanno.custom.LinkType"), WebannoTsv3Writer.PARAM_SLOT_TARGETS, asList("webanno.custom.SimpleSpan"), WebannoTsv3Writer.PARAM_RELATION_LAYERS, asList("webanno.custom.SimpleRelation", "webanno.custom.Relation", "webanno.custom.ComplexRelation", Dependency.class.getName()));
// Also dump an XMI version of the CAS for manual inspection during development.
AnalysisEngineDescription xmiWriter = createEngineDescription(XmiWriter.class, merged, XmiWriter.PARAM_TARGET_LOCATION, targetFolder, XmiWriter.PARAM_STRIP_EXTENSION, true);
try {
SimplePipeline.runPipeline(reader, checker, tsvWriter, xmiWriter);
} catch (Throwable e) {
// If the pipeline itself blows up on a known-bad case, skip instead of fail;
// otherwise rethrow so real regressions surface.
assumeFalse("This test is known to fail.", isKnownToFail(referenceFolder.getName()));
throw e;
}
String reference = FileUtils.readFileToString(new File(referenceFolder, "reference.tsv"), "UTF-8");
String actual = FileUtils.readFileToString(new File(targetFolder, "reference.tsv"), "UTF-8");
//
// The XMI files here are not compared semantically but using their serialization which
// is subject to minor variations depending e.g. on the order in which annotation are
// created in the CAS. Thus, this code is commented out and should only be used on a
// case-by-case base to compare XMIs during development.
//
// String referenceXmi = FileUtils.readFileToString(new File(referenceFolder,
// "reference.xmi"),
// "UTF-8");
//
// String actualXmi = FileUtils.readFileToString(new File(targetFolder, "reference.xmi"),
// "UTF-8");
// Skip the comparison (not just pipeline errors) for cases known to fail.
assumeFalse("This test is known to fail.", isKnownToFail(referenceFolder.getName()));
assertEquals(reference, actual);
// assertEquals(referenceXmi, actualXmi);
}
Use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc (by dkpro):
class DeepLearningDl4jSeq2SeqTrainTest, method getParameterSpace.
/**
 * Assembles the parameter space for the DL4J sequence-to-sequence train/test run:
 * TEI readers for the train and test corpora plus the deep-learning dimensions
 * (sequence feature mode, single-label learning, pre-trained GloVe embeddings,
 * and the user-supplied DL4J network code).
 *
 * @return the fully configured {@link ParameterSpace}
 * @throws ResourceInitializationException if a reader description cannot be created
 */
public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // Bundle the train and test readers into the reader dimension.
    Map<String, Object> readerDimension = new HashMap<String, Object>();
    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(
            TeiReader.class,
            TeiReader.PARAM_LANGUAGE, "en",
            TeiReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain,
            TeiReader.PARAM_PATTERNS, "*.xml");
    readerDimension.put(DIM_READER_TRAIN, trainReader);
    CollectionReaderDescription testReader = CollectionReaderFactory.createReaderDescription(
            TeiReader.class,
            TeiReader.PARAM_LANGUAGE, "en",
            TeiReader.PARAM_SOURCE_LOCATION, corpusFilePathTest,
            TeiReader.PARAM_PATTERNS, "*.xml");
    readerDimension.put(DIM_READER_TEST, testReader);
    // Combine readers with the learning-mode and deep-learning dimensions.
    return new ParameterSpace(
            Dimension.createBundle("readers", readerDimension),
            Dimension.create(DIM_FEATURE_MODE, Constants.FM_SEQUENCE),
            Dimension.create(DIM_LEARNING_MODE, Constants.LM_SINGLE_LABEL),
            Dimension.create(DeepLearningConstants.DIM_PRETRAINED_EMBEDDINGS, "src/test/resources/wordvector/glove.6B.50d_250.txt"),
            Dimension.create(DeepLearningConstants.DIM_VECTORIZE_TO_INTEGER, false),
            Dimension.create(DeepLearningConstants.DIM_USE_ONLY_VOCABULARY_COVERED_BY_EMBEDDING, true),
            Dimension.create(DeepLearningConstants.DIM_USER_CODE, new Dl4jSeq2SeqUserCode()));
}
Use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc (by dkpro):
class LiblinearSaveAndLoadModelDocumentRegression, method regressionGetParameterSpace.
/**
 * Builds the parameter space for the Liblinear document-mode regression experiment:
 * a line-wise essay reader (outcome in column 0, text in column 1), a Liblinear
 * classifier configured for regression ("-s", "6"), and a three-extractor feature set.
 *
 * @return the configured {@link ParameterSpace}
 * @throws Exception if the reader description cannot be created
 */
private ParameterSpace regressionGetParameterSpace() throws Exception {
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    // FIX: the original call passed LinewiseTextOutcomeReader.PARAM_LANGUAGE, "en"
    // twice; the redundant trailing pair has been removed.
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(
            LinewiseTextOutcomeReader.class,
            LinewiseTextOutcomeReader.PARAM_OUTCOME_INDEX, 0,
            LinewiseTextOutcomeReader.PARAM_TEXT_INDEX, 1,
            LinewiseTextOutcomeReader.PARAM_LANGUAGE, "en",
            LinewiseTextOutcomeReader.PARAM_SOURCE_LOCATION, "src/main/resources/data/essays/train/essay_train.txt");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    // Liblinear solver 6: L1-regularized logistic regression (per Liblinear's "-s" option).
    @SuppressWarnings("unchecked")
    Dimension<List<Object>> dimClassificationArgs = Dimension.create(DIM_CLASSIFICATION_ARGS,
            Arrays.asList(new Object[] { new LiblinearAdapter(), "-s", "6" }));
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET,
            new TcFeatureSet(
                    TcFeatureFactory.create(SentenceRatioPerDocument.class),
                    TcFeatureFactory.create(WordNGram.class),
                    TcFeatureFactory.create(TokenRatioPerDocument.class)));
    ParameterSpace pSpace = new ParameterSpace(
            Dimension.createBundle("readers", dimReaders),
            Dimension.create(DIM_LEARNING_MODE, LM_REGRESSION),
            Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
            dimFeatureSets,
            dimClassificationArgs);
    return pSpace;
}
Use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc (by dkpro):
class SVMHMMSaveAndLoadModelTest, method getParameterSpace.
/**
 * Builds the parameter space for the SVM-HMM save/load model test: a Brown-corpus
 * training reader, a word-n-gram + token-ratio feature set, and the SVM-HMM
 * machine-learning adapter configuration.
 *
 * @return the configured {@link ParameterSpace}
 * @throws ResourceInitializationException if the reader description cannot be created
 */
private ParameterSpace getParameterSpace() throws ResourceInitializationException {
    DemoUtils.setDkproHome(this.getClass().getName());
    String trainFolder = "src/main/resources/data/brown_tei/";
    // configure training and test data reader dimension
    // train/test will use both, while cross-validation will only use the
    // train part
    Map<String, Object> dimReaders = new HashMap<String, Object>();
    // FIX: the original call passed BrownCorpusReader.PARAM_LANGUAGE, "en" twice;
    // the redundant second pair has been removed.
    CollectionReaderDescription readerTrain = CollectionReaderFactory.createReaderDescription(
            BrownCorpusReader.class,
            BrownCorpusReader.PARAM_LANGUAGE, "en",
            BrownCorpusReader.PARAM_SOURCE_LOCATION, trainFolder,
            BrownCorpusReader.PARAM_PATTERNS, "*.xml");
    dimReaders.put(DIM_READER_TRAIN, readerTrain);
    Dimension<TcFeatureSet> dimFeatureSets = Dimension.create(DIM_FEATURE_SET,
            new TcFeatureSet(
                    TcFeatureFactory.create(WordNGram.class,
                            WordNGram.PARAM_NGRAM_USE_TOP_K, 50,
                            WordNGram.PARAM_NGRAM_MIN_N, 1,
                            WordNGram.PARAM_NGRAM_MAX_N, 3),
                    TcFeatureFactory.create(TokenRatioPerDocument.class)));
    // Renamed from the misleading "wekaConfig" — this bundle configures the
    // SVM-HMM adapter, not Weka.
    Map<String, Object> svmHmmConfig = new HashMap<>();
    svmHmmConfig.put(DIM_CLASSIFICATION_ARGS, new Object[] { new SvmHmmAdapter() });
    svmHmmConfig.put(DIM_DATA_WRITER, new SvmHmmAdapter().getDataWriterClass().getName());
    svmHmmConfig.put(DIM_FEATURE_USE_SPARSE, new SvmHmmAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", svmHmmConfig);
    ParameterSpace pSpace = new ParameterSpace(
            Dimension.createBundle("readers", dimReaders),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_SEQUENCE),
            dimFeatureSets,
            mlas);
    return pSpace;
}
Use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc (by dkpro):
class WekaComplexConfigurationSingleDemo, method getParameterSpace.
/**
 * Assembles the parameter space for the complex Weka demo: train/test readers,
 * three alternative Weka classifier configurations (SMO with a polynomial kernel,
 * RandomForest, and Bagging over J48), two feature sets, and Weka-specific
 * single-label feature selection reducing the feature set to the top 10.
 *
 * @return the fully configured {@link ParameterSpace}
 * @throws ResourceInitializationException if a reader description cannot be created
 */
public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // Reader dimension: one reader each for training and test data.
    Map<String, Object> readerDimension = new HashMap<String, Object>();
    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(
            FolderwiseDataReader.class,
            FolderwiseDataReader.PARAM_SOURCE_LOCATION, CORPUS_FILEPATH_TRAIN,
            FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE,
            FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    readerDimension.put(DIM_READER_TRAIN, trainReader);
    CollectionReaderDescription testReader = CollectionReaderFactory.createReaderDescription(
            FolderwiseDataReader.class,
            FolderwiseDataReader.PARAM_SOURCE_LOCATION, COPRUS_FILEPATH_TEST,
            FolderwiseDataReader.PARAM_LANGUAGE, LANGUAGE_CODE,
            FolderwiseDataReader.PARAM_PATTERNS, "*/*.txt");
    readerDimension.put(DIM_READER_TEST, testReader);
    // Classifier alternative 1: SMO with a degree-2 polynomial kernel.
    Map<String, Object> smoConfig = new HashMap<>();
    smoConfig.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), SMO.class.getName(), "-C", "1.0", "-K", PolyKernel.class.getName() + " " + "-C -1 -E 2" });
    smoConfig.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    smoConfig.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    // Classifier alternative 2: RandomForest with 5 trees.
    Map<String, Object> randomForestConfig = new HashMap<>();
    randomForestConfig.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), RandomForest.class.getName(), "-I", "5" });
    randomForestConfig.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    randomForestConfig.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    // Classifier alternative 3: Bagging (2 iterations) over J48 decision trees.
    Map<String, Object> baggingConfig = new HashMap<>();
    baggingConfig.put(DIM_CLASSIFICATION_ARGS, new Object[] { new WekaAdapter(), Bagging.class.getName(), "-I", "2", "-W", J48.class.getName(), "--", "-C", "0.5", "-M", "2" });
    baggingConfig.put(DIM_DATA_WRITER, new WekaAdapter().getDataWriterClass().getName());
    baggingConfig.put(DIM_FEATURE_USE_SPARSE, new WekaAdapter().useSparseFeatures());
    Dimension<Map<String, Object>> mlas = Dimension.createBundle("config", smoConfig, randomForestConfig, baggingConfig);
    // Two feature-set alternatives: token ratio + word n-grams, and word n-grams alone.
    Dimension<TcFeatureSet> featureSetDimension = Dimension.create(DIM_FEATURE_SET,
            new TcFeatureSet(
                    TcFeatureFactory.create(TokenRatioPerDocument.class),
                    TcFeatureFactory.create(WordNGram.class, WordNGram.PARAM_NGRAM_USE_TOP_K, 50, WordNGram.PARAM_NGRAM_MIN_N, 1, WordNGram.PARAM_NGRAM_MAX_N, 3)),
            new TcFeatureSet(
                    TcFeatureFactory.create(WordNGram.class, WordNGram.PARAM_NGRAM_USE_TOP_K, 50, WordNGram.PARAM_NGRAM_MIN_N, 1, WordNGram.PARAM_NGRAM_MAX_N, 3)));
    // Weka-specific single-label feature selection: keep the top 10 attributes
    // ranked by information gain.
    Map<String, Object> featureSelection = new HashMap<String, Object>();
    featureSelection.put(DIM_FEATURE_SEARCHER_ARGS, asList(new String[] { Ranker.class.getName(), "-N", "10" }));
    featureSelection.put(DIM_ATTRIBUTE_EVALUATOR_ARGS, asList(new String[] { InfoGainAttributeEval.class.getName() }));
    featureSelection.put(DIM_APPLY_FEATURE_SELECTION, true);
    return new ParameterSpace(
            Dimension.createBundle("readers", readerDimension),
            Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
            Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT),
            featureSetDimension,
            mlas,
            Dimension.createBundle("featureSelection", featureSelection));
}
Aggregations