Usage of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc (by dkpro):
class ExtractFeaturesConnectorTest, method extractFeaturesConnectorSingleLabelTest.
/**
 * Runs the feature-extraction connector over the single-label test corpus and verifies that the
 * JSON data writer produced one instance per input document (2) with a single distinct outcome.
 *
 * Fix: removed a leftover debug {@code System.out.println} that re-read the entire JSON output
 * file just to dump it to stdout; it duplicated the subsequent {@code readLines} call and is
 * absent from the otherwise-parallel multi-label test.
 */
@Test
public void extractFeaturesConnectorSingleLabelTest() throws Exception {
    File outputPath = folder.newFolder();
    // we do not need parameters here, but in case we do :)
    Object[] parameters = new Object[] { NoopFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123" };
    ExternalResourceDescription featureExtractor = ExternalResourceFactory.createExternalResourceDescription(NoopFeatureExtractor.class, parameters);
    List<ExternalResourceDescription> fes = new ArrayList<>();
    fes.add(featureExtractor);
    // Reader over the plain-text fixtures; each *.txt file becomes one CAS / one instance.
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestReaderSingleLabel.class, TestReaderSingleLabel.PARAM_SOURCE_LOCATION, "src/test/resources/data/*.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription featExtractorConnector = TaskUtils.getFeatureExtractorConnector(outputPath.getAbsolutePath(), JsonDataWriter.class.getName(), Constants.LM_REGRESSION, Constants.FM_DOCUMENT, false, false, false, false, Collections.emptyList(), fes, new String[] {});
    SimplePipeline.runPipeline(reader, segmenter, doc, featExtractorConnector);
    // The JSON writer emits one JSON-serialized Instance per line; parse them back.
    Gson gson = new Gson();
    List<String> lines = FileUtils.readLines(new File(outputPath, JsonDataWriter.JSON_FILE_NAME), "utf-8");
    List<Instance> instances = new ArrayList<>();
    for (String l : lines) {
        instances.add(gson.fromJson(l, Instance.class));
    }
    assertEquals(2, instances.size());
    // Single-label reader assigns the same outcome to every document.
    assertEquals(1, getUniqueOutcomes(instances));
}
Usage of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc (by dkpro):
class ExtractFeaturesConnectorTest, method extractFeaturesConnectorMultiLabelTest.
/**
 * Runs the feature-extraction connector over the multi-label test corpus and verifies that the
 * JSON data writer produced one instance per input document (2) covering three distinct outcomes.
 */
@Test
public void extractFeaturesConnectorMultiLabelTest() throws Exception {
    File outputDir = folder.newFolder();
    // The extractor needs no configuration, but we pass one parameter to exercise that path.
    Object[] extractorParams = new Object[] { NoopFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123" };
    List<ExternalResourceDescription> extractors = new ArrayList<>();
    extractors.add(ExternalResourceFactory.createExternalResourceDescription(NoopFeatureExtractor.class, extractorParams));
    // Assemble the pipeline: multi-label reader -> segmenter -> document-mode annotator -> connector.
    CollectionReaderDescription corpusReader = CollectionReaderFactory.createReaderDescription(TestReaderMultiLabel.class, TestReaderMultiLabel.PARAM_SOURCE_LOCATION, "src/test/resources/data/*.txt");
    AnalysisEngineDescription tokenizer = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription docMode = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription connector = TaskUtils.getFeatureExtractorConnector(outputDir.getAbsolutePath(), JsonDataWriter.class.getName(), Constants.LM_REGRESSION, Constants.FM_DOCUMENT, false, false, false, false, Collections.emptyList(), extractors, new String[] {});
    SimplePipeline.runPipeline(corpusReader, tokenizer, docMode, connector);
    // Each line of the output file is one JSON-serialized Instance; deserialize them all.
    File jsonFile = new File(outputDir, JsonDataWriter.JSON_FILE_NAME);
    Gson deserializer = new Gson();
    List<Instance> parsedInstances = new ArrayList<>();
    for (String jsonLine : FileUtils.readLines(jsonFile, "utf-8")) {
        parsedInstances.add(deserializer.fromJson(jsonLine, Instance.class));
    }
    assertEquals(2, parsedInstances.size());
    assertEquals(3, getUniqueOutcomes(parsedInstances));
}
Usage of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc (by dkpro):
class DeepLearning4jDocumentTrainTest, method getParameterSpace.
/**
 * Builds the parameter space for the DL4J document-classification experiment: a reader bundle
 * (train + test readers over line-wise text files) plus the feature/learning-mode and
 * deep-learning dimensions.
 *
 * @return the configured {@link ParameterSpace}
 * @throws ResourceInitializationException if a reader description cannot be created
 */
public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // Train/test setups use both readers; cross-validation only consumes the train reader.
    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(LinewiseTextReader.class, LinewiseTextReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain, LinewiseTextReader.PARAM_LANGUAGE, LANGUAGE_CODE, LinewiseTextReader.PARAM_PATTERNS, "/**/*.txt");
    CollectionReaderDescription testReader = CollectionReaderFactory.createReaderDescription(LinewiseTextReader.class, LinewiseTextReader.PARAM_SOURCE_LOCATION, corpusFilePathTest, LinewiseTextReader.PARAM_LANGUAGE, LANGUAGE_CODE, LinewiseTextReader.PARAM_PATTERNS, "/**/*.txt");
    Map<String, Object> readerBundle = new HashMap<String, Object>();
    readerBundle.put(DIM_READER_TRAIN, trainReader);
    readerBundle.put(DIM_READER_TEST, testReader);
    return new ParameterSpace(
            Dimension.createBundle("readers", readerBundle),
            Dimension.create(DIM_FEATURE_MODE, Constants.FM_DOCUMENT),
            Dimension.create(DIM_LEARNING_MODE, Constants.LM_SINGLE_LABEL),
            Dimension.create(DeepLearningConstants.DIM_USER_CODE, new Dl4jDocumentUserCode()),
            Dimension.create(DeepLearningConstants.DIM_MAXIMUM_LENGTH, 15),
            Dimension.create(DeepLearningConstants.DIM_VECTORIZE_TO_INTEGER, true),
            Dimension.create(DeepLearningConstants.DIM_PRETRAINED_EMBEDDINGS, "src/test/resources/wordvector/glove.6B.50d_250.txt"));
}
Usage of org.apache.uima.collection.CollectionReaderDescription in project webanno (by webanno):
class TeiReaderTest, method testTeiReader.
/**
 * Reads TEI documents from the classpath and checks the expected annotation counts
 * (tokens, POS tags, lemmas, named entities, sentences) and the first sentence's text.
 * Currently ignored because the TEI corpus cannot be open-sourced yet.
 */
@Test
@Ignore("No TEI yet to opensource ")
public void testTeiReader() throws Exception {
    String firstSentence = "70 I DAG.";
    CollectionReaderDescription teiReader = createReaderDescription(TeiReader.class, TeiReader.PARAM_LANGUAGE, "en", TeiReader.PARAM_SOURCE_LOCATION, "classpath:/local/", TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" });
    for (JCas jcas : new JCasIterable(teiReader)) {
        DocumentMetaData metaData = DocumentMetaData.get(jcas);
        String documentText = jcas.getDocumentText();
        System.out.printf("%s - %d%n", metaData.getDocumentId(), documentText.length());
        System.out.println(jcas.getDocumentLanguage());
        // Expected annotation counts for the (not yet open-sourced) TEI fixture.
        assertEquals(2235, JCasUtil.select(jcas, Token.class).size());
        assertEquals(745, JCasUtil.select(jcas, POS.class).size());
        assertEquals(745, JCasUtil.select(jcas, Lemma.class).size());
        assertEquals(0, JCasUtil.select(jcas, NamedEntity.class).size());
        assertEquals(30, JCasUtil.select(jcas, Sentence.class).size());
        Sentence first = JCasUtil.select(jcas, Sentence.class).iterator().next();
        assertEquals(firstSentence, first.getCoveredText());
    }
}
Usage of org.apache.uima.collection.CollectionReaderDescription in project dkpro-lab (by dkpro):
class SimpleExecutionEngine, method run.
/**
 * Executes a {@link UimaTask}: resolves its analysis-engine and reader descriptions, binds the
 * task context into the components, then drives the reader/engine loop one CAS at a time while
 * reporting progress, and finally records the life-cycle outcome (complete/fail/destroy).
 *
 * Fix: the engine's additional-parameter map ({@code addEngineParam}) was created but the
 * UIMA-context and resource-manager entries were mistakenly put into {@code addReaderParam}
 * a second time, so {@code engine.initialize(...)} received an empty map and the engine never
 * shared the context/resource manager. The entries now go into {@code addEngineParam}.
 *
 * @param aConfiguration the task to run; must be a {@link UimaTask}
 * @return the id of the task context the run executed in
 * @throws ExecutionException if the task is not a UimaTask or any non-life-cycle error occurs
 * @throws LifeCycleException if a life-cycle phase fails
 */
@Override
public String run(Task aConfiguration) throws ExecutionException, LifeCycleException {
    if (!(aConfiguration instanceof UimaTask)) {
        throw new ExecutionException("This engine can only execute [" + UimaTask.class.getName() + "]");
    }
    UimaTask configuration = (UimaTask) aConfiguration;
    // Create persistence service for injection into analysis components
    TaskContext ctx = contextFactory.createContext(aConfiguration);
    try {
        ResourceManager resMgr = newDefaultResourceManager();
        // Make sure the descriptor is fully resolved. It will be modified and
        // thus should not be modified again afterwards by UIMA.
        AnalysisEngineDescription analysisDesc = configuration.getAnalysisEngineDescription(ctx);
        analysisDesc.resolveImports(resMgr);
        if (analysisDesc.getMetaData().getName() == null) {
            analysisDesc.getMetaData().setName("Analysis for " + aConfiguration.getType());
        }
        // Scan components that accept the service and bind it to them
        bindResource(analysisDesc, TaskContext.class, TaskContextProvider.class, TaskContextProvider.PARAM_FACTORY_NAME, contextFactory.getId(), TaskContextProvider.PARAM_CONTEXT_ID, ctx.getId());
        // Set up UIMA context & logging
        Logger logger = new UimaLoggingAdapter(ctx);
        UimaContextAdmin uimaCtx = newUimaContext(logger, resMgr, newConfigurationManager());
        // Set up reader
        CollectionReaderDescription readerDesc = configuration.getCollectionReaderDescription(ctx);
        if (readerDesc.getMetaData().getName() == null) {
            readerDesc.getMetaData().setName("Reader for " + aConfiguration.getType());
        }
        Map<String, Object> addReaderParam = new HashMap<String, Object>();
        addReaderParam.put(Resource.PARAM_UIMA_CONTEXT, uimaCtx);
        addReaderParam.put(Resource.PARAM_RESOURCE_MANAGER, resMgr);
        CollectionReader reader = produceCollectionReader(readerDesc, resMgr, addReaderParam);
        // Set up analysis engine
        AnalysisEngine engine;
        if (analysisDesc.isPrimitive()) {
            engine = new PrimitiveAnalysisEngine_impl();
        } else {
            engine = new AggregateAnalysisEngine_impl();
        }
        // BUGFIX: these entries previously went into addReaderParam, leaving addEngineParam
        // empty — the engine was initialized without the shared UIMA context/resource manager.
        Map<String, Object> addEngineParam = new HashMap<String, Object>();
        addEngineParam.put(Resource.PARAM_UIMA_CONTEXT, uimaCtx);
        addEngineParam.put(Resource.PARAM_RESOURCE_MANAGER, resMgr);
        engine.initialize(analysisDesc, addEngineParam);
        // Now the setup is complete
        ctx.getLifeCycleManager().initialize(ctx, aConfiguration);
        // Start recording
        ctx.getLifeCycleManager().begin(ctx, aConfiguration);
        // Run the experiment: apply the engine to all documents provided by the reader,
        // reusing a single CAS across iterations.
        List<ResourceMetaData> metaData = new ArrayList<ResourceMetaData>();
        metaData.add(reader.getMetaData());
        metaData.add(engine.getMetaData());
        CAS cas = CasCreationUtils.createCas(metaData);
        while (reader.hasNext()) {
            reader.getNext(cas);
            engine.process(cas);
            // Capture the document title (if the type system defines one) for progress messages.
            String documentTitle = "";
            Feature documentTitleFeature = cas.getDocumentAnnotation().getType().getFeatureByBaseName("documentTitle");
            if (documentTitleFeature != null) {
                documentTitle = cas.getDocumentAnnotation().getFeatureValueAsString(documentTitleFeature);
            }
            cas.reset();
            Progress[] progresses = reader.getProgress();
            if (progresses != null) {
                for (Progress p : progresses) {
                    ctx.message("Progress " + readerDesc.getImplementationName() + " " + p.getCompleted() + "/" + p.getTotal() + " " + p.getUnit() + " " + "(" + documentTitle + ")");
                }
            }
        }
        // Shut down engine and reader
        engine.collectionProcessComplete();
        reader.close();
        engine.destroy();
        reader.destroy();
        // End recording
        ctx.getLifeCycleManager().complete(ctx, aConfiguration);
        return ctx.getId();
    } catch (LifeCycleException e) {
        ctx.getLifeCycleManager().fail(ctx, aConfiguration, e);
        throw e;
    } catch (Throwable e) {
        // Preserve the cause when wrapping unexpected failures.
        ctx.getLifeCycleManager().fail(ctx, aConfiguration, e);
        throw new ExecutionException(e);
    } finally {
        if (ctx != null) {
            ctx.getLifeCycleManager().destroy(ctx, aConfiguration);
        }
    }
}
Aggregations