Search in sources:

Example 41 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

Class ExtractFeaturesConnectorTest, method extractFeaturesConnectorSingleLabelTest.

/**
 * Runs reader, segmenter, document-mode annotator and the feature-extractor
 * connector over the text files under {@code src/test/resources/data}, then reads the
 * JSON file written by {@link JsonDataWriter} back in and checks that two instances
 * with a single distinct outcome were produced.
 */
@Test
public void extractFeaturesConnectorSingleLabelTest() throws Exception {
    File targetDir = folder.newFolder();

    // Parameters are not strictly needed here; the array is kept so that more can be
    // added easily. It is passed as an explicit array on purpose.
    Object[] extractorParams = { NoopFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123" };
    ExternalResourceDescription noopExtractor = ExternalResourceFactory
            .createExternalResourceDescription(NoopFeatureExtractor.class, extractorParams);
    List<ExternalResourceDescription> extractors = new ArrayList<>();
    extractors.add(noopExtractor);

    // Pipeline components, created in the order in which they run.
    CollectionReaderDescription readerDesc = CollectionReaderFactory.createReaderDescription(
            TestReaderSingleLabel.class,
            TestReaderSingleLabel.PARAM_SOURCE_LOCATION, "src/test/resources/data/*.txt");
    AnalysisEngineDescription segmenterDesc = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription docModeDesc = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription connectorDesc = TaskUtils.getFeatureExtractorConnector(
            targetDir.getAbsolutePath(), JsonDataWriter.class.getName(),
            Constants.LM_REGRESSION, Constants.FM_DOCUMENT,
            false, false, false, false,
            Collections.emptyList(), extractors, new String[] {});

    SimplePipeline.runPipeline(readerDesc, segmenterDesc, docModeDesc, connectorDesc);

    Gson jsonParser = new Gson();
    File jsonFile = new File(targetDir, JsonDataWriter.JSON_FILE_NAME);

    // Dump the raw writer output to the console before parsing it.
    System.out.println(FileUtils.readFileToString(jsonFile, "utf-8"));

    // Each line of the output file is deserialized into one Instance.
    List<Instance> parsed = new ArrayList<>();
    for (String jsonLine : FileUtils.readLines(jsonFile, "utf-8")) {
        Instance instance = jsonParser.fromJson(jsonLine, Instance.class);
        parsed.add(instance);
    }

    assertEquals(2, parsed.size());
    assertEquals(1, getUniqueOutcomes(parsed));
}
Also used : JsonDataWriter(org.dkpro.tc.core.io.JsonDataWriter) Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) Gson(com.google.gson.Gson) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) File(java.io.File) ExternalResourceDescription(org.apache.uima.resource.ExternalResourceDescription) Test(org.junit.Test)

Example 42 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

Class ExtractFeaturesConnectorTest, method extractFeaturesConnectorMultiLabelTest.

/**
 * Multi-label counterpart of the single-label connector test: runs the same pipeline
 * with {@code TestReaderMultiLabel}, parses the JSON output of {@link JsonDataWriter}
 * and checks that two instances carrying three distinct outcomes were written.
 */
@Test
public void extractFeaturesConnectorMultiLabelTest() throws Exception {
    File targetDir = folder.newFolder();

    // Parameters are not strictly needed here; the array is kept so that more can be
    // added easily. It is passed as an explicit array on purpose.
    Object[] extractorParams = { NoopFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123" };
    ExternalResourceDescription noopExtractor = ExternalResourceFactory
            .createExternalResourceDescription(NoopFeatureExtractor.class, extractorParams);
    List<ExternalResourceDescription> extractors = new ArrayList<>();
    extractors.add(noopExtractor);

    // Pipeline components, created in the order in which they run.
    CollectionReaderDescription readerDesc = CollectionReaderFactory.createReaderDescription(
            TestReaderMultiLabel.class,
            TestReaderMultiLabel.PARAM_SOURCE_LOCATION, "src/test/resources/data/*.txt");
    AnalysisEngineDescription segmenterDesc = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription docModeDesc = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription connectorDesc = TaskUtils.getFeatureExtractorConnector(
            targetDir.getAbsolutePath(), JsonDataWriter.class.getName(),
            Constants.LM_REGRESSION, Constants.FM_DOCUMENT,
            false, false, false, false,
            Collections.emptyList(), extractors, new String[] {});

    SimplePipeline.runPipeline(readerDesc, segmenterDesc, docModeDesc, connectorDesc);

    Gson jsonParser = new Gson();
    File jsonFile = new File(targetDir, JsonDataWriter.JSON_FILE_NAME);

    // Each line of the output file is deserialized into one Instance.
    List<Instance> parsed = new ArrayList<>();
    for (String jsonLine : FileUtils.readLines(jsonFile, "utf-8")) {
        Instance instance = jsonParser.fromJson(jsonLine, Instance.class);
        parsed.add(instance);
    }

    assertEquals(2, parsed.size());
    assertEquals(3, getUniqueOutcomes(parsed));
}
Also used : JsonDataWriter(org.dkpro.tc.core.io.JsonDataWriter) Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) Gson(com.google.gson.Gson) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) File(java.io.File) ExternalResourceDescription(org.apache.uima.resource.ExternalResourceDescription) Test(org.junit.Test)

Example 43 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-tc by dkpro.

Class DeepLearning4jDocumentTrainTest, method getParameterSpace.

/**
 * Assembles the parameter space for the document-mode, single-label experiment.
 *
 * @return a parameter space bundling the train and test readers together with the
 *         feature mode, learning mode, user code, maximum length, integer
 *         vectorization flag and pretrained embedding path
 * @throws ResourceInitializationException
 *             if a reader description cannot be created
 */
public static ParameterSpace getParameterSpace() throws ResourceInitializationException {
    // Train/test runs use both readers, whereas cross-validation only uses the
    // training reader.
    CollectionReaderDescription trainReader = CollectionReaderFactory.createReaderDescription(
            LinewiseTextReader.class,
            LinewiseTextReader.PARAM_SOURCE_LOCATION, corpusFilePathTrain,
            LinewiseTextReader.PARAM_LANGUAGE, LANGUAGE_CODE,
            LinewiseTextReader.PARAM_PATTERNS, "/**/*.txt");
    CollectionReaderDescription testReader = CollectionReaderFactory.createReaderDescription(
            LinewiseTextReader.class,
            LinewiseTextReader.PARAM_SOURCE_LOCATION, corpusFilePathTest,
            LinewiseTextReader.PARAM_LANGUAGE, LANGUAGE_CODE,
            LinewiseTextReader.PARAM_PATTERNS, "/**/*.txt");

    // Both readers are bundled into a single "readers" dimension.
    Map<String, Object> readers = new HashMap<>();
    readers.put(DIM_READER_TRAIN, trainReader);
    readers.put(DIM_READER_TEST, testReader);

    return new ParameterSpace(
            Dimension.createBundle("readers", readers),
            Dimension.create(DIM_FEATURE_MODE, Constants.FM_DOCUMENT),
            Dimension.create(DIM_LEARNING_MODE, Constants.LM_SINGLE_LABEL),
            Dimension.create(DeepLearningConstants.DIM_USER_CODE, new Dl4jDocumentUserCode()),
            Dimension.create(DeepLearningConstants.DIM_MAXIMUM_LENGTH, 15),
            Dimension.create(DeepLearningConstants.DIM_VECTORIZE_TO_INTEGER, true),
            Dimension.create(DeepLearningConstants.DIM_PRETRAINED_EMBEDDINGS,
                    "src/test/resources/wordvector/glove.6B.50d_250.txt"));
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) HashMap(java.util.HashMap) ParameterSpace(org.dkpro.lab.task.ParameterSpace)

Example 44 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project webanno by webanno.

Class TeiReaderTest, method testTeiReader.

/**
 * Reads the TEI documents matched by {@code [+]*.xml} under {@code classpath:/local/}
 * and checks, for every document, the number of tokens, POS tags, lemmas, named
 * entities and sentences as well as the text of the first sentence. The test is
 * currently ignored.
 */
@Test
@Ignore("No TEI yet to opensource ")
public void testTeiReader() throws Exception {
    String[] patterns = new String[] { "[+]*.xml" };
    CollectionReaderDescription teiReader = createReaderDescription(
            TeiReader.class,
            TeiReader.PARAM_LANGUAGE, "en",
            TeiReader.PARAM_SOURCE_LOCATION, "classpath:/local/",
            TeiReader.PARAM_PATTERNS, patterns);
    String expectedFirstSentence = "70 I DAG.";

    for (JCas document : new JCasIterable(teiReader)) {
        // Print document id, text length and language for manual inspection.
        DocumentMetaData metaData = DocumentMetaData.get(document);
        String documentText = document.getDocumentText();
        System.out.printf("%s - %d%n", metaData.getDocumentId(), documentText.length());
        System.out.println(document.getDocumentLanguage());

        // Annotation counts expected for every document.
        assertEquals(2235, JCasUtil.select(document, Token.class).size());
        assertEquals(745, JCasUtil.select(document, POS.class).size());
        assertEquals(745, JCasUtil.select(document, Lemma.class).size());
        assertEquals(0, JCasUtil.select(document, NamedEntity.class).size());
        assertEquals(30, JCasUtil.select(document, Sentence.class).size());

        // The first sentence returned by the selection must match the expected text.
        String actualFirstSentence = JCasUtil.select(document, Sentence.class)
                .iterator().next().getCoveredText();
        assertEquals(expectedFirstSentence, actualFirstSentence);
    }
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) JCasIterable(org.apache.uima.fit.pipeline.JCasIterable) JCas(org.apache.uima.jcas.JCas) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 45 with CollectionReaderDescription

use of org.apache.uima.collection.CollectionReaderDescription in project dkpro-lab by dkpro.

Class SimpleExecutionEngine, method run.

/**
 * Executes a {@link UimaTask}: builds the collection reader and analysis engine from
 * the task's descriptors, runs the engine over every document the reader provides and
 * notifies the task context's life-cycle manager of each stage.
 *
 * @param aConfiguration
 *            the task to run; must be a {@link UimaTask}
 * @return the id of the task context created for this run
 * @throws ExecutionException
 *             if the task is not a {@link UimaTask}, or wrapping any failure other
 *             than a {@link LifeCycleException} raised during set-up or processing
 * @throws LifeCycleException
 *             rethrown unchanged after the life-cycle manager was told about the
 *             failure
 */
@Override
public String run(Task aConfiguration) throws ExecutionException, LifeCycleException {
    if (!(aConfiguration instanceof UimaTask)) {
        throw new ExecutionException("This engine can only execute [" + UimaTask.class.getName() + "]");
    }
    UimaTask configuration = (UimaTask) aConfiguration;
    // Create persistence service for injection into analysis components
    TaskContext ctx = contextFactory.createContext(aConfiguration);
    try {
        ResourceManager resMgr = newDefaultResourceManager();
        // Make sure the descriptor is fully resolved. It will be modified and
        // thus should not be modified again afterwards by UIMA.
        AnalysisEngineDescription analysisDesc = configuration.getAnalysisEngineDescription(ctx);
        analysisDesc.resolveImports(resMgr);
        if (analysisDesc.getMetaData().getName() == null) {
            analysisDesc.getMetaData().setName("Analysis for " + aConfiguration.getType());
        }
        // Scan components that accept the service and bind it to them
        bindResource(analysisDesc, TaskContext.class, TaskContextProvider.class, TaskContextProvider.PARAM_FACTORY_NAME, contextFactory.getId(), TaskContextProvider.PARAM_CONTEXT_ID, ctx.getId());
        // Set up UIMA context & logging
        Logger logger = new UimaLoggingAdapter(ctx);
        UimaContextAdmin uimaCtx = newUimaContext(logger, resMgr, newConfigurationManager());
        // Set up reader
        CollectionReaderDescription readerDesc = configuration.getCollectionReaderDescription(ctx);
        if (readerDesc.getMetaData().getName() == null) {
            readerDesc.getMetaData().setName("Reader for " + aConfiguration.getType());
        }
        Map<String, Object> addReaderParam = new HashMap<String, Object>();
        addReaderParam.put(Resource.PARAM_UIMA_CONTEXT, uimaCtx);
        addReaderParam.put(Resource.PARAM_RESOURCE_MANAGER, resMgr);
        CollectionReader reader = produceCollectionReader(readerDesc, resMgr, addReaderParam);
        // Set up analysis engine
        AnalysisEngine engine;
        if (analysisDesc.isPrimitive()) {
            engine = new PrimitiveAnalysisEngine_impl();
        } else {
            engine = new AggregateAnalysisEngine_impl();
        }
        // Hand the UIMA context and resource manager to the engine. These entries used
        // to be written into addReaderParam a second time (after the reader had already
        // been produced), which left the engine's parameter map empty.
        // NOTE(review): the engine now shares the context instance given to the reader;
        // confirm this is the intended set-up.
        Map<String, Object> addEngineParam = new HashMap<String, Object>();
        addEngineParam.put(Resource.PARAM_UIMA_CONTEXT, uimaCtx);
        addEngineParam.put(Resource.PARAM_RESOURCE_MANAGER, resMgr);
        engine.initialize(analysisDesc, addEngineParam);
        // Now the setup is complete
        ctx.getLifeCycleManager().initialize(ctx, aConfiguration);
        // Start recording
        ctx.getLifeCycleManager().begin(ctx, aConfiguration);
        // Run the experiment
        // Apply the engine to all documents provided by the reader
        List<ResourceMetaData> metaData = new ArrayList<ResourceMetaData>();
        metaData.add(reader.getMetaData());
        metaData.add(engine.getMetaData());
        // A single CAS, created from the reader and engine metadata, is reused for all
        // documents.
        CAS cas = CasCreationUtils.createCas(metaData);
        while (reader.hasNext()) {
            reader.getNext(cas);
            engine.process(cas);
            // Fetch the document title (if the type system has such a feature) for the
            // progress message; this happens before the CAS is reset below.
            String documentTitle = "";
            Feature documentTitleFeature = cas.getDocumentAnnotation().getType().getFeatureByBaseName("documentTitle");
            if (documentTitleFeature != null) {
                documentTitle = cas.getDocumentAnnotation().getFeatureValueAsString(documentTitleFeature);
            }
            cas.reset();
            // Report the reader's progress through the task context
            Progress[] progresses = reader.getProgress();
            if (progresses != null) {
                for (Progress p : progresses) {
                    ctx.message("Progress " + readerDesc.getImplementationName() + " " + p.getCompleted() + "/" + p.getTotal() + " " + p.getUnit() + " " + "(" + documentTitle + ")");
                }
            }
        }
        // Shut down engine and reader
        engine.collectionProcessComplete();
        reader.close();
        engine.destroy();
        reader.destroy();
        // End recording
        ctx.getLifeCycleManager().complete(ctx, aConfiguration);
        return ctx.getId();
    } catch (LifeCycleException e) {
        // Life-cycle failures are reported and passed on unchanged
        ctx.getLifeCycleManager().fail(ctx, aConfiguration, e);
        throw e;
    } catch (Throwable e) {
        // Any other failure is reported and wrapped, keeping the original as cause
        ctx.getLifeCycleManager().fail(ctx, aConfiguration, e);
        throw new ExecutionException(e);
    } finally {
        // Always give the life-cycle manager the chance to clean up the context
        if (ctx != null) {
            ctx.getLifeCycleManager().destroy(ctx, aConfiguration);
        }
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) LifeCycleException(org.dkpro.lab.engine.LifeCycleException) Logger(org.apache.uima.util.Logger) Feature(org.apache.uima.cas.Feature) PrimitiveAnalysisEngine_impl(org.apache.uima.analysis_engine.impl.PrimitiveAnalysisEngine_impl) UimaTask(org.dkpro.lab.uima.task.UimaTask) ExecutionException(org.dkpro.lab.engine.ExecutionException) UimaLoggingAdapter(org.dkpro.lab.uima.task.impl.UimaLoggingAdapter) Progress(org.apache.uima.util.Progress) TaskContext(org.dkpro.lab.engine.TaskContext) UIMAFramework.produceCollectionReader(org.apache.uima.UIMAFramework.produceCollectionReader) CollectionReader(org.apache.uima.collection.CollectionReader) ResourceManager(org.apache.uima.resource.ResourceManager) UIMAFramework.newDefaultResourceManager(org.apache.uima.UIMAFramework.newDefaultResourceManager) AggregateAnalysisEngine_impl(org.apache.uima.analysis_engine.impl.AggregateAnalysisEngine_impl) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) CAS(org.apache.uima.cas.CAS) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) UimaContextAdmin(org.apache.uima.UimaContextAdmin) ResourceMetaData(org.apache.uima.resource.metadata.ResourceMetaData) AnalysisEngine(org.apache.uima.analysis_engine.AnalysisEngine)

Aggregations

CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription): 78; HashMap (java.util.HashMap): 53; ParameterSpace (org.dkpro.lab.task.ParameterSpace): 51; TcFeatureSet (org.dkpro.tc.api.features.TcFeatureSet): 40; Map (java.util.Map): 35; AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription): 25; File (java.io.File): 19; WekaAdapter (org.dkpro.tc.ml.weka.WekaAdapter): 17; Test (org.junit.Test): 14; ArrayList (java.util.ArrayList): 13; LiblinearAdapter (org.dkpro.tc.ml.liblinear.LiblinearAdapter): 9; NaiveBayes (weka.classifiers.bayes.NaiveBayes): 9; ExternalResourceDescription (org.apache.uima.resource.ExternalResourceDescription): 7; LibsvmAdapter (org.dkpro.tc.ml.libsvm.LibsvmAdapter): 7; Gson (com.google.gson.Gson): 6; Instance (org.dkpro.tc.api.features.Instance): 6; JsonDataWriter (org.dkpro.tc.core.io.JsonDataWriter): 6; XgboostAdapter (org.dkpro.tc.ml.xgboost.XgboostAdapter): 6; JCasIterable (org.apache.uima.fit.pipeline.JCasIterable): 5; JCas (org.apache.uima.jcas.JCas): 5