Search in sources:

Example 26 with AnalysisEngineDescription

use of org.apache.uima.analysis_engine.AnalysisEngineDescription in project dkpro-tc by dkpro.

From the class DiffNrOfCharactersPairFeatureExtractorTest, method testExtract:

/**
 * Checks that {@link DiffNrOfCharactersPairFeatureExtractor} yields exactly one feature,
 * named "DiffNrOfCharacters", for a pair of documents.
 * <p>
 * The first document text is 42 characters long and the second 26, so the feature is
 * expected to carry the value 16.
 */
@Test
public void testExtract() throws ResourceInitializationException, AnalysisEngineProcessException, TextClassificationException {
    AnalysisEngine segmenter = createEngine(createEngineDescription(BreakIteratorSegmenter.class));

    // First document of the pair (the longer one).
    JCas firstView = segmenter.newJCas();
    firstView.setDocumentLanguage("en");
    firstView.setDocumentText("This is the text of view 1. And some more.");
    segmenter.process(firstView);

    // Second document of the pair (the shorter one).
    JCas secondView = segmenter.newJCas();
    secondView.setDocumentLanguage("en");
    secondView.setDocumentText("This is the text of view 2");
    segmenter.process(secondView);

    Set<Feature> extracted = new DiffNrOfCharactersPairFeatureExtractor().extract(firstView, secondView);
    assertEquals(1, extracted.size());
    // The size assertion above guarantees there is exactly one element to inspect.
    assertFeature("DiffNrOfCharacters", 16, extracted.iterator().next());
}
Also used : DiffNrOfCharactersPairFeatureExtractor(org.dkpro.tc.features.pair.core.length.DiffNrOfCharactersPairFeatureExtractor) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) JCas(org.apache.uima.jcas.JCas) FeatureTestUtil.assertFeature(org.dkpro.tc.testing.FeatureTestUtil.assertFeature) Feature(org.dkpro.tc.api.features.Feature) AnalysisEngine(org.apache.uima.analysis_engine.AnalysisEngine) Test(org.junit.Test)

Example 27 with AnalysisEngineDescription

use of org.apache.uima.analysis_engine.AnalysisEngineDescription in project dkpro-tc by dkpro.

From the class QuestionRatioTest, method questionRatioFeatureExtractorTest:

/**
 * Checks that {@link QuestionsRatioFeatureExtractor} yields exactly one feature, named by
 * {@code FN_QUESTION_RATIO}, with the value 0.5 for a classification target that spans the
 * whole document text.
 */
@Test
public void questionRatioFeatureExtractorTest() throws Exception {
    AnalysisEngine segmenter = createEngine(createEngineDescription(BreakIteratorSegmenter.class));

    JCas document = segmenter.newJCas();
    document.setDocumentLanguage("en");
    document.setDocumentText("Is he a tester???? Really?? He is a tester! Oh yes.");
    segmenter.process(document);

    // The target covers the document from offset 0 to the end of its text.
    int endOffset = document.getDocumentText().length();
    TextClassificationTarget wholeText = new TextClassificationTarget(document, 0, endOffset);
    wholeText.addToIndexes();

    QuestionsRatioFeatureExtractor ratioExtractor = new QuestionsRatioFeatureExtractor();
    List<Feature> extracted = new ArrayList<Feature>(ratioExtractor.extract(document, wholeText));

    Assert.assertEquals(1, extracted.size());
    for (int i = 0; i < extracted.size(); i++) {
        assertFeature(FN_QUESTION_RATIO, 0.5, extracted.get(i));
    }
}
Also used : AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) ArrayList(java.util.ArrayList) JCas(org.apache.uima.jcas.JCas) FeatureTestUtil.assertFeature(org.dkpro.tc.testing.FeatureTestUtil.assertFeature) Feature(org.dkpro.tc.api.features.Feature) AnalysisEngine(org.apache.uima.analysis_engine.AnalysisEngine) Test(org.junit.Test)

Example 28 with AnalysisEngineDescription

use of org.apache.uima.analysis_engine.AnalysisEngineDescription in project dkpro-tc by dkpro.

From the class VectorizationTask, method learningModeDependedVectorizationAnnotator:

/**
 * Builds the aggregate engine description used for vectorization.
 * <p>
 * The aggregate always starts with an {@code IdentificationCollector} and is followed by
 * the vectorization annotator selected by the configured feature mode and, for document
 * mode, the configured learning mode.
 *
 * @param outputDir
 *            passed to every annotator as its target directory
 * @param mappingDir
 *            passed to the vectorization annotator as its preparation directory
 * @return the aggregate description containing the collector and the selected annotator
 * @throws ResourceInitializationException
 *             if the feature mode is {@code null}, or if the combination of feature mode
 *             and learning mode (including a {@code null} learning mode in document mode)
 *             is not supported
 */
private AnalysisEngineDescription learningModeDependedVectorizationAnnotator(File outputDir, File mappingDir) throws ResourceInitializationException {
    if (featureMode == null) {
        // The guard tests the feature mode, so the message names the feature mode.
        throw new ResourceInitializationException(new IllegalStateException("Feature mode is [null]"));
    }
    AggregateBuilder builder = new AggregateBuilder();
    // records which document ids are in the train / test set (this is not
    // clear for cross-validation tasks)
    builder.add(createEngineDescription(IdentificationCollector.class, IdentificationCollector.PARAM_TARGET_DIRECTORY, outputDir, IdentificationCollector.PARAM_MODE, featureMode, IdentificationCollector.PARAM_USER_SET_MAXIMUM_LENGTH, maximumLength));
    switch(featureMode) {
        case Constants.FM_DOCUMENT:
            if (learningMode == null) {
                // Switching on a null String throws a NullPointerException; report the
                // unsupported configuration through the declared exception type instead.
                throw undefinedModeCombination();
            }
            switch(learningMode) {
                case Constants.LM_REGRESSION:
                    builder.add(createEngineDescription(VectorizationDoc2Regression.class, VectorizationDoc2Regression.PARAM_TARGET_DIRECTORY, outputDir, VectorizationDoc2Regression.PARAM_PREPARATION_DIRECTORY, mappingDir, VectorizationDoc2Regression.PARAM_TO_INTEGER, integerVectorization));
                    break;
                case Constants.LM_SINGLE_LABEL:
                    builder.add(createEngineDescription(VectorizationDoc2SingleLabel.class, VectorizationDoc2SingleLabel.PARAM_TARGET_DIRECTORY, outputDir, VectorizationDoc2SingleLabel.PARAM_PREPARATION_DIRECTORY, mappingDir, VectorizationDoc2SingleLabel.PARAM_TO_INTEGER, integerVectorization));
                    break;
                case Constants.LM_MULTI_LABEL:
                    builder.add(createEngineDescription(VectorizationDocDoc2MultiLabel.class, VectorizationDocDoc2MultiLabel.PARAM_TARGET_DIRECTORY, outputDir, VectorizationDocDoc2MultiLabel.PARAM_PREPARATION_DIRECTORY, mappingDir, VectorizationDocDoc2MultiLabel.PARAM_TO_INTEGER, integerVectorization));
                    break;
                default:
                    throw undefinedModeCombination();
            }
            break;
        case Constants.FM_SEQUENCE:
            // NOTE(review): the integer-vectorization flag is keyed with
            // VectorizationDocDoc2MultiLabel.PARAM_TO_INTEGER although the engine is
            // VectorizationSeq2SeqOfLabel - confirm both resolve to the same parameter name.
            builder.add(createEngineDescription(VectorizationSeq2SeqOfLabel.class, VectorizationSeq2SeqOfLabel.PARAM_TARGET_DIRECTORY, outputDir, VectorizationSeq2SeqOfLabel.PARAM_PREPARATION_DIRECTORY, mappingDir, VectorizationDocDoc2MultiLabel.PARAM_TO_INTEGER, integerVectorization));
            break;
        default:
            throw undefinedModeCombination();
    }
    return builder.createAggregateDescription();
}

/**
 * Creates the exception reported when the configured feature mode and learning mode do
 * not map to a vectorization annotator.
 */
private ResourceInitializationException undefinedModeCombination() {
    return new ResourceInitializationException(new IllegalStateException("Combination of feature mode [" + featureMode + "] with learning mode [" + learningMode + "] not defined"));
}
Also used : IdentificationCollector(org.dkpro.tc.core.task.deep.anno.IdentificationCollector) VectorizationDoc2SingleLabel(org.dkpro.tc.core.task.deep.anno.VectorizationDoc2SingleLabel) AggregateBuilder(org.apache.uima.fit.factory.AggregateBuilder) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) VectorizationDoc2Regression(org.dkpro.tc.core.task.deep.anno.VectorizationDoc2Regression) VectorizationDocDoc2MultiLabel(org.dkpro.tc.core.task.deep.anno.VectorizationDocDoc2MultiLabel) VectorizationSeq2SeqOfLabel(org.dkpro.tc.core.task.deep.anno.VectorizationSeq2SeqOfLabel)

Example 29 with AnalysisEngineDescription

use of org.apache.uima.analysis_engine.AnalysisEngineDescription in project dkpro-tc by dkpro.

From the class ExtractFeaturesConnectorTest, method extractFeaturesConnectorSingleLabelTest:

/**
 * Runs a reader, segmenter, document-mode annotator and feature-extractor connector over
 * the test data, then checks the JSON output written by {@code JsonDataWriter}.
 * <p>
 * Each line of the output file is parsed into an {@link Instance}; the test expects two
 * instances sharing a single unique outcome.
 */
@Test
public void extractFeaturesConnectorSingleLabelTest() throws Exception {
    File outputPath = folder.newFolder();
    // we do not need parameters here, but in case we do :)
    Object[] parameters = new Object[] { NoopFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123" };
    ExternalResourceDescription featureExtractor = ExternalResourceFactory.createExternalResourceDescription(NoopFeatureExtractor.class, parameters);
    List<ExternalResourceDescription> fes = new ArrayList<>();
    fes.add(featureExtractor);
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestReaderSingleLabel.class, TestReaderSingleLabel.PARAM_SOURCE_LOCATION, "src/test/resources/data/*.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    // NOTE(review): this single-label test passes Constants.LM_REGRESSION as the learning
    // mode - confirm that this is intended.
    AnalysisEngineDescription featExtractorConnector = TaskUtils.getFeatureExtractorConnector(outputPath.getAbsolutePath(), JsonDataWriter.class.getName(), Constants.LM_REGRESSION, Constants.FM_DOCUMENT, false, false, false, false, Collections.emptyList(), fes, new String[] {});
    SimplePipeline.runPipeline(reader, segmenter, doc, featExtractorConnector);
    Gson gson = new Gson();
    // Read the writer's output once; one JSON-serialized instance is expected per line.
    File jsonFile = new File(outputPath, JsonDataWriter.JSON_FILE_NAME);
    List<String> lines = FileUtils.readLines(jsonFile, "utf-8");
    List<Instance> instances = new ArrayList<>();
    for (String l : lines) {
        instances.add(gson.fromJson(l, Instance.class));
    }
    assertEquals(2, instances.size());
    assertEquals(1, getUniqueOutcomes(instances));
}
Also used : JsonDataWriter(org.dkpro.tc.core.io.JsonDataWriter) Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) Gson(com.google.gson.Gson) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) File(java.io.File) ExternalResourceDescription(org.apache.uima.resource.ExternalResourceDescription) Test(org.junit.Test)

Example 30 with AnalysisEngineDescription

use of org.apache.uima.analysis_engine.AnalysisEngineDescription in project dkpro-tc by dkpro.

From the class ExtractFeaturesConnectorTest, method extractFeaturesConnectorMultiLabelTest:

/**
 * Runs the multi-label test reader through segmentation, document-mode annotation and the
 * feature-extractor connector, then checks the JSON output written by
 * {@code JsonDataWriter}.
 * <p>
 * Each line of the output file is parsed into an {@link Instance}; the test expects two
 * instances with three unique outcomes between them.
 */
@Test
public void extractFeaturesConnectorMultiLabelTest() throws Exception {
    File targetDir = folder.newFolder();

    // we do not need parameters here, but in case we do :)
    Object[] extractorParams = new Object[] { NoopFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123" };
    List<ExternalResourceDescription> extractors = new ArrayList<>();
    extractors.add(ExternalResourceFactory.createExternalResourceDescription(NoopFeatureExtractor.class, extractorParams));

    // Pipeline components, in execution order.
    CollectionReaderDescription readerDesc = CollectionReaderFactory.createReaderDescription(TestReaderMultiLabel.class, TestReaderMultiLabel.PARAM_SOURCE_LOCATION, "src/test/resources/data/*.txt");
    AnalysisEngineDescription segmenterDesc = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription documentModeDesc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription connectorDesc = TaskUtils.getFeatureExtractorConnector(targetDir.getAbsolutePath(), JsonDataWriter.class.getName(), Constants.LM_REGRESSION, Constants.FM_DOCUMENT, false, false, false, false, Collections.emptyList(), extractors, new String[] {});
    SimplePipeline.runPipeline(readerDesc, segmenterDesc, documentModeDesc, connectorDesc);

    // Parse the writer's output, one instance per line.
    File jsonFile = new File(targetDir, JsonDataWriter.JSON_FILE_NAME);
    List<String> jsonLines = FileUtils.readLines(jsonFile, "utf-8");
    Gson parser = new Gson();
    List<Instance> parsed = new ArrayList<>();
    for (int i = 0; i < jsonLines.size(); i++) {
        parsed.add(parser.fromJson(jsonLines.get(i), Instance.class));
    }

    assertEquals(2, parsed.size());
    assertEquals(3, getUniqueOutcomes(parsed));
}
Also used : JsonDataWriter(org.dkpro.tc.core.io.JsonDataWriter) Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) Gson(com.google.gson.Gson) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) File(java.io.File) ExternalResourceDescription(org.apache.uima.resource.ExternalResourceDescription) Test(org.junit.Test)

Aggregations

AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription)62 Test (org.junit.Test)32 File (java.io.File)27 CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription)25 ArrayList (java.util.ArrayList)22 AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine)18 JCas (org.apache.uima.jcas.JCas)16 Feature (org.dkpro.tc.api.features.Feature)13 FeatureTestUtil.assertFeature (org.dkpro.tc.testing.FeatureTestUtil.assertFeature)11 ExternalResourceDescription (org.apache.uima.resource.ExternalResourceDescription)10 AggregateBuilder (org.apache.uima.fit.factory.AggregateBuilder)8 ResourceInitializationException (org.apache.uima.resource.ResourceInitializationException)8 JsonDataWriter (org.dkpro.tc.core.io.JsonDataWriter)8 TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget)7 Gson (com.google.gson.Gson)6 IOException (java.io.IOException)6 Instance (org.dkpro.tc.api.features.Instance)6 OpenNlpPosTagger (de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger)4 BreakIteratorSegmenter (de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter)4 CAS (org.apache.uima.cas.CAS)4