use of org.apache.uima.analysis_engine.AnalysisEngineDescription in project dkpro-tc by dkpro.
the class DiffNrOfCharactersPairFeatureExtractorTest method testExtract.
@Test
public void testExtract() throws ResourceInitializationException, AnalysisEngineProcessException, TextClassificationException {
    AnalysisEngineDescription desc = createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngine engine = createEngine(desc);

    JCas jcas1 = engine.newJCas();
    jcas1.setDocumentLanguage("en");
    jcas1.setDocumentText("This is the text of view 1. And some more.");
    engine.process(jcas1);

    JCas jcas2 = engine.newJCas();
    jcas2.setDocumentLanguage("en");
    jcas2.setDocumentText("This is the text of view 2");
    engine.process(jcas2);

    DiffNrOfCharactersPairFeatureExtractor extractor = new DiffNrOfCharactersPairFeatureExtractor();
    Set<Feature> features = extractor.extract(jcas1, jcas2);

    assertEquals(1, features.size());
    for (Feature feature : features) {
        assertFeature("DiffNrOfCharacters", 16, feature);
    }
}
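The asserted value of 16 follows directly from the two document texts. A quick sanity check, illustrative only and assuming the extractor simply compares the lengths of the two views' document texts:

// Illustrative check of the expected feature value (not part of the test).
int view1Length = "This is the text of view 1. And some more.".length(); // 42
int view2Length = "This is the text of view 2".length();                  // 26
System.out.println(view1Length - view2Length);                            // prints 16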
use of org.apache.uima.analysis_engine.AnalysisEngineDescription in project dkpro-tc by dkpro.
the class QuestionRatioTest method questionRatioFeatureExtractorTest.
@Test
public void questionRatioFeatureExtractorTest() throws Exception {
    AnalysisEngineDescription desc = createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngine engine = createEngine(desc);

    JCas jcas = engine.newJCas();
    jcas.setDocumentLanguage("en");
    jcas.setDocumentText("Is he a tester???? Really?? He is a tester! Oh yes.");
    engine.process(jcas);

    TextClassificationTarget aTarget = new TextClassificationTarget(jcas, 0, jcas.getDocumentText().length());
    aTarget.addToIndexes();

    QuestionsRatioFeatureExtractor extractor = new QuestionsRatioFeatureExtractor();
    List<Feature> features = new ArrayList<Feature>(extractor.extract(jcas, aTarget));

    Assert.assertEquals(1, features.size());
    for (Feature feature : features) {
        assertFeature(FN_QUESTION_RATIO, 0.5, feature);
    }
}
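The expected ratio of 0.5 assumes that the BreakIteratorSegmenter splits the text into four sentences, two of which end in a question mark. An illustrative check of that reasoning (not part of the test, sentence boundaries assumed):

// Sentences as a standard sentence splitter would produce them (assumed).
String[] sentences = { "Is he a tester????", "Really??", "He is a tester!", "Oh yes." };
long questions = Arrays.stream(sentences).filter(s -> s.endsWith("?")).count(); // 2
System.out.println((double) questions / sentences.length);                      // prints 0.5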
use of org.apache.uima.analysis_engine.AnalysisEngineDescription in project dkpro-tc by dkpro.
the class VectorizationTask method learningModeDependedVectorizationAnnotator.
private AnalysisEngineDescription learningModeDependedVectorizationAnnotator(File outputDir, File mappingDir) throws ResourceInitializationException {
    if (featureMode == null) {
        throw new ResourceInitializationException(new IllegalStateException("Feature mode is [null]"));
    }

    AggregateBuilder builder = new AggregateBuilder();

    // records which document ids are in the train / test set (this is not
    // clear for cross-validation tasks)
    builder.add(createEngineDescription(IdentificationCollector.class,
            IdentificationCollector.PARAM_TARGET_DIRECTORY, outputDir,
            IdentificationCollector.PARAM_MODE, featureMode,
            IdentificationCollector.PARAM_USER_SET_MAXIMUM_LENGTH, maximumLength));

    AnalysisEngineDescription engine = null;
    switch (featureMode) {
    case Constants.FM_DOCUMENT:
        switch (learningMode) {
        case Constants.LM_REGRESSION:
            engine = createEngineDescription(VectorizationDoc2Regression.class,
                    VectorizationDoc2Regression.PARAM_TARGET_DIRECTORY, outputDir,
                    VectorizationDoc2Regression.PARAM_PREPARATION_DIRECTORY, mappingDir,
                    VectorizationDoc2Regression.PARAM_TO_INTEGER, integerVectorization);
            builder.add(engine);
            break;
        case Constants.LM_SINGLE_LABEL:
            engine = createEngineDescription(VectorizationDoc2SingleLabel.class,
                    VectorizationDoc2SingleLabel.PARAM_TARGET_DIRECTORY, outputDir,
                    VectorizationDoc2SingleLabel.PARAM_PREPARATION_DIRECTORY, mappingDir,
                    VectorizationDoc2SingleLabel.PARAM_TO_INTEGER, integerVectorization);
            builder.add(engine);
            break;
        case Constants.LM_MULTI_LABEL:
            engine = createEngineDescription(VectorizationDocDoc2MultiLabel.class,
                    VectorizationDocDoc2MultiLabel.PARAM_TARGET_DIRECTORY, outputDir,
                    VectorizationDocDoc2MultiLabel.PARAM_PREPARATION_DIRECTORY, mappingDir,
                    VectorizationDocDoc2MultiLabel.PARAM_TO_INTEGER, integerVectorization);
            builder.add(engine);
            break;
        default:
            throw new ResourceInitializationException(new IllegalStateException(
                    "Combination of feature mode [" + featureMode + "] with learning mode ["
                            + learningMode + "] not defined"));
        }
        break;
    case Constants.FM_SEQUENCE:
        engine = createEngineDescription(VectorizationSeq2SeqOfLabel.class,
                VectorizationSeq2SeqOfLabel.PARAM_TARGET_DIRECTORY, outputDir,
                VectorizationSeq2SeqOfLabel.PARAM_PREPARATION_DIRECTORY, mappingDir,
                VectorizationSeq2SeqOfLabel.PARAM_TO_INTEGER, integerVectorization);
        builder.add(engine);
        break;
    default:
        throw new ResourceInitializationException(new IllegalStateException(
                "Combination of feature mode [" + featureMode + "] with learning mode ["
                        + learningMode + "] not defined"));
    }
    return builder.createAggregateDescription();
}
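The description returned here is consumed like any other uimaFIT aggregate. A minimal usage sketch, assuming a DKPro Core TextReader as input and hypothetical output directories (this is a sketch, not the task's actual execute() logic):

// Hypothetical wiring of the aggregate into a pipeline; reader and directories are placeholders.
CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
        TextReader.class,
        TextReader.PARAM_SOURCE_LOCATION, "src/main/resources/data/*.txt",
        TextReader.PARAM_LANGUAGE, "en");
AnalysisEngineDescription vectorization = learningModeDependedVectorizationAnnotator(
        new File("target/vectorized"), new File("target/mapping"));
SimplePipeline.runPipeline(reader, vectorization);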
use of org.apache.uima.analysis_engine.AnalysisEngineDescription in project dkpro-tc by dkpro.
the class ExtractFeaturesConnectorTest method extractFeaturesConnectorSingleLabelTest.
@Test
public void extractFeaturesConnectorSingleLabelTest() throws Exception {
    File outputPath = folder.newFolder();

    // we do not need parameters here, but in case we do :)
    Object[] parameters = new Object[] { NoopFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123" };
    ExternalResourceDescription featureExtractor = ExternalResourceFactory
            .createExternalResourceDescription(NoopFeatureExtractor.class, parameters);
    List<ExternalResourceDescription> fes = new ArrayList<>();
    fes.add(featureExtractor);

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
            TestReaderSingleLabel.class,
            TestReaderSingleLabel.PARAM_SOURCE_LOCATION, "src/test/resources/data/*.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription featExtractorConnector = TaskUtils.getFeatureExtractorConnector(
            outputPath.getAbsolutePath(), JsonDataWriter.class.getName(), Constants.LM_REGRESSION,
            Constants.FM_DOCUMENT, false, false, false, false, Collections.emptyList(), fes,
            new String[] {});

    SimplePipeline.runPipeline(reader, segmenter, doc, featExtractorConnector);

    Gson gson = new Gson();
    System.out.println(FileUtils.readFileToString(new File(outputPath, JsonDataWriter.JSON_FILE_NAME), "utf-8"));
    List<String> lines = FileUtils.readLines(new File(outputPath, JsonDataWriter.JSON_FILE_NAME), "utf-8");
    List<Instance> instances = new ArrayList<>();
    for (String l : lines) {
        instances.add(gson.fromJson(l, Instance.class));
    }

    assertEquals(2, instances.size());
    assertEquals(1, getUniqueOutcomes(instances));
}
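The helper getUniqueOutcomes is not part of the snippet above (the multi-label test below uses the same helper). A minimal sketch of what it could look like, assuming Instance#getOutcomes() returns the outcome labels of an instance (hypothetical reconstruction, not the project's actual helper):

// Hypothetical helper: counts distinct outcome labels across all instances.
private static int getUniqueOutcomes(List<Instance> instances) {
    Set<String> outcomes = new HashSet<>();
    for (Instance instance : instances) {
        outcomes.addAll(instance.getOutcomes());
    }
    return outcomes.size();
}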
use of org.apache.uima.analysis_engine.AnalysisEngineDescription in project dkpro-tc by dkpro.
the class ExtractFeaturesConnectorTest method extractFeaturesConnectorMultiLabelTest.
@Test
public void extractFeaturesConnectorMultiLabelTest() throws Exception {
    File outputPath = folder.newFolder();

    // we do not need parameters here, but in case we do :)
    Object[] parameters = new Object[] { NoopFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123" };
    ExternalResourceDescription featureExtractor = ExternalResourceFactory
            .createExternalResourceDescription(NoopFeatureExtractor.class, parameters);
    List<ExternalResourceDescription> fes = new ArrayList<>();
    fes.add(featureExtractor);

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
            TestReaderMultiLabel.class,
            TestReaderMultiLabel.PARAM_SOURCE_LOCATION, "src/test/resources/data/*.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription featExtractorConnector = TaskUtils.getFeatureExtractorConnector(
            outputPath.getAbsolutePath(), JsonDataWriter.class.getName(), Constants.LM_REGRESSION,
            Constants.FM_DOCUMENT, false, false, false, false, Collections.emptyList(), fes,
            new String[] {});

    SimplePipeline.runPipeline(reader, segmenter, doc, featExtractorConnector);

    Gson gson = new Gson();
    List<String> lines = FileUtils.readLines(new File(outputPath, JsonDataWriter.JSON_FILE_NAME), "utf-8");
    List<Instance> instances = new ArrayList<>();
    for (String l : lines) {
        instances.add(gson.fromJson(l, Instance.class));
    }

    assertEquals(2, instances.size());
    assertEquals(3, getUniqueOutcomes(instances));
}