use of org.apache.uima.collection.CollectionReader in project dkpro-tc by dkpro.
the class XgboostSaveAndLoadModelDocumentRegression method regressionLoadModel.
/**
 * Applies the model stored in {@code modelFolder} to the first document delivered by a
 * {@code LinewiseTextOutcomeReader} over {@code regressionTest} and checks the prediction.
 *
 * <p>The document is processed by a {@code BreakIteratorSegmenter} and then by a
 * {@code TcAnnotator} configured with the model location and {@code Token} as unit annotation.
 * The test expects exactly one {@code TextClassificationOutcome} whose outcome parses to a
 * number strictly between 0.1 and 5.
 *
 * @param modelFolder
 *            folder whose absolute path is handed to the annotator as model location
 * @throws UIMAException
 *             if a UIMA component cannot be created or fails while processing
 * @throws IOException
 *             if the reader fails to access its source
 */
private void regressionLoadModel(File modelFolder) throws UIMAException, IOException {
    CollectionReader reader = CollectionReaderFactory.createReader(
            LinewiseTextOutcomeReader.class,
            LinewiseTextOutcomeReader.PARAM_OUTCOME_INDEX, 0,
            LinewiseTextOutcomeReader.PARAM_TEXT_INDEX, 1,
            LinewiseTextOutcomeReader.PARAM_SOURCE_LOCATION, regressionTest,
            LinewiseTextOutcomeReader.PARAM_LANGUAGE, "en");
    AnalysisEngine segmenter = AnalysisEngineFactory.createEngine(BreakIteratorSegmenter.class);
    AnalysisEngine tcAnno = AnalysisEngineFactory.createEngine(
            TcAnnotator.class,
            TcAnnotator.PARAM_TC_MODEL_LOCATION, modelFolder.getAbsolutePath(),
            TcAnnotator.PARAM_NAME_UNIT_ANNOTATION, Token.class.getName());

    JCas jcas = JCasFactory.createJCas();
    // Previously the result of hasNext() was silently dropped; assert it so that missing test
    // data is reported here instead of as an unrelated failure inside getNext().
    assertTrue(reader.hasNext());
    reader.getNext(jcas.getCas());

    segmenter.process(jcas);
    tcAnno.process(jcas);

    List<TextClassificationOutcome> outcomes = new ArrayList<>(
            JCasUtil.select(jcas, TextClassificationOutcome.class));
    assertEquals(1, outcomes.size());

    // A primitive is sufficient; no need for a boxed Double just to compare bounds.
    double prediction = Double.parseDouble(outcomes.get(0).getOutcome());
    assertTrue(prediction > 0.1 && prediction < 5);
}
use of org.apache.uima.collection.CollectionReader in project dkpro-tc by dkpro.
the class SequenceOutcomeReaderTest method testNumberOfCas.
/**
 * Checks how many CASes {@code SequenceOutcomeReader} produces for {@code posDummy.txt} with
 * different values of {@code PARAM_SEQUENCES_PER_CAS}. The totals of sentences (3) and tokens
 * (15) must be the same in every configuration.
 */
@Test
public void testNumberOfCas() throws Exception {
    // all in one
    assertCasDistribution(10, 1);
    // one per cas
    assertCasDistribution(1, 3);
    // two in first one third in second one
    assertCasDistribution(2, 2);
}

/**
 * Drains a fresh {@code SequenceOutcomeReader} over {@code posDummy.txt} and asserts the
 * number of CASes it filled as well as the overall sentence and token totals.
 *
 * @param sequencesPerCas
 *            value passed as {@code PARAM_SEQUENCES_PER_CAS}
 * @param expectedCasCount
 *            number of CASes the reader is expected to fill
 * @throws Exception
 *             if the reader or a CAS cannot be created, or reading fails
 */
private void assertCasDistribution(int sequencesPerCas, int expectedCasCount) throws Exception {
    CollectionReader reader = CollectionReaderFactory.createReader(
            SequenceOutcomeReader.class,
            SequenceOutcomeReader.PARAM_SOURCE_LOCATION, "src/test/resources/sequence/posDummy.txt",
            SequenceOutcomeReader.PARAM_SEQUENCES_PER_CAS, sequencesPerCas);

    int sentCount = 0;
    int tokenCount = 0;
    int createdCas = 0;
    while (reader.hasNext()) {
        // A fresh JCas per iteration, so counts are never carried over between CASes.
        JCas theJCas = JCasFactory.createJCas();
        reader.getNext(theJCas.getCas());
        sentCount += JCasUtil.select(theJCas, Sentence.class).size();
        tokenCount += JCasUtil.select(theJCas, Token.class).size();
        createdCas++;
    }

    assertEquals(expectedCasCount, createdCas);
    assertEquals(3, sentCount);
    assertEquals(15, tokenCount);
}
use of org.apache.uima.collection.CollectionReader in project dkpro-tc by dkpro.
the class SequenceOutcomeReaderTest method testReader.
/**
 * Reads {@code posDummy.txt} with one sequence per CAS and compares the covered text of the
 * {@code TextClassificationTarget}s and the values of the {@code TextClassificationOutcome}s
 * of every {@code TextClassificationSequence} against the expected tokens and labels.
 */
@Test
public void testReader() throws Exception {
    CollectionReader reader = CollectionReaderFactory.createReader(
            SequenceOutcomeReader.class,
            SequenceOutcomeReader.PARAM_SOURCE_LOCATION, "src/test/resources/sequence/",
            SequenceOutcomeReader.PARAM_PATTERNS, "posDummy.txt",
            SequenceOutcomeReader.PARAM_SEQUENCES_PER_CAS, 1);

    List<List<String>> tokenTexts = new ArrayList<>();
    List<List<String>> outcomeLabels = new ArrayList<>();
    int sequenceCount = 0;

    while (reader.hasNext()) {
        JCas cas = JCasFactory.createJCas();
        reader.getNext(cas.getCas());

        Collection<TextClassificationSequence> sequences = JCasUtil.select(cas,
                TextClassificationSequence.class);
        for (TextClassificationSequence seq : sequences) {
            // covered text of all targets inside the sequence
            List<String> texts = new ArrayList<>();
            for (TextClassificationTarget target : JCasUtil.selectCovered(cas,
                    TextClassificationTarget.class, seq)) {
                texts.add(target.getCoveredText());
            }
            tokenTexts.add(texts);

            // outcome values of all outcome annotations inside the sequence
            List<String> labels = new ArrayList<>();
            for (TextClassificationOutcome outcome : JCasUtil.selectCovered(cas,
                    TextClassificationOutcome.class, seq)) {
                labels.add(outcome.getOutcome());
            }
            outcomeLabels.add(labels);
        }
        sequenceCount += sequences.size();
    }

    assertEquals(3, sequenceCount);
    assertEquals(3, tokenTexts.size());
    assertEquals(3, outcomeLabels.size());

    // expected content, one row per sequence
    String[][] expectedTokens = {
            { "This", "is", "a", "test" },
            { "This2", "is2", "a2", "#test2", "!" },
            { "This3", "is3", "a3", "test3", "!", "!" } };
    String[][] expectedOutcomes = {
            { "DET", "VERB", "DET", "NOUN" },
            { "DET2", "VERB2", "DET2", "NOUN2", "PUNCT2" },
            { "DET3", "VERB3", "DET3", "NOUN3", "PUNCT3", "PUNCT3" } };

    for (int i = 0; i < expectedTokens.length; i++) {
        // size is only verified for the token lists
        assertEquals(expectedTokens[i].length, tokenTexts.get(i).size());
        for (int j = 0; j < expectedTokens[i].length; j++) {
            assertEquals(expectedTokens[i][j], tokenTexts.get(i).get(j));
        }
        for (int j = 0; j < expectedOutcomes[i].length; j++) {
            assertEquals(expectedOutcomes[i][j], outcomeLabels.get(i).get(j));
        }
    }
}
use of org.apache.uima.collection.CollectionReader in project dkpro-tc by dkpro.
the class SequenceOutcomeReaderTest method testReaderIndexParameter.
/**
 * Reads {@code otherFormat.txt} with the outcome index set to 1 and the token index set to 2
 * and compares the covered text of the {@code TextClassificationTarget}s and the values of the
 * {@code TextClassificationOutcome}s of every {@code TextClassificationSequence} against the
 * expected tokens and labels.
 */
@Test
public void testReaderIndexParameter() throws Exception {
    CollectionReader reader = CollectionReaderFactory.createReader(
            SequenceOutcomeReader.class,
            SequenceOutcomeReader.PARAM_SOURCE_LOCATION, "src/test/resources/sequence/",
            SequenceOutcomeReader.PARAM_PATTERNS, "otherFormat.txt",
            SequenceOutcomeReader.PARAM_OUTCOME_INDEX, 1,
            SequenceOutcomeReader.PARAM_TOKEN_INDEX, 2);

    List<List<String>> tokenTexts = new ArrayList<>();
    List<List<String>> outcomeLabels = new ArrayList<>();
    int sequenceCount = 0;

    while (reader.hasNext()) {
        JCas cas = JCasFactory.createJCas();
        reader.getNext(cas.getCas());

        Collection<TextClassificationSequence> sequences = JCasUtil.select(cas,
                TextClassificationSequence.class);
        for (TextClassificationSequence seq : sequences) {
            // covered text of all targets inside the sequence
            List<String> texts = new ArrayList<>();
            for (TextClassificationTarget target : JCasUtil.selectCovered(cas,
                    TextClassificationTarget.class, seq)) {
                texts.add(target.getCoveredText());
            }
            tokenTexts.add(texts);

            // outcome values of all outcome annotations inside the sequence
            List<String> labels = new ArrayList<>();
            for (TextClassificationOutcome outcome : JCasUtil.selectCovered(cas,
                    TextClassificationOutcome.class, seq)) {
                labels.add(outcome.getOutcome());
            }
            outcomeLabels.add(labels);
        }
        sequenceCount += sequences.size();
    }

    assertEquals(2, sequenceCount);
    assertEquals(2, tokenTexts.size());
    assertEquals(2, outcomeLabels.size());

    // expected content, one row per sequence
    String[][] expectedTokens = {
            { "This", "is", "a", "test" },
            { "This2", "is2", "a2", "test2", "!2" } };
    String[][] expectedOutcomes = {
            { "DET", "VERB", "DET", "NOUN" },
            { "DET2", "VERB2", "DET2", "NOUN2", "PUNCT2" } };

    for (int i = 0; i < expectedTokens.length; i++) {
        // size is only verified for the token lists
        assertEquals(expectedTokens[i].length, tokenTexts.get(i).size());
        for (int j = 0; j < expectedTokens[i].length; j++) {
            assertEquals(expectedTokens[i][j], tokenTexts.get(i).get(j));
        }
        for (int j = 0; j < expectedOutcomes[i].length; j++) {
            assertEquals(expectedOutcomes[i][j], outcomeLabels.get(i).get(j));
        }
    }
}
use of org.apache.uima.collection.CollectionReader in project dkpro-tc by dkpro.
the class TestFoldUtil method countNumberOfTextClassificationUnitsPerCas.
/**
 * For every given file, reads a single CAS through {@code createReader} and records how many
 * {@code TextClassificationTarget} annotations that CAS contains.
 *
 * @param writtenBins
 *            files to read, one CAS is fetched per file
 * @return the annotation counts, in the iteration order of {@code writtenBins}
 * @throws Exception
 *             if a CAS or reader cannot be created, or reading fails
 */
private List<Integer> countNumberOfTextClassificationUnitsPerCas(List<File> writtenBins) throws Exception {
    List<Integer> unitCounts = new ArrayList<>();
    for (File bin : writtenBins) {
        JCas cas = JCasFactory.createJCas();
        // exactly one getNext() per file; hasNext() is not consulted
        createReader(cas, bin).getNext(cas.getCas());
        unitCounts.add(JCasUtil.select(cas, TextClassificationTarget.class).size());
    }
    return unitCounts;
}
Aggregations