Search in sources :

Example 31 with CollectionReader

use of org.apache.uima.collection.CollectionReader in project dkpro-tc by dkpro.

the class XgboostSaveAndLoadModelDocumentRegression method regressionLoadModel.

/**
 * Runs the model found in {@code modelFolder} on the first document delivered by a
 * {@link LinewiseTextOutcomeReader} and checks that exactly one
 * {@link TextClassificationOutcome} is produced whose value parses to a number in the open
 * interval (0.1, 5).
 *
 * @param modelFolder
 *            folder whose absolute path is handed to {@link TcAnnotator} as the model
 *            location
 * @throws UIMAException
 *             declared for the UIMA component creation and processing calls
 * @throws IOException
 *             declared for the reader access
 */
private void regressionLoadModel(File modelFolder) throws UIMAException, IOException {
    // regressionTest is declared outside this method and used as the reader's source location.
    CollectionReader reader = CollectionReaderFactory.createReader(
            LinewiseTextOutcomeReader.class,
            LinewiseTextOutcomeReader.PARAM_OUTCOME_INDEX, 0,
            LinewiseTextOutcomeReader.PARAM_TEXT_INDEX, 1,
            LinewiseTextOutcomeReader.PARAM_SOURCE_LOCATION, regressionTest,
            LinewiseTextOutcomeReader.PARAM_LANGUAGE, "en");
    AnalysisEngine segmenter = AnalysisEngineFactory.createEngine(BreakIteratorSegmenter.class);
    AnalysisEngine tcAnno = AnalysisEngineFactory.createEngine(
            TcAnnotator.class,
            TcAnnotator.PARAM_TC_MODEL_LOCATION, modelFolder.getAbsolutePath(),
            TcAnnotator.PARAM_NAME_UNIT_ANNOTATION, Token.class.getName());

    JCas jcas = JCasFactory.createJCas();
    // Fail right here if the reader has nothing to deliver instead of ignoring the result
    // of hasNext() and failing later inside getNext().
    assertTrue(reader.hasNext());
    reader.getNext(jcas.getCas());
    segmenter.process(jcas);
    tcAnno.process(jcas);

    List<TextClassificationOutcome> outcomes = new ArrayList<>(
            JCasUtil.select(jcas, TextClassificationOutcome.class));
    assertEquals(1, outcomes.size());

    // Primitive double avoids the needless boxing; the bounds are checked separately so a
    // failure shows which side of the interval was violated.
    double predicted = Double.parseDouble(outcomes.get(0).getOutcome());
    assertTrue(predicted > 0.1);
    assertTrue(predicted < 5);
}
Also used : CollectionReader(org.apache.uima.collection.CollectionReader) TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome) ArrayList(java.util.ArrayList) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) AnalysisEngine(org.apache.uima.analysis_engine.AnalysisEngine)

Example 32 with CollectionReader

use of org.apache.uima.collection.CollectionReader in project dkpro-tc by dkpro.

the class SequenceOutcomeReaderTest method testNumberOfCas.

/**
 * Checks how many CASes the {@link SequenceOutcomeReader} creates for
 * {@code posDummy.txt} under different {@code PARAM_SEQUENCES_PER_CAS} settings. The
 * total number of sentences (3) and tokens (15) must be the same in every configuration.
 */
@Test
public void testNumberOfCas() throws Exception {
    // all in one
    assertCasPartitioning(10, 1);
    // one per cas
    assertCasPartitioning(1, 3);
    // two in first one third in second one
    assertCasPartitioning(2, 2);
}

/**
 * Reads {@code posDummy.txt} with the given sequences-per-CAS setting, drains the reader
 * and asserts the number of created CASes together with the fixed totals of 3 sentences
 * and 15 tokens.
 *
 * @param sequencesPerCas
 *            value passed as {@code PARAM_SEQUENCES_PER_CAS}
 * @param expectedCasCount
 *            number of CASes the reader is expected to fill
 */
private void assertCasPartitioning(int sequencesPerCas, int expectedCasCount) throws Exception {
    CollectionReader reader = CollectionReaderFactory.createReader(
            SequenceOutcomeReader.class,
            SequenceOutcomeReader.PARAM_SOURCE_LOCATION, "src/test/resources/sequence/posDummy.txt",
            SequenceOutcomeReader.PARAM_SEQUENCES_PER_CAS, sequencesPerCas);

    int sentCount = 0;
    int tokenCount = 0;
    int createdCas = 0;
    while (reader.hasNext()) {
        // A fresh JCas per iteration so the counts of one CAS never leak into the next.
        JCas theJCas = JCasFactory.createJCas();
        reader.getNext(theJCas.getCas());
        sentCount += JCasUtil.select(theJCas, Sentence.class).size();
        tokenCount += JCasUtil.select(theJCas, Token.class).size();
        createdCas++;
    }

    assertEquals(expectedCasCount, createdCas);
    assertEquals(3, sentCount);
    assertEquals(15, tokenCount);
}
Also used : CollectionReader(org.apache.uima.collection.CollectionReader) JCas(org.apache.uima.jcas.JCas) Test(org.junit.Test)

Example 33 with CollectionReader

use of org.apache.uima.collection.CollectionReader in project dkpro-tc by dkpro.

the class SequenceOutcomeReaderTest method testReader.

/**
 * Reads {@code posDummy.txt} with one sequence per CAS and compares, for each of the three
 * sequences, the covered text of its {@link TextClassificationTarget}s and the values of
 * its {@link TextClassificationOutcome}s against the expected tokens and outcomes.
 */
@Test
public void testReader() throws Exception {
    CollectionReader reader = CollectionReaderFactory.createReader(
            SequenceOutcomeReader.class,
            SequenceOutcomeReader.PARAM_SOURCE_LOCATION, "src/test/resources/sequence/",
            SequenceOutcomeReader.PARAM_PATTERNS, "posDummy.txt",
            SequenceOutcomeReader.PARAM_SEQUENCES_PER_CAS, 1);

    List<List<String>> tokensPerSequence = new ArrayList<>();
    List<List<String>> outcomesPerSequence = new ArrayList<>();
    int sequenceCount = 0;

    while (reader.hasNext()) {
        JCas cas = JCasFactory.createJCas();
        reader.getNext(cas.getCas());

        Collection<TextClassificationSequence> sequences = JCasUtil.select(cas,
                TextClassificationSequence.class);
        for (TextClassificationSequence seq : sequences) {
            // Covered text of every target inside this sequence.
            List<String> tokens = new ArrayList<>();
            for (TextClassificationTarget target : JCasUtil.selectCovered(cas,
                    TextClassificationTarget.class, seq)) {
                tokens.add(target.getCoveredText());
            }
            tokensPerSequence.add(tokens);

            // Outcome value of every outcome annotation inside this sequence.
            List<String> labels = new ArrayList<>();
            for (TextClassificationOutcome outcome : JCasUtil.selectCovered(cas,
                    TextClassificationOutcome.class, seq)) {
                labels.add(outcome.getOutcome());
            }
            outcomesPerSequence.add(labels);
        }
        sequenceCount += sequences.size();
    }

    String[][] expectedTokens = {
            { "This", "is", "a", "test" },
            { "This2", "is2", "a2", "#test2", "!" },
            { "This3", "is3", "a3", "test3", "!", "!" } };
    String[][] expectedOutcomes = {
            { "DET", "VERB", "DET", "NOUN" },
            { "DET2", "VERB2", "DET2", "NOUN2", "PUNCT2" },
            { "DET3", "VERB3", "DET3", "NOUN3", "PUNCT3", "PUNCT3" } };

    assertEquals(3, sequenceCount);
    assertEquals(3, tokensPerSequence.size());
    assertEquals(3, outcomesPerSequence.size());

    for (int i = 0; i < expectedTokens.length; i++) {
        // Per sequence: token count first, then tokens, then outcomes.
        assertEquals(expectedTokens[i].length, tokensPerSequence.get(i).size());
        for (int j = 0; j < expectedTokens[i].length; j++) {
            assertEquals(expectedTokens[i][j], tokensPerSequence.get(i).get(j));
        }
        for (int j = 0; j < expectedOutcomes[i].length; j++) {
            assertEquals(expectedOutcomes[i][j], outcomesPerSequence.get(i).get(j));
        }
    }
}
Also used : CollectionReader(org.apache.uima.collection.CollectionReader) ArrayList(java.util.ArrayList) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) JCas(org.apache.uima.jcas.JCas) TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome) List(java.util.List) TextClassificationSequence(org.dkpro.tc.api.type.TextClassificationSequence) Test(org.junit.Test)

Example 34 with CollectionReader

use of org.apache.uima.collection.CollectionReader in project dkpro-tc by dkpro.

the class SequenceOutcomeReaderTest method testReaderIndexParameter.

/**
 * Reads {@code otherFormat.txt} with the outcome index set to 1 and the token index set to
 * 2, then compares, for each of the two sequences, the covered text of its
 * {@link TextClassificationTarget}s and the values of its
 * {@link TextClassificationOutcome}s against the expected tokens and outcomes.
 */
@Test
public void testReaderIndexParameter() throws Exception {
    CollectionReader reader = CollectionReaderFactory.createReader(
            SequenceOutcomeReader.class,
            SequenceOutcomeReader.PARAM_SOURCE_LOCATION, "src/test/resources/sequence/",
            SequenceOutcomeReader.PARAM_PATTERNS, "otherFormat.txt",
            SequenceOutcomeReader.PARAM_OUTCOME_INDEX, 1,
            SequenceOutcomeReader.PARAM_TOKEN_INDEX, 2);

    List<List<String>> tokensPerSequence = new ArrayList<>();
    List<List<String>> outcomesPerSequence = new ArrayList<>();
    int sequenceCount = 0;

    while (reader.hasNext()) {
        JCas cas = JCasFactory.createJCas();
        reader.getNext(cas.getCas());

        Collection<TextClassificationSequence> sequences = JCasUtil.select(cas,
                TextClassificationSequence.class);
        for (TextClassificationSequence seq : sequences) {
            // Covered text of every target inside this sequence.
            List<String> tokens = new ArrayList<>();
            for (TextClassificationTarget target : JCasUtil.selectCovered(cas,
                    TextClassificationTarget.class, seq)) {
                tokens.add(target.getCoveredText());
            }
            tokensPerSequence.add(tokens);

            // Outcome value of every outcome annotation inside this sequence.
            List<String> labels = new ArrayList<>();
            for (TextClassificationOutcome outcome : JCasUtil.selectCovered(cas,
                    TextClassificationOutcome.class, seq)) {
                labels.add(outcome.getOutcome());
            }
            outcomesPerSequence.add(labels);
        }
        sequenceCount += sequences.size();
    }

    String[][] expectedTokens = {
            { "This", "is", "a", "test" },
            { "This2", "is2", "a2", "test2", "!2" } };
    String[][] expectedOutcomes = {
            { "DET", "VERB", "DET", "NOUN" },
            { "DET2", "VERB2", "DET2", "NOUN2", "PUNCT2" } };

    assertEquals(2, sequenceCount);
    assertEquals(2, tokensPerSequence.size());
    assertEquals(2, outcomesPerSequence.size());

    for (int i = 0; i < expectedTokens.length; i++) {
        // Per sequence: token count first, then tokens, then outcomes.
        assertEquals(expectedTokens[i].length, tokensPerSequence.get(i).size());
        for (int j = 0; j < expectedTokens[i].length; j++) {
            assertEquals(expectedTokens[i][j], tokensPerSequence.get(i).get(j));
        }
        for (int j = 0; j < expectedOutcomes[i].length; j++) {
            assertEquals(expectedOutcomes[i][j], outcomesPerSequence.get(i).get(j));
        }
    }
}
Also used : CollectionReader(org.apache.uima.collection.CollectionReader) ArrayList(java.util.ArrayList) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) JCas(org.apache.uima.jcas.JCas) TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome) List(java.util.List) TextClassificationSequence(org.dkpro.tc.api.type.TextClassificationSequence) Test(org.junit.Test)

Example 35 with CollectionReader

use of org.apache.uima.collection.CollectionReader in project dkpro-tc by dkpro.

the class TestFoldUtil method countNumberOfTextClassificationUnitsPerCas.

/**
 * For every file in {@code writtenBins}, fills a fresh {@link JCas} with a single
 * {@code getNext} call on the reader obtained from {@code createReader} and records how
 * many {@link TextClassificationTarget} annotations that CAS contains.
 *
 * @param writtenBins
 *            files to read, one reader per file
 * @return the target counts, in the iteration order of {@code writtenBins}
 * @throws Exception
 *             propagated from CAS creation, reader creation or reading
 */
private List<Integer> countNumberOfTextClassificationUnitsPerCas(List<File> writtenBins) throws Exception {
    List<Integer> unitCounts = new ArrayList<>();
    for (File bin : writtenBins) {
        JCas cas = JCasFactory.createJCas();
        // Only one CAS is pulled from each reader.
        createReader(cas, bin).getNext(cas.getCas());
        unitCounts.add(JCasUtil.select(cas, TextClassificationTarget.class).size());
    }
    return unitCounts;
}
Also used : CollectionReader(org.apache.uima.collection.CollectionReader) ArrayList(java.util.ArrayList) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) JCas(org.apache.uima.jcas.JCas) File(java.io.File)

Aggregations

CollectionReader (org.apache.uima.collection.CollectionReader)35 JCas (org.apache.uima.jcas.JCas)28 ArrayList (java.util.ArrayList)25 TextClassificationOutcome (org.dkpro.tc.api.type.TextClassificationOutcome)15 AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine)14 Test (org.junit.Test)13 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)8 CAS (org.apache.uima.cas.CAS)7 File (java.io.File)5 TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget)5 List (java.util.List)4 AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription)4 TextClassificationSequence (org.dkpro.tc.api.type.TextClassificationSequence)4 POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)3 Lemma (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma)3 TypeSystemDescription (org.apache.uima.resource.metadata.TypeSystemDescription)3 Evaluator (de.tudarmstadt.ukp.clarin.webanno.constraints.evaluator.Evaluator)2 PossibleValue (de.tudarmstadt.ukp.clarin.webanno.constraints.evaluator.PossibleValue)2 ValuesGenerator (de.tudarmstadt.ukp.clarin.webanno.constraints.evaluator.ValuesGenerator)2 ConstraintsGrammar (de.tudarmstadt.ukp.clarin.webanno.constraints.grammar.ConstraintsGrammar)2