Search in sources :

Example 1 with JCasIterable

use of org.apache.uima.fit.pipeline.JCasIterable in project dkpro-tc by dkpro.

The method luceneNgramMetaCollectorTest of the class WordNGramMetaCollectorTest.

/**
 * Runs a reader / segmenter / document-mode annotator / {@code WordNGramMC} pipeline over the
 * {@code text*.txt} files in {@code src/test/resources/data/} and then inspects the Lucene
 * index found in the temporary target folder.
 *
 * <p>Expects 35 terms in the field {@code WordNGram.LUCENE_NGRAM_FIELD + UNIQUE_FEATURE_NAME};
 * if the term {@code "this"} is present, it must have a document frequency of 1 and a total
 * term frequency of 3.
 *
 * @throws Exception if the pipeline fails; any exception raised while reading the index is
 *         wrapped in a {@link ResourceInitializationException}
 */
@Test
public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
            TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/",
            TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "text*.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            WordNGramMC.class,
            WordNGramMC.PARAM_TARGET_LOCATION, tmpDir,
            WordNGramMC.PARAM_UNIQUE_EXTRACTOR_NAME, UNIQUE_FEATURE_NAME);
    // Iterating the JCasIterable is what pushes each document through the pipeline.
    for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) {
        System.out.println(jcas.getDocumentText().length());
    }
    int i = 0;
    IndexReader index = null;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(WordNGram.LUCENE_NGRAM_FIELD + UNIQUE_FEATURE_NAME);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    } finally {
        // Release the index reader so the temporary index files are not kept open.
        if (index != null) {
            index.close();
        }
    }
    assertEquals(35, i);
}
Also used : JCasIterable(org.apache.uima.fit.pipeline.JCasIterable) Terms(org.apache.lucene.index.Terms) JCas(org.apache.uima.jcas.JCas) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) TermsEnum(org.apache.lucene.index.TermsEnum) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) Fields(org.apache.lucene.index.Fields) MultiFields(org.apache.lucene.index.MultiFields) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) IndexReader(org.apache.lucene.index.IndexReader) File(java.io.File) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)

Example 2 with JCasIterable

use of org.apache.uima.fit.pipeline.JCasIterable in project dkpro-tc by dkpro.

The method combinedNgramPairMetaCollectorTest of the class LuceneNGramCPMetaCollectorTest.

/**
 * Runs a pair reader, an aggregate that applies the segmenter and document-mode annotator to
 * both {@code Constants.PART_ONE} and {@code Constants.PART_TWO}, and a
 * {@code LuceneNGramCPMetaCollector} over {@code src/test/resources/data/textpairs.txt}, then
 * inspects the Lucene index found in the temporary folder.
 *
 * <p>Expects 65 terms in the field {@code LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO}; if the
 * term {@code "mice_ANDcats_."} is present, it must have a document frequency of 1 and a
 * total term frequency of 1.
 *
 * @throws Exception if the pipeline fails; any exception raised while reading the index is
 *         wrapped in a {@link ResourceInitializationException}
 */
@Test
public void combinedNgramPairMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
            TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR);
    // Segment and annotate each part of the pair separately.
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_TWO);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramCPMetaCollector.class,
            LuceneNGramCPFE.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            LuceneNGramCPFE.PARAM_SOURCE_LOCATION, tmpDir,
            LuceneNGramPMetaCollector.PARAM_TARGET_LOCATION, tmpDir);
    // test fails if for-loop removed
    for (@SuppressWarnings("unused") JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
    // System.out.println(jcas.getDocumentText().length());
    }
    int i = 0;
    IndexReader index = null;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // then this would be relevant
                    // NOTE(review): these assertions only run if the term is present in the index.
                    if (text.utf8ToString().equals("mice_ANDcats_.")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(1, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    } finally {
        // Release the index reader so the temporary index files are not kept open.
        if (index != null) {
            index.close();
        }
    }
    assertEquals(65, i);
}
Also used : JCasIterable(org.apache.uima.fit.pipeline.JCasIterable) Terms(org.apache.lucene.index.Terms) JCas(org.apache.uima.jcas.JCas) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) TermsEnum(org.apache.lucene.index.TermsEnum) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) Fields(org.apache.lucene.index.Fields) MultiFields(org.apache.lucene.index.MultiFields) AggregateBuilder(org.apache.uima.fit.factory.AggregateBuilder) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) IndexReader(org.apache.lucene.index.IndexReader) File(java.io.File) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)

Example 3 with JCasIterable

use of org.apache.uima.fit.pipeline.JCasIterable in project dkpro-tc by dkpro.

The method lucenePairNgramMetaCollectorTest of the class LuceneNGramPMetaCollectorTest.

/**
 * Runs a pair reader, an aggregate that applies the segmenter and document-mode annotator to
 * both {@code Constants.PART_ONE} and {@code Constants.PART_TWO}, and a
 * {@code LuceneNGramPMetaCollector} over {@code src/test/resources/data/textpairs.txt}, then
 * inspects the Lucene index found in the temporary folder.
 *
 * <p>Expects 16 terms in the field {@code WordNGram.LUCENE_NGRAM_FIELD}; if the term
 * {@code "this"} is present, it must have a document frequency of 2 and a total term
 * frequency of 3.
 *
 * @throws Exception if the pipeline fails; any exception raised while reading the index is
 *         wrapped in a {@link ResourceInitializationException}
 */
@Test
public void lucenePairNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
            TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR);
    // Segment and annotate each part of the pair separately.
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_TWO);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramPMetaCollector.class,
            LuceneNGramPFE.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            LuceneNGramPFE.PARAM_SOURCE_LOCATION, tmpDir,
            LuceneNGramPMetaCollector.PARAM_TARGET_LOCATION, tmpDir);
    // test fails if for-loop removed
    for (@SuppressWarnings("unused") JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
    // System.out.println(jcas.getDocumentText().length());
    }
    int i = 0;
    IndexReader index = null;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(WordNGram.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    } finally {
        // Release the index reader so the temporary index files are not kept open.
        if (index != null) {
            index.close();
        }
    }
    assertEquals(16, i);
}
Also used : JCasIterable(org.apache.uima.fit.pipeline.JCasIterable) Terms(org.apache.lucene.index.Terms) JCas(org.apache.uima.jcas.JCas) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) TermsEnum(org.apache.lucene.index.TermsEnum) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) Fields(org.apache.lucene.index.Fields) MultiFields(org.apache.lucene.index.MultiFields) AggregateBuilder(org.apache.uima.fit.factory.AggregateBuilder) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) IndexReader(org.apache.lucene.index.IndexReader) File(java.io.File) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)

Example 4 with JCasIterable

use of org.apache.uima.fit.pipeline.JCasIterable in project webanno by webanno.

The method testTeiReader of the class TeiReaderTest.

/**
 * Reads the XML files matching {@code [+]*.xml} under {@code classpath:/local/} with
 * {@code TeiReader} and, for every document, prints its id, text length and language, then
 * checks the number of tokens, POS tags, lemmas, named entities and sentences as well as the
 * text of the first sentence.
 *
 * <p>Currently ignored.
 *
 * @throws Exception if the reader cannot be created or a document cannot be read
 */
@Test
@Ignore("No TEI yet to opensource ")
public void testTeiReader() throws Exception {
    final String expectedFirstSentence = "70 I DAG.";
    final CollectionReaderDescription teiReader = createReaderDescription(
            TeiReader.class,
            TeiReader.PARAM_LANGUAGE, "en",
            TeiReader.PARAM_SOURCE_LOCATION, "classpath:/local/",
            TeiReader.PARAM_PATTERNS, new String[] { "[+]*.xml" });

    for (JCas document : new JCasIterable(teiReader)) {
        DocumentMetaData metaData = DocumentMetaData.get(document);
        String documentText = document.getDocumentText();

        // Diagnostic output: document id, text length and language.
        System.out.printf("%s - %d%n", metaData.getDocumentId(), documentText.length());
        System.out.println(document.getDocumentLanguage());

        // Annotation counts expected for the test document.
        assertEquals(2235, JCasUtil.select(document, Token.class).size());
        assertEquals(745, JCasUtil.select(document, POS.class).size());
        assertEquals(745, JCasUtil.select(document, Lemma.class).size());
        assertEquals(0, JCasUtil.select(document, NamedEntity.class).size());
        assertEquals(30, JCasUtil.select(document, Sentence.class).size());

        // The first sentence annotation must cover the expected text.
        assertEquals(
                expectedFirstSentence,
                JCasUtil.select(document, Sentence.class).iterator().next().getCoveredText());
    }
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) JCasIterable(org.apache.uima.fit.pipeline.JCasIterable) JCas(org.apache.uima.jcas.JCas) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 5 with JCasIterable

use of org.apache.uima.fit.pipeline.JCasIterable in project dkpro-tc by dkpro.

The method emptyDocumentTest of the class WordNGramMetaCollectorTest.

@SuppressWarnings("unused")
@Test
public void emptyDocumentTest() throws Exception {
    File tmpDir = folder.newFolder();
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class, TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/empty/", TextReader.PARAM_LANGUAGE, "en", TextReader.PARAM_PATTERNS, "empty*.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(WordNGramMC.class, WordNGramMC.PARAM_UNIQUE_EXTRACTOR_NAME, "123", WordNGramMC.PARAM_TARGET_LOCATION, tmpDir);
    for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) {
    // System.out.println(jcas.getDocumentText().length());
    }
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) JCasIterable(org.apache.uima.fit.pipeline.JCasIterable) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) JCas(org.apache.uima.jcas.JCas) File(java.io.File) Test(org.junit.Test)

Aggregations

CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription)5 JCasIterable (org.apache.uima.fit.pipeline.JCasIterable)5 JCas (org.apache.uima.jcas.JCas)5 Test (org.junit.Test)5 File (java.io.File)4 AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription)4 Fields (org.apache.lucene.index.Fields)3 IndexReader (org.apache.lucene.index.IndexReader)3 MultiFields (org.apache.lucene.index.MultiFields)3 Terms (org.apache.lucene.index.Terms)3 TermsEnum (org.apache.lucene.index.TermsEnum)3 BytesRef (org.apache.lucene.util.BytesRef)3 ResourceInitializationException (org.apache.uima.resource.ResourceInitializationException)3 AggregateBuilder (org.apache.uima.fit.factory.AggregateBuilder)2 DocumentMetaData (de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData)1 Ignore (org.junit.Ignore)1