use of org.apache.uima.fit.pipeline.JCasIterable in project dkpro-tc by dkpro.
the class WordNGramMetaCollectorTest method luceneNgramMetaCollectorTest.
/**
 * Runs the {@code WordNGramMC} meta collector over the {@code text*.txt} files in
 * {@code src/test/resources/data/} and inspects the Lucene index found in the
 * temporary target folder afterwards.
 * <p>
 * Expectations checked: the term {@code "this"} (if present) occurs in exactly one
 * document with a total frequency of three, and the n-gram field holds 35 distinct
 * terms overall.
 *
 * @throws Exception if the pipeline fails, or (wrapped in a
 *         {@code ResourceInitializationException}) if reading the index fails
 */
@Test
public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class, TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en", TextReader.PARAM_PATTERNS, "text*.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(WordNGramMC.class, WordNGramMC.PARAM_TARGET_LOCATION, tmpDir, WordNGramMC.PARAM_UNIQUE_EXTRACTOR_NAME, UNIQUE_FEATURE_NAME);
    // Iterating the JCasIterable is what actually drives the pipeline.
    for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) {
        System.out.println(jcas.getDocumentText().length());
    }
    // Counts the distinct terms found in the n-gram field.
    int i = 0;
    IndexReader index = null;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(WordNGram.LUCENE_NGRAM_FIELD + UNIQUE_FEATURE_NAME);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    } finally {
        // Release the index files; the reader was previously left open.
        if (index != null) {
            index.close();
        }
    }
    assertEquals(35, i);
}
use of org.apache.uima.fit.pipeline.JCasIterable in project dkpro-tc by dkpro.
the class LuceneNGramCPMetaCollectorTest method combinedNgramPairMetaCollectorTest.
/**
 * Runs the {@code LuceneNGramCPMetaCollector} over the text pairs in
 * {@code src/test/resources/data/textpairs.txt} and inspects the Lucene index found
 * in the temporary folder afterwards.
 * <p>
 * The segmenter and the document-mode annotator are applied to both
 * {@code PART_ONE} and {@code PART_TWO} views. Expectations checked: the combined
 * term {@code "mice_ANDcats_."} (if present) occurs once in one document, and the
 * combo field holds 65 distinct terms overall.
 *
 * @throws Exception if the pipeline fails, or (wrapped in a
 *         {@code ResourceInitializationException}) if reading the index fails
 */
@Test
public void combinedNgramPairMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class, TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR);
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_TWO);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(LuceneNGramCPMetaCollector.class, LuceneNGramCPFE.PARAM_UNIQUE_EXTRACTOR_NAME, "123", LuceneNGramCPFE.PARAM_SOURCE_LOCATION, tmpDir, LuceneNGramPMetaCollector.PARAM_TARGET_LOCATION, tmpDir);
    // test fails if for-loop removed (iterating is what drives the pipeline)
    for (@SuppressWarnings("unused") JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }
    // Counts the distinct terms found in the combo field.
    int i = 0;
    IndexReader index = null;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // then this would be relevant
                    if (text.utf8ToString().equals("mice_ANDcats_.")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(1, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    } finally {
        // Release the index files; the reader was previously left open.
        if (index != null) {
            index.close();
        }
    }
    assertEquals(65, i);
}
use of org.apache.uima.fit.pipeline.JCasIterable in project dkpro-tc by dkpro.
the class LuceneNGramPMetaCollectorTest method lucenePairNgramMetaCollectorTest.
/**
 * Runs the {@code LuceneNGramPMetaCollector} over the text pairs in
 * {@code src/test/resources/data/textpairs.txt} and inspects the Lucene index found
 * in the temporary folder afterwards.
 * <p>
 * The segmenter and the document-mode annotator are applied to both
 * {@code PART_ONE} and {@code PART_TWO} views. Expectations checked: the term
 * {@code "this"} (if present) occurs in two documents with a total frequency of
 * three, and the n-gram field holds 16 distinct terms overall.
 *
 * @throws Exception if the pipeline fails, or (wrapped in a
 *         {@code ResourceInitializationException}) if reading the index fails
 */
@Test
public void lucenePairNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class, TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR);
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_TWO);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(LuceneNGramPMetaCollector.class, LuceneNGramPFE.PARAM_UNIQUE_EXTRACTOR_NAME, "123", LuceneNGramPFE.PARAM_SOURCE_LOCATION, tmpDir, LuceneNGramPMetaCollector.PARAM_TARGET_LOCATION, tmpDir);
    // test fails if for-loop removed (iterating is what drives the pipeline)
    for (@SuppressWarnings("unused") JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }
    // Counts the distinct terms found in the n-gram field.
    int i = 0;
    IndexReader index = null;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(WordNGram.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    } finally {
        // Release the index files; the reader was previously left open.
        if (index != null) {
            index.close();
        }
    }
    assertEquals(16, i);
}
use of org.apache.uima.fit.pipeline.JCasIterable in project webanno by webanno.
the class TeiReaderTest method testTeiReader.
/**
 * Reads the XML files matching {@code [+]*.xml} under {@code classpath:/local/}
 * with the {@code TeiReader} and, for every document produced, prints its id,
 * text length and language before checking the annotation counts and the text of
 * the first sentence.
 * <p>
 * Currently ignored (see the {@code @Ignore} reason).
 *
 * @throws Exception if the reader cannot be created or a document cannot be read
 */
@Test
@Ignore("No TEI yet to opensource ")
public void testTeiReader() throws Exception {
    final String expectedFirstSentence = "70 I DAG.";
    final String[] filePatterns = new String[] { "[+]*.xml" };
    CollectionReaderDescription teiReader = createReaderDescription(
            TeiReader.class,
            TeiReader.PARAM_LANGUAGE, "en",
            TeiReader.PARAM_SOURCE_LOCATION, "classpath:/local/",
            TeiReader.PARAM_PATTERNS, filePatterns);

    for (JCas document : new JCasIterable(teiReader)) {
        DocumentMetaData metaData = DocumentMetaData.get(document);
        String documentText = document.getDocumentText();

        // Diagnostic output: document id, text length and language.
        System.out.printf("%s - %d%n", metaData.getDocumentId(), documentText.length());
        System.out.println(document.getDocumentLanguage());

        // Expected annotation counts per type.
        assertEquals(2235, JCasUtil.select(document, Token.class).size());
        assertEquals(745, JCasUtil.select(document, POS.class).size());
        assertEquals(745, JCasUtil.select(document, Lemma.class).size());
        assertEquals(0, JCasUtil.select(document, NamedEntity.class).size());
        assertEquals(30, JCasUtil.select(document, Sentence.class).size());

        Sentence leadingSentence = JCasUtil.select(document, Sentence.class).iterator().next();
        assertEquals(expectedFirstSentence, leadingSentence.getCoveredText());
    }
}
use of org.apache.uima.fit.pipeline.JCasIterable in project dkpro-tc by dkpro.
the class WordNGramMetaCollectorTest method emptyDocumentTest.
/**
 * Runs the {@code WordNGramMC} meta collector pipeline over the
 * {@code empty*.txt} files in {@code src/test/resources/empty/}.
 * <p>
 * The test contains no assertions; it passes as long as the pipeline runs through
 * without throwing.
 *
 * @throws Exception if any pipeline component fails
 */
@SuppressWarnings("unused")
@Test
public void emptyDocumentTest() throws Exception {
    File targetDir = folder.newFolder();

    CollectionReaderDescription emptyDocReader = CollectionReaderFactory.createReaderDescription(
            TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/empty/",
            TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "empty*.txt");
    AnalysisEngineDescription tokenizer = AnalysisEngineFactory.createEngineDescription(
            BreakIteratorSegmenter.class);
    AnalysisEngineDescription modeAnnotator = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);
    AnalysisEngineDescription ngramCollector = AnalysisEngineFactory.createEngineDescription(
            WordNGramMC.class,
            WordNGramMC.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            WordNGramMC.PARAM_TARGET_LOCATION, targetDir);

    // Iterating is what drives the pipeline; the CAS itself is not inspected.
    JCasIterable pipeline = new JCasIterable(emptyDocReader, tokenizer, modeAnnotator, ngramCollector);
    for (JCas processed : pipeline) {
        // System.out.println(jcas.getDocumentText().length());
    }
}
Aggregations