Use of org.apache.tika.eval.tokens.AnalyzerManager in the Apache Tika project.
From class AnalyzerManagerTest, method testTokenCountFilter.
/**
 * Feeds the general analyzer 1,001,000 whitespace-separated words and checks that exactly
 * 1,000,000 tokens come out, i.e. the same number that was handed to
 * {@code AnalyzerManager.newInstance} (presumably the maximum token count; confirm against
 * AnalyzerManager).
 *
 * @throws Exception if the analyzer cannot be built or the token stream fails
 */
@Test
public void testTokenCountFilter() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance(1000000);
    // Build an input that is 1,000 words longer than the value passed to newInstance.
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 1001000; i++) {
        sb.append("the ");
    }
    int tokens = 0;
    // try-with-resources guarantees close() runs even if iteration throws; the original
    // never ended or closed the stream.
    try (TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString())) {
        ts.reset();
        while (ts.incrementToken()) {
            tokens++;
        }
        // Complete the consumer workflow (reset -> incrementToken* -> end -> close).
        ts.end();
    }
    assertEquals(1000000, tokens);
}
Use of org.apache.tika.eval.tokens.AnalyzerManager in the Apache Tika project.
From class AnalyzerManagerTest, method testCommon.
/**
 * Runs the common-tokens analyzer over {@code "the 5,000.12 and dirty dog"} and checks that
 * "dirty" is emitted, "the" is not, and no emitted token is both reported alphabetic by
 * {@code AlphaIdeographFilterFactory.isAlphabetic} and contains the digit 5.
 *
 * @throws Exception if the analyzer cannot be built or the token stream fails
 */
@Test
public void testCommon() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
    Analyzer common = analyzerManager.getCommonTokensAnalyzer();
    Set<String> seen = new HashSet<>();
    // try-with-resources closes the stream even when fail(...) or incrementToken() throws,
    // which the original's trailing ts.close() did not.
    try (TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog")) {
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            // Materialize the term once and reuse it for both the check and the set.
            String t = termAtt.toString();
            if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
                fail("Shouldn't have found a numeric");
            }
            seen.add(t);
        }
        ts.end();
    }
    assertTrue(seen.contains("dirty"));
    assertFalse(seen.contains("the"));
}
Use of org.apache.tika.eval.tokens.AnalyzerManager in the Apache Tika project.
From class AnalyzerManagerTest, method testGeneral.
/**
 * Runs the general analyzer over {@code "tHe quick aaaa aaa anD dirty dog"} and checks that
 * the lower-case terms "the", "and" and "dog" are all emitted, even though the input spells
 * the first two in mixed case.
 *
 * @throws Exception if the analyzer cannot be built or the token stream fails
 */
@Test
public void testGeneral() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
    Analyzer general = analyzerManager.getGeneralAnalyzer();
    Set<String> seen = new HashSet<>();
    // try-with-resources closes the stream even if incrementToken() throws part-way through.
    try (TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog")) {
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            seen.add(termAtt.toString());
        }
        ts.end();
    }
    assertTrue(seen.contains("the"));
    assertTrue(seen.contains("and"));
    assertTrue(seen.contains("dog"));
}
Aggregations