Use of org.apache.lucene.analysis.Tokenizer in project sukija by ahomansikka.
From class AppTest, method test.
private boolean test(String input, String expectedOutput) throws IOException {
  Reader r = new StringReader(input);
  TokenStream t = new HVTokenizer();
  ((Tokenizer) t).setReader(r);
  t = new VoikkoFilter(t, voikko);
  // Add attributes before reset(), then consume the stream.
  VoikkoAttribute sukijaAtt = t.addAttribute(VoikkoAttribute.class);
  CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
  t.reset();
  while (t.incrementToken()) {
    System.out.println("AppTest " + termAtt.toString());
    for (int i = 0; i < sukijaAtt.getAnalysis().size(); i++) {
      // Print each Voikko analysis' base form for manual inspection.
      System.out.println(sukijaAtt.getAnalysis(i).get("BASEFORM"));
      // VoikkoUtils.printAnalysisResult(sukijaAtt.getAnalysis(i), System.out);
    }
    System.out.println("");
  }
  t.end();
  t.close();
  // Note: expectedOutput is not checked here; the method prints for
  // manual inspection and always succeeds.
  return true;
}
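Because the method above only prints and returns true unconditionally, the expectedOutput parameter is never asserted. A minimal, hedged sketch of a stricter variant, assuming the same sukija classes (HVTokenizer, VoikkoFilter, VoikkoAttribute) and the voikko field shown above; the name testChecked is ours:

private boolean testChecked(String input, String expectedOutput) throws IOException {
  Tokenizer tokenizer = new HVTokenizer();
  tokenizer.setReader(new StringReader(input));
  try (TokenStream t = new VoikkoFilter(tokenizer, voikko)) {
    VoikkoAttribute sukijaAtt = t.addAttribute(VoikkoAttribute.class);
    t.reset();
    boolean found = false;
    while (t.incrementToken()) {
      for (int i = 0; i < sukijaAtt.getAnalysis().size(); i++) {
        // Succeed if any analysis carries the expected base form.
        if (expectedOutput.equals(sukijaAtt.getAnalysis(i).get("BASEFORM"))) {
          found = true;
        }
      }
    }
    t.end();  // record final offset/position state before closing
    return found;
  }
}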
Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.
From class AnalyzingSuggesterTest, method testDupSurfaceFormsMissingResults2.
public void testDupSurfaceFormsMissingResults2() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
      return new TokenStreamComponents(tokenizer) {
        int count;

        @Override
        public TokenStream getTokenStream() {
          if (count == 0) {
            // First invocation (build time): "p" followed by "q", with "r"
            // and "s" stacked on the same position as "q" (posInc 0).
            count++;
            return new CannedTokenStream(new Token[] {
                token("p", 1, 1), token("q", 1, 1), token("r", 0, 1), token("s", 0, 1) });
          } else {
            // Later invocations (lookup time): just "p".
            return new CannedTokenStream(new Token[] { token("p", 1, 1) });
          }
        }

        @Override
        protected void setReader(final Reader reader) {
        }
      };
    }
  };
  Directory tempDir = getDirectory();
  AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a, a, 0, 256, -1, true);
  suggester.build(new InputArrayIterator(new Input[] { new Input("a", 6), new Input("b", 5) }));
  List<LookupResult> results = suggester.lookup("a", false, 2);
  assertEquals(2, results.size());
  assertEquals("a", results.get(0).key);
  assertEquals(6, results.get(0).value);
  assertEquals("b", results.get(1).key);
  assertEquals(5, results.get(1).value);

  // Try again after save/load:
  Path tmpDir = createTempDir("AnalyzingSuggesterTest");
  Path path = tmpDir.resolve("suggester");
  OutputStream os = Files.newOutputStream(path);
  suggester.store(os);
  os.close();
  InputStream is = Files.newInputStream(path);
  suggester.load(is);
  is.close();
  results = suggester.lookup("a", false, 2);
  assertEquals(2, results.size());
  assertEquals("a", results.get(0).key);
  assertEquals(6, results.get(0).value);
  assertEquals("b", results.get(1).key);
  assertEquals(5, results.get(1).value);
  IOUtils.close(a, tempDir);
}
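The token(...) helper is not shown in this snippet. A plausible definition, hedged but matching the helper used across Lucene's own suggester tests, builds a Token with an explicit position increment and position length; a position increment of 0 is what stacks "r" and "s" on the same position as "q" above:

private static Token token(String term, int posInc, int posLength) {
  final Token t = new Token(term, 0, 0);  // term text with dummy start/end offsets
  t.setPositionIncrement(posInc);         // 0 = stacked on the previous position
  t.setPositionLength(posLength);         // how many positions the token spans
  return t;
}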
Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.
From class FuzzySuggesterTest, method testInputPathRequired.
public void testInputPathRequired() throws Exception {
  // SynonymMap.Builder b = new SynonymMap.Builder(false);
  // b.add(new CharsRef("ab"), new CharsRef("ba"), true);
  // final SynonymMap map = b.build();
  // The Analyzer below mimics the functionality of the SynonymAnalyzer
  // using the above map, so that the suggest module does not need a
  // dependency on the synonym module.
  final Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
      return new TokenStreamComponents(tokenizer) {
        int tokenStreamCounter = 0;

        // One canned stream per analysis call: first the two build-time
        // inputs, then the lookup text, with "ba" stacked on "ab" (posInc 0)
        // to simulate the synonym expansion.
        final TokenStream[] tokenStreams = new TokenStream[] {
            new CannedTokenStream(new Token[] {
                token("ab", 1, 1), token("ba", 0, 1), token("xc", 1, 1) }),
            new CannedTokenStream(new Token[] {
                token("ba", 1, 1), token("xd", 1, 1) }),
            new CannedTokenStream(new Token[] {
                token("ab", 1, 1), token("ba", 0, 1), token("x", 1, 1) }) };

        @Override
        public TokenStream getTokenStream() {
          TokenStream result = tokenStreams[tokenStreamCounter];
          tokenStreamCounter++;
          return result;
        }

        @Override
        protected void setReader(final Reader reader) {
        }
      };
    }
  };
  Input[] keys = new Input[] { new Input("ab xc", 50), new Input("ba xd", 50) };
  Directory tempDir = getDirectory();
  FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", analyzer);
  suggester.build(new InputArrayIterator(keys));
  List<LookupResult> results = suggester.lookup("ab x", false, 1);
  assertTrue(results.size() == 1);
  IOUtils.close(analyzer, tempDir);
}
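For reference, a hedged sketch of the real synonym analyzer the commented-out lines describe; the canned streams above reproduce its output ({ab -> ba}, keeping the original token) without depending on the synonym module. This assumes lucene-analyzers-common on the classpath (SynonymGraphFilter supersedes SynonymFilter in newer versions) and a surrounding method that throws IOException:

SynonymMap.Builder b = new SynonymMap.Builder(false);
b.add(new CharsRef("ab"), new CharsRef("ba"), true);  // true = keep the original token
final SynonymMap map = b.build();
Analyzer synonymAnalyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
    // Expands "ab" to the stacked pair "ab"/"ba", as in the canned streams.
    return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
  }
};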
Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.
From class TestFreeTextSuggester, method testEndingHole.
// With one ending hole, ShingleFilter produces "of _" and
// we should properly predict from that:
public void testEndingHole() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String field) {
      Tokenizer tokenizer = new MockTokenizer();
      CharArraySet stopSet = StopFilter.makeStopSet("of");
      return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
    }
  };
  Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(new Input("wizard of oz", 50));
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("wizard _ oz/1.00", toString(sug.lookup("wizard of", 10)));
  // Falls back to the unigram model, with backoff 0.4 times
  // prob 0.5:
  assertEquals("oz/0.20", toString(sug.lookup("wizard o", 10)));
  a.close();
}
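A hedged side sketch of the hole itself: with the stop Analyzer a above (run before it is closed), "oz" arrives with a position increment of 2, and it is this gap that the suggester's shingling renders as the "_" filler in "wizard _ oz". The field name "body" is arbitrary:

try (TokenStream ts = a.tokenStream("body", "wizard of oz")) {
  CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    // Expected: "wizard posInc=1" then "oz posInc=2" (the hole left by "of").
    System.out.println(termAtt + " posInc=" + posIncAtt.getPositionIncrement());
  }
  ts.end();
}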
Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.
From class TestFreeTextSuggester, method testTwoEndingHoles.
// If the number of ending holes exceeds the ngrams window
// then there are no predictions, because ShingleFilter
// does not produce e.g. a hole only "_ _" token:
public void testTwoEndingHoles() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String field) {
      Tokenizer tokenizer = new MockTokenizer();
      CharArraySet stopSet = StopFilter.makeStopSet("of");
      return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
    }
  };
  Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(new Input("wizard of of oz", 50));
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("", toString(sug.lookup("wizard of of", 10)));
  a.close();
}
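To observe the behavior the comment describes, a hedged standalone sketch wrapping the same stop Analyzer in a ShingleFilter (this illustrates ShingleFilter generally, not the suggester's internal chain; the field name "body" is arbitrary, and it must run before a.close()): fillers ("_") appear inside shingles, but no shingle consisting only of fillers, such as "_ _", is emitted, so the trigram model has no context left after "of of":

try (TokenStream shingles = new ShingleFilter(a.tokenStream("body", "wizard of of oz"), 2, 3)) {
  CharTermAttribute termAtt = shingles.addAttribute(CharTermAttribute.class);
  shingles.reset();
  while (shingles.incrementToken()) {
    // Prints shingles such as "wizard _" and "wizard _ _", but never "_ _".
    System.out.println(termAtt);
  }
  shingles.end();
}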