Use of org.apache.lucene.analysis.shingle.ShingleFilter in the project cogcomp-nlp by CogComp:
class CharacterShingleAnalyzer, method createComponents.
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Analysis chain: character shingles -> standard filtering -> ASCII folding
    // -> lowercasing -> 3-gram token shingles.
    final Tokenizer tokenizer = new CharacterShingleTokenizer();
    final TokenStream chain =
        new ShingleFilter(
            new LowerCaseFilter(new ASCIIFoldingFilter(new StandardFilter(tokenizer))), 3);
    return new TokenStreamComponents(tokenizer, chain);
}
Use of org.apache.lucene.analysis.shingle.ShingleFilter in the project lucene-solr by Apache:
class FreeTextSuggester, method addShingles.
/**
 * Wraps the given analyzer with a ShingleFilter that emits token n-grams of
 * order 2 through {@code grams}, joined by the configured separator character.
 * When {@code grams == 1} no shingling is needed, so the analyzer is returned
 * unchanged.
 */
private Analyzer addShingles(final Analyzer other) {
    if (grams != 1) {
        // Append a ShingleFilter to the wrapped analyzer's chain to produce ngrams.
        return new AnalyzerWrapper(other.getReuseStrategy()) {
            @Override
            protected Analyzer getWrappedAnalyzer(String fieldName) {
                return other;
            }

            @Override
            protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
                final ShingleFilter filter = new ShingleFilter(components.getTokenStream(), 2, grams);
                filter.setTokenSeparator(String.valueOf((char) separator));
                return new TokenStreamComponents(components.getTokenizer(), filter);
            }
        };
    }
    // Unigram model: no shingling required.
    return other;
}
Use of org.apache.lucene.analysis.shingle.ShingleFilter in the project lucene-solr by Apache:
class EdgeNGramTokenFilterTest, method testGraphs.
public void testGraphs() throws IOException {
    // Letter tokens -> bigram shingles -> edge ngrams of length 7..10.
    final Tokenizer tokenizer = new LetterTokenizer();
    tokenizer.setReader(new StringReader("abc d efgh ij klmno p q"));
    final TokenStream stream = new EdgeNGramTokenFilter(new ShingleFilter(tokenizer), 7, 10);
    assertTokenStreamContents(stream,
        new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
        new int[] { 6, 11, 11, 14 },
        new int[] { 13, 19, 19, 21 },
        new int[] { 3, 1, 0, 1 },
        new int[] { 2, 2, 2, 2 },
        23);
}
Aggregations