Use of io.anserini.ann.FeatureVectorsTokenizer in project Anserini by castorini.
The class LexicalLshFeaturePositionTokenFilterTest, method testFiltering.
@Test
public void testFiltering() throws Exception {
  StringReader reader = new StringReader("-0.10 0.20 0.30 0.40");
  Tokenizer stream = new FeatureVectorsTokenizer();
  stream.setReader(reader);
  LexicalLshFeaturePositionTokenFilter filter = new LexicalLshFeaturePositionTokenFilter(stream);
  filter.reset();
  // each value should come back prefixed with its 1-based feature position
  List<String> expectedTokens = new LinkedList<>();
  expectedTokens.add("1_-0.10");
  expectedTokens.add("2_0.20");
  expectedTokens.add("3_0.30");
  expectedTokens.add("4_0.40");
  int i = 0;
  while (filter.incrementToken()) {
    CharTermAttribute charTermAttribute = filter.getAttribute(CharTermAttribute.class);
    String token = new String(charTermAttribute.buffer(), 0, charTermAttribute.length());
    assertEquals(expectedTokens.get(i), token);
    i++;
  }
  filter.close();
}
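The consume loop above (reset, incrementToken, read the CharTermAttribute, then close) recurs in every test on this page. A small hypothetical helper like the following, written only against standard Lucene token-stream APIs and not part of Anserini, could factor it out:

static List<String> collectTokens(TokenStream stream) throws IOException {
  // hypothetical helper, not in Anserini: drain a TokenStream into term strings
  List<String> tokens = new ArrayList<>();
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    tokens.add(term.toString());
  }
  stream.end();
  stream.close();
  return tokens;
}

With such a helper, each test reduces to building the filter chain and asserting collectTokens(filter) against the expected list.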
Use of io.anserini.ann.FeatureVectorsTokenizer in project Anserini by castorini.
The class LexicalLshFeaturePositionTokenFilterTest, method testFilteringWithOffset.
@Test
public void testFilteringWithOffset() throws Exception {
  StringReader reader = new StringReader("-0.104123 0.20435 0.3042366 0.41243241");
  Tokenizer stream = new FeatureVectorsTokenizer();
  stream.setReader(reader);
  LexicalLshFeaturePositionTokenFilter filter = new LexicalLshFeaturePositionTokenFilter(stream, 2);
  filter.reset();
  // with an offset of 2, the leading "0." of each value is dropped while the sign is kept
  List<String> expectedTokens = new LinkedList<>();
  expectedTokens.add("1_-104123");
  expectedTokens.add("2_20435");
  expectedTokens.add("3_3042366");
  expectedTokens.add("4_41243241");
  int i = 0;
  while (filter.incrementToken()) {
    CharTermAttribute charTermAttribute = filter.getAttribute(CharTermAttribute.class);
    String token = new String(charTermAttribute.buffer(), 0, charTermAttribute.length());
    assertEquals(expectedTokens.get(i), token);
    i++;
  }
  filter.close();
}
Use of io.anserini.ann.FeatureVectorsTokenizer in project Anserini by castorini.
The class FakeWordsEncoderAnalyzer, method createComponents.
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer t = new FeatureVectorsTokenizer();
  // encode each feature value as repeated "fake word" tokens, quantized by q
  TokenFilter filter = new FakeWordsEncodeAndQuantizeFilter(t, q);
  // drop any tokens contained in the stopword set
  filter = new StopFilter(filter, set);
  return new TokenStreamComponents(t, filter);
}
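A minimal sketch of driving this analyzer end to end, assuming a constructor that takes the quantization factor q used above (the constructor is an assumption here; check the class for its actual signatures):

Analyzer analyzer = new FakeWordsEncoderAnalyzer(20); // assumed int-arg constructor
try (TokenStream ts = analyzer.tokenStream("vector", "-0.10 0.20 0.30 0.40")) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term.toString()); // fake-word tokens such as f2, f3
  }
  ts.end();
}

Because the output is plain terms, the resulting field can be indexed with a stock Lucene IndexWriter; no custom codec is needed.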
Use of io.anserini.ann.FeatureVectorsTokenizer in project Anserini by castorini.
The class LexicalLshAnalyzer, method createComponents.
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer source = new FeatureVectorsTokenizer();
  // truncate each value to the configured number of decimals
  TokenFilter truncate = new LexicalLshTruncateTokenFilter(source, decimals);
  // prefix each value with its feature position (e.g. "1_-0.10")
  TokenFilter featurePos = new LexicalLshFeaturePositionTokenFilter(truncate);
  TokenStream filter;
  if (min > 1) {
    // group adjacent features into shingles of size min..max before hashing
    ShingleFilter shingleFilter = new ShingleFilter(featurePos, min, max);
    shingleFilter.setTokenSeparator(" ");
    shingleFilter.setOutputUnigrams(false);
    shingleFilter.setOutputUnigramsIfNoShingles(false);
    filter = new MinHashFilter(shingleFilter, hashCount, bucketCount, hashSetSize, bucketCount > 1);
  } else {
    filter = new MinHashFilter(featurePos, hashCount, bucketCount, hashSetSize, bucketCount > 1);
  }
  return new TokenStreamComponents(source, new RemoveDuplicatesTokenFilter(filter));
}
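The same chain can be wired by hand for a quick probe of what reaches the index. A minimal sketch with illustrative parameter values (not Anserini defaults):

Tokenizer source = new FeatureVectorsTokenizer();
source.setReader(new StringReader("0.12345 0.67890"));
TokenStream ts = new LexicalLshTruncateTokenFilter(source, 1); // keep 1 decimal
ts = new LexicalLshFeaturePositionTokenFilter(ts);             // prefix feature positions
ts = new MinHashFilter(ts, 1, 1, 1, false);                    // hashCount, bucketCount, hashSetSize, withRotation
ts = new RemoveDuplicatesTokenFilter(ts);
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  System.out.println(term.toString()); // MinHash signature tokens
}
ts.end();
ts.close();

This mirrors the else branch above (no shingling); with min > 1, a ShingleFilter would sit between the position filter and the MinHashFilter.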
Use of io.anserini.ann.FeatureVectorsTokenizer in project Anserini by castorini.
The class FakeWordsEncodeAndQuantizeFilterTest, method testFiltering.
@Test
public void testFiltering() throws Exception {
  StringReader reader = new StringReader("-0.10 0.20 0.30 0.40");
  Tokenizer stream = new FeatureVectorsTokenizer();
  stream.setReader(reader);
  FakeWordsEncodeAndQuantizeFilter filter = new FakeWordsEncodeAndQuantizeFilter(stream, 20);
  filter.reset();
  List<String> expectedTokens = new LinkedList<>();
  // quantization of -0.10 leads to zero, encoded as the "_" placeholder
  expectedTokens.add("_");
  // quantization of 0.20 leads to 4 tokens
  expectedTokens.addAll(Collections.nCopies(4, "f2"));
  // quantization of 0.30 leads to 6 tokens
  expectedTokens.addAll(Collections.nCopies(6, "f3"));
  // quantization of 0.40 leads to 15 tokens
  expectedTokens.addAll(Collections.nCopies(15, "f4"));
  int i = 0;
  while (filter.incrementToken()) {
    CharTermAttribute charTermAttribute = filter.getAttribute(CharTermAttribute.class);
    String token = new String(charTermAttribute.buffer(), 0, charTermAttribute.length());
    assertEquals(expectedTokens.get(i), token);
    i++;
  }
  filter.close();
}
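A minimal sketch of what this encoding preserves: each feature's magnitude survives as term frequency, so counting the emitted tokens recovers the quantized values. Only standard Lucene APIs plus the filter under test are used:

Map<String, Integer> termFreqs = new HashMap<>();
Tokenizer t = new FeatureVectorsTokenizer();
t.setReader(new StringReader("-0.10 0.20 0.30 0.40"));
TokenStream ts = new FakeWordsEncodeAndQuantizeFilter(t, 20);
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  termFreqs.merge(term.toString(), 1, Integer::sum); // tf per fake word
}
ts.end();
ts.close();
// per the test above, this prints counts such as f2=4 and f3=6
termFreqs.forEach((k, v) -> System.out.println(k + "=" + v));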