Search in sources :

Example 1 with FeatureVectorsTokenizer

use of io.anserini.ann.FeatureVectorsTokenizer in project Anserini by castorini.

From the class LexicalLshFeaturePositionTokenFilterTest, method testFiltering.

/**
 * Verifies that {@code LexicalLshFeaturePositionTokenFilter} prefixes each feature
 * token with its 1-based position, e.g. "-0.10" becomes "1_-0.10".
 */
@Test
public void testFiltering() throws Exception {
    StringReader reader = new StringReader("-0.10 0.20 0.30 0.40");
    Tokenizer stream = new FeatureVectorsTokenizer();
    stream.setReader(reader);
    LexicalLshFeaturePositionTokenFilter filter = new LexicalLshFeaturePositionTokenFilter(stream);
    filter.reset();
    List<String> expectedTokens = new LinkedList<>();
    expectedTokens.add("1_-0.10");
    expectedTokens.add("2_0.20");
    expectedTokens.add("3_0.30");
    expectedTokens.add("4_0.40");
    int i = 0;
    while (filter.incrementToken()) {
        CharTermAttribute charTermAttribute = filter.getAttribute(CharTermAttribute.class);
        String token = new String(charTermAttribute.buffer(), 0, charTermAttribute.length());
        assertEquals(expectedTokens.get(i), token);
        i++;
    }
    // Guard against a vacuous pass: the stream must have emitted every expected token.
    assertEquals(expectedTokens.size(), i);
    // Per the Lucene TokenStream contract, end() should be called before close().
    filter.end();
    filter.close();
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) FeatureVectorsTokenizer(io.anserini.ann.FeatureVectorsTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) LinkedList(java.util.LinkedList) FeatureVectorsTokenizer(io.anserini.ann.FeatureVectorsTokenizer) Test(org.junit.Test)

Example 2 with FeatureVectorsTokenizer

use of io.anserini.ann.FeatureVectorsTokenizer in project Anserini by castorini.

From the class LexicalLshFeaturePositionTokenFilterTest, method testFilteringWithOffset.

/**
 * Verifies position tagging when the filter is constructed with an offset of 2:
 * the leading two characters of each feature value (sign/zero and the decimal
 * point region, per the expected tokens below) are dropped before prefixing.
 */
@Test
public void testFilteringWithOffset() throws Exception {
    StringReader reader = new StringReader("-0.104123 0.20435 0.3042366 0.41243241");
    Tokenizer stream = new FeatureVectorsTokenizer();
    stream.setReader(reader);
    LexicalLshFeaturePositionTokenFilter filter = new LexicalLshFeaturePositionTokenFilter(stream, 2);
    filter.reset();
    List<String> expectedTokens = new LinkedList<>();
    expectedTokens.add("1_-104123");
    expectedTokens.add("2_20435");
    expectedTokens.add("3_3042366");
    expectedTokens.add("4_41243241");
    int i = 0;
    while (filter.incrementToken()) {
        CharTermAttribute charTermAttribute = filter.getAttribute(CharTermAttribute.class);
        String token = new String(charTermAttribute.buffer(), 0, charTermAttribute.length());
        assertEquals(expectedTokens.get(i), token);
        i++;
    }
    // Guard against a vacuous pass: the stream must have emitted every expected token.
    assertEquals(expectedTokens.size(), i);
    // Per the Lucene TokenStream contract, end() should be called before close().
    filter.end();
    filter.close();
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) FeatureVectorsTokenizer(io.anserini.ann.FeatureVectorsTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) LinkedList(java.util.LinkedList) FeatureVectorsTokenizer(io.anserini.ann.FeatureVectorsTokenizer) Test(org.junit.Test)

Example 3 with FeatureVectorsTokenizer

use of io.anserini.ann.FeatureVectorsTokenizer in project Anserini by castorini.

From the class FakeWordsEncoderAnalyzer, method createComponents.

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Pipeline: tokenize feature vectors, encode/quantize them into "fake words",
    // then remove stopwords from the configured set.
    Tokenizer tokenizer = new FeatureVectorsTokenizer();
    TokenFilter chain = new StopFilter(new FakeWordsEncodeAndQuantizeFilter(tokenizer, q), set);
    return new TokenStreamComponents(tokenizer, chain);
}
Also used : StopFilter(org.apache.lucene.analysis.StopFilter) FeatureVectorsTokenizer(io.anserini.ann.FeatureVectorsTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) FeatureVectorsTokenizer(io.anserini.ann.FeatureVectorsTokenizer) TokenFilter(org.apache.lucene.analysis.TokenFilter)

Example 4 with FeatureVectorsTokenizer

use of io.anserini.ann.FeatureVectorsTokenizer in project Anserini by castorini.

From the class LexicalLshAnalyzer, method createComponents.

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Pipeline: tokenize -> truncate to `decimals` places -> tag feature position
    // -> (optional shingling when min > 1) -> MinHash -> drop duplicate tokens.
    Tokenizer tokenizer = new FeatureVectorsTokenizer();
    TokenStream positioned =
        new LexicalLshFeaturePositionTokenFilter(new LexicalLshTruncateTokenFilter(tokenizer, decimals));
    TokenStream hashInput = positioned;
    if (min > 1) {
        ShingleFilter shingles = new ShingleFilter(positioned, min, max);
        shingles.setTokenSeparator(" ");
        shingles.setOutputUnigrams(false);
        shingles.setOutputUnigramsIfNoShingles(false);
        hashInput = shingles;
    }
    TokenStream hashed =
        new MinHashFilter(hashInput, hashCount, bucketCount, hashSetSize, bucketCount > 1);
    return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(hashed));
}
Also used : RemoveDuplicatesTokenFilter(org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter) TokenStream(org.apache.lucene.analysis.TokenStream) ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter) MinHashFilter(org.apache.lucene.analysis.minhash.MinHashFilter) FeatureVectorsTokenizer(io.anserini.ann.FeatureVectorsTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) FeatureVectorsTokenizer(io.anserini.ann.FeatureVectorsTokenizer) RemoveDuplicatesTokenFilter(org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter) TokenFilter(org.apache.lucene.analysis.TokenFilter)

Example 5 with FeatureVectorsTokenizer

use of io.anserini.ann.FeatureVectorsTokenizer in project Anserini by castorini.

From the class FakeWordsEncodeAndQuantizeFilterTest, method testFiltering.

/**
 * Verifies that {@code FakeWordsEncodeAndQuantizeFilter} with q=20 expands each
 * feature value into a run of quantized "fake word" tokens: -0.10 quantizes to
 * zero (placeholder "_"), 0.20 to 4x "f2", 0.30 to 6x "f3", 0.40 to 15x "f4".
 */
@Test
public void testFiltering() throws Exception {
    StringReader reader = new StringReader("-0.10 0.20 0.30 0.40");
    Tokenizer stream = new FeatureVectorsTokenizer();
    stream.setReader(reader);
    FakeWordsEncodeAndQuantizeFilter filter = new FakeWordsEncodeAndQuantizeFilter(stream, 20);
    filter.reset();
    List<String> expectedTokens = new LinkedList<>();
    // quantization of -0.10 leads to zero -> placeholder token
    expectedTokens.add("_");
    // quantization of 0.20 leads to 4 "f2" tokens
    for (int n = 0; n < 4; n++) {
        expectedTokens.add("f2");
    }
    // quantization of 0.30 leads to 6 "f3" tokens
    for (int n = 0; n < 6; n++) {
        expectedTokens.add("f3");
    }
    // quantization of 0.40 leads to 15 "f4" tokens
    // NOTE(review): the original comment claimed 16 tokens but listed only 15 adds;
    // the expectation here preserves the 15 actually asserted — confirm against the filter.
    for (int n = 0; n < 15; n++) {
        expectedTokens.add("f4");
    }
    int i = 0;
    while (filter.incrementToken()) {
        CharTermAttribute charTermAttribute = filter.getAttribute(CharTermAttribute.class);
        String token = new String(charTermAttribute.buffer(), 0, charTermAttribute.length());
        assertEquals(expectedTokens.get(i), token);
        i++;
    }
    // Guard against a vacuous pass: the stream must have emitted every expected token.
    assertEquals(expectedTokens.size(), i);
    // Per the Lucene TokenStream contract, end() should be called before close().
    filter.end();
    filter.close();
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) FeatureVectorsTokenizer(io.anserini.ann.FeatureVectorsTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) LinkedList(java.util.LinkedList) FeatureVectorsTokenizer(io.anserini.ann.FeatureVectorsTokenizer) Test(org.junit.Test)

Aggregations

FeatureVectorsTokenizer (io.anserini.ann.FeatureVectorsTokenizer)5 Tokenizer (org.apache.lucene.analysis.Tokenizer)5 StringReader (java.io.StringReader)3 LinkedList (java.util.LinkedList)3 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)3 Test (org.junit.Test)3 TokenFilter (org.apache.lucene.analysis.TokenFilter)2 StopFilter (org.apache.lucene.analysis.StopFilter)1 TokenStream (org.apache.lucene.analysis.TokenStream)1 MinHashFilter (org.apache.lucene.analysis.minhash.MinHashFilter)1 RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter)1 ShingleFilter (org.apache.lucene.analysis.shingle.ShingleFilter)1