
Example 66 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From the class TestSimplePatternSplitTokenizer, method testEmptyStringPatternOneMatch.

public void testEmptyStringPatternOneMatch() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("a*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("bbab"));
    // Expected terms "bb" and "b", with start offsets {0, 3} and end offsets {2, 4};
    // the run of 'a' between them is consumed as the separator.
    assertTokenStreamContents(t, new String[] { "bb", "b" }, new int[] { 0, 3 }, new int[] { 2, 4 });
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), StringReader (java.io.StringReader), Tokenizer (org.apache.lucene.analysis.Tokenizer)
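
Outside the test framework, the same tokenizer can be driven with the standard consume loop. A minimal sketch (the class name SplitTokenizerDemo is illustrative) that prints the terms and offsets assertTokenStreamContents verifies above:

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class SplitTokenizerDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer t = new SimplePatternSplitTokenizer("a*");
        CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = t.addAttribute(OffsetAttribute.class);
        t.setReader(new StringReader("bbab"));
        // reset() is required before the first incrementToken()
        t.reset();
        while (t.incrementToken()) {
            // prints: bb [0,2) then b [3,4); the run of 'a' is consumed as the separator
            System.out.println(termAtt + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
        }
        t.end();
        t.close();
    }
}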

Example 67 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From the class TestSimplePatternSplitTokenizer, method testNoTokens.

public void testNoTokens() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer(".*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    // Pick a non-empty random Unicode string.
    String s;
    while (true) {
        s = TestUtil.randomUnicodeString(random());
        if (s.length() > 0) {
            break;
        }
    }
    t.setReader(new StringReader(s));
    t.reset();
    assertFalse(t.incrementToken());
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), StringReader (java.io.StringReader), Tokenizer (org.apache.lucene.analysis.Tokenizer)
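
The randomness above only serves to pick a non-empty input; a deterministic sketch of the same contract (the method name and input string are illustrative):

public void testNoTokensDeterministic() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer(".*");
    t.setReader(new StringReader("any non-empty input"));
    t.reset();
    // ".*" matches the entire input, so the whole string is split away
    // and the stream yields no tokens at all
    assertFalse(t.incrementToken());
    t.end();
    t.close();
}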

Example 68 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From the class TestElision, method filter.

private List<String> filter(TokenFilter filter) throws IOException {
    List<String> tas = new ArrayList<>();
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    filter.reset();
    // Drain the stream, collecting each term's text.
    while (filter.incrementToken()) {
        tas.add(termAtt.toString());
    }
    filter.end();
    filter.close();
    return tas;
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), ArrayList (java.util.ArrayList)
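
For context, a hedged sketch of how such a helper is typically driven; the input string is illustrative, and the package locations of CharArraySet and ElisionFilter vary across Lucene versions (shown here as in lucene-solr 7.x):

import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ElisionFilter;

public void testElisionSketch() throws Exception {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("l'avion m'appelle"));
    // articles to strip before an apostrophe; true = ignore case
    CharArraySet articles = new CharArraySet(Arrays.asList("l", "m"), true);
    List<String> terms = filter(new ElisionFilter(tokenizer, articles));
    // terms is now [avion, appelle]
}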

Example 69 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From the class TestDocumentWriter, method testTokenReuse.

public void testTokenReuse() throws IOException {
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new TokenFilter(tokenizer) {

                boolean first = true;

                AttributeSource.State state;

                @Override
                public boolean incrementToken() throws IOException {
                    if (state != null) {
                        // Emit a synonym for the previous token: restore its captured
                        // state, clear the payload, and reuse the same position
                        // (increment 0) with term text "b".
                        restoreState(state);
                        payloadAtt.setPayload(null);
                        posIncrAtt.setPositionIncrement(0);
                        termAtt.setEmpty().append("b");
                        state = null;
                        return true;
                    }
                    boolean hasNext = input.incrementToken();
                    if (!hasNext)
                        return false;
                    if (Character.isDigit(termAtt.buffer()[0])) {
                        // A digit token sets its own position increment to its numeric
                        // value, so the token "5" advances the position by 5.
                        posIncrAtt.setPositionIncrement(termAtt.buffer()[0] - '0');
                    }
                    if (first) {
                        // set payload on first position only
                        payloadAtt.setPayload(new BytesRef(new byte[] { 100 }));
                        first = false;
                    }
                    // index a "synonym" for every token
                    state = captureState();
                    return true;
                }

                @Override
                public void reset() throws IOException {
                    super.reset();
                    first = true;
                    state = null;
                }

                final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

                final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);

                final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
            });
        }
    };
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    Document doc = new Document();
    doc.add(newTextField("f1", "a 5 a a", Field.Store.YES));
    writer.addDocument(doc);
    writer.commit();
    SegmentCommitInfo info = writer.newestSegment();
    writer.close();
    SegmentReader reader = new SegmentReader(info, Version.LATEST.major, newIOContext(random()));
    PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "f1", new BytesRef("a"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    int freq = termPositions.freq();
    assertEquals(3, freq);
    // With input "a 5 a a", positions run a=0, b=0, 5=5, b=5, a=6, b=6, a=7, b=7,
    // so "a" is expected at positions 0, 6, and 7, with a payload only at position 0.
    assertEquals(0, termPositions.nextPosition());
    assertNotNull(termPositions.getPayload());
    assertEquals(6, termPositions.nextPosition());
    assertNull(termPositions.getPayload());
    assertEquals(7, termPositions.nextPosition());
    assertNull(termPositions.getPayload());
    reader.close();
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), AttributeSource (org.apache.lucene.util.AttributeSource), IOException (java.io.IOException), Document (org.apache.lucene.document.Document), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), BytesRef (org.apache.lucene.util.BytesRef)
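
Stripped of the test scaffolding, the captureState/restoreState pattern for injecting a synonym at the same position looks roughly like this; a minimal sketch (the class name SynonymMarkerFilter and the literal "syn" are illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

// Emits "syn" at the same position as every token it sees.
final class SynonymMarkerFilter extends TokenFilter {

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private AttributeSource.State state;

    SynonymMarkerFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (state != null) {
            // start from the captured token's attributes, then overwrite term and position
            restoreState(state);
            posIncrAtt.setPositionIncrement(0);
            termAtt.setEmpty().append("syn");
            state = null;
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        // remember this token so the next call can emit its synonym
        state = captureState();
        return true;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        state = null;
    }
}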

Example 70 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.

From the class SequentialDependenceModel, method computeUnorderedFrequencyScore.

private float computeUnorderedFrequencyScore(Document doc, Terms terms, RerankerContext context) throws IOException {
    List<String> queryTokens = context.getQueryTokens();
    // Construct a token stream over the document's term vector
    // (the second argument caps token start offsets)
    TokenStream stream = new TokenStreamFromTermVector(terms, 0);
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    Map<String, String> queryPairMap = new HashMap<>();
    Map<String, Integer> phraseCountMap = new HashMap<>();
    Map<String, Integer> singleCountMap = new HashMap<>();
    // Build a count map and a map of adjacent query-term pairs (x followed by y)
    for (int i = 0; i < queryTokens.size() - 1; i++) {
        queryPairMap.put(queryTokens.get(i), queryTokens.get(i + 1));
        phraseCountMap.put(queryTokens.get(i), 0);
        // This will serve as our smoothing param
        singleCountMap.put(queryTokens.get(i), 1);
    }
    int docSize = 0;
    // Maintain a FIFO queue holding the current window of tokens
    LinkedList<String> window = new LinkedList<>();
    while (stream.incrementToken() && docSize <= WINDOW_SIZE * 2) {
        // First construct the window that we need to test on
        docSize++;
        String token = termAttribute.toString();
        window.add(token);
    }
    // Scan the initial window; Math.min covers a token stream with fewer tokens than the window
    for (int i = 0; i < Math.min(WINDOW_SIZE - 1, docSize); i++) {
        String firstToken = window.get(i);
        if (queryPairMap.containsKey(firstToken) && window.contains(queryPairMap.get(firstToken))) {
            phraseCountMap.put(firstToken, phraseCountMap.get(firstToken) + 1);
            singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
        }
    }
    // Now we continue
    while (stream.incrementToken()) {
        docSize++;
        String token = termAttribute.toString();
        window.add(token);
        // Slide the window along; the previous loop guarantees it holds
        // WINDOW_SIZE * 2 tokens here (with fewer tokens this loop never executes)
        window.removeFirst();
        // Test for a phrase pair anchored at index WINDOW_SIZE - 1
        String firstToken = window.get(WINDOW_SIZE - 1);
        if (queryPairMap.containsKey(firstToken) && window.contains(queryPairMap.get(firstToken))) {
            phraseCountMap.put(firstToken, phraseCountMap.get(firstToken) + 1);
            singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
        }
    }
    float score = 0.0f;
    // Smoothing count of 1
    docSize++;
    for (String queryToken : phraseCountMap.keySet()) {
        float countToUse = phraseCountMap.get(queryToken);
        if (countToUse == 0) {
            countToUse = singleCountMap.get(queryToken);
        }
        score += Math.log(countToUse / (float) docSize);
    }
    return score;
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), HashMap (java.util.HashMap), LinkedList (java.util.LinkedList), TokenStreamFromTermVector (org.apache.lucene.search.highlight.TokenStreamFromTermVector), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
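
For reference, the TokenStream contract generally requires reset() before the first incrementToken() and end()/close() afterwards. A minimal sketch of replaying a stored term vector (the field name "contents" is illustrative, and the field must have been indexed with term vectors):

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.highlight.TokenStreamFromTermVector;

static void dumpTermVector(IndexReader reader, int docId) throws Exception {
    // requires the field to have been indexed with term vectors
    Terms vector = reader.getTermVector(docId, "contents");
    TokenStream stream = new TokenStreamFromTermVector(vector, -1); // -1 = no start-offset cap
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        System.out.println(termAtt.toString());
    }
    stream.end();
    stream.close();
}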

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 151
TokenStream (org.apache.lucene.analysis.TokenStream): 95
StringReader (java.io.StringReader): 46
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 35
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 34
IOException (java.io.IOException): 27
ArrayList (java.util.ArrayList): 27
Tokenizer (org.apache.lucene.analysis.Tokenizer): 25
Analyzer (org.apache.lucene.analysis.Analyzer): 20
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 16
BytesRef (org.apache.lucene.util.BytesRef): 15
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 13
LinkedList (java.util.LinkedList): 11
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 10
Term (org.apache.lucene.index.Term): 10
HashMap (java.util.HashMap): 9
Token (org.apache.lucene.analysis.Token): 8
Document (org.apache.lucene.document.Document): 8
List (java.util.List): 7
HashSet (java.util.HashSet): 6