Example 1 with TokenStreamFromTermVector

Use of org.apache.lucene.search.highlight.TokenStreamFromTermVector in project Anserini by castorini.

From the class SequentialDependenceModel, the method computeUnorderedFrequencyScore.

private float computeUnorderedFrequencyScore(Document doc, Terms terms, RerankerContext context) throws IOException {
    List<String> queryTokens = context.getQueryTokens();
    // Reconstruct a token stream from the term vector (the second argument is the maxStartOffset cutoff; -1 would disable it)
    TokenStream stream = new TokenStreamFromTermVector(terms, 0);
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    // The TokenStream contract requires reset() before the first call to incrementToken()
    stream.reset();
    Map<String, String> queryPairMap = new HashMap<>();
    Map<String, Integer> phraseCountMap = new HashMap<>();
    Map<String, Integer> singleCountMap = new HashMap<>();
    // Build count maps and a map of adjacent query pairs: for each phrase "x y", map x -> y
    for (int i = 0; i < queryTokens.size() - 1; i++) {
        queryPairMap.put(queryTokens.get(i), queryTokens.get(i + 1));
        phraseCountMap.put(queryTokens.get(i), 0);
        // This will serve as our smoothing param
        singleCountMap.put(queryTokens.get(i), 1);
    }
    int docSize = 0;
    // Maintain a FIFO queue as the sliding window
    LinkedList<String> window = new LinkedList<>();
    // Check the size bound before consuming a token so that no token is silently discarded
    while (docSize <= WINDOW_SIZE * 2 && stream.incrementToken()) {
        // First construct the window that we need to test on
        docSize++;
        String token = termAttribute.toString();
        window.add(token);
    }
    // Account for the case where the token stream has fewer tokens than the full window
    for (int i = 0; i < Math.min(WINDOW_SIZE - 1, docSize); i++) {
        String firstToken = window.get(i);
        if (queryPairMap.containsKey(firstToken) && window.contains(queryPairMap.get(firstToken))) {
            phraseCountMap.put(firstToken, phraseCountMap.get(firstToken) + 1);
            singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
        }
    }
    // Now we continue
    while (stream.incrementToken()) {
        docSize++;
        String token = termAttribute.toString();
        window.add(token);
        // Slide the window along; the previous loop guarantees the window is full here,
        // and if the stream had too few tokens this loop would not execute at all
        window.removeFirst();
        // Test for a phrase starting at index WINDOW_SIZE - 1
        String firstToken = window.get(WINDOW_SIZE - 1);
        if (queryPairMap.containsKey(firstToken) && window.contains(queryPairMap.get(firstToken))) {
            phraseCountMap.put(firstToken, phraseCountMap.get(firstToken) + 1);
            singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
        }
    }
    // Release the stream per the TokenStream contract
    stream.end();
    stream.close();
    float score = 0.0f;
    // Add 1 to the document length as a smoothing term
    docSize++;
    for (String queryToken : phraseCountMap.keySet()) {
        float countToUse = phraseCountMap.get(queryToken);
        if (countToUse == 0) {
            countToUse = singleCountMap.get(queryToken);
        }
        score += Math.log(countToUse / (float) docSize);
    }
    return score;
}
Also used : TokenStream (org.apache.lucene.analysis.TokenStream), HashMap (java.util.HashMap), LinkedList (java.util.LinkedList), TokenStreamFromTermVector (org.apache.lucene.search.highlight.TokenStreamFromTermVector), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
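
For context, a scorer like this is driven by a per-document term vector. Below is a minimal, hypothetical sketch of the call site, assuming term vectors were stored at indexing time; the field name "contents" and the helper method name are illustrative assumptions, not code from the example above.

private float scoreWithTermVector(IndexReader reader, int docId, Document doc, RerankerContext context) throws IOException {
    // Fetch the term vector stored for this document's field ("contents" is an assumed field name)
    Terms termVector = reader.getTermVector(docId, "contents");
    if (termVector == null) {
        // Term vectors were not stored for this field at indexing time
        return 0.0f;
    }
    return computeUnorderedFrequencyScore(doc, termVector, context);
}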

Example 2 with TokenStreamFromTermVector

Use of org.apache.lucene.search.highlight.TokenStreamFromTermVector in project Anserini by castorini.

From the class CountBigramPairs, the method countPairs.

/**
 * Counts co-occurrences of the pairs specified in queryPairMap
 * and stores the counts for each window size in counters.
 * NOTE: this method mutates its inputs.
 * @param singleCountMap    count of single tokens as we encounter them, useful for smoothing
 * @param queryPairMap      all pairs of strings we are looking for
 * @param backQueryPairMap  all reversed pairs, i.e. if the query is "test query", this would include "query test"
 * @param gapSizes          list of window sizes to compute counts for
 * @param counters          map from window size to counter
 * @param terms             the document's term vector
 */
public static void countPairs(Map<String, Integer> singleCountMap, Map<String, Set<String>> queryPairMap, Map<String, Set<String>> backQueryPairMap, ArrayList<Integer> gapSizes, Map<Integer, PhraseCounter> counters, Terms terms) throws IOException {
    // Reconstruct a token stream from the term vector (-1 disables the maxStartOffset limit)
    TokenStreamFromTermVector stream = new TokenStreamFromTermVector(terms, -1);
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    int docSize = 0;
    int maxGapSize = 0;
    for (Integer windowSize : gapSizes) {
        if (windowSize > maxGapSize) {
            maxGapSize = windowSize;
        }
    }
    // Maintain a FIFO queue as the sliding window
    LinkedList<String> window = new LinkedList<>();
    // Fill the window before processing the leading positions
    stream.reset();
    while (docSize < maxGapSize * 2 + 2 && stream.incrementToken()) {
        // First construct the window that we need to test on
        docSize++;
        String token = termAttribute.toString();
        window.add(token);
    }
    // Account for the case where the token stream has fewer tokens than the full window
    for (int i = 0; i < Math.min(maxGapSize + 1, docSize); i++) {
        String firstToken = window.get(i);
        // Look ahead for token
        if (queryPairMap.containsKey(firstToken)) {
            // Count unigram for this token
            singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
            for (int j = i + 1; j < Math.min(i + maxGapSize + 1, docSize); j++) {
                if (queryPairMap.get(firstToken).contains(window.get(j))) {
                    for (int windowSize : counters.keySet()) {
                        if (j - i <= windowSize)
                            counters.get(windowSize).incrementCount(firstToken);
                    }
                }
            }
        }
        if (backQueryPairMap.containsKey(firstToken)) {
            // Count unigram for this token
            for (int j = i + 1; j < Math.min(i + maxGapSize + 1, docSize); j++) {
                if (backQueryPairMap.get(firstToken).contains(window.get(j))) {
                    for (int windowSize : counters.keySet()) {
                        if (j - i <= windowSize)
                            counters.get(windowSize).incrementCount(window.get(j));
                    }
                }
            }
        }
    }
    // Now we continue
    while (stream.incrementToken()) {
        docSize++;
        String token = termAttribute.toString();
        window.add(token);
        // Slide the window along; the previous loop guarantees the window holds
        // maxGapSize * 2 + 2 tokens here, and if the stream had too few tokens
        // this loop would not execute at all
        window.removeFirst();
        // Test for a phrase starting at the test index maxGapSize
        String firstToken = window.get(maxGapSize);
        if (queryPairMap.containsKey(firstToken)) {
            // Count unigram for this token
            singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
            for (int j = maxGapSize + 1; j < maxGapSize * 2 + 2; j++) {
                if (queryPairMap.get(firstToken).contains(window.get(j))) {
                    for (int windowSize : counters.keySet()) {
                        if (j - maxGapSize <= windowSize)
                            counters.get(windowSize).incrementCount(firstToken);
                    }
                }
            }
        }
        if (backQueryPairMap.containsKey(firstToken)) {
            // Count unigram for this token
            for (int j = maxGapSize + 1; j < maxGapSize * 2 + 2; j++) {
                if (backQueryPairMap.get(firstToken).contains(window.get(j))) {
                    for (int windowSize : counters.keySet()) {
                        if (j - maxGapSize <= windowSize)
                            counters.get(windowSize).incrementCount(window.get(j));
                    }
                }
            }
        }
    }
    // Process the remaining tail of the window: positions maxGapSize + 1 through the end
    for (int i = maxGapSize + 1; i < Math.min(maxGapSize * 2 + 1, docSize); i++) {
        String firstToken = window.get(i);
        if (queryPairMap.containsKey(firstToken)) {
            // Count unigram for this token
            singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
            for (int j = i + 1; j < Math.min(maxGapSize * 2 + 2, docSize); j++) {
                if (queryPairMap.get(firstToken).contains(window.get(j))) {
                    for (int windowSize : counters.keySet()) {
                        if (j - i <= windowSize)
                            counters.get(windowSize).incrementCount(firstToken);
                    }
                }
            }
        }
        if (backQueryPairMap.containsKey(firstToken)) {
            // Count unigram for this token
            for (int j = i + 1; j < Math.min(maxGapSize * 2 + 2, docSize); j++) {
                if (backQueryPairMap.get(firstToken).contains(window.get(j))) {
                    for (int windowSize : counters.keySet()) {
                        if (j - i <= windowSize)
                            counters.get(windowSize).incrementCount(window.get(j));
                    }
                }
            }
        }
    }
    stream.end();
    stream.close();
}
Also used : TokenStreamFromTermVector (org.apache.lucene.search.highlight.TokenStreamFromTermVector), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
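
A hedged sketch of how the inputs to countPairs might be assembled from a tokenized query. The window sizes and the no-arg PhraseCounter constructor are assumptions based on the parameter documentation above, not code from the project.

List<String> queryTokens = context.getQueryTokens();
Map<String, Integer> singleCountMap = new HashMap<>();
Map<String, Set<String>> queryPairMap = new HashMap<>();
Map<String, Set<String>> backQueryPairMap = new HashMap<>();
// For each adjacent query pair (x, y), register the forward pair x -> y and the reverse pair y -> x
for (int i = 0; i < queryTokens.size() - 1; i++) {
    String x = queryTokens.get(i);
    String y = queryTokens.get(i + 1);
    queryPairMap.computeIfAbsent(x, k -> new HashSet<>()).add(y);
    backQueryPairMap.computeIfAbsent(y, k -> new HashSet<>()).add(x);
    // Seed the unigram counts that countPairs increments (use 1 instead for add-one smoothing)
    singleCountMap.put(x, 0);
    singleCountMap.put(y, 0);
}
// One counter per window size; the sizes here are arbitrary examples
ArrayList<Integer> gapSizes = new ArrayList<>(Arrays.asList(2, 4, 8));
Map<Integer, PhraseCounter> counters = new HashMap<>();
for (int gap : gapSizes) {
    counters.put(gap, new PhraseCounter());
}
CountBigramPairs.countPairs(singleCountMap, queryPairMap, backQueryPairMap, gapSizes, counters, terms);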

Example 3 with TokenStreamFromTermVector

Use of org.apache.lucene.search.highlight.TokenStreamFromTermVector in project Anserini by castorini.

From the class SequentialDependenceModel, the method computeFullIndependenceScore.

/**
 * The single-term scoring function: lambda * log((1 - alpha) * tf / |D|)
 * @param doc     the document being scored
 * @param terms   the document's term vector
 * @param context the reranker context holding the query tokens
 * @return the accumulated log score
 */
private float computeFullIndependenceScore(Document doc, Terms terms, RerankerContext context) throws IOException {
    // tf is the number of times a term occurs in the doc, counted by iterating over the stream;
    // |D|, the total number of tokens in the document, is counted the same way
    IndexReader reader = context.getIndexSearcher().getIndexReader();
    List<String> queryTokenList = context.getQueryTokens();
    Map<String, Integer> termCount = new HashMap<>();
    TokenStream stream = new TokenStreamFromTermVector(terms, 0);
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    // The TokenStream contract requires reset() before the first call to incrementToken()
    stream.reset();
    float docSize = 0;
    // Count all the tokens
    while (stream.incrementToken()) {
        docSize++;
        String token = termAttribute.toString();
        if (termCount.containsKey(token)) {
            termCount.put(token, termCount.get(token) + 1);
        } else {
            termCount.put(token, 1);
        }
    }
    // Release the stream per the TokenStream contract
    stream.end();
    stream.close();
    float score = 0.0f;
    // Add 1 to the document length as a smoothing term
    docSize++;
    // Note: termCount holds every token in the document, so this sums over all document
    // terms, not just query terms (queryTokenList is unused in this method)
    for (String queryToken : termCount.keySet()) {
        score += Math.log((float) (termCount.get(queryToken) + 1) / docSize);
    }
    return score;
}
Also used : TokenStream (org.apache.lucene.analysis.TokenStream), TokenStreamFromTermVector (org.apache.lucene.search.highlight.TokenStreamFromTermVector), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), HashMap (java.util.HashMap), IndexReader (org.apache.lucene.index.IndexReader)
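
As a worked example of the smoothed score: for a document of 100 tokens in which a term occurs 4 times, the loop adds log((4 + 1) / 101), since docSize was incremented once as a smoothing term. A tiny self-contained check with illustrative values:

// Toy check of the smoothing arithmetic used above (values are made up)
float docSize = 100f;
int tf = 4;
// Smoothing increment, mirroring the method above
docSize++;
// log(5 / 101) ≈ -3.01; more frequent terms contribute less negative values
float contribution = (float) Math.log((tf + 1) / docSize);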

Example 4 with TokenStreamFromTermVector

Use of org.apache.lucene.search.highlight.TokenStreamFromTermVector in project Anserini by castorini.

From the class SequentialDependenceModel, the method computeOrderedFrequencyScore.

private float computeOrderedFrequencyScore(Document doc, Terms terms, RerankerContext context) throws IOException {
    List<String> queryTokens = context.getQueryTokens();
    Map<String, String> queryPairMap = new HashMap<>();
    Map<String, Integer> phraseCountMap = new HashMap<>();
    Map<String, Integer> singleCountMap = new HashMap<>();
    // Build count maps and a map of adjacent query pairs: for each phrase "x y", map x -> y
    for (int i = 0; i < queryTokens.size() - 1; i++) {
        queryPairMap.put(queryTokens.get(i), queryTokens.get(i + 1));
        phraseCountMap.put(queryTokens.get(i), 0);
        // This will serve as our smoothing param
        singleCountMap.put(queryTokens.get(i), 1);
    }
    // Reconstruct a token stream from the term vector (the second argument is the maxStartOffset cutoff)
    TokenStream stream = new TokenStreamFromTermVector(terms, 0);
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    // The TokenStream contract requires reset() before the first call to incrementToken()
    stream.reset();
    float docSize = 0.0f;
    // Track which token must appear next to complete an ordered pair;
    // the count is attributed to the first token of the pair
    String expectedToken = "";
    String tokenToIncrement = "";
    while (stream.incrementToken()) {
        docSize++;
        String token = termAttribute.toString();
        if (token.equalsIgnoreCase(expectedToken)) {
            phraseCountMap.put(tokenToIncrement, phraseCountMap.get(tokenToIncrement) + 1);
        }
        // Check now if this token could be the start of an ordered phrase
        if (queryPairMap.containsKey(token)) {
            expectedToken = queryPairMap.get(token);
            singleCountMap.put(token, singleCountMap.get(token) + 1);
            tokenToIncrement = token;
        } else {
            expectedToken = "";
            tokenToIncrement = "";
        }
    }
    // Release the stream per the TokenStream contract
    stream.end();
    stream.close();
    float score = 0.0f;
    // Add 1 to the document length as a smoothing term
    docSize++;
    for (String queryToken : phraseCountMap.keySet()) {
        score += Math.log((float) (phraseCountMap.get(queryToken) + 1) / docSize);
    }
    return score;
}
Also used : TokenStream (org.apache.lucene.analysis.TokenStream), TokenStreamFromTermVector (org.apache.lucene.search.highlight.TokenStreamFromTermVector), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), HashMap (java.util.HashMap)
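
To make the expected-token bookkeeping concrete, here is a hypothetical trace of the same logic for the query "new york" (so queryPairMap maps new -> york) over made-up tokens:

// Mirrors the ordered-phrase logic above on hypothetical data
List<String> tokens = Arrays.asList("new", "york", "new", "jersey");
Map<String, String> pairs = new HashMap<>();
pairs.put("new", "york");
int phraseCount = 0;
String expected = "";
for (String token : tokens) {
    if (token.equalsIgnoreCase(expected)) {
        // The expected second token directly follows its first token
        phraseCount++;
    }
    // A query token sets up the next expectation; any other token clears it
    expected = pairs.getOrDefault(token, "");
}
// phraseCount == 1: only the first "new" is immediately followed by "york"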

Example 5 with TokenStreamFromTermVector

Use of org.apache.lucene.search.highlight.TokenStreamFromTermVector in project Anserini by castorini.

From the class UnigramFeatureExtractor, the method computeFullIndependenceScore.

/**
 * The single-term scoring function: lambda * log((1 - alpha) * tf / |D|)
 * @param doc     the document being scored
 * @param terms   the document's term vector
 * @param context the reranker context holding the query tokens
 * @return the accumulated score
 */
private float computeFullIndependenceScore(Document doc, Terms terms, RerankerContext context) throws IOException {
    // tf is the number of times a term occurs in the doc, counted by iterating over the stream;
    // |D|, the total number of tokens in the document, is counted the same way
    IndexReader reader = context.getIndexSearcher().getIndexReader();
    List<String> queryTokenList = context.getQueryTokens();
    Map<String, Integer> termCount = new HashMap<>();
    for (String queryToken : queryTokenList) {
        termCount.put(queryToken, 0);
    }
    TokenStream stream = new TokenStreamFromTermVector(terms, -1);
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    float docSize = 0;
    // Count all the tokens
    while (stream.incrementToken()) {
        docSize++;
        String token = termAttribute.toString();
        if (termCount.containsKey(token)) {
            termCount.put(token, termCount.get(token) + 1);
        }
    }
    float score = 0.0f;
    // docSize is incremented here as in the other extractors, but it is not used below:
    // the final score is simply the raw sum of query-term frequencies
    docSize++;
    // termCount was pre-seeded with only the query tokens, so this sums their frequencies
    for (String queryToken : termCount.keySet()) {
        score += termCount.get(queryToken);
    }
    stream.end();
    stream.close();
    return score;
}
Also used : TokenStream (org.apache.lucene.analysis.TokenStream), TokenStreamFromTermVector (org.apache.lucene.search.highlight.TokenStreamFromTermVector), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), HashMap (java.util.HashMap), IndexReader (org.apache.lucene.index.IndexReader)
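
Since the loop above reduces to a raw sum of query-term frequencies, the same value could be read directly from the term vector without reconstructing a token stream. A hedged alternative sketch using Lucene's TermsEnum, assuming distinct query tokens (on a single-document term vector, totalTermFreq() is the in-document term frequency):

float score = 0.0f;
TermsEnum termsEnum = terms.iterator();
for (String queryToken : queryTokenList) {
    if (termsEnum.seekExact(new BytesRef(queryToken))) {
        // On a term vector, totalTermFreq() is the frequency within this one document
        score += termsEnum.totalTermFreq();
    }
}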

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 5
TokenStreamFromTermVector (org.apache.lucene.search.highlight.TokenStreamFromTermVector): 5
HashMap (java.util.HashMap): 4
TokenStream (org.apache.lucene.analysis.TokenStream): 4
IndexReader (org.apache.lucene.index.IndexReader): 2
LinkedList (java.util.LinkedList): 1