Use of org.apache.lucene.search.highlight.TokenStreamFromTermVector in project Anserini by castorini.
The class SequentialDependenceModel, method computeUnorderedFrequencyScore.
private float computeUnorderedFrequencyScore(Document doc, Terms terms, RerankerContext context) throws IOException {
List<String> queryTokens = context.getQueryTokens();
// Construct token stream with offset 0
TokenStream stream = new TokenStreamFromTermVector(terms, 0);
CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
Map<String, String> queryPairMap = new HashMap<>();
Map<String, Integer> phraseCountMap = new HashMap<>();
Map<String, Integer> singleCountMap = new HashMap<>();
// Construct count maps and a map of adjacent query-token pairs (x y becomes x -> y)
for (int i = 0; i < queryTokens.size() - 1; i++) {
queryPairMap.put(queryTokens.get(i), queryTokens.get(i + 1));
phraseCountMap.put(queryTokens.get(i), 0);
// This will serve as our smoothing param
singleCountMap.put(queryTokens.get(i), 1);
}
int docSize = 0;
// We will maintain a FIFO queue of window-size tokens
LinkedList<String> window = new LinkedList<>();
// The TokenStream contract requires reset() before the first incrementToken()
stream.reset();
while (stream.incrementToken() && docSize <= WINDOW_SIZE * 2) {
// First construct the window that we need to test on
docSize++;
String token = termAttribute.toString();
window.add(token);
}
// But we need to account for the case when the token stream just doesn't have that many tokens
for (int i = 0; i < Math.min(WINDOW_SIZE - 1, docSize); i++) {
String firstToken = window.get(i);
if (queryPairMap.containsKey(firstToken) && window.contains(queryPairMap.get(firstToken))) {
phraseCountMap.put(firstToken, phraseCountMap.get(firstToken) + 1);
singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
}
}
// Now we continue
while (stream.incrementToken()) {
docSize++;
String token = termAttribute.toString();
window.add(token);
// Move the window along.
// At this point the window is guaranteed to be of size WINDOW_SIZE * 2 because of the previous loop;
// if there were not enough tokens, this loop would not execute at all.
window.removeFirst();
// Now test for the phrase at index WINDOW_SIZE - 1
String firstToken = window.get(WINDOW_SIZE - 1);
if (queryPairMap.containsKey(firstToken) && window.contains(queryPairMap.get(firstToken))) {
phraseCountMap.put(firstToken, phraseCountMap.get(firstToken) + 1);
singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
}
}
float score = 0.0f;
// Smoothing count of 1
docSize++;
for (String queryToken : phraseCountMap.keySet()) {
float countToUse = phraseCountMap.get(queryToken);
if (countToUse == 0) {
countToUse = singleCountMap.get(queryToken);
}
score += Math.log(countToUse / (float) docSize);
}
return score;
}
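The Terms argument above is the per-document term vector that TokenStreamFromTermVector replays as a token stream. Below is a minimal sketch of how a caller might obtain it and invoke the scorer; the field name "contents" and the rerankUnordered helper are illustrative assumptions, not Anserini's actual call site.
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;

// Sketch only: fetch the stored term vector for one document and score it.
// Assumes this lives in the same class as computeUnorderedFrequencyScore.
private float rerankUnordered(IndexReader reader, int docId, Document doc, RerankerContext context)
    throws IOException {
  // "contents" is an assumed field name; it must have been indexed with term vectors.
  Terms termVector = reader.getTermVector(docId, "contents");
  if (termVector == null) {
    return 0.0f; // no term vector stored for this document
  }
  return computeUnorderedFrequencyScore(doc, termVector, context);
}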
Use of org.apache.lucene.search.highlight.TokenStreamFromTermVector in project Anserini by castorini.
The class CountBigramPairs, method countPairs.
/**
* Counts co-occurrences of the pairs specified in queryPairMap
* and stores the counts for each window size in counters.
* NOTE: this method mutates its inputs.
* @param singleCountMap a running count of single tokens as they are encountered, useful for smoothing
* @param queryPairMap all pairs of strings we are looking for
* @param backQueryPairMap all reversed pairs, e.g. if the query is "test query", this would include "query test"
* @param gapSizes list of window sizes to compute counts for
* @param counters map from window size to counter
* @param terms the term vector of the document being processed
*/
public static void countPairs(Map<String, Integer> singleCountMap, Map<String, Set<String>> queryPairMap, Map<String, Set<String>> backQueryPairMap, ArrayList<Integer> gapSizes, Map<Integer, PhraseCounter> counters, Terms terms) throws IOException {
// Construct a token stream from the term vector; -1 places no limit on token start offsets
TokenStreamFromTermVector stream = new TokenStreamFromTermVector(terms, -1);
CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
int docSize = 0;
int maxGapSize = 0;
for (Integer windowSize : gapSizes) {
if (windowSize > maxGapSize) {
maxGapSize = windowSize;
}
}
// We will maintain a FIFO queue of window-size tokens
LinkedList<String> window = new LinkedList<>();
// Add tokens to the window first and process the initial tokens
stream.reset();
while (docSize < maxGapSize * 2 + 2 && stream.incrementToken()) {
// First construct the window that we need to test on
docSize++;
String token = termAttribute.toString();
window.add(token);
}
// But we need to account for the case when the token stream just doesn't have that many tokens
for (int i = 0; i < Math.min(maxGapSize + 1, docSize); i++) {
String firstToken = window.get(i);
// Look ahead for token
if (queryPairMap.containsKey(firstToken)) {
// Count unigram for this token
singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
for (int j = i + 1; j < Math.min(i + maxGapSize + 1, docSize); j++) {
if (queryPairMap.get(firstToken).contains(window.get(j))) {
for (int windowSize : counters.keySet()) {
if (j - i <= windowSize)
counters.get(windowSize).incrementCount(firstToken);
}
}
}
}
if (backQueryPairMap.containsKey(firstToken)) {
// Count unigram for this token
for (int j = i + 1; j < Math.min(i + maxGapSize + 1, docSize); j++) {
if (backQueryPairMap.get(firstToken).contains(window.get(j))) {
for (int windowSize : counters.keySet()) {
if (j - i <= windowSize)
counters.get(windowSize).incrementCount(window.get(j));
}
}
}
}
}
// Now we continue
while (stream.incrementToken()) {
docSize++;
String token = termAttribute.toString();
window.add(token);
// Move the window along.
// At this point the window is guaranteed to hold maxGapSize * 2 + 2 tokens because of the previous loop;
// if there were not enough tokens, this loop would not execute at all.
window.removeFirst();
// Now test for the phrase at index maxGapSize
String firstToken = window.get(maxGapSize);
if (queryPairMap.containsKey(firstToken)) {
// Count unigram for this token
singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
for (int j = maxGapSize + 1; j < maxGapSize * 2 + 2; j++) {
if (queryPairMap.get(firstToken).contains(window.get(j))) {
for (int windowSize : counters.keySet()) {
if (j - maxGapSize <= windowSize)
counters.get(windowSize).incrementCount(firstToken);
}
}
}
}
if (backQueryPairMap.containsKey(firstToken)) {
// Count unigram for this token
for (int j = maxGapSize + 1; j < maxGapSize * 2 + 2; j++) {
if (backQueryPairMap.get(firstToken).contains(window.get(j))) {
for (int windowSize : counters.keySet()) {
if (j - maxGapSize <= windowSize)
counters.get(windowSize).incrementCount(window.get(j));
}
}
}
}
}
// The unprocessed portion is the tail of the window, from index maxGapSize + 1 to the end
for (int i = maxGapSize + 1; i < Math.min(maxGapSize * 2 + 1, docSize); i++) {
String firstToken = window.get(i);
if (queryPairMap.containsKey(firstToken)) {
// Count unigram for this token
singleCountMap.put(firstToken, singleCountMap.get(firstToken) + 1);
for (int j = i + 1; j < Math.min(maxGapSize * 2 + 2, docSize); j++) {
if (queryPairMap.get(firstToken).contains(window.get(j))) {
for (int windowSize : counters.keySet()) {
if (j - i <= windowSize)
counters.get(windowSize).incrementCount(firstToken);
}
}
}
}
if (backQueryPairMap.containsKey(firstToken)) {
// Count unigram for this token
for (int j = i + 1; j < Math.min(maxGapSize * 2 + 2, docSize); j++) {
if (backQueryPairMap.get(firstToken).contains(window.get(j))) {
for (int windowSize : counters.keySet()) {
if (j - i <= windowSize)
counters.get(windowSize).incrementCount(window.get(j));
}
}
}
}
}
stream.end();
stream.close();
}
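A minimal sketch of how the inputs that countPairs mutates might be prepared for a two-token query is shown below; the example query, the window sizes, and the no-argument PhraseCounter constructor are assumptions for illustration, not Anserini's actual setup code.
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.Terms;

// Sketch only: build the inputs for the query "test query" and run countPairs
// against one document's term vector.
static Map<Integer, PhraseCounter> countForTestQuery(Terms terms) throws IOException {
  List<String> queryTokens = Arrays.asList("test", "query");
  Map<String, Set<String>> queryPairMap = new HashMap<>();
  Map<String, Set<String>> backQueryPairMap = new HashMap<>();
  Map<String, Integer> singleCountMap = new HashMap<>();
  for (int i = 0; i < queryTokens.size() - 1; i++) {
    // Forward pair "test" -> {"query"}, backward pair "query" -> {"test"}.
    queryPairMap.computeIfAbsent(queryTokens.get(i), k -> new HashSet<>()).add(queryTokens.get(i + 1));
    backQueryPairMap.computeIfAbsent(queryTokens.get(i + 1), k -> new HashSet<>()).add(queryTokens.get(i));
    // Must be pre-populated: countPairs increments this entry whenever the first token is seen.
    singleCountMap.put(queryTokens.get(i), 0);
  }
  ArrayList<Integer> gapSizes = new ArrayList<>(Arrays.asList(2, 4, 8)); // assumed window sizes
  Map<Integer, PhraseCounter> counters = new HashMap<>();
  for (int gap : gapSizes) {
    counters.put(gap, new PhraseCounter()); // assumes a no-argument constructor
  }
  CountBigramPairs.countPairs(singleCountMap, queryPairMap, backQueryPairMap, gapSizes, counters, terms);
  return counters;
}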
Use of org.apache.lucene.search.highlight.TokenStreamFromTermVector in project Anserini by castorini.
The class SequentialDependenceModel, method computeFullIndependenceScore.
/**
* The single-term scoring function: lambda * log((1 - alpha) * tf / |D|).
* @param doc the document being scored
* @param terms the term vector for the document
* @param context the reranker context, which supplies the query tokens
* @return the single-term (full independence) feature score
*/
private float computeFullIndependenceScore(Document doc, Terms terms, RerankerContext context) throws IOException {
// tf, the number of times a term occurs in the doc, can be calculated by iterating over terms;
// |D|, the total number of terms, can be calculated by iterating over the stream
IndexReader reader = context.getIndexSearcher().getIndexReader();
List<String> queryTokenList = context.getQueryTokens();
Map<String, Integer> termCount = new HashMap<>();
TokenStream stream = new TokenStreamFromTermVector(terms, 0);
CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
float docSize = 0;
// Count all the tokens; the TokenStream contract requires reset() before consuming
stream.reset();
while (stream.incrementToken()) {
docSize++;
String token = termAttribute.toString();
if (termCount.containsKey(token)) {
termCount.put(token, termCount.get(token) + 1);
} else {
termCount.put(token, 1);
}
}
float score = 0.0f;
// Smoothing count of 1
docSize++;
// Only compute the score for terms present in termCount; everything else contributes 0
for (String queryToken : termCount.keySet()) {
score += Math.log((float) (termCount.get(queryToken) + 1) / docSize);
}
return score;
}
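Since the term vector already stores per-document frequencies, the same counts could also be read directly with a TermsEnum instead of replaying a token stream. A minimal alternative sketch, not taken from Anserini:
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Sketch only: read term frequencies straight from the term vector.
static Map<String, Long> termFrequencies(Terms termVector) throws IOException {
  Map<String, Long> counts = new HashMap<>();
  TermsEnum termsEnum = termVector.iterator();
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    // For a single-document term vector, totalTermFreq() is the term's frequency in that document.
    counts.put(term.utf8ToString(), termsEnum.totalTermFreq());
  }
  return counts;
}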
Use of org.apache.lucene.search.highlight.TokenStreamFromTermVector in project Anserini by castorini.
The class SequentialDependenceModel, method computeOrderedFrequencyScore.
private float computeOrderedFrequencyScore(Document doc, Terms terms, RerankerContext context) throws IOException {
List<String> queryTokens = context.getQueryTokens();
Map<String, String> queryPairMap = new HashMap<>();
Map<String, Integer> phraseCountMap = new HashMap<>();
Map<String, Integer> singleCountMap = new HashMap<>();
// Construct count maps and a map of adjacent query-token pairs (x y becomes x -> y)
for (int i = 0; i < queryTokens.size() - 1; i++) {
queryPairMap.put(queryTokens.get(i), queryTokens.get(i + 1));
phraseCountMap.put(queryTokens.get(i), 0);
// This will serve as our smoothing param
singleCountMap.put(queryTokens.get(i), 1);
}
// Construct token stream with offset 0
TokenStream stream = new TokenStreamFromTermVector(terms, 0);
CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
float docSize = 0.0f;
// Use these to track which token we need to see next in order to increment the count;
// the count is tracked on the first token of the pair
String expectedToken = "";
String tokenToIncrement = "";
// The TokenStream contract requires reset() before the first incrementToken()
stream.reset();
while (stream.incrementToken()) {
docSize++;
String token = termAttribute.toString();
if (token.equalsIgnoreCase(expectedToken)) {
phraseCountMap.put(tokenToIncrement, phraseCountMap.get(tokenToIncrement) + 1);
}
// Check now if this token could be the start of an ordered phrase
if (queryPairMap.containsKey(token)) {
expectedToken = queryPairMap.get(token);
singleCountMap.put(token, singleCountMap.get(token) + 1);
tokenToIncrement = token;
} else {
expectedToken = "";
tokenToIncrement = "";
}
}
float score = 0.0f;
// Smoothing count of 1
docSize++;
for (String queryToken : phraseCountMap.keySet()) {
score += Math.log((float) (phraseCountMap.get(queryToken) + 1) / docSize);
}
return score;
}
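In the sequential dependence model these three component scores are typically interpolated into a single feature. A minimal sketch of that combination is below; the weights 0.85 / 0.1 / 0.05 are the commonly cited SDM defaults and are an assumption here, not necessarily the values Anserini uses.
// Sketch only: interpolate the term, ordered-phrase, and unordered-window features.
private float computeSDMScore(Document doc, Terms terms, RerankerContext context) throws IOException {
  float lambdaTerm = 0.85f;      // assumed weight for the full-independence feature
  float lambdaOrdered = 0.1f;    // assumed weight for the ordered-phrase feature
  float lambdaUnordered = 0.05f; // assumed weight for the unordered-window feature
  return lambdaTerm * computeFullIndependenceScore(doc, terms, context)
      + lambdaOrdered * computeOrderedFrequencyScore(doc, terms, context)
      + lambdaUnordered * computeUnorderedFrequencyScore(doc, terms, context);
}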
Use of org.apache.lucene.search.highlight.TokenStreamFromTermVector in project Anserini by castorini.
The class UnigramFeatureExtractor, method computeFullIndependenceScore.
/**
* The single-term scoring function: lambda * log((1 - alpha) * tf / |D|).
* @param doc the document being scored
* @param terms the term vector for the document
* @param context the reranker context, which supplies the query tokens
* @return the single-term (full independence) feature score
*/
private float computeFullIndependenceScore(Document doc, Terms terms, RerankerContext context) throws IOException {
// tf, the number of times a term occurs in the doc, can be calculated by iterating over terms;
// |D|, the total number of terms, can be calculated by iterating over the stream
IndexReader reader = context.getIndexSearcher().getIndexReader();
List<String> queryTokenList = context.getQueryTokens();
Map<String, Integer> termCount = new HashMap<>();
for (String queryToken : queryTokenList) {
termCount.put(queryToken, 0);
}
TokenStream stream = new TokenStreamFromTermVector(terms, -1);
CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
stream.reset();
float docSize = 0;
// Count all the tokens
while (stream.incrementToken()) {
docSize++;
String token = termAttribute.toString();
if (termCount.containsKey(token)) {
termCount.put(token, termCount.get(token) + 1);
}
}
float score = 0.0f;
// Smoothing count of 1
docSize++;
// Only compute the score for terms present in termCount; everything else contributes 0
for (String queryToken : termCount.keySet()) {
score += termCount.get(queryToken);
}
stream.end();
stream.close();
return score;
}
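The consumption pattern shared by all of these methods is the standard Lucene TokenStream lifecycle. Below is a minimal, self-contained sketch of that pattern over a term vector with the lifecycle calls (reset, incrementToken, end, close) made explicit; it is an illustration, not code from Anserini.
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.highlight.TokenStreamFromTermVector;

// Sketch only: iterate every token replayed from a term vector and count them.
static long countTokens(Terms termVector) throws IOException {
  // The second argument is the maximum start offset to replay; -1 means no limit.
  TokenStream stream = new TokenStreamFromTermVector(termVector, -1);
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  long count = 0;
  try {
    stream.reset();
    while (stream.incrementToken()) {
      count++; // termAtt.toString() yields the current token's text
    }
    stream.end();
  } finally {
    stream.close();
  }
  return count;
}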