use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.
the class SequentialDependenceModel method computeFullIndependenceScore.
/**
* The single-term (full independence) scoring function: lambda * log((1 - alpha) * tf / |D|)
* @param doc the document being scored
* @param terms the term vector of the scored document
* @param context the reranker context that supplies the query tokens
* @return the full independence component of the SDM score
* @throws IOException if reading the term vector fails
*/
private float computeFullIndependenceScore(Document doc, Terms terms, RerankerContext context) throws IOException {
// tf can be calculated by iterating over terms, number of times a term occurs in doc
// |D| total number of terms can be calculated by iterating over stream
IndexReader reader = context.getIndexSearcher().getIndexReader();
List<String> queryTokenList = context.getQueryTokens();
Map<String, Integer> termCount = new HashMap<>();
// Track counts only for the query tokens; every other term contributes nothing to the score
for (String queryToken : queryTokenList) {
termCount.put(queryToken, 0);
}
// -1 disables the start-offset limit so the stream yields every token in the document
TokenStream stream = new TokenStreamFromTermVector(terms, -1);
CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
float docSize = 0;
// Count all the tokens (|D|) and the occurrences of each query token
while (stream.incrementToken()) {
docSize++;
String token = termAttribute.toString();
if (termCount.containsKey(token)) {
termCount.put(token, termCount.get(token) + 1);
}
}
float score = 0.0f;
// Smoothing count of 1
docSize++;
// Sum the smoothed log-frequency of each query token; tokens absent from the document keep a count of 0
for (String queryToken : termCount.keySet()) {
score += Math.log((float) (termCount.get(queryToken) + 1) / docSize);
}
return score;
}
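The same counting pattern works with any analyzer-produced TokenStream, not just Anserini's TokenStreamFromTermVector. Below is a minimal standalone sketch of computing tf and |D| with a plain StandardAnalyzer; the field name, sample text, and analyzer choice are illustrative assumptions rather than Anserini code.
// Minimal sketch (assumed names and text), not Anserini code
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TermCountSketch {
  public static void main(String[] args) throws IOException {
    String docText = "the quick brown fox jumps over the lazy dog";
    Map<String, Integer> termCount = new HashMap<>();
    int docSize = 0;
    try (StandardAnalyzer analyzer = new StandardAnalyzer();
         TokenStream stream = analyzer.tokenStream("contents", new StringReader(docText))) {
      CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        docSize++;  // |D|: total number of tokens in the document
        termCount.merge(termAttribute.toString(), 1, Integer::sum);  // tf per term
      }
      stream.end();
    }
    System.out.println("|D| = " + docSize + ", counts = " + termCount);
  }
}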
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.
the class WmdPassageScorer method score.
@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
StandardAnalyzer sa = new StandardAnalyzer(StopFilter.makeStopSet(stopWords));
TokenStream tokenStream = sa.tokenStream("contents", new StringReader(query));
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
Set<String> questionTerms = new HashSet<>();
Set<String> candidateTerms = new HashSet<>();
// avoid duplicate passages
Set<String> seenSentences = new HashSet<>();
while (tokenStream.incrementToken()) {
questionTerms.add(charTermAttribute.toString());
}
tokenStream.end();
tokenStream.close();
for (Map.Entry<String, Float> sent : sentences.entrySet()) {
double wmd = 0.0;
candidateTerms.clear();
sa = new StandardAnalyzer(StopFilter.makeStopSet(stopWords));
TokenStream candTokenStream = sa.tokenStream("contents", new StringReader(sent.getKey()));
charTermAttribute = candTokenStream.addAttribute(CharTermAttribute.class);
candTokenStream.reset();
while (candTokenStream.incrementToken()) {
candidateTerms.add(charTermAttribute.toString());
}
candTokenStream.end();
candTokenStream.close();
for (String qTerm : questionTerms) {
double minWMD = Double.MAX_VALUE;
for (String candTerm : candidateTerms) {
try {
double thisWMD = distance(wmdDictionary.getEmbeddingVector(qTerm), wmdDictionary.getEmbeddingVector(candTerm));
if (minWMD > thisWMD) {
minWMD = thisWMD;
}
} catch (TermNotFoundException e) {
String missingTerm = e.getMessage();
// If the candidate term (not the query term) has no embedding, skip this candidate
if (!qTerm.equals(missingTerm)) {
continue;
}
// The query term itself is out of vocabulary: if it is textually identical to the
// candidate term, the mover's distance is 0
if (qTerm.equals(candTerm)) {
minWMD = 0.0;
} else {
try {
// otherwise back off and treat the query term as the unknown token "unk"
double thisWMD = distance(wmdDictionary.getEmbeddingVector("unk"), wmdDictionary.getEmbeddingVector(candTerm));
if (minWMD > thisWMD) {
minWMD = thisWMD;
}
} catch (TermNotFoundException e1) {
// "unk" is OOV
}
}
} catch (IOException e) {
// thrown if the search fails
}
}
if (minWMD != Double.MAX_VALUE) {
wmd += minWMD;
}
}
double weightedScore = -1 * (wmd + 0.0001 * sent.getValue());
ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore()) && !seenSentences.contains(sent.getKey())) {
if (scoredPassageHeap.size() == topPassages) {
scoredPassageHeap.pollLast();
}
scoredPassageHeap.add(scoredPassage);
seenSentences.add(sent.getKey());
}
}
}
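The loop above relies on a distance(...) helper between embedding vectors that is not shown here. A minimal sketch follows, assuming the vectors are float arrays and that plain Euclidean distance is the metric; both the signature and the metric are assumptions, not necessarily what Anserini's WmdPassageScorer implements.
// Hedged sketch of the assumed distance(...) helper
private static double distance(float[] queryVector, float[] candidateVector) {
  double sum = 0.0;
  for (int i = 0; i < queryVector.length; i++) {
    // accumulate squared component-wise differences
    double diff = queryVector[i] - candidateVector[i];
    sum += diff * diff;
  }
  // a smaller value means the two terms are closer in embedding space
  return Math.sqrt(sum);
}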
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project textdb by TextDB.
the class DataflowUtils method tokenizeQueryWithStopwords.
public static ArrayList<String> tokenizeQueryWithStopwords(String luceneAnalyzerStr, String query) {
Analyzer luceneAnalyzer;
if (luceneAnalyzerStr.equals(LuceneAnalyzerConstants.standardAnalyzerString())) {
// use an empty stop word list for standard analyzer
CharArraySet emptyStopwords = new CharArraySet(1, true);
luceneAnalyzer = new StandardAnalyzer(emptyStopwords);
} else if (luceneAnalyzerStr.equals(LuceneAnalyzerConstants.chineseAnalyzerString())) {
// use the default smart chinese analyzer
// because the smart chinese analyzer's default stopword list is simply a list of punctuations
// https://lucene.apache.org/core/5_5_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html
luceneAnalyzer = LuceneAnalyzerConstants.getLuceneAnalyzer(luceneAnalyzerStr);
} else {
throw new TexeraException("tokenizeQueryWithStopwords: analyzer " + luceneAnalyzerStr + " not recognized");
}
ArrayList<String> result = new ArrayList<String>();
TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
try {
tokenStream.reset();
while (tokenStream.incrementToken()) {
String token = term.toString();
// The analyzer lower-cases tokens, so find the token in the lower-cased query and
// copy the original-case text out of the query string (this assumes every analyzed
// token still appears verbatim in the query).
int tokenIndex = query.toLowerCase().indexOf(token);
String actualQueryToken = query.substring(tokenIndex, tokenIndex + token.length());
result.add(actualQueryToken);
}
tokenStream.close();
} catch (IOException e) {
throw new DataflowException(e);
} finally {
luceneAnalyzer.close();
}
return result;
}
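A hedged usage sketch: with the standard analyzer string, the empty stop set keeps every word and the original casing is restored from the query text, so a query such as "The New York Times" should come back as ["The", "New", "York", "Times"]; the sample query is illustrative.
// Assumed usage; the query text is an illustrative example
ArrayList<String> tokens = DataflowUtils.tokenizeQueryWithStopwords(
    LuceneAnalyzerConstants.standardAnalyzerString(), "The New York Times");
System.out.println(tokens);  // expected: [The, New, York, Times]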
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project textdb by TextDB.
the class DataflowUtils method generatePayload.
public static List<Span> generatePayload(String attributeName, String fieldValue, Analyzer luceneAnalyzer) {
List<Span> payload = new ArrayList<>();
try {
TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(fieldValue));
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
int tokenPositionCounter = -1;
tokenStream.reset();
while (tokenStream.incrementToken()) {
tokenPositionCounter += positionIncrementAttribute.getPositionIncrement();
int tokenPosition = tokenPositionCounter;
int charStart = offsetAttribute.startOffset();
int charEnd = offsetAttribute.endOffset();
String analyzedTermStr = charTermAttribute.toString();
String originalTermStr = fieldValue.substring(charStart, charEnd);
payload.add(new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition));
}
tokenStream.close();
} catch (IOException e) {
throw new DataflowException(e);
}
return payload;
}
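generatePayload advances tokenPositionCounter by getPositionIncrement() rather than by one, so token positions keep the gaps left by tokens a filter removed. The standalone sketch below (not textdb code) shows the effect with a StandardAnalyzer given a small stop set; the stop words and sample text are illustrative assumptions.
// Standalone sketch showing why position increments matter; stop set and text are assumed
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class PositionIncrementSketch {
  public static void main(String[] args) throws Exception {
    StandardAnalyzer analyzer = new StandardAnalyzer(StopFilter.makeStopSet("of", "the"));
    TokenStream stream = analyzer.tokenStream("contents", new StringReader("the speed of light"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    int position = -1;
    stream.reset();
    while (stream.incrementToken()) {
      // dropped stop words yield increments > 1: "speed" lands at position 1, "light" at 3
      position += posIncr.getPositionIncrement();
      System.out.println(term.toString() + " -> position " + position);
    }
    stream.end();
    stream.close();
    analyzer.close();
  }
}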
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project searchcode-server by boyter.
the class LengthFilter method main.
public static void main(String[] args) throws IOException {
// text to tokenize
final String text = "This is a demo of the TokenStream API";
CodeAnalyzer analyzer = new CodeAnalyzer();
TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
// get the CharTermAttribute from the TokenStream
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
try {
stream.reset();
// print all tokens until stream is exhausted
while (stream.incrementToken()) {
System.out.println(termAtt.toString());
}
stream.end();
} finally {
stream.close();
}
}
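Since the surrounding class is named LengthFilter, a natural extension of this demo is to wrap the stream in Lucene's org.apache.lucene.analysis.miscellaneous.LengthFilter so that only tokens within a length range survive. The sketch below is an illustrative assumption, not searchcode-server code; it reuses the analyzer and text from the demo above, and the 3-to-20 character bounds are arbitrary.
// Hedged follow-on sketch: wrap the stream in Lucene's LengthFilter
TokenStream filtered = new LengthFilter(analyzer.tokenStream("field", new StringReader(text)), 3, 20);
CharTermAttribute filteredTerm = filtered.addAttribute(CharTermAttribute.class);
try {
  filtered.reset();
  // tokens shorter than 3 characters, such as "is" and "a", are dropped by the filter
  while (filtered.incrementToken()) {
    System.out.println(filteredTerm.toString());
  }
  filtered.end();
} finally {
  filtered.close();
}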