Example 76 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.

the class SequentialDependenceModel method computeFullIndependenceScore.

/**
 * The single-term scoring function: lambda * log((1 - alpha) * tf / |D|),
 * where tf is the term's frequency in the document and |D| is the document length.
 * @param doc the document being scored
 * @param terms the term vector of the document
 * @param context the reranker context, which supplies the query tokens
 * @return the sum of smoothed log term-frequency scores over the query tokens
 */
private float computeFullIndependenceScore(Document doc, Terms terms, RerankerContext context) throws IOException {
    // tf can be calculated by iterating over terms, number of times a term occurs in doc
    // |D| total number of terms can be calculated by iterating over stream
    IndexReader reader = context.getIndexSearcher().getIndexReader();
    List<String> queryTokenList = context.getQueryTokens();
    Map<String, Integer> termCount = new HashMap<>();
    TokenStream stream = new TokenStreamFromTermVector(terms, 0);
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    // the TokenStream contract requires reset() before the first incrementToken()
    stream.reset();
    float docSize = 0;
    // Count all the tokens
    while (stream.incrementToken()) {
        docSize++;
        String token = termAttribute.toString();
        termCount.merge(token, 1, Integer::sum);
    }
    stream.end();
    stream.close();
    float score = 0.0f;
    // Smoothing count of 1
    docSize++;
    // Only the query tokens are scored; a token absent from termCount contributes a count of 0
    for (String queryToken : queryTokenList) {
        score += Math.log((float) (termCount.getOrDefault(queryToken, 0) + 1) / docSize);
    }
    return score;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) TokenStreamFromTermVector(org.apache.lucene.search.highlight.TokenStreamFromTermVector) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) HashMap(java.util.HashMap) IndexReader(org.apache.lucene.index.IndexReader)
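
Every example on this page drives a TokenStream the same way. A minimal, self-contained sketch of the full lifecycle (reset, incrementToken, end, close), here with try-with-resources so the stream and analyzer are released even if iteration throws; the class and field names are placeholders:

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TermCountSketch {
    public static Map<String, Integer> countTerms(String text) throws IOException {
        Map<String, Integer> counts = new HashMap<>();
        // TokenStream and Analyzer are both Closeable, so try-with-resources applies
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("contents", new StringReader(text))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            // contract: reset() before the first incrementToken(), end() after the last
            stream.reset();
            while (stream.incrementToken()) {
                counts.merge(term.toString(), 1, Integer::sum);
            }
            stream.end();
        }
        return counts;
    }
}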

Example 77 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.

the class WmdPassageScorer method score.

@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
    StandardAnalyzer sa = new StandardAnalyzer(StopFilter.makeStopSet(stopWords));
    TokenStream tokenStream = sa.tokenStream("contents", new StringReader(query));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    Set<String> questionTerms = new HashSet<>();
    Set<String> candidateTerms = new HashSet<>();
    // avoid duplicate passages
    Set<String> seenSentences = new HashSet<>();
    while (tokenStream.incrementToken()) {
        questionTerms.add(charTermAttribute.toString());
    }
    tokenStream.end();
    tokenStream.close();
    for (Map.Entry<String, Float> sent : sentences.entrySet()) {
        double wmd = 0.0;
        candidateTerms.clear();
        sa = new StandardAnalyzer(StopFilter.makeStopSet(stopWords));
        TokenStream candTokenStream = sa.tokenStream("contents", new StringReader(sent.getKey()));
        charTermAttribute = candTokenStream.addAttribute(CharTermAttribute.class);
        candTokenStream.reset();
        while (candTokenStream.incrementToken()) {
            candidateTerms.add(charTermAttribute.toString());
        }
        candTokenStream.end();
        candTokenStream.close();
        for (String qTerm : questionTerms) {
            double minWMD = Double.MAX_VALUE;
            for (String candTerm : candidateTerms) {
                try {
                    double thisWMD = distance(wmdDictionary.getEmbeddingVector(qTerm), wmdDictionary.getEmbeddingVector(candTerm));
                    if (minWMD > thisWMD) {
                        minWMD = thisWMD;
                    }
                } catch (TermNotFoundException e) {
                    String missingTerm = e.getMessage();
                    // only handle the case where the query term itself is missing;
                    // a missing candidate term is simply skipped
                    if (!qTerm.equals(missingTerm)) {
                        continue;
                    }
                    // identical terms have a mover's distance of 0 even when OOV
                    if (qTerm.equals(candTerm)) {
                        minWMD = 0.0;
                    } else {
                        try {
                            // if the embedding for the question term doesn't exist, consider
                            // it to be an unknown term
                            double thisWMD = distance(wmdDictionary.getEmbeddingVector("unk"), wmdDictionary.getEmbeddingVector(candTerm));
                            if (minWMD > thisWMD) {
                                minWMD = thisWMD;
                            }
                        } catch (TermNotFoundException e1) {
                        // "unk" is OOV
                        }
                    }
                } catch (IOException e) {
                // thrown if the search fails
                }
            }
            if (minWMD != Double.MAX_VALUE) {
                wmd += minWMD;
            }
        }
        double weightedScore = -1 * (wmd + 0.0001 * sent.getValue());
        ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
        if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore()) && !seenSentences.contains(sent.getKey())) {
            if (scoredPassageHeap.size() == topPassages) {
                scoredPassageHeap.pollLast();
            }
            scoredPassageHeap.add(scoredPassage);
            seenSentences.add(sent.getKey());
        }
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) TermNotFoundException(io.anserini.embeddings.TermNotFoundException) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer)
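
The nested loop above takes, for each question term, the distance to its nearest candidate term and sums those minima. A minimal sketch of that aggregation in isolation, with Euclidean distance standing in for the distance() helper and a Function<String, float[]> standing in for wmdDictionary.getEmbeddingVector (the TermNotFoundException fallback is omitted):

import java.util.Set;
import java.util.function.Function;

public class MinDistanceSketch {

    // Euclidean distance between two equal-length embedding vectors
    static double distance(float[] a, float[] b) {
        double sum = 0.0;
        for (int i = 0; i < a.length; i++) {
            double d = a[i] - b[i];
            sum += d * d;
        }
        return Math.sqrt(sum);
    }

    // for every query term, add the distance to its closest candidate term
    static double aggregate(Set<String> queryTerms, Set<String> candidateTerms,
                            Function<String, float[]> embed) {
        double total = 0.0;
        for (String q : queryTerms) {
            double min = Double.MAX_VALUE;
            for (String c : candidateTerms) {
                min = Math.min(min, distance(embed.apply(q), embed.apply(c)));
            }
            if (min != Double.MAX_VALUE) {
                total += min;
            }
        }
        return total;
    }
}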

Example 78 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project textdb by TextDB.

the class DataflowUtils method tokenizeQueryWithStopwords.

public static ArrayList<String> tokenizeQueryWithStopwords(String luceneAnalyzerStr, String query) {
    Analyzer luceneAnalyzer;
    if (luceneAnalyzerStr.equals(LuceneAnalyzerConstants.standardAnalyzerString())) {
        // use an empty stop word list for standard analyzer
        CharArraySet emptyStopwords = new CharArraySet(1, true);
        luceneAnalyzer = new StandardAnalyzer(emptyStopwords);
    } else if (luceneAnalyzerStr.equals(LuceneAnalyzerConstants.chineseAnalyzerString())) {
        // use the default smart chinese analyzer
        // because the smart chinese analyzer's default stopword list is simply a list of punctuations
        // https://lucene.apache.org/core/5_5_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html
        luceneAnalyzer = LuceneAnalyzerConstants.getLuceneAnalyzer(luceneAnalyzerStr);
    } else {
        throw new TexeraException("tokenizeQueryWithStopwords: analyzer " + luceneAnalyzerStr + " not recognized");
    }
    ArrayList<String> result = new ArrayList<String>();
    TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        String queryLowerCase = query.toLowerCase();
        int searchFrom = 0;
        while (tokenStream.incrementToken()) {
            String token = term.toString();
            // The analyzer lower-cases tokens, so locate each token in the
            // lower-cased query and copy the original casing from the query string.
            // Searching from the previous match keeps repeated tokens aligned.
            int tokenIndex = queryLowerCase.indexOf(token, searchFrom);
            String actualQueryToken = query.substring(tokenIndex, tokenIndex + token.length());
            result.add(actualQueryToken);
            searchFrom = tokenIndex + token.length();
        }
        tokenStream.close();
    } catch (IOException e) {
        throw new DataflowException(e);
    } finally {
        luceneAnalyzer.close();
    }
    return result;
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) IOException(java.io.IOException) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) TexeraException(edu.uci.ics.texera.api.exception.TexeraException) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) StringReader(java.io.StringReader) DataflowException(edu.uci.ics.texera.api.exception.DataflowException)
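
The standard-analyzer branch shows the general trick: the CharArraySet handed to StandardAnalyzer is its stopword list, so an empty set keeps every token. A minimal sketch using Lucene's CharArraySet.EMPTY_SET constant, equivalent to the empty set built above; class and method names are placeholders:

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

public class StopwordSketch {

    public static List<String> tokenizeKeepingStopwords(String text) throws IOException {
        List<String> tokens = new ArrayList<>();
        // an empty stop set disables stopword removal entirely
        try (StandardAnalyzer analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);
             TokenStream stream = analyzer.tokenStream(null, new StringReader(text))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                tokens.add(term.toString());
            }
            stream.end();
        }
        return tokens;
    }

    public static void main(String[] args) throws IOException {
        // with the empty stop set, "the" and "of" survive: [the, king, of, spain]
        System.out.println(tokenizeKeepingStopwords("The king of Spain"));
    }
}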

Example 79 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project textdb by TextDB.

the class DataflowUtils method generatePayload.

public static List<Span> generatePayload(String attributeName, String fieldValue, Analyzer luceneAnalyzer) {
    List<Span> payload = new ArrayList<>();
    try {
        TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(fieldValue));
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        int tokenPositionCounter = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokenPositionCounter += positionIncrementAttribute.getPositionIncrement();
            int tokenPosition = tokenPositionCounter;
            int charStart = offsetAttribute.startOffset();
            int charEnd = offsetAttribute.endOffset();
            String analyzedTermStr = charTermAttribute.toString();
            String originalTermStr = fieldValue.substring(charStart, charEnd);
            payload.add(new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition));
        }
        tokenStream.close();
    } catch (IOException e) {
        throw new DataflowException(e);
    }
    return payload;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) IOException(java.io.IOException) Span(edu.uci.ics.texera.api.span.Span) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
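
generatePayload relies on two attributes beyond the term text: OffsetAttribute maps each token back to its character span in the original string, and PositionIncrementAttribute encodes gaps left by tokens the analyzer dropped. A minimal sketch that prints all three, assuming StandardAnalyzer with its English stop set so that gaps actually appear:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class OffsetSketch {
    public static void main(String[] args) throws IOException {
        String text = "The art of tea";
        // the English stop set drops "the" and "of", leaving position gaps
        try (StandardAnalyzer analyzer = new StandardAnalyzer(StandardAnalyzer.STOP_WORDS_SET);
             TokenStream stream = analyzer.tokenStream(null, new StringReader(text))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
            int position = -1;
            stream.reset();
            while (stream.incrementToken()) {
                // an increment greater than 1 marks positions of removed stopwords
                position += posIncr.getPositionIncrement();
                System.out.printf("%s -> '%s' at position %d%n", term,
                        text.substring(offset.startOffset(), offset.endOffset()), position);
            }
            stream.end();
        }
        // prints: art -> 'art' at position 1, then tea -> 'tea' at position 3
    }
}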

Example 80 with CharTermAttribute

use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project searchcode-server by boyter.

the class LengthFilter method main.

public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of the TokenStream API";
    CodeAnalyzer analyzer = new CodeAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    // get the CharTermAttribute from the TokenStream
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        // print all tokens until stream is exhausted
        while (stream.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        stream.end();
    } finally {
        stream.close();
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader)

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 151
TokenStream (org.apache.lucene.analysis.TokenStream): 95
StringReader (java.io.StringReader): 46
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 35
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 34
IOException (java.io.IOException): 27
ArrayList (java.util.ArrayList): 27
Tokenizer (org.apache.lucene.analysis.Tokenizer): 25
Analyzer (org.apache.lucene.analysis.Analyzer): 20
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 16
BytesRef (org.apache.lucene.util.BytesRef): 15
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 13
LinkedList (java.util.LinkedList): 11
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 10
Term (org.apache.lucene.index.Term): 10
HashMap (java.util.HashMap): 9
Token (org.apache.lucene.analysis.Token): 8
Document (org.apache.lucene.document.Document): 8
List (java.util.List): 7
HashSet (java.util.HashSet): 6