
Example 81 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project HongsCORE by ihongs.

The class DemoTest, method main.

public static void main(String[] args) throws IOException {
    Analyzer az = CustomAnalyzer.builder()
            .withTokenizer("Name")
            .addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
            .build();
    StringReader sr = new StringReader(args[0]);
    TokenStream ts = az.tokenStream("", sr);
    OffsetAttribute oa = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
    try {
        // Resets this stream to the beginning. (Required)
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(ta.toString() + "|" + ta.length() + "[" + oa.startOffset() + "," + oa.endOffset() + "]");
        }
        // Perform end-of-stream operations, e.g. set the final offset.
        ts.end();
    } finally {
        // Release resources associated with this stream.
        ts.close();
    }
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Analyzer(org.apache.lucene.analysis.Analyzer) CustomAnalyzer(org.apache.lucene.analysis.custom.CustomAnalyzer)
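For intuition, here is the output a run with input "lucene" could produce, assuming the project-specific "Name" tokenizer emits the whole input as a single token (an assumption; that tokenizer is defined by HongsCORE). The EdgeNGram filter expands the token into every prefix of length 1 through 20, and in recent Lucene versions each gram keeps the original token's offsets:

l|1[0,6]
lu|2[0,6]
luc|3[0,6]
luce|4[0,6]
lucen|5[0,6]
lucene|6[0,6]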

Example 82 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.

The class SequentialDependenceModel, method computeOrderedFrequencyScore.

private float computeOrderedFrequencyScore(Document doc, Terms terms, RerankerContext context) throws IOException {
    List<String> queryTokens = context.getQueryTokens();
    Map<String, String> queryPairMap = new HashMap<>();
    Map<String, Integer> phraseCountMap = new HashMap<>();
    Map<String, Integer> singleCountMap = new HashMap<>();
    // Construct count maps and a map of adjacent query-token pairs (x -> y)
    for (int i = 0; i < queryTokens.size() - 1; i++) {
        queryPairMap.put(queryTokens.get(i), queryTokens.get(i + 1));
        phraseCountMap.put(queryTokens.get(i), 0);
        // This will serve as our smoothing param
        singleCountMap.put(queryTokens.get(i), 1);
    }
    // Construct token stream with offset 0
    TokenStream stream = new TokenStreamFromTermVector(terms, 0);
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    // A TokenStream must be reset before the first call to incrementToken()
    stream.reset();
    float docSize = 0.0f;
    // Track the token we expect to see next and, when we see it, which query
    // token's phrase count to increment (counts live on the first token of each pair)
    String expectedToken = "";
    String tokenToIncrement = "";
    while (stream.incrementToken()) {
        docSize++;
        String token = termAttribute.toString();
        if (token.equalsIgnoreCase(expectedToken)) {
            phraseCountMap.put(tokenToIncrement, phraseCountMap.get(tokenToIncrement) + 1);
        }
        // Check now if this token could be the start of an ordered phrase
        if (queryPairMap.containsKey(token)) {
            expectedToken = queryPairMap.get(token);
            singleCountMap.put(token, singleCountMap.get(token) + 1);
            tokenToIncrement = token;
        } else {
            expectedToken = "";
            tokenToIncrement = "";
        }
    }
    stream.end();
    stream.close();
    float score = 0.0f;
    // Smoothing count of 1
    docSize++;
    for (String queryToken : phraseCountMap.keySet()) {
        score += Math.log((float) (phraseCountMap.get(queryToken) + 1) / docSize);
    }
    return score;
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) TokenStreamFromTermVector(org.apache.lucene.search.highlight.TokenStreamFromTermVector) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) HashMap(java.util.HashMap)
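To see how the pair map drives the ordered counting, here is a minimal, self-contained sketch of the same matching loop over made-up data (the query tokens and document tokens are invented for illustration; no Lucene types involved):

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class OrderedPairDemo {
    public static void main(String[] args) {
        List<String> queryTokens = List.of("new", "york", "city");
        // Map each query token to the token expected to follow it
        Map<String, String> queryPairMap = new HashMap<>();
        Map<String, Integer> phraseCountMap = new HashMap<>();
        for (int i = 0; i < queryTokens.size() - 1; i++) {
            queryPairMap.put(queryTokens.get(i), queryTokens.get(i + 1));
            phraseCountMap.put(queryTokens.get(i), 0);
        }
        // Simulated document token stream
        String[] doc = { "the", "new", "york", "subway", "new", "jersey" };
        String expectedToken = "";
        String tokenToIncrement = "";
        for (String token : doc) {
            if (token.equalsIgnoreCase(expectedToken)) {
                phraseCountMap.merge(tokenToIncrement, 1, Integer::sum);
            }
            if (queryPairMap.containsKey(token)) {
                expectedToken = queryPairMap.get(token);
                tokenToIncrement = token;
            } else {
                expectedToken = "";
                tokenToIncrement = "";
            }
        }
        // Prints counts like {new=1, york=0}: "new york" occurs once, "york city" never
        System.out.println(phraseCountMap);
    }
}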

Example 83 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project Anserini by castorini.

The class UnigramFeatureExtractor, method computeFullIndependenceScore.

/**
 * The single-term scoring function: lambda * log((1 - alpha) * tf / |D|)
 * @param doc the document being scored
 * @param terms the document's term vector
 * @param context the reranker context holding the query tokens
 * @return the accumulated score over the query tokens
 */
private float computeFullIndependenceScore(Document doc, Terms terms, RerankerContext context) throws IOException {
    // tf: the number of times each query term occurs in the doc, counted by iterating the stream
    // |D|: the total number of tokens in the doc, counted in the same pass
    IndexReader reader = context.getIndexSearcher().getIndexReader();
    List<String> queryTokenList = context.getQueryTokens();
    Map<String, Integer> termCount = new HashMap<>();
    for (String queryToken : queryTokenList) {
        termCount.put(queryToken, 0);
    }
    TokenStream stream = new TokenStreamFromTermVector(terms, -1);
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    float docSize = 0;
    // Count all the tokens
    while (stream.incrementToken()) {
        docSize++;
        String token = termAttribute.toString();
        if (termCount.containsKey(token)) {
            termCount.put(token, termCount.get(token) + 1);
        }
    }
    float score = 0.0f;
    // Smoothing count of 1
    docSize++;
    // Only terms present in termCount contribute to the score; all other terms count as 0
    for (String queryToken : termCount.keySet()) {
        score += termCount.get(queryToken);
    }
    stream.end();
    stream.close();
    return score;
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) TokenStreamFromTermVector(org.apache.lucene.search.highlight.TokenStreamFromTermVector) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) HashMap(java.util.HashMap) IndexReader(org.apache.lucene.index.IndexReader)
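Note that the method returns the raw sum of term frequencies even though its javadoc describes a smoothed log formula. A hypothetical helper matching the documented formula could look like this (lambda and alpha are assumed weights, not values taken from Anserini):

import java.util.Map;

// Hypothetical per-term score: lambda * log((1 - alpha) * tf / |D|), with +1 smoothing
static float smoothedScore(Map<String, Integer> termCount, float docSize, float lambda, float alpha) {
    float score = 0.0f;
    for (int tf : termCount.values()) {
        // The +1 mirrors the docSize++ smoothing above and avoids log(0)
        score += lambda * Math.log((1 - alpha) * (tf + 1) / docSize);
    }
    return score;
}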

Example 84 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

The class SimpleQueryParser, method newPossiblyAnalyzedQuery.

/**
     * Analyze the given string using its analyzer, constructing either a
     * {@code PrefixQuery} or a {@code BooleanQuery} made up
     * of {@code TermQuery}s and {@code PrefixQuery}s
     */
private Query newPossiblyAnalyzedQuery(String field, String termStr) {
    List<List<BytesRef>> tlist = new ArrayList<>();
    // get Analyzer from superclass and tokenize the term
    try (TokenStream source = getAnalyzer().tokenStream(field, termStr)) {
        source.reset();
        List<BytesRef> currentPos = new ArrayList<>();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAtt = source.addAttribute(PositionIncrementAttribute.class);
        try {
            boolean hasMoreTokens = source.incrementToken();
            while (hasMoreTokens) {
                if (currentPos.isEmpty() == false && posAtt.getPositionIncrement() > 0) {
                    tlist.add(currentPos);
                    currentPos = new ArrayList<>();
                }
                final BytesRef term = getAnalyzer().normalize(field, termAtt.toString());
                currentPos.add(term);
                hasMoreTokens = source.incrementToken();
            }
            if (currentPos.isEmpty() == false) {
                tlist.add(currentPos);
            }
        } catch (IOException e) {
            // ignore
            // TODO: we should not ignore the exception; should we return a prefix query with the original term?
        }
    } catch (IOException e) {
        // Bail on any exceptions, going with a regular prefix query
        return new PrefixQuery(new Term(field, termStr));
    }
    if (tlist.size() == 0) {
        return null;
    }
    if (tlist.size() == 1 && tlist.get(0).size() == 1) {
        return new PrefixQuery(new Term(field, tlist.get(0).get(0)));
    }
    // build a boolean query with prefix on the last position only.
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    for (int pos = 0; pos < tlist.size(); pos++) {
        List<BytesRef> plist = tlist.get(pos);
        boolean isLastPos = (pos == tlist.size() - 1);
        Query posQuery;
        if (plist.size() == 1) {
            if (isLastPos) {
                posQuery = new PrefixQuery(new Term(field, plist.get(0)));
            } else {
                posQuery = newTermQuery(new Term(field, plist.get(0)));
            }
        } else if (isLastPos == false) {
            // build a synonym query for terms in the same position.
            Term[] terms = new Term[plist.size()];
            for (int i = 0; i < plist.size(); i++) {
                terms[i] = new Term(field, plist.get(i));
            }
            posQuery = new SynonymQuery(terms);
        } else {
            BooleanQuery.Builder innerBuilder = new BooleanQuery.Builder();
            for (BytesRef token : plist) {
                innerBuilder.add(new BooleanClause(new PrefixQuery(new Term(field, token)), BooleanClause.Occur.SHOULD));
            }
            posQuery = innerBuilder.setDisableCoord(true).build();
        }
        builder.add(new BooleanClause(posQuery, getDefaultOperator()));
    }
    return builder.build();
}
Also used: BooleanQuery(org.apache.lucene.search.BooleanQuery) TokenStream(org.apache.lucene.analysis.TokenStream) Query(org.apache.lucene.search.Query) PrefixQuery(org.apache.lucene.search.PrefixQuery) FuzzyQuery(org.apache.lucene.search.FuzzyQuery) SynonymQuery(org.apache.lucene.search.SynonymQuery) BoostQuery(org.apache.lucene.search.BoostQuery) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Term(org.apache.lucene.index.Term) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) BooleanClause(org.apache.lucene.search.BooleanClause) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) List(java.util.List) BytesRef(org.apache.lucene.util.BytesRef)
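For intuition, an analyzed input such as "quick brow" ends up as a BooleanQuery with a TermQuery per position and a PrefixQuery on the last one. The following sketch builds that shape directly rather than calling SimpleQueryParser, and assumes the analyzer produced two single-term positions and that the default operator is MUST (both assumptions):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.TermQuery;

public class ShapeDemo {
    public static void main(String[] args) {
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        // Every position except the last becomes an exact TermQuery
        builder.add(new BooleanClause(new TermQuery(new Term("body", "quick")), BooleanClause.Occur.MUST));
        // The last position becomes a PrefixQuery so partial input still matches
        builder.add(new BooleanClause(new PrefixQuery(new Term("body", "brow")), BooleanClause.Occur.MUST));
        // Prints: +body:quick +body:brow*
        System.out.println(builder.build());
    }
}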

Example 85 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project elasticsearch by elastic.

The class KeywordFieldMapper, method parseCreateField.

@Override
protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException {
    String value;
    if (context.externalValueSet()) {
        value = context.externalValue().toString();
    } else {
        XContentParser parser = context.parser();
        if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
            value = fieldType().nullValueAsString();
        } else {
            value = parser.textOrNull();
        }
    }
    if (value == null || value.length() > ignoreAbove) {
        return;
    }
    final NamedAnalyzer normalizer = fieldType().normalizer();
    if (normalizer != null) {
        try (TokenStream ts = normalizer.tokenStream(name(), value)) {
            final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            if (ts.incrementToken() == false) {
                throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 0 for analyzer " + normalizer + " and input \"" + value + "\"");
            }
            final String newValue = termAtt.toString();
            if (ts.incrementToken()) {
                throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 2+ for analyzer " + normalizer + " and input \"" + value + "\"");
            }
            ts.end();
            value = newValue;
        }
    }
    if (context.includeInAll(includeInAll, this)) {
        context.allEntries().addText(fieldType().name(), value, fieldType().boost());
    }
    // convert to utf8 only once before feeding postings/dv/stored fields
    final BytesRef binaryValue = new BytesRef(value);
    if (fieldType().indexOptions() != IndexOptions.NONE || fieldType().stored()) {
        Field field = new Field(fieldType().name(), binaryValue, fieldType());
        fields.add(field);
    }
    if (fieldType().hasDocValues()) {
        fields.add(new SortedSetDocValuesField(fieldType().name(), binaryValue));
    }
}
Also used: SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) IndexableField(org.apache.lucene.index.IndexableField) Field(org.apache.lucene.document.Field) TypeParsers.parseField(org.elasticsearch.index.mapper.TypeParsers.parseField) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) NamedAnalyzer(org.elasticsearch.index.analysis.NamedAnalyzer) XContentParser(org.elasticsearch.common.xcontent.XContentParser) BytesRef(org.apache.lucene.util.BytesRef)
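The "exactly 1 token" contract holds because a normalizer's chain starts from a keyword-style tokenizer that emits the whole field value as a single token. A minimal sketch of such a chain using Lucene's CustomAnalyzer (the field name and input are invented for illustration):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NormalizerDemo {
    public static void main(String[] args) throws Exception {
        // "keyword" emits the entire input as one token; "lowercase" normalizes it,
        // so the stream yields exactly one token, as KeywordFieldMapper expects
        Analyzer normalizer = CustomAnalyzer.builder()
                .withTokenizer("keyword")
                .addTokenFilter("lowercase")
                .build();
        try (TokenStream ts = normalizer.tokenStream("city", "New York")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // Prints: new york
                System.out.println(termAtt.toString());
            }
            ts.end();
        }
    }
}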

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 151
TokenStream (org.apache.lucene.analysis.TokenStream): 95
StringReader (java.io.StringReader): 46
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 35
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 34
IOException (java.io.IOException): 27
ArrayList (java.util.ArrayList): 27
Tokenizer (org.apache.lucene.analysis.Tokenizer): 25
Analyzer (org.apache.lucene.analysis.Analyzer): 20
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 16
BytesRef (org.apache.lucene.util.BytesRef): 15
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 13
LinkedList (java.util.LinkedList): 11
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 10
Term (org.apache.lucene.index.Term): 10
HashMap (java.util.HashMap): 9
Token (org.apache.lucene.analysis.Token): 8
Document (org.apache.lucene.document.Document): 8
List (java.util.List): 7
HashSet (java.util.HashSet): 6