Search in sources :

Example 1 with Token

use of org.apache.lucene.analysis.Token in project elasticsearch by elastic.

the class FlattenGraphTokenFilterFactoryTests method token.

/**
 * Creates a {@link Token} carrying the given term text and offsets, then applies
 * the requested position increment and position length to it.
 *
 * @param term        term text of the token
 * @param posInc      position increment to set on the token
 * @param posLength   position length to set on the token
 * @param startOffset start offset passed to the {@link Token} constructor
 * @param endOffset   end offset passed to the {@link Token} constructor
 * @return the newly created token
 */
private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
    final Token built = new Token(term, startOffset, endOffset);
    // Increment is applied before length, matching the original call order.
    built.setPositionIncrement(posInc);
    built.setPositionLength(posLength);
    return built;
}
Also used : Token(org.apache.lucene.analysis.Token)

Example 2 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class SpellCheckComponent method getTokens.

/**
 * Analyzes {@code q} with {@code analyzer} and returns one {@link Token} per token
 * the stream produces.
 *
 * <p>Each returned token is a fresh copy holding the term text, offsets, type, flags,
 * payload and position increment read from the stream's attributes. Other (custom)
 * attributes are not copied.
 *
 * @param q        text to analyze
 * @param analyzer analyzer used to build the token stream; asserted to be non-null
 * @return the tokens in the order the stream emitted them
 * @throws IOException if the token stream fails while being consumed or closed
 */
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
    assert analyzer != null;
    final Collection<Token> collected = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream("", q)) {
        stream.reset();
        // TODO: support custom attributes
        final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        final OffsetAttribute offsets = stream.addAttribute(OffsetAttribute.class);
        final TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        final FlagsAttribute flags = stream.addAttribute(FlagsAttribute.class);
        final PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);
        final PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
        while (stream.incrementToken()) {
            // The attribute instances are reused across iterations, so copy their
            // current values into a dedicated Token before advancing.
            final Token copy = new Token();
            copy.copyBuffer(term.buffer(), 0, term.length());
            copy.setOffset(offsets.startOffset(), offsets.endOffset());
            copy.setType(type.type());
            copy.setFlags(flags.getFlags());
            copy.setPayload(payload.getPayload());
            copy.setPositionIncrement(posInc.getPositionIncrement());
            collected.add(copy);
        }
        stream.end();
    }
    return collected;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) ArrayList(java.util.ArrayList) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Token(org.apache.lucene.analysis.Token) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 3 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class SpellCheckComponent method process.

/**
 * Runs spell checking for the current request and, when there are tokens to check,
 * adds a {@code "spellcheck"} entry to the response.
 *
 * <p>Does nothing when the component is not enabled for the request or no spell
 * checkers are configured. The text to check is taken from the {@code SPELLCHECK_Q}
 * parameter when present (tokenized with the selected spell checker's query analyzer);
 * otherwise from the request's query string, falling back to {@code CommonParams.Q},
 * converted by {@code queryConverter}.
 *
 * @param rb the response builder giving access to the request and response
 * @throws IOException   if tokenizing the query fails
 * @throws SolrException with {@code NOT_FOUND} when no spell checker matches the
 *                       requested dictionaries and there is something to check
 */
@Override
@SuppressWarnings("unchecked")
public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false) || spellCheckers.isEmpty()) {
        return;
    }
    boolean shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
    String q = params.get(SPELLCHECK_Q);
    SolrSpellChecker spellChecker = getSpellChecker(params);
    Collection<Token> tokens = null;
    if (q != null) {
        // The query analyzer comes from the spell checker, so an unknown dictionary
        // must be reported here; otherwise the dereference below would throw a
        // NullPointerException instead of the intended NOT_FOUND error.
        if (spellChecker == null) {
            throw dictionariesNotFound(params);
        }
        //we have a spell check param, tokenize it with the query analyzer applicable for this spellchecker
        tokens = getTokens(q, spellChecker.getQueryAnalyzer());
    } else {
        q = rb.getQueryString();
        if (q == null) {
            q = params.get(CommonParams.Q);
        }
        tokens = queryConverter.convert(q);
    }
    if (tokens == null || tokens.isEmpty()) {
        // Nothing to check: leave the response untouched.
        return;
    }
    if (spellChecker == null) {
        throw dictionariesNotFound(params);
    }
    int count = params.getInt(SPELLCHECK_COUNT, 1);
    boolean onlyMorePopular = params.getBool(SPELLCHECK_ONLY_MORE_POPULAR, DEFAULT_ONLY_MORE_POPULAR);
    boolean extendedResults = params.getBool(SPELLCHECK_EXTENDED_RESULTS, false);
    boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
    float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE);
    int alternativeTermCount = params.getInt(SpellingParams.SPELLCHECK_ALTERNATIVE_TERM_COUNT, 0);
    //If specified, this can be a discrete # of results, or a percentage of fq results.
    Integer maxResultsForSuggest = maxResultsForSuggest(rb);
    ModifiableSolrParams customParams = new ModifiableSolrParams();
    for (String checkerName : getDictionaryNames(params)) {
        customParams.add(getCustomParams(checkerName, params));
    }
    // Prefer the hit count already recorded in the response log; fall back to the
    // count reported by the response builder when it is absent.
    Integer hitsInteger = (Integer) rb.rsp.getToLog().get("hits");
    long hits;
    if (hitsInteger == null) {
        hits = rb.getNumberDocumentsFound();
    } else {
        hits = hitsInteger.longValue();
    }
    SpellingResult spellingResult;
    if (maxResultsForSuggest == null || hits <= maxResultsForSuggest) {
        SuggestMode suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
        if (onlyMorePopular) {
            suggestMode = SuggestMode.SUGGEST_MORE_POPULAR;
        } else if (alternativeTermCount > 0) {
            suggestMode = SuggestMode.SUGGEST_ALWAYS;
        }
        IndexReader reader = rb.req.getSearcher().getIndexReader();
        SpellingOptions options = new SpellingOptions(tokens, reader, count, alternativeTermCount, suggestMode, extendedResults, accuracy, customParams);
        spellingResult = spellChecker.getSuggestions(options);
    } else {
        // Enough hits were found: report an empty result rather than asking for suggestions.
        spellingResult = new SpellingResult();
    }
    boolean isCorrectlySpelled = hits > (maxResultsForSuggest == null ? 0 : maxResultsForSuggest);
    NamedList response = new SimpleOrderedMap();
    NamedList suggestions = toNamedList(shardRequest, spellingResult, q, extendedResults);
    response.add("suggestions", suggestions);
    if (extendedResults) {
        response.add("correctlySpelled", isCorrectlySpelled);
    }
    if (collate) {
        addCollationsToResponse(params, spellingResult, rb, q, response, spellChecker.isSuggestionsMayOverlap());
    }
    if (shardRequest) {
        addOriginalTermsToResponse(response, tokens);
    }
    rb.rsp.add("spellcheck", response);
}

/** Builds the NOT_FOUND error reported when the requested dictionaries have no spell checker. */
private SolrException dictionariesNotFound(SolrParams params) {
    return new SolrException(SolrException.ErrorCode.NOT_FOUND, "Specified dictionaries do not exist: " + getDictionaryNameAsSingleString(getDictionaryNames(params)));
}
Also used : ConjunctionSolrSpellChecker(org.apache.solr.spelling.ConjunctionSolrSpellChecker) SolrSpellChecker(org.apache.solr.spelling.SolrSpellChecker) NamedList(org.apache.solr.common.util.NamedList) Token(org.apache.lucene.analysis.Token) SpellingOptions(org.apache.solr.spelling.SpellingOptions) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) SuggestMode(org.apache.lucene.search.spell.SuggestMode) SpellingResult(org.apache.solr.spelling.SpellingResult) IndexReader(org.apache.lucene.index.IndexReader) SolrParams(org.apache.solr.common.params.SolrParams) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) SolrException(org.apache.solr.common.SolrException)

Example 4 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class SpellCheckComponent method toNamedList.

/**
 * Converts a {@link SpellingResult} into the {@link NamedList} structure placed in the response.
 *
 * <p>For every input token, the token's own text is dropped from its suggestions. A token
 * is then emitted (keyed by its text) when it still has suggestions, or unconditionally
 * for shard requests. Each emitted entry holds {@code numFound}, {@code startOffset},
 * {@code endOffset} and {@code suggestion}; with extended results and frequency info
 * available it also holds {@code origFreq}, and each suggestion becomes a
 * {@code word}/{@code freq} pair instead of a bare string.
 *
 * @param shardRequest    whether tokens without suggestions must still be emitted
 * @param spellingResult  the suggestions and token frequencies to convert
 * @param origQuery       the original query text (not used by this implementation)
 * @param extendedResults whether to include frequency details when available
 * @return the per-token suggestion entries, in the iteration order of the result's suggestions
 */
protected NamedList toNamedList(boolean shardRequest, SpellingResult spellingResult, String origQuery, boolean extendedResults) {
    NamedList result = new NamedList();
    Map<Token, LinkedHashMap<String, Integer>> suggestions = spellingResult.getSuggestions();
    boolean hasFreqInfo = spellingResult.hasTokenFrequencyInfo();
    for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) {
        Token inputToken = entry.getKey();
        String tokenString = new String(inputToken.buffer(), 0, inputToken.length());
        // Work on a copy so the spelling result's own map is left untouched.
        Map<String, Integer> theSuggestions = new LinkedHashMap<>(entry.getValue());
        // A word is never offered as a suggestion for itself.
        theSuggestions.remove(tokenString);
        if (theSuggestions.isEmpty() && !shardRequest) {
            continue;
        }
        SimpleOrderedMap suggestionList = new SimpleOrderedMap();
        suggestionList.add("numFound", theSuggestions.size());
        suggestionList.add("startOffset", inputToken.startOffset());
        suggestionList.add("endOffset", inputToken.endOffset());
        if (extendedResults && hasFreqInfo) {
            suggestionList.add("origFreq", spellingResult.getTokenFrequency(inputToken));
            ArrayList<SimpleOrderedMap> sugs = new ArrayList<>();
            suggestionList.add("suggestion", sugs);
            for (Map.Entry<String, Integer> suggEntry : theSuggestions.entrySet()) {
                SimpleOrderedMap sugEntry = new SimpleOrderedMap();
                sugEntry.add("word", suggEntry.getKey());
                sugEntry.add("freq", suggEntry.getValue());
                sugs.add(sugEntry);
            }
        } else {
            suggestionList.add("suggestion", theSuggestions.keySet());
        }
        result.add(tokenString, suggestionList);
    }
    return result;
}
Also used : NamedList(org.apache.solr.common.util.NamedList) ArrayList(java.util.ArrayList) Token(org.apache.lucene.analysis.Token) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) LinkedHashMap(java.util.LinkedHashMap) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap)

Example 5 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class TokenSourcesTest method testPayloads.

/**
 * LUCENE-5294: indexes a single document from a canned token stream with term vectors
 * (offsets, positions and payloads) enabled, then checks that the token stream rebuilt
 * from those term vectors yields the same term text, position increment, payload and
 * offsets for every token, and nothing more.
 */
public void testPayloads() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);

    FieldType vectorFieldType = new FieldType(TextField.TYPE_NOT_STORED);
    vectorFieldType.setStoreTermVectors(true);
    vectorFieldType.setStoreTermVectorOffsets(true);
    vectorFieldType.setStoreTermVectorPositions(true);
    vectorFieldType.setStoreTermVectorPayloads(true);

    // Reset the shared offset counter before the expected tokens are built.
    curOffset = 0;
    Token[] expected = { getToken("foxes"), getToken("can"), getToken("jump"), getToken("high") };

    Document document = new Document();
    document.add(new Field("field", new CannedTokenStream(expected), vectorFieldType));
    indexWriter.addDocument(document);
    IndexReader indexReader = indexWriter.getReader();
    indexWriter.close();
    assertEquals(1, indexReader.numDocs());

    TokenStream stream = TokenSources.getTermVectorTokenStreamOrNull("field", indexReader.getTermVectors(0), -1);
    CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = stream.getAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsets = stream.getAttribute(OffsetAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);
    stream.reset();

    for (int i = 0; i < expected.length; i++) {
        Token want = expected[i];
        assertTrue(stream.incrementToken());
        assertEquals(want.toString(), term.toString());
        assertEquals(want.getPositionIncrement(), posInc.getPositionIncrement());
        assertEquals(want.getPayload(), payload.getPayload());
        assertEquals(want.startOffset(), offsets.startOffset());
        assertEquals(want.endOffset(), offsets.endOffset());
    }
    // The rebuilt stream must be exhausted after the expected tokens.
    assertFalse(stream.incrementToken());

    indexReader.close();
    directory.close();
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) IndexReader(org.apache.lucene.index.IndexReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)

Aggregations

Token (org.apache.lucene.analysis.Token)98 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)39 TokenStream (org.apache.lucene.analysis.TokenStream)28 Directory (org.apache.lucene.store.Directory)24 Test (org.junit.Test)20 Document (org.apache.lucene.document.Document)19 TextField (org.apache.lucene.document.TextField)19 BytesRef (org.apache.lucene.util.BytesRef)16 NamedList (org.apache.solr.common.util.NamedList)16 ArrayList (java.util.ArrayList)13 Map (java.util.Map)13 Field (org.apache.lucene.document.Field)13 StringReader (java.io.StringReader)12 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)12 Analyzer (org.apache.lucene.analysis.Analyzer)11 FieldType (org.apache.lucene.document.FieldType)11 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)10 Tokenizer (org.apache.lucene.analysis.Tokenizer)9 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)8 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)8