Search in sources:

Example 1 with AttributeSource

Use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.

The class DirectSpellChecker, method suggestSimilar.

/**
   * Provide spelling corrections based on several parameters.
   *
   * @param term The term to suggest spelling corrections for
   * @param numSug The maximum number of spelling corrections
   * @param ir The index reader to fetch the candidate spelling corrections from
   * @param docfreq The minimum document frequency a potential suggestion needs to have in order to be included
   * @param editDistance The maximum edit distance candidates are allowed to have
   * @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included
   * @param spare a reusable CharsRefBuilder scratch buffer
   * @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order.
   * @throws IOException If I/O related errors occur
   */
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRefBuilder spare) throws IOException {
    AttributeSource atts = new AttributeSource();
    MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
    Terms terms = MultiFields.getTerms(ir, term.field());
    if (terms == null) {
        return Collections.emptyList();
    }
    FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance - 1), true);
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
    BytesRef queryTerm = new BytesRef(term.text());
    BytesRef candidateTerm;
    ScoreTerm st = new ScoreTerm();
    BoostAttribute boostAtt = e.attributes().addAttribute(BoostAttribute.class);
    while ((candidateTerm = e.next()) != null) {
        // For FuzzyQuery, boost is the score:
        float score = boostAtt.getBoost();
        // ignore uncompetitive hits
        if (stQueue.size() >= numSug && score <= stQueue.peek().boost) {
            continue;
        }
        // ignore exact match of the same term
        if (queryTerm.bytesEquals(candidateTerm)) {
            continue;
        }
        int df = e.docFreq();
        // check docFreq if required
        if (df <= docfreq) {
            continue;
        }
        final String termAsString;
        if (distance == INTERNAL_LEVENSHTEIN) {
            // delay creating strings until the end
            termAsString = null;
        } else {
            spare.copyUTF8Bytes(candidateTerm);
            termAsString = spare.toString();
            score = distance.getDistance(term.text(), termAsString);
        }
        if (score < accuracy) {
            continue;
        }
        // add new entry in PQ
        st.term = BytesRef.deepCopyOf(candidateTerm);
        st.boost = score;
        st.docfreq = df;
        st.termAsString = termAsString;
        st.score = score;
        stQueue.offer(st);
        // possibly drop entries from queue
        st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
        maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
    }
    return stQueue;
}
Also used: AttributeSource (org.apache.lucene.util.AttributeSource) FuzzyTermsEnum (org.apache.lucene.search.FuzzyTermsEnum) Terms (org.apache.lucene.index.Terms) BoostAttribute (org.apache.lucene.search.BoostAttribute) MaxNonCompetitiveBoostAttribute (org.apache.lucene.search.MaxNonCompetitiveBoostAttribute) PriorityQueue (java.util.PriorityQueue) BytesRef (org.apache.lucene.util.BytesRef)
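
Example 1 works because FuzzyTermsEnum and its caller share one AttributeSource: the enum publishes each candidate's boost through BoostAttribute, and the caller pushes the current competitive floor back through MaxNonCompetitiveBoostAttribute, letting the enum skip terms that cannot make the queue. The key contract is that addAttribute registers an attribute on first use and returns the very same instance afterwards. A minimal standalone sketch of that contract (not part of DirectSpellChecker; CharTermAttribute is used here purely for illustration):

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;

public class AttributeSourceBasics {
    public static void main(String[] args) {
        AttributeSource atts = new AttributeSource();
        // addAttribute registers the attribute on first use and returns
        // the same instance on every later call for the same interface.
        CharTermAttribute termAtt = atts.addAttribute(CharTermAttribute.class);
        termAtt.setEmpty().append("lucene");

        // A second lookup yields the identical implementation object, so two
        // components holding the same AttributeSource see each other's writes.
        CharTermAttribute again = atts.getAttribute(CharTermAttribute.class);
        System.out.println(again);            // prints "lucene"
        System.out.println(termAtt == again); // prints "true"
    }
}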

Example 2 with AttributeSource

Use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.

The class AnalysisRequestHandlerBase, method convertTokensToNamedLists.

/**
   * Converts the list of Tokens to a list of NamedLists representing the tokens.
   *
   * @param tokenList  Tokens to convert
   * @param context The analysis context
   *
   * @return List of NamedLists containing the relevant information taken from the tokens
   */
private List<NamedList> convertTokensToNamedLists(final List<AttributeSource> tokenList, AnalysisContext context) {
    final List<NamedList> tokensNamedLists = new ArrayList<>();
    final FieldType fieldType = context.getFieldType();
    final AttributeSource[] tokens = tokenList.toArray(new AttributeSource[tokenList.size()]);
    // sort the tokens by absolute position
    ArrayUtil.timSort(tokens, new Comparator<AttributeSource>() {

        @Override
        public int compare(AttributeSource a, AttributeSource b) {
            return arrayCompare(a.getAttribute(TokenTrackingAttribute.class).getPositions(), b.getAttribute(TokenTrackingAttribute.class).getPositions());
        }

        private int arrayCompare(int[] a, int[] b) {
            int p = 0;
            final int stop = Math.min(a.length, b.length);
            while (p < stop) {
                int diff = a[p] - b[p];
                if (diff != 0)
                    return diff;
                p++;
            }
            // One is a prefix of the other, or, they are equal:
            return a.length - b.length;
        }
    });
    for (int i = 0; i < tokens.length; i++) {
        AttributeSource token = tokens[i];
        final NamedList<Object> tokenNamedList = new SimpleOrderedMap<>();
        final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
        BytesRef rawBytes = termAtt.getBytesRef();
        final String text = fieldType.indexedToReadable(rawBytes, new CharsRefBuilder()).toString();
        tokenNamedList.add("text", text);
        if (token.hasAttribute(CharTermAttribute.class)) {
            final String rawText = token.getAttribute(CharTermAttribute.class).toString();
            if (!rawText.equals(text)) {
                tokenNamedList.add("raw_text", rawText);
            }
        }
        tokenNamedList.add("raw_bytes", rawBytes.toString());
        if (context.getTermsToMatch().contains(rawBytes)) {
            tokenNamedList.add("match", true);
        }
        token.reflectWith(new AttributeReflector() {

            @Override
            public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
                // leave out position and bytes term
                if (TermToBytesRefAttribute.class.isAssignableFrom(attClass))
                    return;
                if (CharTermAttribute.class.isAssignableFrom(attClass))
                    return;
                if (PositionIncrementAttribute.class.isAssignableFrom(attClass))
                    return;
                String k = attClass.getName() + '#' + key;
                // map keys for "standard attributes":
                if (ATTRIBUTE_MAPPING.containsKey(k)) {
                    k = ATTRIBUTE_MAPPING.get(k);
                }
                if (value instanceof BytesRef) {
                    final BytesRef p = (BytesRef) value;
                    value = p.toString();
                }
                tokenNamedList.add(k, value);
            }
        });
        tokensNamedLists.add(tokenNamedList);
    }
    return tokensNamedLists;
}
Also used: AttributeSource (org.apache.lucene.util.AttributeSource) NamedList (org.apache.solr.common.util.NamedList) AttributeReflector (org.apache.lucene.util.AttributeReflector) ArrayList (java.util.ArrayList) SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap) FieldType (org.apache.solr.schema.FieldType) PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder) BytesRef (org.apache.lucene.util.BytesRef)
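
The interesting move in Example 2 is token.reflectWith(...): instead of hard-coding every attribute, the handler asks each token to enumerate its own (attribute class, key, value) triples, then filters out the triples it has already rendered elsewhere. A standalone sketch of that enumeration (the attribute values below are arbitrary):

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource;

public class ReflectWithDemo {
    public static void main(String[] args) {
        AttributeSource token = new AttributeSource();
        token.addAttribute(CharTermAttribute.class).setEmpty().append("solr");
        token.addAttribute(OffsetAttribute.class).setOffset(0, 4);

        // reflectWith invokes the callback once per (attribute class, key, value)
        // triple; this is how the handler above flattens a token into a NamedList.
        token.reflectWith((attClass, key, value) ->
            System.out.println(attClass.getSimpleName() + "#" + key + " = " + value));
    }
}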

Example 3 with AttributeSource

Use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.

The class AnalysisRequestHandlerBase, method analyzeValue.

/**
   * Analyzes the given value using the given Analyzer.
   *
   * @param value   Value to analyze
   * @param context The {@link AnalysisContext analysis context}.
   *
   * @return NamedList containing the tokens produced by analyzing the given value
   */
protected NamedList<? extends Object> analyzeValue(String value, AnalysisContext context) {
    Analyzer analyzer = context.getAnalyzer();
    if (!TokenizerChain.class.isInstance(analyzer)) {
        try (TokenStream tokenStream = analyzer.tokenStream(context.getFieldName(), value)) {
            NamedList<List<NamedList>> namedList = new NamedList<>();
            namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
            return namedList;
        } catch (IOException e) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
        }
    }
    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
    TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
    TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();
    NamedList<Object> namedList = new NamedList<>();
    if (0 < cfiltfacs.length) {
        String source = value;
        for (CharFilterFactory cfiltfac : cfiltfacs) {
            Reader reader = new StringReader(source);
            reader = cfiltfac.create(reader);
            source = writeCharStream(namedList, reader);
        }
    }
    TokenStream tokenStream = tfac.create();
    ((Tokenizer) tokenStream).setReader(tokenizerChain.initReader(null, new StringReader(value)));
    List<AttributeSource> tokens = analyzeTokenStream(tokenStream);
    namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
    ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokenStream, tokens);
    for (TokenFilterFactory tokenFilterFactory : filtfacs) {
        for (final AttributeSource tok : tokens) {
            tok.getAttribute(TokenTrackingAttribute.class).freezeStage();
        }
        // overwrite the vars "tokenStream", "tokens", and "listBasedTokenStream"
        tokenStream = tokenFilterFactory.create(listBasedTokenStream);
        tokens = analyzeTokenStream(tokenStream);
        namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
        listBasedTokenStream = new ListBasedTokenStream(listBasedTokenStream, tokens);
    }
    return namedList;
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream) TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory) AttributeSource (org.apache.lucene.util.AttributeSource) NamedList (org.apache.solr.common.util.NamedList) CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory) Reader (java.io.Reader) StringReader (java.io.StringReader) IOException (java.io.IOException) Analyzer (org.apache.lucene.analysis.Analyzer) TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory) TokenizerChain (org.apache.solr.analysis.TokenizerChain) ArrayList (java.util.ArrayList) List (java.util.List) Tokenizer (org.apache.lucene.analysis.Tokenizer) SolrException (org.apache.solr.common.SolrException)
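
Example 3 leans on analyzeTokenStream, which is not shown here; judging from how its result is consumed (a List<AttributeSource> with one entry per token, later sorted and reflected over), it plausibly snapshots each token with cloneAttributes(). The sketch below is a stand-in under that assumption, using a stock StandardAnalyzer in place of a Solr TokenizerChain:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.AttributeSource;

public class TokenSnapshotDemo {
    // Hypothetical stand-in for analyzeTokenStream: one independent
    // AttributeSource snapshot per emitted token.
    static List<AttributeSource> snapshotTokens(Analyzer analyzer, String field, String text) throws IOException {
        List<AttributeSource> tokens = new ArrayList<>();
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            stream.reset();
            while (stream.incrementToken()) {
                // cloneAttributes() deep-copies the current attribute values,
                // so later incrementToken() calls cannot overwrite the snapshot.
                tokens.add(stream.cloneAttributes());
            }
            stream.end();
        }
        return tokens;
    }

    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            System.out.println(snapshotTokens(analyzer, "body", "hello attribute world").size()); // prints "3"
        }
    }
}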

Example 4 with AttributeSource

Use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.

The class ShingleFilter, method getNextToken.

/**
   * <p>Get the next token from the input stream.
   * <p>If the next token has <code>positionIncrement &gt; 1</code>,
   * <code>positionIncrement - 1</code> {@link #fillerToken}s are
   * inserted first.
   * @param target Where to put the new token; if null, a new instance is created.
   * @return On success, the populated token; null otherwise
   * @throws IOException if the input stream has a problem
   */
private InputWindowToken getNextToken(InputWindowToken target) throws IOException {
    InputWindowToken newTarget = target;
    if (numFillerTokensToInsert > 0) {
        if (null == target) {
            newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
        } else {
            nextInputStreamToken.copyTo(target.attSource);
        }
        // A filler token occupies no space
        newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset());
        newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
        newTarget.isFiller = true;
        --numFillerTokensToInsert;
    } else if (isNextInputStreamToken) {
        if (null == target) {
            newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
        } else {
            nextInputStreamToken.copyTo(target.attSource);
        }
        isNextInputStreamToken = false;
        newTarget.isFiller = false;
    } else if (!exhausted) {
        if (input.incrementToken()) {
            if (null == target) {
                newTarget = new InputWindowToken(cloneAttributes());
            } else {
                this.copyTo(target.attSource);
            }
            if (posIncrAtt.getPositionIncrement() > 1) {
                // Each output shingle must contain at least one input token, 
                // so no more than (maxShingleSize - 1) filler tokens will be inserted.
                numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
                // Save the current token as the next input stream token
                if (null == nextInputStreamToken) {
                    nextInputStreamToken = cloneAttributes();
                } else {
                    this.copyTo(nextInputStreamToken);
                }
                isNextInputStreamToken = true;
                // A filler token occupies no space
                newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
                newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
                newTarget.isFiller = true;
                --numFillerTokensToInsert;
            } else {
                newTarget.isFiller = false;
            }
        } else {
            exhausted = true;
            input.end();
            endState = captureState();
            numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
            if (numFillerTokensToInsert > 0) {
                nextInputStreamToken = new AttributeSource(getAttributeFactory());
                nextInputStreamToken.addAttribute(CharTermAttribute.class);
                OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class);
                newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
                // Recurse/loop just once:
                return getNextToken(target);
            } else {
                newTarget = null;
            }
        }
    } else {
        newTarget = null;
    }
    return newTarget;
}
Also used: AttributeSource (org.apache.lucene.util.AttributeSource) OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)
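
Example 4's lookahead buffering rests on two AttributeSource primitives: cloneAttributes() creates an independent snapshot of the current token, and copyTo(...) overwrites an existing snapshot in place, avoiding a fresh allocation on every token. A minimal sketch of the pair (the term values are arbitrary):

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;

public class CloneAndCopyDemo {
    public static void main(String[] args) {
        AttributeSource current = new AttributeSource();
        current.addAttribute(CharTermAttribute.class).setEmpty().append("shingle");

        // cloneAttributes() returns a deep copy with identical attribute
        // implementations, exactly what copyTo(...) requires of its target.
        AttributeSource buffered = current.cloneAttributes();

        // Mutating the original does not affect the buffered snapshot ...
        current.getAttribute(CharTermAttribute.class).setEmpty().append("next");
        System.out.println(buffered.getAttribute(CharTermAttribute.class)); // prints "shingle"

        // ... until copyTo pushes the current values into the existing target.
        current.copyTo(buffered);
        System.out.println(buffered.getAttribute(CharTermAttribute.class)); // prints "next"
    }
}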

Example 5 with AttributeSource

Use of org.apache.lucene.util.AttributeSource in project commons by twitter.

The class TokenTypeAttributeSerializerTest, method serialize.

private byte[] serialize(TokenType tokenType) throws IOException {
    AttributeSource attributeSource = new AttributeSource();
    TokenTypeAttribute tokenTypeAttribute = attributeSource.addAttribute(TokenTypeAttribute.class);
    tokenTypeAttribute.setType(tokenType);
    TokenTypeAttributeSerializer serializer = new TokenTypeAttributeSerializer();
    serializer.initialize(attributeSource, TokenStreamSerializer.CURRENT_VERSION);
    ByteArrayOutputStream output = new ByteArrayOutputStream();
    TokenStreamSerializer.AttributeOutputStream outputStream = new TokenStreamSerializer.AttributeOutputStream(output);
    serializer.serialize(outputStream);
    return output.toByteArray();
}
Also used: AttributeSource (org.apache.lucene.util.AttributeSource) TokenTypeAttribute (com.twitter.common.text.token.attribute.TokenTypeAttribute) ByteArrayOutputStream (java.io.ByteArrayOutputStream)
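
TokenTypeAttribute and TokenTypeAttributeSerializer are Twitter-specific, but the setup pattern is plain Lucene: create an AttributeSource, register an attribute, set its value, and hand the source to a consumer that reads the value back. The same pattern using Lucene's stock TypeAttribute in place of the Twitter attribute (the "HASHTAG" value is arbitrary):

import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

public class TypeAttributeDemo {
    public static void main(String[] args) {
        AttributeSource attributeSource = new AttributeSource();
        // Register the attribute and set a value, as the test above does
        // with the Twitter TokenTypeAttribute.
        TypeAttribute typeAtt = attributeSource.addAttribute(TypeAttribute.class);
        typeAtt.setType("HASHTAG");

        // Any consumer holding the same AttributeSource reads the value back.
        System.out.println(attributeSource.getAttribute(TypeAttribute.class).type()); // prints "HASHTAG"
    }
}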

Aggregations

AttributeSource (org.apache.lucene.util.AttributeSource) 10
IOException (java.io.IOException) 3
ArrayList (java.util.ArrayList) 3
BytesRef (org.apache.lucene.util.BytesRef) 3
TokenTypeAttribute (com.twitter.common.text.token.attribute.TokenTypeAttribute) 2
TokenStream (org.apache.lucene.analysis.TokenStream) 2
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute) 2
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) 2
Terms (org.apache.lucene.index.Terms) 2
BoostAttribute (org.apache.lucene.search.BoostAttribute) 2
FuzzyTermsEnum (org.apache.lucene.search.FuzzyTermsEnum) 2
MaxNonCompetitiveBoostAttribute (org.apache.lucene.search.MaxNonCompetitiveBoostAttribute) 2
NamedList (org.apache.solr.common.util.NamedList) 2
ByteArrayInputStream (java.io.ByteArrayInputStream) 1
ByteArrayOutputStream (java.io.ByteArrayOutputStream) 1
Reader (java.io.Reader) 1
StringReader (java.io.StringReader) 1
HashSet (java.util.HashSet) 1
List (java.util.List) 1
PriorityQueue (java.util.PriorityQueue) 1