Search in sources :

Example 1 with PositionIncrementAttribute

use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project elasticsearch by elastic.

the class MapperQueryParser method getPossiblyAnalyzedPrefixQuery.

/**
 * Builds a prefix query for {@code termStr}, analyzing the term first when the parser
 * settings ask for wildcard analysis.
 *
 * <p>Without wildcard analysis, or when the token stream cannot be opened/reset, the
 * superclass prefix query on the raw term is returned. Otherwise the analyzed tokens are
 * grouped by position (a token with a position increment of zero joins the previous group):
 * every position except the last is matched exactly, and only the last position is matched
 * as a prefix.
 *
 * @param field   field the query targets
 * @param termStr raw prefix text as typed by the user
 * @return the resulting query, or {@code null} when analysis yields no tokens
 * @throws ParseException if the superclass fails to build a prefix or boolean query
 */
private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (settings.analyzeWildcard() == false) {
        return super.getPrefixQuery(field, termStr);
    }
    // One inner list per token position, in stream order.
    final List<List<String>> termsByPosition = new ArrayList<>();
    TokenStream stream = null;
    try {
        try {
            stream = getAnalyzer().tokenStream(field, termStr);
            stream.reset();
        } catch (IOException e) {
            // The stream could not be started: fall back to the unanalyzed prefix.
            return super.getPrefixQuery(field, termStr);
        }
        final CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
        final PositionIncrementAttribute incrementAttribute = stream.addAttribute(PositionIncrementAttribute.class);
        List<String> pending = new ArrayList<>();
        for (;;) {
            boolean advanced;
            try {
                advanced = stream.incrementToken();
            } catch (IOException e) {
                // A read failure simply ends tokenization; tokens gathered so far are kept.
                advanced = false;
            }
            if (advanced == false) {
                break;
            }
            // A positive increment starts a new position, unless nothing has been collected yet.
            if (!pending.isEmpty() && incrementAttribute.getPositionIncrement() > 0) {
                termsByPosition.add(pending);
                pending = new ArrayList<>();
            }
            pending.add(termAttribute.toString());
        }
        if (!pending.isEmpty()) {
            termsByPosition.add(pending);
        }
    } finally {
        if (stream != null) {
            IOUtils.closeWhileHandlingException(stream);
        }
    }
    if (termsByPosition.isEmpty()) {
        return null;
    }
    final List<String> firstPosition = termsByPosition.get(0);
    if (termsByPosition.size() == 1 && firstPosition.size() == 1) {
        // Exactly one token: a plain prefix query on the analyzed form is enough.
        return super.getPrefixQuery(field, firstPosition.get(0));
    }
    // Combine all positions in a boolean query; only the last position is a prefix match.
    final int lastIndex = termsByPosition.size() - 1;
    final List<BooleanClause> clauses = new ArrayList<>();
    int index = 0;
    for (List<String> termsAtPosition : termsByPosition) {
        final boolean last = index++ == lastIndex;
        final boolean single = termsAtPosition.size() == 1;
        final Query positionQuery;
        if (last) {
            if (single) {
                positionQuery = super.getPrefixQuery(field, termsAtPosition.get(0));
            } else {
                // Several tokens share the last position: any of their prefixes may match.
                final List<BooleanClause> alternatives = new ArrayList<>();
                for (String token : termsAtPosition) {
                    alternatives.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
                }
                positionQuery = getBooleanQueryCoordDisabled(alternatives);
            }
        } else if (single) {
            positionQuery = newTermQuery(new Term(field, termsAtPosition.get(0)));
        } else {
            // Several tokens share a non-final position: treat them as synonyms.
            final Term[] synonyms = new Term[termsAtPosition.size()];
            int slot = 0;
            for (String token : termsAtPosition) {
                synonyms[slot++] = new Term(field, token);
            }
            positionQuery = new SynonymQuery(synonyms);
        }
        final BooleanClause.Occur occur = getDefaultOperator() == Operator.AND
                ? BooleanClause.Occur.MUST
                : BooleanClause.Occur.SHOULD;
        clauses.add(new BooleanClause(positionQuery, occur));
    }
    return getBooleanQuery(clauses);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) Query(org.apache.lucene.search.Query) MatchNoDocsQuery(org.apache.lucene.search.MatchNoDocsQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) MultiPhraseQuery(org.apache.lucene.search.MultiPhraseQuery) SpanQuery(org.apache.lucene.search.spans.SpanQuery) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) FuzzyQuery(org.apache.lucene.search.FuzzyQuery) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) SynonymQuery(org.apache.lucene.search.SynonymQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BoostQuery(org.apache.lucene.search.BoostQuery) SpanOrQuery(org.apache.lucene.search.spans.SpanOrQuery) SynonymQuery(org.apache.lucene.search.SynonymQuery) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Term(org.apache.lucene.index.Term) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) BooleanClause(org.apache.lucene.search.BooleanClause) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) List(java.util.List)

Example 2 with PositionIncrementAttribute

use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project elasticsearch by elastic.

the class TokenCountFieldMapper method countPositions.

/**
 * Sums the position increments reported by a token stream, including the increment that is
 * reported after {@code end()} has been called on the stream. Package private for testing.
 *
 * @param analyzer analyzer used to create the token stream
 * @param fieldName field name handed to the analyzer
 * @param fieldValue field value handed to the analyzer
 * @return total of all position increments seen in the token stream
 * @throws IOException if the token stream throws it
 */
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
    try (TokenStream stream = analyzer.tokenStream(fieldName, fieldValue)) {
        final PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        int total = 0;
        for (boolean more = stream.incrementToken(); more; more = stream.incrementToken()) {
            total += increment.getPositionIncrement();
        }
        // After end() the attribute carries the final increment, which is counted as well.
        stream.end();
        return total + increment.getPositionIncrement();
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 3 with PositionIncrementAttribute

use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project elasticsearch by elastic.

the class TransportAnalyzeAction method simpleAnalyze.

/**
 * Runs every text of the request through the analyzer and returns the produced tokens as one
 * flat list. Positions and offsets keep increasing across texts: after each text the running
 * totals are advanced by the stream's end-of-stream values plus the analyzer's position
 * increment gap and offset gap for the field.
 *
 * @param request  analyze request supplying the texts
 * @param analyzer analyzer used to tokenize each text
 * @param field    field name passed to the analyzer
 * @return the tokens of all texts, in order
 * @throws ElasticsearchException wrapping any {@link IOException} raised by a token stream
 */
private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
    final List<AnalyzeResponse.AnalyzeToken> result = new ArrayList<>();
    int position = -1;
    int offsetBase = 0;
    for (String text : request.text()) {
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            stream.reset();
            final CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            final PositionIncrementAttribute incrementAttr = stream.addAttribute(PositionIncrementAttribute.class);
            final OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);
            final TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);
            final PositionLengthAttribute lengthAttr = stream.addAttribute(PositionLengthAttribute.class);
            while (stream.incrementToken()) {
                final int step = incrementAttr.getPositionIncrement();
                // Tokens with a zero increment stay on the previous position.
                if (step > 0) {
                    position += step;
                }
                final String termText = termAttr.toString();
                final int start = offsetBase + offsetAttr.startOffset();
                final int end = offsetBase + offsetAttr.endOffset();
                result.add(new AnalyzeResponse.AnalyzeToken(termText, position, start, end, lengthAttr.getPositionLength(), typeAttr.type(), null));
            }
            stream.end();
            // Carry the end-of-stream state and the per-value gaps over to the next text.
            final int finalOffset = offsetAttr.endOffset();
            final int finalIncrement = incrementAttr.getPositionIncrement();
            position += finalIncrement + analyzer.getPositionIncrementGap(field);
            offsetBase += finalOffset + analyzer.getOffsetGap(field);
        } catch (IOException e) {
            throw new ElasticsearchException("failed to analyze", e);
        }
    }
    return result;
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) IOException(java.io.IOException) ElasticsearchException(org.elasticsearch.ElasticsearchException) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)

Example 4 with PositionIncrementAttribute

use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project Solbase by Photobucket.

the class IndexWriter method parseDoc.

/**
 * Breaks a Lucene document down into the pieces written to the backing store: one
 * {@link TermDocMetadata} per indexed term, a {@link Put} holding the stored field values and
 * the list of all indexed terms, and the {@link ParsedDoc} that bundles them.
 *
 * <p>For each field of the document:
 * <ul>
 *   <li>indexed and tokenized fields are run through their token stream and, per term, the
 *       term frequency, optional positions, optional offsets, sort values and (unless norms
 *       are omitted) the encoded norm are collected;</li>
 *   <li>indexed but untokenized fields contribute a single term with empty frequency and
 *       position lists;</li>
 *   <li>stored fields are buffered by name, values of equally named fields being joined with
 *       {@code SolbaseUtil.delimiter}.</li>
 * </ul>
 *
 * @param doc            document to parse
 * @param analyzer       analyzer used for fields that do not carry their own token stream
 * @param indexName      not read by this method
 * @param docNumber      numeric id of the document
 * @param sortFieldNames names of the fields whose integer values are stored as sort values
 *                       with every term
 * @return the parsed representation of {@code doc}
 * @throws CorruptIndexException declared for callers; not raised directly by the code here
 * @throws IOException           if a token stream fails
 * @throws NumberFormatException if a sort field value is not a parseable integer
 */
@SuppressWarnings("unchecked")
public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException {
    // given doc, what are all of terms we indexed
    List<Term> allIndexedTerms = new ArrayList<Term>();
    Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024);
    // need to hold onto TermDocMetaData, so it can return this array
    List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>();
    // running token position; shared by all tokenized fields of the document
    int position = 0;
    for (Fieldable field : (List<Fieldable>) doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }
            // NOTE(review): the stream is never end()ed or close()d here — confirm whether the caller owns it.
            // collect term information per field
            Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>();
            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }
            // reset the TokenStream to the first token
            tokens.reset();
            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector()) {
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
            }
            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector()) {
                posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);
            }
            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            // store normalizations of field per term per document rather than per field.
            // this adds more to write but less to read on other side
            int tokensInField = 0;
            while (tokens.incrementToken()) {
                tokensInField++;
                Term term = new Term(field.name(), termAttribute.term());
                allIndexedTerms.add(term);
                // fetch all collected information for this term
                Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);
                if (termInfo == null) {
                    termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }
                // term frequency: a one-element list used as a mutable counter
                List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes);
                if (termFrequency == null) {
                    termFrequency = new ArrayList<Number>();
                    termFrequency.add(Integer.valueOf(0));
                    termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency);
                }
                // increment
                termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                // position vector
                if (field.isStorePositionWithTermVector()) {
                    // together with the ++position below this advances by the full position increment
                    position += (posIncrAttribute.getPositionIncrement() - 1);
                    List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes);
                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector);
                    }
                    positionVector.add(++position);
                }
                // term offsets, stored as consecutive (start, end) pairs
                if (field.isStoreOffsetWithTermVector()) {
                    List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector);
                    }
                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());
                }
                List<Number> sortValues = new ArrayList<Number>();
                // init sortValues: -1 marks a slot without a value
                for (int i = 0; i < Scorer.numSort; i++) {
                    sortValues.add(Integer.valueOf(-1));
                }
                int order = 0;
                // extract sort field value and store it in term doc metadata obj
                for (String fieldName : sortFieldNames) {
                    Fieldable fieldable = doc.getFieldable(fieldName);
                    if (fieldable instanceof EmbeddedSortField) {
                        EmbeddedSortField sortField = (EmbeddedSortField) fieldable;
                        int value = -1;
                        if (sortField.stringValue() != null) {
                            value = Integer.parseInt(sortField.stringValue());
                        }
                        // sort slots are 1-based
                        int sortSlot = sortField.getSortSlot();
                        sortValues.set(sortSlot - 1, Integer.valueOf(value));
                    } else {
                        // TODO: this logic is used for real time indexing.
                        // hacky. depending on order of sort field names in array
                        int value = -1;
                        // a sort field that is missing from the document keeps the -1 default
                        // instead of failing with a NullPointerException
                        if (fieldable != null && fieldable.stringValue() != null) {
                            value = Integer.parseInt(fieldable.stringValue());
                        }
                        sortValues.set(order++, Integer.valueOf(value));
                    }
                }
                termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues);
            }
            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }
            for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) {
                Term tempTerm = term.getKey();
                byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm);
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm);
                }
                TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes, tempTerm);
                metadatas.add(data);
            }
        }
        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            Term term = new Term(field.name(), field.stringValue());
            allIndexedTerms.add(term);
            byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term);
            Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
            termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {}));
            termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {}));
            TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term);
            metadatas.add(data);
        }
        // Stores each field as a column under this doc key
        if (field.isStored()) {
            byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
            // trailing byte flags if binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);
            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);
            // logic to handle multiple fields w/ same name
            byte[] currentValue = fieldCache.get(field.name());
            if (currentValue == null) {
                fieldCache.put(field.name(), value);
            } else {
                // append new data: existing bytes minus their flag byte, the delimiter,
                // then the new value including its own flag byte
                byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1];
                System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1);
                System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1, SolbaseUtil.delimiter.length);
                System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1, value.length);
                fieldCache.put(field.name(), newValue);
            }
        }
    }
    Put documentPut = new Put(SolbaseUtil.randomize(docNumber));
    // Store each field as a column under this docId
    for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) {
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue());
    }
    // in case of real time update, we need to add back docId field
    if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) {
        byte[] docIdStr = Bytes.toBytes(Integer.toString(docNumber));
        // trailing byte flags if binary or not
        byte[] value = new byte[docIdStr.length + 1];
        System.arraycopy(docIdStr, 0, value, 0, docIdStr.length);
        value[value.length - 1] = (byte) (Byte.MIN_VALUE);
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value);
    }
    // Finally, Store meta-data so we can delete this document
    documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allIndexedTerms).array());
    ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms);
    return parsedDoc;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) EmbeddedSortField(org.apache.lucene.document.EmbeddedSortField) Fieldable(org.apache.lucene.document.Fieldable) StringReader(java.io.StringReader) ArrayList(java.util.ArrayList) List(java.util.List) TermAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute) ConcurrentSkipListMap(java.util.concurrent.ConcurrentSkipListMap) Term(org.apache.lucene.index.Term) ByteBuffer(java.nio.ByteBuffer) Put(org.apache.hadoop.hbase.client.Put) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) ParsedDoc(org.solbase.indexer.ParsedDoc) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) HashMap(java.util.HashMap) Map(java.util.Map) ConcurrentSkipListMap(java.util.concurrent.ConcurrentSkipListMap)

Example 5 with PositionIncrementAttribute

use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project languagetool by languagetool-org.

the class LanguageToolFilterTest method displayTokensWithFullDetails.

/**
 * Prints every token of the stream to standard output, starting a new line prefixed with
 * the position number whenever a token advances the position. Each token is rendered as
 * {@code [term:startOffset->endOffset:type]}.
 *
 * @param stream token stream to consume; it is read until exhausted
 * @throws IOException if advancing the stream fails
 */
private static void displayTokensWithFullDetails(TokenStream stream) throws IOException {
    final CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute incrementAttribute = stream.addAttribute(PositionIncrementAttribute.class);
    final OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
    final TypeAttribute typeAttribute = stream.addAttribute(TypeAttribute.class);
    int currentPosition = 0;
    while (stream.incrementToken()) {
        final int step = incrementAttribute.getPositionIncrement();
        // Tokens with a zero increment are printed on the line of the position they share.
        if (step > 0) {
            currentPosition += step;
            System.out.println();
            System.out.print(currentPosition + ": ");
        }
        final String head = "[" + termAttribute + ":";
        final String span = offsetAttribute.startOffset() + "->" + offsetAttribute.endOffset();
        System.out.print(head + span + ":" + typeAttribute.type() + "] ");
    }
    System.out.println();
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Aggregations

PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 50; CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 33; TokenStream (org.apache.lucene.analysis.TokenStream): 28; OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 25; IOException (java.io.IOException): 14; ArrayList (java.util.ArrayList): 14; BytesRef (org.apache.lucene.util.BytesRef): 14; PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 11; TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 11; TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 11; StringReader (java.io.StringReader): 8; Term (org.apache.lucene.index.Term): 8; Token (org.apache.lucene.analysis.Token): 7; FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 7; PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute): 7; List (java.util.List): 6; LinkedList (java.util.LinkedList): 4; CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 4; Document (org.apache.lucene.document.Document): 4; Iterator (java.util.Iterator): 3