Example 6 with PositionIncrementAttribute

Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project languagetool by languagetool-org.

From class LanguageToolFilterTest, method displayTokensWithFullDetails.

private static void displayTokensWithFullDetails(TokenStream stream) throws IOException {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    int position = 0;
    // the stream must be reset before the first incrementToken() call
    stream.reset();
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }
        System.out.print("[" + term + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + "] ");
    }
    System.out.println();
    // signal end-of-stream so final offsets are recorded (TokenStream contract)
    stream.end();
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
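
For context, a minimal driver for this helper, as a sketch only: it assumes a Lucene version where WhitespaceAnalyzer has a no-argument constructor, and the field name and sample text are invented for illustration.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;

// Hypothetical driver: runs a short text through an analyzer and prints
// one line per position, e.g. "1: [This:0->4:word]".
public static void main(String[] args) throws Exception {
    try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer()) {
        TokenStream stream = analyzer.tokenStream("text", new StringReader("This is a test"));
        displayTokensWithFullDetails(stream);
    }
}

Tokens emitted with a position increment of 0 (for example, synonyms injected by a SynonymFilter) print on the same numbered line, which is exactly what the increment > 0 check guards.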

Example 7 with PositionIncrementAttribute

Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project Solbase by Photobucket.

From class IndexWriter, method parseDoc.

@SuppressWarnings("unchecked")
public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException {
    // given a doc, collect all of the terms we indexed
    List<Term> allIndexedTerms = new ArrayList<Term>();
    Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024);
    // need to hold onto the TermDocMetadata objects so they can be returned together
    List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>();
    byte[] docId = Bytes.toBytes(docNumber);
    int position = 0;
    for (Fieldable field : (List<Fieldable>) doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }
            // collect term information per field
            Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>();
            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }
            // reset the TokenStream to the first token
            tokens.reset();
            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);
            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            // store field norms per term per document rather than per field;
            // this adds more to write but less to read on the other side
            int tokensInField = 0;
            while (tokens.incrementToken()) {
                tokensInField++;
                Term term = new Term(field.name(), termAttribute.term());
                allIndexedTerms.add(term);
                // fetch all collected information for this term
                Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);
                if (termInfo == null) {
                    termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }
                // term frequency
                List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes);
                if (termFrequency == null) {
                    termFrequency = new ArrayList<Number>();
                    termFrequency.add(0);
                    termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency);
                }
                // increment
                termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                // position vector
                if (field.isStorePositionWithTermVector()) {
                    position += (posIncrAttribute.getPositionIncrement() - 1);
                    List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes);
                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector);
                    }
                    positionVector.add(++position);
                }
                // term offsets
                if (field.isStoreOffsetWithTermVector()) {
                    List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector);
                    }
                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());
                }
                List<Number> sortValues = new ArrayList<Number>();
                // init sortValues
                for (int i = 0; i < Scorer.numSort; i++) {
                    sortValues.add(-1);
                }
                int order = 0;
                // extract sort field value and store it in term doc metadata obj
                for (String fieldName : sortFieldNames) {
                    Fieldable fieldable = doc.getFieldable(fieldName);
                    if (fieldable instanceof EmbeddedSortField) {
                        EmbeddedSortField sortField = (EmbeddedSortField) fieldable;
                        int value = -1;
                        if (sortField.stringValue() != null) {
                            value = Integer.parseInt(sortField.stringValue());
                        }
                        int sortSlot = sortField.getSortSlot();
                        sortValues.set(sortSlot - 1, value);
                    } else {
                        // TODO: this logic is used for real time indexing.
                        // hacky. depending on order of sort field names in array
                        int value = -1;
                        if (fieldable.stringValue() != null) {
                            value = Integer.parseInt(fieldable.stringValue());
                        }
                        sortValues.set(order++, value);
                    }
                }
                termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues);
            }
            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }
            for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) {
                Term tempTerm = term.getKey();
                byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm);
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm);
                }
                TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes, tempTerm);
                metadatas.add(data);
            }
        }
        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            Term term = new Term(field.name(), field.stringValue());
            allIndexedTerms.add(term);
            byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term);
            Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
            termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {}));
            termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {}));
            TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term);
            metadatas.add(data);
        }
        // Stores each field as a column under this doc key
        if (field.isStored()) {
            byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
            // first byte flags if binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);
            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);
            // logic to handle multiple fields w/ same name
            byte[] currentValue = fieldCache.get(field.name());
            if (currentValue == null) {
                fieldCache.put(field.name(), value);
            } else {
                // append new data
                byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1];
                System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1);
                System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1, SolbaseUtil.delimiter.length);
                System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1, value.length);
                fieldCache.put(field.name(), newValue);
            }
        }
    }
    Put documentPut = new Put(SolbaseUtil.randomize(docNumber));
    // Store each field as a column under this docId
    for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) {
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue());
    }
    // in case of real time update, we need to add back docId field
    if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) {
        byte[] docIdStr = Bytes.toBytes(Integer.toString(docNumber));
        // first byte flags if binary or not
        byte[] value = new byte[docIdStr.length + 1];
        System.arraycopy(docIdStr, 0, value, 0, docIdStr.length);
        value[value.length - 1] = (byte) (Byte.MIN_VALUE);
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value);
    }
    // Finally, Store meta-data so we can delete this document
    documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allIndexedTerms).array());
    ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms);
    return parsedDoc;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) EmbeddedSortField(org.apache.lucene.document.EmbeddedSortField) Fieldable(org.apache.lucene.document.Fieldable) StringReader(java.io.StringReader) List(java.util.List) TermAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute) ConcurrentSkipListMap(java.util.concurrent.ConcurrentSkipListMap) Term(org.apache.lucene.index.Term) ByteBuffer(java.nio.ByteBuffer) Put(org.apache.hadoop.hbase.client.Put) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) ParsedDoc(org.solbase.indexer.ParsedDoc) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Map(java.util.Map)
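
The position bookkeeping in the term-vector branch above is easy to misread: position += (inc - 1) first skips over any gap left by removed tokens, and the pre-increment in positionVector.add(++position) then assigns this token its 1-based slot. A standalone sketch of the same arithmetic (illustrative only, not Solbase code):

import java.util.ArrayList;
import java.util.List;

// Mirrors the Solbase position-vector arithmetic with hard-coded increments.
public static void main(String[] args) {
    int[] increments = { 1, 1, 2, 1 }; // a gap of 2 where a stopword was removed
    List<Integer> positionVector = new ArrayList<>();
    int position = 0;
    for (int inc : increments) {
        position += (inc - 1);          // skip over removed tokens
        positionVector.add(++position); // record the 1-based absolute position
    }
    System.out.println(positionVector); // prints [1, 2, 4, 5]
}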

Example 8 with PositionIncrementAttribute

Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project zm-mailbox by Zimbra.

From class UniversalAnalyzerTest, method testSTD.

private void testSTD(String src) throws IOException {
    TokenStream std = standardAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute stdTermAttr = std.addAttribute(CharTermAttribute.class);
    OffsetAttribute stdOffsetAttr = std.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute stdPosIncAttr = std.addAttribute(PositionIncrementAttribute.class);
    TokenStream uni = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute uniTermAttr = uni.addAttribute(CharTermAttribute.class);
    OffsetAttribute uniOffsetAttr = uni.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute uniPosIncAttr = uni.addAttribute(PositionIncrementAttribute.class);
    // both streams must be reset before the first incrementToken() call
    std.reset();
    uni.reset();
    while (true) {
        boolean result = std.incrementToken();
        Assert.assertEquals(result, uni.incrementToken());
        if (!result) {
            break;
        }
        String term = stdTermAttr.toString();
        Assert.assertEquals(stdTermAttr, uniTermAttr);
        if (assertOffset) {
            Assert.assertEquals(term, stdOffsetAttr, uniOffsetAttr);
        }
        Assert.assertEquals(term, stdPosIncAttr, uniPosIncAttr);
    }
    std.end();
    uni.end();
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
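
The lock-step pattern generalizes beyond this test. A hedged sketch of a reusable variant (the method name is mine, and the reset()/end() calls are additions that older Lucene versions did not require):

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.Assert;

// Sketch: walks two token streams in lock step and asserts they emit
// identical terms and agree on when they are exhausted.
private static void assertSameTokens(TokenStream expected, TokenStream actual) throws IOException {
    CharTermAttribute expTerm = expected.addAttribute(CharTermAttribute.class);
    CharTermAttribute actTerm = actual.addAttribute(CharTermAttribute.class);
    expected.reset();
    actual.reset();
    while (true) {
        boolean more = expected.incrementToken();
        Assert.assertEquals(more, actual.incrementToken());
        if (!more) {
            break;
        }
        Assert.assertEquals(expTerm.toString(), actTerm.toString());
    }
    expected.end();
    actual.end();
}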

Example 9 with PositionIncrementAttribute

Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.

From class SpellCheckComponent, method getTokens.

private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
    Collection<Token> result = new ArrayList<>();
    assert analyzer != null;
    try (TokenStream ts = analyzer.tokenStream("", q)) {
        ts.reset();
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        while (ts.incrementToken()) {
            Token token = new Token();
            token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            token.setType(typeAtt.type());
            token.setFlags(flagsAtt.getFlags());
            token.setPayload(payloadAtt.getPayload());
            token.setPositionIncrement(posIncAtt.getPositionIncrement());
            result.add(token);
        }
        ts.end();
        return result;
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) ArrayList(java.util.ArrayList) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Token(org.apache.lucene.analysis.Token) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
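
A hedged usage sketch for the pattern above (the analyzer choice and query string are assumptions; in Solr this method is private to SpellCheckComponent):

import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

// Hypothetical caller: tokenize a query string and dump each Token's metadata.
private void dumpTokens() throws IOException {
    try (Analyzer analyzer = new StandardAnalyzer()) {
        Collection<Token> tokens = getTokens("hello world", analyzer);
        for (Token t : tokens) {
            System.out.println(t + " offsets=" + t.startOffset() + "-" + t.endOffset()
                    + " posInc=" + t.getPositionIncrement());
        }
    }
}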

Example 10 with PositionIncrementAttribute

Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.

From class SimplePreAnalyzedParser, method toFormattedString.

@Override
public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
        String s = f.stringValue();
        if (s != null) {
            // escape the equals sign as \= so it cannot terminate the stored value
            // (four backslashes are needed: regex-replacement escaping on top of Java's)
            s = s.replaceAll("=", "\\\\=");
            sb.append('=');
            sb.append(s);
            sb.append('=');
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        StringBuilder tok = new StringBuilder();
        boolean next = false;
        while (ts.incrementToken()) {
            if (next) {
                sb.append(' ');
            } else {
                next = true;
            }
            tok.setLength(0);
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                Attribute att = ts.getAttribute(cl);
                if (att == null) {
                    continue;
                }
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = escape(catt.buffer(), catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
                    tTerm = escape(tTermChars, tTermChars.length);
                } else {
                    if (tok.length() > 0)
                        tok.append(',');
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e=" + ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
                        } else if (tok.length() > 0) {
                            // remove the last comma
                            tok.setLength(tok.length() - 1);
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.append("y=" + escape(((TypeAttribute) att).type()));
                    } else {
                        tok.append(cl.getName() + "=" + escape(att.toString()));
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                if (tok.length() > 0) {
                    tok.insert(0, term + ",");
                } else {
                    tok.insert(0, term);
                }
            }
            sb.append(tok);
        }
    }
    return sb.toString();
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Attribute(org.apache.lucene.util.Attribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) BytesRef(org.apache.lucene.util.BytesRef)
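
To make the serialized format concrete, a hedged illustration: for a field whose stored value is foo=bar and whose token stream carries two tokens with offset and position-increment attributes, the method above would produce something like:

1 =foo\=bar=hello,s=0,e=5,i=1 world,s=6,e=11,i=1

Note that, per the next flag logic above, the first token follows the closing = of the stored section with no separating space, and the attribute order within each token depends on the stream's attribute iterator.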

Aggregations

PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) 51
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute) 34
TokenStream (org.apache.lucene.analysis.TokenStream) 29
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute) 26
IOException (java.io.IOException) 15
ArrayList (java.util.ArrayList) 14
BytesRef (org.apache.lucene.util.BytesRef) 14
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute) 11
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) 11
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute) 11
StringReader (java.io.StringReader) 9
Term (org.apache.lucene.index.Term) 8
Token (org.apache.lucene.analysis.Token) 7
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute) 7
PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) 7
List (java.util.List) 6
LinkedList (java.util.LinkedList) 4
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream) 4
Document (org.apache.lucene.document.Document) 4
Iterator (java.util.Iterator) 3