Example 1 with LongsRef

Use of org.apache.lucene.util.LongsRef in the lucene-solr project by apache.

From the class CompressingTermVectorsReader, the method get:

@Override
public Fields get(int doc) throws IOException {
    ensureOpen();
    // seek to the right place
    {
        final long startPointer = indexReader.getStartPointer(doc);
        vectorsStream.seek(startPointer);
    }
    // decode
    // - docBase: first doc ID of the chunk
    // - chunkDocs: number of docs of the chunk
    final int docBase = vectorsStream.readVInt();
    final int chunkDocs = vectorsStream.readVInt();
    if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
        throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc, vectorsStream);
    }
    // number of fields to skip
    final int skip;
    // number of fields of the document we're looking for
    final int numFields;
    // total number of fields of the chunk (sum for all docs)
    final int totalFields;
    if (chunkDocs == 1) {
        skip = 0;
        numFields = totalFields = vectorsStream.readVInt();
    } else {
        reader.reset(vectorsStream, chunkDocs);
        int sum = 0;
        for (int i = docBase; i < doc; ++i) {
            sum += reader.next();
        }
        skip = sum;
        numFields = (int) reader.next();
        sum += numFields;
        for (int i = doc + 1; i < docBase + chunkDocs; ++i) {
            sum += reader.next();
        }
        totalFields = sum;
    }
    if (numFields == 0) {
        // no vectors
        return null;
    }
    // read field numbers that have term vectors
    final int[] fieldNums;
    {
        final int token = vectorsStream.readByte() & 0xFF;
        // token == 0 would mean no term vectors, which cannot happen here since we
        // already returned null when numFields == 0
        assert token != 0;
        final int bitsPerFieldNum = token & 0x1F;
        int totalDistinctFields = token >>> 5;
        if (totalDistinctFields == 0x07) {
            totalDistinctFields += vectorsStream.readVInt();
        }
        ++totalDistinctFields;
        final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalDistinctFields, bitsPerFieldNum, 1);
        fieldNums = new int[totalDistinctFields];
        for (int i = 0; i < totalDistinctFields; ++i) {
            fieldNums[i] = (int) it.next();
        }
    }
    // read field numbers and flags
    final int[] fieldNumOffs = new int[numFields];
    final PackedInts.Reader flags;
    {
        final int bitsPerOff = PackedInts.bitsRequired(fieldNums.length - 1);
        final PackedInts.Reader allFieldNumOffs = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsPerOff);
        switch(vectorsStream.readVInt()) {
            case 0:
                final PackedInts.Reader fieldFlags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, fieldNums.length, FLAGS_BITS);
                PackedInts.Mutable f = PackedInts.getMutable(totalFields, FLAGS_BITS, PackedInts.COMPACT);
                for (int i = 0; i < totalFields; ++i) {
                    final int fieldNumOff = (int) allFieldNumOffs.get(i);
                    assert fieldNumOff >= 0 && fieldNumOff < fieldNums.length;
                    final int fgs = (int) fieldFlags.get(fieldNumOff);
                    f.set(i, fgs);
                }
                flags = f;
                break;
            case 1:
                flags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, FLAGS_BITS);
                break;
            default:
                throw new AssertionError();
        }
        for (int i = 0; i < numFields; ++i) {
            fieldNumOffs[i] = (int) allFieldNumOffs.get(skip + i);
        }
    }
    // number of terms per field for all fields
    final PackedInts.Reader numTerms;
    final int totalTerms;
    {
        final int bitsRequired = vectorsStream.readVInt();
        numTerms = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsRequired);
        int sum = 0;
        for (int i = 0; i < totalFields; ++i) {
            sum += numTerms.get(i);
        }
        totalTerms = sum;
    }
    // term lengths
    int docOff = 0, docLen = 0, totalLen;
    final int[] fieldLengths = new int[numFields];
    final int[][] prefixLengths = new int[numFields][];
    final int[][] suffixLengths = new int[numFields][];
    {
        reader.reset(vectorsStream, totalTerms);
        // skip
        int toSkip = 0;
        for (int i = 0; i < skip; ++i) {
            toSkip += numTerms.get(i);
        }
        reader.skip(toSkip);
        // read prefix lengths
        for (int i = 0; i < numFields; ++i) {
            final int termCount = (int) numTerms.get(skip + i);
            final int[] fieldPrefixLengths = new int[termCount];
            prefixLengths[i] = fieldPrefixLengths;
            for (int j = 0; j < termCount; ) {
                final LongsRef next = reader.next(termCount - j);
                for (int k = 0; k < next.length; ++k) {
                    fieldPrefixLengths[j++] = (int) next.longs[next.offset + k];
                }
            }
        }
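        // align to the end of the prefix-lengths block, then decode the
        // suffix-lengths block from the same stream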
        reader.skip(totalTerms - reader.ord());
        reader.reset(vectorsStream, totalTerms);
        // skip
        toSkip = 0;
        for (int i = 0; i < skip; ++i) {
            for (int j = 0; j < numTerms.get(i); ++j) {
                docOff += reader.next();
            }
        }
        for (int i = 0; i < numFields; ++i) {
            final int termCount = (int) numTerms.get(skip + i);
            final int[] fieldSuffixLengths = new int[termCount];
            suffixLengths[i] = fieldSuffixLengths;
            for (int j = 0; j < termCount; ) {
                final LongsRef next = reader.next(termCount - j);
                for (int k = 0; k < next.length; ++k) {
                    fieldSuffixLengths[j++] = (int) next.longs[next.offset + k];
                }
            }
            fieldLengths[i] = sum(suffixLengths[i]);
            docLen += fieldLengths[i];
        }
        totalLen = docOff + docLen;
        for (int i = skip + numFields; i < totalFields; ++i) {
            for (int j = 0; j < numTerms.get(i); ++j) {
                totalLen += reader.next();
            }
        }
    }
    // term freqs
    final int[] termFreqs = new int[totalTerms];
    {
        reader.reset(vectorsStream, totalTerms);
        for (int i = 0; i < totalTerms; ) {
            final LongsRef next = reader.next(totalTerms - i);
            for (int k = 0; k < next.length; ++k) {
                termFreqs[i++] = 1 + (int) next.longs[next.offset + k];
            }
        }
    }
    // total number of positions, offsets and payloads
    int totalPositions = 0, totalOffsets = 0, totalPayloads = 0;
    for (int i = 0, termIndex = 0; i < totalFields; ++i) {
        final int f = (int) flags.get(i);
        final int termCount = (int) numTerms.get(i);
        for (int j = 0; j < termCount; ++j) {
            final int freq = termFreqs[termIndex++];
            if ((f & POSITIONS) != 0) {
                totalPositions += freq;
            }
            if ((f & OFFSETS) != 0) {
                totalOffsets += freq;
            }
            if ((f & PAYLOADS) != 0) {
                totalPayloads += freq;
            }
        }
        assert i != totalFields - 1 || termIndex == totalTerms : termIndex + " " + totalTerms;
    }
    final int[][] positionIndex = positionIndex(skip, numFields, numTerms, termFreqs);
    final int[][] positions, startOffsets, lengths;
    if (totalPositions > 0) {
        positions = readPositions(skip, numFields, flags, numTerms, termFreqs, POSITIONS, totalPositions, positionIndex);
    } else {
        positions = new int[numFields][];
    }
    if (totalOffsets > 0) {
        // average number of chars per term
        final float[] charsPerTerm = new float[fieldNums.length];
        for (int i = 0; i < charsPerTerm.length; ++i) {
            charsPerTerm[i] = Float.intBitsToFloat(vectorsStream.readInt());
        }
        startOffsets = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
        lengths = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
        for (int i = 0; i < numFields; ++i) {
            final int[] fStartOffsets = startOffsets[i];
            final int[] fPositions = positions[i];
            // patch offsets from positions
            if (fStartOffsets != null && fPositions != null) {
                final float fieldCharsPerTerm = charsPerTerm[fieldNumOffs[i]];
                for (int j = 0; j < startOffsets[i].length; ++j) {
                    fStartOffsets[j] += (int) (fieldCharsPerTerm * fPositions[j]);
                }
            }
            if (fStartOffsets != null) {
                final int[] fPrefixLengths = prefixLengths[i];
                final int[] fSuffixLengths = suffixLengths[i];
                final int[] fLengths = lengths[i];
                for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
                    // delta-decode start offsets and patch lengths using term lengths
                    final int termLength = fPrefixLengths[j] + fSuffixLengths[j];
                    lengths[i][positionIndex[i][j]] += termLength;
                    for (int k = positionIndex[i][j] + 1; k < positionIndex[i][j + 1]; ++k) {
                        fStartOffsets[k] += fStartOffsets[k - 1];
                        fLengths[k] += termLength;
                    }
                }
            }
        }
    } else {
        startOffsets = lengths = new int[numFields][];
    }
    if (totalPositions > 0) {
        // delta-decode positions
        for (int i = 0; i < numFields; ++i) {
            final int[] fPositions = positions[i];
            final int[] fpositionIndex = positionIndex[i];
            if (fPositions != null) {
                for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
                    // delta-decode start offsets
                    for (int k = fpositionIndex[j] + 1; k < fpositionIndex[j + 1]; ++k) {
                        fPositions[k] += fPositions[k - 1];
                    }
                }
            }
        }
    }
    // payload lengths
    final int[][] payloadIndex = new int[numFields][];
    int totalPayloadLength = 0;
    int payloadOff = 0;
    int payloadLen = 0;
    if (totalPayloads > 0) {
        reader.reset(vectorsStream, totalPayloads);
        // skip
        int termIndex = 0;
        for (int i = 0; i < skip; ++i) {
            final int f = (int) flags.get(i);
            final int termCount = (int) numTerms.get(i);
            if ((f & PAYLOADS) != 0) {
                for (int j = 0; j < termCount; ++j) {
                    final int freq = termFreqs[termIndex + j];
                    for (int k = 0; k < freq; ++k) {
                        final int l = (int) reader.next();
                        payloadOff += l;
                    }
                }
            }
            termIndex += termCount;
        }
        totalPayloadLength = payloadOff;
        // read doc payload lengths
        for (int i = 0; i < numFields; ++i) {
            final int f = (int) flags.get(skip + i);
            final int termCount = (int) numTerms.get(skip + i);
            if ((f & PAYLOADS) != 0) {
                final int totalFreq = positionIndex[i][termCount];
                payloadIndex[i] = new int[totalFreq + 1];
                int posIdx = 0;
                payloadIndex[i][posIdx] = payloadLen;
                for (int j = 0; j < termCount; ++j) {
                    final int freq = termFreqs[termIndex + j];
                    for (int k = 0; k < freq; ++k) {
                        final int payloadLength = (int) reader.next();
                        payloadLen += payloadLength;
                        payloadIndex[i][posIdx + 1] = payloadLen;
                        ++posIdx;
                    }
                }
                assert posIdx == totalFreq;
            }
            termIndex += termCount;
        }
        totalPayloadLength += payloadLen;
        for (int i = skip + numFields; i < totalFields; ++i) {
            final int f = (int) flags.get(i);
            final int termCount = (int) numTerms.get(i);
            if ((f & PAYLOADS) != 0) {
                for (int j = 0; j < termCount; ++j) {
                    final int freq = termFreqs[termIndex + j];
                    for (int k = 0; k < freq; ++k) {
                        totalPayloadLength += reader.next();
                    }
                }
            }
            termIndex += termCount;
        }
        assert termIndex == totalTerms : termIndex + " " + totalTerms;
    }
    // decompress data
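    // only this document's slice of the chunk is decompressed: it starts after the
    // docOff + payloadOff bytes of earlier documents and spans docLen + payloadLen bytes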
    final BytesRef suffixBytes = new BytesRef();
    decompressor.decompress(vectorsStream, totalLen + totalPayloadLength, docOff + payloadOff, docLen + payloadLen, suffixBytes);
    suffixBytes.length = docLen;
    final BytesRef payloadBytes = new BytesRef(suffixBytes.bytes, suffixBytes.offset + docLen, payloadLen);
    final int[] fieldFlags = new int[numFields];
    for (int i = 0; i < numFields; ++i) {
        fieldFlags[i] = (int) flags.get(skip + i);
    }
    final int[] fieldNumTerms = new int[numFields];
    for (int i = 0; i < numFields; ++i) {
        fieldNumTerms[i] = (int) numTerms.get(skip + i);
    }
    final int[][] fieldTermFreqs = new int[numFields][];
    {
        int termIdx = 0;
        for (int i = 0; i < skip; ++i) {
            termIdx += numTerms.get(i);
        }
        for (int i = 0; i < numFields; ++i) {
            final int termCount = (int) numTerms.get(skip + i);
            fieldTermFreqs[i] = new int[termCount];
            for (int j = 0; j < termCount; ++j) {
                fieldTermFreqs[i][j] = termFreqs[termIdx++];
            }
        }
    }
    assert sum(fieldLengths) == docLen : sum(fieldLengths) + " != " + docLen;
    return new TVFields(fieldNums, fieldFlags, fieldNumOffs, fieldNumTerms, fieldLengths, prefixLengths, suffixLengths, fieldTermFreqs, positionIndex, positions, startOffsets, lengths, payloadBytes, payloadIndex, suffixBytes);
}
Also used: CorruptIndexException (org.apache.lucene.index.CorruptIndexException), TermVectorsReader (org.apache.lucene.codecs.TermVectorsReader), PackedInts (org.apache.lucene.util.packed.PackedInts), BlockPackedReaderIterator (org.apache.lucene.util.packed.BlockPackedReaderIterator), BytesRef (org.apache.lucene.util.BytesRef), LongsRef (org.apache.lucene.util.LongsRef)
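The loops above all follow one bulk-decoding pattern: reader is a BlockPackedReaderIterator, and reader.next(count) returns a LongsRef that is a window (longs, offset, length) into the iterator's internal buffer and may hold fewer than count values. A minimal sketch of draining such a stream into an int[], assuming reader has already been reset to total values (the variable names here are illustrative, not part of the Lucene API):

// Sketch: drain a block-packed stream into an int[] via LongsRef windows.
int[] dest = new int[total];
for (int i = 0; i < total; ) {
    // bulk read; the returned window may hold fewer than the requested values
    final LongsRef next = reader.next(total - i);
    for (int k = 0; k < next.length; ++k) {
        dest[i++] = (int) next.longs[next.offset + k]; // copy out of the shared buffer
    }
}

The window is reused across calls to next, which is why every loop in get copies the values out before asking for more.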

Example 2 with LongsRef

Use of org.apache.lucene.util.LongsRef in the lucene-solr project by apache.

From the class CompressingTermVectorsReader, the method readPositions:

private int[][] readPositions(int skip, int numFields, PackedInts.Reader flags, PackedInts.Reader numTerms, int[] termFreqs, int flag, final int totalPositions, int[][] positionIndex) throws IOException {
    final int[][] positions = new int[numFields][];
    reader.reset(vectorsStream, totalPositions);
    // skip
    int toSkip = 0;
    int termIndex = 0;
    for (int i = 0; i < skip; ++i) {
        final int f = (int) flags.get(i);
        final int termCount = (int) numTerms.get(i);
        if ((f & flag) != 0) {
            for (int j = 0; j < termCount; ++j) {
                final int freq = termFreqs[termIndex + j];
                toSkip += freq;
            }
        }
        termIndex += termCount;
    }
    reader.skip(toSkip);
    // read doc positions
    for (int i = 0; i < numFields; ++i) {
        final int f = (int) flags.get(skip + i);
        final int termCount = (int) numTerms.get(skip + i);
        if ((f & flag) != 0) {
            final int totalFreq = positionIndex[i][termCount];
            final int[] fieldPositions = new int[totalFreq];
            positions[i] = fieldPositions;
            for (int j = 0; j < totalFreq; ) {
                final LongsRef nextPositions = reader.next(totalFreq - j);
                for (int k = 0; k < nextPositions.length; ++k) {
                    fieldPositions[j++] = (int) nextPositions.longs[nextPositions.offset + k];
                }
            }
        }
        termIndex += termCount;
    }
    reader.skip(totalPositions - reader.ord());
    return positions;
}
Also used: LongsRef (org.apache.lucene.util.LongsRef)
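readPositions also illustrates the skip/read/realign discipline the chunk format requires: values belonging to earlier documents are skipped, the target document's values are bulk-copied, and the final reader.skip(totalPositions - reader.ord()) leaves the stream positioned at the end of the block (ord() reports how many values have been consumed so far). A condensed sketch of that shape, with illustrative names:

// Sketch: consume exactly valueCount values from it, keeping only ours.
// Assumes it is a BlockPackedReaderIterator already reset to valueCount values.
it.skip(before);                         // values belonging to earlier docs
final int[] mine = new int[mineCount];
for (int j = 0; j < mineCount; ) {
    final LongsRef buf = it.next(mineCount - j);
    for (int k = 0; k < buf.length; ++k) {
        mine[j++] = (int) buf.longs[buf.offset + k];
    }
}
it.skip(valueCount - it.ord());          // realign to the end of the block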

Example 3 with LongsRef

Use of org.apache.lucene.util.LongsRef in the lucene-solr project by apache.

From the class Lucene54DocValuesConsumer, the method writeDictionary:

private void writeDictionary(SortedSet<LongsRef> uniqueValueSets) throws IOException {
    int lengthSum = 0;
    for (LongsRef longs : uniqueValueSets) {
        lengthSum += longs.length;
    }
    meta.writeInt(lengthSum);
    for (LongsRef valueSet : uniqueValueSets) {
        for (int i = 0; i < valueSet.length; ++i) {
            meta.writeLong(valueSet.longs[valueSet.offset + i]);
        }
    }
    meta.writeInt(uniqueValueSets.size());
    for (LongsRef valueSet : uniqueValueSets) {
        meta.writeInt(valueSet.length);
    }
}
Also used: LongsRef (org.apache.lucene.util.LongsRef)
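The layout writeDictionary produces is: the total long count, the flattened values of every set, the set count, and then one length per set. A hypothetical decoder for that layout (a sketch, not Lucene code; meta here stands for any DataInput positioned at this metadata):

// Sketch: read back the dictionary written above.
final int lengthSum = meta.readInt();        // total longs across all sets
final long[] dict = new long[lengthSum];
for (int i = 0; i < lengthSum; ++i) {
    dict[i] = meta.readLong();
}
final int numSets = meta.readInt();
final LongsRef[] sets = new LongsRef[numSets];
int off = 0;
for (int i = 0; i < numSets; ++i) {
    final int len = meta.readInt();
    sets[i] = new LongsRef(dict, off, len);  // each set is a window into the shared array
    off += len;
}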

Example 4 with LongsRef

Use of org.apache.lucene.util.LongsRef in the lucene-solr project by apache.

From the class Lucene54DocValuesConsumer, the method uniqueValueSets:

private SortedSet<LongsRef> uniqueValueSets(Iterable<Number> docToValueCount, Iterable<Number> values) {
    Set<LongsRef> uniqueValueSet = new HashSet<>();
    LongsRef docValues = new LongsRef(256);
    Iterator<Number> valueCountIterator = docToValueCount.iterator();
    Iterator<Number> valueIterator = values.iterator();
    int totalDictSize = 0;
    while (valueCountIterator.hasNext()) {
        docValues.length = valueCountIterator.next().intValue();
        if (docValues.length > 256) {
            return null;
        }
        for (int i = 0; i < docValues.length; ++i) {
            docValues.longs[i] = valueIterator.next().longValue();
        }
        if (uniqueValueSet.contains(docValues)) {
            continue;
        }
        totalDictSize += docValues.length;
        if (totalDictSize > 256) {
            return null;
        }
        uniqueValueSet.add(new LongsRef(Arrays.copyOf(docValues.longs, docValues.length), 0, docValues.length));
    }
    assert valueIterator.hasNext() == false;
    return new TreeSet<>(uniqueValueSet);
}
Also used: TreeSet (java.util.TreeSet), HashSet (java.util.HashSet), LongsRef (org.apache.lucene.util.LongsRef)
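Probing the set with a single reused docValues buffer works because LongsRef implements equals and hashCode over the referenced window longs[offset..offset+length), not over object identity; it is also why the stored entry is built with Arrays.copyOf rather than the scratch array itself. A small illustration (a sketch; also uses java.util.Arrays, java.util.HashSet and java.util.Set):

void scratchRefDemo() {
    // a mutable scratch ref can probe a HashSet by value...
    LongsRef scratch = new LongsRef(new long[] { 1, 2, 3 }, 0, 3);
    Set<LongsRef> set = new HashSet<>();
    // ...but stored entries must own their array, since the scratch buffer is rewritten
    set.add(new LongsRef(Arrays.copyOf(scratch.longs, 3), 0, 3));
    scratch.longs[2] = 4;
    assert set.contains(scratch) == false; // {1, 2, 4} is not in the set
    scratch.longs[2] = 3;
    assert set.contains(scratch);          // {1, 2, 3} matches by value
}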

Example 5 with LongsRef

Use of org.apache.lucene.util.LongsRef in the lucene-solr project by apache.

From the class Lucene54DocValuesConsumer, the method docToSetId:

private Iterable<Number> docToSetId(SortedSet<LongsRef> uniqueValueSets, Iterable<Number> docToValueCount, Iterable<Number> values) {
    final Map<LongsRef, Integer> setIds = new HashMap<>();
    int i = 0;
    for (LongsRef set : uniqueValueSets) {
        setIds.put(set, i++);
    }
    assert i == uniqueValueSets.size();
    return new Iterable<Number>() {

        @Override
        public Iterator<Number> iterator() {
            final Iterator<Number> valueCountIterator = docToValueCount.iterator();
            final Iterator<Number> valueIterator = values.iterator();
            final LongsRef docValues = new LongsRef(256);
            return new Iterator<Number>() {

                @Override
                public boolean hasNext() {
                    return valueCountIterator.hasNext();
                }

                @Override
                public Number next() {
                    docValues.length = valueCountIterator.next().intValue();
                    for (int i = 0; i < docValues.length; ++i) {
                        docValues.longs[i] = valueIterator.next().longValue();
                    }
                    final Integer id = setIds.get(docValues);
                    assert id != null;
                    return id;
                }
            };
        }
    };
}
Also used: HashMap (java.util.HashMap), Iterator (java.util.Iterator), LongsRef (org.apache.lucene.util.LongsRef)
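Taken together, examples 3 through 5 form one encoding path: build the dictionary of unique value sets, serialize it, then replace each document's values with its set id. A sketch of how the three methods compose (names follow the methods shown above; the fallback branch is implied by uniqueValueSets returning null for oversized dictionaries):

// Sketch: how the three Lucene54DocValuesConsumer methods above fit together.
SortedSet<LongsRef> sets = uniqueValueSets(docToValueCount, values);
if (sets != null) {
    writeDictionary(sets);  // example 3: small dictionary, write it once
    Iterable<Number> ids = docToSetId(sets, docToValueCount, values);  // example 5
    // ... encode ids with a numeric writer instead of the raw values
} else {
    // dictionary would exceed 256 longs: encode the values directly instead
}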

Aggregations

LongsRef (org.apache.lucene.util.LongsRef): 7
Directory (org.apache.lucene.store.Directory): 2
IndexInput (org.apache.lucene.store.IndexInput): 2
IndexOutput (org.apache.lucene.store.IndexOutput): 2
RAMDirectory (org.apache.lucene.store.RAMDirectory): 2
HashMap (java.util.HashMap): 1
HashSet (java.util.HashSet): 1
Iterator (java.util.Iterator): 1
TreeSet (java.util.TreeSet): 1
TermVectorsReader (org.apache.lucene.codecs.TermVectorsReader): 1
CorruptIndexException (org.apache.lucene.index.CorruptIndexException): 1
ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput): 1
DataInput (org.apache.lucene.store.DataInput): 1
BytesRef (org.apache.lucene.util.BytesRef): 1
BlockPackedReaderIterator (org.apache.lucene.util.packed.BlockPackedReaderIterator): 1
PackedInts (org.apache.lucene.util.packed.PackedInts): 1
Reader (org.apache.lucene.util.packed.PackedInts.Reader): 1