Example 1 with PagedBytes

Use of org.apache.lucene.util.PagedBytes in project lucene-solr by apache.

Class MemoryDocValuesProducer, method loadBinary:

private BytesAndAddresses loadBinary(FieldInfo field) throws IOException {
    BytesAndAddresses bytesAndAddresses = new BytesAndAddresses();
    BinaryEntry entry = binaries.get(field.name);
    IndexInput data = this.data.clone();
    data.seek(entry.offset);
    PagedBytes bytes = new PagedBytes(16);
    bytes.copy(data, entry.numBytes);
    bytesAndAddresses.reader = bytes.freeze(true);
    if (!merging) {
        ramBytesUsed.addAndGet(bytesAndAddresses.reader.ramBytesUsed());
    }
    if (entry.minLength != entry.maxLength) {
        data.seek(data.getFilePointer() + entry.missingBytes);
        bytesAndAddresses.addresses = MonotonicBlockPackedReader.of(data, entry.packedIntsVersion, entry.blockSize, maxDoc, false);
        if (!merging) {
            ramBytesUsed.addAndGet(bytesAndAddresses.addresses.ramBytesUsed());
        }
    }
    return bytesAndAddresses;
}
Also used: ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput), IndexInput (org.apache.lucene.store.IndexInput), PagedBytes (org.apache.lucene.util.PagedBytes)
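
The core pattern above reduces to a few lines: bulk-copy a region of an IndexInput into PagedBytes, freeze it into an immutable, random-access PagedBytes.Reader, and account for its RAM usage. The sketch below is not taken from MemoryDocValuesProducer; the Directory, the file name "field.bin" and the byte count are hypothetical stand-ins.

import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes;

class PagedBytesCopySketch {
    // Copy numBytes from a hypothetical "field.bin" file into PagedBytes, freeze it into
    // an immutable Reader, read back a small slice, and report the Reader's RAM usage.
    static long loadAndMeasure(Directory dir, long numBytes) throws IOException {
        try (IndexInput in = dir.openInput("field.bin", IOContext.READONCE)) {
            PagedBytes bytes = new PagedBytes(16);           // 2^16 = 64 KB blocks, as in loadBinary
            bytes.copy(in, numBytes);                        // bulk copy from the IndexInput
            PagedBytes.Reader reader = bytes.freeze(true);   // true = trim the final block
            BytesRef scratch = new BytesRef();
            reader.fillSlice(scratch, 0L, (int) Math.min(16L, numBytes)); // random-access slice
            return reader.ramBytesUsed();                    // what loadBinary adds to ramBytesUsed
        }
    }
}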

Example 2 with PagedBytes

Use of org.apache.lucene.util.PagedBytes in project elasticsearch by elastic.

Class PagedBytesIndexFieldData, method loadDirect:

@Override
public AtomicOrdinalsFieldData loadDirect(LeafReaderContext context) throws Exception {
    LeafReader reader = context.reader();
    AtomicOrdinalsFieldData data = null;
    PagedBytesEstimator estimator = new PagedBytesEstimator(context, breakerService.getBreaker(CircuitBreaker.FIELDDATA), getFieldName());
    Terms terms = reader.terms(getFieldName());
    if (terms == null) {
        data = AbstractAtomicOrdinalsFieldData.empty();
        estimator.afterLoad(null, data.ramBytesUsed());
        return data;
    }
    final PagedBytes bytes = new PagedBytes(15);
    final PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
    final float acceptableTransientOverheadRatio = OrdinalsBuilder.DEFAULT_ACCEPTABLE_OVERHEAD_RATIO;
    // Wrap the context in an estimator and use it to either estimate
    // the entire set, or wrap the TermsEnum so it can be calculated
    // per-term
    TermsEnum termsEnum = estimator.beforeLoad(terms);
    boolean success = false;
    try (OrdinalsBuilder builder = new OrdinalsBuilder(reader.maxDoc(), acceptableTransientOverheadRatio)) {
        PostingsEnum docsEnum = null;
        for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
            final long termOrd = builder.nextOrdinal();
            assert termOrd == termOrdToBytesOffset.size();
            termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
            docsEnum = termsEnum.postings(docsEnum, PostingsEnum.NONE);
            for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                builder.addDoc(docId);
            }
        }
        PagedBytes.Reader bytesReader = bytes.freeze(true);
        final Ordinals ordinals = builder.build();
        data = new PagedBytesAtomicFieldData(bytesReader, termOrdToBytesOffset.build(), ordinals);
        success = true;
        return data;
    } finally {
        if (!success) {
            // If something went wrong, unwind any current estimations we've made
            estimator.afterLoad(termsEnum, 0);
        } else {
            // Call .afterLoad() to adjust the breaker now that we have an exact size
            estimator.afterLoad(termsEnum, data.ramBytesUsed());
        }
    }
}
Also used: Ordinals (org.elasticsearch.index.fielddata.ordinals.Ordinals), PackedLongValues (org.apache.lucene.util.packed.PackedLongValues), LeafReader (org.apache.lucene.index.LeafReader), Terms (org.apache.lucene.index.Terms), RamAccountingTermsEnum (org.elasticsearch.index.fielddata.RamAccountingTermsEnum), TermsEnum (org.apache.lucene.index.TermsEnum), AtomicOrdinalsFieldData (org.elasticsearch.index.fielddata.AtomicOrdinalsFieldData), PagedBytes (org.apache.lucene.util.PagedBytes), OrdinalsBuilder (org.elasticsearch.index.fielddata.ordinals.OrdinalsBuilder), PostingsEnum (org.apache.lucene.index.PostingsEnum), BytesRef (org.apache.lucene.util.BytesRef)
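
Stripped of the ordinals machinery and circuit-breaker accounting, the PagedBytes part of loadDirect is: append each term with copyUsingLengthPrefix, record the returned offset in a monotonic PackedLongValues builder, freeze, and later resolve an ordinal back to its bytes with Reader.fill, which decodes the length prefix. Below is a minimal, self-contained sketch of that round trip; the class and method names are invented for illustration.

import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;

class TermBytesRoundTripSketch {
    // Append terms with a length prefix, remember each term's start offset, then
    // resolve an ordinal back to its bytes via Reader.fill, which decodes the prefix.
    static void roundTrip(Iterable<BytesRef> terms) throws IOException {
        PagedBytes bytes = new PagedBytes(15); // 2^15 = 32 KB blocks, as in loadDirect
        PackedLongValues.Builder termOrdToBytesOffset =
                PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
        for (BytesRef term : terms) {
            termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term)); // start offset of this term
        }
        PagedBytes.Reader reader = bytes.freeze(true);
        PackedLongValues offsets = termOrdToBytesOffset.build();
        BytesRef scratch = new BytesRef();
        for (long ord = 0; ord < offsets.size(); ord++) {
            reader.fill(scratch, offsets.get(ord)); // scratch now points at the term's bytes
        }
    }
}

The constructor call in the example above hands exactly this Reader/offsets pair, together with the ordinals, to PagedBytesAtomicFieldData.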

Example 3 with PagedBytes

Use of org.apache.lucene.util.PagedBytes in project lucene-solr by apache.

Class DocTermOrds, method uninvert:

/** Call this only once (if you subclass!) */
protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
        throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);
    final int maxDoc = reader.maxDoc();
    // immediate term numbers, or the index into the byte[] representing the last number
    final int[] index = new int[maxDoc];
    // last term we saw for this document
    final int[] lastTerm = new int[maxDoc];
    // list of term numbers for the doc (delta encoded vInts)
    final byte[][] bytes = new byte[maxDoc][];
    final Terms terms = reader.terms(field);
    if (terms == null) {
        // No terms
        return;
    }
    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    //System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
        // No terms match
        return;
    }
    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);
    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];
    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs
    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.
    int termNum = 0;
    postingsEnum = null;
    // Loop begins with te positioned at the first term (we called seekCeil above):
    for (; ; ) {
        final BytesRef t = te.term();
        if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
            break;
        }
        //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);
        visitTerm(te, termNum);
        if ((termNum & indexIntervalMask) == 0) {
            // Index this term
            sizeOfIndexedStrings += t.length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.add(indexedTerm);
        }
        final int df = te.docFreq();
        if (df <= maxTermDocFreq) {
            postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);
            // dF, but takes deletions into account
            int actualDF = 0;
            for (; ; ) {
                int doc = postingsEnum.nextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                    break;
                }
                //System.out.println("  chunk=" + chunk + " docs");
                actualDF++;
                termInstances++;
                //System.out.println("    docID=" + doc);
                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];
                if ((val & 0xff) == 1) {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = val >>> 8;
                    int ilen = vIntSize(delta);
                    byte[] arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.length) {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        // 4 byte alignment
                        int newLen = (newend + 3) & 0xfffffffc;
                        byte[] newarr = new byte[newLen];
                        System.arraycopy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = writeInt(delta, arr, pos);
                    // update pointer to end index in byte[]
                    index[doc] = (pos << 8) | 1;
                } else {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0) {
                        ipos = 0;
                    } else if ((val & 0x0000ff80) == 0) {
                        ipos = 1;
                    } else if ((val & 0x00ff8000) == 0) {
                        ipos = 2;
                    } else if ((val & 0xff800000) == 0) {
                        ipos = 3;
                    } else {
                        ipos = 4;
                    }
                    //System.out.println("      ipos=" + ipos);
                    int endPos = writeInt(delta, tempArr, ipos);
                    //System.out.println("      endpos=" + endPos);
                    if (endPos <= 4) {
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++) {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    } else {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++) {
                            tempArr[j] = (byte) val;
                            val >>>= 8;
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        tempArr = new byte[12];
                    }
                }
            }
            setActualDocFreq(termNum, actualDF);
        }
        termNum++;
        if (te.next() == null) {
            break;
        }
    }
    numTermsInField = termNum;
    long midPoint = System.nanoTime();
    if (termInstances == 0) {
        // we didn't invert anything
        // lower memory consumption.
        tnums = null;
    } else {
        this.index = index;
        for (int pass = 0; pass < 256; pass++) {
            byte[] target = tnums[pass];
            int pos = 0; // end in target
            if (target != null) {
                pos = target.length;
            } else {
                target = new byte[4096];
            }
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
                int lim = Math.min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++) {
                    //System.out.println("  pass=" + pass + " process docID=" + doc);
                    int val = index[doc];
                    if ((val & 0xff) == 1) {
                        int len = val >>> 8;
                        //System.out.println("    ptr pos=" + pos);
                        // change index to point to start of array
                        index[doc] = (pos << 8) | 1;
                        if ((pos & 0xff000000) != 0) {
                            // we only have 24 bits for the array index
                            throw new IllegalStateException("Too many values for UnInvertedField faceting on field " + field);
                        }
                        byte[] arr = bytes[doc];
                        /*
                        for (byte b : arr) {
                            //System.out.println("      b=" + Integer.toHexString((int) b));
                        }
                        */
                        // IMPORTANT: allow GC to avoid OOM
                        bytes[doc] = null;
                        if (target.length <= pos + len) {
                            int newlen = target.length;
                            // doubling strategy                 
                            while (newlen <= pos + len) newlen <<= 1;
                            byte[] newtarget = new byte[newlen];
                            System.arraycopy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        System.arraycopy(arr, 0, target, pos, len);
                        // skip single byte at end and leave it 0 for terminator
                        pos += len + 1;
                    }
                }
            }
            // shrink array
            if (pos < target.length) {
                byte[] newtarget = new byte[pos];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }
            tnums[pass] = target;
            if ((pass << 16) > maxDoc)
                break;
        }
    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);
    long endTime = System.nanoTime();
    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
}
Also used: Terms (org.apache.lucene.index.Terms), ArrayList (java.util.ArrayList), TermsEnum (org.apache.lucene.index.TermsEnum), PagedBytes (org.apache.lucene.util.PagedBytes), FieldInfo (org.apache.lucene.index.FieldInfo), BytesRef (org.apache.lucene.util.BytesRef)
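
The indexed-terms bookkeeping in uninvert relies on PagedBytes.copy(BytesRef, BytesRef): the sampled term's bytes are appended to a shared pool and the output BytesRef is pointed at that copy, so indexedTerms ends up holding lightweight views into one large buffer instead of a fresh byte[] per term (note that uninvert never freezes indexedTermsBytes; the pool exists only to back those views). Here is a reduced sketch of just that sampling step, with invented names and a plain modulo interval standing in for indexIntervalMask.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes;

class IndexedTermsSketch {
    // Keep every interval-th term as an index entry. copy(BytesRef, BytesRef) appends
    // the term's bytes to the pool and points indexedTerm at that copy, so the list
    // holds views into the pool rather than one freshly allocated byte[] per term.
    static BytesRef[] sampleTerms(Iterable<BytesRef> terms, int interval) throws IOException {
        PagedBytes indexedTermsBytes = new PagedBytes(15); // 32 KB blocks, as in uninvert
        List<BytesRef> indexedTerms = new ArrayList<>();
        int termNum = 0;
        for (BytesRef t : terms) {
            if (termNum % interval == 0) {
                BytesRef indexedTerm = new BytesRef();
                indexedTermsBytes.copy(t, indexedTerm); // indexedTerm now points into the pool
                indexedTerms.add(indexedTerm);
            }
            termNum++;
        }
        return indexedTerms.toArray(new BytesRef[0]);
    }
}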

Example 4 with PagedBytes

Use of org.apache.lucene.util.PagedBytes in project lucene-solr by apache.

Class Lucene54DocValuesProducer, method getReverseIndexInstance:

/** returns a reverse lookup instance for prefix-compressed binary values. */
private synchronized ReverseTermsIndex getReverseIndexInstance(FieldInfo field, BinaryEntry bytes) throws IOException {
    ReverseTermsIndex index = reverseIndexInstances.get(field.name);
    if (index == null) {
        index = new ReverseTermsIndex();
        data.seek(bytes.reverseIndexOffset);
        long size = (bytes.count + REVERSE_INTERVAL_MASK) >>> REVERSE_INTERVAL_SHIFT;
        index.termAddresses = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
        long dataSize = data.readVLong();
        PagedBytes pagedBytes = new PagedBytes(15);
        pagedBytes.copy(data, dataSize);
        index.terms = pagedBytes.freeze(true);
        if (!merging) {
            reverseIndexInstances.put(field.name, index);
            ramBytesUsed.addAndGet(index.ramBytesUsed());
        }
    }
    return index;
}
Also used: PagedBytes (org.apache.lucene.util.PagedBytes)
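
Once getReverseIndexInstance has frozen the terms and loaded termAddresses, a lookup is a binary search over the sampled terms: each address is an offset into the frozen PagedBytes, and Reader.fill decodes the length-prefixed term that the consumer wrote with copyUsingLengthPrefix. The following is a simplified illustration of such a search, not the actual Lucene54DocValuesProducer lookup code.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;

class ReverseIndexLookupSketch {
    // Find the last sampled term <= target. termAddresses.get(i) is the offset of the
    // i-th sampled term in the frozen PagedBytes; Reader.fill reads its length prefix
    // and points scratch at the term bytes.
    static long floorSample(PagedBytes.Reader terms, MonotonicBlockPackedReader termAddresses,
                            long numSamples, BytesRef target) {
        BytesRef scratch = new BytesRef();
        long lo = 0, hi = numSamples - 1, result = 0;
        while (lo <= hi) {
            long mid = (lo + hi) >>> 1;
            terms.fill(scratch, termAddresses.get(mid)); // sampled term at position mid
            if (scratch.compareTo(target) <= 0) {
                result = mid;
                lo = mid + 1;
            } else {
                hi = mid - 1;
            }
        }
        return result; // index of the last sampled term <= target
    }
}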

Example 5 with PagedBytes

Use of org.apache.lucene.util.PagedBytes in project lucene-solr by apache.

Class Lucene54DocValuesConsumer, method addReverseTermIndex:

// writes reverse term index: used for binary searching a term into a range of 64 blocks
// for every 64 blocks (1024 terms) we store a term, trimming any suffix unnecessary for comparison
// terms are written as a contiguous byte[], but never spanning 2^15 byte boundaries.
private void addReverseTermIndex(FieldInfo field, final Iterable<BytesRef> values, int maxLength) throws IOException {
    long count = 0;
    BytesRefBuilder priorTerm = new BytesRefBuilder();
    priorTerm.grow(maxLength);
    BytesRef indexTerm = new BytesRef();
    long startFP = data.getFilePointer();
    PagedBytes pagedBytes = new PagedBytes(15);
    MonotonicBlockPackedWriter addresses = new MonotonicBlockPackedWriter(data, MONOTONIC_BLOCK_SIZE);
    for (BytesRef b : values) {
        int termPosition = (int) (count & REVERSE_INTERVAL_MASK);
        if (termPosition == 0) {
            int len = StringHelper.sortKeyLength(priorTerm.get(), b);
            indexTerm.bytes = b.bytes;
            indexTerm.offset = b.offset;
            indexTerm.length = len;
            addresses.add(pagedBytes.copyUsingLengthPrefix(indexTerm));
        } else if (termPosition == REVERSE_INTERVAL_MASK) {
            priorTerm.copyBytes(b);
        }
        count++;
    }
    addresses.finish();
    long numBytes = pagedBytes.getPointer();
    pagedBytes.freeze(true);
    PagedBytesDataInput in = pagedBytes.getDataInput();
    meta.writeLong(startFP);
    data.writeVLong(numBytes);
    data.copyBytes(in, numBytes);
}
Also used: MonotonicBlockPackedWriter (org.apache.lucene.util.packed.MonotonicBlockPackedWriter), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), PagedBytes (org.apache.lucene.util.PagedBytes), PagedBytesDataInput (org.apache.lucene.util.PagedBytes.PagedBytesDataInput), BytesRef (org.apache.lucene.util.BytesRef)
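
The writer side ends with the getPointer / freeze / getDataInput sequence: getPointer reports how many bytes were appended, freeze makes the pages read-only, and getDataInput exposes them as a DataInput that can be bulk-copied to the output file. Below is a compact sketch of that spill step in isolation; the Directory and the file name "terms.bin" are hypothetical.

import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;

class PagedBytesSpillSketch {
    // Accumulate length-prefixed terms in memory, then stream them out as a vLong byte
    // count followed by the raw bytes, mirroring the writeVLong(numBytes) and
    // copyBytes(in, numBytes) calls in addReverseTermIndex above.
    static void spill(Directory dir, Iterable<BytesRef> terms) throws IOException {
        PagedBytes pagedBytes = new PagedBytes(15);
        for (BytesRef term : terms) {
            pagedBytes.copyUsingLengthPrefix(term);
        }
        long numBytes = pagedBytes.getPointer();     // bytes appended so far
        pagedBytes.freeze(true);                     // freeze before reading back, as above
        PagedBytesDataInput in = pagedBytes.getDataInput();
        try (IndexOutput out = dir.createOutput("terms.bin", IOContext.DEFAULT)) {
            out.writeVLong(numBytes);
            out.copyBytes(in, numBytes);             // bulk transfer from the frozen pages
        }
    }
}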

Aggregations

PagedBytes (org.apache.lucene.util.PagedBytes): 5
BytesRef (org.apache.lucene.util.BytesRef): 3
Terms (org.apache.lucene.index.Terms): 2
TermsEnum (org.apache.lucene.index.TermsEnum): 2
ArrayList (java.util.ArrayList): 1
FieldInfo (org.apache.lucene.index.FieldInfo): 1
LeafReader (org.apache.lucene.index.LeafReader): 1
PostingsEnum (org.apache.lucene.index.PostingsEnum): 1
ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput): 1
IndexInput (org.apache.lucene.store.IndexInput): 1
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 1
PagedBytesDataInput (org.apache.lucene.util.PagedBytes.PagedBytesDataInput): 1
MonotonicBlockPackedWriter (org.apache.lucene.util.packed.MonotonicBlockPackedWriter): 1
PackedLongValues (org.apache.lucene.util.packed.PackedLongValues): 1
AtomicOrdinalsFieldData (org.elasticsearch.index.fielddata.AtomicOrdinalsFieldData): 1
RamAccountingTermsEnum (org.elasticsearch.index.fielddata.RamAccountingTermsEnum): 1
Ordinals (org.elasticsearch.index.fielddata.ordinals.Ordinals): 1
OrdinalsBuilder (org.elasticsearch.index.fielddata.ordinals.OrdinalsBuilder): 1