Search in sources :

Example 51 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.

the class CheckIndex method checkTermRanges.

/**
 * Tries to exercise "fake" (e.g. auto-prefix) terms by running term range intersections over
 * sliding, overlapping windows of terms. The window starts out spanning roughly all terms and is
 * shrunk to 75% of its size on each pass until it targets fewer than 10 terms. This is not
 * guaranteed to reach every non-real term (doing so in general is non-trivial), but it should
 * reach many of them and validate their postings against the postings of the real terms.
 *
 * @param field    field name, forwarded to {@code checkSingleTermRange}
 * @param maxDoc   size of the two scratch bit sets; also forwarded to {@code checkSingleTermRange}
 * @param terms    terms to iterate and to run the range checks against
 * @param numTerms initial window size, in terms
 * @throws IOException if iterating the terms fails
 * @throws RuntimeException if the terms enum returns terms that are not strictly increasing
 */
private static void checkTermRanges(String field, int maxDoc, Terms terms, long numTerms) throws IOException {
    // Scratch bit sets shared by every range check issued below.
    final FixedBitSet normalDocs = new FixedBitSet(maxDoc);
    final FixedBitSet intersectDocs = new FixedBitSet(maxDoc);

    // Each pass targets "interval" terms per window; the window shrinks by a quarter per pass.
    for (double interval = numTerms; interval >= 10.0; interval *= .75) {
        // A new boundary term is recorded every quarter window.
        final double stride = interval / 4;

        // Walk all terms to pick the min/max term of each sliding/overlapping window at this level.
        final TermsEnum allTerms = terms.iterator();
        final Deque<BytesRef> bounds = new LinkedList<>();
        long lastBoundIndex = Long.MIN_VALUE;
        long index = 0;
        BytesRefBuilder previous = null;

        for (BytesRef term = allTerms.next(); term != null; term = allTerms.next()) {
            if (index >= lastBoundIndex + stride) {
                bounds.addLast(BytesRef.deepCopyOf(term));
                lastBoundIndex = index;
                // Five boundaries delimit one full window: check oldest..newest, dropping the oldest.
                if (bounds.size() == 5) {
                    final BytesRef minTerm = bounds.removeFirst();
                    checkSingleTermRange(field, maxDoc, terms, minTerm, bounds.getLast(), normalDocs, intersectDocs);
                }
            }
            index++;

            // Terms must be strictly increasing.
            if (previous != null && previous.get().compareTo(term) >= 0) {
                throw new RuntimeException("terms out of order: lastTerm=" + previous.get() + " term=" + term);
            }
            if (previous == null) {
                previous = new BytesRefBuilder();
            }
            previous.copyBytes(term);
        }

        // Final window: from the oldest remaining boundary up to the very last term seen.
        if (previous != null && !bounds.isEmpty()) {
            final BytesRef minTerm = bounds.removeFirst();
            checkSingleTermRange(field, maxDoc, terms, minTerm, previous.get(), normalDocs, intersectDocs);
        }
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) FixedBitSet(org.apache.lucene.util.FixedBitSet) LinkedList(java.util.LinkedList) BytesRef(org.apache.lucene.util.BytesRef)

Example 52 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.

the class CheckIndex method getDocsFromTermRange.

/**
 * Visits the terms from {@code minTerm} (inclusive) to {@code maxTerm} (exclusive), recording
 * every doc ID encountered in {@code docsSeen}, and returns how many terms were visited.
 *
 * <p>When {@code isIntersect} is false the enum is expected to already be positioned on its first
 * term, and iteration stops when {@code maxTerm} itself is reached (it is neither counted nor are
 * its docs recorded). When {@code isIntersect} is true the enum is advanced with {@code next()}
 * from the start and iteration stops when the enum is exhausted.
 *
 * @param field       field name, used only in error messages
 * @param maxDoc      exclusive upper bound every doc ID must respect
 * @param termsEnum   enum to pull terms and postings from
 * @param docsSeen    bit set that is cleared and then filled with the doc IDs seen
 * @param minTerm     lower bound; no returned term may sort before it
 * @param maxTerm     upper bound; only compared against when {@code isIntersect} is false
 * @param isIntersect whether {@code termsEnum} must be advanced before reading its first term
 * @return the number of terms whose postings were visited
 * @throws IOException if reading terms or postings fails
 * @throws RuntimeException if terms or doc IDs are out of order, a term sorts before
 *         {@code minTerm}, a doc ID is {@code >= maxDoc}, or (non-intersect case) {@code maxTerm}
 *         is never reached
 */
private static long getDocsFromTermRange(String field, int maxDoc, TermsEnum termsEnum, FixedBitSet docsSeen, BytesRef minTerm, BytesRef maxTerm, boolean isIntersect) throws IOException {
    docsSeen.clear(0, docsSeen.length());

    BytesRefBuilder previous = null;
    PostingsEnum postings = null;
    long visited = 0;

    while (true) {
        // Kinda messy: a "normal" enum is already on its first term, an intersect enum must be
        // advanced first; every later term comes from next() in both cases.
        final boolean alreadyPositioned = !isIntersect && visited == 0;
        final BytesRef term = alreadyPositioned ? termsEnum.term() : termsEnum.next();

        if (term == null) {
            // Only an intersect enum may legitimately run out of terms.
            if (!isIntersect) {
                throw new RuntimeException("didn't see max term field=" + field + " term=" + maxTerm);
            }
            return visited;
        }
        assert term.isValid();

        // Terms must be strictly increasing.
        if (previous != null && previous.get().compareTo(term) >= 0) {
            throw new RuntimeException("terms out of order: lastTerm=" + previous.get() + " term=" + term);
        }
        if (previous == null) {
            previous = new BytesRefBuilder();
        }
        previous.copyBytes(term);

        // Caller already ensured terms enum positioned >= minTerm.
        // NOTE(review): the message below prints minTerm, not the offending term.
        if (term.compareTo(minTerm) < 0) {
            throw new RuntimeException("saw term before min term field=" + field + " term=" + minTerm);
        }

        if (!isIntersect) {
            final int cmp = term.compareTo(maxTerm);
            if (cmp > 0) {
                throw new RuntimeException("didn't see end term field=" + field + " term=" + maxTerm);
            }
            if (cmp == 0) {
                // Reached the exclusive upper bound: done.
                return visited;
            }
        }

        // Record every doc of this term, checking doc IDs are strictly increasing and in range.
        postings = termsEnum.postings(postings, 0);
        int prevDoc = -1;
        for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
            if (doc <= prevDoc) {
                throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + prevDoc);
            }
            if (doc >= maxDoc) {
                throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
            }
            docsSeen.set(doc);
            prevDoc = doc;
        }
        visited++;
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 53 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.

the class TestLucene70DocValuesFormat method testSortedSetAroundBlockSize.

/**
 * Indexes a two-valued sorted-set doc values field for document counts of one less than, equal to
 * and one more than {@code 1 << DIRECT_MONOTONIC_BLOCK_SHIFT}, then verifies that each document
 * reads back exactly the sorted, de-duplicated values that were written.
 */
@Slow
public void testSortedSetAroundBlockSize() throws IOException {
    final int frontier = 1 << Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
    for (int numDocs = frontier - 1; numDocs <= frontier + 1; ++numDocs) {
        final Directory dir = newDirectory();
        IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));

        // Expected values are serialized here as: count, then (length, bytes) per value.
        RAMFile expectedFile = new RAMFile();
        RAMOutputStream expectedOut = new RAMOutputStream(expectedFile, false);

        // One document instance is reused; only the two field values change per iteration.
        Document doc = new Document();
        SortedSetDocValuesField first = new SortedSetDocValuesField("sset", new BytesRef());
        SortedSetDocValuesField second = new SortedSetDocValuesField("sset", new BytesRef());
        doc.add(first);
        doc.add(second);

        for (int docID = 0; docID < numDocs; ++docID) {
            BytesRef value1 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
            BytesRef value2 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
            first.setBytesValue(value1);
            second.setBytesValue(value2);
            writer.addDocument(doc);

            // A sorted set holds each distinct value once, in order; record that as the expectation.
            Set<BytesRef> expected = new TreeSet<>();
            expected.add(value1);
            expected.add(value2);
            expectedOut.writeVInt(expected.size());
            for (BytesRef value : expected) {
                expectedOut.writeVInt(value.length);
                expectedOut.writeBytes(value.bytes, value.offset, value.length);
            }
        }
        expectedOut.close();

        writer.forceMerge(1);
        DirectoryReader reader = DirectoryReader.open(writer);
        writer.close();

        LeafReader leaf = getOnlyLeafReader(reader);
        assertEquals(numDocs, leaf.maxDoc());
        SortedSetDocValues values = leaf.getSortedSetDocValues("sset");
        assertNotNull(values);

        // Replay the recorded expectations against what the doc values return.
        RAMInputStream expectedIn = new RAMInputStream("", expectedFile);
        BytesRefBuilder scratch = new BytesRefBuilder();
        for (int docID = 0; docID < numDocs; ++docID) {
            assertEquals(docID, values.nextDoc());
            final int valueCount = expectedIn.readVInt();
            for (int k = 0; k < valueCount; ++k) {
                final int len = expectedIn.readVInt();
                scratch.setLength(len);
                scratch.grow(len);
                expectedIn.readBytes(scratch.bytes(), 0, len);
                final BytesRef actual = values.lookupOrd(values.nextOrd());
                assertEquals(scratch.get(), actual);
            }
            assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
        }

        reader.close();
        dir.close();
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) LeafReader(org.apache.lucene.index.LeafReader) DirectoryReader(org.apache.lucene.index.DirectoryReader) RAMInputStream(org.apache.lucene.store.RAMInputStream) Document(org.apache.lucene.document.Document) RAMFile(org.apache.lucene.store.RAMFile) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) TreeSet(java.util.TreeSet) RAMOutputStream(org.apache.lucene.store.RAMOutputStream) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory)

Example 54 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.

the class SimpleTextDocValuesReader method checkIntegrity.

/**
 * Reads a clone of the data file line by line through a checksumming input until the expected
 * footer position is reached, then validates the footer.
 *
 * @throws CorruptIndexException if a line ends past the expected footer start instead of exactly on it
 * @throws IOException if reading the data fails
 */
@Override
public void checkIntegrity() throws IOException {
    final BytesRefBuilder lineBuffer = new BytesRefBuilder();
    final IndexInput dataCopy = data.clone();
    dataCopy.seek(0);

    // checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included in SimpleTextUtil.CHECKSUM):
    final long footerStartPos = data.length() - (SimpleTextUtil.CHECKSUM.length + 21);
    final ChecksumIndexInput input = new BufferedChecksumIndexInput(dataCopy);

    // Consume whole lines until the read position reaches (or passes) the footer.
    do {
        SimpleTextUtil.readLine(input, lineBuffer);
    } while (input.getFilePointer() < footerStartPos);

    // Make sure we landed at precisely the right location:
    if (input.getFilePointer() != footerStartPos) {
        throw new CorruptIndexException("SimpleText failure: footer does not start at expected position current=" + input.getFilePointer() + " vs expected=" + footerStartPos, input);
    }
    SimpleTextUtil.checkFooter(input);
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) BufferedChecksumIndexInput(org.apache.lucene.store.BufferedChecksumIndexInput) IndexInput(org.apache.lucene.store.IndexInput)

Example 55 with BytesRefBuilder

use of org.apache.lucene.util.BytesRefBuilder in project lucene-solr by apache.

the class SimpleTextDocValuesReader method getSorted.

/**
 * Returns a {@link SortedDocValues} for the given field that decodes ords and terms on demand
 * from a private clone of the data input.
 *
 * <p>Every lookup seeks to an offset computed from the field's fixed-width patterns. Offsets are
 * computed in {@code long} arithmetic so that large doc IDs or ords cannot overflow {@code int}
 * before being added to the file pointer.
 *
 * @param fieldInfo the field to read; its name must be present in {@code fields}
 * @return an iterator over the documents that have a value (decoded ord {@code >= 0})
 * @throws IOException per the overridden signature
 */
@Override
public SortedDocValues getSorted(FieldInfo fieldInfo) throws IOException {
    final OneField field = fields.get(fieldInfo.name);
    // valid:
    assert field != null;
    final IndexInput in = data.clone();
    final BytesRefBuilder scratch = new BytesRefBuilder();
    final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
    final DecimalFormat ordDecoder = new DecimalFormat(field.ordPattern, new DecimalFormatSymbols(Locale.ROOT));
    return new SortedDocValues() {

        // Current document, -1 before iteration starts.
        int doc = -1;

        @Override
        public int nextDoc() throws IOException {
            return advance(docID() + 1);
        }

        @Override
        public int docID() {
            return doc;
        }

        @Override
        public long cost() {
            return maxDoc;
        }

        // Ord decoded for the most recently read document; negative means the doc has no value.
        int ord;

        /**
         * Seeks to the ord entry of {@code docID}, reads one line and decodes it.
         *
         * @return the parsed number minus one
         * @throws CorruptIndexException if the line cannot be parsed with the ord pattern
         */
        private int readOrd(int docID) throws IOException {
            // Ord entries start after all value records; presumably each value record is
            // (9 + pattern.length() + maxLength) bytes wide - TODO confirm against the writer.
            final long ordsStart = field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength);
            // Widen before multiplying: docID * entryWidth are both ints and would otherwise overflow.
            in.seek(ordsStart + (long) docID * (1 + field.ordPattern.length()));
            SimpleTextUtil.readLine(in, scratch);
            try {
                return (int) ordDecoder.parse(scratch.get().utf8ToString()).longValue() - 1;
            } catch (ParseException pe) {
                throw new CorruptIndexException("failed to parse ord", in, pe);
            }
        }

        @Override
        public int advance(int target) throws IOException {
            // Scan forward to the first document at or after target that has a value.
            for (int i = target; i < maxDoc; ++i) {
                ord = readOrd(i);
                if (ord >= 0) {
                    return doc = i;
                }
            }
            return doc = NO_MORE_DOCS;
        }

        @Override
        public boolean advanceExact(int target) throws IOException {
            this.doc = target;
            ord = readOrd(target);
            return ord >= 0;
        }

        @Override
        public int ordValue() {
            return ord;
        }

        // Reused buffer; the BytesRef returned by lookupOrd is backed by it.
        final BytesRefBuilder term = new BytesRefBuilder();

        @Override
        public BytesRef lookupOrd(int ord) throws IOException {
            if (ord < 0 || ord >= field.numValues) {
                throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues - 1) + "; got " + ord);
            }
            // Widen ord to long before multiplying so the record offset cannot overflow int.
            in.seek(field.dataStartFilePointer + (long) ord * (9 + field.pattern.length() + field.maxLength));
            SimpleTextUtil.readLine(in, scratch);
            assert StringHelper.startsWith(scratch.get(), LENGTH) : "got " + scratch.get().utf8ToString() + " in=" + in;
            int len;
            try {
                // The byte length of the term follows the LENGTH prefix on this line.
                len = decoder.parse(new String(scratch.bytes(), LENGTH.length, scratch.length() - LENGTH.length, StandardCharsets.UTF_8)).intValue();
            } catch (ParseException pe) {
                throw new CorruptIndexException("failed to parse int length", in, pe);
            }
            term.grow(len);
            term.setLength(len);
            in.readBytes(term.bytes(), 0, len);
            return term.get();
        }

        @Override
        public int getValueCount() {
            return (int) field.numValues;
        }
    };
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) DecimalFormatSymbols(java.text.DecimalFormatSymbols) DecimalFormat(java.text.DecimalFormat) ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) BufferedChecksumIndexInput(org.apache.lucene.store.BufferedChecksumIndexInput) IndexInput(org.apache.lucene.store.IndexInput) ParseException(java.text.ParseException)

Aggregations

BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 150; BytesRef (org.apache.lucene.util.BytesRef): 79; ArrayList (java.util.ArrayList): 21; IOException (java.io.IOException): 17; Term (org.apache.lucene.index.Term): 16; HashSet (java.util.HashSet): 15; ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput): 14; FieldType (org.apache.solr.schema.FieldType): 14; IndexInput (org.apache.lucene.store.IndexInput): 12; BytesRefIterator (org.apache.lucene.util.BytesRefIterator): 10; CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 10; IntsRef (org.apache.lucene.util.IntsRef): 10; SchemaField (org.apache.solr.schema.SchemaField): 10; BufferedChecksumIndexInput (org.apache.lucene.store.BufferedChecksumIndexInput): 9; ParseException (java.text.ParseException): 8; IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 8; DecimalFormat (java.text.DecimalFormat): 7; HashMap (java.util.HashMap): 7; Map (java.util.Map): 7; Directory (org.apache.lucene.store.Directory): 7