Search in sources :

Example 31 with IndexOutput

use of org.apache.lucene.store.IndexOutput in project lucene-solr by apache.

the class FSTCompletionLookup method build.

/**
 * Builds the completion automaton from the supplied suggestions.
 *
 * <p>Every entry is first written to a temporary file as a 4-byte encoded weight followed by the
 * term bytes. That file is sorted with {@link OfflineSorter}, and the sorted entries are then
 * assigned to one of {@code buckets} buckets according to their position in the sorted order
 * (entries with the same encoded weight as their predecessor share its bucket) before being
 * handed to {@link FSTCompletionBuilder}.
 *
 * @param iterator source of terms and weights; must expose neither payloads nor contexts
 * @throws IllegalArgumentException if the iterator reports payloads or contexts
 * @throws IOException propagated from the iterator, the sorter or the temporary directory
 */
@Override
public void build(InputIterator iterator) throws IOException {
    if (iterator.hasPayloads()) {
        throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }

    final OfflineSorter offlineSorter = new OfflineSorter(tempDir, tempFileNamePrefix);
    final ExternalRefSorter refSorter = new ExternalRefSorter(offlineSorter);
    final IndexOutput unsortedOut = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
    final OfflineSorter.ByteSequencesWriter seqWriter = new OfflineSorter.ByteSequencesWriter(unsortedOut);
    OfflineSorter.ByteSequencesReader seqReader = null;
    String sortedFileName = null;

    count = 0;
    try {
        // Phase 1: spill "<encoded weight><term bytes>" records to the unsorted temp file.
        byte[] record = new byte[0];
        final ByteArrayDataOutput encoder = new ByteArrayDataOutput(record);
        int totalEntries = 0;
        for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
            // Room for the term plus the 4-byte weight prefix.
            final int needed = term.length + 4;
            if (needed >= record.length) {
                record = ArrayUtil.grow(record, needed);
            }
            encoder.reset(record);
            encoder.writeInt(encodeWeight(iterator.weight()));
            encoder.writeBytes(term.bytes, term.offset, term.length);
            seqWriter.write(record, 0, encoder.getPosition());
            totalEntries++;
        }
        CodecUtil.writeFooter(unsortedOut);
        seqWriter.close();

        // Phase 2: the score distribution is unknown, so sort everything and cut the sorted
        // sequence into equally sized buckets.
        sortedFileName = offlineSorter.sort(unsortedOut.getName());
        tempDir.deleteFile(unsortedOut.getName());

        final FSTCompletionBuilder fstBuilder = new FSTCompletionBuilder(buckets, refSorter, sharedTailLength);
        seqReader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(sortedFileName, IOContext.READONCE), sortedFileName);

        final ByteArrayDataInput decoder = new ByteArrayDataInput();
        final BytesRef termOnly = new BytesRef();
        long rank = 0;
        int lastBucket = 0;
        int lastScore = 0;
        for (BytesRef entry = seqReader.next(); entry != null; entry = seqReader.next()) {
            decoder.reset(entry.bytes, entry.offset, entry.length);
            final int score = decoder.readInt();
            // Equal scores must land in the same bucket as the entry before them.
            final int bucket = (rank > 0 && score == lastScore)
                ? lastBucket
                : (int) (rank * buckets / totalEntries);
            lastScore = score;
            lastBucket = bucket;

            // Hand over only the term bytes; the weight prefix is dropped.
            final int afterWeight = decoder.getPosition();
            termOnly.bytes = entry.bytes;
            termOnly.offset = entry.offset + afterWeight;
            termOnly.length = entry.length - afterWeight;
            fstBuilder.add(termOnly, bucket);

            rank++;
            count++;
        }

        // Both completions are backed by the same automaton.
        this.higherWeightsCompletion = fstBuilder.build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
    } finally {
        IOUtils.closeWhileHandlingException(seqReader, seqWriter, refSorter);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, unsortedOut.getName(), sortedFileName);
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) IndexOutput(org.apache.lucene.store.IndexOutput) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) BytesRef(org.apache.lucene.util.BytesRef)

Example 32 with IndexOutput

use of org.apache.lucene.store.IndexOutput in project lucene-solr by apache.

the class TestBKD method verify.

/**
 * Indexes the given values with {@link BKDWriter}, optionally as several segments that are then
 * merged, and checks random N-dimensional box queries against a brute-force scan of
 * {@code docValues}.
 *
 * @param dir directory in which the "bkd" (and, when merging, "bkd2") outputs are created
 * @param docValues values to index, addressed as {@code docValues[ord][dim]}; the first
 *     {@code numBytesPerDim} bytes of each entry are used
 * @param docIDs docID for each ord, or {@code null} to use the ord itself as the docID
 * @param numDims number of dimensions per value
 * @param numBytesPerDim number of bytes per dimension
 * @param maxPointsInLeafNode passed to the writer; re-randomized for later segments when merging
 * @param maxMB passed to the writer; re-randomized for later segments when merging
 * @throws Exception on any indexing, reading or assertion failure
 */
private void verify(Directory dir, byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim, int maxPointsInLeafNode, double maxMB) throws Exception {
    int numValues = docValues.length;
    if (VERBOSE) {
        System.out.println("TEST: numValues=" + numValues + " numDims=" + numDims + " numBytesPerDim=" + numBytesPerDim + " maxPointsInLeafNode=" + maxPointsInLeafNode + " maxMB=" + maxMB);
    }
    // File pointers returned by finish() for each segment; stays null unless the merge path runs.
    List<Long> toMerge = null;
    // One doc map per entry in toMerge, translating segment-relative docIDs back to absolute ones.
    List<MergeState.DocMap> docMaps = null;
    int seg = 0;
    BKDWriter w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, docValues.length, false);
    IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
    IndexInput in = null;
    boolean success = false;
    try {
        // Reused buffer holding one packed value: all dimensions laid out back to back.
        byte[] scratch = new byte[numBytesPerDim * numDims];
        int lastDocIDBase = 0;
        // The merge path is only exercised for 1D data with at least 10 values, and then randomly.
        boolean useMerge = numDims == 1 && numValues >= 10 && random().nextBoolean();
        int valuesInThisSeg;
        if (useMerge) {
            // Sometimes we will call merge with a single segment:
            valuesInThisSeg = TestUtil.nextInt(random(), numValues / 10, numValues);
        } else {
            valuesInThisSeg = 0;
        }
        int segCount = 0;
        for (int ord = 0; ord < numValues; ord++) {
            int docID;
            if (docIDs == null) {
                docID = ord;
            } else {
                docID = docIDs[ord];
            }
            if (VERBOSE) {
                System.out.println("  ord=" + ord + " docID=" + docID + " lastDocIDBase=" + lastDocIDBase);
            }
            for (int dim = 0; dim < numDims; dim++) {
                if (VERBOSE) {
                    System.out.println("    " + dim + " -> " + new BytesRef(docValues[ord][dim]));
                }
                System.arraycopy(docValues[ord][dim], 0, scratch, dim * numBytesPerDim, numBytesPerDim);
            }
            // docIDs are written relative to the current segment's base; the DocMap adds it back.
            w.add(scratch, docID - lastDocIDBase);
            segCount++;
            if (useMerge && segCount == valuesInThisSeg) {
                // This segment is full: finish it and start a new writer for the next one.
                if (toMerge == null) {
                    toMerge = new ArrayList<>();
                    docMaps = new ArrayList<>();
                }
                final int curDocIDBase = lastDocIDBase;
                docMaps.add(new MergeState.DocMap() {

                    @Override
                    public int get(int docID) {
                        return curDocIDBase + docID;
                    }
                });
                toMerge.add(w.finish(out));
                valuesInThisSeg = TestUtil.nextInt(random(), numValues / 10, numValues / 2);
                segCount = 0;
                seg++;
                maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 1000);
                maxMB = (float) 3.0 + (3 * random().nextDouble());
                w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, docValues.length, false);
                lastDocIDBase = docID;
            }
        }
        // File pointer at which the final tree is opened for the query phase.
        long indexFP;
        if (toMerge != null) {
            if (segCount > 0) {
                // Flush the trailing, partially filled segment.
                toMerge.add(w.finish(out));
                final int curDocIDBase = lastDocIDBase;
                docMaps.add(new MergeState.DocMap() {

                    @Override
                    public int get(int docID) {
                        return curDocIDBase + docID;
                    }
                });
            }
            out.close();
            in = dir.openInput("bkd", IOContext.DEFAULT);
            seg++;
            w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, docValues.length, false);
            // One reader per finished segment, each positioned at that segment's file pointer.
            List<BKDReader> readers = new ArrayList<>();
            for (long fp : toMerge) {
                in.seek(fp);
                readers.add(new BKDReader(in));
            }
            // The merged tree goes to a second file, which is then reopened for querying.
            out = dir.createOutput("bkd2", IOContext.DEFAULT);
            indexFP = w.merge(out, docMaps, readers);
            out.close();
            in.close();
            in = dir.openInput("bkd2", IOContext.DEFAULT);
        } else {
            indexFP = w.finish(out);
            out.close();
            in = dir.openInput("bkd", IOContext.DEFAULT);
        }
        in.seek(indexFP);
        BKDReader r = new BKDReader(in);
        int iters = atLeast(100);
        for (int iter = 0; iter < iters; iter++) {
            if (VERBOSE) {
                System.out.println("\nTEST: iter=" + iter);
            }
            // Random N dims rect query:
            byte[][] queryMin = new byte[numDims][];
            byte[][] queryMax = new byte[numDims][];
            for (int dim = 0; dim < numDims; dim++) {
                queryMin[dim] = new byte[numBytesPerDim];
                random().nextBytes(queryMin[dim]);
                queryMax[dim] = new byte[numBytesPerDim];
                random().nextBytes(queryMax[dim]);
                // Swap so that queryMin <= queryMax in every dimension.
                if (StringHelper.compare(numBytesPerDim, queryMin[dim], 0, queryMax[dim], 0) > 0) {
                    byte[] x = queryMin[dim];
                    queryMin[dim] = queryMax[dim];
                    queryMax[dim] = x;
                }
            }
            // docIDs reported by the reader for this query.
            final BitSet hits = new BitSet();
            r.intersect(new IntersectVisitor() {

                @Override
                public void visit(int docID) {
                    // Called without a value: the doc is recorded as a hit unconditionally.
                    hits.set(docID);
                //System.out.println("visit docID=" + docID);
                }

                @Override
                public void visit(int docID, byte[] packedValue) {
                    //System.out.println("visit check docID=" + docID);
                    // Record the doc only if the value lies inside the query box in every dimension.
                    for (int dim = 0; dim < numDims; dim++) {
                        if (StringHelper.compare(numBytesPerDim, packedValue, dim * numBytesPerDim, queryMin[dim], 0) < 0 || StringHelper.compare(numBytesPerDim, packedValue, dim * numBytesPerDim, queryMax[dim], 0) > 0) {
                            //System.out.println("  no");
                            return;
                        }
                    }
                    //System.out.println("  yes");
                    hits.set(docID);
                }

                @Override
                public Relation compare(byte[] minPacked, byte[] maxPacked) {
                    // Outside if the cell misses the query in any dimension; crossing if it sticks
                    // out of the query in any dimension; otherwise fully inside.
                    boolean crosses = false;
                    for (int dim = 0; dim < numDims; dim++) {
                        if (StringHelper.compare(numBytesPerDim, maxPacked, dim * numBytesPerDim, queryMin[dim], 0) < 0 || StringHelper.compare(numBytesPerDim, minPacked, dim * numBytesPerDim, queryMax[dim], 0) > 0) {
                            return Relation.CELL_OUTSIDE_QUERY;
                        } else if (StringHelper.compare(numBytesPerDim, minPacked, dim * numBytesPerDim, queryMin[dim], 0) < 0 || StringHelper.compare(numBytesPerDim, maxPacked, dim * numBytesPerDim, queryMax[dim], 0) > 0) {
                            crosses = true;
                        }
                    }
                    if (crosses) {
                        return Relation.CELL_CROSSES_QUERY;
                    } else {
                        return Relation.CELL_INSIDE_QUERY;
                    }
                }
            });
            // Brute-force expectation: scan every original value against the query box.
            BitSet expected = new BitSet();
            for (int ord = 0; ord < numValues; ord++) {
                boolean matches = true;
                for (int dim = 0; dim < numDims; dim++) {
                    byte[] x = docValues[ord][dim];
                    if (StringHelper.compare(numBytesPerDim, x, 0, queryMin[dim], 0) < 0 || StringHelper.compare(numBytesPerDim, x, 0, queryMax[dim], 0) > 0) {
                        matches = false;
                        break;
                    }
                }
                if (matches) {
                    int docID;
                    if (docIDs == null) {
                        docID = ord;
                    } else {
                        docID = docIDs[ord];
                    }
                    expected.set(docID);
                }
            }
            // Compare bit by bit so a failure names the offending docID.
            int limit = Math.max(expected.length(), hits.length());
            for (int docID = 0; docID < limit; docID++) {
                assertEquals("docID=" + docID, expected.get(docID), hits.get(docID));
            }
        }
        in.close();
        dir.deleteFile("bkd");
        if (toMerge != null) {
            dir.deleteFile("bkd2");
        }
        success = true;
    } finally {
        // On failure only: close and delete best-effort so the original exception is not masked.
        if (success == false) {
            IOUtils.closeWhileHandlingException(w, in, out);
            IOUtils.deleteFilesIgnoringExceptions(dir, "bkd", "bkd2");
        }
    }
}
Also used : IntersectVisitor(org.apache.lucene.index.PointValues.IntersectVisitor) MergeState(org.apache.lucene.index.MergeState) ArrayList(java.util.ArrayList) BitSet(java.util.BitSet) CorruptingIndexOutput(org.apache.lucene.store.CorruptingIndexOutput) IndexOutput(org.apache.lucene.store.IndexOutput) Relation(org.apache.lucene.index.PointValues.Relation) IndexInput(org.apache.lucene.store.IndexInput) BytesRef(org.apache.lucene.util.BytesRef)

Example 33 with IndexOutput

use of org.apache.lucene.store.IndexOutput in project lucene-solr by apache.

the class Test2BBKDPoints method test2D.

/**
 * Indexes slightly more than {@code Integer.MAX_VALUE} two-dimensional points (26 per document)
 * into a BKD tree and verifies the tree with {@code CheckIndex.VerifyPointsVisitor}.
 *
 * <p>The directory, output and input are held in try-with-resources so they are released even
 * when indexing or an assertion fails.
 *
 * @throws Exception on any indexing, reading or assertion failure
 */
public void test2D() throws Exception {
    try (Directory dir = FSDirectory.open(createTempDir("2BBKDPoints2D"))) {
        final int numDocs = (Integer.MAX_VALUE / 26) + 100;
        BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false);
        int counter = 0;
        byte[] packedBytes = new byte[2 * Long.BYTES];
        for (int docID = 0; docID < numDocs; docID++) {
            for (int j = 0; j < 26; j++) {
                // first a random int:
                NumericUtils.intToSortableBytes(random().nextInt(), packedBytes, 0);
                // then our counter, which will overflow a bit in the end:
                NumericUtils.intToSortableBytes(counter, packedBytes, Integer.BYTES);
                // then two random ints for the 2nd dimension:
                NumericUtils.intToSortableBytes(random().nextInt(), packedBytes, Long.BYTES);
                NumericUtils.intToSortableBytes(random().nextInt(), packedBytes, Long.BYTES + Integer.BYTES);
                w.add(packedBytes, docID);
                counter++;
            }
            if (VERBOSE && docID % 100000 == 0) {
                System.out.println(docID + " of " + numDocs + "...");
            }
        }
        // File pointer returned by finish(); the reader is positioned there before opening the tree.
        long indexFP;
        try (IndexOutput out = dir.createOutput("2d.bkd", IOContext.DEFAULT)) {
            indexFP = w.finish(out);
        }
        try (IndexInput in = dir.openInput("2d.bkd", IOContext.DEFAULT)) {
            in.seek(indexFP);
            BKDReader r = new BKDReader(in);
            CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("2d", numDocs, r);
            r.intersect(visitor);
            assertEquals(r.size(), visitor.getPointCountSeen());
            assertEquals(r.getDocCount(), visitor.getDocCountSeen());
        }
    }
}
Also used : IndexInput(org.apache.lucene.store.IndexInput) IndexOutput(org.apache.lucene.store.IndexOutput) CheckIndex(org.apache.lucene.index.CheckIndex) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory)

Example 34 with IndexOutput

use of org.apache.lucene.store.IndexOutput in project lucene-solr by apache.

the class Test2BBKDPoints method test1D.

/**
 * Indexes slightly more than {@code Integer.MAX_VALUE} one-dimensional points (26 per document)
 * into a BKD tree and verifies the tree with {@code CheckIndex.VerifyPointsVisitor}.
 *
 * <p>The directory, output and input are held in try-with-resources so they are released even
 * when indexing or an assertion fails.
 *
 * @throws Exception on any indexing, reading or assertion failure
 */
public void test1D() throws Exception {
    try (Directory dir = FSDirectory.open(createTempDir("2BBKDPoints1D"))) {
        final int numDocs = (Integer.MAX_VALUE / 26) + 100;
        BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false);
        int counter = 0;
        byte[] packedBytes = new byte[Long.BYTES];
        for (int docID = 0; docID < numDocs; docID++) {
            for (int j = 0; j < 26; j++) {
                // first a random int:
                NumericUtils.intToSortableBytes(random().nextInt(), packedBytes, 0);
                // then our counter, which will overflow a bit in the end:
                NumericUtils.intToSortableBytes(counter, packedBytes, Integer.BYTES);
                w.add(packedBytes, docID);
                counter++;
            }
            if (VERBOSE && docID % 100000 == 0) {
                System.out.println(docID + " of " + numDocs + "...");
            }
        }
        // File pointer returned by finish(); the reader is positioned there before opening the tree.
        long indexFP;
        try (IndexOutput out = dir.createOutput("1d.bkd", IOContext.DEFAULT)) {
            indexFP = w.finish(out);
        }
        try (IndexInput in = dir.openInput("1d.bkd", IOContext.DEFAULT)) {
            in.seek(indexFP);
            BKDReader r = new BKDReader(in);
            CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("1d", numDocs, r);
            r.intersect(visitor);
            assertEquals(r.size(), visitor.getPointCountSeen());
            assertEquals(r.getDocCount(), visitor.getDocCountSeen());
        }
    }
}
Also used : IndexInput(org.apache.lucene.store.IndexInput) IndexOutput(org.apache.lucene.store.IndexOutput) CheckIndex(org.apache.lucene.index.CheckIndex) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory)

Example 35 with IndexOutput

use of org.apache.lucene.store.IndexOutput in project lucene-solr by apache.

the class TestBKD method testBigIntNDims.

/**
 * Tests N-dimensional points where every dimension is a {@link BigInteger}: random points are
 * indexed with {@link BKDWriter}, then random box queries are run through {@link BKDReader} and
 * compared, doc by doc, with a direct check of the original values.
 *
 * @throws Exception on any indexing, reading or assertion failure
 */
public void testBigIntNDims() throws Exception {
    final int docCount = atLeast(1000);
    try (Directory directory = getDirectory(docCount)) {
        final int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
        final int dimCount = TestUtil.nextInt(random(), 1, 5);
        final int leafSize = TestUtil.nextInt(random(), 50, 100);
        final float heapMB = 3.0f + 3 * random().nextFloat();
        BKDWriter writer = new BKDWriter(docCount, directory, "tmp", dimCount, bytesPerDim, leafSize, heapMB, docCount, true);

        // Index one random point per document, remembering the originals for later checking.
        final BigInteger[][] indexed = new BigInteger[docCount][];
        final byte[] packed = new byte[bytesPerDim * dimCount];
        for (int docID = 0; docID < docCount; docID++) {
            BigInteger[] point = new BigInteger[dimCount];
            if (VERBOSE) {
                System.out.println("  docID=" + docID);
            }
            for (int dim = 0; dim < dimCount; dim++) {
                BigInteger value = randomBigInt(bytesPerDim);
                point[dim] = value;
                NumericUtils.bigIntToSortableBytes(value, bytesPerDim, packed, dim * bytesPerDim);
                if (VERBOSE) {
                    System.out.println("    " + dim + " -> " + value);
                }
            }
            indexed[docID] = point;
            writer.add(packed, docID);
        }

        final long treeStart;
        try (IndexOutput treeOut = directory.createOutput("bkd", IOContext.DEFAULT)) {
            treeStart = writer.finish(treeOut);
        }

        try (IndexInput treeIn = directory.openInput("bkd", IOContext.DEFAULT)) {
            treeIn.seek(treeStart);
            BKDReader reader = new BKDReader(treeIn);
            final int rounds = atLeast(100);
            for (int round = 0; round < rounds; round++) {
                if (VERBOSE) {
                    System.out.println("\nTEST: iter=" + round);
                }

                // Random box query, ordered so that lower <= upper in every dimension.
                final BigInteger[] lower = new BigInteger[dimCount];
                final BigInteger[] upper = new BigInteger[dimCount];
                for (int dim = 0; dim < dimCount; dim++) {
                    BigInteger first = randomBigInt(bytesPerDim);
                    BigInteger second = randomBigInt(bytesPerDim);
                    if (first.compareTo(second) > 0) {
                        lower[dim] = second;
                        upper[dim] = first;
                    } else {
                        lower[dim] = first;
                        upper[dim] = second;
                    }
                }

                final BitSet matched = new BitSet();
                reader.intersect(new IntersectVisitor() {

                    @Override
                    public void visit(int docID) {
                        // No value supplied: the doc is recorded as a hit unconditionally.
                        matched.set(docID);
                    }

                    @Override
                    public void visit(int docID, byte[] packedValue) {
                        // Record the doc only if every dimension lies within [lower, upper].
                        for (int dim = 0; dim < dimCount; dim++) {
                            BigInteger value = NumericUtils.sortableBytesToBigInt(packedValue, dim * bytesPerDim, bytesPerDim);
                            boolean inRange = value.compareTo(lower[dim]) >= 0 && value.compareTo(upper[dim]) <= 0;
                            if (!inRange) {
                                return;
                            }
                        }
                        matched.set(docID);
                    }

                    @Override
                    public Relation compare(byte[] minPacked, byte[] maxPacked) {
                        // Starts as "inside"; downgraded to "crosses" when the cell sticks out of
                        // the query in some dimension, and cut short when it misses the query.
                        Relation relation = Relation.CELL_INSIDE_QUERY;
                        for (int dim = 0; dim < dimCount; dim++) {
                            BigInteger cellMin = NumericUtils.sortableBytesToBigInt(minPacked, dim * bytesPerDim, bytesPerDim);
                            BigInteger cellMax = NumericUtils.sortableBytesToBigInt(maxPacked, dim * bytesPerDim, bytesPerDim);
                            assert cellMax.compareTo(cellMin) >= 0;
                            if (cellMax.compareTo(lower[dim]) < 0 || cellMin.compareTo(upper[dim]) > 0) {
                                return Relation.CELL_OUTSIDE_QUERY;
                            }
                            if (cellMin.compareTo(lower[dim]) < 0 || cellMax.compareTo(upper[dim]) > 0) {
                                relation = Relation.CELL_CROSSES_QUERY;
                            }
                        }
                        return relation;
                    }
                });

                // Every document must be reported exactly when its original point is in the box.
                for (int docID = 0; docID < docCount; docID++) {
                    BigInteger[] point = indexed[docID];
                    boolean shouldMatch = true;
                    for (int dim = 0; shouldMatch && dim < dimCount; dim++) {
                        BigInteger value = point[dim];
                        shouldMatch = value.compareTo(lower[dim]) >= 0 && value.compareTo(upper[dim]) <= 0;
                    }
                    assertEquals("docID=" + docID, shouldMatch, matched.get(docID));
                }
            }
        }
    }
}
Also used : IntersectVisitor(org.apache.lucene.index.PointValues.IntersectVisitor) BitSet(java.util.BitSet) CorruptingIndexOutput(org.apache.lucene.store.CorruptingIndexOutput) IndexOutput(org.apache.lucene.store.IndexOutput) Relation(org.apache.lucene.index.PointValues.Relation) BigInteger(java.math.BigInteger) IndexInput(org.apache.lucene.store.IndexInput) FilterDirectory(org.apache.lucene.store.FilterDirectory) Directory(org.apache.lucene.store.Directory)

Aggregations

IndexOutput (org.apache.lucene.store.IndexOutput)182 Directory (org.apache.lucene.store.Directory)79 IndexInput (org.apache.lucene.store.IndexInput)76 RAMDirectory (org.apache.lucene.store.RAMDirectory)36 FilterDirectory (org.apache.lucene.store.FilterDirectory)34 CorruptIndexException (org.apache.lucene.index.CorruptIndexException)27 ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput)27 BytesRef (org.apache.lucene.util.BytesRef)26 IOException (java.io.IOException)20 CorruptingIndexOutput (org.apache.lucene.store.CorruptingIndexOutput)18 RAMFile (org.apache.lucene.store.RAMFile)16 RAMOutputStream (org.apache.lucene.store.RAMOutputStream)16 IndexFormatTooNewException (org.apache.lucene.index.IndexFormatTooNewException)14 IndexFormatTooOldException (org.apache.lucene.index.IndexFormatTooOldException)14 IOContext (org.apache.lucene.store.IOContext)13 ArrayList (java.util.ArrayList)11 BufferedChecksumIndexInput (org.apache.lucene.store.BufferedChecksumIndexInput)11 RAMInputStream (org.apache.lucene.store.RAMInputStream)11 NIOFSDirectory (org.apache.lucene.store.NIOFSDirectory)10 NRTCachingDirectory (org.apache.lucene.store.NRTCachingDirectory)10