Use of org.apache.lucene.store.IndexOutput in project lucene-solr by Apache.
Class FSTCompletionLookup, method build.
/**
 * Builds the two completion automata from the entries supplied by {@code iterator}.
 *
 * <p>Every entry is written to a temporary file as a 4-byte encoded weight followed by the
 * entry's bytes. The file is then sorted offline, and each sorted entry is assigned a bucket
 * proportional to its position in the sorted sequence (entries with the same encoded weight as
 * their predecessor reuse the predecessor's bucket) before being handed to the FST builder.
 * Temporary files are removed and readers/writers closed whether or not the build succeeds.
 *
 * @param iterator source of entries and their weights; must not expose payloads or contexts
 * @throws IllegalArgumentException if the iterator reports payloads or contexts
 * @throws IOException if reading the input or writing/sorting the temporary files fails
 */
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }

  // Width of the weight prefix written by writeInt() in front of every entry.
  final int weightBytes = Integer.BYTES;

  OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix);
  ExternalRefSorter externalSorter = new ExternalRefSorter(sorter);
  IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
  String tempSortedFileName = null;

  OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
  OfflineSorter.ByteSequencesReader reader = null;

  // Push floats up front before sequences to sort them. For now, assume they are non-negative.
  // If negative floats are allowed some trickery needs to be done to find their byte order.
  count = 0;
  try {
    byte[] buffer = new byte[0];
    ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
    BytesRef spare;
    int inputLineCount = 0;
    while ((spare = iterator.next()) != null) {
      if (spare.length + weightBytes >= buffer.length) {
        buffer = ArrayUtil.grow(buffer, spare.length + weightBytes);
      }

      // Re-point the output at the (possibly reallocated) buffer and rewind it.
      output.reset(buffer);
      output.writeInt(encodeWeight(iterator.weight()));
      output.writeBytes(spare.bytes, spare.offset, spare.length);
      writer.write(buffer, 0, output.getPosition());
      inputLineCount++;
    }
    // The footer must go in before the writer (and with it the underlying output) is closed.
    CodecUtil.writeFooter(tempInput);
    writer.close();

    // We don't know the distribution of scores and we need to bucket them, so we'll sort
    // and divide into equal buckets.
    tempSortedFileName = sorter.sort(tempInput.getName());
    tempDir.deleteFile(tempInput.getName());

    FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, externalSorter, sharedTailLength);

    reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
    long line = 0;
    int previousBucket = 0;
    int previousScore = 0;
    ByteArrayDataInput input = new ByteArrayDataInput();
    BytesRef tmp2 = new BytesRef();
    while (true) {
      BytesRef scratch = reader.next();
      if (scratch == null) {
        break;
      }
      input.reset(scratch.bytes, scratch.offset, scratch.length);
      int currentScore = input.readInt();

      // Equal scores must land in the same bucket; otherwise bucket by rank in the sorted file.
      int bucket;
      if (line > 0 && currentScore == previousScore) {
        bucket = previousBucket;
      } else {
        bucket = (int) (line * buckets / inputLineCount);
      }
      previousScore = currentScore;
      previousBucket = bucket;

      // Only append the input, discard the weight.
      // Slice by the fixed prefix width rather than by input.getPosition().
      // NOTE(review): getPosition() appears to report an index into the backing array, which
      // would double-count scratch.offset when it is non-zero -- confirm. Slicing by the
      // constant is identical for offset 0 and correct for any other offset.
      tmp2.bytes = scratch.bytes;
      tmp2.offset = scratch.offset + weightBytes;
      tmp2.length = scratch.length - weightBytes;
      builder.add(tmp2, bucket);

      line++;
      count++;
    }

    // The two FSTCompletions share the same automaton.
    this.higherWeightsCompletion = builder.build();
    this.normalCompletion = new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
  } finally {
    IOUtils.closeWhileHandlingException(reader, writer, externalSorter);
    IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
  }
}
Use of org.apache.lucene.store.IndexOutput in project lucene-solr by Apache.
Class TestBKD, method verify.
/**
 * Indexes {@code docValues} into a BKD tree stored in {@code dir} and checks random rectangle
 * queries against a brute-force scan of the same values.
 *
 * <p>For one-dimensional data with at least 10 values the points are, at random, split across
 * several segments that are then merged into a second file. The files {@code "bkd"} and
 * {@code "bkd2"} are removed on success and, best effort, on failure.
 *
 * @param dir directory that receives the tree files
 * @param docValues values to index, addressed as {@code [ord][dim]}
 * @param docIDs doc ID for each ord, or {@code null} to use the ord itself
 * @param numDims number of dimensions per point
 * @param numBytesPerDim number of bytes in each dimension
 * @param maxPointsInLeafNode handed to the writer; re-randomized for every additional segment
 * @param maxMB handed to the writer; re-randomized for every additional segment
 * @throws Exception if indexing, reading or an assertion fails
 */
private void verify(Directory dir, byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim, int maxPointsInLeafNode, double maxMB) throws Exception {
  final int numValues = docValues.length;
  if (VERBOSE) {
    System.out.println("TEST: numValues=" + numValues + " numDims=" + numDims + " numBytesPerDim=" + numBytesPerDim + " maxPointsInLeafNode=" + maxPointsInLeafNode + " maxMB=" + maxMB);
  }

  // Both lists stay null unless at least one segment is cut off for merging.
  List<Long> segmentFPs = null;
  List<MergeState.DocMap> docMaps = null;
  int seg = 0;
  BKDWriter w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, docValues.length, false);
  IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
  IndexInput in = null;
  boolean success = false;
  try {
    final byte[] packed = new byte[numBytesPerDim * numDims];
    int lastDocIDBase = 0;
    final boolean useMerge = numDims == 1 && numValues >= 10 && random().nextBoolean();
    // Sometimes we will call merge with a single segment:
    int valuesInThisSeg = useMerge ? TestUtil.nextInt(random(), numValues / 10, numValues) : 0;
    int segCount = 0;

    for (int ord = 0; ord < numValues; ord++) {
      final int docID = (docIDs == null) ? ord : docIDs[ord];
      if (VERBOSE) {
        System.out.println(" ord=" + ord + " docID=" + docID + " lastDocIDBase=" + lastDocIDBase);
      }
      for (int dim = 0; dim < numDims; dim++) {
        final byte[] value = docValues[ord][dim];
        if (VERBOSE) {
          System.out.println(" " + dim + " -> " + new BytesRef(value));
        }
        System.arraycopy(value, 0, packed, dim * numBytesPerDim, numBytesPerDim);
      }
      w.add(packed, docID - lastDocIDBase);
      segCount++;

      if (!useMerge || segCount != valuesInThisSeg) {
        continue;
      }

      // The current segment is full: finish it and start a fresh writer with new settings.
      if (segmentFPs == null) {
        segmentFPs = new ArrayList<>();
        docMaps = new ArrayList<>();
      }
      docMaps.add(docMapWithBase(lastDocIDBase));
      segmentFPs.add(w.finish(out));
      valuesInThisSeg = TestUtil.nextInt(random(), numValues / 10, numValues / 2);
      segCount = 0;
      seg++;
      maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 1000);
      maxMB = (float) 3.0 + (3 * random().nextDouble());
      w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, docValues.length, false);
      lastDocIDBase = docID;
    }

    final long indexFP;
    if (segmentFPs == null) {
      // Single tree: finish it and read it back directly.
      indexFP = w.finish(out);
      out.close();
      in = dir.openInput("bkd", IOContext.DEFAULT);
    } else {
      // Flush the trailing partial segment, if any, then merge all segments into "bkd2".
      if (segCount > 0) {
        segmentFPs.add(w.finish(out));
        docMaps.add(docMapWithBase(lastDocIDBase));
      }
      out.close();
      in = dir.openInput("bkd", IOContext.DEFAULT);
      seg++;
      w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, docValues.length, false);
      List<BKDReader> readers = new ArrayList<>();
      for (long fp : segmentFPs) {
        in.seek(fp);
        readers.add(new BKDReader(in));
      }
      out = dir.createOutput("bkd2", IOContext.DEFAULT);
      indexFP = w.merge(out, docMaps, readers);
      out.close();
      in.close();
      in = dir.openInput("bkd2", IOContext.DEFAULT);
    }

    in.seek(indexFP);
    final BKDReader r = new BKDReader(in);

    final int iters = atLeast(100);
    for (int iter = 0; iter < iters; iter++) {
      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Random N dims rect query:
      final byte[][] queryMin = new byte[numDims][];
      final byte[][] queryMax = new byte[numDims][];
      for (int dim = 0; dim < numDims; dim++) {
        queryMin[dim] = new byte[numBytesPerDim];
        random().nextBytes(queryMin[dim]);
        queryMax[dim] = new byte[numBytesPerDim];
        random().nextBytes(queryMax[dim]);
        if (StringHelper.compare(numBytesPerDim, queryMin[dim], 0, queryMax[dim], 0) > 0) {
          // Keep min <= max in every dimension.
          final byte[] swap = queryMin[dim];
          queryMin[dim] = queryMax[dim];
          queryMax[dim] = swap;
        }
      }

      final BitSet hits = new BitSet();
      r.intersect(new IntersectVisitor() {
        @Override
        public void visit(int docID) {
          hits.set(docID);
        }

        @Override
        public void visit(int docID, byte[] packedValue) {
          for (int dim = 0; dim < numDims; dim++) {
            final int offset = dim * numBytesPerDim;
            final boolean inRange = StringHelper.compare(numBytesPerDim, packedValue, offset, queryMin[dim], 0) >= 0
                && StringHelper.compare(numBytesPerDim, packedValue, offset, queryMax[dim], 0) <= 0;
            if (!inRange) {
              return;
            }
          }
          hits.set(docID);
        }

        @Override
        public Relation compare(byte[] minPacked, byte[] maxPacked) {
          boolean crosses = false;
          for (int dim = 0; dim < numDims; dim++) {
            final int offset = dim * numBytesPerDim;
            if (StringHelper.compare(numBytesPerDim, maxPacked, offset, queryMin[dim], 0) < 0
                || StringHelper.compare(numBytesPerDim, minPacked, offset, queryMax[dim], 0) > 0) {
              return Relation.CELL_OUTSIDE_QUERY;
            }
            if (StringHelper.compare(numBytesPerDim, minPacked, offset, queryMin[dim], 0) < 0
                || StringHelper.compare(numBytesPerDim, maxPacked, offset, queryMax[dim], 0) > 0) {
              crosses = true;
            }
          }
          return crosses ? Relation.CELL_CROSSES_QUERY : Relation.CELL_INSIDE_QUERY;
        }
      });

      // Brute-force the expected result from the raw values.
      final BitSet expected = new BitSet();
      for (int ord = 0; ord < numValues; ord++) {
        boolean matches = true;
        for (int dim = 0; matches && dim < numDims; dim++) {
          final byte[] value = docValues[ord][dim];
          matches = StringHelper.compare(numBytesPerDim, value, 0, queryMin[dim], 0) >= 0
              && StringHelper.compare(numBytesPerDim, value, 0, queryMax[dim], 0) <= 0;
        }
        if (matches) {
          expected.set((docIDs == null) ? ord : docIDs[ord]);
        }
      }

      final int limit = Math.max(expected.length(), hits.length());
      for (int docID = 0; docID < limit; docID++) {
        assertEquals("docID=" + docID, expected.get(docID), hits.get(docID));
      }
    }

    in.close();
    dir.deleteFile("bkd");
    if (segmentFPs != null) {
      dir.deleteFile("bkd2");
    }
    success = true;
  } finally {
    if (!success) {
      IOUtils.closeWhileHandlingException(w, in, out);
      IOUtils.deleteFilesIgnoringExceptions(dir, "bkd", "bkd2");
    }
  }
}

/** Returns a doc map that adds {@code docIDBase} to every doc ID passed to it. */
private static MergeState.DocMap docMapWithBase(final int docIDBase) {
  return new MergeState.DocMap() {
    @Override
    public int get(int docID) {
      return docIDBase + docID;
    }
  };
}
Use of org.apache.lucene.store.IndexOutput in project lucene-solr by Apache.
Class Test2BBKDPoints, method test2D.
/**
 * Writes 26 two-dimensional points for each of {@code numDocs} documents (more than
 * {@code Integer.MAX_VALUE} points in total) into a BKD tree, then reads the tree back and
 * checks that a full traversal sees the point and document counts the reader reports.
 *
 * <p>The directory, output and input are managed with try-with-resources so that they are
 * released even when indexing or an assertion fails.
 *
 * @throws Exception if indexing, reading or an assertion fails
 */
public void test2D() throws Exception {
  try (Directory dir = FSDirectory.open(createTempDir("2BBKDPoints2D"))) {
    final int numDocs = (Integer.MAX_VALUE / 26) + 100;

    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false);
    int counter = 0;
    byte[] packedBytes = new byte[2 * Long.BYTES];
    for (int docID = 0; docID < numDocs; docID++) {
      for (int j = 0; j < 26; j++) {
        // first a random int:
        NumericUtils.intToSortableBytes(random().nextInt(), packedBytes, 0);
        // then our counter, which will overflow a bit in the end:
        NumericUtils.intToSortableBytes(counter, packedBytes, Integer.BYTES);
        // then two random ints for the 2nd dimension:
        NumericUtils.intToSortableBytes(random().nextInt(), packedBytes, Long.BYTES);
        NumericUtils.intToSortableBytes(random().nextInt(), packedBytes, Long.BYTES + Integer.BYTES);
        w.add(packedBytes, docID);
        counter++;
      }
      if (VERBOSE && docID % 100000 == 0) {
        System.out.println(docID + " of " + numDocs + "...");
      }
    }

    // The output must be closed before the file is reopened for reading.
    long indexFP;
    try (IndexOutput out = dir.createOutput("2d.bkd", IOContext.DEFAULT)) {
      indexFP = w.finish(out);
    }

    try (IndexInput in = dir.openInput("2d.bkd", IOContext.DEFAULT)) {
      in.seek(indexFP);
      BKDReader r = new BKDReader(in);
      CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("2d", numDocs, r);
      r.intersect(visitor);
      assertEquals(r.size(), visitor.getPointCountSeen());
      assertEquals(r.getDocCount(), visitor.getDocCountSeen());
    }
  }
}
Use of org.apache.lucene.store.IndexOutput in project lucene-solr by Apache.
Class Test2BBKDPoints, method test1D.
/**
 * Writes 26 one-dimensional points for each of {@code numDocs} documents (more than
 * {@code Integer.MAX_VALUE} points in total) into a BKD tree, then reads the tree back and
 * checks that a full traversal sees the point and document counts the reader reports.
 *
 * <p>The directory, output and input are managed with try-with-resources so that they are
 * released even when indexing or an assertion fails.
 *
 * @throws Exception if indexing, reading or an assertion fails
 */
public void test1D() throws Exception {
  try (Directory dir = FSDirectory.open(createTempDir("2BBKDPoints1D"))) {
    final int numDocs = (Integer.MAX_VALUE / 26) + 100;

    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, 26L * numDocs, false);
    int counter = 0;
    byte[] packedBytes = new byte[Long.BYTES];
    for (int docID = 0; docID < numDocs; docID++) {
      for (int j = 0; j < 26; j++) {
        // first a random int:
        NumericUtils.intToSortableBytes(random().nextInt(), packedBytes, 0);
        // then our counter, which will overflow a bit in the end:
        NumericUtils.intToSortableBytes(counter, packedBytes, Integer.BYTES);
        w.add(packedBytes, docID);
        counter++;
      }
      if (VERBOSE && docID % 100000 == 0) {
        System.out.println(docID + " of " + numDocs + "...");
      }
    }

    // The output must be closed before the file is reopened for reading.
    long indexFP;
    try (IndexOutput out = dir.createOutput("1d.bkd", IOContext.DEFAULT)) {
      indexFP = w.finish(out);
    }

    try (IndexInput in = dir.openInput("1d.bkd", IOContext.DEFAULT)) {
      in.seek(indexFP);
      BKDReader r = new BKDReader(in);
      CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("1d", numDocs, r);
      r.intersect(visitor);
      assertEquals(r.size(), visitor.getPointCountSeen());
      assertEquals(r.getDocCount(), visitor.getDocCountSeen());
    }
  }
}
Use of org.apache.lucene.store.IndexOutput in project lucene-solr by Apache.
Class TestBKD, method testBigIntNDims.
/**
 * Tests on N-dimensional points where each dimension is a BigInteger: indexes random points
 * into a BKD tree, then runs random rectangle queries and compares every document's hit status
 * with a brute-force check on the original values.
 *
 * @throws Exception if indexing, reading or an assertion fails
 */
public void testBigIntNDims() throws Exception {
  final int numDocs = atLeast(1000);
  try (Directory dir = getDirectory(numDocs)) {
    final int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
    final int numDims = TestUtil.nextInt(random(), 1, 5);
    final int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 100);
    final float maxMB = (float) 3.0 + (3 * random().nextFloat());
    final BKDWriter w = new BKDWriter(numDocs, dir, "tmp", numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, numDocs, true);

    // Keep the unencoded points around for the brute-force comparison below.
    final BigInteger[][] docs = new BigInteger[numDocs][];
    final byte[] packed = new byte[numBytesPerDim * numDims];
    for (int docID = 0; docID < numDocs; docID++) {
      if (VERBOSE) {
        System.out.println(" docID=" + docID);
      }
      final BigInteger[] point = new BigInteger[numDims];
      for (int dim = 0; dim < numDims; dim++) {
        point[dim] = randomBigInt(numBytesPerDim);
        NumericUtils.bigIntToSortableBytes(point[dim], numBytesPerDim, packed, dim * numBytesPerDim);
        if (VERBOSE) {
          System.out.println(" " + dim + " -> " + point[dim]);
        }
      }
      docs[docID] = point;
      w.add(packed, docID);
    }

    final long indexFP;
    try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
      indexFP = w.finish(out);
    }

    try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
      in.seek(indexFP);
      final BKDReader r = new BKDReader(in);

      final int iters = atLeast(100);
      for (int iter = 0; iter < iters; iter++) {
        if (VERBOSE) {
          System.out.println("\nTEST: iter=" + iter);
        }

        // Random N dims rect query:
        final BigInteger[] lower = new BigInteger[numDims];
        final BigInteger[] upper = new BigInteger[numDims];
        for (int dim = 0; dim < numDims; dim++) {
          lower[dim] = randomBigInt(numBytesPerDim);
          upper[dim] = randomBigInt(numBytesPerDim);
          if (lower[dim].compareTo(upper[dim]) > 0) {
            // Keep lower <= upper in every dimension.
            final BigInteger swap = lower[dim];
            lower[dim] = upper[dim];
            upper[dim] = swap;
          }
        }

        final BitSet hits = new BitSet();
        r.intersect(new IntersectVisitor() {
          @Override
          public void visit(int docID) {
            hits.set(docID);
          }

          @Override
          public void visit(int docID, byte[] packedValue) {
            for (int dim = 0; dim < numDims; dim++) {
              final BigInteger value = NumericUtils.sortableBytesToBigInt(packedValue, dim * numBytesPerDim, numBytesPerDim);
              final boolean inRange = value.compareTo(lower[dim]) >= 0 && value.compareTo(upper[dim]) <= 0;
              if (!inRange) {
                return;
              }
            }
            hits.set(docID);
          }

          @Override
          public Relation compare(byte[] minPacked, byte[] maxPacked) {
            boolean crosses = false;
            for (int dim = 0; dim < numDims; dim++) {
              final int offset = dim * numBytesPerDim;
              final BigInteger min = NumericUtils.sortableBytesToBigInt(minPacked, offset, numBytesPerDim);
              final BigInteger max = NumericUtils.sortableBytesToBigInt(maxPacked, offset, numBytesPerDim);
              assert max.compareTo(min) >= 0;

              if (max.compareTo(lower[dim]) < 0 || min.compareTo(upper[dim]) > 0) {
                return Relation.CELL_OUTSIDE_QUERY;
              }
              if (min.compareTo(lower[dim]) < 0 || max.compareTo(upper[dim]) > 0) {
                crosses = true;
              }
            }
            return crosses ? Relation.CELL_CROSSES_QUERY : Relation.CELL_INSIDE_QUERY;
          }
        });

        // Every document must be a hit exactly when its point lies inside the query box.
        for (int docID = 0; docID < numDocs; docID++) {
          final BigInteger[] point = docs[docID];
          boolean expected = true;
          for (int dim = 0; expected && dim < numDims; dim++) {
            final BigInteger value = point[dim];
            expected = value.compareTo(lower[dim]) >= 0 && value.compareTo(upper[dim]) <= 0;
          }
          final boolean actual = hits.get(docID);
          assertEquals("docID=" + docID, expected, actual);
        }
      }
    }
  }
}
Aggregations