use of org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output in project lucene-solr by apache.
the class OrdsSegmentTermsEnum method getByOutput.
// TODO: this is similar to Util.getByOutput ... can we refactor/share?
/** Specialized getByOutput that can understand the ranges (startOrd to endOrd) we use here, not just startOrd. */
private InputOutput getByOutput(long targetOrd) throws IOException {
final IntsRefBuilder result = new IntsRefBuilder();
fr.index.getFirstArc(arc);
Output output = arc.output;
int upto = 0;
int bestUpto = 0;
Output bestOutput = null;
while (true) {
// System.out.println(" loop: output=" + output.startOrd + "-" + (Long.MAX_VALUE-output.endOrd) + " upto=" + upto + " arc=" + arc + " final?=" + arc.isFinal());
if (arc.isFinal()) {
final Output finalOutput = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput);
// System.out.println(" isFinal: " + finalOutput.startOrd + "-" + (Long.MAX_VALUE-finalOutput.endOrd));
if (targetOrd >= finalOutput.startOrd && targetOrd <= Long.MAX_VALUE - finalOutput.endOrd) {
// Only one range should match across all arc leaving this node
//assert bestOutput == null;
bestOutput = finalOutput;
bestUpto = upto;
}
}
if (FST.targetHasArcs(arc)) {
// System.out.println(" targetHasArcs");
result.grow(1 + upto);
fr.index.readFirstRealTargetArc(arc.target, arc, fstReader);
if (arc.bytesPerArc != 0) {
// System.out.println(" array arcs");
int low = 0;
int high = arc.numArcs - 1;
int mid = 0;
//System.out.println("bsearch: numArcs=" + arc.numArcs + " target=" + targetOutput + " output=" + output);
boolean found = false;
while (low <= high) {
mid = (low + high) >>> 1;
fstReader.setPosition(arc.posArcsStart);
fstReader.skipBytes(arc.bytesPerArc * mid);
final byte flags = fstReader.readByte();
fr.index.readLabel(fstReader);
final Output minArcOutput;
if ((flags & FST.BIT_ARC_HAS_OUTPUT) != 0) {
minArcOutput = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, OrdsBlockTreeTermsWriter.FST_OUTPUTS.read(fstReader));
} else {
minArcOutput = output;
}
// System.out.println(" cycle mid=" + mid + " targetOrd=" + targetOrd + " output=" + minArcOutput.startOrd + "-" + (Long.MAX_VALUE-minArcOutput.endOrd));
if (targetOrd > Long.MAX_VALUE - minArcOutput.endOrd) {
low = mid + 1;
} else if (targetOrd < minArcOutput.startOrd) {
high = mid - 1;
} else {
// System.out.println(" found!!");
found = true;
break;
}
}
if (found) {
// Keep recursing
arc.arcIdx = mid - 1;
} else {
result.setLength(bestUpto);
InputOutput io = new InputOutput();
io.input = result.get();
io.output = bestOutput;
// System.out.println(" ret0=" + io);
return io;
}
fr.index.readNextRealArc(arc, fstReader);
// Recurse on this arc:
result.setIntAt(upto++, arc.label);
output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
} else {
while (true) {
// System.out.println(" cycle label=" + arc.label + " output=" + arc.output);
// This is the min output we'd hit if we follow
// this arc:
final Output minArcOutput = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
long endOrd = Long.MAX_VALUE - minArcOutput.endOrd;
if (targetOrd >= minArcOutput.startOrd && targetOrd <= endOrd) {
// Recurse on this arc:
output = minArcOutput;
result.setIntAt(upto++, arc.label);
break;
} else if (targetOrd < endOrd || arc.isLast()) {
result.setLength(bestUpto);
InputOutput io = new InputOutput();
io.input = result.get();
assert bestOutput != null;
io.output = bestOutput;
// System.out.println(" ret2=" + io);
return io;
} else {
// System.out.println(" next arc");
// Read next arc in this node:
fr.index.readNextRealArc(arc, fstReader);
}
}
}
} else {
result.setLength(bestUpto);
InputOutput io = new InputOutput();
io.input = result.get();
io.output = bestOutput;
// System.out.println(" ret3=" + io);
return io;
}
}
}
use of org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output in project lucene-solr by apache.
the class OrdsSegmentTermsEnum method seekExact.
@Override
public boolean seekExact(final BytesRef target) throws IOException {
if (fr.index == null) {
throw new IllegalStateException("terms index was not loaded");
}
term.grow(1 + target.length);
assert clearEOF();
/*
if (DEBUG) {
System.out.println("\nBTTR.seekExact seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + brToString(target) + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix);
printSeekState(System.out);
}
*/
FST.Arc<Output> arc;
int targetUpto;
Output output;
targetBeforeCurrentLength = currentFrame.ord;
if (positioned && currentFrame != staticFrame) {
// We are already seek'd; find the common
// prefix of new seek term vs current term and
// re-use the corresponding seek state. For
// example, if app first seeks to foobar, then
// seeks to foobaz, we can re-use the seek state
// for the first 5 bytes.
// if (DEBUG) {
// System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix);
// }
arc = arcs[0];
assert arc.isFinal();
output = arc.output;
targetUpto = 0;
OrdsSegmentTermsEnumFrame lastFrame = stack[0];
assert validIndexPrefix <= term.length();
final int targetLimit = Math.min(target.length, validIndexPrefix);
int cmp = 0;
// First compare up to valid seek frames:
while (targetUpto < targetLimit) {
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
// }
if (cmp != 0) {
break;
}
arc = arcs[1 + targetUpto];
assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF) : "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
if (arc.output != OrdsBlockTreeTermsWriter.NO_OUTPUT) {
output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
if (arc.isFinal()) {
lastFrame = stack[1 + lastFrame.ord];
}
targetUpto++;
}
if (cmp == 0) {
final int targetUptoMid = targetUpto;
// Second compare the rest of the term, but
// don't save arc/output/frame; we only do this
// to find out if the target term is before,
// equal or after the current term
final int targetLimit2 = Math.min(target.length, term.length());
while (targetUpto < targetLimit2) {
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
// }
if (cmp != 0) {
break;
}
targetUpto++;
}
if (cmp == 0) {
cmp = term.length() - target.length;
}
targetUpto = targetUptoMid;
}
if (cmp < 0) {
// Common case: target term is after current
// term, ie, app is seeking multiple terms
// in sorted order
// if (DEBUG) {
// System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord);
// }
currentFrame = lastFrame;
} else if (cmp > 0) {
// Uncommon case: target term
// is before current term; this means we can
// keep the currentFrame but we must rewind it
// (so we scan from the start)
targetBeforeCurrentLength = lastFrame.ord;
// if (DEBUG) {
// System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
// }
currentFrame = lastFrame;
currentFrame.rewind();
} else {
// Target is exactly the same as current term
assert term.length() == target.length;
if (termExists) {
// }
return true;
} else {
// if (DEBUG) {
// System.out.println(" target is same as current but term doesn't exist");
// }
}
//validIndexPrefix = currentFrame.depth;
//term.length = target.length;
//return termExists;
}
} else {
targetBeforeCurrentLength = -1;
arc = fr.index.getFirstArc(arcs[0]);
// Empty string prefix must have an output (block) in the index!
assert arc.isFinal();
assert arc.output != null;
// if (DEBUG) {
// System.out.println(" no seek state; push root frame");
// }
output = arc.output;
currentFrame = staticFrame;
//term.length = 0;
targetUpto = 0;
currentFrame = pushFrame(arc, OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
}
positioned = true;
// We are done sharing the common prefix with the incoming target and where we are currently seek'd; now continue walking the index:
while (targetUpto < target.length) {
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
final FST.Arc<Output> nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1 + targetUpto), fstReader);
if (nextArc == null) {
// Index is exhausted
// if (DEBUG) {
// System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
// }
validIndexPrefix = currentFrame.prefix;
//validIndexPrefix = targetUpto;
currentFrame.scanToFloorFrame(target);
if (!currentFrame.hasTerms) {
termExists = false;
term.setByteAt(targetUpto, (byte) targetLabel);
term.setLength(1 + targetUpto);
// }
return false;
}
currentFrame.loadBlock();
final SeekStatus result = currentFrame.scanToTerm(target, true);
if (result == SeekStatus.FOUND) {
// }
return true;
} else {
// }
return false;
}
} else {
// Follow this arc
arc = nextArc;
term.setByteAt(targetUpto, (byte) targetLabel);
// Aggregate output as we go:
assert arc.output != null;
if (arc.output != OrdsBlockTreeTermsWriter.NO_OUTPUT) {
output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
// if (DEBUG) {
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
// }
targetUpto++;
if (arc.isFinal()) {
//if (DEBUG) System.out.println(" arc is final!");
currentFrame = pushFrame(arc, OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
//if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
}
}
}
//validIndexPrefix = targetUpto;
validIndexPrefix = currentFrame.prefix;
currentFrame.scanToFloorFrame(target);
// Target term is entirely contained in the index:
if (!currentFrame.hasTerms) {
termExists = false;
term.setLength(targetUpto);
// }
return false;
}
currentFrame.loadBlock();
final SeekStatus result = currentFrame.scanToTerm(target, true);
if (result == SeekStatus.FOUND) {
// }
return true;
} else {
return false;
}
}
use of org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output in project lucene-solr by apache.
the class OrdsIntersectTermsEnum method pushFrame.
private OrdsIntersectTermsEnumFrame pushFrame(int state) throws IOException {
final OrdsIntersectTermsEnumFrame f = getFrame(currentFrame == null ? 0 : 1 + currentFrame.ord);
f.fp = f.fpOrig = currentFrame.lastSubFP;
f.prefix = currentFrame.prefix + currentFrame.suffix;
// if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix);
f.setState(state);
// Walk the arc through the index -- we only
// "bother" with this so we can get the floor data
// from the index and skip floor blocks when
// possible:
FST.Arc<Output> arc = currentFrame.arc;
int idx = currentFrame.prefix;
assert currentFrame.suffix > 0;
Output output = currentFrame.outputPrefix;
while (idx < f.prefix) {
final int target = term.bytes[idx] & 0xff;
// TODO: we could be more efficient for the next()
// case by using current arc as starting point,
// passed to findTargetArc
arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader);
assert arc != null;
output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
idx++;
}
f.arc = arc;
f.outputPrefix = output;
assert arc.isFinal();
f.load(OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput));
return f;
}
use of org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output in project lucene-solr by apache.
the class OrdsSegmentTermsEnum method printSeekState.
@SuppressWarnings("unused")
private void printSeekState(PrintStream out) throws IOException {
if (currentFrame == staticFrame) {
out.println(" no prior seek");
} else {
out.println(" prior seek state:");
int ord = 0;
boolean isSeekFrame = true;
while (true) {
OrdsSegmentTermsEnumFrame f = getFrame(ord);
assert f != null;
final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix);
if (f.nextEnt == -1) {
out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp << OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0) + (f.isFloor ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd() + " termOrd=" + f.termOrd);
} else {
out.println(" frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp << OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0) + (f.isFloor ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd() + " termOrd=" + f.termOrd);
}
if (fr.index != null) {
assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix - 1) & 0xFF)) {
out.println(" broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix - 1) & 0xFF));
throw new RuntimeException("seek state is broken");
}
Output output = Util.get(fr.index, prefix);
if (output == null) {
out.println(" broken seek state: prefix is not final in index");
throw new RuntimeException("seek state is broken");
} else if (isSeekFrame && !f.isFloor) {
final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes.bytes, output.bytes.offset, output.bytes.length);
final long codeOrig = reader.readVLong();
final long code = (f.fp << OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0) | (f.isFloor ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0);
if (codeOrig != code) {
out.println(" broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code);
throw new RuntimeException("seek state is broken");
}
}
}
if (f == currentFrame) {
break;
}
if (f.prefix == validIndexPrefix) {
isSeekFrame = false;
}
ord++;
}
}
}
use of org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output in project lucene-solr by apache.
the class OrdsSegmentTermsEnum method seekCeil.
@Override
public SeekStatus seekCeil(final BytesRef target) throws IOException {
if (fr.index == null) {
throw new IllegalStateException("terms index was not loaded");
}
term.grow(1 + target.length);
assert clearEOF();
//if (DEBUG) {
//System.out.println("\nBTTR.seekCeil seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix);
//printSeekState();
//}
FST.Arc<Output> arc;
int targetUpto;
Output output;
targetBeforeCurrentLength = currentFrame.ord;
if (positioned && currentFrame != staticFrame) {
// We are already seek'd; find the common
// prefix of new seek term vs current term and
// re-use the corresponding seek state. For
// example, if app first seeks to foobar, then
// seeks to foobaz, we can re-use the seek state
// for the first 5 bytes.
//if (DEBUG) {
//System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix);
//}
arc = arcs[0];
assert arc.isFinal();
output = arc.output;
targetUpto = 0;
OrdsSegmentTermsEnumFrame lastFrame = stack[0];
assert validIndexPrefix <= term.length();
final int targetLimit = Math.min(target.length, validIndexPrefix);
int cmp = 0;
// First compare up to valid seek frames:
while (targetUpto < targetLimit) {
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
//}
if (cmp != 0) {
break;
}
arc = arcs[1 + targetUpto];
assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF) : "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
// reverse vLong byte order)
if (arc.output != OrdsBlockTreeTermsWriter.NO_OUTPUT) {
output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
if (arc.isFinal()) {
lastFrame = stack[1 + lastFrame.ord];
}
targetUpto++;
}
if (cmp == 0) {
final int targetUptoMid = targetUpto;
// Second compare the rest of the term, but
// don't save arc/output/frame:
final int targetLimit2 = Math.min(target.length, term.length());
while (targetUpto < targetLimit2) {
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
//}
if (cmp != 0) {
break;
}
targetUpto++;
}
if (cmp == 0) {
cmp = term.length() - target.length;
}
targetUpto = targetUptoMid;
}
if (cmp < 0) {
// Common case: target term is after current
// term, ie, app is seeking multiple terms
// in sorted order
//if (DEBUG) {
//System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); clear frame.scanned ord=" + lastFrame.ord);
//}
currentFrame = lastFrame;
} else if (cmp > 0) {
// Uncommon case: target term
// is before current term; this means we can
// keep the currentFrame but we must rewind it
// (so we scan from the start)
targetBeforeCurrentLength = 0;
//if (DEBUG) {
//System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
//}
currentFrame = lastFrame;
currentFrame.rewind();
} else {
// Target is exactly the same as current term
assert term.length() == target.length;
if (termExists) {
//}
return SeekStatus.FOUND;
} else {
//if (DEBUG) {
//System.out.println(" target is same as current but term doesn't exist");
//}
}
}
} else {
targetBeforeCurrentLength = -1;
arc = fr.index.getFirstArc(arcs[0]);
// Empty string prefix must have an output (block) in the index!
assert arc.isFinal();
assert arc.output != null;
//if (DEBUG) {
//System.out.println(" no seek state; push root frame");
//}
output = arc.output;
currentFrame = staticFrame;
//term.length = 0;
targetUpto = 0;
currentFrame = pushFrame(arc, OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
}
positioned = true;
// We are done sharing the common prefix with the incoming target and where we are currently seek'd; now continue walking the index:
while (targetUpto < target.length) {
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
final FST.Arc<Output> nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1 + targetUpto), fstReader);
if (nextArc == null) {
// Index is exhausted
// if (DEBUG) {
// System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
// }
validIndexPrefix = currentFrame.prefix;
//validIndexPrefix = targetUpto;
currentFrame.scanToFloorFrame(target);
currentFrame.loadBlock();
final SeekStatus result = currentFrame.scanToTerm(target, false);
if (result == SeekStatus.END) {
term.copyBytes(target);
termExists = false;
if (next() != null) {
//}
return SeekStatus.NOT_FOUND;
} else {
//}
return SeekStatus.END;
}
} else {
//}
return result;
}
} else {
// Follow this arc
term.setByteAt(targetUpto, (byte) targetLabel);
arc = nextArc;
// Aggregate output as we go:
assert arc.output != null;
if (arc.output != OrdsBlockTreeTermsWriter.NO_OUTPUT) {
output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
//if (DEBUG) {
//System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
//}
targetUpto++;
if (arc.isFinal()) {
//if (DEBUG) System.out.println(" arc is final!");
currentFrame = pushFrame(arc, OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
//if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
}
}
}
//validIndexPrefix = targetUpto;
validIndexPrefix = currentFrame.prefix;
currentFrame.scanToFloorFrame(target);
currentFrame.loadBlock();
final SeekStatus result = currentFrame.scanToTerm(target, false);
if (result == SeekStatus.END) {
term.copyBytes(target);
termExists = false;
if (next() != null) {
//}
return SeekStatus.NOT_FOUND;
} else {
//}
return SeekStatus.END;
}
} else {
return result;
}
}
Aggregations