Example use of org.apache.lucene.util.fst.FST.BytesReader in the Apache lucene-solr project.
Class WFSTCompletionLookup, method lookupPrefix.
/**
 * Follows the bytes of {@code scratch} through the FST, one arc per byte, and
 * adds up the outputs of the arcs that were matched.
 *
 * @param scratch the bytes to match; only the {@code scratch.length} bytes
 *        starting at {@code scratch.offset} are read
 * @param arc a reusable arc; it is first reset to the FST's first arc and is
 *        then advanced in place for every matched byte
 * @return the summed output of the matched arcs, or {@code null} as soon as
 *         a byte has no matching arc
 * @throws IOException declared by the signature (which itself marks the
 *         clause as bogus); presumably only propagated from the FST calls
 */
private Long lookupPrefix(BytesRef scratch, Arc<Long> arc) throws /*Bogus*/ IOException {
  // The running sum starts at 0, which is only valid if "no output" is 0.
  assert 0 == fst.outputs.getNoOutput().longValue();

  final BytesReader reader = fst.getBytesReader();
  fst.getFirstArc(arc);

  final byte[] input = scratch.bytes;
  final int limit = scratch.offset + scratch.length;
  long sum = 0;
  for (int i = scratch.offset; i < limit; i++) {
    // Mask so the byte is used as an unsigned label (0..255).
    final int label = input[i] & 0xff;
    if (fst.findTargetArc(label, arc, arc, reader) == null) {
      return null;
    }
    sum += arc.output.longValue();
  }
  return sum;
}
Example use of org.apache.lucene.util.fst.FST.BytesReader in the Apache lucene-solr project.
Class Util, method toDot.
/**
 * Writes an {@link FST} as a GraphViz <code>dot</code> graph description so
 * that it can be inspected visually. Typical usage:
 *
 * <pre class="prettyprint">
 * PrintWriter pw = new PrintWriter("out.dot");
 * Util.toDot(fst, pw, true, true);
 * pw.close();
 * </pre>
 *
 * followed, on the command line, by:
 *
 * <pre>
 * dot -Tpng -o out.png out.dot
 * </pre>
 *
 * <p>
 * Note: larger FSTs (a few thousand nodes) won't even render, so don't
 * bother. State offsets are narrowed to <code>int</code> while tracking
 * visited states, so for an FST larger than 2.1 GB this method will throw
 * strange exceptions.
 *
 * @param fst
 *          The FST to dump.
 *
 * @param out
 *          Receives the <code>dot</code> text. It is flushed at the end but
 *          not closed.
 *
 * @param sameRank
 *          If <code>true</code>, the states first reached in each
 *          breadth-first level are grouped with <code>rank=same</code>
 *          (only when a level discovered more than one state). This may
 *          mess up arcs, but makes the FST's structure a bit clearer.
 *
 * @param labelStates
 *          If <code>false</code>, a compact default node style is written
 *          in the prologue. Per the original documentation,
 *          <code>true</code> gives states labels equal to their offsets in
 *          the binary format, which expands the graph considerably.
 *
 * @throws IOException
 *           If writing to <code>out</code> or reading the FST fails.
 *
 * @see <a href="http://www.graphviz.org/">graphviz project</a>
 */
public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates) throws IOException {
  final String expandedColor = "blue";
  final String plainShape = "circle";
  final String acceptShape = "doublecircle";

  // Arc leading from the epsilon state into the first state with outgoing transitions.
  final FST.Arc<T> rootArc = fst.getFirstArc(new FST.Arc<T>());

  // Arcs whose targets are expanded during the current pass ...
  final List<FST.Arc<T>> current = new ArrayList<>();
  // ... and arcs collected for the following pass.
  final List<FST.Arc<T>> pending = new ArrayList<>();
  pending.add(rootArc);

  // States discovered during the current pass; only needed for rank grouping.
  final List<Integer> levelStates = new ArrayList<>();

  // Target offsets of the states that already have a node in the output.
  final BitSet visited = new BitSet();
  visited.set((int) rootArc.target);

  // DOT prologue.
  out.write("digraph FST {\n");
  out.write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");
  if (!labelStates) {
    out.write(" node [shape=circle, width=.2, height=.2, style=filled]\n");
  }
  emitDotState(out, "initial", "point", "white", "");

  final T noOutput = fst.outputs.getNoOutput();
  final BytesReader in = fst.getBytesReader();

  // Node for the start state, plus the edge from the synthetic "initial" point.
  final String rootColor = fst.isExpandedTarget(rootArc, in) ? expandedColor : null;
  final boolean rootIsFinal = rootArc.isFinal();
  String rootLabel = "";
  if (rootIsFinal && rootArc.nextFinalOutput != noOutput && rootArc.nextFinalOutput != null) {
    rootLabel = fst.outputs.outputToString(rootArc.nextFinalOutput);
  }
  emitDotState(out, Long.toString(rootArc.target), rootIsFinal ? acceptShape : plainShape, rootColor, rootLabel);
  out.write(" initial -> " + rootArc.target + "\n");

  // Level-by-level traversal; "level" is only used in the emitted comment.
  for (int level = 1; !pending.isEmpty(); level++) {
    current.addAll(pending);
    pending.clear();
    out.write("\n // Transitions and states at level: " + level + "\n");

    while (!current.isEmpty()) {
      // Take from the tail of the list.
      final FST.Arc<T> arc = current.remove(current.size() - 1);
      if (!FST.targetHasArcs(arc)) {
        continue;
      }

      // Remember the source state: "arc" is overwritten in place while
      // its target's outgoing arcs are scanned.
      final long source = arc.target;
      for (fst.readFirstRealTargetArc(source, arc, in); ; fst.readNextRealArc(arc, in)) {
        // First time this target is reached: emit its node and queue it.
        if (arc.target >= 0 && !visited.get((int) arc.target)) {
          final String color = fst.isExpandedTarget(arc, in) ? expandedColor : null;
          final String label = (arc.nextFinalOutput == null || arc.nextFinalOutput == noOutput)
              ? ""
              : fst.outputs.outputToString(arc.nextFinalOutput);
          emitDotState(out, Long.toString(arc.target), plainShape, color, label);
          visited.set((int) arc.target);
          pending.add(new FST.Arc<T>().copyFrom(arc));
          levelStates.add((int) arc.target);
        }

        String edgeOutput = arc.output != noOutput ? "/" + fst.outputs.outputToString(arc.output) : "";
        if (!FST.targetHasArcs(arc) && arc.isFinal() && arc.nextFinalOutput != noOutput) {
          // Special case: due to pruning, the builder can produce an arc
          // into the final end state (-1) that still carries a next final
          // output; pull that output up onto this arc.
          edgeOutput = edgeOutput + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]";
        }

        final String edgeColor = arc.flag(FST.BIT_TARGET_NEXT) ? "red" : "black";
        assert arc.label != FST.END_LABEL;

        final StringBuilder edge = new StringBuilder();
        edge.append(" ").append(source).append(" -> ").append(arc.target);
        edge.append(" [label=\"").append(printableLabel(arc.label)).append(edgeOutput).append("\"");
        if (arc.isFinal()) {
          edge.append(" style=\"bold\"");
        }
        edge.append(" color=\"").append(edgeColor).append("\"]\n");
        out.write(edge.toString());

        // Stop after the last arc of this state; otherwise the loop's
        // update step advances to the next arc.
        if (arc.isLast()) {
          break;
        }
      }
    }

    // Optional ranking information for the states found on this level.
    if (sameRank && levelStates.size() > 1) {
      out.write(" {rank=same; ");
      for (int stateId : levelStates) {
        out.write(stateId + "; ");
      }
      out.write(" }\n");
    }
    levelStates.clear();
  }

  // Terminating state (always emitted).
  out.write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n");
  out.write(" {rank=sink; -1 }\n");
  out.write("}\n");
  out.flush();
}
Example use of org.apache.lucene.util.fst.FST.BytesReader in the Apache lucene-solr project.
Class TestFSTs, method testShortestPathsWFSTRandom.
/**
 * Like testShortestPathsRandom, but uses pair outputs so that every entry
 * carries both a weight and an output.
 */
public void testShortestPathsWFSTRandom() throws Exception {
  final int wordCount = atLeast(1000);

  // Reference model: every word with its (weight, output), kept sorted.
  final TreeMap<String, TwoLongs> expectedByWord = new TreeMap<>();
  final TreeSet<String> prefixes = new TreeSet<>();

  // First component is the weight, second the output.
  final PairOutputs<Long, Long> outputs =
      new PairOutputs<>(PositiveIntOutputs.getSingleton(), PositiveIntOutputs.getSingleton());
  final Builder<Pair<Long, Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  final IntsRefBuilder scratch = new IntsRefBuilder();

  final Random random = random();
  for (int n = 0; n < wordCount; n++) {
    // Draw until we get a word that has not been used yet.
    String word;
    do {
      word = TestUtil.randomSimpleString(random);
    } while (expectedByWord.containsKey(word));

    // Record every proper, non-empty prefix of the word.
    for (int len = 1; len < word.length(); len++) {
      prefixes.add(word.substring(0, len));
    }

    final int weight = TestUtil.nextInt(random, 1, 100); // weights 1..100
    final int output = TestUtil.nextInt(random, 0, 500); // outputs 0..500
    expectedByWord.put(word, new TwoLongs(weight, output));
  }

  // Feed the words to the builder in the map's sorted order.
  for (Map.Entry<String, TwoLongs> entry : expectedByWord.entrySet()) {
    final TwoLongs value = entry.getValue();
    final long weight = value.a;
    final long output = value.b;
    builder.add(Util.toIntsRef(new BytesRef(entry.getKey()), scratch), outputs.newPair(weight, output));
  }
  final FST<Pair<Long, Long>> fst = builder.finish();
  final BytesReader reader = fst.getBytesReader();

  for (String prefix : prefixes) {
    // 1. Walk the prefix through the FST, accumulating the output seen so
    //    far, then ask for the best completions from where we ended up.
    Pair<Long, Long> consumed = outputs.getNoOutput();
    final FST.Arc<Pair<Long, Long>> arc = fst.getFirstArc(new FST.Arc<Pair<Long, Long>>());
    for (char ch : prefix.toCharArray()) {
      if (fst.findTargetArc(ch, arc, arc, reader) == null) {
        fail();
      }
      consumed = outputs.add(consumed, arc.output);
    }

    final int topN = TestUtil.nextInt(random, 1, 10);
    final Util.TopResults<Pair<Long, Long>> actual =
        Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minPairWeightComparator, topN, true);
    assertTrue(actual.isComplete);

    // 2. Scan the whole reference map for words with this prefix and build
    //    the expected completions (suffix plus what is left of the pair).
    //    Deliberately brute force.
    final int prefixLength = prefix.length();
    final List<Result<Pair<Long, Long>>> expected = new ArrayList<>();
    for (Map.Entry<String, TwoLongs> entry : expectedByWord.entrySet()) {
      final String word = entry.getKey();
      if (!word.startsWith(prefix)) {
        continue;
      }
      final TwoLongs value = entry.getValue();
      expected.add(new Result<>(
          Util.toIntsRef(new BytesRef(word.substring(prefixLength)), new IntsRefBuilder()),
          outputs.newPair(value.a - consumed.output1, value.b - consumed.output2)));
    }
    assertTrue(expected.size() > 0);

    // Order the expectations, then keep only the first topN of them.
    Collections.sort(expected, new TieBreakByInputComparator<>(minPairWeightComparator));
    if (expected.size() > topN) {
      expected.subList(topN, expected.size()).clear();
    }

    assertEquals(expected.size(), actual.topN.size());
    for (int i = 0; i < actual.topN.size(); i++) {
      assertEquals(expected.get(i).input, actual.topN.get(i).input);
      assertEquals(expected.get(i).output, actual.topN.get(i).output);
    }
  }
}
Aggregations