Use of org.apache.lucene.util.fst.FST.BytesReader in the project lucene-solr by apache.
From the class Util, method get.
// TODO: maybe a CharsRef version for BYTE2
/**
 * Follows the bytes of {@code input} through the FST and returns the summed
 * output for that path, or {@code null} when the FST does not accept the input.
 */
public static <T> T get(FST<T> fst, BytesRef input) throws IOException {
  assert fst.inputType == FST.INPUT_TYPE.BYTE1;

  final BytesReader reader = fst.getBytesReader();

  // TODO: would be nice not to alloc this on every lookup
  final FST.Arc<T> current = fst.getFirstArc(new FST.Arc<T>());

  // Running total of the outputs found along the path walked so far.
  T accumulated = fst.outputs.getNoOutput();

  final int end = input.offset + input.length;
  for (int pos = input.offset; pos < end; pos++) {
    // Bytes are treated as unsigned labels.
    final int label = input.bytes[pos] & 0xFF;
    if (fst.findTargetArc(label, current, current, reader) == null) {
      // No arc for this byte: the input is not in the FST.
      return null;
    }
    accumulated = fst.outputs.add(accumulated, current.output);
  }

  // Consuming every byte is not enough; the last arc must also be final.
  if (!current.isFinal()) {
    return null;
  }
  return fst.outputs.add(accumulated, current.nextFinalOutput);
}
Use of org.apache.lucene.util.fst.FST.BytesReader in the project lucene-solr by apache.
From the class Util, method getByOutput.
/**
 * Reverse lookup: finds the input whose output equals {@code targetOutput},
 * for the special case where the FST's outputs are strictly ascending.
 * Returns {@code null} if no input/output pair has that output.
 *
 * <p>NOTE: this only works with {@code FST<Long>}, and only when the outputs
 * ascend in the same order as the inputs. Simple ordinals (0, 1, 2, ...) or
 * file offsets (when appending to a file) satisfy this.
 *
 * <p>This convenience overload allocates its own reader, arcs and result
 * builder, then delegates to the overload that accepts them as arguments.
 */
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
  final BytesReader reader = fst.getBytesReader();
  // TODO: would be nice not to alloc this on every lookup
  final FST.Arc<Long> rootArc = fst.getFirstArc(new FST.Arc<Long>());
  return getByOutput(
      fst, targetOutput, reader, rootArc, new FST.Arc<Long>(), new IntsRefBuilder());
}
Use of org.apache.lucene.util.fst.FST.BytesReader in the project lucene-solr by apache.
From the class MemoryDocValuesProducer, method getSortedSet.
/**
 * Returns the sorted-set doc values for {@code field}, backed by an FST that maps
 * terms to ordinals and by a per-document binary blob holding the ordinals.
 *
 * <p>Fields flagged as singleton are served by wrapping {@link #getSorted}. Fields
 * with zero ordinals get the shared empty instance. Otherwise the FST is loaded on
 * first use under {@code synchronized (this)} and, unless this producer is merging,
 * cached in {@code fstInstances} with its RAM usage added to {@code ramBytesUsed}.
 *
 * @param field the field whose values are requested
 * @return a doc-values view over the field's ordinals and terms
 * @throws IOException if reading the FST or the per-document data fails
 */
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
  SortedSetEntry sortedSetEntry = sortedSets.get(field.name);
  if (sortedSetEntry.singleton) {
    return DocValues.singleton(getSorted(field));
  }
  final FSTEntry entry = fsts.get(field.name);
  if (entry.numOrds == 0) {
    // empty FST!
    return DocValues.emptySortedSet();
  }
  FST<Long> instance;
  synchronized (this) {
    instance = fstInstances.get(field.name);
    if (instance == null) {
      // Read the FST from a private clone so the shared input's position is untouched.
      IndexInput data = this.data.clone();
      data.seek(entry.offset);
      instance = new FST<>(data, PositiveIntOutputs.getSingleton());
      if (!merging) {
        // Only cache (and account for) the instance outside of merges.
        ramBytesUsed.addAndGet(instance.ramBytesUsed());
        fstInstances.put(field.name, instance);
      }
    }
  }
  final LegacyBinaryDocValues docToOrds = getLegacyBinary(field);
  final FST<Long> fst = instance;

  // per-thread resources
  final BytesReader in = fst.getBytesReader();
  final Arc<Long> firstArc = new Arc<>();
  final Arc<Long> scratchArc = new Arc<>();
  final IntsRefBuilder scratchInts = new IntsRefBuilder();
  final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
  final ByteArrayDataInput input = new ByteArrayDataInput();

  return new LegacySortedSetDocValuesWrapper(new LegacySortedSetDocValues() {
    final BytesRefBuilder term = new BytesRefBuilder();
    BytesRef ref;
    long currentOrd;

    /** Decodes the next ordinal of the current document; values are stored as vLong deltas. */
    @Override
    public long nextOrd() {
      if (input.eof()) {
        return NO_MORE_ORDS;
      } else {
        currentOrd += input.readVLong();
        return currentOrd;
      }
    }

    /** Positions the decoder at the start of {@code docID}'s ordinal bytes. */
    @Override
    public void setDocument(int docID) {
      ref = docToOrds.get(docID);
      input.reset(ref.bytes, ref.offset, ref.length);
      currentOrd = 0;
    }

    /** Resolves an ordinal back to its term via a reverse (by-output) FST lookup. */
    @Override
    public BytesRef lookupOrd(long ord) {
      try {
        // Rewind the shared reader and root arc before every lookup.
        in.setPosition(0);
        fst.getFirstArc(firstArc);
        IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts);
        return Util.toBytesRef(output, term);
      } catch (IOException bogus) {
        throw new RuntimeException(bogus);
      }
    }

    /**
     * Returns the ordinal of {@code key}, or {@code -insertionPoint - 1} when the
     * key is absent (the insertion point is the ordinal of the ceiling term, or
     * the value count when no ceiling term exists).
     */
    @Override
    public long lookupTerm(BytesRef key) {
      try {
        InputOutput<Long> o = fstEnum.seekCeil(key);
        if (o == null) {
          return -getValueCount() - 1;
        } else if (o.input.equals(key)) {
          // Keep the full long: intValue() would truncate ordinals above
          // Integer.MAX_VALUE, unlike the not-found branch below.
          return o.output.longValue();
        } else {
          return -o.output - 1;
        }
      } catch (IOException bogus) {
        throw new RuntimeException(bogus);
      }
    }

    @Override
    public long getValueCount() {
      return entry.numOrds;
    }

    @Override
    public TermsEnum termsEnum() {
      return new FSTTermsEnum(fst);
    }
  }, maxDoc);
}
Use of org.apache.lucene.util.fst.FST.BytesReader in the project elasticsearch by elastic.
From the class XAnalyzingSuggester, method lookup.
/**
 * Returns up to {@code num} suggestions for {@code key} by intersecting the key's
 * lookup automaton with the suggester FST and running a top-N search from the
 * matching prefix nodes.
 *
 * <p>When {@code exactFirst} is set, a first search looks for a completion whose
 * surface form equals the key and places it at the head of the results; the
 * second, general search then skips that surface form.
 *
 * @param key the text to complete; must not contain {@code holeCharacter} or {@code sepLabel}
 * @param contexts not referenced in this method body
 * @param onlyMorePopular must be {@code false}
 * @param num maximum number of results to return; asserted to be positive
 * @return the collected results, or an empty list when no FST has been built
 * @throws IllegalArgumentException if {@code onlyMorePopular} is true or the key
 *         contains a reserved character
 * @throws RuntimeException wrapping any {@link IOException} raised during the search
 */
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
assert num > 0;
if (onlyMorePopular) {
throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
}
// Nothing has been built yet, so there is nothing to suggest.
if (fst == null) {
return Collections.emptyList();
}
//System.out.println("lookup key=" + key + " num=" + num);
// Reject keys containing either of the two reserved characters.
for (int i = 0; i < key.length(); i++) {
if (key.charAt(i) == holeCharacter) {
throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
}
if (key.charAt(i) == sepLabel) {
throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
}
}
final BytesRef utf8Key = new BytesRef(key);
try {
Automaton lookupAutomaton = toLookupAutomaton(key);
final CharsRefBuilder spare = new CharsRefBuilder();
//System.out.println(" now intersect exactFirst=" + exactFirst);
// Intersect automaton w/ suggest wFST and get all
// prefix starting nodes & their outputs:
//final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
//System.out.println(" prefixPaths: " + prefixPaths.size());
BytesReader bytesReader = fst.getBytesReader();
// Reused as the output slot of every findTargetArc call below.
FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
final List<LookupResult> results = new ArrayList<>();
List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
if (exactFirst) {
// First pass: count the prefix paths that have an END_BYTE arc, to size the searcher.
int count = 0;
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
count++;
}
}
// Searcher just to find the single exact only
// match, if present:
Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
// Second pass: seed the searcher with every exact-match path. scratchArc was
// just filled by findTargetArc, so it is the END_BYTE arc for this path.
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
}
}
Util.TopResults<Pair<Long, BytesRef>> completions = searcher.search();
// Scan the completions (the searcher was sized to count *
// maxSurfaceFormsPerAnalyzedForm) and keep only the first one whose
// surface form equals the key:
for (Result<Pair<Long, BytesRef>> completion : completions) {
BytesRef output2 = completion.output.output2;
if (sameSurfaceForm(utf8Key, output2)) {
results.add(getLookupResult(completion.output.output1, output2, spare));
break;
}
}
if (results.size() == num) {
// That was quick:
return results;
}
}
// General search for the remaining (num - results.size()) suggestions.
Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {
// Surface forms already accepted by this searcher.
private final Set<BytesRef> seen = new HashSet<>();
@Override
protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
// Dedup: the search can get duplicate surface forms:
if (seen.contains(output.output2)) {
return false;
}
seen.add(output.output2);
if (!exactFirst) {
return true;
} else {
// In exactFirst mode, reject a path whose surface form equals the key,
// since accepting it would create duplicate results:
if (sameSurfaceForm(utf8Key, output.output2)) {
// have already found it in the first search:
assert results.size() == 1;
return false;
} else {
return true;
}
}
}
};
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
}
TopResults<Pair<Long, BytesRef>> completions = searcher.search();
for (Result<Pair<Long, BytesRef>> completion : completions) {
LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
// TODO: for fuzzy case would be nice to return
// how many edits were required
//System.out.println(" result=" + result);
results.add(result);
if (results.size() == num) {
// Stop at num results. NOTE(review): the original note here was truncated
// to "produce one extra path"; presumably the searcher can return one
// more completion than requested -- confirm against TopNSearcher.
break;
}
}
return results;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
Use of org.apache.lucene.util.fst.FST.BytesReader in the project lucene-solr by apache.
From the class Util, method toDot.
/**
 * Dumps an {@link FST} to a GraphViz's <code>dot</code> language description
 * for visualization. Example of use:
 *
 * <pre class="prettyprint">
 * PrintWriter pw = new PrintWriter("out.dot");
 * Util.toDot(fst, pw, true, true);
 * pw.close();
 * </pre>
 *
 * and then, from command line:
 *
 * <pre>
 * dot -Tpng -o out.png out.dot
 * </pre>
 *
 * <p>
 * Note: larger FSTs (a few thousand nodes) won't even
 * render, don't bother. If the FST is > 2.1 GB in size
 * then this method will throw strange exceptions.
 *
 * <p>
 * The FST is walked level by level starting from the first arc; each state is
 * emitted once (tracked by its target address) and every arc is emitted as a
 * labelled edge. The writer is flushed, but not closed, before returning.
 *
 * @param fst
 *      The FST to dump.
 *
 * @param out
 *      The writer receiving the <code>dot</code> text.
 *
 * @param sameRank
 *      If <code>true</code>, the resulting <code>dot</code> file will try
 *      to order states in layers of breadth-first traversal. This may
 *      mess up arcs, but makes the output FST's structure a bit clearer.
 *
 * @param labelStates
 *      If <code>true</code> states will have labels equal to their offsets in their
 *      binary format. Expands the graph considerably.
 *
 * @throws IOException
 *      If reading the FST or writing to <code>out</code> fails.
 *
 * @see <a href="http://www.graphviz.org/">graphviz project</a>
 */
public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates) throws IOException {
final String expandedNodeColor = "blue";
// This is the start arc in the automaton (from the epsilon state to the first state
// with outgoing transitions).
final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
// A queue of transitions to consider for the next level.
final List<FST.Arc<T>> thisLevelQueue = new ArrayList<>();
// A queue of transitions to consider when processing the next level.
final List<FST.Arc<T>> nextLevelQueue = new ArrayList<>();
nextLevelQueue.add(startArc);
//System.out.println("toDot: startArc: " + startArc);
// A list of states on the same level (for ranking).
final List<Integer> sameLevelStates = new ArrayList<>();
// A bitset of already seen states (target offset).
// NOTE(review): targets are narrowed from long to int to index the BitSet;
// presumably this is the source of the 2.1 GB caveat in the Javadoc -- confirm.
final BitSet seen = new BitSet();
seen.set((int) startArc.target);
// Shape for states.
final String stateShape = "circle";
final String finalStateShape = "doublecircle";
// Emit DOT prologue.
out.write("digraph FST {\n");
out.write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");
if (!labelStates) {
out.write(" node [shape=circle, width=.2, height=.2, style=filled]\n");
}
emitDotState(out, "initial", "point", "white", "");
final T NO_OUTPUT = fst.outputs.getNoOutput();
final BytesReader r = fst.getBytesReader();
// final FST.Arc<T> scratchArc = new FST.Arc<>();
// Emit the state the start arc points to, then the edge from the synthetic
// "initial" point to it.
{
final String stateColor;
if (fst.isExpandedTarget(startArc, r)) {
stateColor = expandedNodeColor;
} else {
stateColor = null;
}
final boolean isFinal;
final T finalOutput;
if (startArc.isFinal()) {
isFinal = true;
finalOutput = startArc.nextFinalOutput == NO_OUTPUT ? null : startArc.nextFinalOutput;
} else {
isFinal = false;
finalOutput = null;
}
emitDotState(out, Long.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput));
}
out.write(" initial -> " + startArc.target + "\n");
int level = 0;
// Process one level per iteration: everything queued for the "next" level
// becomes the current level, and newly discovered states are queued in turn.
while (!nextLevelQueue.isEmpty()) {
// we could double buffer here, but it doesn't matter probably.
//System.out.println("next level=" + level);
thisLevelQueue.addAll(nextLevelQueue);
nextLevelQueue.clear();
level++;
out.write("\n // Transitions and states at level: " + level + "\n");
while (!thisLevelQueue.isEmpty()) {
// Pop from the tail of the list.
final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
//System.out.println(" pop: " + arc);
if (FST.targetHasArcs(arc)) {
// scan all target arcs
//System.out.println(" readFirstTarget...");
// Remember the source state now: the read below overwrites arc in place.
final long node = arc.target;
fst.readFirstRealTargetArc(arc.target, arc, r);
while (true) {
// Emit the unseen state and add it to the queue for the next level.
if (arc.target >= 0 && !seen.get((int) arc.target)) {
/*
boolean isFinal = false;
T finalOutput = null;
fst.readFirstTargetArc(arc, scratchArc);
if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) {
// target is final
isFinal = true;
finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output;
System.out.println("dot hit final label=" + (char) scratchArc.label);
}
*/
final String stateColor;
if (fst.isExpandedTarget(arc, r)) {
stateColor = expandedNodeColor;
} else {
stateColor = null;
}
final String finalOutput;
if (arc.nextFinalOutput != null && arc.nextFinalOutput != NO_OUTPUT) {
finalOutput = fst.outputs.outputToString(arc.nextFinalOutput);
} else {
finalOutput = "";
}
emitDotState(out, Long.toString(arc.target), stateShape, stateColor, finalOutput);
// To see the node address, use this instead:
//emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
seen.set((int) arc.target);
// Queue a copy, because arc itself keeps being reused for the sibling arcs.
nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
sameLevelStates.add((int) arc.target);
}
// Build the "/output" suffix of the edge label.
String outs;
if (arc.output != NO_OUTPUT) {
outs = "/" + fst.outputs.outputToString(arc.output);
} else {
outs = "";
}
if (!FST.targetHasArcs(arc) && arc.isFinal() && arc.nextFinalOutput != NO_OUTPUT) {
// Tricky special case: sometimes, due to
// pruning, the builder can [sillily] produce
// an FST with an arc into the final end state
// (-1) but also with a next final output; in
// this case we pull that output up onto this
// arc
outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]";
}
// Arcs carrying the BIT_TARGET_NEXT flag are drawn in red, all others in black.
final String arcColor;
if (arc.flag(FST.BIT_TARGET_NEXT)) {
arcColor = "red";
} else {
arcColor = "black";
}
assert arc.label != FST.END_LABEL;
// Edge from the saved source state to this arc's target; final arcs are bold.
out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n");
// Break the loop if we're on the last arc of this state.
if (arc.isLast()) {
//System.out.println(" break");
break;
}
fst.readNextRealArc(arc, r);
}
}
}
// Emit state ranking information.
if (sameRank && sameLevelStates.size() > 1) {
out.write(" {rank=same; ");
for (int state : sameLevelStates) {
out.write(state + "; ");
}
out.write(" }\n");
}
sameLevelStates.clear();
}
// Emit terminating state (always there anyway).
out.write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n");
out.write(" {rank=sink; -1 }\n");
out.write("}\n");
out.flush();
}
Aggregations