use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
the class FiniteStringsIteratorTest method testShortAccept.
public void testShortAccept() {
Automaton a = Operations.union(Automata.makeString("x"), Automata.makeString("xy"));
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
FiniteStringsIterator iterator = new FiniteStringsIterator(a);
List<IntsRef> actual = getFiniteStrings(iterator);
assertEquals(2, actual.size());
IntsRefBuilder x = new IntsRefBuilder();
Util.toIntsRef(new BytesRef("x"), x);
assertTrue(actual.contains(x.get()));
IntsRefBuilder xy = new IntsRefBuilder();
Util.toIntsRef(new BytesRef("xy"), xy);
assertTrue(actual.contains(xy.get()));
}
use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
the class FiniteStringsIteratorTest method testSingletonNoLimit.
public void testSingletonNoLimit() {
Automaton a = Automata.makeString("foobar");
FiniteStringsIterator iterator = new FiniteStringsIterator(a);
List<IntsRef> actual = getFiniteStrings(iterator);
assertEquals(1, actual.size());
IntsRefBuilder scratch = new IntsRefBuilder();
Util.toUTF32("foobar".toCharArray(), 0, 6, scratch);
assertTrue(actual.contains(scratch.get()));
}
use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
the class FiniteStringsIteratorTest method testRandomFiniteStrings1.
public void testRandomFiniteStrings1() {
int numStrings = atLeast(100);
if (VERBOSE) {
System.out.println("TEST: numStrings=" + numStrings);
}
Set<IntsRef> strings = new HashSet<>();
List<Automaton> automata = new ArrayList<>();
IntsRefBuilder scratch = new IntsRefBuilder();
for (int i = 0; i < numStrings; i++) {
String s = TestUtil.randomSimpleString(random(), 1, 200);
Util.toUTF32(s.toCharArray(), 0, s.length(), scratch);
if (strings.add(scratch.toIntsRef())) {
automata.add(Automata.makeString(s));
if (VERBOSE) {
System.out.println(" add string=" + s);
}
}
}
// TODO: we could sometimes use
// DaciukMihovAutomatonBuilder here
// TODO: what other random things can we do here...
Automaton a = Operations.union(automata);
if (random().nextBoolean()) {
a = MinimizationOperations.minimize(a, 1000000);
if (VERBOSE) {
System.out.println("TEST: a.minimize numStates=" + a.getNumStates());
}
} else if (random().nextBoolean()) {
if (VERBOSE) {
System.out.println("TEST: a.determinize");
}
a = Operations.determinize(a, 1000000);
} else if (random().nextBoolean()) {
if (VERBOSE) {
System.out.println("TEST: a.removeDeadStates");
}
a = Operations.removeDeadStates(a);
}
FiniteStringsIterator iterator = new FiniteStringsIterator(a);
List<IntsRef> actual = getFiniteStrings(iterator);
assertFiniteStringsRecursive(a, actual);
if (!strings.equals(new HashSet<>(actual))) {
System.out.println("strings.size()=" + strings.size() + " actual.size=" + actual.size());
List<IntsRef> x = new ArrayList<>(strings);
Collections.sort(x);
List<IntsRef> y = new ArrayList<>(actual);
Collections.sort(y);
int end = Math.min(x.size(), y.size());
for (int i = 0; i < end; i++) {
System.out.println(" i=" + i + " string=" + toString(x.get(i)) + " actual=" + toString(y.get(i)));
}
fail("wrong strings found");
}
}
use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
the class BaseTokenStreamTestCase method assertGraphStrings.
/**
* Enumerates all accepted strings in the token graph created by the already initialized {@link TokenStream}.
*/
public static void assertGraphStrings(TokenStream tokenStream, String... expectedStrings) throws IOException {
Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
Set<IntsRef> actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
Set<String> expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings));
BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
Set<String> actualStrings = new HashSet<>();
for (IntsRef ir : actualStringPaths) {
actualStrings.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
}
for (String s : actualStrings) {
assertTrue("Analyzer created unexpected string path: " + s + "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings), expectedStringsSet.contains(s));
}
for (String s : expectedStrings) {
assertTrue("Analyzer created unexpected string path: " + s + "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings), actualStrings.contains(s));
}
}
use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
the class FSTTester method verifyPruned.
// FST is pruned
private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
for (InputOutput<T> pair : pairs) {
System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
}
}
// To validate the FST, we brute-force compute all prefixes
// in the terms, matched to their "common" outputs, prune that
// set according to the prune thresholds, then assert the FST
// matches that same set.
// NOTE: Crazy RAM intensive!!
//System.out.println("TEST: tally prefixes");
// build all prefixes
final Map<IntsRef, CountMinOutput<T>> prefixes = new HashMap<>();
final IntsRefBuilder scratch = new IntsRefBuilder();
for (InputOutput<T> pair : pairs) {
scratch.copyInts(pair.input);
for (int idx = 0; idx <= pair.input.length; idx++) {
scratch.setLength(idx);
CountMinOutput<T> cmo = prefixes.get(scratch.get());
if (cmo == null) {
cmo = new CountMinOutput<>();
cmo.count = 1;
cmo.output = pair.output;
prefixes.put(scratch.toIntsRef(), cmo);
} else {
cmo.count++;
T output1 = cmo.output;
if (output1.equals(outputs.getNoOutput())) {
output1 = outputs.getNoOutput();
}
T output2 = pair.output;
if (output2.equals(outputs.getNoOutput())) {
output2 = outputs.getNoOutput();
}
cmo.output = outputs.common(output1, output2);
}
if (idx == pair.input.length) {
cmo.isFinal = true;
cmo.finalOutput = cmo.output;
}
}
}
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now prune");
}
// prune 'em
final Iterator<Map.Entry<IntsRef, CountMinOutput<T>>> it = prefixes.entrySet().iterator();
while (it.hasNext()) {
Map.Entry<IntsRef, CountMinOutput<T>> ent = it.next();
final IntsRef prefix = ent.getKey();
final CountMinOutput<T> cmo = ent.getValue();
if (LuceneTestCase.VERBOSE) {
System.out.println(" term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
}
final boolean keep;
if (prune1 > 0) {
keep = cmo.count >= prune1;
} else {
assert prune2 > 0;
if (prune2 > 1 && cmo.count >= prune2) {
keep = true;
} else if (prefix.length > 0) {
// consult our parent
scratch.setLength(prefix.length - 1);
System.arraycopy(prefix.ints, prefix.offset, scratch.ints(), 0, scratch.length());
final CountMinOutput<T> cmo2 = prefixes.get(scratch.get());
//System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
} else if (cmo.count >= prune2) {
keep = true;
} else {
keep = false;
}
}
if (!keep) {
it.remove();
//System.out.println(" remove");
} else {
// clear isLeaf for all ancestors
//System.out.println(" keep");
scratch.copyInts(prefix);
scratch.setLength(scratch.length() - 1);
while (scratch.length() >= 0) {
final CountMinOutput<T> cmo2 = prefixes.get(scratch.get());
if (cmo2 != null) {
//System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
cmo2.isLeaf = false;
}
scratch.setLength(scratch.length() - 1);
}
}
}
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: after prune");
for (Map.Entry<IntsRef, CountMinOutput<T>> ent : prefixes.entrySet()) {
System.out.println(" " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
if (ent.getValue().isFinal) {
System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
}
}
}
if (prefixes.size() <= 1) {
assertNull(fst);
return;
}
assertNotNull(fst);
// make sure FST only enums valid prefixes
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: check pruned enum");
}
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<>(fst);
IntsRefFSTEnum.InputOutput<T> current;
while ((current = fstEnum.next()) != null) {
if (LuceneTestCase.VERBOSE) {
System.out.println(" fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output));
}
final CountMinOutput<T> cmo = prefixes.get(current.input);
assertNotNull(cmo);
assertTrue(cmo.isLeaf || cmo.isFinal);
//if (cmo.isFinal && !cmo.isLeaf) {
if (cmo.isFinal) {
assertEquals(cmo.finalOutput, current.output);
} else {
assertEquals(cmo.output, current.output);
}
}
// make sure all non-pruned prefixes are present in the FST
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: verify all prefixes");
}
final int[] stopNode = new int[1];
for (Map.Entry<IntsRef, CountMinOutput<T>> ent : prefixes.entrySet()) {
if (ent.getKey().length > 0) {
final CountMinOutput<T> cmo = ent.getValue();
final T output = run(fst, ent.getKey(), stopNode);
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output));
}
// if (cmo.isFinal && !cmo.isLeaf) {
if (cmo.isFinal) {
assertEquals(cmo.finalOutput, output);
} else {
assertEquals(cmo.output, output);
}
assertEquals(ent.getKey().length, stopNode[0]);
}
}
}
Aggregations