Usage of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project by Apache.
From class FSTTester, method verifyUnPruned.
// FST is complete
/**
 * Exhaustively verifies an un-pruned FST against the sorted {@code pairs} list:
 * every input must be accepted with its exact output (both via run() and via
 * IntsRefFSTEnum.next()), random accepted words must map back to known pairs,
 * seekExact/seekFloor/seekCeil must agree with a binary search over the sorted
 * pairs, mixed next()/seek traversal must stay in sync, and — when
 * {@code doReverseLookup} — Util.getByOutput must correctly invert outputs.
 *
 * @param inputMode controls how int-label inputs are rendered in messages
 * @param fst the FST under test; must be null iff {@code pairs} is empty
 */
private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {
final FST<Long> fstLong;
final Set<Long> validOutputs;
long minLong = Long.MAX_VALUE;
long maxLong = Long.MIN_VALUE;
// Reverse lookup only applies when outputs are longs; collect the output
// range and the set of valid outputs up front for that test.
if (doReverseLookup) {
@SuppressWarnings("unchecked") FST<Long> fstLong0 = (FST<Long>) fst;
fstLong = fstLong0;
validOutputs = new HashSet<>();
for (InputOutput<T> pair : pairs) {
Long output = (Long) pair.output;
maxLong = Math.max(maxLong, output);
minLong = Math.min(minLong, output);
validOutputs.add(output);
}
} else {
fstLong = null;
validOutputs = null;
}
// An empty input set must have produced a null FST.
if (pairs.size() == 0) {
assertNull(fst);
return;
}
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now verify " + pairs.size() + " terms");
for (InputOutput<T> pair : pairs) {
assertNotNull(pair);
assertNotNull(pair.input);
assertNotNull(pair.output);
System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
}
}
assertNotNull(fst);
// visit valid pairs in order -- make sure all words are accepted, and the
// FSTEnum's next() steps through them correctly
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: check valid terms/next()");
}
{
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<>(fst);
for (InputOutput<T> pair : pairs) {
IntsRef term = pair.input;
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
}
T output = run(fst, term, null);
assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
assertTrue(outputsEqual(pair.output, output));
// verify enum's next
IntsRefFSTEnum.InputOutput<T> t = fstEnum.next();
assertNotNull(t);
assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input);
assertTrue(outputsEqual(pair.output, t.output));
}
// enum must be exhausted after visiting every pair
assertNull(fstEnum.next());
}
final Map<IntsRef, T> termsMap = new HashMap<>();
for (InputOutput<T> pair : pairs) {
termsMap.put(pair.input, pair.output);
}
// maxLong > minLong guards against a degenerate single-output range
if (doReverseLookup && maxLong > minLong) {
// Do random lookups so we test null (output doesn't
// exist) case:
assertNull(Util.getByOutput(fstLong, minLong - 7));
assertNull(Util.getByOutput(fstLong, maxLong + 7));
final int num = LuceneTestCase.atLeast(random, 100);
for (int iter = 0; iter < num; iter++) {
Long v = TestUtil.nextLong(random, minLong, maxLong);
IntsRef input = Util.getByOutput(fstLong, v);
// a hit is only required for outputs we actually added
assertTrue(validOutputs.contains(v) || input == null);
}
}
// find random matching word and make sure it's valid
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: verify random accepted terms");
}
final IntsRefBuilder scratch = new IntsRefBuilder();
int num = LuceneTestCase.atLeast(random, 500);
for (int iter = 0; iter < num; iter++) {
// randomAcceptedWord fills scratch with the chosen input and returns its output
T output = randomAcceptedWord(fst, scratch);
assertTrue("accepted word " + inputToString(inputMode, scratch.get()) + " is not valid", termsMap.containsKey(scratch.get()));
assertTrue(outputsEqual(termsMap.get(scratch.get()), output));
if (doReverseLookup) {
//System.out.println("lookup output=" + output + " outs=" + fst.outputs);
IntsRef input = Util.getByOutput(fstLong, (Long) output);
assertNotNull(input);
//System.out.println("  got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
assertEquals(scratch.get(), input);
}
}
// test IntsRefFSTEnum.seek:
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: verify seek");
}
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<>(fst);
num = LuceneTestCase.atLeast(random, 100);
for (int iter = 0; iter < num; iter++) {
if (LuceneTestCase.VERBOSE) {
System.out.println("  iter=" + iter);
}
if (random.nextBoolean()) {
// seek to term that doesn't exist:
while (true) {
final IntsRef term = toIntsRef(getRandomString(random), inputMode);
// binarySearch over the sorted pairs tells us where the term WOULD be;
// pos < 0 means it is absent (insertion point is -(pos+1))
int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
if (pos < 0) {
pos = -(pos + 1);
// ok doesn't exist
//System.out.println("  seek " + inputToString(inputMode, term));
final IntsRefFSTEnum.InputOutput<T> seekResult;
if (random.nextInt(3) == 0) {
if (LuceneTestCase.VERBOSE) {
System.out.println("  do non-exist seekExact term=" + inputToString(inputMode, term));
}
seekResult = fstEnum.seekExact(term);
// exact seek on a missing term must return null; -1 routes us to
// the null-expected branch below
pos = -1;
} else if (random.nextBoolean()) {
if (LuceneTestCase.VERBOSE) {
System.out.println("  do non-exist seekFloor term=" + inputToString(inputMode, term));
}
seekResult = fstEnum.seekFloor(term);
// floor of a missing term is the entry just before the insertion point
pos--;
} else {
if (LuceneTestCase.VERBOSE) {
System.out.println("  do non-exist seekCeil term=" + inputToString(inputMode, term));
}
// ceil of a missing term is the entry at the insertion point itself
seekResult = fstEnum.seekCeil(term);
}
if (pos != -1 && pos < pairs.size()) {
//System.out.println("    got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output));
assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult);
if (LuceneTestCase.VERBOSE) {
System.out.println("    got " + inputToString(inputMode, seekResult.input));
}
assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input);
assertTrue(outputsEqual(pairs.get(pos).output, seekResult.output));
} else {
// seeked before start or beyond end
//System.out.println("seek=" + seekTerm);
assertNull("expected null but got " + (seekResult == null ? "null" : inputToString(inputMode, seekResult.input)), seekResult);
if (LuceneTestCase.VERBOSE) {
System.out.println("    got null");
}
}
break;
}
}
} else {
// seek to term that does exist:
InputOutput<T> pair = pairs.get(random.nextInt(pairs.size()));
final IntsRefFSTEnum.InputOutput<T> seekResult;
// exercise all three seek flavors; each must land exactly on the pair
if (random.nextInt(3) == 2) {
if (LuceneTestCase.VERBOSE) {
System.out.println("  do exists seekExact term=" + inputToString(inputMode, pair.input));
}
seekResult = fstEnum.seekExact(pair.input);
} else if (random.nextBoolean()) {
if (LuceneTestCase.VERBOSE) {
System.out.println("  do exists seekFloor " + inputToString(inputMode, pair.input));
}
seekResult = fstEnum.seekFloor(pair.input);
} else {
if (LuceneTestCase.VERBOSE) {
System.out.println("  do exists seekCeil " + inputToString(inputMode, pair.input));
}
seekResult = fstEnum.seekCeil(pair.input);
}
assertNotNull(seekResult);
assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input);
assertTrue(outputsEqual(pair.output, seekResult.output));
}
}
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: mixed next/seek");
}
// test mixed next/seek
num = LuceneTestCase.atLeast(random, 100);
for (int iter = 0; iter < num; iter++) {
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: iter " + iter);
}
// reset:
fstEnum = new IntsRefFSTEnum<>(fst);
// upto tracks which pair the enum is currently positioned on (-1 = before start)
int upto = -1;
while (true) {
boolean isDone = false;
if (upto == pairs.size() - 1 || random.nextBoolean()) {
// next
upto++;
if (LuceneTestCase.VERBOSE) {
System.out.println("  do next");
}
isDone = fstEnum.next() == null;
} else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) {
// seek forward to a randomly generated NON-existing term; give up after
// 10 attempts and fall through to the next loop iteration
int attempt = 0;
for (; attempt < 10; attempt++) {
IntsRef term = toIntsRef(getRandomString(random), inputMode);
if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) {
int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
assert pos < 0;
// insertion point = index the ceil lands on; floor is one before it
upto = -(pos + 1);
if (random.nextBoolean()) {
upto--;
assertTrue(upto != -1);
if (LuceneTestCase.VERBOSE) {
System.out.println("  do non-exist seekFloor(" + inputToString(inputMode, term) + ")");
}
isDone = fstEnum.seekFloor(term) == null;
} else {
if (LuceneTestCase.VERBOSE) {
System.out.println("  do non-exist seekCeil(" + inputToString(inputMode, term) + ")");
}
isDone = fstEnum.seekCeil(term) == null;
}
break;
}
}
if (attempt == 10) {
continue;
}
} else {
// seek forward to an EXISTING term at a random offset
final int inc = random.nextInt(pairs.size() - upto - 1);
upto += inc;
if (upto == -1) {
upto = 0;
}
if (random.nextBoolean()) {
if (LuceneTestCase.VERBOSE) {
System.out.println("  do seekCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")");
}
isDone = fstEnum.seekCeil(pairs.get(upto).input) == null;
} else {
if (LuceneTestCase.VERBOSE) {
System.out.println("  do seekFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")");
}
isDone = fstEnum.seekFloor(pairs.get(upto).input) == null;
}
}
if (LuceneTestCase.VERBOSE) {
if (!isDone) {
System.out.println("    got " + inputToString(inputMode, fstEnum.current().input));
} else {
System.out.println("    got null");
}
}
// the enum must be exhausted exactly when upto walks past the last pair
if (upto == pairs.size()) {
assertTrue(isDone);
break;
} else {
assertFalse(isDone);
assertEquals(pairs.get(upto).input, fstEnum.current().input);
assertTrue(outputsEqual(pairs.get(upto).output, fstEnum.current().output));
/*
        if (upto < pairs.size()-1) {
          int tryCount = 0;
          while(tryCount < 10) {
            final IntsRef t = toIntsRef(getRandomString(), inputMode);
            if (pairs.get(upto).input.compareTo(t) < 0) {
              final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
              if (LuceneTestCase.VERBOSE) {
                System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
              }
              assertEquals(expected, fstEnum.beforeNext(t));
              break;
            }
            tryCount++;
          }
        }
        */
}
}
}
}
Usage of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project by Apache.
From class TestSynonymGraphFilter, method testPositionLengthAndType.
/**
 * verify type of token and positionLengths on synonyms of different word counts.
 */
public void testPositionLengthAndType() throws Exception {
String testFile = "spider man, spiderman\n" + "usa,united states,u s a,united states of america";
Analyzer analyzer = new MockAnalyzer(random());
SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
parser.parse(new StringReader(testFile));
analyzer.close();
SynonymMap map = parser.build();
analyzer = getFlattenAnalyzer(parser, true);
// Look up the raw FST output for "usa" and decode the synonym entry header:
// the vint's low bit is the keepOrig flag, the remaining bits are the count.
BytesRef value = Util.get(map.fst, Util.toUTF32(new CharsRef("usa"), new IntsRefBuilder()));
ByteArrayDataInput bytesReader = new ByteArrayDataInput(value.bytes, value.offset, value.length);
final int code = bytesReader.readVInt();
final int count = code >>> 1;
final int[] synonymsIdxs = new int[count];
for (int i = 0; i < count; i++) {
synonymsIdxs[i] = bytesReader.readVInt();
}
// Count the words of the third synonym by counting WORD_SEPARATOR bytes
// (a synonym of N words contains N-1 separators).
BytesRef scratchBytes = new BytesRef();
map.words.get(synonymsIdxs[2], scratchBytes);
int synonymLength = 1;
for (int i = scratchBytes.offset; i < scratchBytes.offset + scratchBytes.length; i++) {
if (scratchBytes.bytes[i] == SynonymMap.WORD_SEPARATOR) {
synonymLength++;
}
}
// FIX: expected value goes first in assertEquals; the original had the
// arguments reversed, which yields misleading failure messages.
assertEquals(3, count);
assertEquals(4, synonymLength);
assertAnalyzesTo(analyzer, "spider man", new String[] { "spiderman", "spider", "man" }, new int[] { 0, 0, 7 }, new int[] { 10, 6, 10 }, new String[] { "SYNONYM", "word", "word" }, new int[] { 1, 0, 1 }, new int[] { 2, 1, 1 });
assertAnalyzesToPositions(analyzer, "amazing spider man", new String[] { "amazing", "spiderman", "spider", "man" }, new String[] { "word", "SYNONYM", "word", "word" }, new int[] { 1, 1, 0, 1 }, new int[] { 1, 2, 1, 1 });
// System.out.println(toDot(getAnalyzer(parser, true).tokenStream("field", new StringReader("the usa is wealthy"))));
assertAnalyzesTo(analyzer, "the united states of america is wealthy", new String[] { "the", "usa", "united", "u", "united", "states", "s", "states", "a", "of", "america", "is", "wealthy" }, new int[] { 0, 4, 4, 4, 4, 11, 11, 11, 18, 18, 21, 29, 32 }, new int[] { 3, 28, 10, 10, 10, 28, 17, 17, 28, 20, 28, 31, 39 }, new String[] { "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "word", "word", "word", "word" }, new int[] { 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1 }, new int[] { 1, 4, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1 });
assertAnalyzesToPositions(analyzer, "spiderman", new String[] { "spider", "spiderman", "man" }, new String[] { "SYNONYM", "word", "SYNONYM" }, new int[] { 1, 0, 1 }, new int[] { 1, 2, 1 });
assertAnalyzesTo(analyzer, "spiderman enemies", new String[] { "spider", "spiderman", "man", "enemies" }, new int[] { 0, 0, 0, 10 }, new int[] { 9, 9, 9, 17 }, new String[] { "SYNONYM", "word", "SYNONYM", "word" }, new int[] { 1, 0, 1, 1 }, new int[] { 1, 2, 1, 1 });
assertAnalyzesTo(analyzer, "the usa is wealthy", new String[] { "the", "united", "u", "united", "usa", "states", "s", "states", "a", "of", "america", "is", "wealthy" }, new int[] { 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 11 }, new int[] { 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10, 18 }, new String[] { "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word" }, new int[] { 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1 }, new int[] { 1, 1, 1, 1, 4, 3, 1, 1, 2, 1, 1, 1, 1 });
assertGraphStrings(analyzer, "the usa is wealthy", new String[] { "the usa is wealthy", "the united states is wealthy", "the u s a is wealthy", "the united states of america is wealthy", // Wrong. Here only due to "sausagization" of the multi word synonyms.
"the u states is wealthy", "the u states a is wealthy", "the u s of america is wealthy", "the u states of america is wealthy", "the united s a is wealthy", "the united states a is wealthy", "the united s of america is wealthy" });
assertAnalyzesTo(analyzer, "the united states is wealthy", new String[] { "the", "usa", "u", "united", "united", "s", "states", "states", "a", "of", "america", "is", "wealthy" }, new int[] { 0, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 18, 21 }, new int[] { 3, 17, 10, 10, 10, 17, 17, 17, 17, 17, 17, 20, 28 }, new String[] { "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word" }, new int[] { 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1 }, new int[] { 1, 4, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1 }, false);
assertAnalyzesTo(analyzer, "the united states of balance", new String[] { "the", "usa", "u", "united", "united", "s", "states", "states", "a", "of", "america", "of", "balance" }, new int[] { 0, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 18, 21 }, new int[] { 3, 17, 10, 10, 10, 17, 17, 17, 17, 17, 17, 20, 28 }, new String[] { "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word" }, new int[] { 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1 }, new int[] { 1, 4, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1 });
analyzer.close();
}
Usage of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project by Apache.
From class TestFSTs, method testFinalOutputOnEndState.
/**
 * Builds a two-term FST ("stat" -> 17, "station" -> 10) and checks, via the
 * dot-format dump, that the final output lands on the end state's incoming
 * arc (label "t" carrying output [7] = 17 - 10).
 */
public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
final FST<Long> fst = builder.finish();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
StringWriter w = new StringWriter();
Util.toDot(fst, w, false, false);
w.close();
//System.out.println(w.toString());
// FIX: use contains() instead of the less readable indexOf(...) != -1
assertTrue(w.toString().contains("label=\"t/[7]\""));
}
Usage of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project by Apache.
From class TestFSTs, method testLargeOutputsOnArrayArcs.
/**
 * Adds 6 single-int inputs whose outputs are large (300-byte) BytesRefs that
 * differ only in the first byte, then verifies each lookup returns the full
 * 300 bytes with the right first byte and zeros elsewhere.
 */
public void testLargeOutputsOnArrayArcs() throws Exception {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final byte[] bytes = new byte[300];
final IntsRefBuilder input = new IntsRefBuilder();
input.append(0);
final BytesRef output = new BytesRef(bytes);
for (int arc = 0; arc < 6; arc++) {
input.setIntAt(0, arc);
output.bytes[0] = (byte) arc;
// deepCopyOf: output is reused/mutated each iteration, so snapshot it
builder.add(input.get(), BytesRef.deepCopyOf(output));
}
final FST<BytesRef> fst = builder.finish();
for (int arc = 0; arc < 6; arc++) {
input.setIntAt(0, arc);
final BytesRef result = Util.get(fst, input.get());
assertNotNull(result);
assertEquals(300, result.length);
// FIX: assertEquals takes (expected, actual); the original had them
// reversed, which produces misleading failure messages.
assertEquals(arc, result.bytes[result.offset]);
for (int byteIDX = 1; byteIDX < result.length; byteIDX++) {
assertEquals(0, result.bytes[result.offset + byteIDX]);
}
}
}
Usage of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project by Apache.
From class TestFSTs, method testShortestPathsWFSTRandom.
/** like testShortestPathsRandom, but uses pairoutputs so we have both a weight and an output */
public void testShortestPathsWFSTRandom() throws Exception {
int numWords = atLeast(1000);
// slowCompletor: brute-force reference map word -> (weight, output)
final TreeMap<String, TwoLongs> slowCompletor = new TreeMap<>();
final TreeSet<String> allPrefixes = new TreeSet<>();
PairOutputs<Long, Long> outputs = new PairOutputs<>(// weight
PositiveIntOutputs.getSingleton(), // output
PositiveIntOutputs.getSingleton());
final Builder<Pair<Long, Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
Random random = random();
// Generate unique random words with random weights/outputs, and record
// every proper prefix of every word as a query to test.
for (int i = 0; i < numWords; i++) {
String s;
while (true) {
s = TestUtil.randomSimpleString(random);
if (!slowCompletor.containsKey(s)) {
break;
}
}
for (int j = 1; j < s.length(); j++) {
allPrefixes.add(s.substring(0, j));
}
// weights 1..100
int weight = TestUtil.nextInt(random, 1, 100);
// outputs 0..500
int output = TestUtil.nextInt(random, 0, 500);
slowCompletor.put(s, new TwoLongs(weight, output));
}
// Build the FST; TreeMap iteration gives the sorted order Builder requires.
for (Map.Entry<String, TwoLongs> e : slowCompletor.entrySet()) {
//System.out.println("add: " + e);
long weight = e.getValue().a;
long output = e.getValue().b;
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output));
}
final FST<Pair<Long, Long>> fst = builder.finish();
//System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();
BytesReader reader = fst.getBytesReader();
//System.out.println("testing: " + allPrefixes.size() + " prefixes");
for (String prefix : allPrefixes) {
// 1. run prefix against fst, then complete by value
//System.out.println("TEST: " + prefix);
Pair<Long, Long> prefixOutput = outputs.getNoOutput();
FST.Arc<Pair<Long, Long>> arc = fst.getFirstArc(new FST.Arc<Pair<Long, Long>>());
// Walk the prefix arc by arc, accumulating the partial output; every
// prefix in allPrefixes came from a real word, so each arc must exist.
for (int idx = 0; idx < prefix.length(); idx++) {
if (fst.findTargetArc((int) prefix.charAt(idx), arc, arc, reader) == null) {
fail();
}
prefixOutput = outputs.add(prefixOutput, arc.output);
}
final int topN = TestUtil.nextInt(random, 1, 10);
Util.TopResults<Pair<Long, Long>> r = Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minPairWeightComparator, topN, true);
assertTrue(r.isComplete);
// 2. go thru whole treemap (slowCompletor) and check it's actually the best suggestion
final List<Result<Pair<Long, Long>>> matches = new ArrayList<>();
// TODO: could be faster... but it's slowCompletor for a reason
for (Map.Entry<String, TwoLongs> e : slowCompletor.entrySet()) {
if (e.getKey().startsWith(prefix)) {
//System.out.println("  consider " + e.getKey());
// Subtract the prefix's accumulated weight/output: shortestPaths
// returns the completion relative to the prefix arc.
matches.add(new Result<>(Util.toIntsRef(new BytesRef(e.getKey().substring(prefix.length())), new IntsRefBuilder()), outputs.newPair(e.getValue().a - prefixOutput.output1, e.getValue().b - prefixOutput.output2)));
}
}
assertTrue(matches.size() > 0);
// Sort reference matches by weight (ties broken by input) and truncate to topN.
Collections.sort(matches, new TieBreakByInputComparator<>(minPairWeightComparator));
if (matches.size() > topN) {
matches.subList(topN, matches.size()).clear();
}
assertEquals(matches.size(), r.topN.size());
for (int hit = 0; hit < r.topN.size(); hit++) {
//System.out.println("  check hit " + hit);
assertEquals(matches.get(hit).input, r.topN.get(hit).input);
assertEquals(matches.get(hit).output, r.topN.get(hit).output);
}
}
}
Aggregations