Use of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project by Apache.
Class FSTTester, method verifyPruned.
/**
 * Verifies a pruned FST against a brute-force model built from {@code pairs}.
 *
 * <p>Every prefix of every input (from the empty prefix up to the full input) is tallied
 * with its occurrence count and the common output of all terms sharing it. Prefixes are
 * then removed according to {@code prune1}/{@code prune2}, and the FST is checked in both
 * directions: everything the FST enumerates must be a surviving prefix, and every
 * surviving non-empty prefix must be runnable through the FST with the expected output.
 *
 * <p>If at most one prefix survives pruning, the FST is asserted to be {@code null}.
 *
 * @param inputMode only forwarded to {@code inputToString} for verbose logging
 * @param fst the pruned FST under test; expected to be {@code null} when at most one
 *     prefix survives
 * @param prune1 when {@code > 0}, a prefix is kept iff its count is {@code >= prune1}
 * @param prune2 consulted only when {@code prune1 <= 0}; asserted to be {@code > 0}
 * @throws IOException declared for the FST enumeration / traversal calls
 */
private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
for (InputOutput<T> pair : pairs) {
System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
}
}
// To validate the FST, we brute-force compute all prefixes
// in the terms, matched to their "common" outputs, prune that
// set according to the prune thresholds, then assert the FST
// matches that same set.
// NOTE: Crazy RAM intensive!!
//System.out.println("TEST: tally prefixes");
// build all prefixes
final Map<IntsRef, CountMinOutput<T>> prefixes = new HashMap<>();
final IntsRefBuilder scratch = new IntsRefBuilder();
for (InputOutput<T> pair : pairs) {
scratch.copyInts(pair.input);
// idx runs from 0 (empty prefix) through pair.input.length (the whole term)
for (int idx = 0; idx <= pair.input.length; idx++) {
scratch.setLength(idx);
CountMinOutput<T> cmo = prefixes.get(scratch.get());
if (cmo == null) {
// first time this prefix is seen: start with this term's output
cmo = new CountMinOutput<>();
cmo.count = 1;
cmo.output = pair.output;
// presumably toIntsRef() returns an independent copy so the map key does not
// alias the reused scratch buffer -- TODO confirm
prefixes.put(scratch.toIntsRef(), cmo);
} else {
cmo.count++;
// replace any output that equals NO_OUTPUT with the instance returned by
// getNoOutput() before computing the common output
T output1 = cmo.output;
if (output1.equals(outputs.getNoOutput())) {
output1 = outputs.getNoOutput();
}
T output2 = pair.output;
if (output2.equals(outputs.getNoOutput())) {
output2 = outputs.getNoOutput();
}
cmo.output = outputs.common(output1, output2);
}
// the prefix is the complete term: mark it final and snapshot its output
if (idx == pair.input.length) {
cmo.isFinal = true;
cmo.finalOutput = cmo.output;
}
}
}
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now prune");
}
// prune 'em
// An explicit iterator is used so entries can be removed while iterating.
final Iterator<Map.Entry<IntsRef, CountMinOutput<T>>> it = prefixes.entrySet().iterator();
while (it.hasNext()) {
Map.Entry<IntsRef, CountMinOutput<T>> ent = it.next();
final IntsRef prefix = ent.getKey();
final CountMinOutput<T> cmo = ent.getValue();
if (LuceneTestCase.VERBOSE) {
System.out.println(" term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
}
final boolean keep;
if (prune1 > 0) {
// prune1 mode: decision depends only on this prefix's own count
keep = cmo.count >= prune1;
} else {
assert prune2 > 0;
if (prune2 > 1 && cmo.count >= prune2) {
keep = true;
} else if (prefix.length > 0) {
// consult our parent
// Build the parent key (prefix minus its last int) in scratch.
// NOTE(review): no grow() here; this relies on scratch's backing array already
// being large enough from the earlier copyInts calls -- confirm
scratch.setLength(prefix.length - 1);
System.arraycopy(prefix.ints, prefix.offset, scratch.ints(), 0, scratch.length());
// cmo2 is null when the parent is not (or is no longer) in the map
final CountMinOutput<T> cmo2 = prefixes.get(scratch.get());
//System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
} else if (cmo.count >= prune2) {
// empty prefix: judged on its own count
keep = true;
} else {
keep = false;
}
}
if (!keep) {
it.remove();
//System.out.println(" remove");
} else {
// clear isLeaf for all ancestors
// Walk from the parent up to the empty prefix by shrinking scratch one int at a
// time; the loop ends once the length drops to -1.
//System.out.println(" keep");
scratch.copyInts(prefix);
scratch.setLength(scratch.length() - 1);
while (scratch.length() >= 0) {
final CountMinOutput<T> cmo2 = prefixes.get(scratch.get());
if (cmo2 != null) {
//System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
cmo2.isLeaf = false;
}
scratch.setLength(scratch.length() - 1);
}
}
}
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: after prune");
for (Map.Entry<IntsRef, CountMinOutput<T>> ent : prefixes.entrySet()) {
System.out.println(" " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
if (ent.getValue().isFinal) {
System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
}
}
}
// With at most one surviving prefix no FST is expected at all.
if (prefixes.size() <= 1) {
assertNull(fst);
return;
}
assertNotNull(fst);
// make sure FST only enums valid prefixes
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: check pruned enum");
}
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<>(fst);
IntsRefFSTEnum.InputOutput<T> current;
while ((current = fstEnum.next()) != null) {
if (LuceneTestCase.VERBOSE) {
System.out.println(" fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output));
}
// every enumerated input must be a surviving prefix that is a leaf or a final term
final CountMinOutput<T> cmo = prefixes.get(current.input);
assertNotNull(cmo);
assertTrue(cmo.isLeaf || cmo.isFinal);
//if (cmo.isFinal && !cmo.isLeaf) {
if (cmo.isFinal) {
assertEquals(cmo.finalOutput, current.output);
} else {
assertEquals(cmo.output, current.output);
}
}
// make sure all non-pruned prefixes are present in the FST
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: verify all prefixes");
}
// single-element array used as an out-parameter for run(...)
final int[] stopNode = new int[1];
for (Map.Entry<IntsRef, CountMinOutput<T>> ent : prefixes.entrySet()) {
// the empty prefix is skipped
if (ent.getKey().length > 0) {
final CountMinOutput<T> cmo = ent.getValue();
final T output = run(fst, ent.getKey(), stopNode);
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output));
}
// if (cmo.isFinal && !cmo.isLeaf) {
if (cmo.isFinal) {
assertEquals(cmo.finalOutput, output);
} else {
assertEquals(cmo.output, output);
}
// presumably run(...) stores the number of input ints it consumed in stopNode[0];
// the whole prefix must have been consumed -- TODO confirm against run(...)
assertEquals(ent.getKey().length, stopNode[0]);
}
}
}
Use of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project by Apache.
Class BaseSynonymParserTestCase, method assertEntryEquals.
/**
 * Asserts that a parsed synonym map contains the expected entry for a word.
 *
 * <p>The word is looked up in the map's FST, the encoded entry is decoded, and three things
 * are checked in order: the "keep original" flag (lowest bit of the header, {@code 0} meaning
 * keep), the number of synonyms (remaining header bits), and that every stored synonym is one
 * of the expected ones.
 *
 * @param synonynMap the synonym map produced by the parser
 * @param word the word (or phrase) to look up, as it comes out of the analyzer; spaces are
 *     replaced by {@link SynonymMap#WORD_SEPARATOR} before the lookup
 * @param includeOrig whether the entry is expected to keep the original word
 * @param synonyms the expected synonyms, with word separators written as single spaces
 * @throws Exception if the lookup or decoding fails
 */
public static void assertEntryEquals(SynonymMap synonynMap, String word, boolean includeOrig, String[] synonyms) throws Exception {
  word = word.replace(' ', SynonymMap.WORD_SEPARATOR);

  final BytesRef encoded = Util.get(synonynMap.fst, Util.toUTF32(new CharsRef(word), new IntsRefBuilder()));
  assertNotNull("No synonyms found for: " + word, encoded);

  final ByteArrayDataInput in = new ByteArrayDataInput(encoded.bytes, encoded.offset, encoded.length);

  // Header vInt: bit 0 clear => original is kept; the remaining bits hold the synonym count.
  final int header = in.readVInt();
  final boolean origKept = (header & 1) == 0;
  assertEquals("Include original different than expected. Expected " + includeOrig + " was " + origKept, includeOrig, origKept);

  final int actualCount = header >>> 1;
  assertEquals("Invalid synonym count. Expected " + synonyms.length + " was " + actualCount, synonyms.length, actualCount);

  final Set<String> expected = new HashSet<>(Arrays.asList(synonyms));
  final BytesRef spare = new BytesRef();
  for (int remaining = actualCount; remaining > 0; remaining--) {
    // Each following vInt is an id into the map's word table.
    synonynMap.words.get(in.readVInt(), spare);
    final String found = spare.utf8ToString().replace(SynonymMap.WORD_SEPARATOR, ' ');
    assertTrue("Unexpected synonym found: " + found, expected.contains(found));
  }
}
Use of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project by Apache.
Class MemoryDocValuesConsumer, method writeFST.
/**
 * Writes the given values as an FST that maps each value to its ordinal.
 *
 * <p>Metadata receives the field number, the {@code FST} marker byte and the current data
 * file pointer up front, and the number of values afterwards. The FST itself is saved to the
 * data output unless the builder returned {@code null}.
 *
 * @param field the field the values belong to
 * @param values the values to add; each one gets the next ordinal, starting at 0, in
 *     iteration order
 * @throws IOException if writing to the metadata or data output fails
 */
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
  // Metadata header: which field, what kind of entry, and where its data begins.
  meta.writeVInt(field.number);
  meta.writeByte(FST);
  meta.writeLong(data.getFilePointer());

  final Builder<Long> fstBuilder = new Builder<>(INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
  final IntsRefBuilder spare = new IntsRefBuilder();
  long nextOrd = 0;
  for (BytesRef value : values) {
    fstBuilder.add(Util.toIntsRef(value, spare), nextOrd++);
  }

  final FST<Long> built = fstBuilder.finish();
  if (built != null) {
    built.save(data);
  }

  // Total number of ordinals assigned above.
  meta.writeVLong(nextOrd);
}
Use of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project by Apache.
Class MemoryDocValuesProducer, method getSortedNonIterator.
/**
 * Returns random-access sorted doc values for a field, backed by the field's FST.
 *
 * <p>A field with zero ordinals yields the shared empty instance. Otherwise the FST is taken
 * from {@code fstInstances} or, if absent, loaded from a clone of the data input at the
 * entry's offset. A freshly loaded FST is cached and added to {@code ramBytesUsed} only when
 * this producer is not merging.
 *
 * @param field the field whose sorted values are requested
 * @return a view whose {@code lookupTerm} returns the term's ordinal when present, and
 *     otherwise {@code -(ordinal of the ceiling term) - 1}, or {@code -getValueCount() - 1}
 *     when there is no ceiling term
 * @throws IOException if loading the FST or the numeric doc-to-ord values fails
 */
private LegacySortedDocValues getSortedNonIterator(FieldInfo field) throws IOException {
  final FSTEntry entry = fsts.get(field.name);
  if (entry.numOrds == 0) {
    return DocValues.emptyLegacySorted();
  }

  final FST<Long> fst;
  synchronized (this) {
    FST<Long> cached = fstInstances.get(field.name);
    if (cached == null) {
      final IndexInput input = this.data.clone();
      input.seek(entry.offset);
      cached = new FST<>(input, PositiveIntOutputs.getSingleton());
      if (!merging) {
        ramBytesUsed.addAndGet(cached.ramBytesUsed());
        fstInstances.put(field.name, cached);
      }
    }
    fst = cached;
  }

  final LegacyNumericDocValues docToOrd = getNumericNonIterator(field);

  // Scratch state created fresh for each returned instance.
  final BytesReader in = fst.getBytesReader();
  final Arc<Long> firstArc = new Arc<>();
  final Arc<Long> scratchArc = new Arc<>();
  final IntsRefBuilder scratchInts = new IntsRefBuilder();
  final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);

  return new LegacySortedDocValues() {
    final BytesRefBuilder term = new BytesRefBuilder();

    @Override
    public int getOrd(int docID) {
      return (int) docToOrd.get(docID);
    }

    @Override
    public BytesRef lookupOrd(int ord) {
      try {
        in.setPosition(0);
        fst.getFirstArc(firstArc);
        final IntsRef input = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts);
        return Util.toBytesRef(input, term);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public int lookupTerm(BytesRef key) {
      final InputOutput<Long> ceiling;
      try {
        ceiling = fstEnum.seekCeil(key);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      if (ceiling == null) {
        // key sorts after every term
        return -getValueCount() - 1;
      }
      if (ceiling.input.equals(key)) {
        return ceiling.output.intValue();
      }
      return (int) -ceiling.output - 1;
    }

    @Override
    public int getValueCount() {
      return (int) entry.numOrds;
    }

    @Override
    public TermsEnum termsEnum() {
      return new FSTTermsEnum(fst);
    }
  };
}
Use of org.apache.lucene.util.IntsRefBuilder in the lucene-solr project by Apache.
Class FreeTextSuggester, method build.
/**
 * Build the suggest index, using up to the specified
 * amount of temporary RAM while building. Note that
 * the weights for the suggestions are ignored.
 *
 * <p>All surface forms are indexed into a temporary on-disk index using the index analyzer;
 * the resulting terms (ngrams) and their total term frequencies are then moved into an FST.
 * The temporary index is always removed before this method returns, whether it succeeds or
 * fails.
 *
 * @param iterator source of surface forms; must not carry payloads or contexts
 * @param ramBufferSizeMB RAM buffer size handed to the temporary {@link IndexWriter}
 * @throws IllegalArgumentException if the iterator has payloads or contexts, if no
 *     suggestion produced any term, or if a term contains more grams than {@code grams}
 * @throws IOException if creating, writing or reading the temporary index fails
 */
public void build(InputIterator iterator, double ramBufferSizeMB) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }

  String prefix = getClass().getSimpleName();
  Path tempIndexPath = Files.createTempDirectory(prefix + ".index.");

  // Everything that can fail after the temp directory exists is done inside the try,
  // so the finally block always closes what was opened and deletes the directory.
  // (Previously a failure while opening the Directory or IndexWriter leaked both.)
  Directory dir = null;
  IndexWriter writer = null;
  IndexReader reader = null;
  boolean success = false;
  try {
    dir = FSDirectory.open(tempIndexPath);

    IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(ramBufferSizeMB);
    writer = new IndexWriter(dir, iwc);

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    // TODO: if only we had IndexOptions.TERMS_ONLY...
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    ft.setOmitNorms(true);
    ft.freeze();

    // A single document/field pair is reused for every surface form.
    Document doc = new Document();
    Field field = new Field("body", "", ft);
    doc.add(field);

    totTokens = 0;
    count = 0;

    while (true) {
      BytesRef surfaceForm = iterator.next();
      if (surfaceForm == null) {
        break;
      }
      field.setStringValue(surfaceForm.utf8ToString());
      writer.addDocument(doc);
      count++;
    }

    reader = DirectoryReader.open(writer);
    Terms terms = MultiFields.getTerms(reader, "body");
    if (terms == null) {
      throw new IllegalArgumentException("need at least one suggestion");
    }

    // Move all ngrams into an FST:
    TermsEnum termsEnum = terms.iterator();
    Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      int ngramCount = countGrams(term);
      if (ngramCount > grams) {
        throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
      }
      // Only unigrams contribute to the total token count.
      if (ngramCount == 1) {
        totTokens += termsEnum.totalTermFreq();
      }
      builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
    }

    fst = builder.finish();
    if (fst == null) {
      throw new IllegalArgumentException("need at least one suggestion");
    }
    //System.out.println("FST: " + fst.getNodeCount() + " nodes");
    /*
    PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
    Util.toDot(fst, pw, true, true);
    pw.close();
    */

    // Writer was only temporary, to count up bigrams,
    // which we transferred to the FST, so now we
    // rollback:
    writer.rollback();
    success = true;
  } finally {
    try {
      if (success) {
        // The writer is not closed here; rollback() above has already been called on it.
        IOUtils.close(reader, dir);
      } else {
        // Any of these may still be null if the failure happened early.
        IOUtils.closeWhileHandlingException(reader, writer, dir);
      }
    } finally {
      IOUtils.rm(tempIndexPath);
    }
  }
}
Aggregations