use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.
the class TestFSTsMisc method testListOfOutputsEmptyString.
public void testListOfOutputsEmptyString() throws Exception {
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(scratch.get(), 0L);
builder.add(scratch.get(), 1L);
builder.add(scratch.get(), 17L);
builder.add(scratch.get(), 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L);
final FST<Object> fst = builder.finish();
Object output = Util.get(fst, new BytesRef(""));
assertNotNull(output);
List<Long> outputList = outputs.asList(output);
assertEquals(4, outputList.size());
assertEquals(0L, outputList.get(0).longValue());
assertEquals(1L, outputList.get(1).longValue());
assertEquals(17L, outputList.get(2).longValue());
assertEquals(1L, outputList.get(3).longValue());
output = Util.get(fst, new BytesRef("a"));
assertNotNull(output);
outputList = outputs.asList(output);
assertEquals(3, outputList.size());
assertEquals(1L, outputList.get(0).longValue());
assertEquals(3L, outputList.get(1).longValue());
assertEquals(0L, outputList.get(2).longValue());
output = Util.get(fst, new BytesRef("b"));
assertNotNull(output);
outputList = outputs.asList(output);
assertEquals(1, outputList.size());
assertEquals(0L, outputList.get(0).longValue());
}
use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.
the class FacetsConfig method processFacetFields.
private void processFacetFields(TaxonomyWriter taxoWriter, Map<String, List<FacetField>> byField, Document doc) throws IOException {
for (Map.Entry<String, List<FacetField>> ent : byField.entrySet()) {
String indexFieldName = ent.getKey();
//System.out.println(" indexFieldName=" + indexFieldName + " fields=" + ent.getValue());
IntsRefBuilder ordinals = new IntsRefBuilder();
for (FacetField facetField : ent.getValue()) {
FacetsConfig.DimConfig ft = getDimConfig(facetField.dim);
if (facetField.path.length > 1 && ft.hierarchical == false) {
throw new IllegalArgumentException("dimension \"" + facetField.dim + "\" is not hierarchical yet has " + facetField.path.length + " components");
}
FacetLabel cp = new FacetLabel(facetField.dim, facetField.path);
checkTaxoWriter(taxoWriter);
int ordinal = taxoWriter.addCategory(cp);
ordinals.append(ordinal);
if (ft.multiValued && (ft.hierarchical || ft.requireDimCount)) {
//System.out.println(" add parents");
// Add all parents too:
int parent = taxoWriter.getParent(ordinal);
while (parent > 0) {
ordinals.append(parent);
parent = taxoWriter.getParent(parent);
}
if (ft.requireDimCount == false) {
// Remove last (dimension) ord:
ordinals.setLength(ordinals.length() - 1);
}
}
// Drill down:
for (int i = 1; i <= cp.length; i++) {
doc.add(new StringField(indexFieldName, pathToString(cp.components, i), Field.Store.NO));
}
}
// Facet counts:
// DocValues are considered stored fields:
doc.add(new BinaryDocValuesField(indexFieldName, dedupAndEncode(ordinals.get())));
}
}
use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.
the class MemoryDocValuesConsumer method writeFST.
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
meta.writeVInt(field.number);
meta.writeByte(FST);
meta.writeLong(data.getFilePointer());
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = 0;
for (BytesRef v : values) {
builder.add(Util.toIntsRef(v, scratch), ord);
ord++;
}
FST<Long> fst = builder.finish();
if (fst != null) {
fst.save(data);
}
meta.writeVLong(ord);
}
use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.
the class MemoryDocValuesProducer method getSortedNonIterator.
private LegacySortedDocValues getSortedNonIterator(FieldInfo field) throws IOException {
final FSTEntry entry = fsts.get(field.name);
if (entry.numOrds == 0) {
return DocValues.emptyLegacySorted();
}
FST<Long> instance;
synchronized (this) {
instance = fstInstances.get(field.name);
if (instance == null) {
IndexInput data = this.data.clone();
data.seek(entry.offset);
instance = new FST<>(data, PositiveIntOutputs.getSingleton());
if (!merging) {
ramBytesUsed.addAndGet(instance.ramBytesUsed());
fstInstances.put(field.name, instance);
}
}
}
final LegacyNumericDocValues docToOrd = getNumericNonIterator(field);
final FST<Long> fst = instance;
// per-thread resources
final BytesReader in = fst.getBytesReader();
final Arc<Long> firstArc = new Arc<>();
final Arc<Long> scratchArc = new Arc<>();
final IntsRefBuilder scratchInts = new IntsRefBuilder();
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
return new LegacySortedDocValues() {
final BytesRefBuilder term = new BytesRefBuilder();
@Override
public int getOrd(int docID) {
return (int) docToOrd.get(docID);
}
@Override
public BytesRef lookupOrd(int ord) {
try {
in.setPosition(0);
fst.getFirstArc(firstArc);
IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts);
return Util.toBytesRef(output, term);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
@Override
public int lookupTerm(BytesRef key) {
try {
InputOutput<Long> o = fstEnum.seekCeil(key);
if (o == null) {
return -getValueCount() - 1;
} else if (o.input.equals(key)) {
return o.output.intValue();
} else {
return (int) -o.output - 1;
}
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
@Override
public int getValueCount() {
return (int) entry.numOrds;
}
@Override
public TermsEnum termsEnum() {
return new FSTTermsEnum(fst);
}
};
}
use of org.apache.lucene.util.IntsRefBuilder in project lucene-solr by apache.
the class FreeTextSuggester method build.
/** Build the suggest index, using up to the specified
* amount of temporary RAM while building. Note that
* the weights for the suggestions are ignored. */
public void build(InputIterator iterator, double ramBufferSizeMB) throws IOException {
if (iterator.hasPayloads()) {
throw new IllegalArgumentException("this suggester doesn't support payloads");
}
if (iterator.hasContexts()) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
String prefix = getClass().getSimpleName();
Path tempIndexPath = Files.createTempDirectory(prefix + ".index.");
Directory dir = FSDirectory.open(tempIndexPath);
IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer);
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
iwc.setRAMBufferSizeMB(ramBufferSizeMB);
IndexWriter writer = new IndexWriter(dir, iwc);
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
// TODO: if only we had IndexOptions.TERMS_ONLY...
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
ft.setOmitNorms(true);
ft.freeze();
Document doc = new Document();
Field field = new Field("body", "", ft);
doc.add(field);
totTokens = 0;
IndexReader reader = null;
boolean success = false;
count = 0;
try {
while (true) {
BytesRef surfaceForm = iterator.next();
if (surfaceForm == null) {
break;
}
field.setStringValue(surfaceForm.utf8ToString());
writer.addDocument(doc);
count++;
}
reader = DirectoryReader.open(writer);
Terms terms = MultiFields.getTerms(reader, "body");
if (terms == null) {
throw new IllegalArgumentException("need at least one suggestion");
}
// Move all ngrams into an FST:
TermsEnum termsEnum = terms.iterator();
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder();
while (true) {
BytesRef term = termsEnum.next();
if (term == null) {
break;
}
int ngramCount = countGrams(term);
if (ngramCount > grams) {
throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
}
if (ngramCount == 1) {
totTokens += termsEnum.totalTermFreq();
}
builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
}
fst = builder.finish();
if (fst == null) {
throw new IllegalArgumentException("need at least one suggestion");
}
//System.out.println("FST: " + fst.getNodeCount() + " nodes");
/*
PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
Util.toDot(fst, pw, true, true);
pw.close();
*/
// Writer was only temporary, to count up bigrams,
// which we transferred to the FST, so now we
// rollback:
writer.rollback();
success = true;
} finally {
try {
if (success) {
IOUtils.close(reader, dir);
} else {
IOUtils.closeWhileHandlingException(reader, writer, dir);
}
} finally {
IOUtils.rm(tempIndexPath);
}
}
}
Aggregations