Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.
The close() method of the class CompletionFieldsConsumer:
@Override
public void close() throws IOException {
  if (closed) {
    return;
  }
  closed = true;
  String indexFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
  boolean success = false;
  try (IndexOutput indexOut = state.directory.createOutput(indexFile, state.context)) {
    delegateFieldsConsumer.close();
    CodecUtil.writeIndexHeader(indexOut, CODEC_NAME, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
    /*
     * we write the delegate postings format name so we can load it
     * without getting an instance in the ctor
     */
    indexOut.writeString(delegatePostingsFormatName);
    // write # of seen fields
    indexOut.writeVInt(seenFields.size());
    // write field numbers and dictOut offsets
    for (Map.Entry<String, CompletionMetaData> seenField : seenFields.entrySet()) {
      FieldInfo fieldInfo = state.fieldInfos.fieldInfo(seenField.getKey());
      indexOut.writeVInt(fieldInfo.number);
      CompletionMetaData metaData = seenField.getValue();
      indexOut.writeVLong(metaData.filePointer);
      indexOut.writeVLong(metaData.minWeight);
      indexOut.writeVLong(metaData.maxWeight);
      indexOut.writeByte(metaData.type);
    }
    CodecUtil.writeFooter(indexOut);
    CodecUtil.writeFooter(dictOut);
    IOUtils.close(dictOut);
    success = true;
  } finally {
    if (success == false) {
      IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
    }
  }
}
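For context, here is a minimal sketch of how the metadata written above could be read back. It is illustrative only (the real counterpart is CompletionFieldsProducer); the class and method names are hypothetical, and it assumes the caller has already validated the index header with CodecUtil.checkIndexHeader.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.store.IndexInput;

final class CompletionMetaDataSketch {
  // Hypothetical helper: mirrors the per-field records written by close() above.
  static Map<String, long[]> readCompletionMetaData(IndexInput in, FieldInfos fieldInfos) throws IOException {
    String delegatePostingsFormatName = in.readString(); // delegate postings format name, written first
    int numSeenFields = in.readVInt();                   // "# of seen fields"
    Map<String, long[]> metaData = new HashMap<>();
    for (int i = 0; i < numSeenFields; i++) {
      int fieldNumber = in.readVInt();                   // FieldInfo.number written above
      long filePointer = in.readVLong();                 // offset into the dictionary output
      long minWeight = in.readVLong();
      long maxWeight = in.readVLong();
      byte type = in.readByte();
      FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); // resolve the field number back to a name
      metaData.put(fieldInfo.name, new long[] {filePointer, minWeight, maxWeight, type});
    }
    return metaData;
  }
}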
Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.
The getDocValues() method of the class DefaultSortedSetDocValuesReaderState:
/** Return top-level doc values. */
@Override
public SortedSetDocValues getDocValues() throws IOException {
  // TODO: this is dup'd from slow composite reader wrapper ... can we factor it out to share?
  OrdinalMap map = null;
  // why are we using a map?
  synchronized (cachedOrdMaps) {
    map = cachedOrdMaps.get(field);
    if (map == null) {
      // uncached, or not a multi dv
      SortedSetDocValues dv = MultiDocValues.getSortedSetValues(reader, field);
      if (dv instanceof MultiDocValues.MultiSortedSetDocValues) {
        map = ((MultiDocValues.MultiSortedSetDocValues) dv).mapping;
        IndexReader.CacheHelper cacheHelper = reader.getReaderCacheHelper();
        if (cacheHelper != null && map.owner == cacheHelper.getKey()) {
          cachedOrdMaps.put(field, map);
        }
      }
      return dv;
    }
  }
  assert map != null;
  int size = reader.leaves().size();
  final SortedSetDocValues[] values = new SortedSetDocValues[size];
  final int[] starts = new int[size + 1];
  long cost = 0;
  for (int i = 0; i < size; i++) {
    LeafReaderContext context = reader.leaves().get(i);
    final LeafReader reader = context.reader();
    final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
    if (fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED_SET) {
      return null;
    }
    SortedSetDocValues v = reader.getSortedSetDocValues(field);
    if (v == null) {
      v = DocValues.emptySortedSet();
    }
    values[i] = v;
    starts[i] = context.docBase;
    cost += v.cost();
  }
  starts[size] = reader.maxDoc();
  return new MultiSortedSetDocValues(values, starts, map, cost);
}
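As a rough usage sketch (not part of the class above), the top-level SortedSetDocValues returned here can be iterated per document; ordinals are global across leaves thanks to the OrdinalMap. This assumes the Lucene 7.x iterator-style doc values API.

import java.io.IOException;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;

final class TopLevelOrdsSketch {
  // Prints all ordinals (and their terms) for one top-level docID.
  static void printOrds(SortedSetDocValues dv, int docID) throws IOException {
    if (dv.advanceExact(docID)) {                        // position on the composite-reader docID
      long ord;
      while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        BytesRef term = dv.lookupOrd(ord);               // global ord -> term, via the OrdinalMap
        System.out.println(ord + " -> " + term.utf8ToString());
      }
    }
  }
}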
Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.
The createWeight() method of the class PointInSetIncludingScoreQuery:
@Override
public final Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
  return new Weight(this) {
    @Override
    public void extractTerms(Set<Term> terms) {
    }
    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
      Scorer scorer = scorer(context);
      if (scorer != null) {
        int target = scorer.iterator().advance(doc);
        if (doc == target) {
          return Explanation.match(scorer.score(), "A match");
        }
      }
      return Explanation.noMatch("Not a match");
    }
    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      LeafReader reader = context.reader();
      FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
      if (fieldInfo == null) {
        return null;
      }
      if (fieldInfo.getPointDimensionCount() != 1) {
        throw new IllegalArgumentException("field=\"" + field + "\" was indexed with numDims=" + fieldInfo.getPointDimensionCount() + " but this query has numDims=1");
      }
      if (fieldInfo.getPointNumBytes() != bytesPerDim) {
        throw new IllegalArgumentException("field=\"" + field + "\" was indexed with bytesPerDim=" + fieldInfo.getPointNumBytes() + " but this query has bytesPerDim=" + bytesPerDim);
      }
      PointValues values = reader.getPointValues(field);
      if (values == null) {
        return null;
      }
      FixedBitSet result = new FixedBitSet(reader.maxDoc());
      float[] scores = new float[reader.maxDoc()];
      values.intersect(new MergePointVisitor(sortedPackedPoints, result, scores));
      return new Scorer(this) {
        DocIdSetIterator disi = new BitSetIterator(result, 10L);
        @Override
        public float score() throws IOException {
          return scores[docID()];
        }
        @Override
        public int freq() throws IOException {
          return 1;
        }
        @Override
        public int docID() {
          return disi.docID();
        }
        @Override
        public DocIdSetIterator iterator() {
          return disi;
        }
      };
    }
  };
}
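The scorer above relies on PointValues.intersect(...) driving a visitor that marks matching docs in the FixedBitSet and records their scores. Below is a deliberately simplified visitor, not the MergePointVisitor used by the query, only meant to illustrate the IntersectVisitor contract it implements.

import java.io.IOException;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.util.FixedBitSet;

// Toy visitor: accepts every doc it is shown. A real visitor (like MergePointVisitor)
// would compare packedValue against the query's sorted points before setting the bit.
final class AcceptAllVisitorSketch implements PointValues.IntersectVisitor {
  private final FixedBitSet hits;

  AcceptAllVisitorSketch(FixedBitSet hits) {
    this.hits = hits;
  }

  @Override
  public void visit(int docID) {
    hits.set(docID);                                    // whole leaf cell matched the query
  }

  @Override
  public void visit(int docID, byte[] packedValue) throws IOException {
    hits.set(docID);                                    // per-value visit; no filtering in this sketch
  }

  @Override
  public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
    return PointValues.Relation.CELL_CROSSES_QUERY;     // force per-value visits
  }
}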
Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.
The fieldsConsumer() method of the class MockRandomPostingsFormat:
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
  int minSkipInterval;
  if (state.segmentInfo.maxDoc() > 1000000) {
    // Test2BPostings can OOME otherwise:
    minSkipInterval = 3;
  } else {
    minSkipInterval = 2;
  }
  // we pull this before the seed intentionally: because it's not consumed at runtime
  // (the skipInterval is written into postings header).
  // NOTE: Currently not passed to postings writer.
  // before, it was being passed in wrongly as acceptableOverhead!
  int skipInterval = TestUtil.nextInt(seedRandom, minSkipInterval, 10);
  if (LuceneTestCase.VERBOSE) {
    System.out.println("MockRandomCodec: skipInterval=" + skipInterval);
  }
  final long seed = seedRandom.nextLong();
  if (LuceneTestCase.VERBOSE) {
    System.out.println("MockRandomCodec: writing to seg=" + state.segmentInfo.name + " formatID=" + state.segmentSuffix + " seed=" + seed);
  }
  final String seedFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, SEED_EXT);
  try (IndexOutput out = state.directory.createOutput(seedFileName, state.context)) {
    CodecUtil.writeIndexHeader(out, "MockRandomSeed", 0, state.segmentInfo.getId(), state.segmentSuffix);
    out.writeLong(seed);
    CodecUtil.writeFooter(out);
  }
  final Random random = new Random(seed);
  // consume a random for buffersize
  random.nextInt();
  PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state);
  final FieldsConsumer fields;
  final int t1 = random.nextInt(5);
  if (t1 == 0) {
    boolean success = false;
    try {
      fields = new FSTTermsWriter(state, postingsWriter);
      success = true;
    } finally {
      if (!success) {
        postingsWriter.close();
      }
    }
  } else if (t1 == 1) {
    boolean success = false;
    try {
      fields = new FSTOrdTermsWriter(state, postingsWriter);
      success = true;
    } finally {
      if (!success) {
        postingsWriter.close();
      }
    }
  } else if (t1 == 2) {
    if (LuceneTestCase.VERBOSE) {
      System.out.println("MockRandomCodec: writing BlockTree terms dict");
    }
    // TODO: would be nice to allow 1 but this is very
    // slow to write
    final int minTermsInBlock = TestUtil.nextInt(random, 2, 100);
    final int maxTermsInBlock = Math.max(2, (minTermsInBlock - 1) * 2 + random.nextInt(100));
    boolean success = false;
    try {
      fields = new BlockTreeTermsWriter(state, postingsWriter, minTermsInBlock, maxTermsInBlock);
      success = true;
    } finally {
      if (!success) {
        postingsWriter.close();
      }
    }
  } else if (t1 == 3) {
    if (LuceneTestCase.VERBOSE) {
      System.out.println("MockRandomCodec: writing Block terms dict");
    }
    boolean success = false;
    final TermsIndexWriterBase indexWriter;
    try {
      if (random.nextBoolean()) {
        int termIndexInterval = TestUtil.nextInt(random, 1, 100);
        if (LuceneTestCase.VERBOSE) {
          System.out.println("MockRandomCodec: fixed-gap terms index (tii=" + termIndexInterval + ")");
        }
        indexWriter = new FixedGapTermsIndexWriter(state, termIndexInterval);
      } else {
        final VariableGapTermsIndexWriter.IndexTermSelector selector;
        final int n2 = random.nextInt(3);
        if (n2 == 0) {
          final int tii = TestUtil.nextInt(random, 1, 100);
          selector = new VariableGapTermsIndexWriter.EveryNTermSelector(tii);
          if (LuceneTestCase.VERBOSE) {
            System.out.println("MockRandomCodec: variable-gap terms index (tii=" + tii + ")");
          }
        } else if (n2 == 1) {
          final int docFreqThresh = TestUtil.nextInt(random, 2, 100);
          final int tii = TestUtil.nextInt(random, 1, 100);
          selector = new VariableGapTermsIndexWriter.EveryNOrDocFreqTermSelector(docFreqThresh, tii);
        } else {
          final long seed2 = random.nextLong();
          final int gap = TestUtil.nextInt(random, 2, 40);
          if (LuceneTestCase.VERBOSE) {
            System.out.println("MockRandomCodec: random-gap terms index (max gap=" + gap + ")");
          }
          selector = new VariableGapTermsIndexWriter.IndexTermSelector() {
            final Random rand = new Random(seed2);
            @Override
            public boolean isIndexTerm(BytesRef term, TermStats stats) {
              return rand.nextInt(gap) == gap / 2;
            }
            @Override
            public void newField(FieldInfo fieldInfo) {
            }
          };
        }
        indexWriter = new VariableGapTermsIndexWriter(state, selector);
      }
      success = true;
    } finally {
      if (!success) {
        postingsWriter.close();
      }
    }
    success = false;
    try {
      fields = new BlockTermsWriter(indexWriter, state, postingsWriter);
      success = true;
    } finally {
      if (!success) {
        try {
          postingsWriter.close();
        } finally {
          indexWriter.close();
        }
      }
    }
  } else if (t1 == 4) {
    // Use OrdsBlockTree terms dict
    if (LuceneTestCase.VERBOSE) {
      System.out.println("MockRandomCodec: writing OrdsBlockTree");
    }
    // TODO: would be nice to allow 1 but this is very
    // slow to write
    final int minTermsInBlock = TestUtil.nextInt(random, 2, 100);
    final int maxTermsInBlock = Math.max(2, (minTermsInBlock - 1) * 2 + random.nextInt(100));
    boolean success = false;
    try {
      fields = new OrdsBlockTreeTermsWriter(state, postingsWriter, minTermsInBlock, maxTermsInBlock);
      success = true;
    } finally {
      if (!success) {
        postingsWriter.close();
      }
    }
  } else {
    // BUG!
    throw new AssertionError();
  }
  return fields;
}
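Whichever terms dictionary gets picked, the seed file written at the top of fieldsConsumer() is what lets the matching producer rebuild the same Random at read time. Here is a minimal sketch of reading it back; the real counterpart lives in this format's fieldsProducer, and the helper name below is hypothetical.

import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.ChecksumIndexInput;

final class MockRandomSeedSketch {
  // Reads the per-segment seed written by fieldsConsumer() above.
  static long readSeed(SegmentReadState state, String seedExt) throws IOException {
    String seedFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, seedExt);
    try (ChecksumIndexInput in = state.directory.openChecksumInput(seedFileName, state.context)) {
      CodecUtil.checkIndexHeader(in, "MockRandomSeed", 0, 0, state.segmentInfo.getId(), state.segmentSuffix);
      long seed = in.readLong();
      CodecUtil.checkFooter(in);
      return seed;
    }
  }
}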
Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.
The uninvert() method of the class DocTermOrds:
/** Call this only once (if you subclass!) */
protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException {
  final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
  if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
    throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
  }
  //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
  final long startTime = System.nanoTime();
  prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);
  final int maxDoc = reader.maxDoc();
  // immediate term numbers, or the index into the byte[] representing the last number
  final int[] index = new int[maxDoc];
  // last term we saw for this document
  final int[] lastTerm = new int[maxDoc];
  // list of term numbers for the doc (delta encoded vInts)
  final byte[][] bytes = new byte[maxDoc][];
  final Terms terms = reader.terms(field);
  if (terms == null) {
    // No terms
    return;
  }
  final TermsEnum te = terms.iterator();
  final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
  //System.out.println("seekStart=" + seekStart.utf8ToString());
  if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
    // No terms match
    return;
  }
  // For our "term index wrapper"
  final List<BytesRef> indexedTerms = new ArrayList<>();
  final PagedBytes indexedTermsBytes = new PagedBytes(15);
  // we need a minimum of 9 bytes, but round up to 12 since the space would
  // be wasted with most allocators anyway.
  byte[] tempArr = new byte[12];
  //
  // enumerate all terms, and build an intermediate form of the un-inverted field.
  //
  // During this intermediate form, every document has a (potential) byte[]
  // and the int[maxDoc()] array either contains the termNumber list directly
  // or the *end* offset of the termNumber list in its byte array (for faster
  // appending and faster creation of the final form).
  //
  // idea... if things are too large while building, we could do a range of docs
  // at a time (but it would be a fair amount slower to build)
  // could also do ranges in parallel to take advantage of multiple CPUs
  // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
  // values. This requires going over the field first to find the most
  // frequent terms ahead of time.
  int termNum = 0;
  postingsEnum = null;
  // Loop begins with te positioned to first term (we call seek above):
  for (;;) {
    final BytesRef t = te.term();
    if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
      break;
    }
    //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);
    visitTerm(te, termNum);
    if ((termNum & indexIntervalMask) == 0) {
      // Index this term
      sizeOfIndexedStrings += t.length;
      BytesRef indexedTerm = new BytesRef();
      indexedTermsBytes.copy(t, indexedTerm);
      // TODO: really should 1) strip off useless suffix,
      // and 2) use FST not array/PagedBytes
      indexedTerms.add(indexedTerm);
    }
    final int df = te.docFreq();
    if (df <= maxTermDocFreq) {
      postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);
      // dF, but takes deletions into account
      int actualDF = 0;
      for (;;) {
        int doc = postingsEnum.nextDoc();
        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        //System.out.println(" chunk=" + chunk + " docs");
        actualDF++;
        termInstances++;
        //System.out.println(" docID=" + doc);
        // add TNUM_OFFSET to the term number to make room for special reserved values:
        // 0 (end term) and 1 (index into byte array follows)
        int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
        lastTerm[doc] = termNum;
        int val = index[doc];
        if ((val & 0xff) == 1) {
          // index into byte array (actually the end of
          // the doc-specific byte[] when building)
          int pos = val >>> 8;
          int ilen = vIntSize(delta);
          byte[] arr = bytes[doc];
          int newend = pos + ilen;
          if (newend > arr.length) {
            // We avoid a doubling strategy to lower memory usage.
            // this faceting method isn't for docs with many terms.
            // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
            // TODO: figure out what array lengths we can round up to w/o actually using more memory
            // (how much space does a byte[] take up? Is data preceded by a 32 bit length only?
            // It should be safe to round up to the nearest 32 bits in any case.
            // 4 byte alignment
            int newLen = (newend + 3) & 0xfffffffc;
            byte[] newarr = new byte[newLen];
            System.arraycopy(arr, 0, newarr, 0, pos);
            arr = newarr;
            bytes[doc] = newarr;
          }
          pos = writeInt(delta, arr, pos);
          // update pointer to end index in byte[]
          index[doc] = (pos << 8) | 1;
        } else {
          // OK, this int has data in it... find the end (a zero starting byte - not
          // part of another number, hence not following a byte with the high bit set).
          int ipos;
          if (val == 0) {
            ipos = 0;
          } else if ((val & 0x0000ff80) == 0) {
            ipos = 1;
          } else if ((val & 0x00ff8000) == 0) {
            ipos = 2;
          } else if ((val & 0xff800000) == 0) {
            ipos = 3;
          } else {
            ipos = 4;
          }
          //System.out.println(" ipos=" + ipos);
          int endPos = writeInt(delta, tempArr, ipos);
          //System.out.println(" endpos=" + endPos);
          if (endPos <= 4) {
            // value will fit in the integer... move bytes back
            for (int j = ipos; j < endPos; j++) {
              val |= (tempArr[j] & 0xff) << (j << 3);
            }
            index[doc] = val;
          } else {
            // value won't fit... move integer into byte[]
            for (int j = 0; j < ipos; j++) {
              tempArr[j] = (byte) val;
              val >>>= 8;
            }
            // point at the end index in the byte[]
            index[doc] = (endPos << 8) | 1;
            bytes[doc] = tempArr;
            tempArr = new byte[12];
          }
        }
      }
      setActualDocFreq(termNum, actualDF);
    }
    termNum++;
    if (te.next() == null) {
      break;
    }
  }
  numTermsInField = termNum;
  long midPoint = System.nanoTime();
  if (termInstances == 0) {
    // we didn't invert anything
    // lower memory consumption.
    tnums = null;
  } else {
    this.index = index;
    for (int pass = 0; pass < 256; pass++) {
      byte[] target = tnums[pass];
      // end in target;
      int pos = 0;
      if (target != null) {
        pos = target.length;
      } else {
        target = new byte[4096];
      }
      // each pass shares the same byte[] for termNumber lists.
      for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
        int lim = Math.min(docbase + (1 << 16), maxDoc);
        for (int doc = docbase; doc < lim; doc++) {
          //System.out.println(" pass=" + pass + " process docID=" + doc);
          int val = index[doc];
          if ((val & 0xff) == 1) {
            int len = val >>> 8;
            //System.out.println(" ptr pos=" + pos);
            // change index to point to start of array
            index[doc] = (pos << 8) | 1;
            if ((pos & 0xff000000) != 0) {
              // we only have 24 bits for the array index
              throw new IllegalStateException("Too many values for UnInvertedField faceting on field " + field);
            }
            byte[] arr = bytes[doc];
            /*
            for(byte b : arr) {
              //System.out.println(" b=" + Integer.toHexString((int) b));
            }
            */
            // IMPORTANT: allow GC to avoid OOM
            bytes[doc] = null;
            if (target.length <= pos + len) {
              int newlen = target.length;
              // doubling strategy
              while (newlen <= pos + len) newlen <<= 1;
              byte[] newtarget = new byte[newlen];
              System.arraycopy(target, 0, newtarget, 0, pos);
              target = newtarget;
            }
            System.arraycopy(arr, 0, target, pos, len);
            // skip single byte at end and leave it 0 for terminator
            pos += len + 1;
          }
        }
      }
      // shrink array
      if (pos < target.length) {
        byte[] newtarget = new byte[pos];
        System.arraycopy(target, 0, newtarget, 0, pos);
        target = newtarget;
      }
      tnums[pass] = target;
      if ((pass << 16) > maxDoc)
        break;
    }
  }
  indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);
  long endTime = System.nanoTime();
  total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
  phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
}
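For reference, here is a simplified sketch of decoding one document's term-number list as laid out above: big-endian vInts whose high bit marks a continuation byte, deltas shifted by TNUM_OFFSET, and a zero delta terminating the list. This is illustrative only and the helper name is hypothetical; the real decoding lives in DocTermOrds' ordinal iterator.

final class TermNumberDecodeSketch {
  // Walks one delta-encoded term-number list built by uninvert().
  // 'arr' is the doc's byte[] (or a slice of tnums[pass]), 'start' the list's
  // start offset, and 'tnumOffset' corresponds to TNUM_OFFSET in the code above.
  static void decodeTermNumbers(byte[] arr, int start, int tnumOffset) {
    int pos = start;
    int termNum = 0;
    for (;;) {
      int delta = 0;
      for (;;) {                              // read one big-endian vInt
        byte b = arr[pos++];
        delta = (delta << 7) | (b & 0x7f);    // low 7 bits carry data
        if ((b & 0x80) == 0) {
          break;                              // high bit clear: last byte of this vInt
        }
      }
      if (delta == 0) {
        break;                                // a zero delta terminates the list
      }
      termNum += delta - tnumOffset;          // undo the reserved-value offset and accumulate
      System.out.println("termNum=" + termNum);
    }
  }
}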