Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.
The class TermVectorsWriter, method addAllDocVectors.
/** Safe (but, slowish) default method to write every
 *  vector field in the document. */
protected final void addAllDocVectors(Fields vectors, MergeState mergeState) throws IOException {
  if (vectors == null) {
    startDocument(0);
    finishDocument();
    return;
  }

  int numFields = vectors.size();
  if (numFields == -1) {
    // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
    numFields = 0;
    for (final Iterator<String> it = vectors.iterator(); it.hasNext(); ) {
      it.next();
      numFields++;
    }
  }
  startDocument(numFields);

  String lastFieldName = null;

  TermsEnum termsEnum = null;
  PostingsEnum docsAndPositionsEnum = null;

  int fieldCount = 0;
  for (String fieldName : vectors) {
    fieldCount++;
    final FieldInfo fieldInfo = mergeState.mergeFieldInfos.fieldInfo(fieldName);

    assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0 : "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
    lastFieldName = fieldName;

    final Terms terms = vectors.terms(fieldName);
    if (terms == null) {
      // FieldsEnum shouldn't lie...
      continue;
    }

    final boolean hasPositions = terms.hasPositions();
    final boolean hasOffsets = terms.hasOffsets();
    final boolean hasPayloads = terms.hasPayloads();
    assert !hasPayloads || hasPositions;

    int numTerms = (int) terms.size();
    if (numTerms == -1) {
      // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
      numTerms = 0;
      termsEnum = terms.iterator();
      while (termsEnum.next() != null) {
        numTerms++;
      }
    }

    startField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
    termsEnum = terms.iterator();

    int termCount = 0;
    while (termsEnum.next() != null) {
      termCount++;

      final int freq = (int) termsEnum.totalTermFreq();

      startTerm(termsEnum.term(), freq);

      if (hasPositions || hasOffsets) {
        docsAndPositionsEnum = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
        assert docsAndPositionsEnum != null;

        final int docID = docsAndPositionsEnum.nextDoc();
        assert docID != DocIdSetIterator.NO_MORE_DOCS;
        assert docsAndPositionsEnum.freq() == freq;

        for (int posUpto = 0; posUpto < freq; posUpto++) {
          final int pos = docsAndPositionsEnum.nextPosition();
          final int startOffset = docsAndPositionsEnum.startOffset();
          final int endOffset = docsAndPositionsEnum.endOffset();

          final BytesRef payload = docsAndPositionsEnum.getPayload();

          assert !hasPositions || pos >= 0;
          addPosition(pos, startOffset, endOffset, payload);
        }
      }
      finishTerm();
    }
    assert termCount == numTerms;
    finishField();
  }
  assert fieldCount == numFields;
  finishDocument();
}
Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.
The class SimpleTextFieldsWriter, method write.
public void write(FieldInfos fieldInfos, Fields fields) throws IOException {
  // for each field
  for (String field : fields) {
    Terms terms = fields.terms(field);
    if (terms == null) {
      // Annoyingly, this can happen!
      continue;
    }
    FieldInfo fieldInfo = fieldInfos.fieldInfo(field);

    boolean wroteField = false;

    boolean hasPositions = terms.hasPositions();
    boolean hasFreqs = terms.hasFreqs();
    boolean hasPayloads = fieldInfo.hasPayloads();
    boolean hasOffsets = terms.hasOffsets();

    int flags = 0;
    if (hasPositions) {
      flags = PostingsEnum.POSITIONS;
      if (hasPayloads) {
        flags = flags | PostingsEnum.PAYLOADS;
      }
      if (hasOffsets) {
        flags = flags | PostingsEnum.OFFSETS;
      }
    } else {
      if (hasFreqs) {
        flags = flags | PostingsEnum.FREQS;
      }
    }

    TermsEnum termsEnum = terms.iterator();
    PostingsEnum postingsEnum = null;

    // for each term in field
    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }

      postingsEnum = termsEnum.postings(postingsEnum, flags);

      assert postingsEnum != null : "termsEnum=" + termsEnum + " hasPos=" + hasPositions + " flags=" + flags;

      boolean wroteTerm = false;

      // for each doc in field+term
      while (true) {
        int doc = postingsEnum.nextDoc();
        if (doc == PostingsEnum.NO_MORE_DOCS) {
          break;
        }

        if (!wroteTerm) {
          if (!wroteField) {
            // we lazily do this, in case the field had no terms
            write(FIELD);
            write(field);
            newline();
            wroteField = true;
          }

          // we lazily do this, in case the term had zero docs
          write(TERM);
          write(term);
          newline();
          wroteTerm = true;
        }

        write(DOC);
        write(Integer.toString(doc));
        newline();

        if (hasFreqs) {
          int freq = postingsEnum.freq();
          write(FREQ);
          write(Integer.toString(freq));
          newline();

          if (hasPositions) {
            // for assert:
            int lastStartOffset = 0;

            // for each pos in field+term+doc
            for (int i = 0; i < freq; i++) {
              int position = postingsEnum.nextPosition();

              write(POS);
              write(Integer.toString(position));
              newline();

              if (hasOffsets) {
                int startOffset = postingsEnum.startOffset();
                int endOffset = postingsEnum.endOffset();

                assert endOffset >= startOffset;
                assert startOffset >= lastStartOffset : "startOffset=" + startOffset + " lastStartOffset=" + lastStartOffset;
                lastStartOffset = startOffset;

                write(START_OFFSET);
                write(Integer.toString(startOffset));
                newline();

                write(END_OFFSET);
                write(Integer.toString(endOffset));
                newline();
              }

              BytesRef payload = postingsEnum.getPayload();

              if (payload != null && payload.length > 0) {
                assert payload.length != 0;
                write(PAYLOAD);
                write(payload);
                newline();
              }
            }
          }
        }
      }
    }
  }
}
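The writer above leans on the standard PostingsEnum iteration contract: positions must be pulled exactly freq() times per document, and offsets and payloads are only valid after nextPosition(). A minimal, self-contained consumer of that contract is sketched below; the method name is illustrative, and it assumes a PostingsEnum that was opened with POSITIONS, OFFSETS and PAYLOADS flags.

// Sketch: the traversal pattern SimpleTextFieldsWriter relies on (illustrative helper).
static void dumpPostings(PostingsEnum postings) throws IOException {
  for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
    int freq = postings.freq();
    System.out.println("doc=" + doc + " freq=" + freq);
    for (int i = 0; i < freq; i++) {
      int pos = postings.nextPosition();
      int start = postings.startOffset();       // -1 if offsets were not indexed
      int end = postings.endOffset();           // -1 if offsets were not indexed
      BytesRef payload = postings.getPayload(); // null if no payload at this position
      System.out.println("  pos=" + pos + " offsets=" + start + "-" + end
          + (payload == null ? "" : " payload=" + payload));
    }
  }
}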
Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.
The class SimpleTextStoredFieldsReader, method visitDocument.
@Override
public void visitDocument(int n, StoredFieldVisitor visitor) throws IOException {
  in.seek(offsets[n]);

  while (true) {
    readLine();
    if (StringHelper.startsWith(scratch.get(), FIELD) == false) {
      break;
    }
    int fieldNumber = parseIntAt(FIELD.length);
    FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
    readLine();
    assert StringHelper.startsWith(scratch.get(), NAME);
    readLine();
    assert StringHelper.startsWith(scratch.get(), TYPE);

    final BytesRef type;
    if (equalsAt(TYPE_STRING, scratch.get(), TYPE.length)) {
      type = TYPE_STRING;
    } else if (equalsAt(TYPE_BINARY, scratch.get(), TYPE.length)) {
      type = TYPE_BINARY;
    } else if (equalsAt(TYPE_INT, scratch.get(), TYPE.length)) {
      type = TYPE_INT;
    } else if (equalsAt(TYPE_LONG, scratch.get(), TYPE.length)) {
      type = TYPE_LONG;
    } else if (equalsAt(TYPE_FLOAT, scratch.get(), TYPE.length)) {
      type = TYPE_FLOAT;
    } else if (equalsAt(TYPE_DOUBLE, scratch.get(), TYPE.length)) {
      type = TYPE_DOUBLE;
    } else {
      throw new RuntimeException("unknown field type");
    }

    switch (visitor.needsField(fieldInfo)) {
      case YES:
        readField(type, fieldInfo, visitor);
        break;
      case NO:
        readLine();
        assert StringHelper.startsWith(scratch.get(), VALUE);
        break;
      case STOP:
        return;
    }
  }
}
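visitDocument drives a StoredFieldVisitor callback: the FieldInfo passed to needsField decides whether the field's value is decoded (YES), skipped (NO), or whether reading stops for this document entirely (STOP). Below is a minimal sketch of such a visitor that loads a single string field and stops early; the class and field names are illustrative, and it assumes the byte[] variant of stringField used by this Lucene version (plus java.nio.charset.StandardCharsets for decoding).

// Sketch: a visitor of the kind visitDocument calls into (illustrative, not upstream code).
final class SingleFieldVisitor extends StoredFieldVisitor {
  private final String wanted;
  private String value;

  SingleFieldVisitor(String wanted) {
    this.wanted = wanted;
  }

  @Override
  public Status needsField(FieldInfo fieldInfo) {
    if (fieldInfo.name.equals(wanted)) {
      return Status.YES;                            // readField(...) will deliver this field's value
    }
    return value == null ? Status.NO : Status.STOP; // stop once the wanted field has been seen
  }

  @Override
  public void stringField(FieldInfo fieldInfo, byte[] bytes) {
    value = new String(bytes, StandardCharsets.UTF_8);
  }

  String value() {
    return value;
  }
}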
Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.
The class PointsWriter, method mergeOneField.
/** Default naive merge implementation for one field: it just re-indexes all the values
 *  from the incoming segment. The default codec overrides this for 1D fields and uses
 *  a faster but more complex implementation. */
protected void mergeOneField(MergeState mergeState, FieldInfo fieldInfo) throws IOException {
  long maxPointCount = 0;
  int docCount = 0;
  for (int i = 0; i < mergeState.pointsReaders.length; i++) {
    PointsReader pointsReader = mergeState.pointsReaders[i];
    if (pointsReader != null) {
      FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name);
      if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
        PointValues values = pointsReader.getValues(fieldInfo.name);
        if (values != null) {
          maxPointCount += values.size();
          docCount += values.getDocCount();
        }
      }
    }
  }
  final long finalMaxPointCount = maxPointCount;
  final int finalDocCount = docCount;

  writeField(fieldInfo, new PointsReader() {
    @Override
    public long ramBytesUsed() {
      return 0;
    }

    @Override
    public void close() throws IOException {
    }

    @Override
    public PointValues getValues(String fieldName) {
      if (fieldName.equals(fieldInfo.name) == false) {
        throw new IllegalArgumentException("field name must match the field being merged");
      }

      return new PointValues() {

        @Override
        public void intersect(IntersectVisitor mergedVisitor) throws IOException {
          for (int i = 0; i < mergeState.pointsReaders.length; i++) {
            PointsReader pointsReader = mergeState.pointsReaders[i];
            if (pointsReader == null) {
              // This segment has no points
              continue;
            }
            FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldName);
            if (readerFieldInfo == null) {
              // This segment never saw this field
              continue;
            }

            if (readerFieldInfo.getPointDimensionCount() == 0) {
              // This segment saw this field, but the field did not index points in it:
              continue;
            }

            PointValues values = pointsReader.getValues(fieldName);
            if (values == null) {
              continue;
            }
            MergeState.DocMap docMap = mergeState.docMaps[i];
            values.intersect(new IntersectVisitor() {
              @Override
              public void visit(int docID) {
                // Should never be called because our compare method never returns Relation.CELL_INSIDE_QUERY
                throw new IllegalStateException();
              }

              @Override
              public void visit(int docID, byte[] packedValue) throws IOException {
                int newDocID = docMap.get(docID);
                if (newDocID != -1) {
                  // Not deleted:
                  mergedVisitor.visit(newDocID, packedValue);
                }
              }

              @Override
              public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
                // Forces this segment's PointsReader to always visit all docs + values:
                return Relation.CELL_CROSSES_QUERY;
              }
            });
          }
        }

        @Override
        public long estimatePointCount(IntersectVisitor visitor) {
          throw new UnsupportedOperationException();
        }

        @Override
        public byte[] getMinPackedValue() {
          throw new UnsupportedOperationException();
        }

        @Override
        public byte[] getMaxPackedValue() {
          throw new UnsupportedOperationException();
        }

        @Override
        public int getNumDimensions() {
          throw new UnsupportedOperationException();
        }

        @Override
        public int getBytesPerDimension() {
          throw new UnsupportedOperationException();
        }

        @Override
        public long size() {
          return finalMaxPointCount;
        }

        @Override
        public int getDocCount() {
          return finalDocCount;
        }
      };
    }

    @Override
    public void checkIntegrity() throws IOException {
      throw new UnsupportedOperationException();
    }
  });
}
Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.
The class DocValuesConsumer, method mergeSortedField.
/**
 * Merges the sorted docvalues from <code>toMerge</code>.
 * <p>
 * The default implementation calls {@link #addSortedField}, passing
 * an Iterable that merges ordinals and values and filters deleted documents.
 */
public void mergeSortedField(FieldInfo fieldInfo, final MergeState mergeState) throws IOException {
  List<SortedDocValues> toMerge = new ArrayList<>();
  for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
    SortedDocValues values = null;
    DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
    if (docValuesProducer != null) {
      FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name);
      if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) {
        values = docValuesProducer.getSorted(fieldInfo);
      }
    }
    if (values == null) {
      values = DocValues.emptySorted();
    }
    toMerge.add(values);
  }

  final int numReaders = toMerge.size();
  final SortedDocValues[] dvs = toMerge.toArray(new SortedDocValues[numReaders]);

  // step 1: iterate thru each sub and mark terms still in use
  TermsEnum[] liveTerms = new TermsEnum[dvs.length];
  long[] weights = new long[liveTerms.length];
  for (int sub = 0; sub < numReaders; sub++) {
    SortedDocValues dv = dvs[sub];
    Bits liveDocs = mergeState.liveDocs[sub];
    if (liveDocs == null) {
      liveTerms[sub] = dv.termsEnum();
      weights[sub] = dv.getValueCount();
    } else {
      LongBitSet bitset = new LongBitSet(dv.getValueCount());
      int docID;
      while ((docID = dv.nextDoc()) != NO_MORE_DOCS) {
        if (liveDocs.get(docID)) {
          int ord = dv.ordValue();
          if (ord >= 0) {
            bitset.set(ord);
          }
        }
      }
      liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
      weights[sub] = bitset.cardinality();
    }
  }

  // step 2: create ordinal map (this conceptually does the "merging")
  final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT);

  // step 3: add field
  addSortedField(fieldInfo, new EmptyDocValuesProducer() {
    @Override
    public SortedDocValues getSorted(FieldInfo fieldInfoIn) throws IOException {
      if (fieldInfoIn != fieldInfo) {
        throw new IllegalArgumentException("wrong FieldInfo");
      }

      // We must make new iterators + DocIDMerger for each iterator:
      List<SortedDocValuesSub> subs = new ArrayList<>();
      long cost = 0;
      for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
        SortedDocValues values = null;
        DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
        if (docValuesProducer != null) {
          FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name);
          if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) {
            values = docValuesProducer.getSorted(readerFieldInfo);
          }
        }
        if (values == null) {
          values = DocValues.emptySorted();
        }
        cost += values.cost();
        subs.add(new SortedDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i)));
      }

      final long finalCost = cost;

      final DocIDMerger<SortedDocValuesSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);

      return new SortedDocValues() {
        private int docID = -1;
        private int ord;

        @Override
        public int docID() {
          return docID;
        }

        @Override
        public int nextDoc() throws IOException {
          SortedDocValuesSub sub = docIDMerger.next();
          if (sub == null) {
            return docID = NO_MORE_DOCS;
          }
          int subOrd = sub.values.ordValue();
          assert subOrd != -1;
          ord = (int) sub.map.get(subOrd);
          docID = sub.mappedDocID;
          return docID;
        }

        @Override
        public int ordValue() {
          return ord;
        }

        @Override
        public int advance(int target) {
          throw new UnsupportedOperationException();
        }

        @Override
        public boolean advanceExact(int target) throws IOException {
          throw new UnsupportedOperationException();
        }

        @Override
        public long cost() {
          return finalCost;
        }

        @Override
        public int getValueCount() {
          return (int) map.getValueCount();
        }

        @Override
        public BytesRef lookupOrd(int ord) throws IOException {
          int segmentNumber = map.getFirstSegmentNumber(ord);
          int segmentOrd = (int) map.getFirstSegmentOrd(ord);
          return dvs[segmentNumber].lookupOrd(segmentOrd);
        }
      };
    }
  });
}
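mergeSortedField is one branch of the per-field dispatch in DocValuesConsumer's default merge, which routes each field to the merge method matching its DocValuesType. A simplified sketch of that dispatch is shown below, assuming the merged FieldInfos are available through mergeState.mergeFieldInfos; it omits the integrity checks and the remaining doc-values types handled by the real implementation.

// Sketch: how a merge loop ends up calling mergeSortedField (illustrative, not upstream code).
public void merge(MergeState mergeState) throws IOException {
  for (FieldInfo mergeFieldInfo : mergeState.mergeFieldInfos) {
    DocValuesType type = mergeFieldInfo.getDocValuesType();
    if (type == DocValuesType.SORTED) {
      mergeSortedField(mergeFieldInfo, mergeState);
    } else if (type == DocValuesType.NUMERIC) {
      mergeNumericField(mergeFieldInfo, mergeState);
    }
    // BINARY, SORTED_NUMERIC and SORTED_SET fields are dispatched the same way
    // to their respective merge* methods.
  }
}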