Use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.
The class OrdinalsBuilder, method buildFromTerms.
/**
 * This method iterates all terms in the given {@link TermsEnum} and
 * associates each term's ordinal with the term's documents. The caller must
 * exhaust the returned {@link BytesRefIterator}, which returns all values;
 * the first returned value is associated with the ordinal <tt>1</tt>, the
 * next with <tt>2</tt>, and so on.
 * <p>
 * If the {@link TermsEnum} contains prefix-coded numerical values, the terms
 * enum should be wrapped with either {@link #wrapNumeric32Bit(TermsEnum)}
 * or {@link #wrapNumeric64Bit(TermsEnum)} depending on its precision. If
 * the {@link TermsEnum} is not wrapped, the returned
 * {@link BytesRefIterator} will contain partial-precision terms rather than
 * only full-precision terms.
* </p>
*/
public BytesRefIterator buildFromTerms(final TermsEnum termsEnum) throws IOException {
    return new BytesRefIterator() {
        private PostingsEnum docsEnum = null;

        @Override
        public BytesRef next() throws IOException {
            BytesRef ref;
            if ((ref = termsEnum.next()) != null) {
                docsEnum = termsEnum.postings(docsEnum, PostingsEnum.NONE);
                nextOrdinal();
                int docId;
                while ((docId = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    addDoc(docId);
                }
            }
            return ref;
        }
    };
}
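The caller-side contract described in the Javadoc might look like this in practice (a minimal, hypothetical sketch: `builder` is an OrdinalsBuilder created elsewhere, and the numeric wrapping applies only when the field holds prefix-coded numeric terms):

// Hypothetical usage sketch: the iterator must be fully exhausted so that
// every term's ordinal and its documents are recorded by the builder.
// `builder` and `termsEnum` are assumed to exist in the surrounding code.
BytesRefIterator values = builder.buildFromTerms(OrdinalsBuilder.wrapNumeric64Bit(termsEnum));
BytesRef term;
while ((term = values.next()) != null) {
    // the first value returned corresponds to ordinal 1, the next to 2, and so on
}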
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class TermVectorsWriter, method addAllDocVectors.
/** Safe (but, slowish) default method to write every
* vector field in the document. */
protected final void addAllDocVectors(Fields vectors, MergeState mergeState) throws IOException {
  if (vectors == null) {
    startDocument(0);
    finishDocument();
    return;
  }

  int numFields = vectors.size();
  if (numFields == -1) {
    // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
    numFields = 0;
    for (final Iterator<String> it = vectors.iterator(); it.hasNext(); ) {
      it.next();
      numFields++;
    }
  }
  startDocument(numFields);

  String lastFieldName = null;

  TermsEnum termsEnum = null;
  PostingsEnum docsAndPositionsEnum = null;

  int fieldCount = 0;
  for (String fieldName : vectors) {
    fieldCount++;
    final FieldInfo fieldInfo = mergeState.mergeFieldInfos.fieldInfo(fieldName);

    assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0 : "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
    lastFieldName = fieldName;

    final Terms terms = vectors.terms(fieldName);
    if (terms == null) {
      // FieldsEnum shouldn't lie...
      continue;
    }

    final boolean hasPositions = terms.hasPositions();
    final boolean hasOffsets = terms.hasOffsets();
    final boolean hasPayloads = terms.hasPayloads();
    assert !hasPayloads || hasPositions;

    int numTerms = (int) terms.size();
    if (numTerms == -1) {
      // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
      numTerms = 0;
      termsEnum = terms.iterator();
      while (termsEnum.next() != null) {
        numTerms++;
      }
    }

    startField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
    termsEnum = terms.iterator();

    int termCount = 0;
    while (termsEnum.next() != null) {
      termCount++;

      final int freq = (int) termsEnum.totalTermFreq();

      startTerm(termsEnum.term(), freq);

      if (hasPositions || hasOffsets) {
        docsAndPositionsEnum = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
        assert docsAndPositionsEnum != null;

        final int docID = docsAndPositionsEnum.nextDoc();
        assert docID != DocIdSetIterator.NO_MORE_DOCS;
        assert docsAndPositionsEnum.freq() == freq;

        for (int posUpto = 0; posUpto < freq; posUpto++) {
          final int pos = docsAndPositionsEnum.nextPosition();
          final int startOffset = docsAndPositionsEnum.startOffset();
          final int endOffset = docsAndPositionsEnum.endOffset();

          final BytesRef payload = docsAndPositionsEnum.getPayload();

          assert !hasPositions || pos >= 0;
          addPosition(pos, startOffset, endOffset, payload);
        }
      }
      finishTerm();
    }
    assert termCount == numTerms;
    finishField();
  }
  assert fieldCount == numFields;
  finishDocument();
}
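For context, this method is typically driven by the codec's merge path. A hedged sketch, close to the default TermVectorsWriter.merge() loop (details such as integrity checks are omitted here): every live document's vectors are fetched and re-encoded through addAllDocVectors above.

// Hedged sketch of the merge loop that calls addAllDocVectors. The MergeState
// fields used here (maxDocs, liveDocs, termVectorsReaders, mergeFieldInfos)
// are from the Lucene MergeState of this era.
public int merge(MergeState mergeState) throws IOException {
  int docCount = 0;
  for (int i = 0; i < mergeState.maxDocs.length; i++) {
    TermVectorsReader vectorsReader = mergeState.termVectorsReaders[i];
    Bits liveDocs = mergeState.liveDocs[i];
    for (int docID = 0; docID < mergeState.maxDocs[i]; docID++) {
      if (liveDocs != null && !liveDocs.get(docID)) {
        continue; // skip deleted documents
      }
      // a reader may be absent, or a document may simply have no vectors
      Fields vectors = vectorsReader == null ? null : vectorsReader.get(docID);
      addAllDocVectors(vectors, mergeState);
      docCount++;
    }
  }
  finish(mergeState.mergeFieldInfos, docCount);
  return docCount;
}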
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class SimpleTextFieldsWriter, method write.
public void write(FieldInfos fieldInfos, Fields fields) throws IOException {
  // for each field
  for (String field : fields) {
    Terms terms = fields.terms(field);
    if (terms == null) {
      // Annoyingly, this can happen!
      continue;
    }
    FieldInfo fieldInfo = fieldInfos.fieldInfo(field);

    boolean wroteField = false;

    boolean hasPositions = terms.hasPositions();
    boolean hasFreqs = terms.hasFreqs();
    boolean hasPayloads = fieldInfo.hasPayloads();
    boolean hasOffsets = terms.hasOffsets();

    int flags = 0;
    if (hasPositions) {
      flags = PostingsEnum.POSITIONS;
      if (hasPayloads) {
        flags = flags | PostingsEnum.PAYLOADS;
      }
      if (hasOffsets) {
        flags = flags | PostingsEnum.OFFSETS;
      }
    } else {
      if (hasFreqs) {
        flags = flags | PostingsEnum.FREQS;
      }
    }

    TermsEnum termsEnum = terms.iterator();
    PostingsEnum postingsEnum = null;

    // for each term in field
    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }

      postingsEnum = termsEnum.postings(postingsEnum, flags);

      assert postingsEnum != null : "termsEnum=" + termsEnum + " hasPos=" + hasPositions + " flags=" + flags;

      boolean wroteTerm = false;

      // for each doc in field+term
      while (true) {
        int doc = postingsEnum.nextDoc();
        if (doc == PostingsEnum.NO_MORE_DOCS) {
          break;
        }

        if (!wroteTerm) {
          if (!wroteField) {
            // we lazily do this, in case the field had no terms
            write(FIELD);
            write(field);
            newline();
            wroteField = true;
          }

          // we lazily do this, in case the term had zero docs
          write(TERM);
          write(term);
          newline();
          wroteTerm = true;
        }

        write(DOC);
        write(Integer.toString(doc));
        newline();
        if (hasFreqs) {
          int freq = postingsEnum.freq();
          write(FREQ);
          write(Integer.toString(freq));
          newline();

          if (hasPositions) {
            // for assert:
            int lastStartOffset = 0;

            // for each pos in field+term+doc
            for (int i = 0; i < freq; i++) {
              int position = postingsEnum.nextPosition();

              write(POS);
              write(Integer.toString(position));
              newline();

              if (hasOffsets) {
                int startOffset = postingsEnum.startOffset();
                int endOffset = postingsEnum.endOffset();

                assert endOffset >= startOffset;
                assert startOffset >= lastStartOffset : "startOffset=" + startOffset + " lastStartOffset=" + lastStartOffset;
                lastStartOffset = startOffset;

                write(START_OFFSET);
                write(Integer.toString(startOffset));
                newline();

                write(END_OFFSET);
                write(Integer.toString(endOffset));
                newline();
              }

              BytesRef payload = postingsEnum.getPayload();

              if (payload != null && payload.length > 0) {
                assert payload.length != 0;
                write(PAYLOAD);
                write(payload);
                newline();
              }
            }
          }
        }
      }
    }
  }
}
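The flag computation at the top of the loop is the reusable part of this pattern. A hypothetical standalone helper (not part of SimpleTextFieldsWriter) computing the same mask:

// Hypothetical helper mirroring the flag logic above: request only the data
// the field was indexed with. POSITIONS already implies FREQS in the flag
// encoding, so FREQS is only requested explicitly in the positions-less case.
static int postingsFlags(Terms terms, FieldInfo fieldInfo) throws IOException {
  int flags = 0;
  if (terms.hasPositions()) {
    flags = PostingsEnum.POSITIONS;
    if (fieldInfo.hasPayloads()) {
      flags |= PostingsEnum.PAYLOADS;
    }
    if (terms.hasOffsets()) {
      flags |= PostingsEnum.OFFSETS;
    }
  } else if (terms.hasFreqs()) {
    flags |= PostingsEnum.FREQS;
  }
  return flags;
}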
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class FieldOffsetStrategy, method createOffsetsEnumsFromReader.
protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
  final Terms termsIndex = leafReader.terms(field);
  if (termsIndex == null) {
    return Collections.emptyList();
  }

  // For strict positions, get a Map of term to Spans:
  //   note: ScriptPhraseHelper.NONE does the right thing for these method calls
  final Map<BytesRef, Spans> strictPhrasesTermToSpans = phraseHelper.getTermToSpans(leafReader, doc);
  // Usually simply wraps terms in a List; but if willRewrite() then can be expanded
  final List<BytesRef> sourceTerms = phraseHelper.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);

  final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + automata.length);

  // Handle sourceTerms:
  if (!sourceTerms.isEmpty()) {
    TermsEnum termsEnum = termsIndex.iterator(); // does not return null
    for (BytesRef term : sourceTerms) {
      if (termsEnum.seekExact(term)) {
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
        if (postingsEnum == null) {
          // no offsets or positions available
          throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
        }
        if (doc == postingsEnum.advance(doc)) {
          // now it's positioned, although may be exhausted
          postingsEnum = phraseHelper.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
          if (postingsEnum != null) {
            offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
          }
        }
      }
    }
  }

  // Handle automata
  if (automata.length > 0) {
    offsetsEnums.addAll(createAutomataOffsetsFromTerms(termsIndex, doc));
  }

  return offsetsEnums;
}
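Stripped of the phrase-helper plumbing, the lookup at the heart of this method is the standard seek-then-advance idiom (a hedged sketch; `leafReader`, `field`, `term`, and `doc` stand in for the surrounding state):

// Hedged sketch of the core lookup: seek the term, open postings that carry
// offsets, then advance to the one document being highlighted.
TermsEnum te = leafReader.terms(field).iterator();
if (te.seekExact(term)) {
  PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
  if (pe.advance(doc) == doc) {      // the enum is now positioned on `doc`
    for (int i = 0; i < pe.freq(); i++) {
      int pos = pe.nextPosition();
      int start = pe.startOffset();  // -1 when offsets were not indexed
      int end = pe.endOffset();
    }
  }
}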
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class TokenStreamOffsetStrategy, method getOffsetsEnums.
@Override
public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
  TokenStream tokenStream = tokenStream(content);
  PostingsEnum mtqPostingsEnum = new TokenStreamPostingsEnum(tokenStream, automata);
  mtqPostingsEnum.advance(docId);
  return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
}
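After advance(docId), the wrapper can be consumed through the ordinary PostingsEnum surface. A hedged, generic consumption sketch (whether freq() is exact for this TokenStream-backed enum is an assumption here):

// Hedged, generic sketch for a PostingsEnum already positioned on docId;
// offsets index directly into the analyzed `content` string.
if (mtqPostingsEnum.docID() == docId) {
  int freq = mtqPostingsEnum.freq();
  for (int i = 0; i < freq; i++) {
    int position = mtqPostingsEnum.nextPosition();
    int startOffset = mtqPostingsEnum.startOffset();
    int endOffset = mtqPostingsEnum.endOffset();
    // [startOffset, endOffset) is the slice of `content` that matched
  }
}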