Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From the class TestJoinUtil, method createExpectedResult:
private BitSet createExpectedResult(String queryValue, boolean from, IndexReader topLevelReader, IndexIterationContext context) throws IOException {
  final Map<String, List<RandomDoc>> randomValueDocs;
  final Map<String, List<RandomDoc>> linkValueDocuments;
  if (from) {
    randomValueDocs = context.randomValueFromDocs;
    linkValueDocuments = context.toDocuments;
  } else {
    randomValueDocs = context.randomValueToDocs;
    linkValueDocuments = context.fromDocuments;
  }
  BitSet expectedResult = new FixedBitSet(topLevelReader.maxDoc());
  List<RandomDoc> matchingDocs = randomValueDocs.get(queryValue);
  if (matchingDocs == null) {
    return new FixedBitSet(topLevelReader.maxDoc());
  }
  for (RandomDoc matchingDoc : matchingDocs) {
    for (String linkValue : matchingDoc.linkValues) {
      List<RandomDoc> otherMatchingDocs = linkValueDocuments.get(linkValue);
      if (otherMatchingDocs == null) {
        continue;
      }
      for (RandomDoc otherSideDoc : otherMatchingDocs) {
        PostingsEnum postingsEnum = MultiFields.getTermDocsEnum(topLevelReader, "id", new BytesRef(otherSideDoc.id), 0);
        assert postingsEnum != null;
        int doc = postingsEnum.nextDoc();
        expectedResult.set(doc);
      }
    }
  }
  return expectedResult;
}
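The inner lookup above is the core PostingsEnum usage here: for a unique "id" term, MultiFields.getTermDocsEnum returns a PostingsEnum spanning the whole composite reader, and a single nextDoc() call yields the top-level docID. A minimal standalone sketch of that pattern follows; the lookupDocId name and the assumption that each id value occurs in exactly one document are illustrative, not part of the test:

// Sketch: resolve a unique "id" term to its top-level docID, or -1 if absent.
static int lookupDocId(IndexReader reader, String idValue) throws IOException {
  PostingsEnum postings = MultiFields.getTermDocsEnum(reader, "id", new BytesRef(idValue), PostingsEnum.NONE);
  if (postings == null) {
    return -1; // field or term not present in the index
  }
  int doc = postings.nextDoc();
  return doc == DocIdSetIterator.NO_MORE_DOCS ? -1 : doc;
}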
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From the class TestPositionIncrement, method testPayloadsPos0:
public void testPayloadsPos0() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockPayloadAnalyzer());
  Document doc = new Document();
  doc.add(new TextField("content", new StringReader("a a b c d e a f g h i j a b k k")));
  writer.addDocument(doc);
  final IndexReader readerFromWriter = writer.getReader();
  LeafReader r = getOnlyLeafReader(readerFromWriter);
  PostingsEnum tp = r.postings(new Term("content", "a"), PostingsEnum.ALL);
  int count = 0;
  assertTrue(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  // "a" occurs 4 times
  assertEquals(4, tp.freq());
  assertEquals(0, tp.nextPosition());
  assertEquals(1, tp.nextPosition());
  assertEquals(3, tp.nextPosition());
  assertEquals(6, tp.nextPosition());
  // only one doc has "a"
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, tp.nextDoc());
  IndexSearcher is = newSearcher(getOnlyLeafReader(readerFromWriter));
  SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
  SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
  SpanQuery[] sqs = { stq1, stq2 };
  SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);
  count = 0;
  boolean sawZero = false;
  if (VERBOSE) {
    System.out.println("\ngetPayloadSpans test");
  }
  PayloadSpanCollector collector = new PayloadSpanCollector();
  Spans pspans = snq.createWeight(is, false, 1f).getSpans(is.getIndexReader().leaves().get(0), SpanWeight.Postings.PAYLOADS);
  while (pspans.nextDoc() != Spans.NO_MORE_DOCS) {
    while (pspans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
      if (VERBOSE) {
        System.out.println("doc " + pspans.docID() + ": span " + pspans.startPosition() + " to " + pspans.endPosition());
      }
      collector.reset();
      pspans.collect(collector);
      sawZero |= pspans.startPosition() == 0;
      for (BytesRef payload : collector.payloads) {
        count++;
        if (VERBOSE) {
          System.out.println(" payload: " + Term.toString(payload));
        }
      }
    }
  }
  assertTrue(sawZero);
  assertEquals(8, count);
  // System.out.println("\ngetSpans test");
  Spans spans = snq.createWeight(is, false, 1f).getSpans(is.getIndexReader().leaves().get(0), SpanWeight.Postings.POSITIONS);
  count = 0;
  sawZero = false;
  while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
    while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
      count++;
      sawZero |= spans.startPosition() == 0;
      // System.out.println(spans.doc() + " - " + spans.start() + " - " +
      // spans.end());
    }
  }
  assertEquals(4, count);
  assertTrue(sawZero);
  writer.close();
  is.getIndexReader().close();
  dir.close();
}
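The span-based PayloadSpanCollector above is one route to the payloads; PostingsEnum also exposes them directly when the postings are requested with the PAYLOADS flag. A hedged sketch of that direct route, reusing the LeafReader r from the test (it would have to run before the reader is closed) and assuming the field carries payloads, here produced by MockPayloadAnalyzer:

// Sketch: read payloads directly from the inverted index instead of via spans.
PostingsEnum postings = r.postings(new Term("content", "a"), PostingsEnum.PAYLOADS);
if (postings != null && postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
  for (int i = 0; i < postings.freq(); i++) {
    int position = postings.nextPosition();   // advance the position before reading its payload
    BytesRef payload = postings.getPayload(); // null when no payload is stored at this position
    System.out.println("pos=" + position + ", payload=" + (payload == null ? "none" : payload.utf8ToString()));
  }
}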
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From the class TokenStreamFromTermVector, method init:
//We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
  assert !initialized;
  short dpEnumFlags = PostingsEnum.POSITIONS;
  if (vector.hasOffsets()) {
    dpEnumFlags |= PostingsEnum.OFFSETS;
    offsetAttribute = addAttribute(OffsetAttribute.class);
  }
  if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
    //must ask for offsets too
    dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
    payloadAttribute = getAttribute(PayloadAttribute.class);
    payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
    spareBytesRefBuilder = new BytesRefBuilder();
  }
  // We put term data here
  termCharsBuilder = new CharsRefBuilder();
  //7 is over-estimate of average term len
  termCharsBuilder.grow((int) (vector.size() * 7));
  // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
  TokenLL[] positionedTokens = initTokensArray();
  int lastPosition = -1;
  final TermsEnum termsEnum = vector.iterator();
  BytesRef termBytesRef;
  PostingsEnum dpEnum = null;
  //only for UTF8->UTF16 call
  CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();
  //int sumFreq = 0;
  while ((termBytesRef = termsEnum.next()) != null) {
    //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
    // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
    tempCharsRefBuilder.grow(termBytesRef.length);
    final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
    final int termCharsOff = termCharsBuilder.length();
    termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
    dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
    // presumably checked by TokenSources.hasPositions earlier
    assert dpEnum != null;
    dpEnum.nextDoc();
    final int freq = dpEnum.freq();
    //sumFreq += freq;
    for (int j = 0; j < freq; j++) {
      int pos = dpEnum.nextPosition();
      TokenLL token = new TokenLL();
      token.termCharsOff = termCharsOff;
      token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
      if (offsetAttribute != null) {
        token.startOffset = dpEnum.startOffset();
        if (token.startOffset > maxStartOffset) {
          //filter this token out; exceeds threshold
          continue;
        }
        token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
        if (pos == -1) {
          //divide by 8
          pos = token.startOffset >> 3;
        }
      }
      if (payloadAttribute != null) {
        final BytesRef payload = dpEnum.getPayload();
        token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
      }
      //Add token to an array indexed by position
      if (positionedTokens.length <= pos) {
        //grow, but not 2x since we think our original length estimate is close
        TokenLL[] newPositionedTokens = new TokenLL[(int) ((pos + 1) * 1.5f)];
        System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
        positionedTokens = newPositionedTokens;
      }
      positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);
      lastPosition = Math.max(lastPosition, pos);
    }
  }
  // System.out.println(String.format(
  //     "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
  //     sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
  //     (originalPositionEstimate/(lastPosition + 1.0f))));
  // Step 2: Link all Tokens into a linked-list and set position increments as we go
  int prevTokenPos = -1;
  TokenLL prevToken = null;
  for (int pos = 0; pos <= lastPosition; pos++) {
    TokenLL token = positionedTokens[pos];
    if (token == null) {
      continue;
    }
    //link
    if (prevToken != null) {
      assert prevToken.next == null;
      //concatenate linked-list
      prevToken.next = token;
    } else {
      assert firstToken == null;
      firstToken = token;
    }
    //set increments
    if (vector.hasPositions()) {
      token.positionIncrement = pos - prevTokenPos;
      while (token.next != null) {
        token = token.next;
        token.positionIncrement = 0;
      }
    } else {
      token.positionIncrement = 1;
      while (token.next != null) {
        prevToken = token;
        token = token.next;
        if (prevToken.startOffset == token.startOffset) {
          token.positionIncrement = 0;
        } else {
          token.positionIncrement = 1;
        }
      }
    }
    prevTokenPos = pos;
    prevToken = token;
  }
  initialized = true;
}
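Stripped of the token bookkeeping, the term-vector traversal in init() reduces to a small, reusable pattern: iterate the vector's TermsEnum, request postings with the flags you need, call nextDoc() once (a term vector describes a single document), then walk freq() positions. Below is a minimal sketch of just that loop, assuming a Terms object obtained from IndexReader.getTermVector(docId, field) for a field whose vector was stored with positions and offsets; variable names here are illustrative:

// Sketch: walk every term of one document's term vector, reading positions,
// offsets and payloads where they were stored.
int flags = PostingsEnum.POSITIONS | PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS;
TermsEnum termsEnum = termVector.iterator();
PostingsEnum postings = null;
BytesRef term;
while ((term = termsEnum.next()) != null) {
  postings = termsEnum.postings(postings, flags);
  postings.nextDoc(); // the enum covers exactly one (virtual) document
  int freq = postings.freq();
  for (int i = 0; i < freq; i++) {
    int position = postings.nextPosition();
    int startOffset = postings.startOffset(); // -1 if offsets were not stored
    int endOffset = postings.endOffset();
    BytesRef payload = postings.getPayload(); // null if no payload at this position
    System.out.println(term.utf8ToString() + " pos=" + position
        + " [" + startOffset + "-" + endOffset + "] payload=" + payload);
  }
}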
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From the class TestClassicAnalyzer, method testWickedLongTerm:
/**
 * Make sure we skip wicked long terms.
 */
public void testWickedLongTerm() throws IOException {
  RAMDirectory dir = new RAMDirectory();
  Analyzer analyzer = new ClassicAnalyzer();
  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer));
  char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
  Arrays.fill(chars, 'x');
  Document doc = new Document();
  final String bigTerm = new String(chars);
  // This produces a too-long term:
  String contents = "abc xyz x" + bigTerm + " another term";
  doc.add(new TextField("content", contents, Field.Store.NO));
  writer.addDocument(doc);
  // Make sure we can add another normal document
  doc = new Document();
  doc.add(new TextField("content", "abc bbb ccc", Field.Store.NO));
  writer.addDocument(doc);
  writer.close();
  IndexReader reader = DirectoryReader.open(dir);
  // Make sure all terms < max size were indexed
  assertEquals(2, reader.docFreq(new Term("content", "abc")));
  assertEquals(1, reader.docFreq(new Term("content", "bbb")));
  assertEquals(1, reader.docFreq(new Term("content", "term")));
  assertEquals(1, reader.docFreq(new Term("content", "another")));
  // Make sure position is still incremented when
  // massive term is skipped:
  PostingsEnum tps = MultiFields.getTermPositionsEnum(reader, "content", new BytesRef("another"));
  assertTrue(tps.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, tps.freq());
  assertEquals(3, tps.nextPosition());
  // Make sure the doc that has the massive term is in
  // the index:
  assertEquals("document with wicked long term is not in the index!", 2, reader.numDocs());
  reader.close();
  // Make sure we can add a document with exactly the
  // maximum length term, and search on that term:
  doc = new Document();
  doc.add(new TextField("content", bigTerm, Field.Store.NO));
  ClassicAnalyzer sa = new ClassicAnalyzer();
  sa.setMaxTokenLength(100000);
  writer = new IndexWriter(dir, new IndexWriterConfig(sa));
  writer.addDocument(doc);
  writer.close();
  reader = DirectoryReader.open(dir);
  assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
  reader.close();
  dir.close();
  analyzer.close();
  sa.close();
}
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From the class BlendedInfixSuggester, method createCoefficient:
/**
 * Create the coefficient to transform the weight.
 *
 * @param searcher searcher used to access the document's term vector
 * @param doc id of the document
 * @param matchedTokens tokens found in the query
 * @param prefixToken unfinished token in the query
 * @return the coefficient
 * @throws IOException If there are problems reading term vectors from the underlying Lucene index.
 */
private double createCoefficient(IndexSearcher searcher, int doc, Set<String> matchedTokens, String prefixToken) throws IOException {
  Terms tv = searcher.getIndexReader().getTermVector(doc, TEXT_FIELD_NAME);
  TermsEnum it = tv.iterator();
  Integer position = Integer.MAX_VALUE;
  BytesRef term;
  // find the closest token position
  while ((term = it.next()) != null) {
    String docTerm = term.utf8ToString();
    if (matchedTokens.contains(docTerm) || (prefixToken != null && docTerm.startsWith(prefixToken))) {
      PostingsEnum docPosEnum = it.postings(null, PostingsEnum.OFFSETS);
      docPosEnum.nextDoc();
      // use the first occurrence of the term
      int p = docPosEnum.nextPosition();
      if (p < position) {
        position = p;
      }
    }
  }
  // create corresponding coefficient based on position
  return calculateCoefficient(position);
}
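The method ends by handing the closest matched position to calculateCoefficient, whose body is not part of this snippet and depends on the suggester's configured blender type. As an illustration only, the sketch below uses a simple reciprocal decay, so a match at position 0 keeps the full weight and later matches are progressively discounted; treat it as an assumption rather than the shipped code:

// Hypothetical sketch only: one plausible shape for a position-based coefficient.
// A hit at position 0 yields 1.0; a hit at position 9 yields 0.1.
private double calculateCoefficient(int position) {
  return 1.0 / (position + 1);
}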