Example 71 with PostingsEnum

Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From class TestJoinUtil, method createExpectedResult.

private BitSet createExpectedResult(String queryValue, boolean from, IndexReader topLevelReader, IndexIterationContext context) throws IOException {
    final Map<String, List<RandomDoc>> randomValueDocs;
    final Map<String, List<RandomDoc>> linkValueDocuments;
    if (from) {
        randomValueDocs = context.randomValueFromDocs;
        linkValueDocuments = context.toDocuments;
    } else {
        randomValueDocs = context.randomValueToDocs;
        linkValueDocuments = context.fromDocuments;
    }
    BitSet expectedResult = new FixedBitSet(topLevelReader.maxDoc());
    List<RandomDoc> matchingDocs = randomValueDocs.get(queryValue);
    if (matchingDocs == null) {
        return new FixedBitSet(topLevelReader.maxDoc());
    }
    for (RandomDoc matchingDoc : matchingDocs) {
        for (String linkValue : matchingDoc.linkValues) {
            List<RandomDoc> otherMatchingDocs = linkValueDocuments.get(linkValue);
            if (otherMatchingDocs == null) {
                continue;
            }
            for (RandomDoc otherSideDoc : otherMatchingDocs) {
                PostingsEnum postingsEnum = MultiFields.getTermDocsEnum(topLevelReader, "id", new BytesRef(otherSideDoc.id), 0);
                assert postingsEnum != null;
                int doc = postingsEnum.nextDoc();
                expectedResult.set(doc);
            }
        }
    }
    return expectedResult;
}
Also used: FixedBitSet (org.apache.lucene.util.FixedBitSet), BitSet (org.apache.lucene.util.BitSet), List (java.util.List), ArrayList (java.util.ArrayList), PostingsEnum (org.apache.lucene.index.PostingsEnum), BytesRef (org.apache.lucene.util.BytesRef), DoublePoint (org.apache.lucene.document.DoublePoint), LongPoint (org.apache.lucene.document.LongPoint), IntPoint (org.apache.lucene.document.IntPoint), FloatPoint (org.apache.lucene.document.FloatPoint)
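
For reference, a minimal standalone sketch of the same primary-key lookup idiom used in the loop above (the class name IdLookup and method lookupDocId are hypothetical, not part of the test; it assumes a Lucene 6.x/7.x index with a single-valued "id" field):

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

final class IdLookup {
    /** Returns the docID of the unique document whose "id" term equals {@code id}, or -1 if absent. */
    static int lookupDocId(IndexReader reader, String id) throws IOException {
        // Flags = PostingsEnum.NONE (0): only doc IDs are needed, no freqs/positions.
        PostingsEnum postings = MultiFields.getTermDocsEnum(reader, "id", new BytesRef(id), PostingsEnum.NONE);
        if (postings == null) {
            return -1; // the term does not exist in the index
        }
        int doc = postings.nextDoc();
        return doc == DocIdSetIterator.NO_MORE_DOCS ? -1 : doc;
    }
}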

Example 72 with PostingsEnum

Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From class TestPositionIncrement, method testPayloadsPos0.

public void testPayloadsPos0() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockPayloadAnalyzer());
    Document doc = new Document();
    doc.add(new TextField("content", new StringReader("a a b c d e a f g h i j a b k k")));
    writer.addDocument(doc);
    final IndexReader readerFromWriter = writer.getReader();
    LeafReader r = getOnlyLeafReader(readerFromWriter);
    PostingsEnum tp = r.postings(new Term("content", "a"), PostingsEnum.ALL);
    int count = 0;
    assertTrue(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    // "a" occurs 4 times
    assertEquals(4, tp.freq());
    assertEquals(0, tp.nextPosition());
    assertEquals(1, tp.nextPosition());
    assertEquals(3, tp.nextPosition());
    assertEquals(6, tp.nextPosition());
    // only one doc has "a"
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, tp.nextDoc());
    IndexSearcher is = newSearcher(getOnlyLeafReader(readerFromWriter));
    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
    SpanQuery[] sqs = { stq1, stq2 };
    SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);
    count = 0;
    boolean sawZero = false;
    if (VERBOSE) {
        System.out.println("\ngetPayloadSpans test");
    }
    PayloadSpanCollector collector = new PayloadSpanCollector();
    Spans pspans = snq.createWeight(is, false, 1f).getSpans(is.getIndexReader().leaves().get(0), SpanWeight.Postings.PAYLOADS);
    while (pspans.nextDoc() != Spans.NO_MORE_DOCS) {
        while (pspans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
            if (VERBOSE) {
                System.out.println("doc " + pspans.docID() + ": span " + pspans.startPosition() + " to " + pspans.endPosition());
            }
            collector.reset();
            pspans.collect(collector);
            sawZero |= pspans.startPosition() == 0;
            for (BytesRef payload : collector.payloads) {
                count++;
                if (VERBOSE) {
                    System.out.println("  payload: " + Term.toString(payload));
                }
            }
        }
    }
    assertTrue(sawZero);
    assertEquals(8, count);
    // System.out.println("\ngetSpans test");
    Spans spans = snq.createWeight(is, false, 1f).getSpans(is.getIndexReader().leaves().get(0), SpanWeight.Postings.POSITIONS);
    count = 0;
    sawZero = false;
    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
            count++;
            sawZero |= spans.startPosition() == 0;
        // System.out.println(spans.doc() + " - " + spans.start() + " - " +
        // spans.end());
        }
    }
    assertEquals(4, count);
    assertTrue(sawZero);
    writer.close();
    is.getIndexReader().close();
    dir.close();
}
Also used: LeafReader (org.apache.lucene.index.LeafReader), Term (org.apache.lucene.index.Term), MockPayloadAnalyzer (org.apache.lucene.analysis.MockPayloadAnalyzer), Document (org.apache.lucene.document.Document), SpanQuery (org.apache.lucene.search.spans.SpanQuery), Spans (org.apache.lucene.search.spans.Spans), SpanTermQuery (org.apache.lucene.search.spans.SpanTermQuery), StringReader (java.io.StringReader), IndexReader (org.apache.lucene.index.IndexReader), TextField (org.apache.lucene.document.TextField), PostingsEnum (org.apache.lucene.index.PostingsEnum), SpanNearQuery (org.apache.lucene.search.spans.SpanNearQuery), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory)
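
The test exercises PostingsEnum.ALL through both raw postings and span queries. A minimal sketch of the raw-postings half, iterating positions and payloads for one term (PayloadWalker and dumpPayloads are hypothetical names; it assumes the field was indexed with positions and payloads, as MockPayloadAnalyzer provides above):

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

final class PayloadWalker {
    /** Prints every (doc, position, payload) triple for {@code term}. */
    static void dumpPayloads(LeafReader reader, Term term) throws IOException {
        PostingsEnum postings = reader.postings(term, PostingsEnum.ALL);
        if (postings == null) {
            return; // term is absent from this segment
        }
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            int freq = postings.freq();
            for (int i = 0; i < freq; i++) {
                int pos = postings.nextPosition();        // must be called exactly freq() times per doc
                BytesRef payload = postings.getPayload(); // may be null at positions without a payload
                System.out.println("doc=" + postings.docID() + " pos=" + pos + " payload=" + payload);
            }
        }
    }
}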

Example 73 with PostingsEnum

Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From class TokenStreamFromTermVector, method init.

//We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
    assert !initialized;
    short dpEnumFlags = PostingsEnum.POSITIONS;
    if (vector.hasOffsets()) {
        dpEnumFlags |= PostingsEnum.OFFSETS;
        offsetAttribute = addAttribute(OffsetAttribute.class);
    }
    if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
        //must ask for offsets too
        dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
        payloadAttribute = getAttribute(PayloadAttribute.class);
        payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
        spareBytesRefBuilder = new BytesRefBuilder();
    }
    // We put term data here
    termCharsBuilder = new CharsRefBuilder();
    //7 is over-estimate of average term len
    termCharsBuilder.grow((int) (vector.size() * 7));
    // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
    TokenLL[] positionedTokens = initTokensArray();
    int lastPosition = -1;
    final TermsEnum termsEnum = vector.iterator();
    BytesRef termBytesRef;
    PostingsEnum dpEnum = null;
    //only for UTF8->UTF16 call
    CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();
    //int sumFreq = 0;
    while ((termBytesRef = termsEnum.next()) != null) {
        //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
        // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
        tempCharsRefBuilder.grow(termBytesRef.length);
        final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
        final int termCharsOff = termCharsBuilder.length();
        termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
        dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
        // presumably checked by TokenSources.hasPositions earlier
        assert dpEnum != null;
        dpEnum.nextDoc();
        final int freq = dpEnum.freq();
        //sumFreq += freq;
        for (int j = 0; j < freq; j++) {
            int pos = dpEnum.nextPosition();
            TokenLL token = new TokenLL();
            token.termCharsOff = termCharsOff;
            token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
            if (offsetAttribute != null) {
                token.startOffset = dpEnum.startOffset();
                if (token.startOffset > maxStartOffset) {
                    //filter this token out; exceeds threshold
                    continue;
                }
                token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
                if (pos == -1) {
                    //divide by 8
                    pos = token.startOffset >> 3;
                }
            }
            if (payloadAttribute != null) {
                final BytesRef payload = dpEnum.getPayload();
                token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
            }
            //Add token to an array indexed by position
            if (positionedTokens.length <= pos) {
                //grow, but not 2x since we think our original length estimate is close
                TokenLL[] newPositionedTokens = new TokenLL[(int) ((pos + 1) * 1.5f)];
                System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
                positionedTokens = newPositionedTokens;
            }
            positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);
            lastPosition = Math.max(lastPosition, pos);
        }
    }
    //    System.out.println(String.format(
    //        "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
    //        sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
    //        (originalPositionEstimate/(lastPosition + 1.0f))));
    // Step 2:  Link all Tokens into a linked-list and set position increments as we go
    int prevTokenPos = -1;
    TokenLL prevToken = null;
    for (int pos = 0; pos <= lastPosition; pos++) {
        TokenLL token = positionedTokens[pos];
        if (token == null) {
            continue;
        }
        //link
        if (prevToken != null) {
            assert prevToken.next == null;
            //concatenate linked-list
            prevToken.next = token;
        } else {
            assert firstToken == null;
            firstToken = token;
        }
        //set increments
        if (vector.hasPositions()) {
            token.positionIncrement = pos - prevTokenPos;
            while (token.next != null) {
                token = token.next;
                token.positionIncrement = 0;
            }
        } else {
            token.positionIncrement = 1;
            while (token.next != null) {
                prevToken = token;
                token = token.next;
                if (prevToken.startOffset == token.startOffset) {
                    token.positionIncrement = 0;
                } else {
                    token.positionIncrement = 1;
                }
            }
        }
        prevTokenPos = pos;
        prevToken = token;
    }
    initialized = true;
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), BytesRefArray (org.apache.lucene.util.BytesRefArray), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), PostingsEnum (org.apache.lucene.index.PostingsEnum), BytesRef (org.apache.lucene.util.BytesRef), TermsEnum (org.apache.lucene.index.TermsEnum)
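
The core loop in init() replays a stored term vector through TermsEnum.postings(...). A minimal sketch of just that replay, without the token-linking machinery (TermVectorDump and dump are hypothetical names; it assumes the field was indexed with term vectors including positions):

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

final class TermVectorDump {
    /** Prints each term of the doc's term vector together with its positions. */
    static void dump(IndexReader reader, int docId, String field) throws IOException {
        Terms vector = reader.getTermVector(docId, field);
        if (vector == null) {
            return; // no term vector stored for this doc/field
        }
        TermsEnum termsEnum = vector.iterator();
        PostingsEnum postings = null; // reused across terms, as in the init() code above
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            postings = termsEnum.postings(postings, PostingsEnum.POSITIONS);
            postings.nextDoc(); // a term vector behaves like a single-document index
            int freq = postings.freq();
            for (int i = 0; i < freq; i++) {
                System.out.println(term.utf8ToString() + " @ position " + postings.nextPosition());
            }
        }
    }
}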

Example 74 with PostingsEnum

Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From class TestClassicAnalyzer, method testWickedLongTerm.

/**
 * Make sure we skip wicked long terms.
 */
public void testWickedLongTerm() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    Analyzer analyzer = new ClassicAnalyzer();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer));
    char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
    Arrays.fill(chars, 'x');
    Document doc = new Document();
    final String bigTerm = new String(chars);
    // This produces a too-long term:
    String contents = "abc xyz x" + bigTerm + " another term";
    doc.add(new TextField("content", contents, Field.Store.NO));
    writer.addDocument(doc);
    // Make sure we can add another normal document
    doc = new Document();
    doc.add(new TextField("content", "abc bbb ccc", Field.Store.NO));
    writer.addDocument(doc);
    writer.close();
    IndexReader reader = DirectoryReader.open(dir);
    // Make sure all terms < max size were indexed
    assertEquals(2, reader.docFreq(new Term("content", "abc")));
    assertEquals(1, reader.docFreq(new Term("content", "bbb")));
    assertEquals(1, reader.docFreq(new Term("content", "term")));
    assertEquals(1, reader.docFreq(new Term("content", "another")));
    // Make sure position is still incremented when
    // massive term is skipped:
    PostingsEnum tps = MultiFields.getTermPositionsEnum(reader, "content", new BytesRef("another"));
    assertTrue(tps.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(1, tps.freq());
    assertEquals(3, tps.nextPosition());
    // Make sure the doc that has the massive term is in
    // the index:
    assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());
    reader.close();
    // Make sure we can add a document with exactly the
    // maximum length term, and search on that term:
    doc = new Document();
    doc.add(new TextField("content", bigTerm, Field.Store.NO));
    ClassicAnalyzer sa = new ClassicAnalyzer();
    sa.setMaxTokenLength(100000);
    writer = new IndexWriter(dir, new IndexWriterConfig(sa));
    writer.addDocument(doc);
    writer.close();
    reader = DirectoryReader.open(dir);
    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
    reader.close();
    dir.close();
    analyzer.close();
    sa.close();
}
Also used: Term (org.apache.lucene.index.Term), Analyzer (org.apache.lucene.analysis.Analyzer), Document (org.apache.lucene.document.Document), RAMDirectory (org.apache.lucene.store.RAMDirectory), IndexWriter (org.apache.lucene.index.IndexWriter), IndexReader (org.apache.lucene.index.IndexReader), TextField (org.apache.lucene.document.TextField), PostingsEnum (org.apache.lucene.index.PostingsEnum), BytesRef (org.apache.lucene.util.BytesRef), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)
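
A minimal sketch of the position check used above, via MultiFields.getTermPositionsEnum (FirstPosition and firstPosition are hypothetical names; Lucene 6.x/7.x API):

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

final class FirstPosition {
    /** Returns the first position of {@code term} in the first matching document, or -1 if absent. */
    static int firstPosition(IndexReader reader, String field, String term) throws IOException {
        PostingsEnum postings = MultiFields.getTermPositionsEnum(reader, field, new BytesRef(term));
        if (postings == null || postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
            return -1;
        }
        return postings.nextPosition();
    }
}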

Example 75 with PostingsEnum

Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From class BlendedInfixSuggester, method createCoefficient.

/**
 * Create the coefficient to transform the weight.
 *
 * @param searcher the searcher used to read term vectors from the index
 * @param doc id of the document
 * @param matchedTokens tokens found in the query
 * @param prefixToken unfinished token in the query
 * @return the coefficient
 * @throws IOException If there are problems reading term vectors from the underlying Lucene index.
 */
private double createCoefficient(IndexSearcher searcher, int doc, Set<String> matchedTokens, String prefixToken) throws IOException {
    Terms tv = searcher.getIndexReader().getTermVector(doc, TEXT_FIELD_NAME);
    TermsEnum it = tv.iterator();
    Integer position = Integer.MAX_VALUE;
    BytesRef term;
    // find the closest token position
    while ((term = it.next()) != null) {
        String docTerm = term.utf8ToString();
        if (matchedTokens.contains(docTerm) || (prefixToken != null && docTerm.startsWith(prefixToken))) {
            PostingsEnum docPosEnum = it.postings(null, PostingsEnum.OFFSETS);
            docPosEnum.nextDoc();
            // use the first occurrence of the term
            int p = docPosEnum.nextPosition();
            if (p < position) {
                position = p;
            }
        }
    }
    // create corresponding coefficient based on position
    return calculateCoefficient(position);
}
Also used: Terms (org.apache.lucene.index.Terms), PostingsEnum (org.apache.lucene.index.PostingsEnum), BytesRef (org.apache.lucene.util.BytesRef), TermsEnum (org.apache.lucene.index.TermsEnum)
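
A minimal sketch of the same closest-position scan as a standalone helper (PositionBlend and coefficient are hypothetical names; the reciprocal 1/(position+1) blend shown here matches one of BlendedInfixSuggester's blender types, POSITION_RECIPROCAL, but the suggester supports others):

import java.io.IOException;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

final class PositionBlend {
    /** Scales a suggestion weight by how early any matched token appears in the document. */
    static double coefficient(IndexReader reader, int doc, String field, Set<String> matchedTokens) throws IOException {
        Terms tv = reader.getTermVector(doc, field);
        if (tv == null) {
            return 1.0; // no term vector stored: leave the weight unchanged
        }
        TermsEnum it = tv.iterator();
        int closest = Integer.MAX_VALUE;
        BytesRef term;
        while ((term = it.next()) != null) {
            if (!matchedTokens.contains(term.utf8ToString())) {
                continue;
            }
            PostingsEnum postings = it.postings(null, PostingsEnum.POSITIONS);
            postings.nextDoc(); // the term vector exposes a single pseudo-document
            closest = Math.min(closest, postings.nextPosition()); // first occurrence of this term
        }
        return closest == Integer.MAX_VALUE ? 1.0 : 1.0 / (closest + 1); // reciprocal position blend
    }
}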

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum): 80
BytesRef (org.apache.lucene.util.BytesRef): 59
TermsEnum (org.apache.lucene.index.TermsEnum): 56
Terms (org.apache.lucene.index.Terms): 47
Fields (org.apache.lucene.index.Fields): 18
LeafReader (org.apache.lucene.index.LeafReader): 17
Term (org.apache.lucene.index.Term): 17
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 15
Document (org.apache.lucene.document.Document): 13
ArrayList (java.util.ArrayList): 12
Bits (org.apache.lucene.util.Bits): 11
IndexReader (org.apache.lucene.index.IndexReader): 10
TextField (org.apache.lucene.document.TextField): 9
Directory (org.apache.lucene.store.Directory): 9
IOException (java.io.IOException): 8
DirectoryReader (org.apache.lucene.index.DirectoryReader): 7
IndexWriter (org.apache.lucene.index.IndexWriter): 6
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 6
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 5
XContentBuilder (org.elasticsearch.common.xcontent.XContentBuilder): 5