Use of org.apache.lucene.codecs.TermVectorsReader in project lucene-solr by apache.
From the class CheckIndex, method testTermVectors:
/**
 * Test term vectors.
 * @lucene.experimental
 */
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast, Version version) throws IOException {
  long startNS = System.nanoTime();
  final Status.TermVectorStatus status = new Status.TermVectorStatus();
  final FieldInfos fieldInfos = reader.getFieldInfos();
  try {
    if (infoStream != null) {
      infoStream.print(" test: term vectors........");
    }
    PostingsEnum postings = null;
    // Only used if crossCheckTermVectors is true:
    PostingsEnum postingsDocs = null;
    final Bits liveDocs = reader.getLiveDocs();
    final Fields postingsFields;
    // TODO: testTermsIndex
    if (crossCheckTermVectors) {
      postingsFields = reader.getPostingsReader().getMergeInstance();
    } else {
      postingsFields = null;
    }
    TermVectorsReader vectorsReader = reader.getTermVectorsReader();
    if (vectorsReader != null) {
      vectorsReader = vectorsReader.getMergeInstance();
      for (int j = 0; j < reader.maxDoc(); ++j) {
        // Intentionally pull/visit (but don't count in
        // stats) deleted documents to make sure they too
        // are not corrupt:
        Fields tfv = vectorsReader.get(j);
        if (tfv != null) {
          // First run with no deletions:
          checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose, version);
          // Only agg stats if the doc is live:
          final boolean doStats = liveDocs == null || liveDocs.get(j);
          if (doStats) {
            status.docCount++;
          }
          for (String field : tfv) {
            if (doStats) {
              status.totVectors++;
            }
            // Make sure FieldInfo thinks this field is vector'd:
            final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
            if (!fieldInfo.hasVectors()) {
              throw new RuntimeException("docID=" + j + " has term vectors for field=" + field + " but FieldInfo has storeTermVector=false");
            }
            if (crossCheckTermVectors) {
              Terms terms = tfv.terms(field);
              TermsEnum termsEnum = terms.iterator();
              final boolean postingsHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
              final boolean postingsHasPayload = fieldInfo.hasPayloads();
              final boolean vectorsHasPayload = terms.hasPayloads();
              Terms postingsTerms = postingsFields.terms(field);
              if (postingsTerms == null) {
                throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j);
              }
              TermsEnum postingsTermsEnum = postingsTerms.iterator();
              final boolean hasProx = terms.hasOffsets() || terms.hasPositions();
              BytesRef term = null;
              while ((term = termsEnum.next()) != null) {
                // This is the term vectors:
                postings = termsEnum.postings(postings, PostingsEnum.ALL);
                assert postings != null;
                if (!postingsTermsEnum.seekExact(term)) {
                  throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
                }
                // This is the inverted index ("real" postings):
                postingsDocs = postingsTermsEnum.postings(postingsDocs, PostingsEnum.ALL);
                assert postingsDocs != null;
                final int advanceDoc = postingsDocs.advance(j);
                if (advanceDoc != j) {
                  throw new RuntimeException("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")");
                }
                final int doc = postings.nextDoc();
                if (doc != 0) {
                  throw new RuntimeException("vector for doc " + j + " didn't return docID=0: got docID=" + doc);
                }
                if (postingsHasFreq) {
                  final int tf = postings.freq();
                  if (postingsHasFreq && postingsDocs.freq() != tf) {
                    throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs.freq());
                  }
                  // Term vectors has prox?
                  if (hasProx) {
                    for (int i = 0; i < tf; i++) {
                      int pos = postings.nextPosition();
                      if (postingsTerms.hasPositions()) {
                        int postingsPos = postingsDocs.nextPosition();
                        if (terms.hasPositions() && pos != postingsPos) {
                          throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": pos=" + pos + " differs from postings pos=" + postingsPos);
                        }
                      }
                      // Call the methods to at least make
                      // sure they don't throw exc:
                      final int startOffset = postings.startOffset();
                      final int endOffset = postings.endOffset();
                      if (startOffset != -1 && endOffset != -1 && postingsTerms.hasOffsets()) {
                        int postingsStartOffset = postingsDocs.startOffset();
                        int postingsEndOffset = postingsDocs.endOffset();
                        if (startOffset != postingsStartOffset) {
                          throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": startOffset=" + startOffset + " differs from postings startOffset=" + postingsStartOffset);
                        }
                        if (endOffset != postingsEndOffset) {
                          throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": endOffset=" + endOffset + " differs from postings endOffset=" + postingsEndOffset);
                        }
                      }
                      BytesRef payload = postings.getPayload();
                      if (payload != null) {
                        assert vectorsHasPayload;
                      }
                      if (postingsHasPayload && vectorsHasPayload) {
                        if (payload == null) {
                          // postings has payloads too, it should not have one at this position
                          if (postingsDocs.getPayload() != null) {
                            throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has no payload but postings does: " + postingsDocs.getPayload());
                          }
                        } else {
                          // postings should also have one at this position, with the same bytes.
                          if (postingsDocs.getPayload() == null) {
                            throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but postings does not.");
                          }
                          BytesRef postingsPayload = postingsDocs.getPayload();
                          if (!payload.equals(postingsPayload)) {
                            throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but differs from postings payload=" + postingsPayload);
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
    float vectorAvg = status.docCount == 0 ? 0 : status.totVectors / (float) status.docCount;
    msg(infoStream, String.format(Locale.ROOT, "OK [%d total term vector count; avg %.1f term/freq vector fields per doc] [took %.3f sec]", status.totVectors, vectorAvg, nsToSec(System.nanoTime() - startNS)));
  } catch (Throwable e) {
    if (failFast) {
      throw IOUtils.rethrowAlways(e);
    }
    msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
    status.error = e;
    if (infoStream != null) {
      e.printStackTrace(infoStream);
    }
  }
  return status;
}
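
For context, a minimal hedged sketch of how this check is normally reached from application code: the CheckIndex tool runs testTermVectors (among its other per-segment checks) when pointed at an index directory. This is not part of the snippet above; the index path is a placeholder.

import java.nio.file.Paths;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CheckIndexExample {
  public static void main(String[] args) throws Exception {
    // "/path/to/index" is illustrative; point this at a real index directory.
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
         CheckIndex checker = new CheckIndex(dir)) {
      checker.setInfoStream(System.out);  // prints per-check progress such as "test: term vectors"
      CheckIndex.Status status = checker.checkIndex();
      System.out.println(status.clean ? "index is clean" : "index has problems");
    }
  }
}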
Use of org.apache.lucene.codecs.TermVectorsReader in project lucene-solr by apache.
From the class CompressingTermVectorsWriter, method merge:
@Override
public int merge(MergeState mergeState) throws IOException {
  if (mergeState.needsIndexSort) {
    // TODO: can we gain back some optos even if index is sorted?
    // E.g. if sort results in large chunks of contiguous docs from one sub
    // being copied over...?
    return super.merge(mergeState);
  }
  int docCount = 0;
  int numReaders = mergeState.maxDocs.length;
  MatchingReaders matching = new MatchingReaders(mergeState);
  for (int readerIndex = 0; readerIndex < numReaders; readerIndex++) {
    CompressingTermVectorsReader matchingVectorsReader = null;
    final TermVectorsReader vectorsReader = mergeState.termVectorsReaders[readerIndex];
    if (matching.matchingReaders[readerIndex]) {
      // we can only bulk-copy if the matching reader is also a CompressingTermVectorsReader
      if (vectorsReader != null && vectorsReader instanceof CompressingTermVectorsReader) {
        matchingVectorsReader = (CompressingTermVectorsReader) vectorsReader;
      }
    }
    final int maxDoc = mergeState.maxDocs[readerIndex];
    final Bits liveDocs = mergeState.liveDocs[readerIndex];
    if (matchingVectorsReader != null &&
        matchingVectorsReader.getCompressionMode() == compressionMode &&
        matchingVectorsReader.getChunkSize() == chunkSize &&
        matchingVectorsReader.getVersion() == VERSION_CURRENT &&
        matchingVectorsReader.getPackedIntsVersion() == PackedInts.VERSION_CURRENT &&
        BULK_MERGE_ENABLED &&
        liveDocs == null &&
        !tooDirty(matchingVectorsReader)) {
      // optimized merge, raw byte copy
      // its not worth fine-graining this if there are deletions.
      matchingVectorsReader.checkIntegrity();
      // flush any pending chunks
      if (!pendingDocs.isEmpty()) {
        flush();
        // incomplete: we had to force this flush
        numDirtyChunks++;
      }
      // iterate over each chunk. we use the vectors index to find chunk boundaries,
      // read the docstart + doccount from the chunk header (we write a new header, since doc numbers will change),
      // and just copy the bytes directly.
      IndexInput rawDocs = matchingVectorsReader.getVectorsStream();
      CompressingStoredFieldsIndexReader index = matchingVectorsReader.getIndexReader();
      rawDocs.seek(index.getStartPointer(0));
      int docID = 0;
      while (docID < maxDoc) {
        // read header
        int base = rawDocs.readVInt();
        if (base != docID) {
          throw new CorruptIndexException("invalid state: base=" + base + ", docID=" + docID, rawDocs);
        }
        int bufferedDocs = rawDocs.readVInt();
        // write a new index entry and new header for this chunk.
        indexWriter.writeIndex(bufferedDocs, vectorsStream.getFilePointer());
        // rebase
        vectorsStream.writeVInt(docCount);
        vectorsStream.writeVInt(bufferedDocs);
        docID += bufferedDocs;
        docCount += bufferedDocs;
        numDocs += bufferedDocs;
        if (docID > maxDoc) {
          throw new CorruptIndexException("invalid state: base=" + base + ", count=" + bufferedDocs + ", maxDoc=" + maxDoc, rawDocs);
        }
        // copy bytes until the next chunk boundary (or end of chunk data).
        // using the stored fields index for this isn't the most efficient, but fast enough
        // and is a source of redundancy for detecting bad things.
        final long end;
        if (docID == maxDoc) {
          end = matchingVectorsReader.getMaxPointer();
        } else {
          end = index.getStartPointer(docID);
        }
        vectorsStream.copyBytes(rawDocs, end - rawDocs.getFilePointer());
      }
      if (rawDocs.getFilePointer() != matchingVectorsReader.getMaxPointer()) {
        throw new CorruptIndexException("invalid state: pos=" + rawDocs.getFilePointer() + ", max=" + matchingVectorsReader.getMaxPointer(), rawDocs);
      }
      // since we bulk merged all chunks, we inherit any dirty ones from this segment.
      numChunks += matchingVectorsReader.getNumChunks();
      numDirtyChunks += matchingVectorsReader.getNumDirtyChunks();
    } else {
      // naive merge...
      if (vectorsReader != null) {
        vectorsReader.checkIntegrity();
      }
      for (int i = 0; i < maxDoc; i++) {
        if (liveDocs != null && liveDocs.get(i) == false) {
          continue;
        }
        Fields vectors;
        if (vectorsReader == null) {
          vectors = null;
        } else {
          vectors = vectorsReader.get(i);
        }
        addAllDocVectors(vectors, mergeState);
        ++docCount;
      }
    }
  }
  finish(mergeState.mergeFieldInfos, docCount);
  return docCount;
}
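
As a hedged usage note (not part of the merge code above): this merge path is typically exercised when segments are combined, for example by forcing a merge through IndexWriter. The directory path and analyzer below are illustrative placeholders.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ForceMergeExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      // Merging down to one segment drives the codec's TermVectorsWriter.merge, which takes
      // either the bulk-copy branch or the naive addAllDocVectors branch shown above.
      writer.forceMerge(1);
    }
  }
}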
Use of org.apache.lucene.codecs.TermVectorsReader in project lucene-solr by apache.
From the class TestTermVectorsReader, method testOffsetReader:
public void testOffsetReader() throws IOException {
  TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
  Terms vector = reader.get(0).terms(testFields[0]);
  assertNotNull(vector);
  TermsEnum termsEnum = vector.iterator();
  assertNotNull(termsEnum);
  assertEquals(testTerms.length, vector.size());
  PostingsEnum dpEnum = null;
  for (int i = 0; i < testTerms.length; i++) {
    final BytesRef text = termsEnum.next();
    assertNotNull(text);
    String term = text.utf8ToString();
    assertEquals(testTerms[i], term);
    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertNotNull(dpEnum);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(dpEnum.freq(), positions[i].length);
    for (int j = 0; j < positions[i].length; j++) {
      assertEquals(positions[i][j], dpEnum.nextPosition());
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertNotNull(dpEnum);
    assertEquals(dpEnum.freq(), positions[i].length);
    for (int j = 0; j < positions[i].length; j++) {
      assertEquals(positions[i][j], dpEnum.nextPosition());
      assertEquals(j * 10, dpEnum.startOffset());
      assertEquals(j * 10 + testTerms[i].length(), dpEnum.endOffset());
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
  }
  reader.close();
}
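
For reference, a minimal sketch (not taken from the test class) of how a field has to be indexed so that a term vectors reader finds frequencies, positions, and offsets as asserted above. The field name, text, analyzer, and path are illustrative.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class IndexWithTermVectorsExample {
  public static void main(String[] args) throws Exception {
    // Enable term vectors with positions and offsets on an analyzed, unstored field.
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    ft.freeze();
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      Document doc = new Document();
      doc.add(new Field("field", "this is some example text", ft));  // illustrative field name and content
      writer.addDocument(doc);
    }
  }
}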
Use of org.apache.lucene.codecs.TermVectorsReader in project lucene-solr by apache.
From the class TestTermVectorsReader, method testReader:
public void testReader() throws IOException {
  TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
  for (int j = 0; j < 5; j++) {
    Terms vector = reader.get(j).terms(testFields[0]);
    assertNotNull(vector);
    assertEquals(testTerms.length, vector.size());
    TermsEnum termsEnum = vector.iterator();
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      //System.out.println("Term: " + term);
      assertEquals(testTerms[i], term);
    }
    assertNull(termsEnum.next());
  }
  reader.close();
}
Use of org.apache.lucene.codecs.TermVectorsReader in project lucene-solr by apache.
From the class TestTermVectorsReader, method testDocsEnum:
public void testDocsEnum() throws IOException {
  TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
  for (int j = 0; j < 5; j++) {
    Terms vector = reader.get(j).terms(testFields[0]);
    assertNotNull(vector);
    assertEquals(testTerms.length, vector.size());
    TermsEnum termsEnum = vector.iterator();
    PostingsEnum postingsEnum = null;
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      //System.out.println("Term: " + term);
      assertEquals(testTerms[i], term);
      postingsEnum = TestUtil.docs(random(), termsEnum, postingsEnum, PostingsEnum.NONE);
      assertNotNull(postingsEnum);
      int doc = postingsEnum.docID();
      assertEquals(-1, doc);
      assertTrue(postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, postingsEnum.nextDoc());
    }
    assertNull(termsEnum.next());
  }
  reader.close();
}
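
As a closing, hedged sketch (not part of the test above): application code usually reads term vectors through IndexReader.getTermVectors rather than by constructing a codec-level TermVectorsReader directly. The index path and field name are placeholders.

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class ReadTermVectorsExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
         IndexReader reader = DirectoryReader.open(dir)) {
      Fields fields = reader.getTermVectors(0);  // vectors for docID 0, or null if none were stored
      if (fields != null) {
        Terms terms = fields.terms("field");     // illustrative field name
        if (terms != null) {
          TermsEnum termsEnum = terms.iterator();
          BytesRef term;
          while ((term = termsEnum.next()) != null) {
            System.out.println(term.utf8ToString() + " freq=" + termsEnum.totalTermFreq());
          }
        }
      }
    }
  }
}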