   * Tests the term vectors of every document in the reader, optionally cross-checking them against the postings.
   * @lucene.experimental
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast, Version version) throws IOException {
    // Walks every doc ID below reader.maxDoc(), validates its term vectors with
    // checkFields and, when crossCheckTermVectors is set, compares each vector
    // term against the postings (freq, positions, offsets, payloads).
    // Returns the status object. On failure the Throwable is either rethrown
    // (failFast) or recorded in status.error.
    // Start time, used for the elapsed-time figure in the final OK message.
    long startNS = System.nanoTime();
    final Status.TermVectorStatus status = new Status.TermVectorStatus();
    final FieldInfos fieldInfos = reader.getFieldInfos();
    try {
        if (infoStream != null) {
            infoStream.print("    test: term vectors........");
        // Reused across terms to avoid re-allocating enums:
        PostingsEnum postings = null;
        // Only used if crossCheckTermVectors is true:
        PostingsEnum postingsDocs = null;
        final Bits liveDocs = reader.getLiveDocs();
        final Fields postingsFields;
        // TODO: testTermsIndex
        if (crossCheckTermVectors) {
            // The postings are only fetched when they are needed for the cross-check.
            postingsFields = reader.getPostingsReader().getMergeInstance();
        } else {
            postingsFields = null;
        TermVectorsReader vectorsReader = reader.getTermVectorsReader();
        // A null reader means there is nothing to iterate; the loop below is skipped.
        if (vectorsReader != null) {
            vectorsReader = vectorsReader.getMergeInstance();
            for (int j = 0; j < reader.maxDoc(); ++j) {
                // Intentionally pull/visit (but don't count in
                // stats) deleted documents to make sure they too
                // are not corrupt:
                Fields tfv = vectorsReader.get(j);
                if (tfv != null) {
                    // First run with no deletions:
                    checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose, version);
                    // Only agg stats if the doc is live:
                    final boolean doStats = liveDocs == null || liveDocs.get(j);
                    // NOTE(review): the bodies of the two doStats branches below are
                    // not visible in this view; presumably they update the counters
                    // on status (docCount / totVectors) -- confirm against the full file.
                    if (doStats) {
                    for (String field : tfv) {
                        if (doStats) {
                        // Make sure FieldInfo thinks this field is vector'd:
                        final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
                        if (!fieldInfo.hasVectors()) {
                            throw new RuntimeException("docID=" + j + " has term vectors for field=" + field + " but FieldInfo has storeTermVector=false");
                        if (crossCheckTermVectors) {
                            Terms terms = tfv.terms(field);
                            TermsEnum termsEnum = terms.iterator();
                            // True when the index options of the field include term frequencies.
                            final boolean postingsHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
                            final boolean postingsHasPayload = fieldInfo.hasPayloads();
                            final boolean vectorsHasPayload = terms.hasPayloads();
                            // Every field that has vectors must also exist in the postings.
                            Terms postingsTerms = postingsFields.terms(field);
                            if (postingsTerms == null) {
                                throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j);
                            TermsEnum postingsTermsEnum = postingsTerms.iterator();
                            // The vector carries per-position data if it has offsets or positions.
                            final boolean hasProx = terms.hasOffsets() || terms.hasPositions();
                            BytesRef term = null;
                            // NOTE(review): the loop condition below looks truncated in this
                            // view; presumably it advances termsEnum to the next term -- confirm.
                            while ((term = != null) {
                                // This is the term vectors:
                                postings = termsEnum.postings(postings, PostingsEnum.ALL);
                                assert postings != null;
                                // Each vector term must be present in the postings terms dictionary.
                                if (!postingsTermsEnum.seekExact(term)) {
                                    throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
                                // This is the inverted index ("real" postings):
                                postingsDocs = postingsTermsEnum.postings(postingsDocs, PostingsEnum.ALL);
                                assert postingsDocs != null;
                                // The postings for this term must contain doc j: advance(j) has to land exactly on j.
                                final int advanceDoc = postingsDocs.advance(j);
                                if (advanceDoc != j) {
                                    throw new RuntimeException("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")");
                                // The vector enum is expected to expose its single document as doc ID 0.
                                final int doc = postings.nextDoc();
                                if (doc != 0) {
                                    throw new RuntimeException("vector for doc " + j + " didn't return docID=0: got docID=" + doc);
                                if (postingsHasFreq) {
                                    // Term frequency must agree between the vector and the postings.
                                    final int tf = postings.freq();
                                    if (postingsHasFreq && postingsDocs.freq() != tf) {
                                        throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs.freq());
                                    // Term vectors has prox?
                                    if (hasProx) {
                                        for (int i = 0; i < tf; i++) {
                                            int pos = postings.nextPosition();
                                            // The postings position is consumed whenever the postings
                                            // have positions, but only compared if the vector has them too.
                                            if (postingsTerms.hasPositions()) {
                                                int postingsPos = postingsDocs.nextPosition();
                                                if (terms.hasPositions() && pos != postingsPos) {
                                                    throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": pos=" + pos + " differs from postings pos=" + postingsPos);
                                            // Call the methods to at least make
                                            // sure they don't throw exc:
                                            final int startOffset = postings.startOffset();
                                            final int endOffset = postings.endOffset();
                                            // Offsets are compared only when the vector reports them
                                            // (neither is -1) and the postings store offsets as well.
                                            if (startOffset != -1 && endOffset != -1 && postingsTerms.hasOffsets()) {
                                                int postingsStartOffset = postingsDocs.startOffset();
                                                int postingsEndOffset = postingsDocs.endOffset();
                                                if (startOffset != postingsStartOffset) {
                                                    throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": startOffset=" + startOffset + " differs from postings startOffset=" + postingsStartOffset);
                                                if (endOffset != postingsEndOffset) {
                                                    throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": endOffset=" + endOffset + " differs from postings endOffset=" + postingsEndOffset);
                                            BytesRef payload = postings.getPayload();
                                            // A payload may only show up if the vector declares payloads.
                                            if (payload != null) {
                                                assert vectorsHasPayload;
                                            // Payloads are cross-checked only when both sides store them.
                                            if (postingsHasPayload && vectorsHasPayload) {
                                                if (payload == null) {
                                                    // postings has payloads too, it should not have one at this position
                                                    if (postingsDocs.getPayload() != null) {
                                                        throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has no payload but postings does: " + postingsDocs.getPayload());
                                                } else {
                                                    // postings should also have one at this position, with the same bytes.
                                                    if (postingsDocs.getPayload() == null) {
                                                        throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but postings does not.");
                                                    BytesRef postingsPayload = postingsDocs.getPayload();
                                                    if (!payload.equals(postingsPayload)) {
                                                        throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but differs from postings payload=" + postingsPayload);
        // Average vector count per counted doc; the ternary guards against division by zero.
        float vectorAvg = status.docCount == 0 ? 0 : status.totVectors / (float) status.docCount;
        msg(infoStream, String.format(Locale.ROOT, "OK [%d total term vector count; avg %.1f term/freq vector fields per doc] [took %.3f sec]", status.totVectors, vectorAvg, nsToSec(System.nanoTime() - startNS)));
    } catch (Throwable e) {
        // With failFast the failure propagates to the caller; otherwise it is
        // reported on infoStream and kept on the status object.
        if (failFast) {
            throw IOUtils.rethrowAlways(e);
        msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
        status.error = e;
        // NOTE(review): the body of this branch is not visible in this view;
        // presumably it prints the stack trace to infoStream -- confirm.
        if (infoStream != null) {
    return status;
public int merge(MergeState mergeState) throws IOException {
    // Merges the term vectors of every reader in mergeState into this writer and
    // returns the number of documents accounted for (docCount). Compatible readers
    // with no deletions have their chunks bulk-copied; all other readers are
    // merged one document at a time.
    if (mergeState.needsIndexSort) {
        // being copied over...?
        // With an index sort the work is delegated to the superclass merge.
        return super.merge(mergeState);
    int docCount = 0;
    int numReaders = mergeState.maxDocs.length;
    MatchingReaders matching = new MatchingReaders(mergeState);
    for (int readerIndex = 0; readerIndex < numReaders; readerIndex++) {
        // Stays null unless this reader qualifies for the raw-copy path.
        CompressingTermVectorsReader matchingVectorsReader = null;
        final TermVectorsReader vectorsReader = mergeState.termVectorsReaders[readerIndex];
        if (matching.matchingReaders[readerIndex]) {
            // we can only bulk-copy if the matching reader is also a CompressingTermVectorsReader
            if (vectorsReader != null && vectorsReader instanceof CompressingTermVectorsReader) {
                matchingVectorsReader = (CompressingTermVectorsReader) vectorsReader;
        final int maxDoc = mergeState.maxDocs[readerIndex];
        final Bits liveDocs = mergeState.liveDocs[readerIndex];
        // Bulk copy requires the same compression mode, chunk size, format version and
        // packed-ints version, bulk merging enabled, no deletions, and a source
        // segment that tooDirty does not reject.
        if (matchingVectorsReader != null && matchingVectorsReader.getCompressionMode() == compressionMode && matchingVectorsReader.getChunkSize() == chunkSize && matchingVectorsReader.getVersion() == VERSION_CURRENT && matchingVectorsReader.getPackedIntsVersion() == PackedInts.VERSION_CURRENT && BULK_MERGE_ENABLED && liveDocs == null && !tooDirty(matchingVectorsReader)) {
            // optimized merge, raw byte copy
            // its not worth fine-graining this if there are deletions.
            // flush any pending chunks
            // NOTE(review): the statements of this branch are not visible in this view;
            // presumably it flushes pendingDocs and counts a dirty chunk -- confirm.
            if (!pendingDocs.isEmpty()) {
                // incomplete: we had to force this flush
            // iterate over each chunk. we use the vectors index to find chunk boundaries,
            // read the docstart + doccount from the chunk header (we write a new header, since doc numbers will change),
            // and just copy the bytes directly.
            IndexInput rawDocs = matchingVectorsReader.getVectorsStream();
            CompressingStoredFieldsIndexReader index = matchingVectorsReader.getIndexReader();
            // NOTE(review): no seek on rawDocs is visible before the loop in this view;
            // verify against the full file how the stream is positioned at the first chunk.
            int docID = 0;
            while (docID < maxDoc) {
                // read header
                int base = rawDocs.readVInt();
                // The chunk header must start exactly at the doc we expect next.
                if (base != docID) {
                    throw new CorruptIndexException("invalid state: base=" + base + ", docID=" + docID, rawDocs);
                int bufferedDocs = rawDocs.readVInt();
                // write a new index entry and new header for this chunk.
                // NOTE(review): only the index entry is written here in this view; the
                // header writes mentioned above are not visible -- confirm.
                indexWriter.writeIndex(bufferedDocs, vectorsStream.getFilePointer());
                // rebase
                docID += bufferedDocs;
                docCount += bufferedDocs;
                numDocs += bufferedDocs;
                // A chunk may not run past the end of the segment.
                if (docID > maxDoc) {
                    throw new CorruptIndexException("invalid state: base=" + base + ", count=" + bufferedDocs + ", maxDoc=" + maxDoc, rawDocs);
                // copy bytes until the next chunk boundary (or end of chunk data).
                // using the stored fields index for this isn't the most efficient, but fast enough
                // and is a source of redundancy for detecting bad things.
                final long end;
                if (docID == maxDoc) {
                    end = matchingVectorsReader.getMaxPointer();
                } else {
                    end = index.getStartPointer(docID);
                vectorsStream.copyBytes(rawDocs, end - rawDocs.getFilePointer());
            // After the last chunk the input must sit exactly at the end of the chunk data.
            if (rawDocs.getFilePointer() != matchingVectorsReader.getMaxPointer()) {
                throw new CorruptIndexException("invalid state: pos=" + rawDocs.getFilePointer() + ", max=" + matchingVectorsReader.getMaxPointer(), rawDocs);
            // since we bulk merged all chunks, we inherit any dirty ones from this segment.
            numChunks += matchingVectorsReader.getNumChunks();
            numDirtyChunks += matchingVectorsReader.getNumDirtyChunks();
        } else {
            // naive merge...
            // NOTE(review): the body of this branch is not visible in this view;
            // presumably it runs an integrity check on vectorsReader -- confirm.
            if (vectorsReader != null) {
            for (int i = 0; i < maxDoc; i++) {
                // NOTE(review): the body of this branch is not visible in this view;
                // presumably deleted docs are skipped here -- confirm.
                if (liveDocs != null && liveDocs.get(i) == false) {
                Fields vectors;
                // A reader without term vectors contributes null vectors for each doc.
                if (vectorsReader == null) {
                    vectors = null;
                } else {
                    vectors = vectorsReader.get(i);
                addAllDocVectors(vectors, mergeState);
    finish(mergeState.mergeFieldInfos, docCount);
    return docCount;
public void testOffsetReader() throws IOException {
    // Reads the vector of doc 0 for testFields[0] and, for every expected term,
    // checks positions in a first pass and positions plus offsets in a second pass.
    // NOTE(review): the second argument of vectorsReader(...) is missing in this view.
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir,, fieldInfos, newIOContext(random()));
    Terms vector = reader.get(0).terms(testFields[0]);
    TermsEnum termsEnum = vector.iterator();
    assertEquals(testTerms.length, vector.size());
    PostingsEnum dpEnum = null;
    for (int i = 0; i < testTerms.length; i++) {
        // NOTE(review): the right-hand side is missing in this view; presumably
        // the next term from termsEnum -- confirm.
        final BytesRef text =;
        String term = text.utf8ToString();
        assertEquals(testTerms[i], term);
        // First pass: exactly one doc, with the expected freq and positions.
        dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
        assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        assertEquals(dpEnum.freq(), positions[i].length);
        for (int j = 0; j < positions[i].length; j++) {
            assertEquals(positions[i][j], dpEnum.nextPosition());
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
        // Second pass on a re-pulled enum: also check the offsets of each position.
        dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
        assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        assertEquals(dpEnum.freq(), positions[i].length);
        for (int j = 0; j < positions[i].length; j++) {
            assertEquals(positions[i][j], dpEnum.nextPosition());
            // Expected offsets: start at j * 10, end one term length later.
            assertEquals(j * 10, dpEnum.startOffset());
            assertEquals(j * 10 + testTerms[i].length(), dpEnum.endOffset());
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
public void testReader() throws IOException {
    // For docs 0..4, checks that the vector of testFields[0] has testTerms.length
    // terms and that they come back in the same order as testTerms.
    // NOTE(review): the second argument of vectorsReader(...) is missing in this view.
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir,, fieldInfos, newIOContext(random()));
    for (int j = 0; j < 5; j++) {
        Terms vector = reader.get(j).terms(testFields[0]);
        assertEquals(testTerms.length, vector.size());
        TermsEnum termsEnum = vector.iterator();
        for (int i = 0; i < testTerms.length; i++) {
            // NOTE(review): the right-hand side is missing in this view; presumably
            // the next term from termsEnum -- confirm.
            final BytesRef text =;
            String term = text.utf8ToString();
            //System.out.println("Term: " + term);
            assertEquals(testTerms[i], term);
public void testDocsEnum() throws IOException {
    // For docs 0..4, checks each term of testFields[0] and verifies that its
    // postings enum starts unpositioned (docID -1) and yields exactly one doc.
    // NOTE(review): the second argument of vectorsReader(...) is missing in this view.
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir,, fieldInfos, newIOContext(random()));
    for (int j = 0; j < 5; j++) {
        Terms vector = reader.get(j).terms(testFields[0]);
        assertEquals(testTerms.length, vector.size());
        TermsEnum termsEnum = vector.iterator();
        PostingsEnum postingsEnum = null;
        for (int i = 0; i < testTerms.length; i++) {
            // NOTE(review): the right-hand side is missing in this view; presumably
            // the next term from termsEnum -- confirm.
            final BytesRef text =;
            String term = text.utf8ToString();
            //System.out.println("Term: " + term);
            assertEquals(testTerms[i], term);
            // NOTE(review): the call target before the argument list is missing in this view.
            postingsEnum =, termsEnum, postingsEnum, PostingsEnum.NONE);
            // Before the first nextDoc() the enum must report docID -1.
            int doc = postingsEnum.docID();
            assertEquals(-1, doc);
            // Exactly one document: the first nextDoc() succeeds, the second is exhausted.
            assertTrue(postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            assertEquals(DocIdSetIterator.NO_MORE_DOCS, postingsEnum.nextDoc());
