Use of org.apache.lucene.codecs.TermVectorsReader in project lucene-solr by apache.
From the class CheckIndex, method testTermVectors:
/**
 * Test term vectors.
 * @lucene.experimental
 */
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast, Version version) throws IOException {
  long startNS = System.nanoTime();
  final Status.TermVectorStatus status = new Status.TermVectorStatus();
  final FieldInfos fieldInfos = reader.getFieldInfos();
  try {
    if (infoStream != null) {
      infoStream.print(" test: term vectors........");
    }
    PostingsEnum postings = null;
    // Only used if crossCheckTermVectors is true:
    PostingsEnum postingsDocs = null;
    final Bits liveDocs = reader.getLiveDocs();
    final Fields postingsFields;
    // TODO: testTermsIndex
    if (crossCheckTermVectors) {
      postingsFields = reader.getPostingsReader().getMergeInstance();
    } else {
      postingsFields = null;
    }
    TermVectorsReader vectorsReader = reader.getTermVectorsReader();
    if (vectorsReader != null) {
      vectorsReader = vectorsReader.getMergeInstance();
      for (int j = 0; j < reader.maxDoc(); ++j) {
        // Intentionally pull/visit (but don't count in
        // stats) deleted documents to make sure they too
        // are not corrupt:
        Fields tfv = vectorsReader.get(j);
        if (tfv != null) {
          // First run with no deletions:
          checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose, version);
          // Only agg stats if the doc is live:
          final boolean doStats = liveDocs == null || liveDocs.get(j);
          if (doStats) {
            status.docCount++;
          }
          for (String field : tfv) {
            if (doStats) {
              status.totVectors++;
            }
            // Make sure FieldInfo thinks this field is vector'd:
            final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
            if (!fieldInfo.hasVectors()) {
              throw new RuntimeException("docID=" + j + " has term vectors for field=" + field + " but FieldInfo has storeTermVector=false");
            }
            if (crossCheckTermVectors) {
              Terms terms = tfv.terms(field);
              TermsEnum termsEnum = terms.iterator();
              final boolean postingsHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
              final boolean postingsHasPayload = fieldInfo.hasPayloads();
              final boolean vectorsHasPayload = terms.hasPayloads();
              Terms postingsTerms = postingsFields.terms(field);
              if (postingsTerms == null) {
                throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j);
              }
              TermsEnum postingsTermsEnum = postingsTerms.iterator();
              final boolean hasProx = terms.hasOffsets() || terms.hasPositions();
              BytesRef term = null;
              while ((term = termsEnum.next()) != null) {
                // This is the term vectors:
                postings = termsEnum.postings(postings, PostingsEnum.ALL);
                assert postings != null;
                if (!postingsTermsEnum.seekExact(term)) {
                  throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
                }
                // This is the inverted index ("real" postings):
                postingsDocs = postingsTermsEnum.postings(postingsDocs, PostingsEnum.ALL);
                assert postingsDocs != null;
                final int advanceDoc = postingsDocs.advance(j);
                if (advanceDoc != j) {
                  throw new RuntimeException("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")");
                }
                final int doc = postings.nextDoc();
                if (doc != 0) {
                  throw new RuntimeException("vector for doc " + j + " didn't return docID=0: got docID=" + doc);
                }
                if (postingsHasFreq) {
                  final int tf = postings.freq();
                  if (postingsHasFreq && postingsDocs.freq() != tf) {
                    throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs.freq());
                  }
                  // Term vectors has prox?
                  if (hasProx) {
                    for (int i = 0; i < tf; i++) {
                      int pos = postings.nextPosition();
                      if (postingsTerms.hasPositions()) {
                        int postingsPos = postingsDocs.nextPosition();
                        if (terms.hasPositions() && pos != postingsPos) {
                          throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": pos=" + pos + " differs from postings pos=" + postingsPos);
                        }
                      }
                      // Call the methods to at least make
                      // sure they don't throw exc:
                      final int startOffset = postings.startOffset();
                      final int endOffset = postings.endOffset();
                      if (startOffset != -1 && endOffset != -1 && postingsTerms.hasOffsets()) {
                        int postingsStartOffset = postingsDocs.startOffset();
                        int postingsEndOffset = postingsDocs.endOffset();
                        if (startOffset != postingsStartOffset) {
                          throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": startOffset=" + startOffset + " differs from postings startOffset=" + postingsStartOffset);
                        }
                        if (endOffset != postingsEndOffset) {
                          throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": endOffset=" + endOffset + " differs from postings endOffset=" + postingsEndOffset);
                        }
                      }
                      BytesRef payload = postings.getPayload();
                      if (payload != null) {
                        assert vectorsHasPayload;
                      }
                      if (postingsHasPayload && vectorsHasPayload) {
                        if (payload == null) {
                          // postings has payloads too, it should not have one at this position
                          if (postingsDocs.getPayload() != null) {
                            throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has no payload but postings does: " + postingsDocs.getPayload());
                          }
                        } else {
                          // postings should also have one at this position, with the same bytes.
                          if (postingsDocs.getPayload() == null) {
                            throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but postings does not.");
                          }
                          BytesRef postingsPayload = postingsDocs.getPayload();
                          if (!payload.equals(postingsPayload)) {
                            throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but differs from postings payload=" + postingsPayload);
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
    float vectorAvg = status.docCount == 0 ? 0 : status.totVectors / (float) status.docCount;
    msg(infoStream, String.format(Locale.ROOT, "OK [%d total term vector count; avg %.1f term/freq vector fields per doc] [took %.3f sec]", status.totVectors, vectorAvg, nsToSec(System.nanoTime() - startNS)));
  } catch (Throwable e) {
    if (failFast) {
      throw IOUtils.rethrowAlways(e);
    }
    msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
    status.error = e;
    if (infoStream != null) {
      e.printStackTrace(infoStream);
    }
  }
  return status;
}
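
For context, a minimal hedged sketch of how this check is normally reached from application code: the CheckIndex tool runs testTermVectors (among its other per-segment checks) when pointed at an index directory. This is not part of the snippet above; the index path is a placeholder.

import java.nio.file.Paths;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CheckIndexExample {
  public static void main(String[] args) throws Exception {
    // "/path/to/index" is illustrative; point this at a real index directory.
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
         CheckIndex checker = new CheckIndex(dir)) {
      checker.setInfoStream(System.out);  // prints per-check progress such as "test: term vectors"
      CheckIndex.Status status = checker.checkIndex();
      System.out.println(status.clean ? "index is clean" : "index has problems");
    }
  }
}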
Use of org.apache.lucene.codecs.TermVectorsReader in project lucene-solr by apache.
From the class CompressingTermVectorsWriter, method merge:
@Override
public int merge(MergeState mergeState) throws IOException {
  if (mergeState.needsIndexSort) {
    // TODO: can we gain back some optos even if index is sorted?
    // E.g. if sort results in large chunks of contiguous docs from one sub
    // being copied over...?
    return super.merge(mergeState);
  }
  int docCount = 0;
  int numReaders = mergeState.maxDocs.length;
  MatchingReaders matching = new MatchingReaders(mergeState);
  for (int readerIndex = 0; readerIndex < numReaders; readerIndex++) {
    CompressingTermVectorsReader matchingVectorsReader = null;
    final TermVectorsReader vectorsReader = mergeState.termVectorsReaders[readerIndex];
    if (matching.matchingReaders[readerIndex]) {
      // we can only bulk-copy if the matching reader is also a CompressingTermVectorsReader
      if (vectorsReader != null && vectorsReader instanceof CompressingTermVectorsReader) {
        matchingVectorsReader = (CompressingTermVectorsReader) vectorsReader;
      }
    }
    final int maxDoc = mergeState.maxDocs[readerIndex];
    final Bits liveDocs = mergeState.liveDocs[readerIndex];
    if (matchingVectorsReader != null &&
        matchingVectorsReader.getCompressionMode() == compressionMode &&
        matchingVectorsReader.getChunkSize() == chunkSize &&
        matchingVectorsReader.getVersion() == VERSION_CURRENT &&
        matchingVectorsReader.getPackedIntsVersion() == PackedInts.VERSION_CURRENT &&
        BULK_MERGE_ENABLED &&
        liveDocs == null &&
        !tooDirty(matchingVectorsReader)) {
      // optimized merge, raw byte copy
      // its not worth fine-graining this if there are deletions.
      matchingVectorsReader.checkIntegrity();
      // flush any pending chunks
      if (!pendingDocs.isEmpty()) {
        flush();
        // incomplete: we had to force this flush
        numDirtyChunks++;
      }
      // iterate over each chunk. we use the vectors index to find chunk boundaries,
      // read the docstart + doccount from the chunk header (we write a new header, since doc numbers will change),
      // and just copy the bytes directly.
      IndexInput rawDocs = matchingVectorsReader.getVectorsStream();
      CompressingStoredFieldsIndexReader index = matchingVectorsReader.getIndexReader();
      rawDocs.seek(index.getStartPointer(0));
      int docID = 0;
      while (docID < maxDoc) {
        // read header
        int base = rawDocs.readVInt();
        if (base != docID) {
          throw new CorruptIndexException("invalid state: base=" + base + ", docID=" + docID, rawDocs);
        }
        int bufferedDocs = rawDocs.readVInt();
        // write a new index entry and new header for this chunk.
        indexWriter.writeIndex(bufferedDocs, vectorsStream.getFilePointer());
        // rebase
        vectorsStream.writeVInt(docCount);
        vectorsStream.writeVInt(bufferedDocs);
        docID += bufferedDocs;
        docCount += bufferedDocs;
        numDocs += bufferedDocs;
        if (docID > maxDoc) {
          throw new CorruptIndexException("invalid state: base=" + base + ", count=" + bufferedDocs + ", maxDoc=" + maxDoc, rawDocs);
        }
        // copy bytes until the next chunk boundary (or end of chunk data).
        // using the stored fields index for this isn't the most efficient, but fast enough
        // and is a source of redundancy for detecting bad things.
        final long end;
        if (docID == maxDoc) {
          end = matchingVectorsReader.getMaxPointer();
        } else {
          end = index.getStartPointer(docID);
        }
        vectorsStream.copyBytes(rawDocs, end - rawDocs.getFilePointer());
      }
      if (rawDocs.getFilePointer() != matchingVectorsReader.getMaxPointer()) {
        throw new CorruptIndexException("invalid state: pos=" + rawDocs.getFilePointer() + ", max=" + matchingVectorsReader.getMaxPointer(), rawDocs);
      }
      // since we bulk merged all chunks, we inherit any dirty ones from this segment.
      numChunks += matchingVectorsReader.getNumChunks();
      numDirtyChunks += matchingVectorsReader.getNumDirtyChunks();
    } else {
      // naive merge...
      if (vectorsReader != null) {
        vectorsReader.checkIntegrity();
      }
      for (int i = 0; i < maxDoc; i++) {
        if (liveDocs != null && liveDocs.get(i) == false) {
          continue;
        }
        Fields vectors;
        if (vectorsReader == null) {
          vectors = null;
        } else {
          vectors = vectorsReader.get(i);
        }
        addAllDocVectors(vectors, mergeState);
        ++docCount;
      }
    }
  }
  finish(mergeState.mergeFieldInfos, docCount);
  return docCount;
}
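
As a hedged usage note (not part of the merge code above): this merge path is typically exercised when segments are combined, for example by forcing a merge through IndexWriter. The directory path and analyzer below are illustrative placeholders.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ForceMergeExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      // Merging down to one segment drives the codec's TermVectorsWriter.merge, which takes
      // either the bulk-copy branch or the naive addAllDocVectors branch shown above.
      writer.forceMerge(1);
    }
  }
}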
Use of org.apache.lucene.codecs.TermVectorsReader in project lucene-solr by apache.
From the class TestTermVectorsReader, method testOffsetReader:
public void testOffsetReader() throws IOException {
  TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
  Terms vector = reader.get(0).terms(testFields[0]);
  assertNotNull(vector);
  TermsEnum termsEnum = vector.iterator();
  assertNotNull(termsEnum);
  assertEquals(testTerms.length, vector.size());
  PostingsEnum dpEnum = null;
  for (int i = 0; i < testTerms.length; i++) {
    final BytesRef text = termsEnum.next();
    assertNotNull(text);
    String term = text.utf8ToString();
    assertEquals(testTerms[i], term);
    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertNotNull(dpEnum);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(dpEnum.freq(), positions[i].length);
    for (int j = 0; j < positions[i].length; j++) {
      assertEquals(positions[i][j], dpEnum.nextPosition());
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertNotNull(dpEnum);
    assertEquals(dpEnum.freq(), positions[i].length);
    for (int j = 0; j < positions[i].length; j++) {
      assertEquals(positions[i][j], dpEnum.nextPosition());
      assertEquals(j * 10, dpEnum.startOffset());
      assertEquals(j * 10 + testTerms[i].length(), dpEnum.endOffset());
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
  }
  reader.close();
}
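
For reference, a minimal sketch (not taken from the test class) of how a field has to be indexed so that a term vectors reader finds frequencies, positions, and offsets as asserted above. The field name, text, analyzer, and path are illustrative.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class IndexWithTermVectorsExample {
  public static void main(String[] args) throws Exception {
    // Enable term vectors with positions and offsets on an analyzed, unstored field.
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    ft.freeze();
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      Document doc = new Document();
      doc.add(new Field("field", "this is some example text", ft));  // illustrative field name and content
      writer.addDocument(doc);
    }
  }
}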
Use of org.apache.lucene.codecs.TermVectorsReader in project lucene-solr by apache.
From the class TestTermVectorsReader, method testReader:
public void testReader() throws IOException {
  TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
  for (int j = 0; j < 5; j++) {
    Terms vector = reader.get(j).terms(testFields[0]);
    assertNotNull(vector);
    assertEquals(testTerms.length, vector.size());
    TermsEnum termsEnum = vector.iterator();
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      //System.out.println("Term: " + term);
      assertEquals(testTerms[i], term);
    }
    assertNull(termsEnum.next());
  }
  reader.close();
}
Use of org.apache.lucene.codecs.TermVectorsReader in project lucene-solr by apache.
From the class TestTermVectorsReader, method testDocsEnum:
public void testDocsEnum() throws IOException {
  TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
  for (int j = 0; j < 5; j++) {
    Terms vector = reader.get(j).terms(testFields[0]);
    assertNotNull(vector);
    assertEquals(testTerms.length, vector.size());
    TermsEnum termsEnum = vector.iterator();
    PostingsEnum postingsEnum = null;
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      //System.out.println("Term: " + term);
      assertEquals(testTerms[i], term);
      postingsEnum = TestUtil.docs(random(), termsEnum, postingsEnum, PostingsEnum.NONE);
      assertNotNull(postingsEnum);
      int doc = postingsEnum.docID();
      assertEquals(-1, doc);
      assertTrue(postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, postingsEnum.nextDoc());
    }
    assertNull(termsEnum.next());
  }
  reader.close();
}
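
As a closing, hedged sketch (not part of the test above): application code usually reads term vectors through IndexReader.getTermVectors rather than by constructing a codec-level TermVectorsReader directly. The index path and field name are placeholders.

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class ReadTermVectorsExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
         IndexReader reader = DirectoryReader.open(dir)) {
      Fields fields = reader.getTermVectors(0);  // vectors for docID 0, or null if none were stored
      if (fields != null) {
        Terms terms = fields.terms("field");     // illustrative field name
        if (terms != null) {
          TermsEnum termsEnum = terms.iterator();
          BytesRef term;
          while ((term = termsEnum.next()) != null) {
            System.out.println(term.utf8ToString() + " freq=" + termsEnum.totalTermFreq());
          }
        }
      }
    }
  }
}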