use of org.apache.lucene.store.IndexInput in project lucene-solr by apache.
the class Node method readLocalFileMetaData.
/** Opens the specified file, reads its identifying information, including file length, full index header (includes the unique segment
* ID) and the full footer (includes checksum), and returns the resulting {@link FileMetaData}.
*
* <p>This returns null, logging a message, if there are any problems (the file does not exist, is corrupt, truncated, etc.).</p> */
public FileMetaData readLocalFileMetaData(String fileName) throws IOException {
Map<String, FileMetaData> cache = lastFileMetaData;
FileMetaData result;
if (cache != null) {
// We may already have this file cached from the last NRT point:
result = cache.get(fileName);
} else {
result = null;
}
if (result == null) {
// Pull from the filesystem
long checksum;
long length;
byte[] header;
byte[] footer;
try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) {
try {
length = in.length();
header = CodecUtil.readIndexHeader(in);
footer = CodecUtil.readFooter(in);
checksum = CodecUtil.retrieveChecksum(in);
} catch (EOFException | CorruptIndexException cie) {
// File exists but is corrupt or truncated (e.g. the node crashed before the file was fully written). On init we try
// to delete such unreferenced files, but a virus checker can block that, leaving this bad file.
if (VERBOSE_FILES) {
message("file " + fileName + ": will copy [existing file is corrupt]");
}
return null;
}
if (VERBOSE_FILES) {
message("file " + fileName + " has length=" + bytesToString(length));
}
} catch (FileNotFoundException | NoSuchFileException e) {
if (VERBOSE_FILES) {
message("file " + fileName + ": will copy [file does not exist]");
}
return null;
}
// NOTE: checksum is redundant w/ footer, but we break it out separately because when the bits cross the wire we need direct access to
// checksum when copying to catch bit flips:
result = new FileMetaData(header, footer, length, checksum);
}
return result;
}
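As a standalone illustration of the same CodecUtil calls, here is a minimal sketch (a hypothetical helper, not part of lucene-solr; the class and method names are assumptions) that opens a file through an IndexInput and reports whether its index header, footer and checksum can be read.
import java.io.EOFException;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;

// Hypothetical helper: true if the file's header, footer and checksum are readable.
final class FileIntegritySketch {
  static boolean looksIntact(Directory dir, String fileName) throws IOException {
    try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) {
      CodecUtil.readIndexHeader(in);   // full index header, includes the unique segment ID
      CodecUtil.retrieveChecksum(in);  // seeks to the footer and returns the stored checksum
      return true;
    } catch (EOFException | CorruptIndexException e) {
      return false;
    }
  }
}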
use of org.apache.lucene.store.IndexInput in project lucene-solr by apache.
the class CompressingTermVectorsWriter method merge.
@Override
public int merge(MergeState mergeState) throws IOException {
if (mergeState.needsIndexSort) {
// TODO: can we gain back some of these optimizations even if the index is sorted?
// E.g. if the sort results in large chunks of contiguous docs from one sub being copied over...?
return super.merge(mergeState);
}
int docCount = 0;
int numReaders = mergeState.maxDocs.length;
MatchingReaders matching = new MatchingReaders(mergeState);
for (int readerIndex = 0; readerIndex < numReaders; readerIndex++) {
CompressingTermVectorsReader matchingVectorsReader = null;
final TermVectorsReader vectorsReader = mergeState.termVectorsReaders[readerIndex];
if (matching.matchingReaders[readerIndex]) {
// we can only bulk-copy if the matching reader is also a CompressingTermVectorsReader
if (vectorsReader != null && vectorsReader instanceof CompressingTermVectorsReader) {
matchingVectorsReader = (CompressingTermVectorsReader) vectorsReader;
}
}
final int maxDoc = mergeState.maxDocs[readerIndex];
final Bits liveDocs = mergeState.liveDocs[readerIndex];
if (matchingVectorsReader != null &&
    matchingVectorsReader.getCompressionMode() == compressionMode &&
    matchingVectorsReader.getChunkSize() == chunkSize &&
    matchingVectorsReader.getVersion() == VERSION_CURRENT &&
    matchingVectorsReader.getPackedIntsVersion() == PackedInts.VERSION_CURRENT &&
    BULK_MERGE_ENABLED &&
    liveDocs == null &&
    !tooDirty(matchingVectorsReader)) {
// optimized merge, raw byte copy
// it's not worth fine-graining this if there are deletions.
matchingVectorsReader.checkIntegrity();
// flush any pending chunks
if (!pendingDocs.isEmpty()) {
flush();
// incomplete: we had to force this flush
numDirtyChunks++;
}
// iterate over each chunk. we use the vectors index to find chunk boundaries,
// read the docstart + doccount from the chunk header (we write a new header, since doc numbers will change),
// and just copy the bytes directly.
IndexInput rawDocs = matchingVectorsReader.getVectorsStream();
CompressingStoredFieldsIndexReader index = matchingVectorsReader.getIndexReader();
rawDocs.seek(index.getStartPointer(0));
int docID = 0;
while (docID < maxDoc) {
// read header
int base = rawDocs.readVInt();
if (base != docID) {
throw new CorruptIndexException("invalid state: base=" + base + ", docID=" + docID, rawDocs);
}
int bufferedDocs = rawDocs.readVInt();
// write a new index entry and new header for this chunk.
indexWriter.writeIndex(bufferedDocs, vectorsStream.getFilePointer());
// rebase
vectorsStream.writeVInt(docCount);
vectorsStream.writeVInt(bufferedDocs);
docID += bufferedDocs;
docCount += bufferedDocs;
numDocs += bufferedDocs;
if (docID > maxDoc) {
throw new CorruptIndexException("invalid state: base=" + base + ", count=" + bufferedDocs + ", maxDoc=" + maxDoc, rawDocs);
}
// copy bytes until the next chunk boundary (or end of chunk data).
// using the stored fields index for this isn't the most efficient, but fast enough
// and is a source of redundancy for detecting bad things.
final long end;
if (docID == maxDoc) {
end = matchingVectorsReader.getMaxPointer();
} else {
end = index.getStartPointer(docID);
}
vectorsStream.copyBytes(rawDocs, end - rawDocs.getFilePointer());
}
if (rawDocs.getFilePointer() != matchingVectorsReader.getMaxPointer()) {
throw new CorruptIndexException("invalid state: pos=" + rawDocs.getFilePointer() + ", max=" + matchingVectorsReader.getMaxPointer(), rawDocs);
}
// since we bulk merged all chunks, we inherit any dirty ones from this segment.
numChunks += matchingVectorsReader.getNumChunks();
numDirtyChunks += matchingVectorsReader.getNumDirtyChunks();
} else {
// naive merge...
if (vectorsReader != null) {
vectorsReader.checkIntegrity();
}
for (int i = 0; i < maxDoc; i++) {
if (liveDocs != null && liveDocs.get(i) == false) {
continue;
}
Fields vectors;
if (vectorsReader == null) {
vectors = null;
} else {
vectors = vectorsReader.get(i);
}
addAllDocVectors(vectors, mergeState);
++docCount;
}
}
}
finish(mergeState.mergeFieldInfos, docCount);
return docCount;
}
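The heart of the optimized path is the rebase-and-raw-copy of a single chunk; the following sketch isolates that step (the class name, the newBase parameter and the chunkEnd boundary are assumptions; in the real merge the boundary comes from the chunk index as shown above).
import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

// Hypothetical sketch of the bulk-copy step: rewrite one chunk header with a new
// doc base, then copy the compressed chunk payload verbatim up to the next boundary.
final class ChunkCopySketch {
  static int copyOneChunk(IndexInput rawDocs, IndexOutput out, int newBase, long chunkEnd) throws IOException {
    rawDocs.readVInt();                      // original doc base (discarded; doc numbers change on merge)
    int chunkDocCount = rawDocs.readVInt();  // number of docs buffered in this chunk
    out.writeVInt(newBase);                  // rebased doc base for the merged segment
    out.writeVInt(chunkDocCount);            // doc count is unchanged
    out.copyBytes(rawDocs, chunkEnd - rawDocs.getFilePointer());  // raw copy of the chunk body
    return chunkDocCount;
  }
}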
use of org.apache.lucene.store.IndexInput in project lucene-solr by apache.
the class HdfsDirectoryTest method assertInputsEquals.
private void assertInputsEquals(String name, Directory fsDir, HdfsDirectory hdfs) throws IOException {
int reads = random.nextInt(MAX_NUMBER_OF_READS);
IndexInput fsInput = fsDir.openInput(name, new IOContext());
IndexInput hdfsInput = hdfs.openInput(name, new IOContext());
assertEquals(fsInput.length(), hdfsInput.length());
int fileLength = (int) fsInput.length();
for (int i = 0; i < reads; i++) {
// pick a random buffer size in [MIN_BUFFER_SIZE, MIN_BUFFER_SIZE + nextInt)
int nextInt = Math.min(MAX_BUFFER_SIZE - MIN_BUFFER_SIZE, fileLength);
byte[] fsBuf = new byte[random.nextInt(nextInt > 0 ? nextInt : 1) + MIN_BUFFER_SIZE];
byte[] hdfsBuf = new byte[fsBuf.length];
// pick a random offset into the buffer and a read length that fits after it
int offset = random.nextInt(fsBuf.length);
nextInt = fsBuf.length - offset;
int length = random.nextInt(nextInt > 0 ? nextInt : 1);
// pick a random file position so the read stays within the file
nextInt = fileLength - length;
int pos = random.nextInt(nextInt > 0 ? nextInt : 1);
fsInput.seek(pos);
fsInput.readBytes(fsBuf, offset, length);
hdfsInput.seek(pos);
hdfsInput.readBytes(hdfsBuf, offset, length);
// compare only the bytes that were actually read into [offset, offset + length)
for (int f = offset; f < offset + length; f++) {
if (fsBuf[f] != hdfsBuf[f]) {
fail("mismatch at buffer index " + f + " (file position " + pos + ")");
}
}
}
fsInput.close();
hdfsInput.close();
}
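If randomized spot checks are not enough, a full comparison is straightforward; here is a minimal sketch (a hypothetical helper, not part of HdfsDirectoryTest) that compares two IndexInputs byte by byte.
import java.io.IOException;
import org.apache.lucene.store.IndexInput;

// Hypothetical exhaustive check: verify two inputs have identical length and content.
final class InputCompareSketch {
  static void assertSameContent(IndexInput a, IndexInput b) throws IOException {
    if (a.length() != b.length()) {
      throw new AssertionError("lengths differ: " + a.length() + " vs " + b.length());
    }
    a.seek(0);
    b.seek(0);
    for (long i = 0; i < a.length(); i++) {
      if (a.readByte() != b.readByte()) {
        throw new AssertionError("byte mismatch at position " + i);
      }
    }
  }
}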
use of org.apache.lucene.store.IndexInput in project lucene-solr by apache.
the class HdfsDirectoryTest method testEof.
private void testEof(String name, Directory directory, long length) throws IOException {
IndexInput input = directory.openInput(name, new IOContext());
input.seek(length);
try {
input.readByte();
fail("should throw eof");
} catch (IOException e) {
// expected: reading past the end of the file must fail
}
}
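An alternative sketch of the same check, assuming the method lives in a class extending LuceneTestCase so expectThrows is in scope; whether the concrete exception is an EOFException or some other IOException depends on the Directory implementation, so the broader type is used here.
// Sketch only: try-with-resources plus expectThrows instead of an empty catch block.
private void testEofAlternative(String name, Directory directory, long length) throws IOException {
  try (IndexInput input = directory.openInput(name, new IOContext())) {
    input.seek(length);
    expectThrows(IOException.class, input::readByte);
  }
}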
use of org.apache.lucene.store.IndexInput in project lucene-solr by apache.
the class HdfsDirectoryTest method testRename.
public void testRename() throws IOException {
String[] listAll = directory.listAll();
for (String file : listAll) {
directory.deleteFile(file);
}
IndexOutput output = directory.createOutput("testing.test", new IOContext());
output.writeInt(12345);
output.close();
directory.rename("testing.test", "testing.test.renamed");
assertFalse(slowFileExists(directory, "testing.test"));
assertTrue(slowFileExists(directory, "testing.test.renamed"));
IndexInput input = directory.openInput("testing.test.renamed", new IOContext());
assertEquals(12345, input.readInt());
assertEquals(input.getFilePointer(), input.length());
input.close();
directory.deleteFile("testing.test.renamed");
assertFalse(slowFileExists(directory, "testing.test.renamed"));
}
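slowFileExists is a LuceneTestCase helper; conceptually it just attempts to open the file, as in this rough sketch (not the verbatim implementation; the class name is an assumption).
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.NoSuchFileException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;

// Rough sketch of a slowFileExists-style check: successfully opening the file proves it exists.
final class FileExistsSketch {
  static boolean slowFileExists(Directory dir, String fileName) throws IOException {
    try {
      dir.openInput(fileName, IOContext.DEFAULT).close();
      return true;
    } catch (FileNotFoundException | NoSuchFileException e) {
      return false;
    }
  }
}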