Use of org.apache.lucene.store.IndexInput in project lucene-solr by apache.
From the class BlockDirectoryTest, method assertInputsEquals.
private void assertInputsEquals(String name, Directory fsDir, Directory hdfs) throws IOException {
  int reads = random.nextInt(MAX_NUMBER_OF_READS);
  IndexInput fsInput = fsDir.openInput(name, new IOContext());
  IndexInput hdfsInput = hdfs.openInput(name, new IOContext());
  assertEquals(fsInput.length(), hdfsInput.length());
  int fileLength = (int) fsInput.length();
  for (int i = 0; i < reads; i++) {
    int rnd;
    if (fileLength == 0) {
      rnd = 0;
    } else {
      rnd = random.nextInt(Math.min(MAX_BUFFER_SIZE - MIN_BUFFER_SIZE, fileLength));
    }
    byte[] fsBuf = new byte[rnd + MIN_BUFFER_SIZE];
    byte[] hdfsBuf = new byte[fsBuf.length];
    int offset = random.nextInt(fsBuf.length);
    int length = random.nextInt(fsBuf.length - offset);
    int pos;
    if (fileLength == 0) {
      pos = 0;
    } else {
      pos = random.nextInt(fileLength - length);
    }
    fsInput.seek(pos);
    fsInput.readBytes(fsBuf, offset, length);
    hdfsInput.seek(pos);
    hdfsInput.readBytes(hdfsBuf, offset, length);
    for (int f = offset; f < length; f++) {
      if (fsBuf[f] != hdfsBuf[f]) {
        fail("read [" + i + "]");
      }
    }
  }
  fsInput.close();
  hdfsInput.close();
}
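The test above only exercises random positional reads against two directories at once. For readers less familiar with the Directory abstraction, here is a minimal, self-contained sketch of the same open/seek/readBytes pattern against a plain FSDirectory; the path, file name, and sizes are made up for illustration and are not part of the test.

import java.nio.file.Paths;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

public class IndexInputReadSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical directory path and file name, for illustration only.
    try (Directory dir = FSDirectory.open(Paths.get("/tmp/lucene-demo"))) {
      // Write a small file so there is something to read back.
      try (IndexOutput out = dir.createOutput("demo.bin", IOContext.DEFAULT)) {
        byte[] payload = new byte[1024];
        out.writeBytes(payload, payload.length);
      }
      // Open it again and do a positional read, as the test does.
      try (IndexInput in = dir.openInput("demo.bin", IOContext.DEFAULT)) {
        byte[] buf = new byte[128];
        in.seek(256);                      // absolute position within the file
        in.readBytes(buf, 0, buf.length);  // fill buf[0..127] from that position
      }
    }
  }
}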
Use of org.apache.lucene.store.IndexInput in project jackrabbit-oak by apache.
From the class LuceneBlobCacheTest, method assertWrites.
byte[] assertWrites(Directory dir, int blobSize) throws IOException {
  byte[] data = randomBytes(blobSize);
  IndexOutput o = dir.createOutput("test", IOContext.DEFAULT);
  o.writeBytes(data, data.length);
  o.close();
  IndexInput i = dir.openInput("test", IOContext.DEFAULT);
  assertEquals(blobSize, i.length());
  byte[] result = new byte[blobSize];
  i.readBytes(result, 0, result.length);
  assertTrue(Arrays.equals(data, result));
  // Load again to see if cached
  i = dir.openInput("test", IOContext.DEFAULT);
  assertEquals(blobSize, i.length());
  result = new byte[blobSize];
  i.readBytes(result, 0, result.length);
  assertTrue(Arrays.equals(data, result));
  assertEquals(1, fileDataStore.count);
  return data;
}
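The helpers randomBytes and fileDataStore belong to the Oak test class and are not shown here. As a hedged companion sketch, the same write-then-verify round trip can be expressed against any Lucene Directory; the class and method names below are illustrative, not part of the Oak test.

import java.util.Arrays;
import java.util.Random;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

final class RoundTripCheck {
  // Writes random bytes to `name` in `dir` and verifies a full read-back,
  // roughly mirroring the Oak test above. Directory construction is left to the caller.
  static void writeAndVerify(Directory dir, String name, int size) throws Exception {
    byte[] data = new byte[size];
    new Random(42).nextBytes(data);
    try (IndexOutput out = dir.createOutput(name, IOContext.DEFAULT)) {
      out.writeBytes(data, data.length);
    }
    try (IndexInput in = dir.openInput(name, IOContext.DEFAULT)) {
      if (in.length() != size) {
        throw new AssertionError("unexpected length: " + in.length());
      }
      byte[] read = new byte[size];
      in.readBytes(read, 0, read.length);
      if (!Arrays.equals(data, read)) {
        throw new AssertionError("data mismatch for " + name);
      }
    }
  }
}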
Use of org.apache.lucene.store.IndexInput in project jackrabbit-oak by apache.
From the class OakDirectoryTest, method largeFile.
@Test
public void largeFile() throws Exception {
  FileStore store = FileStoreBuilder.fileStoreBuilder(tempFolder.getRoot())
      .withMemoryMapping(false)
      .withBlobStore(new BlackHoleBlobStore())
      .build();
  SegmentNodeStore nodeStore = SegmentNodeStoreBuilders.builder(store).build();
  IndexDefinition defn = new IndexDefinition(INITIAL_CONTENT, EmptyNodeState.EMPTY_NODE, "/foo");
  Directory directory = new OakDirectory(nodeStore.getRoot().builder(), defn, false);
  long expectedSize = ONE_GB * 2 + ONE_MB;
  String fileName = "test";
  writeFile(directory, fileName, expectedSize);
  assertEquals(expectedSize, directory.fileLength(fileName));
  IndexInput input = directory.openInput(fileName, IOContext.DEFAULT);
  readInputToEnd(expectedSize, input);
  store.close();
}
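writeFile and readInputToEnd are private helpers of OakDirectoryTest and are not reproduced above. A read-to-end check over an IndexInput might look roughly like the sketch below (hypothetical stand-in, chunk size chosen arbitrarily); the point is that IndexInput.length() and readBytes let you stream a multi-gigabyte file without ever holding it in memory.

import java.io.IOException;
import org.apache.lucene.store.IndexInput;

final class ReadToEnd {
  // Hypothetical stand-in for the test's readInputToEnd helper: stream the
  // whole input in fixed-size chunks and confirm the byte count matches.
  static void readInputToEnd(long expectedSize, IndexInput input) throws IOException {
    byte[] chunk = new byte[1024 * 1024];   // 1 MB per read, arbitrary
    long remaining = input.length();
    if (remaining != expectedSize) {
      throw new AssertionError("length mismatch: " + remaining + " vs " + expectedSize);
    }
    while (remaining > 0) {
      int toRead = (int) Math.min(chunk.length, remaining);
      input.readBytes(chunk, 0, toRead);    // advances the file pointer
      remaining -= toRead;
    }
    input.close();
  }
}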
Use of org.apache.lucene.store.IndexInput in project lucene-solr by apache.
From the class SimpleTextDocValuesReader, method getSortedSet.
@Override
public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
  final OneField field = fields.get(fieldInfo.name);
  // valid:
  assert field != null;
  final IndexInput in = data.clone();
  final BytesRefBuilder scratch = new BytesRefBuilder();
  final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
  return new SortedSetDocValues() {

    String[] currentOrds = new String[0];
    int currentIndex = 0;
    final BytesRefBuilder term = new BytesRefBuilder();
    int doc = -1;

    @Override
    public int nextDoc() throws IOException {
      return advance(doc + 1);
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public long cost() {
      return maxDoc;
    }

    @Override
    public int advance(int target) throws IOException {
      for (int i = target; i < maxDoc; ++i) {
        in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + i * (1 + field.ordPattern.length()));
        SimpleTextUtil.readLine(in, scratch);
        String ordList = scratch.get().utf8ToString().trim();
        if (ordList.isEmpty() == false) {
          currentOrds = ordList.split(",");
          currentIndex = 0;
          return doc = i;
        }
      }
      return doc = NO_MORE_DOCS;
    }

    @Override
    public boolean advanceExact(int target) throws IOException {
      in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + target * (1 + field.ordPattern.length()));
      SimpleTextUtil.readLine(in, scratch);
      String ordList = scratch.get().utf8ToString().trim();
      doc = target;
      if (ordList.isEmpty() == false) {
        currentOrds = ordList.split(",");
        currentIndex = 0;
        return true;
      }
      return false;
    }

    @Override
    public long nextOrd() throws IOException {
      if (currentIndex == currentOrds.length) {
        return NO_MORE_ORDS;
      } else {
        return Long.parseLong(currentOrds[currentIndex++]);
      }
    }

    @Override
    public BytesRef lookupOrd(long ord) throws IOException {
      if (ord < 0 || ord >= field.numValues) {
        throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues - 1) + "; got " + ord);
      }
      in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
      SimpleTextUtil.readLine(in, scratch);
      assert StringHelper.startsWith(scratch.get(), LENGTH) : "got " + scratch.get().utf8ToString() + " in=" + in;
      int len;
      try {
        len = decoder.parse(new String(scratch.bytes(), LENGTH.length, scratch.length() - LENGTH.length, StandardCharsets.UTF_8)).intValue();
      } catch (ParseException pe) {
        throw new CorruptIndexException("failed to parse int length", in, pe);
      }
      term.grow(len);
      term.setLength(len);
      in.readBytes(term.bytes(), 0, len);
      return term.get();
    }

    @Override
    public long getValueCount() {
      return field.numValues;
    }
  };
}
Use of org.apache.lucene.store.IndexInput in project lucene-solr by apache.
From the class CompressingStoredFieldsWriter, method merge.
@Override
public int merge(MergeState mergeState) throws IOException {
  int docCount = 0;
  int numReaders = mergeState.maxDocs.length;
  MatchingReaders matching = new MatchingReaders(mergeState);
  if (mergeState.needsIndexSort) {
    /**
     * If all readers are compressed and they have the same fieldinfos then we can merge the serialized document
     * directly.
     */
    List<CompressingStoredFieldsMergeSub> subs = new ArrayList<>();
    for (int i = 0; i < mergeState.storedFieldsReaders.length; i++) {
      if (matching.matchingReaders[i] && mergeState.storedFieldsReaders[i] instanceof CompressingStoredFieldsReader) {
        CompressingStoredFieldsReader storedFieldsReader = (CompressingStoredFieldsReader) mergeState.storedFieldsReaders[i];
        storedFieldsReader.checkIntegrity();
        subs.add(new CompressingStoredFieldsMergeSub(storedFieldsReader, mergeState.docMaps[i], mergeState.maxDocs[i]));
      } else {
        return super.merge(mergeState);
      }
    }
    final DocIDMerger<CompressingStoredFieldsMergeSub> docIDMerger = DocIDMerger.of(subs, true);
    while (true) {
      CompressingStoredFieldsMergeSub sub = docIDMerger.next();
      if (sub == null) {
        break;
      }
      assert sub.mappedDocID == docCount;
      SerializedDocument doc = sub.reader.document(sub.docID);
      startDocument();
      bufferedDocs.copyBytes(doc.in, doc.length);
      numStoredFieldsInDoc = doc.numStoredFields;
      finishDocument();
      ++docCount;
    }
    finish(mergeState.mergeFieldInfos, docCount);
    return docCount;
  }
  for (int readerIndex = 0; readerIndex < numReaders; readerIndex++) {
    MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
    CompressingStoredFieldsReader matchingFieldsReader = null;
    if (matching.matchingReaders[readerIndex]) {
      final StoredFieldsReader fieldsReader = mergeState.storedFieldsReaders[readerIndex];
      // we can only bulk-copy if the matching reader is also a CompressingStoredFieldsReader
      if (fieldsReader != null && fieldsReader instanceof CompressingStoredFieldsReader) {
        matchingFieldsReader = (CompressingStoredFieldsReader) fieldsReader;
      }
    }
    final int maxDoc = mergeState.maxDocs[readerIndex];
    final Bits liveDocs = mergeState.liveDocs[readerIndex];
    // if its some other format, or an older version of this format, or safety switch:
    if (matchingFieldsReader == null || matchingFieldsReader.getVersion() != VERSION_CURRENT || BULK_MERGE_ENABLED == false) {
      // naive merge...
      StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[readerIndex];
      if (storedFieldsReader != null) {
        storedFieldsReader.checkIntegrity();
      }
      for (int docID = 0; docID < maxDoc; docID++) {
        if (liveDocs != null && liveDocs.get(docID) == false) {
          continue;
        }
        startDocument();
        storedFieldsReader.visitDocument(docID, visitor);
        finishDocument();
        ++docCount;
      }
    } else if (matchingFieldsReader.getCompressionMode() == compressionMode
        && matchingFieldsReader.getChunkSize() == chunkSize
        && matchingFieldsReader.getPackedIntsVersion() == PackedInts.VERSION_CURRENT
        && liveDocs == null
        && !tooDirty(matchingFieldsReader)) {
      // if the format is older, its always handled by the naive merge case above
      assert matchingFieldsReader.getVersion() == VERSION_CURRENT;
      matchingFieldsReader.checkIntegrity();
      // flush any pending chunks
      if (numBufferedDocs > 0) {
        flush();
        // incomplete: we had to force this flush
        numDirtyChunks++;
      }
      // iterate over each chunk. we use the stored fields index to find chunk boundaries,
      // read the docstart + doccount from the chunk header (we write a new header, since doc numbers will change),
      // and just copy the bytes directly.
      IndexInput rawDocs = matchingFieldsReader.getFieldsStream();
      CompressingStoredFieldsIndexReader index = matchingFieldsReader.getIndexReader();
      rawDocs.seek(index.getStartPointer(0));
      int docID = 0;
      while (docID < maxDoc) {
        // read header
        int base = rawDocs.readVInt();
        if (base != docID) {
          throw new CorruptIndexException("invalid state: base=" + base + ", docID=" + docID, rawDocs);
        }
        int code = rawDocs.readVInt();
        // write a new index entry and new header for this chunk.
        int bufferedDocs = code >>> 1;
        indexWriter.writeIndex(bufferedDocs, fieldsStream.getFilePointer());
        // rebase
        fieldsStream.writeVInt(docBase);
        fieldsStream.writeVInt(code);
        docID += bufferedDocs;
        docBase += bufferedDocs;
        docCount += bufferedDocs;
        if (docID > maxDoc) {
          throw new CorruptIndexException("invalid state: base=" + base + ", count=" + bufferedDocs + ", maxDoc=" + maxDoc, rawDocs);
        }
        // copy bytes until the next chunk boundary (or end of chunk data).
        // using the stored fields index for this isn't the most efficient, but fast enough
        // and is a source of redundancy for detecting bad things.
        final long end;
        if (docID == maxDoc) {
          end = matchingFieldsReader.getMaxPointer();
        } else {
          end = index.getStartPointer(docID);
        }
        fieldsStream.copyBytes(rawDocs, end - rawDocs.getFilePointer());
      }
      if (rawDocs.getFilePointer() != matchingFieldsReader.getMaxPointer()) {
        throw new CorruptIndexException("invalid state: pos=" + rawDocs.getFilePointer() + ", max=" + matchingFieldsReader.getMaxPointer(), rawDocs);
      }
      // since we bulk merged all chunks, we inherit any dirty ones from this segment.
      numChunks += matchingFieldsReader.getNumChunks();
      numDirtyChunks += matchingFieldsReader.getNumDirtyChunks();
    } else {
      // if the format is older, its always handled by the naive merge case above
      assert matchingFieldsReader.getVersion() == VERSION_CURRENT;
      matchingFieldsReader.checkIntegrity();
      for (int docID = 0; docID < maxDoc; docID++) {
        if (liveDocs != null && liveDocs.get(docID) == false) {
          continue;
        }
        SerializedDocument doc = matchingFieldsReader.document(docID);
        startDocument();
        bufferedDocs.copyBytes(doc.in, doc.length);
        numStoredFieldsInDoc = doc.numStoredFields;
        finishDocument();
        ++docCount;
      }
    }
  }
  finish(mergeState.mergeFieldInfos, docCount);
  return docCount;
}
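The bulk-merge path above boils down to one IndexInput-to-IndexOutput idiom: position the raw input at a chunk boundary and stream bytes across with copyBytes, rewriting only the small per-chunk header. Outside a codec, the same raw-copy idiom is how a whole file can be cloned between Directory instances (Directory.copyFrom does essentially this for you); a hedged sketch with illustrative names:

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

final class RawCopy {
  // Copies `fileName` from `src` to `dst` by streaming raw bytes,
  // the same IndexInput -> IndexOutput pattern used in the bulk merge above.
  static void copyFile(Directory src, Directory dst, String fileName) throws Exception {
    try (IndexInput in = src.openInput(fileName, IOContext.DEFAULT);
         IndexOutput out = dst.createOutput(fileName, IOContext.DEFAULT)) {
      out.copyBytes(in, in.length());   // DataOutput.copyBytes(DataInput, long)
    }
  }
}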