use of org.apache.lucene.store.IndexOutput in project elasticsearch by elastic.
the class StoreTests method testCanReadOldCorruptionMarker.
/**
 * Checks that {@link Store#failIfCorrupted()} copes with corruption markers written in different
 * format versions: the stack-trace-aware version, the initial version (message only), and
 * versions below / above the supported range, which must be reported as too old / too new.
 */
public void testCanReadOldCorruptionMarker() throws IOException {
    final ShardId shardId = new ShardId("index", "_na_", 1);
    // an in-memory directory is used so that virus scanners cannot interfere with the marker files
    final Directory ramDir = new RAMDirectory();
    DirectoryService service = new DirectoryService(shardId, INDEX_SETTINGS) {
        @Override
        public Directory newDirectory() throws IOException {
            return ramDir;
        }
    };
    Store store = new Store(shardId, INDEX_SETTINGS, service, new DummyShardLock(shardId));
    CorruptIndexException cause = new CorruptIndexException("foo", "bar");
    String markerName = Store.CORRUPTED + UUIDs.randomBase64UUID();
    final String expectedPrefix =
        "[index][1] Preexisting corrupted index [" + markerName + "] caused by: CorruptIndexException[foo (resource=bar)]";

    // 1) marker in the format that also carries the stack trace
    try (IndexOutput marker = ramDir.createOutput(markerName, IOContext.DEFAULT)) {
        CodecUtil.writeHeader(marker, Store.CODEC, Store.VERSION_STACK_TRACE);
        marker.writeString(ExceptionsHelper.detailedMessage(cause));
        marker.writeString(ExceptionsHelper.stackTrace(cause));
        CodecUtil.writeFooter(marker);
    }
    try {
        store.failIfCorrupted();
        fail("should be corrupted");
    } catch (CorruptIndexException corrupted) {
        assertTrue(corrupted.getMessage().startsWith(expectedPrefix));
        assertTrue(corrupted.getMessage().contains(ExceptionsHelper.stackTrace(cause)));
    }
    store.removeCorruptionMarker();

    // 2) marker in the initial format: detailed message only, no stack trace
    try (IndexOutput marker = ramDir.createOutput(markerName, IOContext.DEFAULT)) {
        CodecUtil.writeHeader(marker, Store.CODEC, Store.VERSION_START);
        marker.writeString(ExceptionsHelper.detailedMessage(cause));
        CodecUtil.writeFooter(marker);
    }
    try {
        store.failIfCorrupted();
        fail("should be corrupted");
    } catch (CorruptIndexException corrupted) {
        assertTrue(corrupted.getMessage().startsWith(expectedPrefix));
        assertFalse(corrupted.getMessage().contains(ExceptionsHelper.stackTrace(cause)));
    }
    store.removeCorruptionMarker();

    // 3) header version below the first supported one
    try (IndexOutput marker = ramDir.createOutput(markerName, IOContext.DEFAULT)) {
        CodecUtil.writeHeader(marker, Store.CODEC, Store.VERSION_START - 1);
        CodecUtil.writeFooter(marker);
    }
    try {
        store.failIfCorrupted();
        fail("should be too old");
    } catch (IndexFormatTooOldException expected) {
        // expected: the marker version predates VERSION_START
    }
    store.removeCorruptionMarker();

    // 4) header version above the current one
    try (IndexOutput marker = ramDir.createOutput(markerName, IOContext.DEFAULT)) {
        CodecUtil.writeHeader(marker, Store.CODEC, Store.VERSION + 1);
        CodecUtil.writeFooter(marker);
    }
    try {
        store.failIfCorrupted();
        fail("should be too new");
    } catch (IndexFormatTooNewException expected) {
        // expected: the marker version is newer than VERSION
    }
    store.close();
}
use of org.apache.lucene.store.IndexOutput in project elasticsearch by elastic.
the class StoreTests method testChecksumCorrupted.
/**
 * Writes a file whose trailing checksum is deliberately wrong, copies it through a
 * {@code Store.LuceneVerifyingIndexOutput} and expects the copy to fail while the final
 * eight checksum bytes are being written.
 */
public void testChecksumCorrupted() throws IOException {
    Directory dir = newDirectory();
    IndexOutput output = dir.createOutput("foo.bar", IOContext.DEFAULT);
    final int chunkCount = scaledRandomIntBetween(10, 100);
    for (int chunk = 0; chunk < chunkCount; chunk++) {
        BytesRef content = new BytesRef(TestUtil.randomRealisticUnicodeString(random(), 10, 1024));
        output.writeBytes(content.bytes, content.offset, content.length);
    }
    // the footer is written by hand so that a wrong checksum can be stored in it
    output.writeInt(CodecUtil.FOOTER_MAGIC);
    output.writeInt(0);
    String checksum = Store.digestToString(output.getChecksum());
    // store a checksum that is off by one
    output.writeLong(output.getChecksum() + 1);
    output.close();

    IndexInput indexInput = dir.openInput("foo.bar", IOContext.DEFAULT);
    indexInput.seek(0);
    BytesRef copyBuffer = new BytesRef(scaledRandomIntBetween(1, 1024));
    final long fileLength = indexInput.length();
    IndexOutput verifyingOutput = new Store.LuceneVerifyingIndexOutput(
        new StoreFileMetaData("foo1.bar", fileLength, checksum),
        dir.createOutput("foo1.bar", IOContext.DEFAULT));

    // copy everything except the last 8 bytes (the checksum); those are written in the try block below
    long remaining = fileLength - 8;
    while (remaining > 0) {
        if (random().nextInt(10) != 0) {
            // bulk copy of up to one buffer's worth of bytes
            int toCopy = (int) Math.min(remaining, copyBuffer.bytes.length);
            indexInput.readBytes(copyBuffer.bytes, copyBuffer.offset, toCopy);
            verifyingOutput.writeBytes(copyBuffer.bytes, copyBuffer.offset, toCopy);
            remaining -= toCopy;
        } else {
            // occasionally copy a single byte to exercise the byte-wise path
            verifyingOutput.writeByte(indexInput.readByte());
            remaining--;
        }
    }

    try {
        BytesRef storedChecksum = new BytesRef(8);
        storedChecksum.length = 8;
        indexInput.readBytes(storedChecksum.bytes, storedChecksum.offset, storedChecksum.length);
        if (randomBoolean()) {
            verifyingOutput.writeBytes(storedChecksum.bytes, storedChecksum.offset, storedChecksum.length);
        } else {
            for (int pos = 0; pos < storedChecksum.length; pos++) {
                verifyingOutput.writeByte(storedChecksum.bytes[pos]);
            }
        }
        fail("should be a corrupted index");
    } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException expected) {
        // expected: the wrong checksum was detected
    }
    IOUtils.close(indexInput, verifyingOutput, dir);
}
use of org.apache.lucene.store.IndexOutput in project elasticsearch by elastic.
the class StoreTests method testVerifyingIndexOutputWithBogusInput.
/**
 * Feeds purely random bytes into a {@code Store.LuceneVerifyingIndexOutput} whose metadata
 * declares an empty checksum and expects the output to report corruption before all declared
 * bytes have been accepted.
 */
public void testVerifyingIndexOutputWithBogusInput() throws IOException {
    Directory dir = newDirectory();
    final int declaredLength = scaledRandomIntBetween(10, 1024);
    IndexOutput verifyingOutput = new Store.LuceneVerifyingIndexOutput(
        new StoreFileMetaData("foo1.bar", declaredLength, ""),
        dir.createOutput("foo1.bar", IOContext.DEFAULT));
    try {
        for (int remaining = declaredLength; remaining > 0; remaining--) {
            verifyingOutput.writeByte((byte) random().nextInt());
        }
        fail("should be a corrupted index");
    } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException expected) {
        // expected: random data cannot form a valid footer
    }
    IOUtils.close(verifyingOutput, dir);
}
use of org.apache.lucene.store.IndexOutput in project elasticsearch by elastic.
the class XAnalyzingSuggester method build.
/**
 * Builds the suggester's FST from the entries supplied by {@code iterator}.
 *
 * <p>Every surface form is expanded into its analyzed finite strings. For each of them a record
 * (analyzed form, weight, surface form and, if present, payload) is written to a temporary file.
 * The file is then sorted offline and read back in order to feed the FST builder; duplicate
 * surface forms for the same analyzed form are skipped, and at most
 * {@code maxSurfaceFormsPerAnalyzedForm} entries are kept per analyzed form.
 *
 * @param iterator source of surface forms, weights and (optionally) payloads
 * @throws IOException propagated from the temporary-file I/O and the offline sort
 * @throws IllegalArgumentException if an analyzed form (or, when payloads are used, a surface
 *         form) is longer than {@code Short.MAX_VALUE - 2}, or if a surface form contains the
 *         reserved payload separator while payloads are used
 */
@Override
public void build(InputIterator iterator) throws IOException {
    // Refresh the flag BEFORE the comparator is created: the comparator is configured with it,
    // so it must describe this iterator rather than a value left over from construction or from
    // an earlier build.
    hasPayloads = iterator.hasPayloads();
    String prefix = getClass().getSimpleName();
    Directory tempDir = getTempDir();
    OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads));
    IndexOutput tempInput = tempDir.createTempOutput(prefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    BytesRefBuilder scratch = new BytesRefBuilder();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    String tempSortedFileName = null;
    count = 0;
    byte[] buffer = new byte[8];
    try {
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
            LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
            for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
                Util.toBytesRef(string, scratch);
                // length of the analyzed text (FST input)
                if (scratch.length() > Short.MAX_VALUE - 2) {
                    throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
                }
                short analyzedLength = (short) scratch.length();
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
                BytesRef payload;
                if (hasPayloads) {
                    if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
                        throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
                    }
                    payload = iterator.payload();
                    // payload + surfaceLength (short)
                    requiredLength += payload.length + 2;
                } else {
                    payload = null;
                }
                buffer = ArrayUtil.grow(buffer, requiredLength);
                output.reset(buffer);
                output.writeShort(analyzedLength);
                output.writeBytes(scratch.bytes(), 0, scratch.length());
                output.writeInt(encodeWeight(iterator.weight()));
                if (hasPayloads) {
                    // The valid bytes of a BytesRef start at its offset, so the scan for the
                    // reserved separator has to start there as well.
                    for (int i = 0; i < surfaceForm.length; i++) {
                        if (surfaceForm.bytes[surfaceForm.offset + i] == payloadSep) {
                            throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.writeShort((short) surfaceForm.length);
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                    output.writeBytes(payload.bytes, payload.offset, payload.length);
                } else {
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                }
                assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
                writer.write(buffer, 0, output.getPosition());
            }
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
        }
        writer.close();
        // Sort all input/output pairs (required by FST.Builder):
        tempSortedFileName = sorter.sort(tempInput.getName());
        // Free disk space:
        tempDir.deleteFile(tempInput.getName());
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), prefix);
        PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
        Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Build FST:
        BytesRefBuilder previousAnalyzed = null;
        BytesRefBuilder analyzed = new BytesRefBuilder();
        BytesRef surface = new BytesRef();
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one). We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        Set<BytesRef> seenSurfaceForms = new HashSet<>();
        int dedup = 0;
        while (true) {
            BytesRef bytes = reader.next();
            if (bytes == null) {
                break;
            }
            // Decode one sorted record: analyzedLength, analyzed bytes, weight, then the
            // surface form (length-prefixed and followed by the payload when payloads are used).
            input.reset(bytes.bytes, bytes.offset, bytes.length);
            short analyzedLength = input.readShort();
            // + 2 leaves room for the two bytes appended to the analyzed form further down
            analyzed.grow(analyzedLength + 2);
            input.readBytes(analyzed.bytes(), 0, analyzedLength);
            analyzed.setLength(analyzedLength);
            long cost = input.readInt();
            // surface is a view into the record's bytes; it is deep-copied wherever it is kept
            surface.bytes = bytes.bytes;
            if (hasPayloads) {
                surface.length = input.readShort();
                surface.offset = input.getPosition();
            } else {
                surface.offset = input.getPosition();
                surface.length = bytes.length - surface.offset;
            }
            if (previousAnalyzed == null) {
                previousAnalyzed = new BytesRefBuilder();
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else if (analyzed.get().equals(previousAnalyzed.get())) {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.contains(surface)) {
                    continue;
                }
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else {
                dedup = 0;
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.clear();
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            }
            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...
            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.append((byte) 0);
            analyzed.append((byte) dedup);
            Util.toIntsRef(analyzed.get(), scratchInts);
            if (!hasPayloads) {
                builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
            } else {
                // FST output is: surface bytes + separator + payload bytes
                int payloadOffset = input.getPosition() + surface.length;
                int payloadLength = bytes.length - payloadOffset;
                BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = (byte) payloadSep;
                System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
                br.length = br.bytes.length;
                builder.add(scratchInts.get(), outputs.newPair(cost, br));
            }
        }
        fst = builder.finish();
    } finally {
        // Best-effort cleanup on both the success and the failure path.
        IOUtils.closeWhileHandlingException(reader, writer);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
use of org.apache.lucene.store.IndexOutput in project elasticsearch by elastic.
the class InputStreamIndexInputTests method testSingleReadSingleByteLimit.
/**
 * Writes three bytes of value 1 followed by three bytes of value 2, then reads them back through
 * {@code InputStreamIndexInput} wrappers limited to a single byte each. Every wrapper must yield
 * exactly one byte and then signal end of stream; once the underlying input is exhausted a new
 * wrapper must report nothing left to read.
 */
public void testSingleReadSingleByteLimit() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    IndexOutput output = dir.createOutput("test", IOContext.DEFAULT);
    for (byte value = 1; value <= 2; value++) {
        for (int repeat = 0; repeat < 3; repeat++) {
            output.writeByte(value);
        }
    }
    output.close();

    IndexInput input = dir.openInput("test", IOContext.DEFAULT);
    for (int expectedValue = 1; expectedValue <= 2; expectedValue++) {
        for (int repeat = 0; repeat < 3; repeat++) {
            InputStreamIndexInput limited = new InputStreamIndexInput(input, 1);
            assertThat(input.getFilePointer(), lessThan(input.length()));
            assertThat(limited.actualSizeToRead(), equalTo(1L));
            assertThat(limited.read(), equalTo(expectedValue));
            // the one-byte limit is used up, so the stream reports end of data
            assertThat(limited.read(), equalTo(-1));
        }
    }

    // the underlying input is fully consumed now
    assertThat(input.getFilePointer(), equalTo(input.length()));
    InputStreamIndexInput exhausted = new InputStreamIndexInput(input, 1);
    assertThat(exhausted.actualSizeToRead(), equalTo(0L));
    assertThat(exhausted.read(), equalTo(-1));
}
Aggregations