Use of org.apache.lucene.store.IndexOutput in project lucene-solr by Apache.
The class TestOfflineSorter, method testOverNexting.
// OfflineSorter should not call my ByteSequencesReader.next() again after it already returned null:
public void testOverNexting() throws Exception {
  Directory dir = newDirectory();
  IndexOutput out = dir.createTempOutput("unsorted", "tmp", IOContext.DEFAULT);
  try (ByteSequencesWriter w = new OfflineSorter.ByteSequencesWriter(out)) {
    byte[] bytes = new byte[Integer.BYTES];
    random().nextBytes(bytes);
    w.write(bytes);
    CodecUtil.writeFooter(out);
  }

  new OfflineSorter(dir, "foo", OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(4), OfflineSorter.MAX_TEMPFILES, Integer.BYTES, null, 0) {
    @Override
    protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException {
      ByteSequencesReader other = super.getReader(in, name);
      return new ByteSequencesReader(in, name) {
        private boolean alreadyEnded;

        @Override
        public BytesRef next() throws IOException {
          // if we returned null already, OfflineSorter should not call next() again
          assertFalse(alreadyEnded);
          BytesRef result = other.next();
          if (result == null) {
            alreadyEnded = true;
          }
          return result;
        }

        @Override
        public void close() throws IOException {
          other.close();
        }
      };
    }
  }.sort(out.getName());

  dir.close();
}
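For context, the contract this test checks comes from OfflineSorter's read loop: once next() returns null, the sorter must never call it again. Below is a minimal sketch of the full write/sort/read round trip, assuming the same lucene-solr OfflineSorter API used above; the directory path and the "sketch" prefix are illustrative, and error handling is omitted:

// Write a few unsorted sequences, append a codec footer, sort, then read back in order.
Directory dir = FSDirectory.open(Paths.get("/tmp/sort-sketch"));
IndexOutput unsorted = dir.createTempOutput("unsorted", "tmp", IOContext.DEFAULT);
try (OfflineSorter.ByteSequencesWriter w = new OfflineSorter.ByteSequencesWriter(unsorted)) {
  w.write(new byte[] { 3 });
  w.write(new byte[] { 1 });
  w.write(new byte[] { 2 });
  CodecUtil.writeFooter(unsorted);
}
String sortedName = new OfflineSorter(dir, "sketch").sort(unsorted.getName());
try (OfflineSorter.ByteSequencesReader r = new OfflineSorter.ByteSequencesReader(
         dir.openChecksumInput(sortedName, IOContext.READONCE), sortedName)) {
  for (BytesRef b = r.next(); b != null; b = r.next()) {
    // b yields {1}, {2}, {3} in unsigned byte order; after the null return, a caller must stop.
  }
}
dir.close();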
Use of org.apache.lucene.store.IndexOutput in project lucene-solr by Apache.
The class AnalyzingSuggester, method build.
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  hasPayloads = iterator.hasPayloads();
  OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new AnalyzingComparator(hasPayloads));
  IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
  OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
  OfflineSorter.ByteSequencesReader reader = null;
  BytesRefBuilder scratch = new BytesRefBuilder();
  TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
  String tempSortedFileName = null;
  count = 0;
  byte[] buffer = new byte[8];
  try {
    ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
    for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
      LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
      for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
        Util.toBytesRef(string, scratch);
        // length of the analyzed text (FST input)
        if (scratch.length() > Short.MAX_VALUE - 2) {
          throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
        }
        short analyzedLength = (short) scratch.length();
        // compute the required length:
        // analyzed sequence + weight (4) + surface + analyzedLength (short)
        int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
        BytesRef payload;
        if (hasPayloads) {
          if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
            throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
          }
          payload = iterator.payload();
          // payload + surfaceLength (short)
          requiredLength += payload.length + 2;
        } else {
          payload = null;
        }
        buffer = ArrayUtil.grow(buffer, requiredLength);
        output.reset(buffer);
        output.writeShort(analyzedLength);
        output.writeBytes(scratch.bytes(), 0, scratch.length());
        output.writeInt(encodeWeight(iterator.weight()));
        if (hasPayloads) {
          for (int i = 0; i < surfaceForm.length; i++) {
            if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
              throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
            }
          }
          output.writeShort((short) surfaceForm.length);
          output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
          output.writeBytes(payload.bytes, payload.offset, payload.length);
        } else {
          output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
        }
        assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
        writer.write(buffer, 0, output.getPosition());
      }
      maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
    }
    CodecUtil.writeFooter(tempInput);
    writer.close();

    // Sort all input/output pairs (required by FST.Builder):
    tempSortedFileName = sorter.sort(tempInput.getName());

    // Free disk space:
    tempDir.deleteFile(tempInput.getName());

    reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
    PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
    Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

    // Build FST:
    BytesRefBuilder previousAnalyzed = null;
    BytesRefBuilder analyzed = new BytesRefBuilder();
    BytesRef surface = new BytesRef();
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    ByteArrayDataInput input = new ByteArrayDataInput();

    // Used to remove duplicate surface forms (but we
    // still index the highest-weight one). We clear
    // this when we see a new analyzed form, so it cannot
    // grow unbounded (at most 256 entries):
    Set<BytesRef> seenSurfaceForms = new HashSet<>();

    int dedup = 0;
    while (true) {
      BytesRef bytes = reader.next();
      if (bytes == null) {
        break;
      }
      input.reset(bytes.bytes, bytes.offset, bytes.length);
      short analyzedLength = input.readShort();
      analyzed.grow(analyzedLength + 2);
      input.readBytes(analyzed.bytes(), 0, analyzedLength);
      analyzed.setLength(analyzedLength);
      long cost = input.readInt();
      surface.bytes = bytes.bytes;
      if (hasPayloads) {
        surface.length = input.readShort();
        surface.offset = input.getPosition();
      } else {
        surface.offset = input.getPosition();
        surface.length = bytes.length - surface.offset;
      }
      if (previousAnalyzed == null) {
        previousAnalyzed = new BytesRefBuilder();
        previousAnalyzed.copyBytes(analyzed.get());
        seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
      } else if (analyzed.get().equals(previousAnalyzed.get())) {
        dedup++;
        if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
          // dups: skip the rest:
          continue;
        }
        if (seenSurfaceForms.contains(surface)) {
          continue;
        }
        seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
      } else {
        dedup = 0;
        previousAnalyzed.copyBytes(analyzed);
        seenSurfaceForms.clear();
        seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
      }
      // TODO: I think we can avoid the extra 2 bytes when
      // there is no dup (dedup==0), but we'd have to fix
      // the exactFirst logic ... which would be sort of
      // hairy because we'd need to special case the two
      // (dup/not dup)...
      // NOTE: must be byte 0 so we sort before whatever
      // is next
      analyzed.append((byte) 0);
      analyzed.append((byte) dedup);
      Util.toIntsRef(analyzed.get(), scratchInts);
      //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
      if (!hasPayloads) {
        builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
      } else {
        int payloadOffset = input.getPosition() + surface.length;
        int payloadLength = bytes.length - payloadOffset;
        BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
        System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
        br.bytes[surface.length] = PAYLOAD_SEP;
        System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
        br.length = br.bytes.length;
        builder.add(scratchInts.get(), outputs.newPair(cost, br));
      }
    }
    fst = builder.finish();
    //Util.dotToFile(fst, "/tmp/suggest.dot");
  } finally {
    IOUtils.closeWhileHandlingException(reader, writer);
    IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
  }
}
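The record layout that build() writes before sorting is implicit in the stream calls above; this small sketch encodes one no-payload entry the same way. The helper name encodeRecord is illustrative, not part of the suggester API:

// Layout: [short analyzedLength][analyzed bytes][int weight][surface bytes]
// (with payloads, a short surface length and the payload bytes follow instead).
static byte[] encodeRecord(BytesRef analyzed, int weight, BytesRef surface) throws IOException {
  byte[] buffer = new byte[2 + analyzed.length + 4 + surface.length];
  ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
  out.writeShort((short) analyzed.length);                        // analyzed length prefix
  out.writeBytes(analyzed.bytes, analyzed.offset, analyzed.length);
  out.writeInt(weight);                                           // encoded weight
  out.writeBytes(surface.bytes, surface.offset, surface.length);  // surface form (no-payload case)
  return buffer;
}

OfflineSorter then orders these raw records with AnalyzingComparator, which makes identical analyzed forms adjacent so the read loop can dedup surface forms and cap them at maxSurfaceFormsPerAnalyzedForm.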
Use of org.apache.lucene.store.IndexOutput in project lucene-solr by Apache.
The class IndexFetcher, method logReplicationTimeAndConfFiles.
/**
 * Helper method to record the last replication's details so that we can show them on the
 * statistics page across restarts.
 *
 * @throws IOException on IO error
 */
@SuppressForbidden(reason = "Need currentTimeMillis for debugging/stats")
private void logReplicationTimeAndConfFiles(Collection<Map<String, Object>> modifiedConfFiles, boolean successfulInstall) throws IOException {
  List<String> confFiles = new ArrayList<>();
  if (modifiedConfFiles != null && !modifiedConfFiles.isEmpty()) {
    for (Map<String, Object> map1 : modifiedConfFiles) {
      confFiles.add((String) map1.get(NAME));
    }
  }
  Properties props = replicationHandler.loadReplicationProperties();
  long replicationTime = System.currentTimeMillis();
  long replicationTimeTaken = getReplicationTimeElapsed();
  Directory dir = null;
  try {
    dir = solrCore.getDirectoryFactory().get(solrCore.getDataDir(), DirContext.META_DATA, solrCore.getSolrConfig().indexConfig.lockType);
    int indexCount = 1, confFilesCount = 1;
    if (props.containsKey(TIMES_INDEX_REPLICATED)) {
      indexCount = Integer.parseInt(props.getProperty(TIMES_INDEX_REPLICATED)) + 1;
    }
    StringBuilder sb = readToStringBuilder(replicationTime, props.getProperty(INDEX_REPLICATED_AT_LIST));
    props.setProperty(INDEX_REPLICATED_AT_LIST, sb.toString());
    props.setProperty(INDEX_REPLICATED_AT, String.valueOf(replicationTime));
    props.setProperty(PREVIOUS_CYCLE_TIME_TAKEN, String.valueOf(replicationTimeTaken));
    props.setProperty(TIMES_INDEX_REPLICATED, String.valueOf(indexCount));
    if (modifiedConfFiles != null && !modifiedConfFiles.isEmpty()) {
      props.setProperty(CONF_FILES_REPLICATED, confFiles.toString());
      props.setProperty(CONF_FILES_REPLICATED_AT, String.valueOf(replicationTime));
      if (props.containsKey(TIMES_CONFIG_REPLICATED)) {
        confFilesCount = Integer.parseInt(props.getProperty(TIMES_CONFIG_REPLICATED)) + 1;
      }
      props.setProperty(TIMES_CONFIG_REPLICATED, String.valueOf(confFilesCount));
    }
    props.setProperty(LAST_CYCLE_BYTES_DOWNLOADED, String.valueOf(getTotalBytesDownloaded()));
    if (!successfulInstall) {
      int numFailures = 1;
      if (props.containsKey(TIMES_FAILED)) {
        numFailures = Integer.parseInt(props.getProperty(TIMES_FAILED)) + 1;
      }
      props.setProperty(TIMES_FAILED, String.valueOf(numFailures));
      props.setProperty(REPLICATION_FAILED_AT, String.valueOf(replicationTime));
      sb = readToStringBuilder(replicationTime, props.getProperty(REPLICATION_FAILED_AT_LIST));
      props.setProperty(REPLICATION_FAILED_AT_LIST, sb.toString());
    }
    String tmpFileName = REPLICATION_PROPERTIES + "." + System.nanoTime();
    final IndexOutput out = dir.createOutput(tmpFileName, DirectoryFactory.IOCONTEXT_NO_CACHE);
    Writer outFile = new OutputStreamWriter(new PropertiesOutputStream(out), StandardCharsets.UTF_8);
    try {
      props.store(outFile, "Replication details");
      dir.sync(Collections.singleton(tmpFileName));
    } finally {
      IOUtils.closeQuietly(outFile);
    }
    solrCore.getDirectoryFactory().renameWithOverwrite(dir, tmpFileName, REPLICATION_PROPERTIES);
  } catch (Exception e) {
    LOG.warn("Exception while updating statistics", e);
  } finally {
    if (dir != null) {
      solrCore.getDirectoryFactory().release(dir);
    }
  }
}
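The temp-file, sync, rename sequence above is the usual crash-safe update pattern: the new contents become durable before they replace the old file, so a crash leaves either the old or the new version, never a partial one. A minimal sketch against the plain Lucene Directory API; Solr's renameWithOverwrite is approximated by Directory.rename here, and the path, file names, and contents are illustrative:

Directory dir = FSDirectory.open(Paths.get("/tmp/props-sketch"));
String tmpName = "replication.properties." + System.nanoTime();
byte[] payload = "indexReplicatedAt=0\n".getBytes(StandardCharsets.UTF_8);
try (IndexOutput out = dir.createOutput(tmpName, IOContext.DEFAULT)) {
  // the real code streams java.util.Properties through a PropertiesOutputStream instead
  out.writeBytes(payload, payload.length);
}
dir.sync(Collections.singleton(tmpName));       // make the temp file durable first
dir.rename(tmpName, "replication.properties");  // then swap it into place (renameFile in older Lucene releases)
dir.close();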
Use of org.apache.lucene.store.IndexOutput in project lucene-solr by Apache.
The class TestPackedInts, method testEndPointer.
public void testEndPointer() throws IOException {
  final Directory dir = newDirectory();
  final int valueCount = RandomNumbers.randomIntBetween(random(), 1, 1000);
  final IndexOutput out = dir.createOutput("tests.bin", newIOContext(random()));
  for (int i = 0; i < valueCount; ++i) {
    out.writeLong(0);
  }
  out.close();
  final IndexInput in = dir.openInput("tests.bin", newIOContext(random()));
  for (int version = PackedInts.VERSION_START; version <= PackedInts.VERSION_CURRENT; ++version) {
    for (int bpv = 1; bpv <= 64; ++bpv) {
      for (PackedInts.Format format : PackedInts.Format.values()) {
        if (!format.isSupported(bpv)) {
          continue;
        }
        final long byteCount = format.byteCount(version, valueCount, bpv);
        String msg = "format=" + format + ",version=" + version + ",valueCount=" + valueCount + ",bpv=" + bpv;
        // test iterator
        in.seek(0L);
        final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(in, format, version, valueCount, bpv, RandomNumbers.randomIntBetween(random(), 1, 1 << 16));
        for (int i = 0; i < valueCount; ++i) {
          it.next();
        }
        assertEquals(msg, byteCount, in.getFilePointer());
        // test direct reader
        in.seek(0L);
        final PackedInts.Reader directReader = PackedInts.getDirectReaderNoHeader(in, format, version, valueCount, bpv);
        directReader.get(valueCount - 1);
        assertEquals(msg, byteCount, in.getFilePointer());
        // test reader
        in.seek(0L);
        PackedInts.getReaderNoHeader(in, format, version, valueCount, bpv);
        assertEquals(msg, byteCount, in.getFilePointer());
      }
    }
  }
  in.close();
  dir.close();
}
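The test reads a file of zero longs, but the same end-pointer behavior applies to real data; here is a sketch of the no-header write path that getReaderNoHeader consumes, assuming the same PackedInts API exercised above (file name and parameters are illustrative):

Directory dir = new RAMDirectory();
int valueCount = 128, bpv = 7;
IndexOutput out = dir.createOutput("packed.bin", IOContext.DEFAULT);
PackedInts.Writer w = PackedInts.getWriterNoHeader(out, PackedInts.Format.PACKED, valueCount, bpv, PackedInts.DEFAULT_BUFFER_SIZE);
for (int i = 0; i < valueCount; ++i) {
  w.add(i);  // every value must fit in bpv bits
}
w.finish();
out.close();
IndexInput in = dir.openInput("packed.bin", IOContext.DEFAULT);
PackedInts.Reader r = PackedInts.getReaderNoHeader(in, PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, valueCount, bpv);
long v = r.get(42);  // == 42 here
in.close();
dir.close();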
Use of org.apache.lucene.store.IndexOutput in project lucene-solr by Apache.
The class TestPackedInts, method testSave.
public void testSave() throws IOException {
  final int valueCount = TestUtil.nextInt(random(), 1, 2048);
  for (int bpv = 1; bpv <= 64; ++bpv) {
    final int maxValue = (int) Math.min(PackedInts.maxValue(31), PackedInts.maxValue(bpv));
    final RAMDirectory directory = new RAMDirectory();
    List<PackedInts.Mutable> packedInts = createPackedInts(valueCount, bpv);
    for (PackedInts.Mutable mutable : packedInts) {
      for (int i = 0; i < mutable.size(); ++i) {
        mutable.set(i, random().nextInt(maxValue));
      }
      IndexOutput out = directory.createOutput("packed-ints.bin", IOContext.DEFAULT);
      mutable.save(out);
      out.close();
      IndexInput in = directory.openInput("packed-ints.bin", IOContext.DEFAULT);
      PackedInts.Reader reader = PackedInts.getReader(in);
      assertEquals(valueCount, reader.size());
      if (mutable instanceof Packed64SingleBlock) {
        // make sure that we used the right format so that the reader has
        // the same performance characteristics as the mutable that has been
        // serialized
        assertTrue(reader instanceof Packed64SingleBlock);
      } else {
        assertFalse(reader instanceof Packed64SingleBlock);
      }
      for (int i = 0; i < valueCount; ++i) {
        assertEquals(mutable.get(i), reader.get(i));
      }
      in.close();
      directory.deleteFile("packed-ints.bin");
    }
    directory.close();
  }
}
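Unlike the no-header variants in testEndPointer, save() writes codec, version, format, and bits-per-value into the stream, so getReader() needs no external metadata. A minimal sketch under the same assumptions (values and file name are illustrative):

PackedInts.Mutable m = PackedInts.getMutable(100, 5, PackedInts.COMPACT);
m.set(0, 31);  // 31 is the largest value that fits in 5 bits
RAMDirectory dir = new RAMDirectory();
IndexOutput out = dir.createOutput("m.bin", IOContext.DEFAULT);
m.save(out);   // header (codec, version, format, bpv) followed by the packed data
out.close();
IndexInput in = dir.openInput("m.bin", IOContext.DEFAULT);
PackedInts.Reader r = PackedInts.getReader(in);  // reads the header back, then the data
assert r.get(0) == 31;
in.close();
dir.close();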