use of org.apache.jackrabbit.oak.segment.file.tar.GCGeneration.newGCGeneration in project jackrabbit-oak by apache.
the class TarReader method mark.
/**
 * Mark the entries of this TAR file that can be reclaimed.
 * <p>
 * The entries are visited from the last one to the first one. For every
 * entry, the identifier of the segment is removed from {@code references}
 * and the {@link CleanupContext} is asked whether the segment should be
 * reclaimed. The context receives the segment identifier, the GC
 * generation of the entry, and a flag telling whether the identifier was
 * present in {@code references} at the time the entry was visited.
 * <ul>
 * <li>If the context answers {@code true}, the identifier is added to
 * {@code reclaimable}.</li>
 * <li>Otherwise the segment is kept, and every segment it references
 * according to the graph of this TAR file is added to {@code references},
 * provided that {@link CleanupContext#shouldFollow} accepts the edge.</li>
 * </ul>
 * <p>
 * Both sets are modified in place. When this method returns, {@code
 * references} no longer contains the identifiers of the entries of this TAR
 * file, unless they were added again because a kept segment visited later
 * (that is, stored earlier in the file) references them.
 *
 * @param references  The set of referenced segments. Identifiers found in
 *                    this TAR file are removed from it, identifiers
 *                    referenced by kept segments are added to it.
 * @param reclaimable The set of segments to remove. Identifiers of the
 *                    segments the context decides to reclaim are added to
 *                    it.
 * @param context     An instance of {@link CleanupContext} taking the
 *                    reclaim and follow decisions.
 * @throws IOException If the data of this TAR file required for marking
 *                     can't be read.
 */
void mark(Set<UUID> references, Set<UUID> reclaimable, CleanupContext context) throws IOException {
    Map<UUID, List<UUID>> graph = getGraph();
    SegmentArchiveEntry[] entries = getEntries();

    // A bulk segment is *always* written before any data segment
    // referencing it. Walking the entries backwards guarantees that all
    // references to a bulk segment are seen before the bulk segment itself.
    // Hence a reference can be dropped from the set as soon as its entry is
    // encountered, which saves memory and CPU on subsequent look-ups.
    int index = entries.length;
    while (--index >= 0) {
        SegmentArchiveEntry current = entries[index];
        UUID segmentId = new UUID(current.getMsb(), current.getLsb());
        GCGeneration gcGeneration = GCGeneration.newGCGeneration(current);

        // The removal is unconditional: it happens whatever the decision of
        // the context turns out to be.
        boolean wasReferenced = references.remove(segmentId);

        if (context.shouldReclaim(segmentId, gcGeneration, wasReferenced)) {
            reclaimable.add(segmentId);
            continue;
        }

        // The segment is kept, so the segments it points to may have to be
        // kept as well.
        for (UUID target : getReferences(segmentId, graph)) {
            if (context.shouldFollow(segmentId, target)) {
                references.add(target);
            }
        }
    }
}
use of org.apache.jackrabbit.oak.segment.file.tar.GCGeneration.newGCGeneration in project jackrabbit-oak by apache.
the class TarReader method sweep.
/**
 * Try to remove every segment contained in a user-provided set.
 * <p>
 * This method does not rewrite the TAR file under the following
 * circumstances.
 * <p>
 * First, if none of the entries of this TAR file would survive the clean
 * up, i.e. every entry is contained in {@code reclaim} (or the file has no
 * entries at all). In this case, the method returns {@code null}.
 * <p>
 * Second, if this TAR file has a graph and the size of the entries that
 * would be retained is at least 3/4 of the size of all the entries, i.e.
 * the reclaimable space is not more than 1/4 of the current size. In this
 * case, this method returns this {@link TarReader}. A TAR file without a
 * graph is never skipped for this reason: it is rewritten so that the new
 * file contains a graph.
 * <p>
 * Third, if this TAR file is in the highest generation possible ('z') and
 * thus a new generation for this TAR file can't be created. In this case,
 * the method returns this {@link TarReader}.
 * <p>
 * Fourth, if a new TAR file has been created but it can't be opened. In
 * this case, this method returns this {@link TarReader}.
 * <p>
 * If none of the above conditions apply, this method returns a new {@link
 * TarReader} instance that points to a TAR file that doesn't contain the
 * removed segments. The returned {@link TarReader} belongs to the next
 * generation of this {@link TarReader}. In this case, the {@code reclaimed}
 * set is updated to contain the identifiers of the segments that were
 * removed from this TAR file.
 *
 * @param reclaim   Set of segments to reclaim.
 * @param reclaimed Set of reclaimed segments. It is updated only if this
 *                  TAR file is rewritten and the new file can be opened.
 * @return Either this {@link TarReader}, or a new instance of {@link
 * TarReader}, or {@code null}.
 * @throws IOException If reading this TAR file or writing the new one
 *                     fails.
 */
TarReader sweep(@Nonnull Set<UUID> reclaim, @Nonnull Set<UUID> reclaimed) throws IOException {
    String name = archive.getName();
    log.debug("Cleaning up {}", name);
    Set<UUID> cleaned = newHashSet();

    // The sizes are accumulated as long values. With int accumulators the
    // sums, and even more so the intermediate product in the 3/4 check
    // below, silently wrap around for large TAR files, which would corrupt
    // the decision whether the clean up is worth it.
    long afterSize = 0;
    long beforeSize = 0;
    int afterCount = 0;

    SegmentArchiveEntry[] entries = getEntries();
    for (int i = 0; i < entries.length; i++) {
        SegmentArchiveEntry entry = entries[i];
        beforeSize += archive.getEntrySize(entry.getLength());
        UUID id = new UUID(entry.getMsb(), entry.getLsb());
        if (reclaim.contains(id)) {
            cleaned.add(id);
            // A null slot marks an entry that must not be copied over.
            entries[i] = null;
        } else {
            afterSize += archive.getEntrySize(entry.getLength());
            afterCount += 1;
        }
    }

    if (afterCount == 0) {
        log.debug("None of the entries of {} are referenceable.", name);
        return null;
    }
    if (afterSize >= beforeSize * 3 / 4 && hasGraph()) {
        // the space savings are not worth it at less than 25%,
        // unless this tar file lacks a pre-compiled segment graph
        // in which case we'll always generate a new tar file with
        // the graph to speed up future garbage collection runs.
        log.debug("Not enough space savings. ({}/{}). Skipping clean up of {}", archive.length() - afterSize, archive.length(), name);
        return this;
    }
    if (!hasGraph()) {
        log.warn("Recovering {}, which is missing its graph.", name);
    }

    // The generation is the single letter right in front of the ".tar"
    // extension of the file name.
    int pos = name.length() - "a.tar".length();
    char generation = name.charAt(pos);
    if (generation == 'z') {
        log.debug("No garbage collection after reaching generation z: {}", name);
        return this;
    }

    String newFile = name.substring(0, pos) + (char) (generation + 1) + ".tar";
    log.debug("Writing new generation {}", newFile);
    TarWriter writer = new TarWriter(archiveManager, newFile);

    // Copy the surviving segments, in their original order.
    for (SegmentArchiveEntry entry : entries) {
        if (entry != null) {
            long msb = entry.getMsb();
            long lsb = entry.getLsb();
            int size = entry.getLength();
            GCGeneration gen = GCGeneration.newGCGeneration(entry);
            byte[] data = new byte[size];
            archive.readSegment(msb, lsb).get(data);
            writer.writeEntry(msb, lsb, data, 0, size, gen);
        }
    }

    // Reconstruct the graph index for non-cleaned segments. Edges starting
    // from or pointing to a cleaned segment are dropped.
    Map<UUID, List<UUID>> graph = getGraph();
    for (Entry<UUID, List<UUID>> e : graph.entrySet()) {
        if (cleaned.contains(e.getKey())) {
            continue;
        }
        Set<UUID> vertices = newHashSet();
        for (UUID vertex : e.getValue()) {
            if (cleaned.contains(vertex)) {
                continue;
            }
            vertices.add(vertex);
        }
        for (UUID vertex : vertices) {
            writer.addGraphEdge(e.getKey(), vertex);
        }
    }

    // Reconstruct the binary reference index for non-cleaned segments.
    BinaryReferencesIndex references = getBinaryReferences();
    if (references != null) {
        references.forEach((gen, full, compacted, id, reference) -> {
            if (cleaned.contains(id)) {
                return;
            }
            writer.addBinaryReference(newGCGeneration(gen, full, compacted), id, reference);
        });
    }

    // NOTE(review): the writer is closed on the success path only. This
    // presumably avoids finalizing a partially written file after a
    // failure - confirm against TarWriter#close before changing it.
    writer.close();

    TarReader reader = openFirstFileWithValidIndex(singletonList(newFile), archiveManager);
    if (reader != null) {
        reclaimed.addAll(cleaned);
        return reader;
    } else {
        // NOTE(review): this logs the name of the file of this reader, not
        // newFile, which is the file that could not be opened - confirm
        // this is intended.
        log.warn("Failed to open cleaned up tar file {}", getFileName());
        return this;
    }
}
Aggregations