Use of org.apache.accumulo.core.client.summary.Summary in project accumulo by apache.
Class TooManyDeletesIT, method tooManyDeletesCompactionStrategyIT:
@Test
public void tooManyDeletesCompactionStrategyIT() throws Exception {
  Connector c = getConnector();
  String table = getUniqueNames(1)[0];
  SummarizerConfiguration sc = SummarizerConfiguration.builder(DeletesSummarizer.class).build();
  // TODO open issue about programmatic config of compaction strategies
  NewTableConfiguration ntc = new NewTableConfiguration().enableSummarization(sc);
  HashMap<String, String> props = new HashMap<>();
  props.put(Property.TABLE_COMPACTION_STRATEGY.getKey(), TooManyDeletesCompactionStrategy.class.getName());
  props.put(Property.TABLE_COMPACTION_STRATEGY_PREFIX.getKey() + TooManyDeletesCompactionStrategy.THRESHOLD_OPT, ".25");
  // ensure compaction does not happen because of the number of files
  props.put(Property.TABLE_MAJC_RATIO.getKey(), "10");
  ntc.setProperties(props);
  c.tableOperations().create(table, ntc);
  try (BatchWriter bw = c.createBatchWriter(table, new BatchWriterConfig())) {
    for (int i = 0; i < 1000; i++) {
      Mutation m = new Mutation("row" + i);
      m.put("f", "q", "v" + i);
      bw.addMutation(m);
    }
  }
  List<Summary> summaries = c.tableOperations().summaries(table).flush(true).withConfiguration(sc).retrieve();
  Assert.assertEquals(1, summaries.size());
  Summary summary = summaries.get(0);
  Assert.assertEquals(1000L, (long) summary.getStatistics().get(DeletesSummarizer.TOTAL_STAT));
  Assert.assertEquals(0L, (long) summary.getStatistics().get(DeletesSummarizer.DELETES_STAT));
  try (BatchWriter bw = c.createBatchWriter(table, new BatchWriterConfig())) {
    for (int i = 0; i < 100; i++) {
      Mutation m = new Mutation("row" + i);
      m.putDelete("f", "q");
      bw.addMutation(m);
    }
  }
  summaries = c.tableOperations().summaries(table).flush(true).withConfiguration(sc).retrieve();
  Assert.assertEquals(1, summaries.size());
  summary = summaries.get(0);
  Assert.assertEquals(1100L, (long) summary.getStatistics().get(DeletesSummarizer.TOTAL_STAT));
  Assert.assertEquals(100L, (long) summary.getStatistics().get(DeletesSummarizer.DELETES_STAT));
  try (BatchWriter bw = c.createBatchWriter(table, new BatchWriterConfig())) {
    for (int i = 100; i < 300; i++) {
      Mutation m = new Mutation("row" + i);
      m.putDelete("f", "q");
      bw.addMutation(m);
    }
  }
  // After a flush occurs, Accumulo checks whether a major compaction is needed. This check should call the
  // compaction strategy, which should decide to compact all files based on the number of deletes.
  c.tableOperations().flush(table, null, null, true);
  // wait for the compaction to happen
  while (true) {
    // poll summaries without forcing another flush until the compaction results are visible
    summaries = c.tableOperations().summaries(table).flush(false).withConfiguration(sc).retrieve();
    Assert.assertEquals(1, summaries.size());
    summary = summaries.get(0);
    long total = summary.getStatistics().get(DeletesSummarizer.TOTAL_STAT);
    long deletes = summary.getStatistics().get(DeletesSummarizer.DELETES_STAT);
    if (total == 700 && deletes == 0) {
      // a compaction was triggered based on the number of deletes
      break;
    }
    UtilWaitThread.sleep(50);
  }
}
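The same summarizer and compaction-strategy settings can also be attached to an existing table instead of at creation time. The sketch below is illustrative and not taken from the project: it assumes a Connector named conn already exists, that TableOperations.addSummarizers is available in the targeted Accumulo version, and it reuses the DeletesSummarizer and TooManyDeletesCompactionStrategy classes from the test above.
// Illustrative sketch (assumptions noted above): enable delete-driven compaction on an existing table.
void enableDeleteDrivenCompaction(Connector conn, String table) throws Exception {
  // summarizer that tracks total entries and delete entries per file
  SummarizerConfiguration sc = SummarizerConfiguration.builder(DeletesSummarizer.class).build();
  conn.tableOperations().addSummarizers(table, sc);
  // compaction strategy that reads those summaries and compacts when deletes pile up
  conn.tableOperations().setProperty(table, Property.TABLE_COMPACTION_STRATEGY.getKey(), TooManyDeletesCompactionStrategy.class.getName());
  // compact once deletes reach 25% of the non-deleted entries
  conn.tableOperations().setProperty(table, Property.TABLE_COMPACTION_STRATEGY_PREFIX.getKey() + TooManyDeletesCompactionStrategy.THRESHOLD_OPT, ".25");
}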
Use of org.apache.accumulo.core.client.summary.Summary in project accumulo by apache.
Class SummaryCollectionTest, method testDeleted:
@Test
public void testDeleted() {
  SummarizerConfiguration conf = SummarizerConfiguration.builder(FamilySummarizer.class).build();
  HashMap<String, Long> stats = new HashMap<>();
  stats.put("c:foo", 9L);
  FileSummary fs1 = new FileSummary(conf, stats, false);
  SummaryCollection sc1 = new SummaryCollection(Collections.singleton(fs1));
  stats = new HashMap<>();
  stats.put("c:foo", 5L);
  stats.put("c:bar", 3L);
  FileSummary fs2 = new FileSummary(conf, stats, true);
  SummaryCollection sc2 = new SummaryCollection(Collections.singleton(fs2));
  SummaryCollection sc3 = new SummaryCollection(Collections.emptyList());
  SummaryCollection sc4 = new SummaryCollection(Collections.emptyList(), true);
  SummarizerFactory factory = new SummarizerFactory();
  SummaryCollection mergeSc = new SummaryCollection();
  for (SummaryCollection sc : Arrays.asList(sc1, sc2, sc3, sc4, sc4)) {
    mergeSc.merge(sc, factory);
  }
  for (SummaryCollection sc : Arrays.asList(mergeSc, new SummaryCollection(mergeSc.toThrift()))) {
    List<Summary> summaries = sc.getSummaries();
    Assert.assertEquals(1, summaries.size());
    Summary summary = summaries.get(0);
    FileStatistics filestats = summary.getFileStatistics();
    Assert.assertEquals(5, filestats.getTotal());
    Assert.assertEquals(1, filestats.getExtra());
    Assert.assertEquals(0, filestats.getLarge());
    Assert.assertEquals(1, filestats.getMissing());
    Assert.assertEquals(2, filestats.getDeleted());
    Assert.assertEquals(4, filestats.getInaccurate());
  }
}
Use of org.apache.accumulo.core.client.summary.Summary in project accumulo by apache.
Class TooManyDeletesCompactionStrategy, method gatherInformation:
@Override
public void gatherInformation(MajorCompactionRequest request) throws IOException {
  super.gatherInformation(request);
  Predicate<SummarizerConfiguration> summarizerPredicate = conf -> conf.getClassName().equals(DeletesSummarizer.class.getName()) && conf.getOptions().isEmpty();
  long total = 0;
  long deletes = 0;
  for (Entry<FileRef, DataFileValue> entry : request.getFiles().entrySet()) {
    Collection<Summary> summaries = request.getSummaries(Collections.singleton(entry.getKey()), summarizerPredicate);
    if (summaries.size() == 1) {
      Summary summary = summaries.iterator().next();
      total += summary.getStatistics().get(TOTAL_STAT);
      deletes += summary.getStatistics().get(DELETES_STAT);
    } else {
      long numEntries = entry.getValue().getNumEntries();
      if (numEntries == 0 && !proceed_bns) {
        shouldCompact = false;
        return;
      } else {
        // no summary data, so use Accumulo's estimate of the total entries in the file
        total += entry.getValue().getNumEntries();
      }
    }
  }
  long nonDeletes = total - deletes;
  if (nonDeletes >= 0) {
    // check nonDeletes >= 0 because if this is not true, then it's clear evidence that the estimates are off
    double ratio = deletes / (double) nonDeletes;
    shouldCompact = ratio >= threshold;
  } else {
    shouldCompact = false;
  }
}
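The decision above reduces to a single ratio test. Here is a standalone sketch of that rule; the helper name is hypothetical and exists only for illustration.
// Hypothetical helper isolating the rule used by gatherInformation:
// compact when deletes / (total - deletes) >= threshold.
static boolean shouldCompactFor(long total, long deletes, double threshold) {
  long nonDeletes = total - deletes;
  if (nonDeletes < 0) {
    // a negative count means the estimates are inconsistent, so do not compact
    return false;
  }
  return deletes / (double) nonDeletes >= threshold;
}
With the .25 threshold used in the integration test above, 100 deletes out of 1100 entries gives 100 / 1000 = 0.1 and no compaction, while 300 deletes out of 1300 entries gives 300 / 1000 = 0.3 and triggers one.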
Use of org.apache.accumulo.core.client.summary.Summary in project accumulo by apache.
Class TableOperationsImpl, method summaries:
@Override
public SummaryRetriever summaries(String tableName) {
  return new SummaryRetriever() {
    private Text startRow = null;
    private Text endRow = null;
    private List<TSummarizerConfiguration> summariesToFetch = Collections.emptyList();
    private String summarizerClassRegex;
    private boolean flush = false;

    @Override
    public SummaryRetriever startRow(Text startRow) {
      Objects.requireNonNull(startRow);
      if (endRow != null) {
        Preconditions.checkArgument(startRow.compareTo(endRow) < 0, "Start row must be less than end row : %s >= %s", startRow, endRow);
      }
      this.startRow = startRow;
      return this;
    }

    @Override
    public SummaryRetriever startRow(CharSequence startRow) {
      return startRow(new Text(startRow.toString()));
    }

    @Override
    public List<Summary> retrieve() throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
      Table.ID tableId = Tables.getTableId(context.getInstance(), tableName);
      if (Tables.getTableState(context.getInstance(), tableId) == TableState.OFFLINE)
        throw new TableOfflineException(context.getInstance(), tableId.canonicalID());
      TRowRange range = new TRowRange(TextUtil.getByteBuffer(startRow), TextUtil.getByteBuffer(endRow));
      TSummaryRequest request = new TSummaryRequest(tableId.canonicalID(), range, summariesToFetch, summarizerClassRegex);
      if (flush) {
        _flush(tableId, startRow, endRow, true);
      }
      TSummaries ret = ServerClient.execute(context, new TabletClientService.Client.Factory(), client -> {
        TSummaries tsr = client.startGetSummaries(Tracer.traceInfo(), context.rpcCreds(), request);
        while (!tsr.finished) {
          tsr = client.contiuneGetSummaries(Tracer.traceInfo(), tsr.sessionId);
        }
        return tsr;
      });
      return new SummaryCollection(ret).getSummaries();
    }

    @Override
    public SummaryRetriever endRow(Text endRow) {
      Objects.requireNonNull(endRow);
      if (startRow != null) {
        Preconditions.checkArgument(startRow.compareTo(endRow) < 0, "Start row must be less than end row : %s >= %s", startRow, endRow);
      }
      this.endRow = endRow;
      return this;
    }

    @Override
    public SummaryRetriever endRow(CharSequence endRow) {
      return endRow(new Text(endRow.toString()));
    }

    @Override
    public SummaryRetriever withConfiguration(Collection<SummarizerConfiguration> configs) {
      Objects.requireNonNull(configs);
      summariesToFetch = configs.stream().map(SummarizerConfigurationUtil::toThrift).collect(Collectors.toList());
      return this;
    }

    @Override
    public SummaryRetriever withConfiguration(SummarizerConfiguration... config) {
      Objects.requireNonNull(config);
      return withConfiguration(Arrays.asList(config));
    }

    @Override
    public SummaryRetriever withMatchingConfiguration(String regex) {
      Objects.requireNonNull(regex);
      // Do a sanity check here to make sure that regex compiles, instead of having it fail on a tserver.
      Pattern.compile(regex);
      this.summarizerClassRegex = regex;
      return this;
    }

    @Override
    public SummaryRetriever flush(boolean b) {
      this.flush = b;
      return this;
    }
  };
}
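From a client's perspective, the retriever above is used as a fluent builder off TableOperations.summaries. A short usage sketch follows; the connector, table name, row bounds, and regex are placeholders.
// Usage sketch for the SummaryRetriever above; conn, tableName, and the arguments are placeholders.
List<Summary> summaries = conn.tableOperations().summaries(tableName)
    .startRow("r_0001")                               // optional: restrict to a row range
    .endRow("r_0999")
    .withMatchingConfiguration(".*DeletesSummarizer") // or withConfiguration(...) for exact configurations
    .flush(true)                                      // flush first so recently written data is reflected
    .retrieve();
for (Summary summary : summaries) {
  System.out.println(summary.getSummarizerConfiguration().getClassName() + " -> " + summary.getStatistics());
}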
Use of org.apache.accumulo.core.client.summary.Summary in project accumulo by apache.
Class RFileTest, method testSummaries:
@Test
public void testSummaries() throws Exception {
  SummarizerConfiguration sc1 = SummarizerConfiguration.builder(VisibilitySummarizer.class).build();
  SummarizerConfiguration sc2 = SummarizerConfiguration.builder(FamilySummarizer.class).build();
  LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
  String testFile = createTmpTestFile();
  SortedMap<Key, Value> testData1 = createTestData(0, 100, 0, 4, 1, "A&B", "A&B&C");
  RFileWriter writer = RFile.newWriter().to(testFile).withFileSystem(localFs).withSummarizers(sc1, sc2).build();
  writer.append(testData1.entrySet());
  writer.close();
  // verify summary data
  Collection<Summary> summaries = RFile.summaries().from(testFile).withFileSystem(localFs).read();
  Assert.assertEquals(2, summaries.size());
  for (Summary summary : summaries) {
    Assert.assertEquals(0, summary.getFileStatistics().getInaccurate());
    Assert.assertEquals(1, summary.getFileStatistics().getTotal());
    String className = summary.getSummarizerConfiguration().getClassName();
    CounterSummary counterSummary = new CounterSummary(summary);
    if (className.equals(FamilySummarizer.class.getName())) {
      Map<String, Long> counters = counterSummary.getCounters();
      Map<String, Long> expected = ImmutableMap.of("0000", 200L, "0001", 200L, "0002", 200L, "0003", 200L);
      Assert.assertEquals(expected, counters);
    } else if (className.equals(VisibilitySummarizer.class.getName())) {
      Map<String, Long> counters = counterSummary.getCounters();
      Map<String, Long> expected = ImmutableMap.of("A&B", 400L, "A&B&C", 400L);
      Assert.assertEquals(expected, counters);
    } else {
      Assert.fail("Unexpected classname " + className);
    }
  }
  // check if writing summary data impacted normal rfile functionality
  Scanner scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).withAuthorizations(new Authorizations("A", "B", "C")).build();
  Assert.assertEquals(testData1, toMap(scanner));
  scanner.close();
  String testFile2 = createTmpTestFile();
  SortedMap<Key, Value> testData2 = createTestData(100, 100, 0, 4, 1, "A&B", "A&B&C");
  writer = RFile.newWriter().to(testFile2).withFileSystem(localFs).withSummarizers(sc1, sc2).build();
  writer.append(testData2.entrySet());
  writer.close();
  // verify reading summaries from multiple files works
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).read();
  Assert.assertEquals(2, summaries.size());
  for (Summary summary : summaries) {
    Assert.assertEquals(0, summary.getFileStatistics().getInaccurate());
    Assert.assertEquals(2, summary.getFileStatistics().getTotal());
    String className = summary.getSummarizerConfiguration().getClassName();
    CounterSummary counterSummary = new CounterSummary(summary);
    if (className.equals(FamilySummarizer.class.getName())) {
      Map<String, Long> counters = counterSummary.getCounters();
      Map<String, Long> expected = ImmutableMap.of("0000", 400L, "0001", 400L, "0002", 400L, "0003", 400L);
      Assert.assertEquals(expected, counters);
    } else if (className.equals(VisibilitySummarizer.class.getName())) {
      Map<String, Long> counters = counterSummary.getCounters();
      Map<String, Long> expected = ImmutableMap.of("A&B", 800L, "A&B&C", 800L);
      Assert.assertEquals(expected, counters);
    } else {
      Assert.fail("Unexpected classname " + className);
    }
  }
  // verify reading a subset of summaries works
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800L, "A&B&C", 800L), 0);
  // the following tests check boundary conditions for start row and end row
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(99)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 400L, "A&B&C", 400L), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(98)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800L, "A&B&C", 800L), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(0)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800L, "A&B&C", 800L), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow("#").read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800L, "A&B&C", 800L), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(100)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 400L, "A&B&C", 400L), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).endRow(rowStr(99)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 400L, "A&B&C", 400L), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).endRow(rowStr(100)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800L, "A&B&C", 800L), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).endRow(rowStr(199)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800L, "A&B&C", 800L), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(50)).endRow(rowStr(150)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800L, "A&B&C", 800L), 2);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(120)).endRow(rowStr(150)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 400L, "A&B&C", 400L), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(50)).endRow(rowStr(199)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800L, "A&B&C", 800L), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow("#").endRow(rowStr(150)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800L, "A&B&C", 800L), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(199)).read();
  checkSummaries(summaries, ImmutableMap.of(), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(200)).read();
  checkSummaries(summaries, ImmutableMap.of(), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).endRow("#").read();
  checkSummaries(summaries, ImmutableMap.of(), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).endRow(rowStr(0)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 400L, "A&B&C", 400L), 1);
}
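Outside of a test, the read path exercised above condenses to a few lines. The sketch below is illustrative only: the file paths are placeholders and the selected summarizer mirrors the ones used in the test.
// Illustrative sketch of reading summaries from a set of RFiles; the paths are placeholders.
LocalFileSystem fs = FileSystem.getLocal(new Configuration());
Collection<Summary> summaries = RFile.summaries()
    .from("/tmp/data1.rf", "/tmp/data2.rf")
    .withFileSystem(fs)
    .selectSummaries(conf -> conf.getClassName().equals(VisibilitySummarizer.class.getName()))
    .read();
for (Summary summary : summaries) {
  // CounterSummary flattens counter-style summaries (such as VisibilitySummarizer output) into a Map
  Map<String, Long> counters = new CounterSummary(summary).getCounters();
  System.out.println(summary.getSummarizerConfiguration().getClassName() + " -> " + counters);
}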