use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.
the class TooManyDeletesIT method tooManyDeletesCompactionStrategyIT.
/**
 * Creates a table with the {@link DeletesSummarizer} enabled and the {@link TooManyDeletesCompactionStrategy} configured with a threshold of ".25", then
 * writes 1000 entries followed by two rounds of deletes (100 and then 200 rows). After each of the first two writes the summary counts are asserted; after
 * the final flush the test polls the summaries until they report 700 total entries and 0 deletes.
 */
@Test
public void tooManyDeletesCompactionStrategyIT() throws Exception {
  final Connector conn = getConnector();
  final String tableName = getUniqueNames(1)[0];
  final SummarizerConfiguration deletesConf = SummarizerConfiguration.builder(DeletesSummarizer.class).build();

  // TODO open issue about programmatic config of compaction strategies
  HashMap<String, String> tableProps = new HashMap<>();
  tableProps.put(Property.TABLE_COMPACTION_STRATEGY.getKey(), TooManyDeletesCompactionStrategy.class.getName());
  tableProps.put(Property.TABLE_COMPACTION_STRATEGY_PREFIX.getKey() + TooManyDeletesCompactionStrategy.THRESHOLD_OPT, ".25");
  // keep the file-count based ratio high so that it is not the reason a compaction happens
  tableProps.put(Property.TABLE_MAJC_RATIO.getKey(), "10");

  NewTableConfiguration tableConf = new NewTableConfiguration().enableSummarization(deletesConf);
  tableConf.setProperties(tableProps);
  conn.tableOperations().create(tableName, tableConf);

  // step 1: 1000 plain entries, no deletes
  try (BatchWriter writer = conn.createBatchWriter(tableName, new BatchWriterConfig())) {
    for (int row = 0; row < 1000; row++) {
      Mutation insert = new Mutation("row" + row);
      insert.put("f", "q", "v" + row);
      writer.addMutation(insert);
    }
  }

  List<Summary> afterInserts = conn.tableOperations().summaries(tableName).flush(true).withConfiguration(deletesConf).retrieve();
  Assert.assertEquals(1, afterInserts.size());
  Summary insertSummary = afterInserts.get(0);
  Assert.assertEquals(1000L, (long) insertSummary.getStatistics().get(DeletesSummarizer.TOTAL_STAT));
  Assert.assertEquals(0L, (long) insertSummary.getStatistics().get(DeletesSummarizer.DELETES_STAT));

  // step 2: delete rows [0, 100); the summary is expected to count the deletes as entries too
  try (BatchWriter writer = conn.createBatchWriter(tableName, new BatchWriterConfig())) {
    for (int row = 0; row < 100; row++) {
      Mutation delete = new Mutation("row" + row);
      delete.putDelete("f", "q");
      writer.addMutation(delete);
    }
  }

  List<Summary> afterFirstDeletes = conn.tableOperations().summaries(tableName).flush(true).withConfiguration(deletesConf).retrieve();
  Assert.assertEquals(1, afterFirstDeletes.size());
  Summary firstDeleteSummary = afterFirstDeletes.get(0);
  Assert.assertEquals(1100L, (long) firstDeleteSummary.getStatistics().get(DeletesSummarizer.TOTAL_STAT));
  Assert.assertEquals(100L, (long) firstDeleteSummary.getStatistics().get(DeletesSummarizer.DELETES_STAT));

  // step 3: delete rows [100, 300)
  try (BatchWriter writer = conn.createBatchWriter(tableName, new BatchWriterConfig())) {
    for (int row = 100; row < 300; row++) {
      Mutation delete = new Mutation("row" + row);
      delete.putDelete("f", "q");
      writer.addMutation(delete);
    }
  }

  // After a flush occurs Accumulo will check if a major compaction is needed. That check should consult the compaction strategy, which should decide to
  // compact all files based on the number of deletes.
  conn.tableOperations().flush(tableName, null, null, true);

  // Poll (without flushing again) until the summaries show the result of the compaction.
  boolean compacted = false;
  while (!compacted) {
    List<Summary> current = conn.tableOperations().summaries(tableName).flush(false).withConfiguration(deletesConf).retrieve();
    Assert.assertEquals(1, current.size());
    Summary currentSummary = current.get(0);
    long totalEntries = currentSummary.getStatistics().get(DeletesSummarizer.TOTAL_STAT);
    long deleteEntries = currentSummary.getStatistics().get(DeletesSummarizer.DELETES_STAT);

    // 700 total entries with no deletes means a compaction was triggered based on the number of deletes
    compacted = totalEntries == 700 && deleteEntries == 0;
    if (!compacted) {
      UtilWaitThread.sleep(50);
    }
  }
}
use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.
the class SummaryCollectionTest method testDeleted.
/**
 * Merges five summary collections (one plain file summary, one file summary built with the boolean flag set to {@code true}, one empty collection, and an
 * empty collection built with the flag {@code true} merged twice) and checks the resulting {@link FileStatistics}, both directly and after a round trip
 * through {@code toThrift()}.
 */
@Test
public void testDeleted() {
  SummarizerConfiguration familyConf = SummarizerConfiguration.builder(FamilySummarizer.class).build();

  HashMap<String, Long> firstStats = new HashMap<>();
  firstStats.put("c:foo", 9L);
  FileSummary plainFile = new FileSummary(conf(familyConf), firstStats, false);
  SummaryCollection plainCollection = new SummaryCollection(Collections.singleton(plainFile));

  HashMap<String, Long> secondStats = new HashMap<>();
  secondStats.put("c:foo", 5L);
  secondStats.put("c:bar", 3L);
  FileSummary flaggedFile = new FileSummary(conf(familyConf), secondStats, true);
  SummaryCollection flaggedCollection = new SummaryCollection(Collections.singleton(flaggedFile));

  SummaryCollection emptyCollection = new SummaryCollection(Collections.emptyList());
  SummaryCollection emptyFlaggedCollection = new SummaryCollection(Collections.emptyList(), true);

  SummarizerFactory factory = new SummarizerFactory();
  SummaryCollection merged = new SummaryCollection();
  merged.merge(plainCollection, factory);
  merged.merge(flaggedCollection, factory);
  merged.merge(emptyCollection, factory);
  // the flagged empty collection is intentionally merged twice
  merged.merge(emptyFlaggedCollection, factory);
  merged.merge(emptyFlaggedCollection, factory);

  // verify the in-memory result and a copy rebuilt from its thrift form
  List<SummaryCollection> candidates = Arrays.asList(merged, new SummaryCollection(merged.toThrift()));
  for (SummaryCollection candidate : candidates) {
    List<Summary> results = candidate.getSummaries();
    Assert.assertEquals(1, results.size());

    FileStatistics fileStats = results.get(0).getFileStatistics();
    Assert.assertEquals(5, fileStats.getTotal());
    Assert.assertEquals(1, fileStats.getExtra());
    Assert.assertEquals(0, fileStats.getLarge());
    Assert.assertEquals(1, fileStats.getMissing());
    Assert.assertEquals(2, fileStats.getDeleted());
    Assert.assertEquals(4, fileStats.getInaccurate());
  }
}

/** Identity helper that keeps the constructor calls above on one readable line. */
private static SummarizerConfiguration conf(SummarizerConfiguration configuration) {
  return configuration;
}
use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.
the class TooManyDeletesCompactionStrategy method gatherInformation.
/**
 * Sums total and delete entry counts over every file in the request and sets {@code shouldCompact} according to whether the ratio of deletes to
 * non-deletes reaches {@code threshold}.
 *
 * <p>
 * For each file, summaries produced by a {@link DeletesSummarizer} configured without options are requested. When exactly one summary comes back its
 * {@code TOTAL_STAT} and {@code DELETES_STAT} values are used. Otherwise the file's entry count from its {@link DataFileValue} is added to the total; if that
 * count is zero and {@code proceed_bns} is false, the method gives up and sets {@code shouldCompact} to false.
 *
 * @param request
 *          the compaction request supplying the files and their summaries
 * @throws IOException
 *           declared by the overridden method; may come from the super call or from fetching summaries
 */
@Override
public void gatherInformation(MajorCompactionRequest request) throws IOException {
  super.gatherInformation(request);

  final String deletesSummarizerName = DeletesSummarizer.class.getName();
  Predicate<SummarizerConfiguration> onlyPlainDeletesSummarizer = sconf -> sconf.getClassName().equals(deletesSummarizerName) && sconf.getOptions().isEmpty();

  long entryCount = 0;
  long deleteCount = 0;

  for (Entry<FileRef, DataFileValue> fileEntry : request.getFiles().entrySet()) {
    Collection<Summary> fileSummaries = request.getSummaries(Collections.singleton(fileEntry.getKey()), onlyPlainDeletesSummarizer);

    if (fileSummaries.size() == 1) {
      Summary fileSummary = fileSummaries.iterator().next();
      entryCount += fileSummary.getStatistics().get(TOTAL_STAT);
      deleteCount += fileSummary.getStatistics().get(DELETES_STAT);
      continue;
    }

    // no usable summary for this file
    long numEntries = fileEntry.getValue().getNumEntries();
    if (numEntries == 0 && !proceed_bns) {
      shouldCompact = false;
      return;
    }

    // fall back to Accumulo's estimate of total entries in the file
    entryCount += fileEntry.getValue().getNumEntries();
  }

  long nonDeleteCount = entryCount - deleteCount;

  // A negative non-delete count is clear evidence that the estimates are off, so never compact in that case. When the count is zero the floating point
  // division yields Infinity or NaN, and the comparison below follows the usual Java rules for those values.
  shouldCompact = nonDeleteCount >= 0 && (deleteCount / (double) nonDeleteCount) >= threshold;
}
use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.
the class AccumuloFileOutputFormatTest method validateConfiguration.
/**
 * Configures two separate {@link JobConf} instances through the {@link AccumuloFileOutputFormat} setters and verifies that the
 * {@link AccumuloConfiguration} derived from each job reflects exactly what was set. The first job also sets two summarizers; the second sets none and is
 * expected to yield no summarizer configurations.
 */
@Test
public void validateConfiguration() throws IOException, InterruptedException {
  // ---- first job: file options, a sampler and two summarizers ----
  final int replication = 7;
  final long fileBlockSize = 300L;
  final long dataBlockSize = 50L;
  final long indexBlockSize = 10L;
  final String compression = "snappy";

  SamplerConfiguration sampler = new SamplerConfiguration(RowSampler.class.getName());
  sampler.addOption("hasher", "murmur3_32");
  sampler.addOption("modulus", "109");

  SummarizerConfiguration visibilitySummarizer = SummarizerConfiguration.builder(VisibilitySummarizer.class)
      .addOption(CountingSummarizer.MAX_COUNTERS_OPT, 2048).build();
  SummarizerConfiguration familySummarizer = SummarizerConfiguration.builder(FamilySummarizer.class).addOption(CountingSummarizer.MAX_COUNTERS_OPT, 256)
      .build();

  JobConf firstJob = new JobConf();
  AccumuloFileOutputFormat.setReplication(firstJob, replication);
  AccumuloFileOutputFormat.setFileBlockSize(firstJob, fileBlockSize);
  AccumuloFileOutputFormat.setDataBlockSize(firstJob, dataBlockSize);
  AccumuloFileOutputFormat.setIndexBlockSize(firstJob, indexBlockSize);
  AccumuloFileOutputFormat.setCompressionType(firstJob, compression);
  AccumuloFileOutputFormat.setSampler(firstJob, sampler);
  AccumuloFileOutputFormat.setSummarizers(firstJob, visibilitySummarizer, familySummarizer);

  AccumuloConfiguration firstConf = FileOutputConfigurator.getAccumuloConfiguration(AccumuloFileOutputFormat.class, firstJob);

  assertEquals(replication, firstConf.getCount(Property.TABLE_FILE_REPLICATION));
  assertEquals(fileBlockSize, firstConf.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE));
  assertEquals(dataBlockSize, firstConf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE));
  assertEquals(indexBlockSize, firstConf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX));
  assertEquals(compression, firstConf.get(Property.TABLE_FILE_COMPRESSION_TYPE));
  assertEquals(new SamplerConfigurationImpl(sampler), SamplerConfigurationImpl.newSamplerConfig(firstConf));

  Collection<SummarizerConfiguration> firstSummarizers = SummarizerConfiguration.fromTableProperties(firstConf);
  assertEquals(2, firstSummarizers.size());
  assertTrue(firstSummarizers.contains(visibilitySummarizer));
  assertTrue(firstSummarizers.contains(familySummarizer));

  // ---- second job: different values everywhere and no summarizers ----
  final int replication2 = 17;
  final long fileBlockSize2 = 1300L;
  final long dataBlockSize2 = 150L;
  final long indexBlockSize2 = 110L;
  final String compression2 = "lzo";

  SamplerConfiguration sampler2 = new SamplerConfiguration(RowSampler.class.getName());
  sampler2.addOption("hasher", "md5");
  sampler2.addOption("modulus", "100003");

  JobConf secondJob = new JobConf();
  AccumuloFileOutputFormat.setReplication(secondJob, replication2);
  AccumuloFileOutputFormat.setFileBlockSize(secondJob, fileBlockSize2);
  AccumuloFileOutputFormat.setDataBlockSize(secondJob, dataBlockSize2);
  AccumuloFileOutputFormat.setIndexBlockSize(secondJob, indexBlockSize2);
  AccumuloFileOutputFormat.setCompressionType(secondJob, compression2);
  AccumuloFileOutputFormat.setSampler(secondJob, sampler2);

  AccumuloConfiguration secondConf = FileOutputConfigurator.getAccumuloConfiguration(AccumuloFileOutputFormat.class, secondJob);

  assertEquals(replication2, secondConf.getCount(Property.TABLE_FILE_REPLICATION));
  assertEquals(fileBlockSize2, secondConf.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE));
  assertEquals(dataBlockSize2, secondConf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE));
  assertEquals(indexBlockSize2, secondConf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX));
  assertEquals(compression2, secondConf.get(Property.TABLE_FILE_COMPRESSION_TYPE));
  assertEquals(new SamplerConfigurationImpl(sampler2), SamplerConfigurationImpl.newSamplerConfig(secondConf));

  Collection<SummarizerConfiguration> secondSummarizers = SummarizerConfiguration.fromTableProperties(secondConf);
  assertEquals(0, secondSummarizers.size());
}
use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.
the class TableOperationsImpl method summaries.
/**
 * Returns a {@link SummaryRetriever} for the named table. The retriever collects an optional row range, the summarizer configurations or class-name regex
 * to match, and a flush flag; nothing is sent to the servers until {@code retrieve()} is called.
 *
 * @param tableName
 *          name of the table whose summaries will be fetched
 * @return a new, independently configurable retriever
 */
@Override
public SummaryRetriever summaries(String tableName) {

  return new SummaryRetriever() {

    private Text startRow = null;
    private Text endRow = null;
    private List<TSummarizerConfiguration> summariesToFetch = Collections.emptyList();
    private String summarizerClassRegex;
    private boolean flush = false;

    /** Rejects a row range whose start is not strictly before its end. */
    private void checkRowOrder(Text lower, Text upper) {
      Preconditions.checkArgument(lower.compareTo(upper) < 0, "Start row must be less than end row : %s >= %s", lower, upper);
    }

    // ---- row range ----

    @Override
    public SummaryRetriever startRow(Text startRow) {
      Objects.requireNonNull(startRow);
      if (endRow != null) {
        checkRowOrder(startRow, endRow);
      }
      this.startRow = startRow;
      return this;
    }

    @Override
    public SummaryRetriever startRow(CharSequence startRow) {
      Text asText = new Text(startRow.toString());
      return startRow(asText);
    }

    @Override
    public SummaryRetriever endRow(Text endRow) {
      Objects.requireNonNull(endRow);
      if (startRow != null) {
        checkRowOrder(startRow, endRow);
      }
      this.endRow = endRow;
      return this;
    }

    @Override
    public SummaryRetriever endRow(CharSequence endRow) {
      Text asText = new Text(endRow.toString());
      return endRow(asText);
    }

    // ---- which summaries to fetch ----

    @Override
    public SummaryRetriever withConfiguration(Collection<SummarizerConfiguration> configs) {
      Objects.requireNonNull(configs);
      summariesToFetch = configs.stream().map(SummarizerConfigurationUtil::toThrift).collect(Collectors.toList());
      return this;
    }

    @Override
    public SummaryRetriever withConfiguration(SummarizerConfiguration... config) {
      Objects.requireNonNull(config);
      return withConfiguration(Arrays.asList(config));
    }

    @Override
    public SummaryRetriever withMatchingConfiguration(String regex) {
      Objects.requireNonNull(regex);
      // Compile eagerly as a sanity check so that a bad regex fails here instead of on a tserver.
      Pattern.compile(regex);
      this.summarizerClassRegex = regex;
      return this;
    }

    @Override
    public SummaryRetriever flush(boolean b) {
      this.flush = b;
      return this;
    }

    // ---- execution ----

    @Override
    public List<Summary> retrieve() throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
      Table.ID tableId = Tables.getTableId(context.getInstance(), tableName);
      if (Tables.getTableState(context.getInstance(), tableId) == TableState.OFFLINE) {
        throw new TableOfflineException(context.getInstance(), tableId.canonicalID());
      }

      TRowRange rowRange = new TRowRange(TextUtil.getByteBuffer(startRow), TextUtil.getByteBuffer(endRow));
      TSummaryRequest summaryRequest = new TSummaryRequest(tableId.canonicalID(), rowRange, summariesToFetch, summarizerClassRegex);

      if (flush) {
        _flush(tableId, startRow, endRow, true);
      }

      // Start the request, then keep continuing the same session until the server marks it finished.
      TSummaries thriftSummaries = ServerClient.execute(context, new TabletClientService.Client.Factory(), rpc -> {
        TSummaries partial = rpc.startGetSummaries(Tracer.traceInfo(), context.rpcCreds(), summaryRequest);
        while (!partial.finished) {
          partial = rpc.contiuneGetSummaries(Tracer.traceInfo(), partial.sessionId);
        }
        return partial;
      });

      return new SummaryCollection(thriftSummaries).getSummaries();
    }
  };
}
Aggregations