Use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.
The class MajorCompactionRequest, method getSummaries.
/**
 * Returns all summaries present in each file.
 *
 * <p>
 * This method can only be called from {@link CompactionStrategy#gatherInformation(MajorCompactionRequest)}. Unfortunately, {@code gatherInformation()} is not
 * called before {@link CompactionStrategy#shouldCompact(MajorCompactionRequest)}. Therefore {@code shouldCompact()} should just return true when a compaction
 * strategy wants to use summary information.
 *
 * <p>
 * When using summaries to make compaction decisions, it's important to ensure that all summary data fits in the tablet server summary cache. The size of this
 * cache is configured by {@code tserver.cache.summary.size}. It's also important to use the summarySelector predicate to retrieve only the needed summary
 * data; otherwise, unneeded summary data could be brought into the cache.
 *
 * <p>
 * Some files may contain data outside of a tablet's range. When {@link Summarizer}s generate small amounts of summary data, multiple summaries may be stored
 * within a file for different row ranges. This allows more accurate summaries to be returned when a file has data outside a tablet's range. However, some
 * summary data outside of the tablet's range may still be included. When this happens, {@link FileStatistics#getExtra()} will be non-zero. It's also good to
 * be aware of the other potential causes of inaccuracies; see {@link FileStatistics#getInaccurate()}.
 *
 * <p>
 * When this method is called with multiple files, it will automatically merge summary data using {@link Combiner#merge(Map, Map)}. If summary information is
 * needed for each file, then just call this method for each file.
 *
 * <p>
 * Writing a compaction strategy that uses summary information is a bit tricky. See the source code for {@link TooManyDeletesCompactionStrategy} as an example
 * of such a compaction strategy.
 *
 * @see Summarizer
 * @see TableOperations#addSummarizers(String, SummarizerConfiguration...)
 * @see AccumuloFileOutputFormat#setSummarizers(org.apache.hadoop.mapred.JobConf, SummarizerConfiguration...)
 * @see WriterOptions#withSummarizers(SummarizerConfiguration...)
 */
public List<Summary> getSummaries(Collection<FileRef> files, Predicate<SummarizerConfiguration> summarySelector) throws IOException {
  Preconditions.checkState(volumeManager != null,
      "Getting summaries is not supported at this time. It's only supported when CompactionStrategy.gatherInformation() is called.");
  SummaryCollection sc = new SummaryCollection();
  SummarizerFactory factory = new SummarizerFactory(tableConfig);
  for (FileRef file : files) {
    FileSystem fs = volumeManager.getVolumeByPath(file.path()).getFileSystem();
    Configuration conf = CachedConfiguration.getInstance();
    SummaryCollection fsc = SummaryReader.load(fs, conf, tableConfig, factory, file.path(), summarySelector, summaryCache, indexCache)
        .getSummaries(Collections.singletonList(new Gatherer.RowRange(extent)));
    sc.merge(fsc, factory);
  }
  return sc.getSummaries();
}
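The pattern the javadoc recommends is sketched below for context: shouldCompact() returns true unconditionally, gatherInformation() pulls only the needed summaries and records a decision, and getCompactionPlan() acts on it. This is a minimal illustration under several assumptions, not the actual TooManyDeletesCompactionStrategy: the "deletes"/"total" counter names of DeletesSummarizer, the 25% threshold, and direct use of CompactionPlan's inputFiles list are all assumed for the sketch.

import java.io.IOException;
import java.util.List;

import org.apache.accumulo.core.client.summary.Summary;
import org.apache.accumulo.core.client.summary.summarizers.DeletesSummarizer;
import org.apache.accumulo.tserver.compaction.CompactionPlan;
import org.apache.accumulo.tserver.compaction.CompactionStrategy;
import org.apache.accumulo.tserver.compaction.MajorCompactionRequest;

// Hypothetical strategy: compact a tablet when its files contain a high ratio of deletes.
public class DeleteRatioCompactionStrategy extends CompactionStrategy {

  private boolean compactBasedOnSummaries = false;

  @Override
  public boolean shouldCompact(MajorCompactionRequest request) {
    // Summaries are not available here because gatherInformation() has not run yet,
    // so return true and defer the real decision, as the getSummaries() javadoc advises.
    return true;
  }

  @Override
  public void gatherInformation(MajorCompactionRequest request) throws IOException {
    // Pull only the delete-count summaries into the tablet server summary cache.
    List<Summary> summaries = request.getSummaries(request.getFiles().keySet(),
        conf -> conf.getClassName().equals(DeletesSummarizer.class.getName()));
    for (Summary summary : summaries) {
      // "deletes" and "total" are assumed counter names, used here for illustration only.
      long deletes = summary.getStatistics().getOrDefault("deletes", 0L);
      long total = summary.getStatistics().getOrDefault("total", 0L);
      // Compact when more than 25% of the entries are deletes (threshold chosen arbitrarily).
      compactBasedOnSummaries = total > 0 && deletes * 4 > total;
    }
  }

  @Override
  public CompactionPlan getCompactionPlan(MajorCompactionRequest request) {
    if (!compactBasedOnSummaries) {
      return null;
    }
    // Assumes CompactionPlan exposes an inputFiles list, as the built-in strategies use.
    CompactionPlan plan = new CompactionPlan();
    plan.inputFiles.addAll(request.getFiles().keySet());
    return plan;
  }
}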
Use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.
The class RFileTest, method testSummaries.
@Test
public void testSummaries() throws Exception {
  SummarizerConfiguration sc1 = SummarizerConfiguration.builder(VisibilitySummarizer.class).build();
  SummarizerConfiguration sc2 = SummarizerConfiguration.builder(FamilySummarizer.class).build();
  LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
  String testFile = createTmpTestFile();
  SortedMap<Key, Value> testData1 = createTestData(0, 100, 0, 4, 1, "A&B", "A&B&C");
  RFileWriter writer = RFile.newWriter().to(testFile).withFileSystem(localFs).withSummarizers(sc1, sc2).build();
  writer.append(testData1.entrySet());
  writer.close();
  // verify summary data
  Collection<Summary> summaries = RFile.summaries().from(testFile).withFileSystem(localFs).read();
  Assert.assertEquals(2, summaries.size());
  for (Summary summary : summaries) {
    Assert.assertEquals(0, summary.getFileStatistics().getInaccurate());
    Assert.assertEquals(1, summary.getFileStatistics().getTotal());
    String className = summary.getSummarizerConfiguration().getClassName();
    CounterSummary counterSummary = new CounterSummary(summary);
    if (className.equals(FamilySummarizer.class.getName())) {
      Map<String, Long> counters = counterSummary.getCounters();
      Map<String, Long> expected = ImmutableMap.of("0000", 200l, "0001", 200l, "0002", 200l, "0003", 200l);
      Assert.assertEquals(expected, counters);
    } else if (className.equals(VisibilitySummarizer.class.getName())) {
      Map<String, Long> counters = counterSummary.getCounters();
      Map<String, Long> expected = ImmutableMap.of("A&B", 400l, "A&B&C", 400l);
      Assert.assertEquals(expected, counters);
    } else {
      Assert.fail("Unexpected classname " + className);
    }
  }
  // check that writing summary data did not impact normal RFile functionality
  Scanner scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).withAuthorizations(new Authorizations("A", "B", "C")).build();
  Assert.assertEquals(testData1, toMap(scanner));
  scanner.close();
  String testFile2 = createTmpTestFile();
  SortedMap<Key, Value> testData2 = createTestData(100, 100, 0, 4, 1, "A&B", "A&B&C");
  writer = RFile.newWriter().to(testFile2).withFileSystem(localFs).withSummarizers(sc1, sc2).build();
  writer.append(testData2.entrySet());
  writer.close();
  // verify reading summaries from multiple files works
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).read();
  Assert.assertEquals(2, summaries.size());
  for (Summary summary : summaries) {
    Assert.assertEquals(0, summary.getFileStatistics().getInaccurate());
    Assert.assertEquals(2, summary.getFileStatistics().getTotal());
    String className = summary.getSummarizerConfiguration().getClassName();
    CounterSummary counterSummary = new CounterSummary(summary);
    if (className.equals(FamilySummarizer.class.getName())) {
      Map<String, Long> counters = counterSummary.getCounters();
      Map<String, Long> expected = ImmutableMap.of("0000", 400l, "0001", 400l, "0002", 400l, "0003", 400l);
      Assert.assertEquals(expected, counters);
    } else if (className.equals(VisibilitySummarizer.class.getName())) {
      Map<String, Long> counters = counterSummary.getCounters();
      Map<String, Long> expected = ImmutableMap.of("A&B", 800l, "A&B&C", 800l);
      Assert.assertEquals(expected, counters);
    } else {
      Assert.fail("Unexpected classname " + className);
    }
  }
  // verify reading a subset of summaries works
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800l, "A&B&C", 800l), 0);
  // the following tests check boundary conditions for start row and end row
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(99)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 400l, "A&B&C", 400l), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(98)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800l, "A&B&C", 800l), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(0)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800l, "A&B&C", 800l), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow("#").read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800l, "A&B&C", 800l), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(100)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 400l, "A&B&C", 400l), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).endRow(rowStr(99)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 400l, "A&B&C", 400l), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).endRow(rowStr(100)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800l, "A&B&C", 800l), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).endRow(rowStr(199)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800l, "A&B&C", 800l), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(50)).endRow(rowStr(150)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800l, "A&B&C", 800l), 2);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(120)).endRow(rowStr(150)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 400l, "A&B&C", 400l), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(50)).endRow(rowStr(199)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800l, "A&B&C", 800l), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow("#").endRow(rowStr(150)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 800l, "A&B&C", 800l), 1);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(199)).read();
  checkSummaries(summaries, ImmutableMap.of(), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).startRow(rowStr(200)).read();
  checkSummaries(summaries, ImmutableMap.of(), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).endRow("#").read();
  checkSummaries(summaries, ImmutableMap.of(), 0);
  summaries = RFile.summaries().from(testFile, testFile2).withFileSystem(localFs).selectSummaries(sc -> sc.equals(sc1)).endRow(rowStr(0)).read();
  checkSummaries(summaries, ImmutableMap.of("A&B", 400l, "A&B&C", 400l), 1);
}
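The test above works on RFiles directly; the same summarizer configurations can also be attached to a live table, as the @see references in the getSummaries() javadoc note. A minimal sketch, assuming an existing Connector named conn and a table named "mytable" (both hypothetical), and assuming the SummaryRetriever returned by tableOperations().summaries() supports the calls shown:

// Attach the same two summarizers to an existing table (names are placeholders).
conn.tableOperations().addSummarizers("mytable",
    SummarizerConfiguration.builder(VisibilitySummarizer.class).build(),
    SummarizerConfiguration.builder(FamilySummarizer.class).build());

// Summaries are generated as files are written or compacted, so a compaction may be
// needed before newly attached summarizers report data. Retrieve only the family counts.
List<Summary> tableSummaries = conn.tableOperations().summaries("mytable")
    .withConfiguration(SummarizerConfiguration.builder(FamilySummarizer.class).build())
    .retrieve();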
Use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.
The class AccumuloFileOutputFormatTest, method validateConfiguration.
@Test
public void validateConfiguration() throws IOException, InterruptedException {
  int a = 7;
  long b = 300l;
  long c = 50l;
  long d = 10l;
  String e = "snappy";
  SamplerConfiguration samplerConfig = new SamplerConfiguration(RowSampler.class.getName());
  samplerConfig.addOption("hasher", "murmur3_32");
  samplerConfig.addOption("modulus", "109");
  SummarizerConfiguration sc1 = SummarizerConfiguration.builder(VisibilitySummarizer.class).addOption(CountingSummarizer.MAX_COUNTERS_OPT, 2048).build();
  SummarizerConfiguration sc2 = SummarizerConfiguration.builder(FamilySummarizer.class).addOption(CountingSummarizer.MAX_COUNTERS_OPT, 256).build();
  // configure a first job, including two summarizers, and verify that the settings round-trip
  // through the derived AccumuloConfiguration
  Job job1 = Job.getInstance();
  AccumuloFileOutputFormat.setReplication(job1, a);
  AccumuloFileOutputFormat.setFileBlockSize(job1, b);
  AccumuloFileOutputFormat.setDataBlockSize(job1, c);
  AccumuloFileOutputFormat.setIndexBlockSize(job1, d);
  AccumuloFileOutputFormat.setCompressionType(job1, e);
  AccumuloFileOutputFormat.setSampler(job1, samplerConfig);
  AccumuloFileOutputFormat.setSummarizers(job1, sc1, sc2);
  AccumuloConfiguration acuconf = FileOutputConfigurator.getAccumuloConfiguration(AccumuloFileOutputFormat.class, job1.getConfiguration());
  assertEquals(7, acuconf.getCount(Property.TABLE_FILE_REPLICATION));
  assertEquals(300l, acuconf.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE));
  assertEquals(50l, acuconf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE));
  assertEquals(10l, acuconf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX));
  assertEquals("snappy", acuconf.get(Property.TABLE_FILE_COMPRESSION_TYPE));
  assertEquals(new SamplerConfigurationImpl(samplerConfig), SamplerConfigurationImpl.newSamplerConfig(acuconf));
  Collection<SummarizerConfiguration> summarizerConfigs = SummarizerConfiguration.fromTableProperties(acuconf);
  assertEquals(2, summarizerConfigs.size());
  assertTrue(summarizerConfigs.contains(sc1));
  assertTrue(summarizerConfigs.contains(sc2));
  // configure a second job with different values and no summarizers; it should not see the
  // first job's settings and should report no summarizer configurations
  a = 17;
  b = 1300l;
  c = 150l;
  d = 110l;
  e = "lzo";
  samplerConfig = new SamplerConfiguration(RowSampler.class.getName());
  samplerConfig.addOption("hasher", "md5");
  samplerConfig.addOption("modulus", "100003");
  Job job2 = Job.getInstance();
  AccumuloFileOutputFormat.setReplication(job2, a);
  AccumuloFileOutputFormat.setFileBlockSize(job2, b);
  AccumuloFileOutputFormat.setDataBlockSize(job2, c);
  AccumuloFileOutputFormat.setIndexBlockSize(job2, d);
  AccumuloFileOutputFormat.setCompressionType(job2, e);
  AccumuloFileOutputFormat.setSampler(job2, samplerConfig);
  acuconf = FileOutputConfigurator.getAccumuloConfiguration(AccumuloFileOutputFormat.class, job2.getConfiguration());
  assertEquals(17, acuconf.getCount(Property.TABLE_FILE_REPLICATION));
  assertEquals(1300l, acuconf.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE));
  assertEquals(150l, acuconf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE));
  assertEquals(110l, acuconf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX));
  assertEquals("lzo", acuconf.get(Property.TABLE_FILE_COMPRESSION_TYPE));
  assertEquals(new SamplerConfigurationImpl(samplerConfig), SamplerConfigurationImpl.newSamplerConfig(acuconf));
  summarizerConfigs = SummarizerConfiguration.fromTableProperties(acuconf);
  assertEquals(0, summarizerConfigs.size());
}
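For reference, the setters exercised above would normally be called from a MapReduce job driver before submitting a job that writes RFiles. A minimal sketch using the new-API AccumuloFileOutputFormat; the output path and job name are placeholders, and the input/mapper setup is omitted (whatever mapper is used must emit Key/Value pairs in sorted order):

import org.apache.accumulo.core.client.mapreduce.AccumuloFileOutputFormat;
import org.apache.accumulo.core.client.summary.SummarizerConfiguration;
import org.apache.accumulo.core.client.summary.summarizers.FamilySummarizer;
import org.apache.accumulo.core.client.summary.summarizers.VisibilitySummarizer;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RFileSummaryJobDriver {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "write-rfiles-with-summaries");
    job.setJarByClass(RFileSummaryJobDriver.class);
    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    job.setOutputKeyClass(Key.class);
    job.setOutputValueClass(Value.class);
    // placeholder output directory
    FileOutputFormat.setOutputPath(job, new Path("/tmp/rfile-output"));
    // the same knobs the test above verifies
    AccumuloFileOutputFormat.setCompressionType(job, "snappy");
    AccumuloFileOutputFormat.setReplication(job, 3);
    AccumuloFileOutputFormat.setSummarizers(job,
        SummarizerConfiguration.builder(VisibilitySummarizer.class).build(),
        SummarizerConfiguration.builder(FamilySummarizer.class).build());
    // input format, mapper, and reducer setup omitted for brevity
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}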
Use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.
The class CountingSummarizerTest, method testMultipleEmit.
@Test
public void testMultipleEmit() {
  SummarizerConfiguration sc = SummarizerConfiguration.builder(MultiSummarizer.class).build();
  MultiSummarizer countSum = new MultiSummarizer();
  Summarizer.Collector collector = countSum.collector(sc);
  Value val = new Value("abc");
  HashMap<String, Long> expected = new HashMap<>();
  for (String row : new String[] { "ask", "asleep", "some", "soul" }) {
    for (String fam : new String[] { "hop", "hope", "nope", "noop" }) {
      for (String qual : new String[] { "mad", "lad", "lab", "map" }) {
        collector.accept(new Key(row, fam, qual), val);
        expected.merge("rp:" + row.substring(0, 2), 1l, Long::sum);
        expected.merge("fp:" + fam.substring(0, 2), 1l, Long::sum);
        expected.merge("qp:" + qual.substring(0, 2), 1l, Long::sum);
      }
    }
  }
  HashMap<String, Long> stats = new HashMap<>();
  collector.summarize((k, v) -> stats.put(k, v));
  CounterSummary csum = new CounterSummary(stats);
  Assert.assertEquals(expected, csum.getCounters());
  Assert.assertEquals(64, csum.getSeen());
  Assert.assertEquals(3 * 64, csum.getEmitted());
  Assert.assertEquals(0, csum.getIgnored());
  Assert.assertEquals(0, csum.getDeletesIgnored());
}
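MultiSummarizer's source is not part of this snippet. Judging from the expected counters above ("rp:", "fp:", and "qp:" plus the first two characters of the row, family, and qualifier), it is presumably a small CountingSummarizer whose converter emits three counter keys per key/value pair, which is why getEmitted() is three times getSeen(). A sketch of what it might look like; this is an assumption, not the class's actual source:

// Hypothetical reconstruction of the MultiSummarizer used by testMultipleEmit().
public static class MultiSummarizer extends CountingSummarizer<String> {
  @Override
  protected Converter<String> converter() {
    // Emit three counter keys per entry: row, family, and qualifier prefixes.
    return (k, v, consumer) -> {
      consumer.accept("rp:" + k.getRow().toString().substring(0, 2));
      consumer.accept("fp:" + k.getColumnFamily().toString().substring(0, 2));
      consumer.accept("qp:" + k.getColumnQualifier().toString().substring(0, 2));
    };
  }
}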
Use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.
The class CountingSummarizerTest, method testCountDeletes.
@Test
public void testCountDeletes() {
  SummarizerConfiguration sc = SummarizerConfiguration.builder(FamilySummarizer.class).addOptions(INGNORE_DELETES_OPT, "false").build();
  FamilySummarizer countSum = new FamilySummarizer();
  Key k1 = new Key("r1", "f1");
  Key k2 = new Key("r1", "f1");
  k2.setDeleted(true);
  Key k3 = new Key("r1", "f2");
  Collector collector = countSum.collector(sc);
  collector.accept(k1, new Value(""));
  collector.accept(k2, new Value(""));
  collector.accept(k3, new Value(""));
  String p = COUNTER_STAT_PREFIX;
  HashMap<String, Long> expected = new HashMap<>();
  expected.put(p + "f1", 2l);
  expected.put(p + "f2", 1l);
  expected.put(TOO_LONG_STAT, 0l);
  expected.put(TOO_MANY_STAT, 0l);
  expected.put(SEEN_STAT, 3l);
  expected.put(EMITTED_STAT, 3l);
  expected.put(DELETES_IGNORED_STAT, 0l);
  HashMap<String, Long> stats = new HashMap<>();
  collector.summarize(stats::put);
  Assert.assertEquals(expected, stats);
  CounterSummary csum = new CounterSummary(stats);
  Assert.assertEquals(0, csum.getIgnored());
  Assert.assertEquals(0, csum.getTooMany());
  Assert.assertEquals(0, csum.getTooLong());
  Assert.assertEquals(3, csum.getSeen());
  Assert.assertEquals(3, csum.getEmitted());
  Assert.assertEquals(0, csum.getDeletesIgnored());
  expected.clear();
  expected.put("f1", 2l);
  expected.put("f2", 1l);
  Assert.assertEquals(expected, csum.getCounters());
}