use of org.apache.accumulo.core.client.sample.SamplerConfiguration in project accumulo by apache.
the class RFileTest method testSampling.
/**
 * Writes the generated test data to an RFile with a {@link RowSampler} based sampler configured, then checks that a
 * scanner with the same sampler configuration returns exactly the entries the sampler accepts, and that clearing the
 * sampler configuration makes the scanner return all of the data again.
 *
 * @throws Exception
 *           if writing or reading the test file fails
 */
@Test
public void testSampling() throws Exception {
  SortedMap<Key, Value> testData1 = createTestData(1000, 2, 1);
  LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
  String testFile = createTmpTestFile();
  SamplerConfiguration sc = new SamplerConfiguration(RowSampler.class).setOptions(ImmutableMap.of("hasher", "murmur3_32", "modulus", "19"));

  // try-with-resources so the writer is closed even when append() throws
  try (RFileWriter writer = RFile.newWriter().to(testFile).withFileSystem(localFs).withSampler(sc).build()) {
    writer.append(testData1.entrySet());
  }

  Scanner scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).build();
  try {
    scanner.setSamplerConfiguration(sc);

    // Compute the expected sample independently by running every key through a sampler with the same configuration.
    RowSampler rowSampler = new RowSampler();
    rowSampler.init(sc);
    SortedMap<Key, Value> sampleData = new TreeMap<>();
    for (Entry<Key, Value> e : testData1.entrySet()) {
      if (rowSampler.accept(e.getKey())) {
        sampleData.put(e.getKey(), e.getValue());
      }
    }

    // Guard against a sampler that accepts everything, which would make the comparison below meaningless.
    Assert.assertTrue(sampleData.size() < testData1.size());
    Assert.assertEquals(sampleData, toMap(scanner));

    scanner.clearSamplerConfiguration();
    Assert.assertEquals(testData1, toMap(scanner));
  } finally {
    // release the scanner even when an assertion fails
    scanner.close();
  }
}
use of org.apache.accumulo.core.client.sample.SamplerConfiguration in project accumulo by apache.
the class AccumuloFileOutputFormatTest method validateConfiguration.
/**
 * Applies two different sets of file output settings to fresh {@link JobConf} instances through the static setters of
 * {@link AccumuloFileOutputFormat} and checks that the {@link AccumuloConfiguration} obtained from
 * {@link FileOutputConfigurator} reports the same values. The first round also sets two summarizers; the second round
 * sets none and expects none to be reported.
 */
@Test
public void validateConfiguration() throws IOException, InterruptedException {
  // ---- round one: settings including two summarizers ----
  int firstReplication = 7;
  long firstFileBlockSize = 300L;
  long firstDataBlockSize = 50L;
  long firstIndexBlockSize = 10L;
  String firstCompression = "snappy";

  SamplerConfiguration firstSampler = new SamplerConfiguration(RowSampler.class.getName());
  firstSampler.addOption("hasher", "murmur3_32");
  firstSampler.addOption("modulus", "109");

  SummarizerConfiguration sc1 = SummarizerConfiguration.builder(VisibilitySummarizer.class).addOption(CountingSummarizer.MAX_COUNTERS_OPT, 2048).build();
  SummarizerConfiguration sc2 = SummarizerConfiguration.builder(FamilySummarizer.class).addOption(CountingSummarizer.MAX_COUNTERS_OPT, 256).build();

  JobConf firstJob = new JobConf();
  AccumuloFileOutputFormat.setReplication(firstJob, firstReplication);
  AccumuloFileOutputFormat.setFileBlockSize(firstJob, firstFileBlockSize);
  AccumuloFileOutputFormat.setDataBlockSize(firstJob, firstDataBlockSize);
  AccumuloFileOutputFormat.setIndexBlockSize(firstJob, firstIndexBlockSize);
  AccumuloFileOutputFormat.setCompressionType(firstJob, firstCompression);
  AccumuloFileOutputFormat.setSampler(firstJob, firstSampler);
  AccumuloFileOutputFormat.setSummarizers(firstJob, sc1, sc2);

  AccumuloConfiguration firstConf = FileOutputConfigurator.getAccumuloConfiguration(AccumuloFileOutputFormat.class, firstJob);
  assertEquals(7, firstConf.getCount(Property.TABLE_FILE_REPLICATION));
  assertEquals(300L, firstConf.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE));
  assertEquals(50L, firstConf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE));
  assertEquals(10L, firstConf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX));
  assertEquals("snappy", firstConf.get(Property.TABLE_FILE_COMPRESSION_TYPE));
  assertEquals(new SamplerConfigurationImpl(firstSampler), SamplerConfigurationImpl.newSamplerConfig(firstConf));

  Collection<SummarizerConfiguration> firstSummarizers = SummarizerConfiguration.fromTableProperties(firstConf);
  assertEquals(2, firstSummarizers.size());
  assertTrue(firstSummarizers.contains(sc1));
  assertTrue(firstSummarizers.contains(sc2));

  // ---- round two: different values on a fresh job, no summarizers ----
  int secondReplication = 17;
  long secondFileBlockSize = 1300L;
  long secondDataBlockSize = 150L;
  long secondIndexBlockSize = 110L;
  String secondCompression = "lzo";

  SamplerConfiguration secondSampler = new SamplerConfiguration(RowSampler.class.getName());
  secondSampler.addOption("hasher", "md5");
  secondSampler.addOption("modulus", "100003");

  JobConf secondJob = new JobConf();
  AccumuloFileOutputFormat.setReplication(secondJob, secondReplication);
  AccumuloFileOutputFormat.setFileBlockSize(secondJob, secondFileBlockSize);
  AccumuloFileOutputFormat.setDataBlockSize(secondJob, secondDataBlockSize);
  AccumuloFileOutputFormat.setIndexBlockSize(secondJob, secondIndexBlockSize);
  AccumuloFileOutputFormat.setCompressionType(secondJob, secondCompression);
  AccumuloFileOutputFormat.setSampler(secondJob, secondSampler);

  AccumuloConfiguration secondConf = FileOutputConfigurator.getAccumuloConfiguration(AccumuloFileOutputFormat.class, secondJob);
  assertEquals(17, secondConf.getCount(Property.TABLE_FILE_REPLICATION));
  assertEquals(1300L, secondConf.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE));
  assertEquals(150L, secondConf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE));
  assertEquals(110L, secondConf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX));
  assertEquals("lzo", secondConf.get(Property.TABLE_FILE_COMPRESSION_TYPE));
  assertEquals(new SamplerConfigurationImpl(secondSampler), SamplerConfigurationImpl.newSamplerConfig(secondConf));

  Collection<SummarizerConfiguration> secondSummarizers = SummarizerConfiguration.fromTableProperties(secondConf);
  assertEquals(0, secondSummarizers.size());
}
use of org.apache.accumulo.core.client.sample.SamplerConfiguration in project accumulo by apache.
the class OfflineIterator method createIterator.
/**
 * Closes any previously opened file readers, opens a reader for each of the given files, and stacks the system scan
 * iterators and the configured scan-scope iterators on top of them.
 *
 * @param extent
 *          extent handed to the {@link MultiIterator} and to iterator loading
 * @param absFiles
 *          files to open readers for
 * @return the top level iterator of the assembled stack
 * @throws SampleNotPresentException
 *           if the scanner asks for a sampler configuration that is not equal to the one derived from the table
 *           properties, or if a file has no sample for the requested configuration
 * @throws IOException
 *           if closing an old reader or opening a new one fails
 */
private SortedKeyValueIterator<Key, Value> createIterator(KeyExtent extent, List<String> absFiles) throws TableNotFoundException, AccumuloException, IOException {
  // TODO share code w/ tablet - ACCUMULO-1303
  // possible race condition here, if table is renamed
  String tableName = Tables.getTableName(conn.getInstance(), tableId);
  AccumuloConfiguration acuTableConf = new ConfigurationCopy(conn.tableOperations().getProperties(tableName));
  Configuration conf = CachedConfiguration.getInstance();

  // release the readers left over from the previous call before opening new ones
  for (SortedKeyValueIterator<Key, Value> reader : readers) {
    ((FileSKVIterator) reader).close();
  }
  readers.clear();

  SamplerConfiguration scannerSamplerConfig = options.getSamplerConfiguration();
  SamplerConfigurationImpl scannerSamplerConfigImpl = scannerSamplerConfig == null ? null : new SamplerConfigurationImpl(scannerSamplerConfig);
  SamplerConfigurationImpl samplerConfImpl = SamplerConfigurationImpl.newSamplerConfig(acuTableConf);

  // A requested sample can only be served when the table defines a sampler and it equals the requested one.
  if (scannerSamplerConfigImpl != null && (samplerConfImpl == null || !scannerSamplerConfigImpl.equals(samplerConfImpl))) {
    throw new SampleNotPresentException();
  }

  // TODO need to close files - ACCUMULO-1303
  for (String file : absFiles) {
    FileSystem fs = VolumeConfiguration.getVolume(file, conf, config).getFileSystem();
    FileSKVIterator reader = FileOperations.getInstance().newReaderBuilder().forFile(file, fs, conf).withTableConfiguration(acuTableConf).build();
    if (scannerSamplerConfigImpl != null) {
      FileSKVIterator sampleReader = reader.getSample(scannerSamplerConfigImpl);
      if (sampleReader == null) {
        // The base reader is not tracked in 'readers' yet, so it must be closed here or it is leaked.
        SampleNotPresentException notPresent = new SampleNotPresentException();
        try {
          reader.close();
        } catch (IOException closeFailure) {
          // keep reporting the missing sample; record the close problem alongside it
          notPresent.addSuppressed(closeFailure);
        }
        throw notPresent;
      }
      reader = sampleReader;
    }
    readers.add(reader);
  }

  MultiIterator multiIter = new MultiIterator(readers, extent);
  OfflineIteratorEnvironment iterEnv = new OfflineIteratorEnvironment(authorizations, acuTableConf, false, samplerConfImpl == null ? null : samplerConfImpl.toSamplerConfiguration());

  ColumnVisibility cv = new ColumnVisibility(acuTableConf.get(Property.TABLE_DEFAULT_SCANTIME_VISIBILITY));
  byte[] defaultSecurityLabel = cv.getExpression();

  SortedKeyValueIterator<Key, Value> visFilter = IteratorUtil.setupSystemScanIterators(multiIter, new HashSet<>(options.fetchedColumns), authorizations, defaultSecurityLabel);
  return iterEnv.getTopLevelIterator(IteratorUtil.loadIterators(IteratorScope.scan, visFilter, extent, acuTableConf, options.serverSideIteratorList, options.serverSideIteratorOptions, iterEnv, false));
}
use of org.apache.accumulo.core.client.sample.SamplerConfiguration in project accumulo by apache.
the class SamplerConfigurationImpl method toSamplerConfiguration.
/**
 * Creates a new {@link SamplerConfiguration} from this object's class name and options.
 *
 * @return the newly created configuration
 */
public SamplerConfiguration toSamplerConfiguration() {
  return new SamplerConfiguration(className).setOptions(options);
}
use of org.apache.accumulo.core.client.sample.SamplerConfiguration in project accumulo by apache.
the class SampleIT method assertSampleNotPresent.
/**
 * For each scanner: sets the given sampler configuration and verifies that iterating throws
 * {@link SampleNotPresentException}, then clears the sampler configuration and iterates again, and finally puts back
 * whatever sampler configuration the scanner had on entry.
 *
 * @param sc
 *          sampler configuration expected to have no sample present
 * @param scanners
 *          scanners to check
 */
private void assertSampleNotPresent(SamplerConfiguration sc, ScannerBase... scanners) {
  for (ScannerBase current : scanners) {
    // remember the configuration the scanner came in with so it can be put back afterwards
    SamplerConfiguration previous = current.getSamplerConfiguration();

    current.setSamplerConfiguration(sc);
    boolean sawException = false;
    try {
      for (Entry<Key, Value> kv : current) {
        kv.getKey();
      }
    } catch (SampleNotPresentException expected) {
      // this is the outcome the check is looking for
      sawException = true;
    }
    if (!sawException) {
      Assert.fail("Expected SampleNotPresentException, but it did not happen : " + current.getClass().getSimpleName());
    }

    // with no sampler configured the scan must run through without throwing
    current.clearSamplerConfiguration();
    for (Entry<Key, Value> kv : current) {
      kv.getKey();
    }

    if (previous != null) {
      current.setSamplerConfiguration(previous);
    } else {
      current.clearSamplerConfiguration();
    }
  }
}
Aggregations