Use of org.apache.accumulo.core.client.sample.Sampler in project accumulo by apache.
From the class RFileOperations, the method openWriter:
@Override
protected FileSKVWriter openWriter(OpenWriterOperation options) throws IOException {
  AccumuloConfiguration acuconf = options.getTableConfiguration();

  // Data and index block sizes must fit in an int, since RFile.Writer takes ints.
  long blockSize = acuconf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE);
  Preconditions.checkArgument(blockSize < Integer.MAX_VALUE && blockSize > 0,
      "table.file.compress.blocksize must be greater than 0 and less than " + Integer.MAX_VALUE);
  long indexBlockSize = acuconf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX);
  Preconditions.checkArgument(indexBlockSize < Integer.MAX_VALUE && indexBlockSize > 0,
      "table.file.compress.blocksize.index must be greater than 0 and less than " + Integer.MAX_VALUE);

  // Instantiate a Sampler only if the table configuration defines one.
  SamplerConfigurationImpl samplerConfig = SamplerConfigurationImpl.newSamplerConfig(acuconf);
  Sampler sampler = null;
  if (samplerConfig != null) {
    sampler = SamplerFactory.newSampler(samplerConfig, acuconf, options.isAccumuloStartEnabled());
  }

  // Fall back to the table's configured compression when none was requested.
  String compression = options.getCompression();
  compression = compression == null ? options.getTableConfiguration().get(Property.TABLE_FILE_COMPRESSION_TYPE) : compression;

  FSDataOutputStream outputStream = options.getOutputStream();
  Configuration conf = options.getConfiguration();
  if (outputStream == null) {
    // Table-level replication and block size override the HDFS defaults when set.
    int hrep = conf.getInt("dfs.replication", -1);
    int trep = acuconf.getCount(Property.TABLE_FILE_REPLICATION);
    int rep = hrep;
    if (trep > 0 && trep != hrep) {
      rep = trep;
    }
    long hblock = conf.getLong("dfs.block.size", 1 << 26);
    long tblock = acuconf.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE);
    long block = hblock;
    if (tblock > 0)
      block = tblock;
    int bufferSize = conf.getInt("io.file.buffer.size", 4096);
    String file = options.getFilename();
    FileSystem fs = options.getFileSystem();
    outputStream = fs.create(new Path(file), false, bufferSize, (short) rep, block);
  }

  // Wrap the output stream for rate limiting and block caching, then hand the
  // sampler config and sampler to the RFile writer so it can build the sample.
  CachableBlockFile.Writer _cbw = new CachableBlockFile.Writer(
      new RateLimitedOutputStream(outputStream, options.getRateLimiter()), compression, conf, acuconf);
  RFile.Writer writer = new RFile.Writer(_cbw, (int) blockSize, (int) indexBlockSize, samplerConfig, sampler);
  return writer;
}
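The sampler configuration that openWriter reads back via SamplerConfigurationImpl.newSamplerConfig(acuconf) originates from table properties. A minimal sketch of how a client typically establishes those properties through the public API (the Connector named conn, the table name, and the modulus value are placeholders, not from the snippet above):

import java.util.Map.Entry;

import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.admin.NewTableConfiguration;
import org.apache.accumulo.core.client.sample.RowSampler;
import org.apache.accumulo.core.client.sample.SamplerConfiguration;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;

public class SamplerUsageSketch {
  // Exception handling elided to keep the sketch short.
  static void sampleScan(Connector conn) throws Exception {
    SamplerConfiguration samplerConf = new SamplerConfiguration(RowSampler.class.getName());
    samplerConf.addOption("hasher", "murmur3_32");
    samplerConf.addOption("modulus", "1009");

    // Persist the sampler config as table properties; openWriter() above reads
    // it back when new RFiles are written for this table.
    conn.tableOperations().create("sampletable", new NewTableConfiguration().enableSampling(samplerConf));

    // Scans restricted to the sample only see keys the RowSampler accepted.
    try (Scanner scanner = conn.createScanner("sampletable", Authorizations.EMPTY)) {
      scanner.setSamplerConfiguration(samplerConf);
      for (Entry<Key,Value> e : scanner) {
        System.out.println(e.getKey()); // roughly one in every 1009 rows
      }
    }
  }
}

If a scan requests a sampler configuration that does not match the one a file was written with, Accumulo throws SampleNotPresentException rather than silently returning unsampled data.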
From the class RFileTest, the method testSample:
@Test
public void testSample() throws IOException {
  int num = 10000;
  for (int sampleBufferSize : new int[] {1 << 10, 1 << 20}) {
    // force sample buffer to flush for smaller data
    RFile.setSampleBufferSize(sampleBufferSize);
    for (int modulus : new int[] {19, 103, 1019}) {
      Hasher dataHasher = Hashing.md5().newHasher();
      List<Entry<Key,Value>> sampleData = new ArrayList<>();

      // Configure a RowSampler that keeps rows whose murmur3_32 hash is 0 mod modulus.
      ConfigurationCopy sampleConf = new ConfigurationCopy(conf == null ? DefaultConfiguration.getInstance() : conf);
      sampleConf.set(Property.TABLE_SAMPLER, RowSampler.class.getName());
      sampleConf.set(Property.TABLE_SAMPLER_OPTS + "hasher", "murmur3_32");
      sampleConf.set(Property.TABLE_SAMPLER_OPTS + "modulus", modulus + "");
      Sampler sampler = SamplerFactory.newSampler(SamplerConfigurationImpl.newSamplerConfig(sampleConf), sampleConf);

      TestRFile trf = new TestRFile(sampleConf);
      trf.openWriter();
      // Write the data, hashing every entry and separately tracking the entries
      // the sampler accepts, so both can be verified after the file is reopened.
      for (int i = 0; i < num; i++) {
        add(trf, newKey(i, 0), newValue(i, 0), dataHasher, sampleData, sampler);
        add(trf, newKey(i, 1), newValue(i, 1), dataHasher, sampleData, sampler);
      }
      HashCode expectedDataHash = dataHasher.hash();
      trf.closeWriter();
      trf.openReader();

      // The sample stored in the RFile must match the entries the sampler accepted,
      // and the full data must still hash to the expected value.
      FileSKVIterator sample = trf.reader.getSample(SamplerConfigurationImpl.newSamplerConfig(sampleConf));
      checkSample(sample, sampleData);
      Assert.assertEquals(expectedDataHash, hash(trf.reader));

      SampleIE ie = new SampleIE(SamplerConfigurationImpl.newSamplerConfig(sampleConf).toSamplerConfiguration());
      for (int i = 0; i < 3; i++) {
        // test opening and closing deep copies a few times.
        trf.reader.closeDeepCopies();
        sample = trf.reader.getSample(SamplerConfigurationImpl.newSamplerConfig(sampleConf));
        SortedKeyValueIterator<Key,Value> sampleDC1 = sample.deepCopy(ie);
        SortedKeyValueIterator<Key,Value> sampleDC2 = sample.deepCopy(ie);
        SortedKeyValueIterator<Key,Value> sampleDC3 = trf.reader.deepCopy(ie);
        // Deep copies made with a null sampler config should see all of the data.
        SortedKeyValueIterator<Key,Value> allDC1 = sampleDC1.deepCopy(new SampleIE(null));
        SortedKeyValueIterator<Key,Value> allDC2 = sample.deepCopy(new SampleIE(null));
        Assert.assertEquals(expectedDataHash, hash(allDC1));
        Assert.assertEquals(expectedDataHash, hash(allDC2));
        checkSample(sample, sampleData);
        checkSample(sampleDC1, sampleData);
        checkSample(sampleDC2, sampleData);
        checkSample(sampleDC3, sampleData);
      }
      trf.reader.closeDeepCopies();
      trf.closeReader();
    }
  }
}
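Both snippets revolve around the two-method Sampler interface: init(SamplerConfiguration) and accept(Key). A minimal sketch of a custom implementation, in the spirit of the RowSampler used above; the class name and the "modulus" option key are illustrative choices, not from the snippets:

import org.apache.accumulo.core.client.sample.Sampler;
import org.apache.accumulo.core.client.sample.SamplerConfiguration;
import org.apache.accumulo.core.data.Key;

// Keeps keys whose row hashCode() is divisible by a configurable modulus.
public class ModulusRowSampler implements Sampler {
  private int modulus;

  @Override
  public void init(SamplerConfiguration config) {
    // "modulus" mirrors RowSampler's option name; any option key would work.
    modulus = Integer.parseInt(config.getOptions().get("modulus"));
  }

  @Override
  public boolean accept(Key k) {
    // accept() must be a deterministic function of the key, so the sample
    // written into an RFile stays consistent with what a reader recomputes.
    return Math.floorMod(k.getRow().toString().hashCode(), modulus) == 0;
  }
}

Production samplers such as RowSampler extend AbstractHashSampler instead, which provides the configurable, well-distributed hash functions (murmur3_32, md5, sha1) selected via the "hasher" option in the test above.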