
Example 6 with Sampler

Use of org.apache.accumulo.core.client.sample.Sampler in project accumulo by apache.

The class RFileOperations, method openWriter:

@Override
protected FileSKVWriter openWriter(OpenWriterOperation options) throws IOException {
    AccumuloConfiguration acuconf = options.getTableConfiguration();
    // Data and index block sizes are configured in bytes and must fit in an int.
    long blockSize = acuconf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE);
    Preconditions.checkArgument((blockSize < Integer.MAX_VALUE && blockSize > 0), "table.file.compress.blocksize must be greater than 0 and less than " + Integer.MAX_VALUE);
    long indexBlockSize = acuconf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX);
    Preconditions.checkArgument((indexBlockSize < Integer.MAX_VALUE && indexBlockSize > 0), "table.file.compress.blocksize.index must be greater than 0 and less than " + Integer.MAX_VALUE);
    // Build a Sampler from the table configuration, if sampling is enabled for the table.
    SamplerConfigurationImpl samplerConfig = SamplerConfigurationImpl.newSamplerConfig(acuconf);
    Sampler sampler = null;
    if (samplerConfig != null) {
        sampler = SamplerFactory.newSampler(samplerConfig, acuconf, options.isAccumuloStartEnabled());
    }
    // Fall back to the table's configured compression type if none was requested.
    String compression = options.getCompression();
    compression = compression == null ? options.getTableConfiguration().get(Property.TABLE_FILE_COMPRESSION_TYPE) : compression;
    FSDataOutputStream outputStream = options.getOutputStream();
    Configuration conf = options.getConfiguration();
    if (outputStream == null) {
        // No stream was supplied, so create the file here; table-level replication
        // and block size settings override the HDFS defaults when set.
        int hrep = conf.getInt("dfs.replication", -1);
        int trep = acuconf.getCount(Property.TABLE_FILE_REPLICATION);
        int rep = hrep;
        if (trep > 0 && trep != hrep) {
            rep = trep;
        }
        long hblock = conf.getLong("dfs.block.size", 1 << 26);
        long tblock = acuconf.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE);
        long block = hblock;
        if (tblock > 0)
            block = tblock;
        int bufferSize = conf.getInt("io.file.buffer.size", 4096);
        String file = options.getFilename();
        FileSystem fs = options.getFileSystem();
        outputStream = fs.create(new Path(file), false, bufferSize, (short) rep, block);
    }
    // Wrap the stream with rate limiting and the cachable block format, then hand it to the RFile writer.
    CachableBlockFile.Writer _cbw = new CachableBlockFile.Writer(new RateLimitedOutputStream(outputStream, options.getRateLimiter()), compression, conf, acuconf);
    RFile.Writer writer = new RFile.Writer(_cbw, (int) blockSize, (int) indexBlockSize, samplerConfig, sampler);
    return writer;
}
Also used : Path(org.apache.hadoop.fs.Path) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) Configuration(org.apache.hadoop.conf.Configuration) SamplerConfigurationImpl(org.apache.accumulo.core.sample.impl.SamplerConfigurationImpl) RateLimitedOutputStream(org.apache.accumulo.core.file.streams.RateLimitedOutputStream) Sampler(org.apache.accumulo.core.client.sample.Sampler) FileSystem(org.apache.hadoop.fs.FileSystem) CachableBlockFile(org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) FileSKVWriter(org.apache.accumulo.core.file.FileSKVWriter)
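
The writer above only builds a Sampler when the table configuration enables one. For context, here is a minimal sketch of turning on row-based sampling through the public client API, so that configuration-driven writers like openWriter pick it up; the Connector instance, table name, and option values are illustrative assumptions, not part of the example above.

import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.sample.RowSampler;
import org.apache.accumulo.core.client.sample.SamplerConfiguration;

public class EnableSamplerSketch {
    // Enable row-based sampling on a table. Writers that consult the table
    // configuration, as openWriter does via SamplerConfigurationImpl.newSamplerConfig,
    // will then write sample data alongside the full data in each RFile.
    static void enableRowSampler(Connector conn, String table) throws Exception {
        SamplerConfiguration sc = new SamplerConfiguration(RowSampler.class.getName());
        sc.addOption("hasher", "murmur3_32"); // hash function used to select rows
        sc.addOption("modulus", "1009"); // rows whose hash % modulus == 0 are sampled
        conn.tableOperations().setSamplerConfiguration(table, sc);
    }
}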

Example 7 with Sampler

Use of org.apache.accumulo.core.client.sample.Sampler in project accumulo by apache.

The class RFileTest, method testSample:

@Test
public void testSample() throws IOException {
    int num = 10000;
    for (int sampleBufferSize : new int[] { 1 << 10, 1 << 20 }) {
        // force sample buffer to flush for smaller data
        RFile.setSampleBufferSize(sampleBufferSize);
        // vary the sampler modulus; roughly 1 in modulus rows lands in the sample
        for (int modulus : new int[] { 19, 103, 1019 }) {
            Hasher dataHasher = Hashing.md5().newHasher();
            List<Entry<Key, Value>> sampleData = new ArrayList<>();
            ConfigurationCopy sampleConf = new ConfigurationCopy(conf == null ? DefaultConfiguration.getInstance() : conf);
            sampleConf.set(Property.TABLE_SAMPLER, RowSampler.class.getName());
            sampleConf.set(Property.TABLE_SAMPLER_OPTS + "hasher", "murmur3_32");
            sampleConf.set(Property.TABLE_SAMPLER_OPTS + "modulus", modulus + "");
            Sampler sampler = SamplerFactory.newSampler(SamplerConfigurationImpl.newSamplerConfig(sampleConf), sampleConf);
            TestRFile trf = new TestRFile(sampleConf);
            trf.openWriter();
            for (int i = 0; i < num; i++) {
                add(trf, newKey(i, 0), newValue(i, 0), dataHasher, sampleData, sampler);
                add(trf, newKey(i, 1), newValue(i, 1), dataHasher, sampleData, sampler);
            }
            HashCode expectedDataHash = dataHasher.hash();
            trf.closeWriter();
            trf.openReader();
            // read back only the sampled subset and compare it against the keys the sampler accepted
            FileSKVIterator sample = trf.reader.getSample(SamplerConfigurationImpl.newSamplerConfig(sampleConf));
            checkSample(sample, sampleData);
            Assert.assertEquals(expectedDataHash, hash(trf.reader));
            SampleIE ie = new SampleIE(SamplerConfigurationImpl.newSamplerConfig(sampleConf).toSamplerConfiguration());
            for (int i = 0; i < 3; i++) {
                // test opening and closing deep copies a few times.
                trf.reader.closeDeepCopies();
                sample = trf.reader.getSample(SamplerConfigurationImpl.newSamplerConfig(sampleConf));
                SortedKeyValueIterator<Key, Value> sampleDC1 = sample.deepCopy(ie);
                SortedKeyValueIterator<Key, Value> sampleDC2 = sample.deepCopy(ie);
                SortedKeyValueIterator<Key, Value> sampleDC3 = trf.reader.deepCopy(ie);
                // a deep copy with a null sampler config iterates over the full data, not the sample
                SortedKeyValueIterator<Key, Value> allDC1 = sampleDC1.deepCopy(new SampleIE(null));
                SortedKeyValueIterator<Key, Value> allDC2 = sample.deepCopy(new SampleIE(null));
                Assert.assertEquals(expectedDataHash, hash(allDC1));
                Assert.assertEquals(expectedDataHash, hash(allDC2));
                checkSample(sample, sampleData);
                checkSample(sampleDC1, sampleData);
                checkSample(sampleDC2, sampleData);
                checkSample(sampleDC3, sampleData);
            }
            trf.reader.closeDeepCopies();
            trf.closeReader();
        }
    }
}
Also used : FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) ConfigurationCopy(org.apache.accumulo.core.conf.ConfigurationCopy) ArrayList(java.util.ArrayList) RowSampler(org.apache.accumulo.core.client.sample.RowSampler) Hasher(com.google.common.hash.Hasher) Entry(java.util.Map.Entry) HashCode(com.google.common.hash.HashCode) Sampler(org.apache.accumulo.core.client.sample.Sampler) Value(org.apache.accumulo.core.data.Value) Key(org.apache.accumulo.core.data.Key) PartialKey(org.apache.accumulo.core.data.PartialKey) CryptoTest(org.apache.accumulo.core.security.crypto.CryptoTest) Test(org.junit.Test)
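
The test drives samplers through the internal SamplerFactory and SamplerConfigurationImpl. A minimal standalone sketch below exercises the public Sampler interface directly with the same options the test uses; the row format and key count are illustrative assumptions.

import org.apache.accumulo.core.client.sample.RowSampler;
import org.apache.accumulo.core.client.sample.SamplerConfiguration;
import org.apache.accumulo.core.data.Key;

public class SamplerSketch {
    public static void main(String[] args) {
        // Configure a RowSampler as in the test: murmur3_32 hash, small modulus.
        SamplerConfiguration sc = new SamplerConfiguration(RowSampler.class.getName());
        sc.addOption("hasher", "murmur3_32");
        sc.addOption("modulus", "19");
        RowSampler sampler = new RowSampler();
        sampler.init(sc);
        // A key is in the sample when the hash of its row, mod 19, is zero,
        // so roughly 1 in 19 distinct rows should be accepted.
        int accepted = 0;
        for (int i = 0; i < 1000; i++) {
            if (sampler.accept(new Key(String.format("row%06d", i)))) {
                accepted++;
            }
        }
        System.out.println(accepted + " of 1000 keys accepted into the sample");
    }
}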

Aggregations

Sampler (org.apache.accumulo.core.client.sample.Sampler): 7 usages
RowSampler (org.apache.accumulo.core.client.sample.RowSampler): 5 usages
ConfigurationCopy (org.apache.accumulo.core.conf.ConfigurationCopy): 5 usages
Key (org.apache.accumulo.core.data.Key): 5 usages
Value (org.apache.accumulo.core.data.Value): 5 usages
SamplerConfigurationImpl (org.apache.accumulo.core.sample.impl.SamplerConfigurationImpl): 4 usages
Test (org.junit.Test): 4 usages
TreeMap (java.util.TreeMap): 3 usages
MemoryIterator (org.apache.accumulo.tserver.InMemoryMap.MemoryIterator): 3 usages
ArrayList (java.util.ArrayList): 2 usages
Entry (java.util.Map.Entry): 2 usages
PartialKey (org.apache.accumulo.core.data.PartialKey): 2 usages
FileSKVIterator (org.apache.accumulo.core.file.FileSKVIterator): 2 usages
CryptoTest (org.apache.accumulo.core.security.crypto.CryptoTest): 2 usages
HashCode (com.google.common.hash.HashCode): 1 usage
Hasher (com.google.common.hash.Hasher): 1 usage
AbstractMap (java.util.AbstractMap): 1 usage
AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean): 1 usage
AccumuloConfiguration (org.apache.accumulo.core.conf.AccumuloConfiguration): 1 usage
Range (org.apache.accumulo.core.data.Range): 1 usage