Search in sources :

Example 1 with SamplerConfiguration

use of org.apache.accumulo.core.client.sample.SamplerConfiguration in project accumulo by apache.

the class RFileTest method testSampling.

/**
 * Writes test data to an RFile with a sampler configured, then checks that a scanner using the
 * same sampler configuration returns exactly the entries a locally initialised {@link RowSampler}
 * accepts, and that clearing the sampler configuration returns the complete data set again.
 *
 * @throws Exception if creating, writing or reading the temporary RFile fails
 */
@Test
public void testSampling() throws Exception {
    SortedMap<Key, Value> testData1 = createTestData(1000, 2, 1);
    LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
    String testFile = createTmpTestFile();
    SamplerConfiguration sc = new SamplerConfiguration(RowSampler.class).setOptions(ImmutableMap.of("hasher", "murmur3_32", "modulus", "19"));
    RFileWriter writer = RFile.newWriter().to(testFile).withFileSystem(localFs).withSampler(sc).build();
    try {
        writer.append(testData1.entrySet());
    } finally {
        // Close even when append throws so the underlying file is not left open.
        writer.close();
    }
    // Build the expected sample independently of the file by running the same sampler
    // configuration over the in-memory test data.
    RowSampler rowSampler = new RowSampler();
    rowSampler.init(sc);
    SortedMap<Key, Value> sampleData = new TreeMap<>();
    for (Entry<Key, Value> e : testData1.entrySet()) {
        if (rowSampler.accept(e.getKey())) {
            sampleData.put(e.getKey(), e.getValue());
        }
    }
    // Sanity check: the sampler must actually drop something, otherwise the comparison
    // below could not tell sampled reads from full reads.
    Assert.assertTrue(sampleData.size() < testData1.size());
    Scanner scanner = RFile.newScanner().from(testFile).withFileSystem(localFs).build();
    try {
        scanner.setSamplerConfiguration(sc);
        Assert.assertEquals(sampleData, toMap(scanner));
        scanner.clearSamplerConfiguration();
        Assert.assertEquals(testData1, toMap(scanner));
    } finally {
        // Release the scanner's resources even when one of the assertions fails.
        scanner.close();
    }
}
Also used : RowSampler(org.apache.accumulo.core.client.sample.RowSampler) Scanner(org.apache.accumulo.core.client.Scanner) SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) NewTableConfiguration(org.apache.accumulo.core.client.admin.NewTableConfiguration) Configuration(org.apache.hadoop.conf.Configuration) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration) DefaultConfiguration(org.apache.accumulo.core.conf.DefaultConfiguration) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) Value(org.apache.accumulo.core.data.Value) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration) TreeMap(java.util.TreeMap) Key(org.apache.accumulo.core.data.Key) Test(org.junit.Test)

Example 2 with SamplerConfiguration

use of org.apache.accumulo.core.client.sample.SamplerConfiguration in project accumulo by apache.

the class AccumuloFileOutputFormatTest method validateConfiguration.

/**
 * Stores file-output settings on a {@link JobConf} through the {@code AccumuloFileOutputFormat}
 * setters and checks that the configuration derived from that job reports the same values.
 * Two rounds are run against two separate jobs; the second round sets no summarizers and
 * expects none to be reported.
 */
@Test
public void validateConfiguration() throws IOException, InterruptedException {
    // ---- Round one: every setting, plus two summarizers ----
    int replication = 7;
    long fileBlockSize = 300L;
    long dataBlockSize = 50L;
    long indexBlockSize = 10L;
    String compression = "snappy";

    SamplerConfiguration sampler = new SamplerConfiguration(RowSampler.class.getName());
    sampler.addOption("hasher", "murmur3_32");
    sampler.addOption("modulus", "109");

    SummarizerConfiguration visibilitySummarizer = SummarizerConfiguration.builder(VisibilitySummarizer.class).addOption(CountingSummarizer.MAX_COUNTERS_OPT, 2048).build();
    SummarizerConfiguration familySummarizer = SummarizerConfiguration.builder(FamilySummarizer.class).addOption(CountingSummarizer.MAX_COUNTERS_OPT, 256).build();

    JobConf job = new JobConf();
    AccumuloFileOutputFormat.setReplication(job, replication);
    AccumuloFileOutputFormat.setFileBlockSize(job, fileBlockSize);
    AccumuloFileOutputFormat.setDataBlockSize(job, dataBlockSize);
    AccumuloFileOutputFormat.setIndexBlockSize(job, indexBlockSize);
    AccumuloFileOutputFormat.setCompressionType(job, compression);
    AccumuloFileOutputFormat.setSampler(job, sampler);
    AccumuloFileOutputFormat.setSummarizers(job, visibilitySummarizer, familySummarizer);

    AccumuloConfiguration derived = FileOutputConfigurator.getAccumuloConfiguration(AccumuloFileOutputFormat.class, job);
    assertEquals(replication, derived.getCount(Property.TABLE_FILE_REPLICATION));
    assertEquals(fileBlockSize, derived.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE));
    assertEquals(dataBlockSize, derived.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE));
    assertEquals(indexBlockSize, derived.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX));
    assertEquals(compression, derived.get(Property.TABLE_FILE_COMPRESSION_TYPE));
    assertEquals(new SamplerConfigurationImpl(sampler), SamplerConfigurationImpl.newSamplerConfig(derived));

    Collection<SummarizerConfiguration> reportedSummarizers = SummarizerConfiguration.fromTableProperties(derived);
    assertEquals(2, reportedSummarizers.size());
    assertTrue(reportedSummarizers.contains(visibilitySummarizer));
    assertTrue(reportedSummarizers.contains(familySummarizer));

    // ---- Round two: different values on a fresh job, no summarizers ----
    replication = 17;
    fileBlockSize = 1300L;
    dataBlockSize = 150L;
    indexBlockSize = 110L;
    compression = "lzo";

    sampler = new SamplerConfiguration(RowSampler.class.getName());
    sampler.addOption("hasher", "md5");
    sampler.addOption("modulus", "100003");

    job = new JobConf();
    AccumuloFileOutputFormat.setReplication(job, replication);
    AccumuloFileOutputFormat.setFileBlockSize(job, fileBlockSize);
    AccumuloFileOutputFormat.setDataBlockSize(job, dataBlockSize);
    AccumuloFileOutputFormat.setIndexBlockSize(job, indexBlockSize);
    AccumuloFileOutputFormat.setCompressionType(job, compression);
    AccumuloFileOutputFormat.setSampler(job, sampler);

    derived = FileOutputConfigurator.getAccumuloConfiguration(AccumuloFileOutputFormat.class, job);
    assertEquals(replication, derived.getCount(Property.TABLE_FILE_REPLICATION));
    assertEquals(fileBlockSize, derived.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE));
    assertEquals(dataBlockSize, derived.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE));
    assertEquals(indexBlockSize, derived.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX));
    assertEquals(compression, derived.get(Property.TABLE_FILE_COMPRESSION_TYPE));
    assertEquals(new SamplerConfigurationImpl(sampler), SamplerConfigurationImpl.newSamplerConfig(derived));

    // No summarizers were set on the second job, so none should be reported.
    reportedSummarizers = SummarizerConfiguration.fromTableProperties(derived);
    assertEquals(0, reportedSummarizers.size());
}
Also used : RowSampler(org.apache.accumulo.core.client.sample.RowSampler) SamplerConfigurationImpl(org.apache.accumulo.core.sample.impl.SamplerConfigurationImpl) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration) SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) JobConf(org.apache.hadoop.mapred.JobConf) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) Test(org.junit.Test)

Example 3 with SamplerConfiguration

use of org.apache.accumulo.core.client.sample.SamplerConfiguration in project accumulo by apache.

the class OfflineIterator method createIterator.

/**
 * Builds the scan-time iterator stack for one extent of an offline table.
 *
 * <p>Readers left over from a previous call are closed first. A reader is then opened for each
 * file in {@code absFiles}; when the scan options carry a sampler configuration, each reader is
 * replaced by its sample view. The readers are merged, wrapped by the system scan iterators and
 * finally by the configured scan-scope iterators.
 *
 * @param extent the extent whose data is to be read
 * @param absFiles paths of the files backing the extent
 * @return the top-level iterator over the extent's data
 * @throws TableNotFoundException declared for the table lookups performed here
 * @throws AccumuloException declared for the table property lookup
 * @throws IOException if a file reader cannot be opened or closed
 * @throws SampleNotPresentException if a sampler configuration was requested for the scan but the
 *         table's sampler configuration is missing or different, or a file has no matching sample
 */
private SortedKeyValueIterator<Key, Value> createIterator(KeyExtent extent, List<String> absFiles) throws TableNotFoundException, AccumuloException, IOException {
    // TODO share code w/ tablet - ACCUMULO-1303
    // possible race condition here, if table is renamed
    String tableName = Tables.getTableName(conn.getInstance(), tableId);
    AccumuloConfiguration acuTableConf = new ConfigurationCopy(conn.tableOperations().getProperties(tableName));
    Configuration conf = CachedConfiguration.getInstance();
    // Release the readers opened by the previous invocation before opening new ones.
    for (SortedKeyValueIterator<Key, Value> reader : readers) {
        ((FileSKVIterator) reader).close();
    }
    readers.clear();
    SamplerConfiguration scannerSamplerConfig = options.getSamplerConfiguration();
    SamplerConfigurationImpl scannerSamplerConfigImpl = scannerSamplerConfig == null ? null : new SamplerConfigurationImpl(scannerSamplerConfig);
    SamplerConfigurationImpl samplerConfImpl = SamplerConfigurationImpl.newSamplerConfig(acuTableConf);
    // A requested sample can only be served when the table is configured with the very same
    // sampler; a missing or different table-level configuration is reported up front.
    if (scannerSamplerConfigImpl != null && (samplerConfImpl == null || !scannerSamplerConfigImpl.equals(samplerConfImpl))) {
        throw new SampleNotPresentException();
    }
    // TODO need to close files - ACCUMULO-1303
    for (String file : absFiles) {
        FileSystem fs = VolumeConfiguration.getVolume(file, conf, config).getFileSystem();
        FileSKVIterator reader = FileOperations.getInstance().newReaderBuilder().forFile(file, fs, conf).withTableConfiguration(acuTableConf).build();
        if (scannerSamplerConfigImpl != null) {
            FileSKVIterator sample = reader.getSample(scannerSamplerConfigImpl);
            if (sample == null) {
                // The reader was just opened and is not tracked in 'readers' yet, so nothing
                // else would ever close it. Close it here; a failure while closing is attached
                // as suppressed so callers still see SampleNotPresentException.
                SampleNotPresentException notPresent = new SampleNotPresentException();
                try {
                    reader.close();
                } catch (IOException closeFailure) {
                    notPresent.addSuppressed(closeFailure);
                }
                throw notPresent;
            }
            reader = sample;
        }
        readers.add(reader);
    }
    MultiIterator multiIter = new MultiIterator(readers, extent);
    OfflineIteratorEnvironment iterEnv = new OfflineIteratorEnvironment(authorizations, acuTableConf, false, samplerConfImpl == null ? null : samplerConfImpl.toSamplerConfiguration());
    byte[] defaultSecurityLabel;
    ColumnVisibility cv = new ColumnVisibility(acuTableConf.get(Property.TABLE_DEFAULT_SCANTIME_VISIBILITY));
    defaultSecurityLabel = cv.getExpression();
    SortedKeyValueIterator<Key, Value> visFilter = IteratorUtil.setupSystemScanIterators(multiIter, new HashSet<>(options.fetchedColumns), authorizations, defaultSecurityLabel);
    return iterEnv.getTopLevelIterator(IteratorUtil.loadIterators(IteratorScope.scan, visFilter, extent, acuTableConf, options.serverSideIteratorList, options.serverSideIteratorOptions, iterEnv, false));
}
Also used : FileSKVIterator(org.apache.accumulo.core.file.FileSKVIterator) ConfigurationCopy(org.apache.accumulo.core.conf.ConfigurationCopy) MultiIterator(org.apache.accumulo.core.iterators.system.MultiIterator) Configuration(org.apache.hadoop.conf.Configuration) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration) VolumeConfiguration(org.apache.accumulo.core.volume.VolumeConfiguration) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) CachedConfiguration(org.apache.accumulo.core.util.CachedConfiguration) SamplerConfigurationImpl(org.apache.accumulo.core.sample.impl.SamplerConfigurationImpl) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration) SampleNotPresentException(org.apache.accumulo.core.client.SampleNotPresentException) FileSystem(org.apache.hadoop.fs.FileSystem) KeyValue(org.apache.accumulo.core.data.KeyValue) Value(org.apache.accumulo.core.data.Value) ColumnVisibility(org.apache.accumulo.core.security.ColumnVisibility) Key(org.apache.accumulo.core.data.Key) PartialKey(org.apache.accumulo.core.data.PartialKey) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration)

Example 4 with SamplerConfiguration

use of org.apache.accumulo.core.client.sample.SamplerConfiguration in project accumulo by apache.

the class SamplerConfigurationImpl method toSamplerConfiguration.

/**
 * Creates a {@link SamplerConfiguration} from this object's {@code className} and
 * {@code options}.
 *
 * @return a newly constructed sampler configuration
 */
public SamplerConfiguration toSamplerConfiguration() {
    return new SamplerConfiguration(className).setOptions(options);
}
Also used : TSamplerConfiguration(org.apache.accumulo.core.tabletserver.thrift.TSamplerConfiguration) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration)

Example 5 with SamplerConfiguration

use of org.apache.accumulo.core.client.sample.SamplerConfiguration in project accumulo by apache.

the class SampleIT method assertSampleNotPresent.

/**
 * Checks, for each scanner, that iterating with the given sampler configuration fails with
 * {@link SampleNotPresentException}, and that iterating again after the sampler configuration
 * is cleared completes. The sampler configuration each scanner had on entry is put back
 * afterwards.
 *
 * @param sc sampler configuration expected to have no sample data available
 * @param scanners scanners to check
 */
private void assertSampleNotPresent(SamplerConfiguration sc, ScannerBase... scanners) {
    for (ScannerBase current : scanners) {
        // Remember the scanner's own setting so it can be restored at the end.
        SamplerConfiguration previous = current.getSamplerConfiguration();

        current.setSamplerConfiguration(sc);
        try {
            // Drain the scanner; the exception is expected somewhere during iteration.
            for (Entry<Key, Value> kv : current) {
                kv.getKey();
            }
            Assert.fail("Expected SampleNotPresentException, but it did not happen : " + current.getClass().getSimpleName());
        } catch (SampleNotPresentException expected) {
            // Intentionally empty: this exception is the outcome being checked for.
        }

        // Without a sampler configuration the same scanner must iterate without failing.
        current.clearSamplerConfiguration();
        for (Entry<Key, Value> kv : current) {
            kv.getKey();
        }

        if (previous != null) {
            current.setSamplerConfiguration(previous);
        } else {
            current.clearSamplerConfiguration();
        }
    }
}
Also used : SampleNotPresentException(org.apache.accumulo.core.client.SampleNotPresentException) ScannerBase(org.apache.accumulo.core.client.ScannerBase) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration) Value(org.apache.accumulo.core.data.Value) Key(org.apache.accumulo.core.data.Key)

Aggregations

SamplerConfiguration (org.apache.accumulo.core.client.sample.SamplerConfiguration): 12, SampleNotPresentException (org.apache.accumulo.core.client.SampleNotPresentException): 4, RowSampler (org.apache.accumulo.core.client.sample.RowSampler): 4, Test (org.junit.Test): 4, SummarizerConfiguration (org.apache.accumulo.core.client.summary.SummarizerConfiguration): 3, AccumuloConfiguration (org.apache.accumulo.core.conf.AccumuloConfiguration): 3, Key (org.apache.accumulo.core.data.Key): 3, Value (org.apache.accumulo.core.data.Value): 3, SamplerConfigurationImpl (org.apache.accumulo.core.sample.impl.SamplerConfigurationImpl): 3, Connector (org.apache.accumulo.core.client.Connector): 2, Scanner (org.apache.accumulo.core.client.Scanner): 2, Configuration (org.apache.hadoop.conf.Configuration): 2, IOException (java.io.IOException): 1, TreeMap (java.util.TreeMap): 1, BatchScanner (org.apache.accumulo.core.client.BatchScanner): 1, BatchWriter (org.apache.accumulo.core.client.BatchWriter): 1, IteratorSetting (org.apache.accumulo.core.client.IteratorSetting): 1, ScannerBase (org.apache.accumulo.core.client.ScannerBase): 1, CompactionConfig (org.apache.accumulo.core.client.admin.CompactionConfig): 1, NewTableConfiguration (org.apache.accumulo.core.client.admin.NewTableConfiguration): 1