Search in sources :

Example 11 with SummarizerConfiguration

use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.

the class TooManyDeletesIT method tooManyDeletesCompactionStrategyIT.

/**
 * Creates a table that summarizes deletes and uses {@link TooManyDeletesCompactionStrategy} with a threshold of ".25", then checks that
 * writing enough delete entries eventually results in summary data showing 700 total entries and no deletes.
 */
@Test
public void tooManyDeletesCompactionStrategyIT() throws Exception {
    Connector conn = getConnector();
    String tableName = getUniqueNames(1)[0];

    SummarizerConfiguration deletesConfig = SummarizerConfiguration.builder(DeletesSummarizer.class).build();

    // TODO open issue about programmatic config of compaction strategies
    NewTableConfiguration tableConfig = new NewTableConfiguration().enableSummarization(deletesConfig);
    HashMap<String, String> tableProps = new HashMap<>();
    tableProps.put(Property.TABLE_COMPACTION_STRATEGY.getKey(), TooManyDeletesCompactionStrategy.class.getName());
    tableProps.put(Property.TABLE_COMPACTION_STRATEGY_PREFIX.getKey() + TooManyDeletesCompactionStrategy.THRESHOLD_OPT, ".25");
    // a high major compaction ratio keeps the file count alone from triggering a compaction
    tableProps.put(Property.TABLE_MAJC_RATIO.getKey(), "10");
    tableConfig.setProperties(tableProps);
    conn.tableOperations().create(tableName, tableConfig);

    // Phase 1: write 1000 plain entries; the summary should report no deletes.
    try (BatchWriter writer = conn.createBatchWriter(tableName, new BatchWriterConfig())) {
        for (int row = 0; row < 1000; row++) {
            Mutation mutation = new Mutation("row" + row);
            mutation.put("f", "q", "v" + row);
            writer.addMutation(mutation);
        }
    }
    List<Summary> summaries = conn.tableOperations().summaries(tableName).flush(true).withConfiguration(deletesConfig).retrieve();
    Assert.assertEquals(1, summaries.size());
    Summary summary = summaries.get(0);
    long totalSeen = summary.getStatistics().get(DeletesSummarizer.TOTAL_STAT);
    long deletesSeen = summary.getStatistics().get(DeletesSummarizer.DELETES_STAT);
    Assert.assertEquals(1000L, totalSeen);
    Assert.assertEquals(0L, deletesSeen);

    // Phase 2: delete rows 0..99; the summary should now count 1100 entries, 100 of them deletes.
    try (BatchWriter writer = conn.createBatchWriter(tableName, new BatchWriterConfig())) {
        for (int row = 0; row < 100; row++) {
            Mutation mutation = new Mutation("row" + row);
            mutation.putDelete("f", "q");
            writer.addMutation(mutation);
        }
    }
    summaries = conn.tableOperations().summaries(tableName).flush(true).withConfiguration(deletesConfig).retrieve();
    Assert.assertEquals(1, summaries.size());
    summary = summaries.get(0);
    totalSeen = summary.getStatistics().get(DeletesSummarizer.TOTAL_STAT);
    deletesSeen = summary.getStatistics().get(DeletesSummarizer.DELETES_STAT);
    Assert.assertEquals(1100L, totalSeen);
    Assert.assertEquals(100L, deletesSeen);

    // Phase 3: delete rows 100..299 as well.
    try (BatchWriter writer = conn.createBatchWriter(tableName, new BatchWriterConfig())) {
        for (int row = 100; row < 300; row++) {
            Mutation mutation = new Mutation("row" + row);
            mutation.putDelete("f", "q");
            writer.addMutation(mutation);
        }
    }

    // After a flush occurs Accumulo will check whether a major compaction is needed. That check should consult the compaction strategy,
    // which should decide to compact all files based on the number of deletes.
    conn.tableOperations().flush(tableName, null, null, true);

    // Poll the summaries (without flushing again) until they reflect the expected compaction.
    boolean compacted = false;
    while (!compacted) {
        summaries = conn.tableOperations().summaries(tableName).flush(false).withConfiguration(deletesConfig).retrieve();
        Assert.assertEquals(1, summaries.size());
        summary = summaries.get(0);
        long total = summary.getStatistics().get(DeletesSummarizer.TOTAL_STAT);
        long deletes = summary.getStatistics().get(DeletesSummarizer.DELETES_STAT);
        // 700 entries with no deletes means a compaction was triggered based on the number of deletes
        compacted = total == 700 && deletes == 0;
        if (!compacted) {
            UtilWaitThread.sleep(50);
        }
    }
}
Also used : Connector(org.apache.accumulo.core.client.Connector) HashMap(java.util.HashMap) DeletesSummarizer(org.apache.accumulo.core.client.summary.summarizers.DeletesSummarizer) TooManyDeletesCompactionStrategy(org.apache.accumulo.tserver.compaction.strategies.TooManyDeletesCompactionStrategy) NewTableConfiguration(org.apache.accumulo.core.client.admin.NewTableConfiguration) BatchWriterConfig(org.apache.accumulo.core.client.BatchWriterConfig) Summary(org.apache.accumulo.core.client.summary.Summary) BatchWriter(org.apache.accumulo.core.client.BatchWriter) Mutation(org.apache.accumulo.core.data.Mutation) SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) Test(org.junit.Test)

Example 12 with SummarizerConfiguration

use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.

the class SummaryCollectionTest method testDeleted.

/**
 * Merges several {@link SummaryCollection}s (two built from single file summaries, two empty ones, one of which is merged twice) and checks
 * the resulting file statistics, both on the merged collection and on a copy rebuilt from its thrift form.
 */
@Test
public void testDeleted() {
    SummarizerConfiguration familyConf = SummarizerConfiguration.builder(FamilySummarizer.class).build();

    // first file summary: a single counter, constructed with a false flag
    HashMap<String, Long> firstStats = new HashMap<>();
    firstStats.put("c:foo", 9L);
    FileSummary firstFile = new FileSummary(familyConf, firstStats, false);
    SummaryCollection sc1 = new SummaryCollection(Collections.singleton(firstFile));

    // second file summary: two counters, constructed with a true flag
    HashMap<String, Long> secondStats = new HashMap<>();
    secondStats.put("c:foo", 5L);
    secondStats.put("c:bar", 3L);
    FileSummary secondFile = new FileSummary(familyConf, secondStats, true);
    SummaryCollection sc2 = new SummaryCollection(Collections.singleton(secondFile));

    // two collections without any file summaries, built via different constructors
    SummaryCollection sc3 = new SummaryCollection(Collections.emptyList());
    SummaryCollection sc4 = new SummaryCollection(Collections.emptyList(), true);

    SummarizerFactory summarizerFactory = new SummarizerFactory();
    SummaryCollection merged = new SummaryCollection();
    // sc4 is intentionally merged twice
    for (SummaryCollection part : Arrays.asList(sc1, sc2, sc3, sc4, sc4)) {
        merged.merge(part, summarizerFactory);
    }

    // the expectations must hold for the merged collection and for its thrift round trip
    SummaryCollection roundTripped = new SummaryCollection(merged.toThrift());
    for (SummaryCollection candidate : Arrays.asList(merged, roundTripped)) {
        List<Summary> summaries = candidate.getSummaries();
        Assert.assertEquals(1, summaries.size());
        FileStatistics fileStats = summaries.get(0).getFileStatistics();
        Assert.assertEquals(5, fileStats.getTotal());
        Assert.assertEquals(1, fileStats.getExtra());
        Assert.assertEquals(0, fileStats.getLarge());
        Assert.assertEquals(1, fileStats.getMissing());
        Assert.assertEquals(2, fileStats.getDeleted());
        Assert.assertEquals(4, fileStats.getInaccurate());
    }
}
Also used : FileStatistics(org.apache.accumulo.core.client.summary.Summary.FileStatistics) HashMap(java.util.HashMap) FamilySummarizer(org.apache.accumulo.core.client.summary.summarizers.FamilySummarizer) FileSummary(org.apache.accumulo.core.summary.SummaryCollection.FileSummary) Summary(org.apache.accumulo.core.client.summary.Summary) SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) Test(org.junit.Test)

Example 13 with SummarizerConfiguration

use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.

the class TooManyDeletesCompactionStrategy method gatherInformation.

/**
 * Collects total and delete entry counts for the files in the request and sets {@code shouldCompact} accordingly.
 *
 * <p>
 * For each file, summary data produced by an option-less {@link DeletesSummarizer} is used when exactly one such summary is returned;
 * otherwise the entry count recorded in the file's {@link DataFileValue} is used as the total. If a file without summary data reports zero
 * entries and {@code proceed_bns} is not set, no compaction is requested. Compaction is requested when the ratio of deletes to non-deletes
 * is at least {@code threshold}.
 *
 * @param request
 *          the compaction request supplying the files and their summaries
 * @throws IOException
 *           declared by the overridden method
 */
@Override
public void gatherInformation(MajorCompactionRequest request) throws IOException {
    super.gatherInformation(request);

    final String deletesSummarizerName = DeletesSummarizer.class.getName();
    // only accept summaries generated by a DeletesSummarizer configured without options
    Predicate<SummarizerConfiguration> summarizerPredicate = conf -> conf.getClassName().equals(deletesSummarizerName) && conf.getOptions().isEmpty();

    long totalEntries = 0;
    long deleteEntries = 0;

    for (Entry<FileRef, DataFileValue> fileEntry : request.getFiles().entrySet()) {
        Collection<Summary> fileSummaries = request.getSummaries(Collections.singleton(fileEntry.getKey()), summarizerPredicate);

        if (fileSummaries.size() != 1) {
            long estimatedEntries = fileEntry.getValue().getNumEntries();
            if (estimatedEntries == 0 && !proceed_bns) {
                shouldCompact = false;
                return;
            }
            // no summary data so use Accumulo's estimate of total entries in file
            totalEntries += estimatedEntries;
            continue;
        }

        Summary fileSummary = fileSummaries.iterator().next();
        totalEntries += fileSummary.getStatistics().get(TOTAL_STAT);
        deleteEntries += fileSummary.getStatistics().get(DELETES_STAT);
    }

    long liveEntries = totalEntries - deleteEntries;
    // a negative count of non-deletes is clear evidence that the estimates are off, so never compact in that case
    shouldCompact = liveEntries >= 0 && (deleteEntries / (double) liveEntries) >= threshold;
}
Also used : TOTAL_STAT(org.apache.accumulo.core.client.summary.summarizers.DeletesSummarizer.TOTAL_STAT) Summary(org.apache.accumulo.core.client.summary.Summary) CompactionPlan(org.apache.accumulo.tserver.compaction.CompactionPlan) DataFileValue(org.apache.accumulo.core.metadata.schema.DataFileValue) Logger(org.slf4j.Logger) Predicate(java.util.function.Predicate) SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) Collection(java.util.Collection) MajorCompactionRequest(org.apache.accumulo.tserver.compaction.MajorCompactionRequest) LoggerFactory(org.slf4j.LoggerFactory) IOException(java.io.IOException) DeletesSummarizer(org.apache.accumulo.core.client.summary.summarizers.DeletesSummarizer) DefaultCompactionStrategy(org.apache.accumulo.tserver.compaction.DefaultCompactionStrategy) WriterOptions(org.apache.accumulo.core.client.rfile.RFile.WriterOptions) DELETES_STAT(org.apache.accumulo.core.client.summary.summarizers.DeletesSummarizer.DELETES_STAT) Map(java.util.Map) Entry(java.util.Map.Entry) AccumuloFileOutputFormat(org.apache.accumulo.core.client.mapred.AccumuloFileOutputFormat) FileRef(org.apache.accumulo.server.fs.FileRef) Collections(java.util.Collections)

Example 14 with SummarizerConfiguration

use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.

the class AccumuloFileOutputFormatTest method validateConfiguration.

/**
 * Checks that settings applied to a {@link JobConf} through {@link AccumuloFileOutputFormat} can be read back from the
 * {@link AccumuloConfiguration} produced by {@link FileOutputConfigurator}. Two rounds are run with different values; only the first round
 * configures summarizers.
 */
@Test
public void validateConfiguration() throws IOException, InterruptedException {
    // ---- round one: includes two summarizer configurations ----
    int replication = 7;
    long fileBlockSize = 300L;
    long dataBlockSize = 50L;
    long indexBlockSize = 10L;
    String compression = "snappy";

    SamplerConfiguration samplerConfig = new SamplerConfiguration(RowSampler.class.getName());
    samplerConfig.addOption("hasher", "murmur3_32");
    samplerConfig.addOption("modulus", "109");

    SummarizerConfiguration visibilitySummarizer = SummarizerConfiguration.builder(VisibilitySummarizer.class)
            .addOption(CountingSummarizer.MAX_COUNTERS_OPT, 2048).build();
    SummarizerConfiguration familySummarizer = SummarizerConfiguration.builder(FamilySummarizer.class)
            .addOption(CountingSummarizer.MAX_COUNTERS_OPT, 256).build();

    JobConf job = new JobConf();
    AccumuloFileOutputFormat.setReplication(job, replication);
    AccumuloFileOutputFormat.setFileBlockSize(job, fileBlockSize);
    AccumuloFileOutputFormat.setDataBlockSize(job, dataBlockSize);
    AccumuloFileOutputFormat.setIndexBlockSize(job, indexBlockSize);
    AccumuloFileOutputFormat.setCompressionType(job, compression);
    AccumuloFileOutputFormat.setSampler(job, samplerConfig);
    AccumuloFileOutputFormat.setSummarizers(job, visibilitySummarizer, familySummarizer);

    AccumuloConfiguration acuconf = FileOutputConfigurator.getAccumuloConfiguration(AccumuloFileOutputFormat.class, job);
    assertEquals(7, acuconf.getCount(Property.TABLE_FILE_REPLICATION));
    assertEquals(300L, acuconf.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE));
    assertEquals(50L, acuconf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE));
    assertEquals(10L, acuconf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX));
    assertEquals("snappy", acuconf.get(Property.TABLE_FILE_COMPRESSION_TYPE));
    assertEquals(new SamplerConfigurationImpl(samplerConfig), SamplerConfigurationImpl.newSamplerConfig(acuconf));

    Collection<SummarizerConfiguration> summarizerConfigs = SummarizerConfiguration.fromTableProperties(acuconf);
    assertEquals(2, summarizerConfigs.size());
    assertTrue(summarizerConfigs.contains(visibilitySummarizer));
    assertTrue(summarizerConfigs.contains(familySummarizer));

    // ---- round two: fresh job, different values, no summarizers ----
    replication = 17;
    fileBlockSize = 1300L;
    dataBlockSize = 150L;
    indexBlockSize = 110L;
    compression = "lzo";

    samplerConfig = new SamplerConfiguration(RowSampler.class.getName());
    samplerConfig.addOption("hasher", "md5");
    samplerConfig.addOption("modulus", "100003");

    job = new JobConf();
    AccumuloFileOutputFormat.setReplication(job, replication);
    AccumuloFileOutputFormat.setFileBlockSize(job, fileBlockSize);
    AccumuloFileOutputFormat.setDataBlockSize(job, dataBlockSize);
    AccumuloFileOutputFormat.setIndexBlockSize(job, indexBlockSize);
    AccumuloFileOutputFormat.setCompressionType(job, compression);
    AccumuloFileOutputFormat.setSampler(job, samplerConfig);

    acuconf = FileOutputConfigurator.getAccumuloConfiguration(AccumuloFileOutputFormat.class, job);
    assertEquals(17, acuconf.getCount(Property.TABLE_FILE_REPLICATION));
    assertEquals(1300L, acuconf.getAsBytes(Property.TABLE_FILE_BLOCK_SIZE));
    assertEquals(150L, acuconf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE));
    assertEquals(110L, acuconf.getAsBytes(Property.TABLE_FILE_COMPRESSED_BLOCK_SIZE_INDEX));
    assertEquals("lzo", acuconf.get(Property.TABLE_FILE_COMPRESSION_TYPE));
    assertEquals(new SamplerConfigurationImpl(samplerConfig), SamplerConfigurationImpl.newSamplerConfig(acuconf));

    // no summarizers were set on the second job, so none should be read back
    summarizerConfigs = SummarizerConfiguration.fromTableProperties(acuconf);
    assertEquals(0, summarizerConfigs.size());
}
Also used : RowSampler(org.apache.accumulo.core.client.sample.RowSampler) SamplerConfigurationImpl(org.apache.accumulo.core.sample.impl.SamplerConfigurationImpl) SamplerConfiguration(org.apache.accumulo.core.client.sample.SamplerConfiguration) SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) JobConf(org.apache.hadoop.mapred.JobConf) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) Test(org.junit.Test)

Example 15 with SummarizerConfiguration

use of org.apache.accumulo.core.client.summary.SummarizerConfiguration in project accumulo by apache.

the class TableOperationsImpl method summaries.

/**
 * Returns a {@link SummaryRetriever} for the given table. The retriever accumulates an optional row range, the summarizer configurations
 * and/or class-name regex to match, and whether to flush first; {@code retrieve()} then issues the request and returns the summaries.
 *
 * @param tableName
 *          name of the table to fetch summaries for; resolved to a table id when {@code retrieve()} is called
 */
@Override
public SummaryRetriever summaries(String tableName) {
    return new SummaryRetriever() {

        // ---- request state; defaults mean: whole table, no explicit configurations, no regex, no flush ----

        private Text startRow = null;

        private Text endRow = null;

        private List<TSummarizerConfiguration> summariesToFetch = Collections.emptyList();

        private String summarizerClassRegex;

        private boolean flush = false;

        /** Rejects a row range whose start row is not strictly less than its end row. */
        private void requireStartBeforeEnd(Text start, Text end) {
            Preconditions.checkArgument(start.compareTo(end) < 0, "Start row must be less than end row : %s >= %s", start, end);
        }

        // ---- row range ----

        @Override
        public SummaryRetriever startRow(Text startRow) {
            Objects.requireNonNull(startRow);
            if (this.endRow != null) {
                requireStartBeforeEnd(startRow, this.endRow);
            }
            this.startRow = startRow;
            return this;
        }

        @Override
        public SummaryRetriever startRow(CharSequence startRow) {
            return startRow(new Text(startRow.toString()));
        }

        @Override
        public SummaryRetriever endRow(Text endRow) {
            Objects.requireNonNull(endRow);
            if (this.startRow != null) {
                requireStartBeforeEnd(this.startRow, endRow);
            }
            this.endRow = endRow;
            return this;
        }

        @Override
        public SummaryRetriever endRow(CharSequence endRow) {
            return endRow(new Text(endRow.toString()));
        }

        // ---- summarizer selection ----

        @Override
        public SummaryRetriever withConfiguration(SummarizerConfiguration... config) {
            Objects.requireNonNull(config);
            return withConfiguration(Arrays.asList(config));
        }

        @Override
        public SummaryRetriever withConfiguration(Collection<SummarizerConfiguration> configs) {
            Objects.requireNonNull(configs);
            // store the thrift form, which is what the summary request carries
            this.summariesToFetch = configs.stream().map(SummarizerConfigurationUtil::toThrift).collect(Collectors.toList());
            return this;
        }

        @Override
        public SummaryRetriever withMatchingConfiguration(String regex) {
            Objects.requireNonNull(regex);
            // Do a sanity check here to make sure that regex compiles, instead of having it fail on a tserver.
            Pattern.compile(regex);
            this.summarizerClassRegex = regex;
            return this;
        }

        // ---- flush flag ----

        @Override
        public SummaryRetriever flush(boolean b) {
            this.flush = b;
            return this;
        }

        // ---- execution ----

        @Override
        public List<Summary> retrieve() throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
            Table.ID tableId = Tables.getTableId(context.getInstance(), tableName);
            if (Tables.getTableState(context.getInstance(), tableId) == TableState.OFFLINE) {
                throw new TableOfflineException(context.getInstance(), tableId.canonicalID());
            }

            TRowRange rowRange = new TRowRange(TextUtil.getByteBuffer(startRow), TextUtil.getByteBuffer(endRow));
            TSummaryRequest request = new TSummaryRequest(tableId.canonicalID(), rowRange, summariesToFetch, summarizerClassRegex);

            if (flush) {
                _flush(tableId, startRow, endRow, true);
            }

            TSummaries result = ServerClient.execute(context, new TabletClientService.Client.Factory(), client -> {
                // start the request, then keep continuing the session until the server reports it finished
                TSummaries partial = client.startGetSummaries(Tracer.traceInfo(), context.rpcCreds(), request);
                while (!partial.finished) {
                    partial = client.contiuneGetSummaries(Tracer.traceInfo(), partial.sessionId);
                }
                return partial;
            });

            return new SummaryCollection(result).getSummaries();
        }
    };
}
Also used : RootTable(org.apache.accumulo.core.metadata.RootTable) MetadataTable(org.apache.accumulo.core.metadata.MetadataTable) TableOfflineException(org.apache.accumulo.core.client.TableOfflineException) Text(org.apache.hadoop.io.Text) SummarizerConfigurationUtil(org.apache.accumulo.core.summary.SummarizerConfigurationUtil) SummaryRetriever(org.apache.accumulo.core.client.admin.SummaryRetriever) TSummaryRequest(org.apache.accumulo.core.data.thrift.TSummaryRequest) TSummaries(org.apache.accumulo.core.data.thrift.TSummaries) Summary(org.apache.accumulo.core.client.summary.Summary) SummaryCollection(org.apache.accumulo.core.summary.SummaryCollection) Collection(java.util.Collection) ArrayList(java.util.ArrayList) List(java.util.List) LinkedList(java.util.LinkedList) Client(org.apache.accumulo.core.client.impl.thrift.ClientService.Client) TRowRange(org.apache.accumulo.core.data.thrift.TRowRange) SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) TSummarizerConfiguration(org.apache.accumulo.core.data.thrift.TSummarizerConfiguration)

Aggregations

SummarizerConfiguration (org.apache.accumulo.core.client.summary.SummarizerConfiguration): 41, Test (org.junit.Test): 33, HashMap (java.util.HashMap): 28, Key (org.apache.accumulo.core.data.Key): 22, Value (org.apache.accumulo.core.data.Value): 22, Collector (org.apache.accumulo.core.client.summary.Summarizer.Collector): 19, EntryLengthSummarizer (org.apache.accumulo.core.client.summary.summarizers.EntryLengthSummarizer): 16, Summary (org.apache.accumulo.core.client.summary.Summary): 13, NewTableConfiguration (org.apache.accumulo.core.client.admin.NewTableConfiguration): 10, CounterSummary (org.apache.accumulo.core.client.summary.CounterSummary): 10, Connector (org.apache.accumulo.core.client.Connector): 9, BatchWriter (org.apache.accumulo.core.client.BatchWriter): 8, Text (org.apache.hadoop.io.Text): 8, FamilySummarizer (org.apache.accumulo.core.client.summary.summarizers.FamilySummarizer): 7, ArrayList (java.util.ArrayList): 6, BatchWriterConfig (org.apache.accumulo.core.client.BatchWriterConfig): 6, IOException (java.io.IOException): 5, Collection (java.util.Collection): 5, Map (java.util.Map): 5, Entry (java.util.Map.Entry): 5