Search in sources :

Example 1 with SummaryCollection

use of org.apache.accumulo.core.summary.SummaryCollection in project accumulo by apache.

the class TableOperationsImpl method summaries.

/**
 * Builds a {@link SummaryRetriever} bound to {@code tableName}.
 *
 * <p>
 * The returned object accumulates optional settings (row bounds, summarizer configurations, a summarizer class regex, and a flush flag) through its fluent
 * setters. Nothing is requested from a server until {@code retrieve()} is called.
 */
@Override
public SummaryRetriever summaries(String tableName) {
    return new SummaryRetriever() {

        // Row bounds supplied by the caller; each stays null until its setter is invoked.
        private Text lowerBound = null;

        private Text upperBound = null;

        // Thrift form of the summarizer configurations to fetch; empty until withConfiguration() is called.
        private List<TSummarizerConfiguration> requestedConfigs = Collections.emptyList();

        // Regex supplied via withMatchingConfiguration(); null until set.
        private String classRegex;

        // When true, retrieve() invokes _flush() before asking for summaries.
        private boolean flushFirst = false;

        /**
         * Rejects a pair of bounds where the lower one is not strictly less than the upper one. A bound that has not been set yet (null) is not checked.
         */
        private void requireOrdered(Text lower, Text upper) {
            if (lower != null && upper != null) {
                Preconditions.checkArgument(lower.compareTo(upper) < 0, "Start row must be less than end row : %s >= %s", lower, upper);
            }
        }

        @Override
        public SummaryRetriever startRow(Text startRow) {
            Objects.requireNonNull(startRow);
            requireOrdered(startRow, upperBound);
            lowerBound = startRow;
            return this;
        }

        @Override
        public SummaryRetriever startRow(CharSequence startRow) {
            Text asText = new Text(startRow.toString());
            return startRow(asText);
        }

        @Override
        public SummaryRetriever endRow(Text endRow) {
            Objects.requireNonNull(endRow);
            requireOrdered(lowerBound, endRow);
            upperBound = endRow;
            return this;
        }

        @Override
        public SummaryRetriever endRow(CharSequence endRow) {
            Text asText = new Text(endRow.toString());
            return endRow(asText);
        }

        @Override
        public SummaryRetriever withConfiguration(Collection<SummarizerConfiguration> configs) {
            Objects.requireNonNull(configs);
            requestedConfigs = configs.stream()
                    .map(SummarizerConfigurationUtil::toThrift)
                    .collect(Collectors.toList());
            return this;
        }

        @Override
        public SummaryRetriever withConfiguration(SummarizerConfiguration... config) {
            Objects.requireNonNull(config);
            return withConfiguration(Arrays.asList(config));
        }

        @Override
        public SummaryRetriever withMatchingConfiguration(String regex) {
            Objects.requireNonNull(regex);
            // Compile eagerly so that a malformed regex is reported here rather than failing later on a tserver.
            Pattern.compile(regex);
            classRegex = regex;
            return this;
        }

        @Override
        public SummaryRetriever flush(boolean b) {
            flushFirst = b;
            return this;
        }

        /**
         * Resolves the table, refuses to proceed if it is offline, optionally flushes the requested row range, and then pages through the summary RPC until the
         * server marks the result as finished.
         */
        @Override
        public List<Summary> retrieve() throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
            Table.ID tableId = Tables.getTableId(context.getInstance(), tableName);
            TableState state = Tables.getTableState(context.getInstance(), tableId);
            if (state == TableState.OFFLINE) {
                throw new TableOfflineException(context.getInstance(), tableId.canonicalID());
            }

            TRowRange rowRange = new TRowRange(TextUtil.getByteBuffer(lowerBound), TextUtil.getByteBuffer(upperBound));
            TSummaryRequest summaryRequest = new TSummaryRequest(tableId.canonicalID(), rowRange, requestedConfigs, classRegex);

            if (flushFirst) {
                _flush(tableId, lowerBound, upperBound, true);
            }

            TSummaries finished = ServerClient.execute(context, new TabletClientService.Client.Factory(), client -> {
                // The first call opens a session; keep polling that session until the server reports completion.
                TSummaries partial = client.startGetSummaries(Tracer.traceInfo(), context.rpcCreds(), summaryRequest);
                while (!partial.finished) {
                    partial = client.contiuneGetSummaries(Tracer.traceInfo(), partial.sessionId);
                }
                return partial;
            });

            SummaryCollection collected = new SummaryCollection(finished);
            return collected.getSummaries();
        }
    };
}
Also used : RootTable(org.apache.accumulo.core.metadata.RootTable) MetadataTable(org.apache.accumulo.core.metadata.MetadataTable) TableOfflineException(org.apache.accumulo.core.client.TableOfflineException) Text(org.apache.hadoop.io.Text) SummarizerConfigurationUtil(org.apache.accumulo.core.summary.SummarizerConfigurationUtil) SummaryRetriever(org.apache.accumulo.core.client.admin.SummaryRetriever) TSummaryRequest(org.apache.accumulo.core.data.thrift.TSummaryRequest) TSummaries(org.apache.accumulo.core.data.thrift.TSummaries) Summary(org.apache.accumulo.core.client.summary.Summary) SummaryCollection(org.apache.accumulo.core.summary.SummaryCollection) Collection(java.util.Collection) ArrayList(java.util.ArrayList) List(java.util.List) LinkedList(java.util.LinkedList) Client(org.apache.accumulo.core.client.impl.thrift.ClientService.Client) TRowRange(org.apache.accumulo.core.data.thrift.TRowRange) SummaryCollection(org.apache.accumulo.core.summary.SummaryCollection) SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) TSummarizerConfiguration(org.apache.accumulo.core.data.thrift.TSummarizerConfiguration)

Example 2 with SummaryCollection

use of org.apache.accumulo.core.summary.SummaryCollection in project accumulo by apache.

the class MajorCompactionRequest method getSummaries.

/**
 * Returns all summaries present in each file.
 *
 * <p>
 * This method can only be called from {@link CompactionStrategy#gatherInformation(MajorCompactionRequest)}. Unfortunately, {@code gatherInformation()} is not
 * called before {@link CompactionStrategy#shouldCompact(MajorCompactionRequest)}. Therefore {@code shouldCompact()} should just return true when a
 * compaction strategy wants to use summary information.
 *
 * <p>
 * When using summaries to make compaction decisions, it's important to ensure that all summary data fits in the tablet server summary cache. The size of this
 * cache is configured by {@code tserver.cache.summary.size}. Also it's important to use the summarySelector predicate to only retrieve the needed summary
 * data. Otherwise unneeded summary data could be brought into the cache.
 *
 * <p>
 * Some files may contain data outside of a tablet's range. When {@link Summarizer}'s generate small amounts of summary data, multiple summaries may be stored
 * within a file for different row ranges. This will allow more accurate summaries to be returned for the case where a file has data outside a tablet's range.
 * However, some summary data outside of the tablet's range may still be included. When this happens {@link FileStatistics#getExtra()} will be non zero. Also,
 * it's good to be aware of the other potential causes of inaccuracies {@link FileStatistics#getInaccurate()}
 *
 * <p>
 * When this method is called with multiple files, it will automatically merge summary data using {@link Combiner#merge(Map, Map)}. If summary information is
 * needed for each file, then just call this method for each file.
 *
 * <p>
 * Writing a compaction strategy that uses summary information is a bit tricky. See the source code for {@link TooManyDeletesCompactionStrategy} as an example
 * of a compaction strategy.
 *
 * @param files
 *          the files whose summaries are loaded and merged
 * @param summarySelector
 *          predicate handed to the summary reader to select which summarizer configurations are loaded
 * @return the merged summaries of all given files
 * @throws IOException
 *           if loading summary data from a file fails
 * @throws IllegalStateException
 *           if no volume manager is available to this request
 *
 * @see Summarizer
 * @see TableOperations#addSummarizers(String, SummarizerConfiguration...)
 * @see AccumuloFileOutputFormat#setSummarizers(org.apache.hadoop.mapred.JobConf, SummarizerConfiguration...)
 * @see WriterOptions#withSummarizers(SummarizerConfiguration...)
 */
public List<Summary> getSummaries(Collection<FileRef> files, Predicate<SummarizerConfiguration> summarySelector) throws IOException {
    Preconditions.checkState(volumeManager != null, "Getting summaries is not supported at this time.  Its only supported when CompactionStrategy.gatherInformation() is called.");

    SummaryCollection merged = new SummaryCollection();
    SummarizerFactory factory = new SummarizerFactory(tableConfig);

    for (FileRef file : files) {
        // Resolve the file system that hosts this particular file before opening it.
        FileSystem fs = volumeManager.getVolumeByPath(file.path()).getFileSystem();
        Configuration conf = CachedConfiguration.getInstance();

        SummaryReader reader = SummaryReader.load(fs, conf, tableConfig, factory, file.path(), summarySelector, summaryCache, indexCache);

        // Restrict the per-file summaries to this request's extent, then fold them into the running total.
        SummaryCollection perFile = reader.getSummaries(Collections.singletonList(new Gatherer.RowRange(extent)));
        merged.merge(perFile, factory);
    }

    return merged.getSummaries();
}
Also used : SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) Configuration(org.apache.hadoop.conf.Configuration) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) CachedConfiguration(org.apache.accumulo.core.util.CachedConfiguration) FileRef(org.apache.accumulo.server.fs.FileRef) FileSystem(org.apache.hadoop.fs.FileSystem) SummarizerFactory(org.apache.accumulo.core.summary.SummarizerFactory) SummaryCollection(org.apache.accumulo.core.summary.SummaryCollection)

Example 3 with SummaryCollection

use of org.apache.accumulo.core.summary.SummaryCollection in project accumulo by apache.

the class RFileSummariesRetriever method read.

/**
 * Loads the summaries of every configured RFile source, restricted to the configured row range, and merges them into a single collection.
 *
 * <p>
 * All source input streams are closed before this method returns, whether it succeeds or fails. A failure to close one stream no longer prevents the
 * remaining streams from being closed. If reading fails, that failure is what the caller sees and any close failures are attached to it as suppressed
 * exceptions. If only closing fails, the first close failure is thrown with later ones suppressed.
 *
 * @return the merged summaries of all sources
 * @throws IOException
 *           if reading summary data fails, or if reading succeeded but closing a source stream failed
 */
@Override
public Collection<Summary> read() throws IOException {
    SummarizerFactory factory = new SummarizerFactory();
    AccumuloConfiguration acuconf = DefaultConfiguration.getInstance();
    Configuration conf = in.getFileSystem().getConf();
    RFileSource[] sources = in.getSources();
    // Remembers a failure from the read phase so close failures can be attached to it rather than replacing it.
    Throwable readFailure = null;
    try {
        SummaryCollection all = new SummaryCollection();
        for (RFileSource source : sources) {
            SummaryReader fileSummary = SummaryReader.load(conf, acuconf, source.getInputStream(), source.getLength(), summarySelector, factory);
            SummaryCollection sc = fileSummary.getSummaries(Collections.singletonList(new Gatherer.RowRange(startRow, endRow)));
            all.merge(sc, factory);
        }
        return all.getSummaries();
    } catch (Throwable t) {
        readFailure = t;
        throw t;
    } finally {
        IOException closeFailure = null;
        for (RFileSource source : sources) {
            try {
                source.getInputStream().close();
            } catch (IOException e) {
                // Keep going: one stream failing to close must not leak the others.
                if (readFailure != null) {
                    readFailure.addSuppressed(e);
                } else if (closeFailure != null) {
                    closeFailure.addSuppressed(e);
                } else {
                    closeFailure = e;
                }
            }
        }
        if (closeFailure != null) {
            throw closeFailure;
        }
    }
}
Also used : SummarizerConfiguration(org.apache.accumulo.core.client.summary.SummarizerConfiguration) DefaultConfiguration(org.apache.accumulo.core.conf.DefaultConfiguration) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration) Configuration(org.apache.hadoop.conf.Configuration) SummaryReader(org.apache.accumulo.core.summary.SummaryReader) SummarizerFactory(org.apache.accumulo.core.summary.SummarizerFactory) SummaryCollection(org.apache.accumulo.core.summary.SummaryCollection) AccumuloConfiguration(org.apache.accumulo.core.conf.AccumuloConfiguration)

Aggregations

SummarizerConfiguration (org.apache.accumulo.core.client.summary.SummarizerConfiguration)3 SummaryCollection (org.apache.accumulo.core.summary.SummaryCollection)3 AccumuloConfiguration (org.apache.accumulo.core.conf.AccumuloConfiguration)2 SummarizerFactory (org.apache.accumulo.core.summary.SummarizerFactory)2 Configuration (org.apache.hadoop.conf.Configuration)2 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 TableOfflineException (org.apache.accumulo.core.client.TableOfflineException)1 SummaryRetriever (org.apache.accumulo.core.client.admin.SummaryRetriever)1 Client (org.apache.accumulo.core.client.impl.thrift.ClientService.Client)1 Summary (org.apache.accumulo.core.client.summary.Summary)1 DefaultConfiguration (org.apache.accumulo.core.conf.DefaultConfiguration)1 TRowRange (org.apache.accumulo.core.data.thrift.TRowRange)1 TSummaries (org.apache.accumulo.core.data.thrift.TSummaries)1 TSummarizerConfiguration (org.apache.accumulo.core.data.thrift.TSummarizerConfiguration)1 TSummaryRequest (org.apache.accumulo.core.data.thrift.TSummaryRequest)1 MetadataTable (org.apache.accumulo.core.metadata.MetadataTable)1 RootTable (org.apache.accumulo.core.metadata.RootTable)1