Use of org.apache.accumulo.core.summary.SummaryCollection in the Apache Accumulo project.
Class TableOperationsImpl, method summaries.
/**
 * Builds a fluent {@link SummaryRetriever} bound to {@code tableName}.
 *
 * <p>
 * The returned object only accumulates options (row bounds, summarizer selection, flush flag); no table lookup or RPC happens until
 * {@link SummaryRetriever#retrieve()} is invoked.
 *
 * @param tableName
 *          name of the table whose summaries will be requested
 * @return a retriever that collects options and performs the request on {@code retrieve()}
 */
@Override
public SummaryRetriever summaries(String tableName) {
  return new SummaryRetriever() {

    // Optional row bounds; null means that side of the range was never set.
    private Text startRow = null;
    private Text endRow = null;

    // Explicit summarizer configurations to request; empty until withConfiguration() is called.
    private List<TSummarizerConfiguration> summariesToFetch = Collections.emptyList();

    // Optional regex handed to the server in the request; null when not set.
    private String summarizerClassRegex;

    // When true, retrieve() calls _flush for the row range before requesting summaries.
    private boolean flush = false;

    // ---- row range ----

    @Override
    public SummaryRetriever startRow(CharSequence startRow) {
      Text asText = new Text(startRow.toString());
      return startRow(asText);
    }

    @Override
    public SummaryRetriever startRow(Text startRow) {
      Objects.requireNonNull(startRow);
      if (endRow != null) {
        // Only validate ordering once both bounds are known.
        boolean ordered = startRow.compareTo(endRow) < 0;
        Preconditions.checkArgument(ordered, "Start row must be less than end row : %s >= %s", startRow, endRow);
      }
      this.startRow = startRow;
      return this;
    }

    @Override
    public SummaryRetriever endRow(CharSequence endRow) {
      Text asText = new Text(endRow.toString());
      return endRow(asText);
    }

    @Override
    public SummaryRetriever endRow(Text endRow) {
      Objects.requireNonNull(endRow);
      if (startRow != null) {
        // Only validate ordering once both bounds are known.
        boolean ordered = startRow.compareTo(endRow) < 0;
        Preconditions.checkArgument(ordered, "Start row must be less than end row : %s >= %s", startRow, endRow);
      }
      this.endRow = endRow;
      return this;
    }

    // ---- summarizer selection ----

    @Override
    public SummaryRetriever withConfiguration(SummarizerConfiguration... config) {
      Objects.requireNonNull(config);
      return withConfiguration(Arrays.asList(config));
    }

    @Override
    public SummaryRetriever withConfiguration(Collection<SummarizerConfiguration> configs) {
      Objects.requireNonNull(configs);
      // Convert each configuration to its thrift form up front so retrieve() can send the list as-is.
      summariesToFetch = configs.stream().map(cfg -> SummarizerConfigurationUtil.toThrift(cfg)).collect(Collectors.toList());
      return this;
    }

    @Override
    public SummaryRetriever withMatchingConfiguration(String regex) {
      Objects.requireNonNull(regex);
      // Compile eagerly so an invalid pattern fails here on the client instead of on a tserver.
      Pattern.compile(regex);
      this.summarizerClassRegex = regex;
      return this;
    }

    // ---- flush option ----

    @Override
    public SummaryRetriever flush(boolean b) {
      this.flush = b;
      return this;
    }

    // ---- execution ----

    /**
     * Resolves the table, rejects offline tables, optionally calls {@code _flush} for the row range, then polls the server until the summary session reports
     * it is finished.
     */
    @Override
    public List<Summary> retrieve() throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
      Table.ID id = Tables.getTableId(context.getInstance(), tableName);
      if (Tables.getTableState(context.getInstance(), id) == TableState.OFFLINE) {
        throw new TableOfflineException(context.getInstance(), id.canonicalID());
      }

      TRowRange rowRange = new TRowRange(TextUtil.getByteBuffer(startRow), TextUtil.getByteBuffer(endRow));
      TSummaryRequest summaryRequest = new TSummaryRequest(id.canonicalID(), rowRange, summariesToFetch, summarizerClassRegex);

      if (flush) {
        _flush(id, startRow, endRow, true);
      }

      TSummaries result = ServerClient.execute(context, new TabletClientService.Client.Factory(), rpc -> {
        // Start the session, then keep continuing it until the server marks it finished.
        TSummaries response = rpc.startGetSummaries(Tracer.traceInfo(), context.rpcCreds(), summaryRequest);
        while (!response.finished) {
          response = rpc.contiuneGetSummaries(Tracer.traceInfo(), response.sessionId);
        }
        return response;
      });

      return new SummaryCollection(result).getSummaries();
    }
  };
}
Use of org.apache.accumulo.core.summary.SummaryCollection in the Apache Accumulo project.
Class MajorCompactionRequest, method getSummaries.
/**
 * Returns the summaries present in the given files, merged into a single list.
 *
 * <p>
 * This method can only be called from {@link CompactionStrategy#gatherInformation(MajorCompactionRequest)}. Unfortunately, {@code gatherInformation()} is not
 * called before {@link CompactionStrategy#shouldCompact(MajorCompactionRequest)}. Therefore {@code shouldCompact()} should just return true when a
 * compaction strategy wants to use summary information.
 *
 * <p>
 * When using summaries to make compaction decisions, it's important to ensure that all summary data fits in the tablet server summary cache. The size of this
 * cache is configured by {@code tserver.cache.summary.size}. It's also important to use the {@code summarySelector} predicate to retrieve only the needed
 * summary data. Otherwise unneeded summary data could be brought into the cache.
 *
 * <p>
 * Some files may contain data outside of a tablet's range. When {@link Summarizer}s generate small amounts of summary data, multiple summaries may be stored
 * within a file for different row ranges. This allows more accurate summaries to be returned for the case where a file has data outside a tablet's range.
 * However, some summary data outside of the tablet's range may still be included. When this happens {@link FileStatistics#getExtra()} will be non-zero. Also,
 * it's good to be aware of the other potential causes of inaccuracies; see {@link FileStatistics#getInaccurate()}.
 *
 * <p>
 * When this method is called with multiple files, it will automatically merge summary data using {@link Combiner#merge(Map, Map)}. If summary information is
 * needed for each file, then just call this method once per file.
 *
 * <p>
 * Writing a compaction strategy that uses summary information is a bit tricky. See the source code for {@link TooManyDeletesCompactionStrategy} as an example
 * of a compaction strategy.
 *
 * @param files
 *          the files to read summary data from
 * @param summarySelector
 *          predicate passed to the summary reader to select which summarizer configurations are loaded
 * @return the merged summaries of all given files
 * @throws IOException
 *           declared for the per-file summary loading
 * @throws IllegalStateException
 *           if no volume manager is available, i.e. the call is made outside of {@code gatherInformation()}
 *
 * @see Summarizer
 * @see TableOperations#addSummarizers(String, SummarizerConfiguration...)
 * @see AccumuloFileOutputFormat#setSummarizers(org.apache.hadoop.mapred.JobConf, SummarizerConfiguration...)
 * @see WriterOptions#withSummarizers(SummarizerConfiguration...)
 */
public List<Summary> getSummaries(Collection<FileRef> files, Predicate<SummarizerConfiguration> summarySelector) throws IOException {
  Preconditions.checkState(volumeManager != null, "Getting summaries is not supported at this time. Its only supported when CompactionStrategy.gatherInformation() is called.");

  SummaryCollection merged = new SummaryCollection();
  SummarizerFactory summarizerFactory = new SummarizerFactory(tableConfig);

  for (FileRef fileRef : files) {
    // Resolve the file system of the volume that holds this file.
    FileSystem fileSystem = volumeManager.getVolumeByPath(fileRef.path()).getFileSystem();
    Configuration hadoopConf = CachedConfiguration.getInstance();

    SummaryReader reader = SummaryReader.load(fileSystem, hadoopConf, tableConfig, summarizerFactory, fileRef.path(), summarySelector, summaryCache,
        indexCache);

    // Ask only for summaries covering the row range built from this request's extent.
    List<Gatherer.RowRange> extentRange = Collections.singletonList(new Gatherer.RowRange(extent));
    SummaryCollection perFile = reader.getSummaries(extentRange);

    merged.merge(perFile, summarizerFactory);
  }

  return merged.getSummaries();
}
Use of org.apache.accumulo.core.summary.SummaryCollection in the Apache Accumulo project.
Class RFileSummariesRetriever, method read.
/**
 * Reads the summaries of every configured RFile source, restricted to the row range {@code [startRow, endRow]} fields of this retriever, and merges them into
 * one collection.
 *
 * <p>
 * Every source's input stream is closed before this method returns, whether or not reading succeeded. A failure to close one stream does not prevent the
 * remaining streams from being closed. If reading itself failed, close failures are attached to that exception as suppressed exceptions instead of replacing
 * it.
 *
 * @return the merged summaries of all sources
 * @throws IOException
 *           if loading summary data fails, or if reading succeeded but closing one or more input streams failed
 */
@Override
public Collection<Summary> read() throws IOException {
  SummarizerFactory factory = new SummarizerFactory();
  AccumuloConfiguration acuconf = DefaultConfiguration.getInstance();
  Configuration conf = in.getFileSystem().getConf();
  RFileSource[] sources = in.getSources();

  // Exception thrown by the read itself, if any; remembered so close failures cannot mask it.
  Throwable readFailure = null;
  try {
    SummaryCollection all = new SummaryCollection();
    for (RFileSource source : sources) {
      SummaryReader fileSummary = SummaryReader.load(conf, acuconf, source.getInputStream(), source.getLength(), summarySelector, factory);
      SummaryCollection sc = fileSummary.getSummaries(Collections.singletonList(new Gatherer.RowRange(startRow, endRow)));
      all.merge(sc, factory);
    }
    return all.getSummaries();
  } catch (Throwable t) {
    // Precise rethrow: the compiler still treats this as IOException or an unchecked exception.
    readFailure = t;
    throw t;
  } finally {
    IOException closeFailure = null;
    for (RFileSource source : sources) {
      try {
        source.getInputStream().close();
      } catch (IOException e) {
        // Keep going so one bad stream does not leak the rest.
        if (readFailure != null) {
          readFailure.addSuppressed(e);
        } else if (closeFailure == null) {
          closeFailure = e;
        } else {
          closeFailure.addSuppressed(e);
        }
      }
    }
    // Only surface a close failure when the read itself succeeded.
    if (closeFailure != null) {
      throw closeFailure;
    }
  }
}
Aggregations