Use of co.cask.cdap.api.dataset.table.Scanner in project cdap by caskdata.
From class WorkflowDataset, method getNeighbors.
/**
 * Returns a map of workflow run id to {@link WorkflowRunRecord} for runs that started close in time to the
 * run provided by the user.
 *
 * @param id the workflow
 * @param runId the run id of the workflow run to search around
 * @param limit the number of runs to look for on each side of the given run
 * @param timeInterval the time interval, in seconds, by which consecutive results should be spaced apart
 * @return a map of workflow run id to the corresponding {@link WorkflowRunRecord}; a map is used so that
 *         duplicate run records are not returned
 */
private Map<String, WorkflowRunRecord> getNeighbors(WorkflowId id, RunId runId, int limit, long timeInterval) {
  long startTime = RunIds.getTime(runId, TimeUnit.SECONDS);
  Map<String, WorkflowRunRecord> workflowRunRecords = new HashMap<>();
  int i = -limit;
  long prevStartTime = startTime - (limit * timeInterval);
  // Walk forward one record at a time until we move past the upper bound of the
  // window, i.e. past startTime + (limit * timeInterval).
  while (prevStartTime <= startTime + (limit * timeInterval)) {
    MDSKey mdsKey = getRowKeyBuilder(id, prevStartTime).build();
    byte[] startRowKey = mdsKey.getKey();
    Scan scan = new Scan(startRowKey, null);
    // Close the scanner on every iteration so it is not leaked.
    try (Scanner scanner = table.scan(scan)) {
      Row indexRow = scanner.next();
      if (indexRow == null) {
        return workflowRunRecords;
      }
      byte[] rowKey = indexRow.getRow();
      // The record's start time is encoded as the trailing eight bytes of the row key.
      long time = ByteBuffer.wrap(rowKey, rowKey.length - Bytes.SIZEOF_LONG, Bytes.SIZEOF_LONG).getLong();
      if (!(time >= (startTime - (limit * timeInterval)) && time <= (startTime + (limit * timeInterval)))) {
        break;
      }
      Map<byte[], byte[]> columns = indexRow.getColumns();
      String workflowRunId = Bytes.toString(columns.get(RUNID));
      long timeTaken = Bytes.toLong(columns.get(TIME_TAKEN));
      List<ProgramRun> programRunList = GSON.fromJson(Bytes.toString(columns.get(NODES)), PROGRAM_RUNS_TYPE);
      workflowRunRecords.put(workflowRunId, new WorkflowRunRecord(workflowRunId, timeTaken, programRunList));
      // Advance to just past the record we found, or to the next interval boundary, whichever is later.
      prevStartTime = startTime + (i * timeInterval) < time ? time + 1 : startTime + (i * timeInterval);
      i++;
    }
  }
  return workflowRunRecords;
}
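
The loop above depends on the row-key layout: the record's start time is encoded as the trailing eight bytes of the key, which is why the code decodes the last Bytes.SIZEOF_LONG bytes. A minimal, runnable sketch of just that decode step (the 4-byte prefix and the SIZEOF_LONG constant here are illustrative stand-ins, not the actual WorkflowDataset schema):

import java.nio.ByteBuffer;

public final class RowKeyTimeDecoder {
  private static final int SIZEOF_LONG = 8; // stand-in for Bytes.SIZEOF_LONG

  // Reads a big-endian long from the last eight bytes of a row key.
  static long trailingTime(byte[] rowKey) {
    return ByteBuffer.wrap(rowKey, rowKey.length - SIZEOF_LONG, SIZEOF_LONG).getLong();
  }

  public static void main(String[] args) {
    // 4 arbitrary prefix bytes followed by the encoded start time
    byte[] key = ByteBuffer.allocate(12).put(new byte[]{1, 2, 3, 4}).putLong(1_500_000_000L).array();
    System.out.println(trailingTime(key)); // prints 1500000000
  }
}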
Use of co.cask.cdap.api.dataset.table.Scanner in project cdap by caskdata.
From class DefaultPreviewStore, method get.
@Override
public Map<String, List<JsonElement>> get(ApplicationId applicationId, String tracerName) {
  // PreviewStore is a singleton and we have to create gson for each operation since gson is not thread safe.
  Gson gson = new GsonBuilder().registerTypeAdapter(Schema.class, new SchemaTypeAdapter()).create();
  byte[] startRowKey = new MDSKey.Builder()
    .add(applicationId.getNamespace())
    .add(applicationId.getApplication())
    .add(tracerName)
    .build().getKey();
  byte[] stopRowKey = new MDSKey(Bytes.stopKeyForPrefix(startRowKey)).getKey();
  Map<String, List<JsonElement>> result = new HashMap<>();
  try (Scanner scanner = table.scan(startRowKey, stopRowKey, null, null, null)) {
    Row indexRow;
    while ((indexRow = scanner.next()) != null) {
      Map<byte[], byte[]> columns = indexRow.getColumns();
      String propertyName = Bytes.toString(columns.get(PROPERTY));
      JsonElement value = gson.fromJson(Bytes.toString(columns.get(VALUE)), JsonElement.class);
      // Group values under their property name, creating the list on first use.
      result.computeIfAbsent(propertyName, name -> new ArrayList<>()).add(value);
    }
  } catch (IOException e) {
    String message = String.format("Error while reading preview data for application '%s' and tracer '%s'.",
                                   applicationId, tracerName);
    throw new RuntimeException(message, e);
  }
  return result;
}
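
Bytes.stopKeyForPrefix above turns the start key into an exclusive upper bound, so the scan covers exactly the rows whose keys begin with the namespace/application/tracer prefix. A self-contained sketch of those semantics, increment the last non-0xFF byte and truncate (an illustration of the idea, not the CDAP implementation):

import java.util.Arrays;

public final class StopKey {
  // Exclusive stop key for a prefix scan: the prefix with its last
  // non-0xFF byte incremented and everything after it dropped.
  static byte[] stopKeyForPrefix(byte[] prefix) {
    for (int i = prefix.length - 1; i >= 0; i--) {
      if (prefix[i] != (byte) 0xFF) {
        byte[] stop = Arrays.copyOf(prefix, i + 1);
        stop[i]++;
        return stop;
      }
    }
    return null; // all bytes are 0xFF: scan to the end of the table
  }

  public static void main(String[] args) {
    System.out.println(Arrays.toString(stopKeyForPrefix(new byte[]{10, 20, 30}))); // [10, 20, 31]
  }
}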
Use of co.cask.cdap.api.dataset.table.Scanner in project cdap by caskdata.
From class LineageDataset, method getAccessTimesForRun.
/**
 * @return a list of access times (for the program and the data it accesses) associated with a program run.
 */
@VisibleForTesting
public List<Long> getAccessTimesForRun(ProgramRunId run) {
  ImmutableList.Builder<Long> recordBuilder = ImmutableList.builder();
  byte[] startKey = getRunScanStartKey(run);
  try (Scanner scanner = accessRegistryTable.scan(startKey, Bytes.stopKeyForPrefix(startKey))) {
    Row row;
    while ((row = scanner.next()) != null) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("Got row key = {}", Bytes.toString(row.getRow()));
      }
      RowKey rowKey = parseRow(row);
      // Only collect access times from rows that belong to the requested run.
      if (run.getEntityName().equals(rowKey.getRunId().getId())) {
        recordBuilder.add(Bytes.toLong(row.get(ACCESS_TIME_COLS_BYTE)));
      }
    }
  }
  return recordBuilder.build();
}
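
The loop shape here, try-with-resources around the Scanner, next() until null, filter, collect, is the canonical way to consume a scan. A self-contained sketch of the same shape against simplified stand-in interfaces (SimpleScanner and SimpleRow are illustrative stand-ins, not the CDAP API):

import java.util.ArrayList;
import java.util.List;

interface SimpleRow {
  Long getLong(String column); // null when the column is absent
}

interface SimpleScanner extends AutoCloseable {
  SimpleRow next(); // null once the scan is exhausted
  @Override
  void close();
}

final class ScanCollect {
  // Collects a long-valued column from every row the scanner returns.
  static List<Long> collectColumn(SimpleScanner scanner, String column) {
    List<Long> values = new ArrayList<>();
    try (SimpleScanner s = scanner) {
      SimpleRow row;
      while ((row = s.next()) != null) {
        Long value = row.getLong(column);
        if (value != null) {
          values.add(value);
        }
      }
    }
    return values;
  }
}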
Use of co.cask.cdap.api.dataset.table.Scanner in project cdap by caskdata.
From class FactTable, method findMeasureNames.
/**
 * Finds all measure names of the facts that match the given {@link DimensionValue}s and time range.
 * @param allDimensionNames list of all dimension names to be present in the fact record
 * @param dimensionSlice dimension values to filter by, a {@code null} value means any non-null value
 * @param startTs start timestamp, in seconds
 * @param endTs end timestamp, in seconds
 * @return {@link Set} of measure names
 */
// todo: pass a limit on number of measures returned
public Set<String> findMeasureNames(List<String> allDimensionNames, Map<String, String> dimensionSlice,
                                    long startTs, long endTs) {
  List<DimensionValue> allDimensions = Lists.newArrayList();
  for (String dimensionName : allDimensionNames) {
    allDimensions.add(new DimensionValue(dimensionName, dimensionSlice.get(dimensionName)));
  }
  byte[] startRow = codec.createStartRowKey(allDimensions, null, startTs, false);
  byte[] endRow = codec.createEndRowKey(allDimensions, null, endTs, false);
  endRow = Bytes.stopKeyForPrefix(endRow);
  FuzzyRowFilter fuzzyRowFilter =
    createFuzzyRowFilter(new FactScan(startTs, endTs, ImmutableList.<String>of(), allDimensions), startRow);
  Set<String> measureNames = Sets.newHashSet();
  int scannedRecords = 0;
  try (Scanner scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter)) {
    Row rowResult;
    while ((rowResult = scanner.next()) != null) {
      scannedRecords++;
      if (scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
        break;
      }
      byte[] rowKey = rowResult.getRow();
      // filter out columns by time range (scan configuration only filters whole rows)
      if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) < startTs) {
        continue;
      }
      if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
        // we're done with the scanner
        break;
      }
      measureNames.add(codec.getMeasureName(rowResult.getRow()));
    }
  }
  LOG.trace("search for measures completed, scanned records: {}", scannedRecords);
  return measureNames;
}
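
The FuzzyRowFilter built above matches row keys positionally: byte positions holding the pinned dimension values must match, while the rest are wildcards, which lets one scan serve a slice with "holes" in it. A toy, self-contained illustration of that matching rule (mask semantics follow the common HBase convention of 0 = must match, 1 = don't care; this is not CDAP's FuzzyRowFilter):

public final class FuzzyMatch {
  // Returns true if every fixed position in the pattern matches the key.
  // mask[i] == 0 means position i must match; mask[i] == 1 means don't care.
  static boolean matches(byte[] key, byte[] pattern, byte[] mask) {
    if (key.length < pattern.length) {
      return false;
    }
    for (int i = 0; i < pattern.length; i++) {
      if (mask[i] == 0 && key[i] != pattern[i]) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    byte[] pattern = {1, 0, 3};
    byte[] mask = {0, 1, 0}; // middle byte is a wildcard
    System.out.println(matches(new byte[]{1, 9, 3}, pattern, mask)); // true
    System.out.println(matches(new byte[]{1, 9, 4}, pattern, mask)); // false
  }
}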
Use of co.cask.cdap.api.dataset.table.Scanner in project cdap by caskdata.
From class FactTable, method findSingleDimensionValue.
/**
 * Searches for the first non-null valued dimension in records that contain the given list of dimensions and
 * match the given dimension values in the given time range. Returned dimension values are those that are not
 * defined in the given dimension values.
 * @param allDimensionNames list of all dimension names to be present in the record
 * @param dimensionSlice dimension values to filter by, a {@code null} value means any non-null value
 * @param startTs start of the time range, in seconds
 * @param endTs end of the time range, in seconds
 * @return {@link Set} of {@link DimensionValue}s
 */
// todo: pass a limit on number of dimensionValues returned
// todo: kinda not cool API when we expect null values in a map...
public Set<DimensionValue> findSingleDimensionValue(List<String> allDimensionNames,
                                                    Map<String, String> dimensionSlice,
                                                    long startTs, long endTs) {
  // Algorithm, briefly:
  // We scan the records that have the given allDimensionNames, using dimensionSlice as the scan criteria.
  // If a record from the scan has non-null values in dimensions that are not specified in dimensionSlice,
  // we use the first such dimension as a value to return.
  // Since we only fill a single dimension, once we find a value to return we are not interested in drilling
  // down further; instead we fast-forward (jump) to a record that has a different value in that dimension.
  // Repeating this yields all distinct values.
  List<DimensionValue> allDimensions = Lists.newArrayList();
  List<DimensionValue> filledDimension = Lists.newArrayList();
  List<Integer> dimToFillIndexes = Lists.newArrayList();
  for (int i = 0; i < allDimensionNames.size(); i++) {
    String dimensionName = allDimensionNames.get(i);
    if (!dimensionSlice.containsKey(dimensionName)) {
      dimToFillIndexes.add(i);
      allDimensions.add(new DimensionValue(dimensionName, null));
    } else {
      DimensionValue dimensionValue = new DimensionValue(dimensionName, dimensionSlice.get(dimensionName));
      filledDimension.add(dimensionValue);
      allDimensions.add(dimensionValue);
    }
  }
  // If the provided dimensions have all their values filled in, there's nothing to look for
  if (dimToFillIndexes.isEmpty()) {
    return Collections.emptySet();
  }
  Set<DimensionValue> result = Sets.newHashSet();
  int scans = 0;
  int scannedRecords = 0;
  // build a scan
  byte[] startRow = codec.createStartRowKey(allDimensions, null, startTs, false);
  byte[] endRow = codec.createEndRowKey(allDimensions, null, endTs, false);
  endRow = Bytes.stopKeyForPrefix(endRow);
  FuzzyRowFilter fuzzyRowFilter =
    createFuzzyRowFilter(new FactScan(startTs, endTs, ImmutableList.<String>of(), allDimensions), startRow);
  Scanner scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
  scans++;
  try {
    Row rowResult;
    while ((rowResult = scanner.next()) != null) {
      scannedRecords++;
      // todo: make configurable
      if (scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
        break;
      }
      byte[] rowKey = rowResult.getRow();
      // filter out columns by time range (scan configuration only filters whole rows)
      if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) < startTs) {
        continue;
      }
      if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
        // we're done with the scanner
        break;
      }
      List<DimensionValue> dimensionValues = codec.getDimensionValues(rowResult.getRow());
      // At this point we know the record is in the right time range and its dimensions match the given ones.
      // We try to find the first non-null valued dimension in the record that was not in the given dimensions:
      // we use it to form the next drill-down suggestion.
      int filledIndex = -1;
      for (int index : dimToFillIndexes) {
        // todo: this may be inefficient if dimensionValues is not an array-backed list, i.e. if access by
        //       index is not fast
        DimensionValue dimensionValue = dimensionValues.get(index);
        if (dimensionValue.getValue() != null) {
          result.add(dimensionValue);
          filledIndex = index;
          break;
        }
      }
      // todo: fast-forwarding (jumping) should be done on the server side (CDAP-1421)
      if (filledIndex >= 0) {
        scanner.close();
        scanner = null;
        scans++;
        if (scans > MAX_SCANS_DURING_SEARCH) {
          break;
        }
        // restart the scan just past all rows that share the dimension value we just recorded
        startRow = codec.getNextRowKey(rowResult.getRow(), filledIndex);
        scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
      }
    }
  } finally {
    if (scanner != null) {
      scanner.close();
    }
  }
  LOG.trace("search for dimensions completed, scans performed: {}, scanned records: {}", scans, scannedRecords);
  return result;
}
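
The fast-forward step above closes the scanner and reopens it at codec.getNextRowKey(...), i.e. just past every row sharing the dimension value that was just recorded, so each distinct value costs roughly one scan rather than one row. A toy, runnable model of that skip-scan shape over a sorted set of strings standing in for ordered row keys (not the CDAP code):

import java.util.Set;
import java.util.TreeSet;

public final class SkipScanDemo {
  public static void main(String[] args) {
    // Distinct first letters play the role of distinct dimension values.
    TreeSet<String> rows = new TreeSet<>(Set.of("apple", "avocado", "banana", "blueberry", "cherry"));
    Set<Character> found = new TreeSet<>();
    String current = rows.first();
    while (current != null) {
      char letter = current.charAt(0);
      found.add(letter);
      // Jump past every row that starts with the letter we just recorded.
      current = rows.ceiling(String.valueOf((char) (letter + 1)));
    }
    System.out.println(found); // [a, b, c]
  }
}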