Search in sources :

Example 16 with Scanner

use of co.cask.cdap.api.dataset.table.Scanner in project cdap by caskdata.

the class WorkflowDataset method getNeighbors.

/**
   * Returns a map of WorkflowRunId to WorkflowRunRecord that are close to the WorkflowRunId provided by the user.
   *
   * @param id The workflow
   * @param runId The runid of the workflow
   * @param limit The limit on each side of the run that we want to see into
   * @param timeInterval The time interval that we want the results to be spaced apart
   * @return A Map of WorkflowRunId to the corresponding Workflow Run Record. A map is used so that duplicates of
   * the WorkflowRunRecord are not obtained
   */
private Map<String, WorkflowRunRecord> getNeighbors(WorkflowId id, RunId runId, int limit, long timeInterval) {
    long startTime = RunIds.getTime(runId, TimeUnit.SECONDS);
    Map<String, WorkflowRunRecord> workflowRunRecords = new HashMap<>();
    int i = -limit;
    long prevStartTime = startTime - (limit * timeInterval);
    // the last record was found if the (interval * the count of the loop) is less than the time.
    while (prevStartTime <= startTime + (limit * timeInterval)) {
        MDSKey mdsKey = getRowKeyBuilder(id, prevStartTime).build();
        byte[] startRowKey = mdsKey.getKey();
        Scan scan = new Scan(startRowKey, null);
        Scanner scanner = table.scan(scan);
        Row indexRow = scanner.next();
        if (indexRow == null) {
            return workflowRunRecords;
        }
        byte[] rowKey = indexRow.getRow();
        long time = ByteBuffer.wrap(rowKey, rowKey.length - Bytes.SIZEOF_LONG, Bytes.SIZEOF_LONG).getLong();
        if (!((time >= (startTime - (limit * timeInterval))) && time <= (startTime + (limit * timeInterval)))) {
            break;
        }
        Map<byte[], byte[]> columns = indexRow.getColumns();
        String workflowRunId = Bytes.toString(columns.get(RUNID));
        long timeTaken = Bytes.toLong(columns.get(TIME_TAKEN));
        List<ProgramRun> programRunList = GSON.fromJson(Bytes.toString(columns.get(NODES)), PROGRAM_RUNS_TYPE);
        workflowRunRecords.put(workflowRunId, new WorkflowRunRecord(workflowRunId, timeTaken, programRunList));
        prevStartTime = startTime + (i * timeInterval) < time ? time + 1 : startTime + (i * timeInterval);
        i++;
    }
    return workflowRunRecords;
}
Also used : Scanner(co.cask.cdap.api.dataset.table.Scanner) HashMap(java.util.HashMap) MDSKey(co.cask.cdap.data2.dataset2.lib.table.MDSKey) Scan(co.cask.cdap.api.dataset.table.Scan) Row(co.cask.cdap.api.dataset.table.Row)

Example 17 with Scanner

use of co.cask.cdap.api.dataset.table.Scanner in project cdap by caskdata.

the class DefaultPreviewStore method get.

@Override
public Map<String, List<JsonElement>> get(ApplicationId applicationId, String tracerName) {
    // PreviewStore is a singleton and we have to create gson for each operation since gson is not thread safe.
    Gson gson = new GsonBuilder().registerTypeAdapter(Schema.class, new SchemaTypeAdapter()).create();
    byte[] startRowKey = new MDSKey.Builder().add(applicationId.getNamespace()).add(applicationId.getApplication()).add(tracerName).build().getKey();
    byte[] stopRowKey = new MDSKey(Bytes.stopKeyForPrefix(startRowKey)).getKey();
    Map<String, List<JsonElement>> result = new HashMap<>();
    try (Scanner scanner = table.scan(startRowKey, stopRowKey, null, null, null)) {
        Row indexRow;
        while ((indexRow = scanner.next()) != null) {
            Map<byte[], byte[]> columns = indexRow.getColumns();
            String propertyName = Bytes.toString(columns.get(PROPERTY));
            JsonElement value = gson.fromJson(Bytes.toString(columns.get(VALUE)), JsonElement.class);
            List<JsonElement> values = result.get(propertyName);
            if (values == null) {
                values = new ArrayList<>();
                result.put(propertyName, values);
            }
            values.add(value);
        }
    } catch (IOException e) {
        String message = String.format("Error while reading preview data for application '%s' and tracer '%s'.", applicationId, tracerName);
        throw new RuntimeException(message, e);
    }
    return result;
}
Also used : Scanner(co.cask.cdap.api.dataset.table.Scanner) GsonBuilder(com.google.gson.GsonBuilder) HashMap(java.util.HashMap) Schema(co.cask.cdap.api.data.schema.Schema) Gson(com.google.gson.Gson) MDSKey(co.cask.cdap.data2.dataset2.lib.table.MDSKey) IOException(java.io.IOException) SchemaTypeAdapter(co.cask.cdap.internal.io.SchemaTypeAdapter) JsonElement(com.google.gson.JsonElement) ArrayList(java.util.ArrayList) List(java.util.List) Row(co.cask.cdap.api.dataset.table.Row)

Example 18 with Scanner

use of co.cask.cdap.api.dataset.table.Scanner in project cdap by caskdata.

the class LineageDataset method getAccessTimesForRun.

/**
   * @return a set of access times (for program and data it accesses) associated with a program run.
   */
@VisibleForTesting
public List<Long> getAccessTimesForRun(ProgramRunId run) {
    ImmutableList.Builder<Long> recordBuilder = ImmutableList.builder();
    byte[] startKey = getRunScanStartKey(run);
    try (Scanner scanner = accessRegistryTable.scan(startKey, Bytes.stopKeyForPrefix(startKey))) {
        Row row;
        while ((row = scanner.next()) != null) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Got row key = {}", Bytes.toString(row.getRow()));
            }
            RowKey rowKey = parseRow(row);
            if (run.getEntityName().equals(rowKey.getRunId().getId())) {
                recordBuilder.add(Bytes.toLong(row.get(ACCESS_TIME_COLS_BYTE)));
            }
        }
    }
    return recordBuilder.build();
}
Also used : Scanner(co.cask.cdap.api.dataset.table.Scanner) ImmutableList(com.google.common.collect.ImmutableList) Row(co.cask.cdap.api.dataset.table.Row) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 19 with Scanner

use of co.cask.cdap.api.dataset.table.Scanner in project cdap by caskdata.

the class FactTable method findMeasureNames.

/**
   * Finds all measure names of the facts that match given {@link DimensionValue}s and time range.
   * @param allDimensionNames list of all dimension names to be present in the fact record
   * @param dimensionSlice dimension values to filter by, {@code null} means any non-null value.
   * @param startTs start timestamp, in sec
   * @param endTs end timestamp, in sec
   * @return {@link Set} of measure names
   */
// todo: pass a limit on number of measures returned
public Set<String> findMeasureNames(List<String> allDimensionNames, Map<String, String> dimensionSlice, long startTs, long endTs) {
    List<DimensionValue> allDimensions = Lists.newArrayList();
    for (String dimensionName : allDimensionNames) {
        allDimensions.add(new DimensionValue(dimensionName, dimensionSlice.get(dimensionName)));
    }
    byte[] startRow = codec.createStartRowKey(allDimensions, null, startTs, false);
    byte[] endRow = codec.createEndRowKey(allDimensions, null, endTs, false);
    endRow = Bytes.stopKeyForPrefix(endRow);
    FuzzyRowFilter fuzzyRowFilter = createFuzzyRowFilter(new FactScan(startTs, endTs, ImmutableList.<String>of(), allDimensions), startRow);
    Set<String> measureNames = Sets.newHashSet();
    int scannedRecords = 0;
    try (Scanner scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter)) {
        Row rowResult;
        while ((rowResult = scanner.next()) != null) {
            scannedRecords++;
            if (scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
                break;
            }
            byte[] rowKey = rowResult.getRow();
            // filter out columns by time range (scan configuration only filters whole rows)
            if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) < startTs) {
                continue;
            }
            if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
                // we're done with scanner
                break;
            }
            measureNames.add(codec.getMeasureName(rowResult.getRow()));
        }
    }
    LOG.trace("search for measures completed, scanned records: {}", scannedRecords);
    return measureNames;
}
Also used : Scanner(co.cask.cdap.api.dataset.table.Scanner) DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue) Row(co.cask.cdap.api.dataset.table.Row) FuzzyRowFilter(co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter)

Example 20 with Scanner

use of co.cask.cdap.api.dataset.table.Scanner in project cdap by caskdata.

the class FactTable method findSingleDimensionValue.

/**
   * Searches for first non-null valued dimensions in records that contain given list of dimensions and match given
   * dimension values in given time range. Returned dimension values are those that are not defined in given
   * dimension values.
   * @param allDimensionNames list of all dimension names to be present in the record
   * @param dimensionSlice dimension values to filter by, {@code null} means any non-null value.
   * @param startTs start of the time range, in seconds
   * @param endTs end of the time range, in seconds
   * @return {@link Set} of {@link DimensionValue}s
   */
// todo: pass a limit on number of dimensionValues returned
// todo: kinda not cool API when we expect null values in a map...
public Set<DimensionValue> findSingleDimensionValue(List<String> allDimensionNames, Map<String, String> dimensionSlice, long startTs, long endTs) {
    // Algorithm, briefly:
    // We scan in the records which have given allDimensionNames. We use dimensionSlice as a criteria for scan.
    // If record from the scan has non-null values in the dimensions which are not specified in dimensionSlice,
    // we use first of such dimension as a value to return.
    // When we find value to return, since we only fill a single dimension, we are not interested in drilling down
    // further and instead attempt to fast-forward (jump) to a record that has different value in that dimension.
    // Thus we find all results.
    List<DimensionValue> allDimensions = Lists.newArrayList();
    List<DimensionValue> filledDimension = Lists.newArrayList();
    List<Integer> dimToFillIndexes = Lists.newArrayList();
    for (int i = 0; i < allDimensionNames.size(); i++) {
        String dimensionName = allDimensionNames.get(i);
        if (!dimensionSlice.containsKey(dimensionName)) {
            dimToFillIndexes.add(i);
            allDimensions.add(new DimensionValue(dimensionName, null));
        } else {
            DimensionValue dimensionValue = new DimensionValue(dimensionName, dimensionSlice.get(dimensionName));
            filledDimension.add(dimensionValue);
            allDimensions.add(dimensionValue);
        }
    }
    // If provided dimensions contain all values filled in, there's nothing to look for
    if (dimToFillIndexes.isEmpty()) {
        return Collections.emptySet();
    }
    Set<DimensionValue> result = Sets.newHashSet();
    int scans = 0;
    int scannedRecords = 0;
    // build a scan
    byte[] startRow = codec.createStartRowKey(allDimensions, null, startTs, false);
    byte[] endRow = codec.createEndRowKey(allDimensions, null, endTs, false);
    endRow = Bytes.stopKeyForPrefix(endRow);
    FuzzyRowFilter fuzzyRowFilter = createFuzzyRowFilter(new FactScan(startTs, endTs, ImmutableList.<String>of(), allDimensions), startRow);
    Scanner scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
    scans++;
    try {
        Row rowResult;
        while ((rowResult = scanner.next()) != null) {
            scannedRecords++;
            // todo: make configurable
            if (scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
                break;
            }
            byte[] rowKey = rowResult.getRow();
            // filter out columns by time range (scan configuration only filters whole rows)
            if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) < startTs) {
                continue;
            }
            if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
                // we're done with scanner
                break;
            }
            List<DimensionValue> dimensionValues = codec.getDimensionValues(rowResult.getRow());
            // At this point, we know that the record is in right time range and its dimensions matches given.
            // We try find first non-null valued dimension in the record that was not in given dimensions: we use it to form
            // next drill down suggestion
            int filledIndex = -1;
            for (int index : dimToFillIndexes) {
                // todo: it may be not efficient, if dimensionValues is not array-backed list: i.e. if access by index is
                //       not fast
                DimensionValue dimensionValue = dimensionValues.get(index);
                if (dimensionValue.getValue() != null) {
                    result.add(dimensionValue);
                    filledIndex = index;
                    break;
                }
            }
            // todo: fast-forwarding (jumping) should be done on server-side (CDAP-1421)
            if (filledIndex >= 0) {
                scanner.close();
                scanner = null;
                scans++;
                if (scans > MAX_SCANS_DURING_SEARCH) {
                    break;
                }
                startRow = codec.getNextRowKey(rowResult.getRow(), filledIndex);
                scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
            }
        }
    } finally {
        if (scanner != null) {
            scanner.close();
        }
    }
    LOG.trace("search for dimensions completed, scans performed: {}, scanned records: {}", scans, scannedRecords);
    return result;
}
Also used : Scanner(co.cask.cdap.api.dataset.table.Scanner) DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue) Row(co.cask.cdap.api.dataset.table.Row) FuzzyRowFilter(co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter)

Aggregations

Scanner (co.cask.cdap.api.dataset.table.Scanner)68 Row (co.cask.cdap.api.dataset.table.Row)60 ArrayList (java.util.ArrayList)11 Scan (co.cask.cdap.api.dataset.table.Scan)10 Table (co.cask.cdap.api.dataset.table.Table)10 Test (org.junit.Test)10 DatasetId (co.cask.cdap.proto.id.DatasetId)8 TransactionExecutor (org.apache.tephra.TransactionExecutor)8 MDSKey (co.cask.cdap.data2.dataset2.lib.table.MDSKey)6 QueueEntryRow (co.cask.cdap.data2.transaction.queue.QueueEntryRow)6 IOException (java.io.IOException)6 HashMap (java.util.HashMap)6 Map (java.util.Map)6 Put (co.cask.cdap.api.dataset.table.Put)5 DatasetManagementException (co.cask.cdap.api.dataset.DatasetManagementException)4 Delete (co.cask.cdap.api.dataset.table.Delete)4 FuzzyRowFilter (co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter)4 ScheduleId (co.cask.cdap.proto.id.ScheduleId)4 ReadOnly (co.cask.cdap.api.annotation.ReadOnly)3 RecordScanner (co.cask.cdap.api.data.batch.RecordScanner)3