Search in sources :

Example 1 with DimensionValue

use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.

the class FactTable method findMeasureNames.

/**
 * Collects the names of all measures found in facts that carry the given dimensions, match the given
 * {@link DimensionValue}s and fall within the given time range.
 * <p>
 * At most {@code MAX_RECORDS_TO_SCAN_DURING_SEARCH} records are processed, so the result may be incomplete.
 * @param allDimensionNames names of all dimensions that must be present in the fact record
 * @param dimensionSlice dimension values to filter by, {@code null} means any non-null value.
 * @param startTs start of the time range, in seconds
 * @param endTs end of the time range, in seconds
 * @return {@link Set} of measure names
 */
// todo: pass a limit on number of measures returned
public Set<String> findMeasureNames(List<String> allDimensionNames, Map<String, String> dimensionSlice, long startTs, long endTs) {
    // Build the dimension filter in the order given; a name missing from the slice maps to a null value.
    List<DimensionValue> dimensionFilter = Lists.newArrayList();
    for (String name : allDimensionNames) {
        String value = dimensionSlice.get(name);
        dimensionFilter.add(new DimensionValue(name, value));
    }

    // Row-key range and fuzzy filter for the scan.
    byte[] scanStart = codec.createStartRowKey(dimensionFilter, null, startTs, false);
    byte[] scanStop = Bytes.stopKeyForPrefix(codec.createEndRowKey(dimensionFilter, null, endTs, false));
    FactScan factScan = new FactScan(startTs, endTs, ImmutableList.<String>of(), dimensionFilter);
    FuzzyRowFilter rowFilter = createFuzzyRowFilter(factScan, scanStart);

    Set<String> found = Sets.newHashSet();
    int rowsSeen = 0;
    try (Scanner rows = timeSeriesTable.scan(scanStart, scanStop, rowFilter)) {
        for (Row row = rows.next(); row != null; row = rows.next()) {
            rowsSeen++;
            if (rowsSeen > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
                // search budget exhausted
                break;
            }
            byte[] rowKey = row.getRow();
            // The scan configuration only filters whole rows, so the time range is re-checked per record:
            // records before the range are skipped, the first record past the range ends the search.
            if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) >= startTs) {
                if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
                    break;
                }
                found.add(codec.getMeasureName(row.getRow()));
            }
        }
    }
    LOG.trace("search for measures completed, scanned records: {}", rowsSeen);
    return found;
}
Also used : Scanner(co.cask.cdap.api.dataset.table.Scanner) DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue) Row(co.cask.cdap.api.dataset.table.Row) FuzzyRowFilter(co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter)

Example 2 with DimensionValue

use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.

the class FactTable method findSingleDimensionValue.

/**
 * Searches for first non-null valued dimensions in records that contain given list of dimensions and match given
 * dimension values in given time range. Returned dimension values are those whose dimension name is not a key of
 * the given dimension values.
 * <p>
 * The search is bounded: at most {@code MAX_RECORDS_TO_SCAN_DURING_SEARCH} records are processed and at most
 * {@code MAX_SCANS_DURING_SEARCH} scans are opened, so the result may be incomplete.
 * @param allDimensionNames list of all dimension names to be present in the record
 * @param dimensionSlice dimension values to filter by, {@code null} means any non-null value. A dimension that is
 *                       present as a key (even with a {@code null} value) is never returned in the result.
 * @param startTs start of the time range, in seconds
 * @param endTs end of the time range, in seconds
 * @return {@link Set} of {@link DimensionValue}s; empty if every name in {@code allDimensionNames} is a key of
 *         {@code dimensionSlice}
 */
// todo: pass a limit on number of dimensionValues returned
// todo: kinda not cool API when we expect null values in a map...
public Set<DimensionValue> findSingleDimensionValue(List<String> allDimensionNames, Map<String, String> dimensionSlice, long startTs, long endTs) {
    // Algorithm, briefly:
    // We scan in the records which have given allDimensionNames. We use dimensionSlice as a criteria for scan.
    // If record from the scan has non-null values in the dimensions which are not specified in dimensionSlice,
    // we use first of such dimension as a value to return.
    // When we find value to return, since we only fill a single dimension, we are not interested in drilling down
    // further and instead attempt to fast-forward (jump) to a record that has different value in that dimension.
    // Thus we find all results.
    List<DimensionValue> allDimensions = Lists.newArrayList();
    // positions (within allDimensionNames) of the dimensions the caller did not specify
    List<Integer> dimToFillIndexes = Lists.newArrayList();
    for (int i = 0; i < allDimensionNames.size(); i++) {
        String dimensionName = allDimensionNames.get(i);
        if (!dimensionSlice.containsKey(dimensionName)) {
            dimToFillIndexes.add(i);
            allDimensions.add(new DimensionValue(dimensionName, null));
        } else {
            allDimensions.add(new DimensionValue(dimensionName, dimensionSlice.get(dimensionName)));
        }
    }
    // If provided dimensions contain all values filled in, there's nothing to look for
    if (dimToFillIndexes.isEmpty()) {
        return Collections.emptySet();
    }
    Set<DimensionValue> result = Sets.newHashSet();
    int scans = 0;
    int scannedRecords = 0;
    // build a scan
    byte[] startRow = codec.createStartRowKey(allDimensions, null, startTs, false);
    byte[] endRow = codec.createEndRowKey(allDimensions, null, endTs, false);
    endRow = Bytes.stopKeyForPrefix(endRow);
    FuzzyRowFilter fuzzyRowFilter = createFuzzyRowFilter(new FactScan(startTs, endTs, ImmutableList.<String>of(), allDimensions), startRow);
    // The scanner is replaced inside the loop on every jump, so it cannot be a try-with-resources variable;
    // the finally block closes whichever scanner is current when the loop ends.
    Scanner scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
    scans++;
    try {
        Row rowResult;
        while ((rowResult = scanner.next()) != null) {
            scannedRecords++;
            // todo: make configurable
            if (scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
                break;
            }
            byte[] rowKey = rowResult.getRow();
            // filter out columns by time range (scan configuration only filters whole rows)
            if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) < startTs) {
                continue;
            }
            if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
                // we're done with scanner
                break;
            }
            List<DimensionValue> dimensionValues = codec.getDimensionValues(rowKey);
            // At this point, we know that the record is in right time range and its dimensions matches given.
            // We try find first non-null valued dimension in the record that was not in given dimensions: we use it to form
            // next drill down suggestion
            int filledIndex = -1;
            for (int index : dimToFillIndexes) {
                // todo: it may be not efficient, if dimensionValues is not array-backed list: i.e. if access by index is
                // not fast
                DimensionValue dimensionValue = dimensionValues.get(index);
                if (dimensionValue.getValue() != null) {
                    result.add(dimensionValue);
                    filledIndex = index;
                    break;
                }
            }
            // todo: fast-forwarding (jumping) should be done on server-side (CDAP-1421)
            if (filledIndex >= 0) {
                // Close the current scanner before jumping; null it so the finally block does not close it twice
                // if we bail out on the scan limit or if opening the next scan fails.
                scanner.close();
                scanner = null;
                scans++;
                if (scans > MAX_SCANS_DURING_SEARCH) {
                    break;
                }
                startRow = codec.getNextRowKey(rowKey, filledIndex);
                scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
            }
        }
    } finally {
        if (scanner != null) {
            scanner.close();
        }
    }
    LOG.trace("search for dimensions completed, scans performed: {}, scanned records: {}", scans, scannedRecords);
    return result;
}
Also used : Scanner(co.cask.cdap.api.dataset.table.Scanner) DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue) Row(co.cask.cdap.api.dataset.table.Row) FuzzyRowFilter(co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter)

Example 3 with DimensionValue

use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.

the class FactTableTest method testPreSplits.

/**
 * Writes one fact for each of three aggregation views (dim1; dim1 and dim2; dim3) and verifies that the
 * written rows land in three distinct ranges delimited by the keys from {@code FactTable.getSplits(3)}.
 */
@Test
public void testPreSplits() throws Exception {
    InMemoryTableService.create("presplitEntityTable");
    InMemoryTableService.create("presplitDataTable");
    int resolution = 10;
    int rollTimebaseInterval = 2;
    InMemoryMetricsTable metricsTable = new InMemoryMetricsTable("presplitDataTable");
    FactTable table = new FactTable(metricsTable, new EntityTable(new InMemoryMetricsTable("presplitEntityTable")), resolution, rollTimebaseInterval);
    byte[][] splits = FactTable.getSplits(3);
    // "/1000" because time is expected to be in seconds
    long ts = System.currentTimeMillis() / 1000;
    DimensionValue dimVal1 = new DimensionValue("dim1", "value1");
    DimensionValue dimVal2 = new DimensionValue("dim2", "value2");
    DimensionValue dimVal3 = new DimensionValue("dim3", "value3");
    // first agg view: dim1
    table.add(ImmutableList.of(new Fact(ts, ImmutableList.of(dimVal1), new Measurement("metric1", MeasureType.COUNTER, 1))));
    // second agg view: dim1 & dim2
    table.add(ImmutableList.of(new Fact(ts, ImmutableList.of(dimVal1, dimVal2), new Measurement("metric1", MeasureType.COUNTER, 1))));
    // third agg view: dim3
    table.add(ImmutableList.of(new Fact(ts, ImmutableList.of(dimVal3), new Measurement("metric1", MeasureType.COUNTER, 1))));
    // Verify all written records are spread across splits
    Set<Integer> splitsWithRows = Sets.newHashSet();
    // try-with-resources: the scanner is closed even when the scan or a later statement throws
    try (Scanner scanner = metricsTable.scan(null, null, null)) {
        Row row;
        while ((row = scanner.next()) != null) {
            boolean added = false;
            // a row belongs to the first range whose split key is greater than the row key
            for (int i = 0; i < splits.length; i++) {
                if (Bytes.compareTo(row.getRow(), splits[i]) < 0) {
                    splitsWithRows.add(i);
                    added = true;
                    break;
                }
            }
            if (!added) {
                // falls into last split
                splitsWithRows.add(splits.length);
            }
        }
    }
    Assert.assertEquals(3, splitsWithRows.size());
}
Also used : Measurement(co.cask.cdap.api.dataset.lib.cube.Measurement) Scanner(co.cask.cdap.api.dataset.table.Scanner) DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue) InMemoryMetricsTable(co.cask.cdap.data2.dataset2.lib.table.inmemory.InMemoryMetricsTable) Row(co.cask.cdap.api.dataset.table.Row) Test(org.junit.Test)

Example 4 with DimensionValue

use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.

the class FactTableTest method testBasics.

/**
 * End-to-end check of {@code FactTable}: adds facts one by one and as a batch, scans them back per measure and
 * for all measures, deletes by scan, and finally exercises {@code findSingleDimensionValue} and
 * {@code findMeasureNames}. The steps build on each other, so their order matters.
 */
@Test
public void testBasics() throws Exception {
    InMemoryTableService.create("EntityTable");
    InMemoryTableService.create("DataTable");
    int resolution = 10;
    int rollTimebaseInterval = 2;
    FactTable table = new FactTable(new InMemoryMetricsTable("DataTable"), new EntityTable(new InMemoryMetricsTable("EntityTable")), resolution, rollTimebaseInterval);
    // aligned to start of resolution bucket
    // "/1000" because time is expected to be in seconds
    long ts = ((System.currentTimeMillis() / 1000) / resolution) * resolution;
    // testing encoding with multiple dims
    List<DimensionValue> dimensionValues = ImmutableList.of(new DimensionValue("dim1", "value1"), new DimensionValue("dim2", "value2"), new DimensionValue("dim3", "value3"));
    // trying adding one by one, in same (first) time resolution bucket
    // (5 increments of k per metric -> 5 * k in bucket ts)
    for (int i = 0; i < 5; i++) {
        for (int k = 1; k < 4; k++) {
            // note: "+i" here and below doesn't affect results, just to confirm
            // that data points are rounded to the resolution
            table.add(ImmutableList.of(new Fact(ts + i, dimensionValues, new Measurement("metric" + k, MeasureType.COUNTER, k))));
        }
    }
    // trying adding one by one, in different time resolution buckets
    // (one increment of 2 * k in each of the buckets ts, ts + resolution, ts + 2 * resolution)
    for (int i = 0; i < 3; i++) {
        for (int k = 1; k < 4; k++) {
            table.add(ImmutableList.of(new Fact(ts + resolution * i + i, dimensionValues, new Measurement("metric" + k, MeasureType.COUNTER, 2 * k))));
        }
    }
    // trying adding as list
    // first incs in same (second) time resolution bucket
    // (7 increments of 3 * k -> 21 * k in bucket ts + resolution)
    List<Fact> aggs = Lists.newArrayList();
    for (int i = 0; i < 7; i++) {
        for (int k = 1; k < 4; k++) {
            aggs.add(new Fact(ts + resolution, dimensionValues, new Measurement("metric" + k, MeasureType.COUNTER, 3 * k)));
        }
    }
    // then incs in different time resolution buckets
    // (one increment of 4 * k in each of the three buckets)
    for (int i = 0; i < 3; i++) {
        for (int k = 1; k < 4; k++) {
            aggs.add(new Fact(ts + resolution * i, dimensionValues, new Measurement("metric" + k, MeasureType.COUNTER, 4 * k)));
        }
    }
    table.add(aggs);
    // verify each metric
    // expected per-bucket totals: ts -> 5k + 2k + 4k = 11k; ts + resolution -> 2k + 21k + 4k = 27k;
    // ts + 2 * resolution -> 2k + 4k = 6k
    for (int k = 1; k < 4; k++) {
        FactScan scan = new FactScan(ts - 2 * resolution, ts + 3 * resolution, "metric" + k, dimensionValues);
        Table<String, List<DimensionValue>, List<TimeValue>> expected = HashBasedTable.create();
        expected.put("metric" + k, dimensionValues, ImmutableList.of(new TimeValue(ts, 11 * k), new TimeValue(ts + resolution, 27 * k), new TimeValue(ts + 2 * resolution, 6 * k)));
        assertScan(table, expected, scan);
    }
    // verify each metric within a single timeBase
    for (int k = 1; k < 4; k++) {
        FactScan scan = new FactScan(ts, ts + resolution - 1, "metric" + k, dimensionValues);
        Table<String, List<DimensionValue>, List<TimeValue>> expected = HashBasedTable.create();
        expected.put("metric" + k, dimensionValues, ImmutableList.of(new TimeValue(ts, 11 * k)));
        assertScan(table, expected, scan);
    }
    // verify all metrics with fuzzy metric in scan
    Table<String, List<DimensionValue>, List<TimeValue>> expected = HashBasedTable.create();
    for (int k = 1; k < 4; k++) {
        expected.put("metric" + k, dimensionValues, ImmutableList.of(new TimeValue(ts, 11 * k), new TimeValue(ts + resolution, 27 * k), new TimeValue(ts + 2 * resolution, 6 * k)));
    }
    // metric = null means "all"
    // (this FactScan is built without a measure name)
    FactScan scan = new FactScan(ts - 2 * resolution, ts + 3 * resolution, dimensionValues);
    assertScan(table, expected, scan);
    // delete metric test
    expected.clear();
    // delete the metrics data in the third bucket (ts + 2 * resolution, i.e. ts + 20) for all measures
    scan = new FactScan(ts + resolution * 2, ts + resolution * 3, dimensionValues);
    table.delete(scan);
    for (int k = 1; k < 4; k++) {
        expected.put("metric" + k, dimensionValues, ImmutableList.of(new TimeValue(ts, 11 * k), new TimeValue(ts + resolution, 27 * k)));
    }
    // verify deletion
    scan = new FactScan(ts - 2 * resolution, ts + 3 * resolution, dimensionValues);
    assertScan(table, expected, scan);
    // delete metrics for "metric1" at ts0 and verify deletion
    scan = new FactScan(ts, ts + 1, "metric1", dimensionValues);
    table.delete(scan);
    expected.clear();
    expected.put("metric1", dimensionValues, ImmutableList.of(new TimeValue(ts + resolution, 27)));
    scan = new FactScan(ts - 2 * resolution, ts + 3 * resolution, "metric1", dimensionValues);
    assertScan(table, expected, scan);
    // verify the next dims search
    // dim1 given with a value: the first unspecified dimension with a non-null value (dim2) is expected back
    Collection<DimensionValue> nextTags = table.findSingleDimensionValue(ImmutableList.of("dim1", "dim2", "dim3"), ImmutableMap.of("dim1", "value1"), ts, ts + 1);
    Assert.assertEquals(ImmutableSet.of(new DimensionValue("dim2", "value2")), nextTags);
    // dim1 present in the slice with a null value: the same result is expected
    Map<String, String> slice = Maps.newHashMap();
    slice.put("dim1", null);
    nextTags = table.findSingleDimensionValue(ImmutableList.of("dim1", "dim2", "dim3"), slice, ts, ts + 1);
    Assert.assertEquals(ImmutableSet.of(new DimensionValue("dim2", "value2")), nextTags);
    // dim1 and dim2 given: only dim3 is left to fill
    nextTags = table.findSingleDimensionValue(ImmutableList.of("dim1", "dim2", "dim3"), ImmutableMap.of("dim1", "value1", "dim2", "value2"), ts, ts + 3);
    Assert.assertEquals(ImmutableSet.of(new DimensionValue("dim3", "value3")), nextTags);
    // add new dim values
    // (one fact with dim3 = null and one with dim2 = null, both under measure "metric")
    dimensionValues = ImmutableList.of(new DimensionValue("dim1", "value1"), new DimensionValue("dim2", "value5"), new DimensionValue("dim3", null));
    table.add(ImmutableList.of(new Fact(ts, dimensionValues, new Measurement("metric", MeasureType.COUNTER, 10))));
    dimensionValues = ImmutableList.of(new DimensionValue("dim1", "value1"), new DimensionValue("dim2", null), new DimensionValue("dim3", "value3"));
    table.add(ImmutableList.of(new Fact(ts, dimensionValues, new Measurement("metric", MeasureType.COUNTER, 10))));
    // expected: both dim2 values, plus dim3 = value3 from the fact whose dim2 is null
    nextTags = table.findSingleDimensionValue(ImmutableList.of("dim1", "dim2", "dim3"), ImmutableMap.of("dim1", "value1"), ts, ts + 1);
    Assert.assertEquals(ImmutableSet.of(new DimensionValue("dim2", "value2"), new DimensionValue("dim2", "value5"), new DimensionValue("dim3", "value3")), nextTags);
    // search for metric names given dims list and verify
    // ("metric1" is not expected in any result below: its data in [ts, ts + 1] was deleted above)
    Collection<String> metricNames = table.findMeasureNames(ImmutableList.of("dim1", "dim2", "dim3"), ImmutableMap.of("dim1", "value1", "dim2", "value2", "dim3", "value3"), ts, ts + 1);
    Assert.assertEquals(ImmutableSet.of("metric2", "metric3"), metricNames);
    metricNames = table.findMeasureNames(ImmutableList.of("dim1", "dim2", "dim3"), ImmutableMap.of("dim1", "value1"), ts, ts + 1);
    Assert.assertEquals(ImmutableSet.of("metric", "metric2", "metric3"), metricNames);
    metricNames = table.findMeasureNames(ImmutableList.of("dim1", "dim2", "dim3"), ImmutableMap.of("dim2", "value2"), ts, ts + 1);
    Assert.assertEquals(ImmutableSet.of("metric2", "metric3"), metricNames);
    metricNames = table.findMeasureNames(ImmutableList.of("dim1", "dim2", "dim3"), slice, ts, ts + 1);
    Assert.assertEquals(ImmutableSet.of("metric", "metric2", "metric3"), metricNames);
}
Also used : Measurement(co.cask.cdap.api.dataset.lib.cube.Measurement) DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue) InMemoryMetricsTable(co.cask.cdap.data2.dataset2.lib.table.inmemory.InMemoryMetricsTable) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) TimeValue(co.cask.cdap.api.dataset.lib.cube.TimeValue) Test(org.junit.Test)

Example 5 with DimensionValue

use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.

the class DefaultCube method query.

@Override
public Collection<TimeSeries> query(CubeQuery query) {
    /*
      Worked example of a CubeQuery, "dataset read ops for app per dataset", written as pseudo-SQL:

      SELECT count('read.ops')                                           << measure name and type
      FROM aggregation1.1min_resolution                                  << aggregation and resolution
      GROUP BY dataset                                                   << groupByDimensions
      WHERE namespace='ns1' AND app='myApp' AND program='myFlow' AND     << dimensionValues
            ts>=1423370200 AND ts<1423398198                             << startTs and endTs
      LIMIT 100                                                          << limit

      How it is executed:

      1) Pick the aggregation that will supply the results (only when the query does not name one).

      The aggregation must contain the dimensions 'namespace', 'app', 'program' and 'dataset'. To keep the scan
      range small, 'dataset' should ideally come last, the other dimensions as early as possible, and there
      should be as few extra "unspecified" dimensions as possible.

      Say the aggregation found is: 'namespace', 'app', 'program', 'instance', 'dataset'

      2) Build a scan over that aggregation.

      Every dimension the aggregation has but the query gives no value for is set to "any":

      'namespace'='ns1', 'app'='myApp', 'program'='myFlow', 'instance'=*, 'dataset'=*

      Plus specified measure & aggregation?:

      'measureName'='read.ops'
      'measureType'='COUNTER'

      3) While scanning, build a table: dimension values -> time -> value. Use measureType as the aggregate
         function for values if needed.
    */
    incrementMetric("cube.query.request.count", 1);
    boolean resolutionKnown = resolutionToFactTable.containsKey(query.getResolution());
    if (!resolutionKnown) {
        incrementMetric("cube.query.request.failure.count", 1);
        throw new IllegalArgumentException("There's no data aggregated for specified resolution to satisfy the query: " + query);
    }

    // 1) find aggregation to query: look one up unless the query names it explicitly
    final String aggName;
    final Aggregation agg;
    String requestedAggregation = query.getAggregation();
    if (requestedAggregation == null) {
        ImmutablePair<String, Aggregation> match = findAggregation(query);
        if (match == null) {
            incrementMetric("cube.query.request.failure.count", 1);
            throw new IllegalArgumentException("There's no data aggregated for specified dimensions to satisfy the query: " + query);
        }
        aggName = match.getFirst();
        agg = match.getSecond();
    } else {
        agg = aggregations.get(requestedAggregation);
        if (agg == null) {
            incrementMetric("cube.query.request.failure.count", 1);
            throw new IllegalArgumentException(String.format("Specified aggregation %s is not found in cube aggregations: %s", requestedAggregation, aggregations.keySet()));
        }
        aggName = requestedAggregation;
    }

    // tell how many queries end up querying specific pre-aggregated views and resolutions
    incrementMetric("cube.query.agg." + aggName + ".count", 1);
    incrementMetric("cube.query.res." + query.getResolution() + ".count", 1);

    // 2) build a scan for a query
    List<DimensionValue> scanDimensions = Lists.newArrayList();
    for (String name : agg.getDimensionNames()) {
        // a dimension the query does not define ends up with a null value, which means "any"
        scanDimensions.add(new DimensionValue(name, query.getDimensionValues().get(name)));
    }
    FactScan factScan = new FactScan(query.getStartTs(), query.getEndTs(), query.getMeasurements().keySet(), scanDimensions);

    // 3) execute scan query against the fact table of the requested resolution
    FactTable factTable = resolutionToFactTable.get(query.getResolution());
    Table<Map<String, String>, String, Map<Long, Long>> resultMap = getTimeSeries(query, factTable.scan(factScan));
    incrementMetric("cube.query.request.success.count", 1);
    incrementMetric("cube.query.result.size", resultMap.size());

    Collection<TimeSeries> timeSeries = convertToQueryResult(query, resultMap);
    incrementMetric("cube.query.result.timeseries.count", timeSeries.size());
    return timeSeries;
}
Also used : FactScan(co.cask.cdap.data2.dataset2.lib.timeseries.FactScan) TimeSeries(co.cask.cdap.api.dataset.lib.cube.TimeSeries) FactScanner(co.cask.cdap.data2.dataset2.lib.timeseries.FactScanner) FactTable(co.cask.cdap.data2.dataset2.lib.timeseries.FactTable) DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Aggregations

DimensionValue (co.cask.cdap.api.dataset.lib.cube.DimensionValue)19 FactTable (co.cask.cdap.data2.dataset2.lib.timeseries.FactTable)5 Test (org.junit.Test)5 TimeValue (co.cask.cdap.api.dataset.lib.cube.TimeValue)4 Row (co.cask.cdap.api.dataset.table.Row)4 InMemoryMetricsTable (co.cask.cdap.data2.dataset2.lib.table.inmemory.InMemoryMetricsTable)4 Map (java.util.Map)4 Scanner (co.cask.cdap.api.dataset.table.Scanner)3 FuzzyRowFilter (co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter)3 LinkedHashMap (java.util.LinkedHashMap)3 CubeFact (co.cask.cdap.api.dataset.lib.cube.CubeFact)2 Measurement (co.cask.cdap.api.dataset.lib.cube.Measurement)2 TimeSeries (co.cask.cdap.api.dataset.lib.cube.TimeSeries)2 FactScan (co.cask.cdap.data2.dataset2.lib.timeseries.FactScan)2 URL (java.net.URL)2 ArrayList (java.util.ArrayList)2 AggregationFunction (co.cask.cdap.api.dataset.lib.cube.AggregationFunction)1 CubeExploreQuery (co.cask.cdap.api.dataset.lib.cube.CubeExploreQuery)1 CubeQuery (co.cask.cdap.api.dataset.lib.cube.CubeQuery)1 TagValue (co.cask.cdap.api.metrics.TagValue)1