Search in sources :

Example 11 with DimensionValue

use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.

From the class FactCodec, method getDimensionValues.

/**
 * Decodes the list of dimension name/value pairs stored in the given row key.
 * Returns an empty list if the aggregation group cannot be resolved (corrupted
 * or deleted entity table data) or if the group contains no dimensions.
 */
public List<DimensionValue> getDimensionValues(byte[] rowKey) {
    // todo: in some cases the client already knows the agg group - to optimize we could accept it as a parameter
    // The aggregation group id is the first thing encoded after the version prefix.
    long encodedAggGroup = readEncoded(rowKey, VERSION.length);
    String aggGroup = entityTable.getName(encodedAggGroup, TYPE_DIMENSIONS_GROUP);
    if (aggGroup == null) {
        // Should never happen, unless data in the entity table was corrupted or deleted.
        LOG.warn("Could not decode agg group: " + encodedAggGroup);
        return Collections.emptyList();
    }
    if (aggGroup.isEmpty()) {
        return Collections.emptyList();
    }
    // The agg group is the list of dimension names joined with "." (see writeEncodedAggGroup for details).
    String[] dimensionNames = aggGroup.split("\\.");
    // todo: assert that the count of dimension values matches the count of dimension names?
    List<DimensionValue> decoded = Lists.newArrayListWithCapacity(dimensionNames.length);
    for (int i = 0; i < dimensionNames.length; i++) {
        // Dimension values follow the encoded agg group and the timebase (encoded as an int).
        int valueOffset = VERSION.length + entityTable.getIdSize() * (i + 1) + Bytes.SIZEOF_INT;
        long encodedValue = readEncoded(rowKey, valueOffset);
        decoded.add(new DimensionValue(dimensionNames[i], entityTable.getName(encodedValue, dimensionNames[i])));
    }
    return decoded;
}
Also used : DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue)

Example 12 with DimensionValue

use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.

From the class FactCodec, method createFuzzyRowMask.

/**
 * Creates a fuzzy row mask based on dimension values and a measure name.
 * A {@code null} dimension value or measure name matches any dimension value / any measure.
 * @param dimensionValues dimension values; an entry with a {@code null} value matches anything
 * @param measureName measure name, or {@code null} to match any measure
 * @return fuzzy mask byte array
 */
public byte[] createFuzzyRowMask(List<DimensionValue> dimensionValues, @Nullable String measureName) {
    // The mask layout mirrors the row key layout - see createRowKey for the row format.
    // "+2" covers the encoded agg group and the encoded measure name.
    int maskSize = VERSION.length + (dimensionValues.size() + 2) * entityTable.getIdSize() + Bytes.SIZEOF_INT;
    byte[] mask = new byte[maskSize];
    int pos = writeVersion(mask);
    // The encoded agg group is always supplied for a fuzzy row filter, so it is a fixed region.
    pos = writeEncodedFixedMask(mask, pos);
    // Time is constrained by the scan's start/stop keys, never by the fuzzy filter.
    pos = writeFuzzyMask(mask, pos, Bytes.SIZEOF_INT);
    for (DimensionValue dim : dimensionValues) {
        pos = (dim.getValue() == null)
            ? writeEncodedFuzzyMask(mask, pos)
            : writeEncodedFixedMask(mask, pos);
    }
    if (measureName == null) {
        writeEncodedFuzzyMask(mask, pos);
    } else {
        writeEncodedFixedMask(mask, pos);
    }
    return mask;
}
Also used : DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue)

Example 13 with DimensionValue

use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.

From the class FactCodec, method createRowKey.

/**
 * Builds a row key for the given dimension values, measure name and timestamp.
 * Row key format:
 * {@code <version><encoded agg group><time base><encoded dimension1 value>...<encoded dimensionN value><encoded measure name>}.
 * A {@code null} dimension value or measure name is written as an "ANY" marker,
 * which is only meaningful when constructing scan boundaries.
 */
private byte[] createRowKey(List<DimensionValue> dimensionValues, String measureName, long ts, boolean stopKey, boolean anyAggGroup) {
    // "+2" accounts for <encoded agg group> and <encoded measure name>.
    int keySize = VERSION.length + (dimensionValues.size() + 2) * entityTable.getIdSize() + Bytes.SIZEOF_INT;
    byte[] rowKey = new byte[keySize];
    int pos = writeVersion(rowKey);
    pos = anyAggGroup
        ? writeAnyEncoded(rowKey, pos, stopKey)
        : writeEncodedAggGroup(dimensionValues, rowKey, pos);
    // The timestamp is bucketed to the resolution and stored as an int time base.
    long roundedTs = roundToResolution(ts);
    pos = Bytes.putInt(rowKey, pos, getTimeBase(roundedTs));
    for (DimensionValue dim : dimensionValues) {
        if (dim.getValue() == null) {
            // todo: only applicable when constructing a scan; throw smth if constructing a key for writing data
            // A null value is written as "ANY".
            pos = writeAnyEncoded(rowKey, pos, stopKey);
        } else {
            // The encoded value is unique within the values of the dimension name.
            pos = writeEncoded(dim.getName(), dim.getValue(), rowKey, pos);
        }
    }
    if (measureName == null) {
        // todo: only applicable when constructing a scan; throw smth if constructing a key for writing data
        // A null measure name is written as "ANY".
        writeAnyEncoded(rowKey, pos, stopKey);
    } else {
        writeEncoded(TYPE_MEASURE_NAME, measureName, rowKey, pos);
    }
    return rowKey;
}
Also used : DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue)

Example 14 with DimensionValue

use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.

From the class FactScanner, method createIterator.

/**
 * Wraps the underlying table scanner in an iterator of {@link FactScanResult}s,
 * decoding each row key into a measure name and dimension values and filtering
 * column values to the [startTs, endTs] window. The underlying scanner is closed
 * when the iteration is exhausted.
 */
private Iterator<FactScanResult> createIterator() {
    return new AbstractIterator<FactScanResult>() {

        @Override
        protected FactScanResult computeNext() {
            Row rowResult;
            while ((rowResult = scanner.next()) != null) {
                rowScanned++;
                byte[] rowKey = rowResult.getRow();
                // Decode context and metric from key
                String measureName = codec.getMeasureName(rowKey);
                // if measureNames is empty we include all metrics
                if (!measureNames.isEmpty() && !measureNames.contains(measureName)) {
                    continue;
                }
                // todo: codec.getDimensionValues(rowKey) needs to un-encode dimension names which may result in read in
                // entity table (depending on the cache and its state). To avoid that, we can pass to scanner the
                // list of dimension names as we *always* know it (it is given) at the time of scanning
                List<DimensionValue> dimensionValues = codec.getDimensionValues(rowKey);
                boolean exhausted = false;
                List<TimeValue> timeValues = Lists.newLinkedList();
                // todo: entry set is ordered by ts?
                for (Map.Entry<byte[], byte[]> columnValue : rowResult.getColumns().entrySet()) {
                    long ts = codec.getTimestamp(rowKey, columnValue.getKey());
                    if (ts < startTs) {
                        // before the requested window - skip this column
                        continue;
                    }
                    if (ts > endTs) {
                        // past the requested window - no more columns of interest in this row
                        exhausted = true;
                        break;
                    }
                    // todo: move Bytes.toLong into codec?
                    TimeValue timeValue = new TimeValue(ts, Bytes.toLong(columnValue.getValue()));
                    timeValues.add(timeValue);
                }
                if (timeValues.isEmpty() && exhausted) {
                    // row had only data past endTs - nothing more to return from this scan
                    break;
                }
                // todo: can return empty list, if all data is < startTs or > endTs
                return new FactScanResult(measureName, dimensionValues, timeValues);
            }
            scanner.close();
            return endOfData();
        }
    };
}
Also used : DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue) AbstractIterator(com.google.common.collect.AbstractIterator) Row(co.cask.cdap.api.dataset.table.Row) Map(java.util.Map) TimeValue(co.cask.cdap.api.dataset.lib.cube.TimeValue)

Example 15 with DimensionValue

use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.

From the class DefaultCube, method getTimeSeries.

/**
 * Aggregates scanned facts into time series, grouped by the query's group-by dimensions
 * and measure name. The aggregation function (SUM/MAX/MIN/LATEST) comes from the query's
 * measurements. Records whose group-by dimension value is null are skipped to keep
 * results consistent across aggregation choices. Scanning stops after
 * MAX_RECORDS_TO_SCAN records.
 *
 * @param query the cube query defining group-by dimensions and measurements
 * @param scanner source of scanned fact records
 * @return table of {dimension values, measure name} -> {timestamp -> value}
 */
private Table<Map<String, String>, String, Map<Long, Long>> getTimeSeries(CubeQuery query, FactScanner scanner) {
    // {dimension values, measure} -> {time -> value}s
    Table<Map<String, String>, String, Map<Long, Long>> result = HashBasedTable.create();
    int count = 0;
    while (scanner.hasNext()) {
        FactScanResult next = scanner.next();
        incrementMetric("cube.query.scan.records.count", 1);
        boolean skip = false;
        // using tree map, as we are using it as a key for a map
        Map<String, String> seriesDimensions = Maps.newTreeMap();
        for (String dimensionName : query.getGroupByDimensions()) {
            // todo: use Map<String, String> instead of List<DimensionValue> into a String, String, everywhere
            for (DimensionValue dimensionValue : next.getDimensionValues()) {
                if (dimensionName.equals(dimensionValue.getName())) {
                    if (dimensionValue.getValue() == null) {
                        // Currently, we do NOT return null as grouped by value.
                        // Depending on whether dimension is required or not the records with null value in it may or may not be
                        // in aggregation. At this moment, the choosing of the aggregation for query doesn't look at this, so
                        // potentially null may or may not be included in results, depending on the aggregation selected
                        // querying. We don't want to produce inconsistent results varying due to different aggregations selected,
                        // so don't return nulls in any of those cases.
                        skip = true;
                        continue;
                    }
                    seriesDimensions.put(dimensionName, dimensionValue.getValue());
                    break;
                }
            }
        }
        if (skip) {
            incrementMetric("cube.query.scan.skipped.count", 1);
            continue;
        }
        // Hoisted outside the per-timestamp loop: the measure name and its time-series map
        // are constant for the whole record (the original re-queried the Table up to four
        // times per data point and discarded the first lookup).
        String measureName = next.getMeasureName();
        Map<Long, Long> timeValues = result.get(seriesDimensions, measureName);
        if (timeValues == null) {
            timeValues = Maps.newHashMap();
            result.put(seriesDimensions, measureName, timeValues);
        }
        AggregationFunction function = query.getMeasurements().get(measureName);
        for (TimeValue timeValue : next) {
            long ts = timeValue.getTimestamp();
            if (AggregationFunction.SUM == function) {
                Long value = timeValues.get(ts);
                value = value == null ? 0 : value;
                value += timeValue.getValue();
                timeValues.put(ts, value);
            } else if (AggregationFunction.MAX == function) {
                Long value = timeValues.get(ts);
                value = value != null && value > timeValue.getValue() ? value : timeValue.getValue();
                timeValues.put(ts, value);
            } else if (AggregationFunction.MIN == function) {
                Long value = timeValues.get(ts);
                value = value != null && value < timeValue.getValue() ? value : timeValue.getValue();
                timeValues.put(ts, value);
            } else if (AggregationFunction.LATEST == function) {
                timeValues.put(ts, timeValue.getValue());
            } else {
                // should never happen: developer error
                throw new RuntimeException("Unknown MeasureType: " + function);
            }
        }
        if (++count >= MAX_RECORDS_TO_SCAN) {
            break;
        }
    }
    return result;
}
Also used : AggregationFunction(co.cask.cdap.api.dataset.lib.cube.AggregationFunction) FactScanResult(co.cask.cdap.data2.dataset2.lib.timeseries.FactScanResult) DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) TimeValue(co.cask.cdap.api.dataset.lib.cube.TimeValue)

Aggregations

DimensionValue (co.cask.cdap.api.dataset.lib.cube.DimensionValue)19 FactTable (co.cask.cdap.data2.dataset2.lib.timeseries.FactTable)5 Test (org.junit.Test)5 TimeValue (co.cask.cdap.api.dataset.lib.cube.TimeValue)4 Row (co.cask.cdap.api.dataset.table.Row)4 InMemoryMetricsTable (co.cask.cdap.data2.dataset2.lib.table.inmemory.InMemoryMetricsTable)4 Map (java.util.Map)4 Scanner (co.cask.cdap.api.dataset.table.Scanner)3 FuzzyRowFilter (co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter)3 LinkedHashMap (java.util.LinkedHashMap)3 CubeFact (co.cask.cdap.api.dataset.lib.cube.CubeFact)2 Measurement (co.cask.cdap.api.dataset.lib.cube.Measurement)2 TimeSeries (co.cask.cdap.api.dataset.lib.cube.TimeSeries)2 FactScan (co.cask.cdap.data2.dataset2.lib.timeseries.FactScan)2 URL (java.net.URL)2 ArrayList (java.util.ArrayList)2 AggregationFunction (co.cask.cdap.api.dataset.lib.cube.AggregationFunction)1 CubeExploreQuery (co.cask.cdap.api.dataset.lib.cube.CubeExploreQuery)1 CubeQuery (co.cask.cdap.api.dataset.lib.cube.CubeQuery)1 TagValue (co.cask.cdap.api.metrics.TagValue)1