Use of io.cdap.cdap.data2.dataset2.lib.timeseries.FactScanner in project cdap by caskdata.
Class DefaultCube, method query.
/**
 * Runs a cube query against one pre-aggregated view at the resolution the query asks for.
 *
 * @param query the query to execute
 * @return the time series built by {@code convertToQueryResult} from the scanned facts
 * @throws IllegalArgumentException if there is no fact table for the requested resolution, if the
 *     query names an aggregation this cube does not have, or if no aggregation can be found for
 *     the query
 */
@Override
public Collection<TimeSeries> query(CubeQuery query) {
  /*
    Worked example of a CubeQuery, "dataset read ops for app per dataset", written as pseudo-SQL:

      SELECT count('read.ops')                                         << measure name and type
      FROM aggregation1.1min_resolution                                << aggregation and resolution
      GROUP BY dataset                                                 << groupByDimensions
      WHERE namespace='ns1' AND app='myApp' AND program='myFlow' AND   << dimensionValues
            ts>=1423370200 AND ts<1423398198                           << startTs and endTs
      LIMIT 100                                                        << limit

    How it is executed:
      1) (only when the query does not name an aggregation) find an aggregation that can supply the
         results. For the example we need one containing 'namespace', 'app', 'program', 'dataset'.
         To keep the scan range small, 'dataset' should ideally come last, the other dimensions as
         close to the beginning as possible, with as few "unspecified" dimensions as possible.
         Say we found: 'namespace', 'app', 'program', 'instance', 'dataset'.
      2) build a scan over that aggregation. Dimensions the aggregation has but the query gives no
         value for are set to "any":
           'namespace'='ns1', 'app'='myApp', 'program'='myFlow', 'instance'=*, 'dataset'=*
         plus the requested measure:
           'measureName'='read.ops'
           'measureType'='COUNTER'
      3) while scanning, build a table: dimension values -> time -> value, combining values with
         the measure's aggregate function where needed.
  */
  incrementMetric("cube.query.request.count", 1);

  // Fail fast when nothing is stored at the requested resolution.
  if (!resolutionToFactTable.containsKey(query.getResolution())) {
    incrementMetric("cube.query.request.failure.count", 1);
    throw new IllegalArgumentException("There's no data aggregated for specified resolution to satisfy the query: " + query.toString());
  }

  // 1) choose the aggregation: the one named by the query, otherwise the best match we can find
  String aggName = query.getAggregation();
  Aggregation agg;
  if (aggName == null) {
    ImmutablePair<String, Aggregation> match = findAggregation(query);
    if (match == null) {
      incrementMetric("cube.query.request.failure.count", 1);
      throw new IllegalArgumentException("There's no data aggregated for specified dimensions " + "to satisfy the query: " + query.toString());
    }
    aggName = match.getFirst();
    agg = match.getSecond();
  } else {
    agg = aggregations.get(aggName);
    if (agg == null) {
      incrementMetric("cube.query.request.failure.count", 1);
      throw new IllegalArgumentException(String.format("Specified aggregation %s is not found in cube aggregations: %s", aggName, aggregations.keySet().toString()));
    }
  }

  // record which pre-aggregated view and which resolution ended up serving this query
  incrementMetric("cube.query.agg." + aggName + ".count", 1);
  incrementMetric("cube.query.res." + query.getResolution() + ".count", 1);

  // 2) build the scan: one entry per dimension of the aggregation, in the aggregation's order
  List<DimensionValue> scanDimensions = Lists.newArrayList();
  for (String name : agg.getDimensionNames()) {
    // a dimension the query does not constrain yields a null value, which means "any"
    scanDimensions.add(new DimensionValue(name, query.getDimensionValues().get(name)));
  }
  FactScan scan = new FactScan(query.getStartTs(), query.getEndTs(), query.getMeasurements().keySet(), scanDimensions);

  // 3) run the scan and fold the records into per-series time/value maps
  // NOTE(review): the scanner is not closed in this method - confirm whether FactScanner holds
  // resources that need to be released.
  FactScanner scanner = resolutionToFactTable.get(query.getResolution()).scan(scan);
  Table<Map<String, String>, String, Map<Long, Long>> seriesTable = getTimeSeries(query, scanner);
  incrementMetric("cube.query.request.success.count", 1);
  incrementMetric("cube.query.result.size", seriesTable.size());

  Collection<TimeSeries> answer = convertToQueryResult(query, seriesTable);
  incrementMetric("cube.query.result.timeseries.count", answer.size());
  return answer;
}
Use of io.cdap.cdap.data2.dataset2.lib.timeseries.FactScanner in project cdap by caskdata.
Class DefaultCube, method query.
/**
 * Runs a cube query against one pre-aggregated view at the resolution the query asks for.
 *
 * @param query the query to execute
 * @return the time series built by {@code convertToQueryResult} from the scanned facts
 * @throws IllegalArgumentException if there is no fact table for the requested resolution, if the
 *     query names an aggregation this cube does not have, or if no aggregation can be found for
 *     the query
 */
@Override
public Collection<TimeSeries> query(CubeQuery query) {
  /*
    Worked example of a CubeQuery, "dataset read ops for app per dataset", written as pseudo-SQL:

      SELECT count('read.ops')                                         << measure name and type
      FROM aggregation1.1min_resolution                                << aggregation and resolution
      GROUP BY dataset                                                 << groupByDimensions
      WHERE namespace='ns1' AND app='myApp' AND program='myFlow' AND   << dimensionValues
            ts>=1423370200 AND ts<1423398198                           << startTs and endTs
      LIMIT 100                                                        << limit

    How it is executed:
      1) (only when the query does not name an aggregation) find an aggregation that can supply the
         results. For the example we need one containing 'namespace', 'app', 'program', 'dataset'.
         To keep the scan range small, 'dataset' should ideally come last, the other dimensions as
         close to the beginning as possible, with as few "unspecified" dimensions as possible.
         Say we found: 'namespace', 'app', 'program', 'instance', 'dataset'.
      2) build a scan over that aggregation. Dimensions the aggregation has but the query gives no
         value for are set to "any":
           'namespace'='ns1', 'app'='myApp', 'program'='myFlow', 'instance'=*, 'dataset'=*
         plus the requested measure:
           'measureName'='read.ops'
           'measureType'='COUNTER'
      3) while scanning, build a table: dimension values -> time -> value, combining values with
         the measure's aggregate function where needed.
  */
  incrementMetric("cube.query.request.count", 1);

  // Fail fast when nothing is stored at the requested resolution.
  if (!resolutionToFactTable.containsKey(query.getResolution())) {
    incrementMetric("cube.query.request.failure.count", 1);
    throw new IllegalArgumentException("There's no data aggregated for specified resolution to satisfy the query: " + query.toString());
  }

  // 1) choose the aggregation: the one named by the query, otherwise the best match we can find
  String aggName = query.getAggregation();
  Aggregation agg;
  if (aggName == null) {
    ImmutablePair<String, Aggregation> match = findAggregation(query);
    if (match == null) {
      incrementMetric("cube.query.request.failure.count", 1);
      throw new IllegalArgumentException("There's no data aggregated for specified dimensions " + "to satisfy the query: " + query.toString());
    }
    aggName = match.getFirst();
    agg = match.getSecond();
  } else {
    agg = aggregations.get(aggName);
    if (agg == null) {
      incrementMetric("cube.query.request.failure.count", 1);
      throw new IllegalArgumentException(String.format("Specified aggregation %s is not found in cube aggregations: %s", aggName, aggregations.keySet().toString()));
    }
  }

  // record which pre-aggregated view and which resolution ended up serving this query
  incrementMetric("cube.query.agg." + aggName + ".count", 1);
  incrementMetric("cube.query.res." + query.getResolution() + ".count", 1);

  // 2) build the scan: one entry per dimension of the aggregation, in the aggregation's order
  List<DimensionValue> scanDimensions = Lists.newArrayList();
  for (String name : agg.getDimensionNames()) {
    // a dimension the query does not constrain yields a null value, which means "any"
    scanDimensions.add(new DimensionValue(name, query.getDimensionValues().get(name)));
  }
  FactScan scan = new FactScan(query.getStartTs(), query.getEndTs(), query.getMeasurements().keySet(), scanDimensions);

  // 3) run the scan and fold the records into per-series time/value maps
  // NOTE(review): the scanner is not closed in this method - confirm whether FactScanner holds
  // resources that need to be released.
  FactScanner scanner = resolutionToFactTable.get(query.getResolution()).scan(scan);
  Table<Map<String, String>, String, Map<Long, Long>> seriesTable = getTimeSeries(query, scanner);
  incrementMetric("cube.query.request.success.count", 1);
  incrementMetric("cube.query.result.size", seriesTable.size());

  Collection<TimeSeries> answer = convertToQueryResult(query, seriesTable);
  incrementMetric("cube.query.result.timeseries.count", answer.size());
  return answer;
}
Use of io.cdap.cdap.data2.dataset2.lib.timeseries.FactScanner in project cdap by caskdata.
Class DefaultCube, method getTimeSeries.
/**
 * Drains the scanner into a table keyed by (group-by dimension values, measure name) whose cells
 * map timestamp to the aggregated value.
 *
 * <p>Values that land on the same series, measure and timestamp are combined with the
 * {@link AggregationFunction} the query declares for that measure: SUM adds, MAX/MIN keep the
 * larger/smaller value, LATEST keeps the value seen last in scan order.
 *
 * <p>Records carrying a null value for any group-by dimension are dropped (and counted under
 * "cube.query.scan.skipped.count"). Scanning stops once MAX_RECORDS_TO_SCAN records have been
 * processed; dropped records do not count toward that limit.
 *
 * @param query supplies the group-by dimensions and the aggregation function per measure
 * @param scanner source of fact records; consumed by this method
 * @return table of {dimension values, measure} -> {time -> value}
 * @throws RuntimeException if a scanned time value belongs to a measure whose aggregation
 *     function is not one of SUM, MAX, MIN or LATEST (this includes a measure for which
 *     {@code query.getMeasurements()} yields no function)
 */
private Table<Map<String, String>, String, Map<Long, Long>> getTimeSeries(CubeQuery query, FactScanner scanner) {
  // {dimension values, measure} -> {time -> value}s
  Table<Map<String, String>, String, Map<Long, Long>> result = HashBasedTable.create();
  int count = 0;
  while (scanner.hasNext()) {
    FactScanResult next = scanner.next();
    incrementMetric("cube.query.scan.records.count", 1);

    boolean skip = false;
    // using tree map, as we are using it as a key for a map
    Map<String, String> seriesDimensions = Maps.newTreeMap();
    groupByDimensions:
    for (String dimensionName : query.getGroupByDimensions()) {
      // todo: use Map<String, String> instead of List<DimensionValue> into a String, String, everywhere
      for (DimensionValue dimensionValue : next.getDimensionValues()) {
        if (!dimensionName.equals(dimensionValue.getName())) {
          continue;
        }
        if (dimensionValue.getValue() == null) {
          // Currently, we do NOT return null as grouped by value.
          // Depending on whether dimension is required or not the records with null value in it may or may not be
          // in aggregation. At this moment, the choosing of the aggregation for query doesn't look at this, so
          // potentially null may or may not be included in results, depending on the aggregation selected
          // querying. We don't want to produce inconsistent results varying due to different aggregations selected,
          // so don't return nulls in any of those cases.
          skip = true;
          // The record is going to be dropped, so there is no point in looking at the remaining dimensions.
          break groupByDimensions;
        }
        seriesDimensions.put(dimensionName, dimensionValue.getValue());
        break;
      }
    }
    if (skip) {
      incrementMetric("cube.query.scan.skipped.count", 1);
      // note: skipped records are not counted against MAX_RECORDS_TO_SCAN
      continue;
    }

    // Everything below is constant for the whole record, so resolve it once rather than per time value.
    String measureName = next.getMeasureName();
    AggregationFunction function = query.getMeasurements().get(measureName);
    // Stays null until the first time value shows up, so that records without any time values
    // do not leave an empty cell in the result.
    Map<Long, Long> timeValues = result.get(seriesDimensions, measureName);

    for (TimeValue timeValue : next) {
      if (timeValues == null) {
        timeValues = Maps.newHashMap();
        result.put(seriesDimensions, measureName, timeValues);
      }
      if (AggregationFunction.SUM == function) {
        Long value = timeValues.get(timeValue.getTimestamp());
        value = value == null ? 0 : value;
        value += timeValue.getValue();
        timeValues.put(timeValue.getTimestamp(), value);
      } else if (AggregationFunction.MAX == function) {
        Long value = timeValues.get(timeValue.getTimestamp());
        value = value != null && value > timeValue.getValue() ? value : timeValue.getValue();
        timeValues.put(timeValue.getTimestamp(), value);
      } else if (AggregationFunction.MIN == function) {
        Long value = timeValues.get(timeValue.getTimestamp());
        value = value != null && value < timeValue.getValue() ? value : timeValue.getValue();
        timeValues.put(timeValue.getTimestamp(), value);
      } else if (AggregationFunction.LATEST == function) {
        timeValues.put(timeValue.getTimestamp(), timeValue.getValue());
      } else {
        // should never happen: developer error
        throw new RuntimeException("Unknown MeasureType: " + function);
      }
    }

    if (++count >= MAX_RECORDS_TO_SCAN) {
      break;
    }
  }
  return result;
}
Use of io.cdap.cdap.data2.dataset2.lib.timeseries.FactScanner in project cdap by caskdata.
Class DefaultCube, method getTimeSeries.
/**
 * Drains the scanner into a table keyed by (group-by dimension values, measure name) whose cells
 * map timestamp to the aggregated value.
 *
 * <p>Values that land on the same series, measure and timestamp are combined with the
 * {@link AggregationFunction} the query declares for that measure: SUM adds, MAX/MIN keep the
 * larger/smaller value, LATEST keeps the value seen last in scan order.
 *
 * <p>Records carrying a null value for any group-by dimension are dropped (and counted under
 * "cube.query.scan.skipped.count"). Scanning stops once MAX_RECORDS_TO_SCAN records have been
 * processed; dropped records do not count toward that limit.
 *
 * @param query supplies the group-by dimensions and the aggregation function per measure
 * @param scanner source of fact records; consumed by this method
 * @return table of {dimension values, measure} -> {time -> value}
 * @throws RuntimeException if a scanned time value belongs to a measure whose aggregation
 *     function is not one of SUM, MAX, MIN or LATEST (this includes a measure for which
 *     {@code query.getMeasurements()} yields no function)
 */
private Table<Map<String, String>, String, Map<Long, Long>> getTimeSeries(CubeQuery query, FactScanner scanner) {
  // {dimension values, measure} -> {time -> value}s
  Table<Map<String, String>, String, Map<Long, Long>> result = HashBasedTable.create();
  int count = 0;
  while (scanner.hasNext()) {
    FactScanResult next = scanner.next();
    incrementMetric("cube.query.scan.records.count", 1);

    boolean skip = false;
    // using tree map, as we are using it as a key for a map
    Map<String, String> seriesDimensions = Maps.newTreeMap();
    groupByDimensions:
    for (String dimensionName : query.getGroupByDimensions()) {
      // todo: use Map<String, String> instead of List<DimensionValue> into a String, String, everywhere
      for (DimensionValue dimensionValue : next.getDimensionValues()) {
        if (!dimensionName.equals(dimensionValue.getName())) {
          continue;
        }
        if (dimensionValue.getValue() == null) {
          // Currently, we do NOT return null as grouped by value.
          // Depending on whether dimension is required or not the records with null value in it may or may not be
          // in aggregation. At this moment, the choosing of the aggregation for query doesn't look at this, so
          // potentially null may or may not be included in results, depending on the aggregation selected
          // querying. We don't want to produce inconsistent results varying due to different aggregations selected,
          // so don't return nulls in any of those cases.
          skip = true;
          // The record is going to be dropped, so there is no point in looking at the remaining dimensions.
          break groupByDimensions;
        }
        seriesDimensions.put(dimensionName, dimensionValue.getValue());
        break;
      }
    }
    if (skip) {
      incrementMetric("cube.query.scan.skipped.count", 1);
      // note: skipped records are not counted against MAX_RECORDS_TO_SCAN
      continue;
    }

    // Everything below is constant for the whole record, so resolve it once rather than per time value.
    String measureName = next.getMeasureName();
    AggregationFunction function = query.getMeasurements().get(measureName);
    // Stays null until the first time value shows up, so that records without any time values
    // do not leave an empty cell in the result.
    Map<Long, Long> timeValues = result.get(seriesDimensions, measureName);

    for (TimeValue timeValue : next) {
      if (timeValues == null) {
        timeValues = Maps.newHashMap();
        result.put(seriesDimensions, measureName, timeValues);
      }
      if (AggregationFunction.SUM == function) {
        Long value = timeValues.get(timeValue.getTimestamp());
        value = value == null ? 0 : value;
        value += timeValue.getValue();
        timeValues.put(timeValue.getTimestamp(), value);
      } else if (AggregationFunction.MAX == function) {
        Long value = timeValues.get(timeValue.getTimestamp());
        value = value != null && value > timeValue.getValue() ? value : timeValue.getValue();
        timeValues.put(timeValue.getTimestamp(), value);
      } else if (AggregationFunction.MIN == function) {
        Long value = timeValues.get(timeValue.getTimestamp());
        value = value != null && value < timeValue.getValue() ? value : timeValue.getValue();
        timeValues.put(timeValue.getTimestamp(), value);
      } else if (AggregationFunction.LATEST == function) {
        timeValues.put(timeValue.getTimestamp(), timeValue.getValue());
      } else {
        // should never happen: developer error
        throw new RuntimeException("Unknown MeasureType: " + function);
      }
    }

    if (++count >= MAX_RECORDS_TO_SCAN) {
      break;
    }
  }
  return result;
}
Aggregations