Use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.
The class FactCodec, method getDimensionValues.
public List<DimensionValue> getDimensionValues(byte[] rowKey) {
  // todo: in some cases, the client knows the agg group - so to optimize we can accept it as a parameter
  // first encoded is aggregation group
  long encodedAggGroup = readEncoded(rowKey, VERSION.length);
  String aggGroup = entityTable.getName(encodedAggGroup, TYPE_DIMENSIONS_GROUP);
  if (aggGroup == null) {
    // will never happen, unless data in entity table was corrupted or deleted
    LOG.warn("Could not decode agg group: " + encodedAggGroup);
    return Collections.emptyList();
  }
  if (aggGroup.isEmpty()) {
    return Collections.emptyList();
  }
  // aggregation group is defined by a list of dimension names concatenated with "." (see writeEncodedAggGroup
  // for details)
  String[] dimensionNames = aggGroup.split("\\.");
  // todo: assert count of dimension values is same as dimension names?
  List<DimensionValue> dimensions = Lists.newArrayListWithCapacity(dimensionNames.length);
  for (int i = 0; i < dimensionNames.length; i++) {
    // dimension values go right after encoded agg group and timebase (encoded as int)
    long encodedDimensionValue =
      readEncoded(rowKey, VERSION.length + entityTable.getIdSize() * (i + 1) + Bytes.SIZEOF_INT);
    String dimensionValue = entityTable.getName(encodedDimensionValue, dimensionNames[i]);
    dimensions.add(new DimensionValue(dimensionNames[i], dimensionValue));
  }
  return dimensions;
}
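A minimal usage sketch for orientation, assuming a codec instance and a scanned row as in FactScanner below (the variable names are illustrative, not from the project):

// Decode the dimensions of a scanned row key back into name/value pairs.
byte[] rowKey = row.getRow();
for (DimensionValue dimensionValue : codec.getDimensionValues(rowKey)) {
  System.out.println(dimensionValue.getName() + " = " + dimensionValue.getValue());
}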
Use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.
The class FactCodec, method createFuzzyRowMask.
/**
 * Creates a fuzzy row mask based on dimension values and a measure name.
 * If a dimension value or the measure name is null, the mask matches any value in that position.
 * @param dimensionValues dimension values to match; a null value makes that dimension fuzzy
 * @param measureName measure name to match, or null to match any measure
 * @return fuzzy mask byte array
 */
public byte[] createFuzzyRowMask(List<DimensionValue> dimensionValues, @Nullable String measureName) {
  // See createRowKey for row format info
  byte[] mask = new byte[VERSION.length + (dimensionValues.size() + 2) * entityTable.getIdSize() + Bytes.SIZEOF_INT];
  int offset = writeVersion(mask);
  // encoded agg group is always provided for the fuzzy row filter
  offset = writeEncodedFixedMask(mask, offset);
  // time is defined by start/stop keys when scanning - we never include it in the fuzzy filter
  offset = writeFuzzyMask(mask, offset, Bytes.SIZEOF_INT);
  for (DimensionValue dimensionValue : dimensionValues) {
    if (dimensionValue.getValue() != null) {
      offset = writeEncodedFixedMask(mask, offset);
    } else {
      offset = writeEncodedFuzzyMask(mask, offset);
    }
  }
  if (measureName != null) {
    writeEncodedFixedMask(mask, offset);
  } else {
    writeEncodedFuzzyMask(mask, offset);
  }
  return mask;
}
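To illustrate the intent, a hedged sketch of building a mask for a scan: a null dimension value or a null measure name produces fuzzy bytes in that position, so the filter accepts any encoded id there. The dimension names and values are made up for illustration:

// Match only rows whose "app" is "purchaseApp", with any "dataset" and any measure.
List<DimensionValue> dims = Lists.newArrayList(
  new DimensionValue("app", "purchaseApp"),  // non-null value: fixed bytes in the mask
  new DimensionValue("dataset", null));      // null value: fuzzy bytes, matches any dataset
byte[] mask = codec.createFuzzyRowMask(dims, null); // null measure name: matches any measure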
Use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.
The class FactCodec, method createRowKey.
private byte[] createRowKey(List<DimensionValue> dimensionValues, String measureName, long ts,
                            boolean stopKey, boolean anyAggGroup) {
  // Row key format:
  // <version><encoded agg group><time base><encoded dimension1 value>...
  // <encoded dimensionN value><encoded measure name>.
  // "+2" is for <encoded agg group> and <encoded measure name>
  byte[] rowKey = new byte[VERSION.length + (dimensionValues.size() + 2) * entityTable.getIdSize() + Bytes.SIZEOF_INT];
  int offset = writeVersion(rowKey);
  if (anyAggGroup) {
    offset = writeAnyEncoded(rowKey, offset, stopKey);
  } else {
    offset = writeEncodedAggGroup(dimensionValues, rowKey, offset);
  }
  long timestamp = roundToResolution(ts);
  int timeBase = getTimeBase(timestamp);
  offset = Bytes.putInt(rowKey, offset, timeBase);
  for (DimensionValue dimensionValue : dimensionValues) {
    if (dimensionValue.getValue() != null) {
      // encoded value is unique within values of the dimension name
      offset = writeEncoded(dimensionValue.getName(), dimensionValue.getValue(), rowKey, offset);
    } else {
      // todo: this is only applicable when constructing a scan; throw something if constructing a key for
      // writing data
      // writing "ANY" as a value
      offset = writeAnyEncoded(rowKey, offset, stopKey);
    }
  }
  if (measureName != null) {
    writeEncoded(TYPE_MEASURE_NAME, measureName, rowKey, offset);
  } else {
    // todo: this is only applicable when constructing a scan; throw something if constructing a key for
    // writing data
    // writing "ANY" value
    writeAnyEncoded(rowKey, offset, stopKey);
  }
  return rowKey;
}
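As a worked example of the size arithmetic, assuming a 2-byte VERSION prefix and a 3-byte entity id size (both are table configuration details, chosen here purely for illustration):

// <version:2><agg group id:3><time base:4><dim1 id:3><dim2 id:3><measure id:3>
int versionLength = 2, idSize = 3, numDimensions = 2;
int keySize = versionLength + (numDimensions + 2) * idSize + Bytes.SIZEOF_INT; // 2 + 12 + 4 = 18 bytes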
Use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.
The class FactScanner, method createIterator.
private Iterator<FactScanResult> createIterator() {
  return new AbstractIterator<FactScanResult>() {
    @Override
    protected FactScanResult computeNext() {
      Row rowResult;
      while ((rowResult = scanner.next()) != null) {
        rowScanned++;
        byte[] rowKey = rowResult.getRow();
        // Decode context and metric from key
        String measureName = codec.getMeasureName(rowKey);
        // if measureNames is empty we include all metrics
        if (!measureNames.isEmpty() && !measureNames.contains(measureName)) {
          continue;
        }
        // todo: codec.getDimensionValues(rowKey) needs to un-encode dimension names, which may result in a read
        // of the entity table (depending on the cache and its state). To avoid that, we could pass the list of
        // dimension names to the scanner, as we *always* know it (it is given) at the time of scanning
        List<DimensionValue> dimensionValues = codec.getDimensionValues(rowKey);
        boolean exhausted = false;
        List<TimeValue> timeValues = Lists.newLinkedList();
        // todo: is the entry set ordered by ts?
        for (Map.Entry<byte[], byte[]> columnValue : rowResult.getColumns().entrySet()) {
          long ts = codec.getTimestamp(rowKey, columnValue.getKey());
          if (ts < startTs) {
            continue;
          }
          if (ts > endTs) {
            exhausted = true;
            break;
          }
          // todo: move Bytes.toLong into codec?
          TimeValue timeValue = new TimeValue(ts, Bytes.toLong(columnValue.getValue()));
          timeValues.add(timeValue);
        }
        if (timeValues.isEmpty() && exhausted) {
          break;
        }
        // todo: can return an empty list if all data is < startTs or > endTs
        return new FactScanResult(measureName, dimensionValues, timeValues);
      }
      scanner.close();
      return endOfData();
    }
  };
}
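Draining the scanner then follows the usual iterator pattern; a minimal sketch (FactScanResult is iterable over its TimeValues, which getTimeSeries below relies on):

while (scanner.hasNext()) {
  FactScanResult result = scanner.next();
  for (TimeValue timeValue : result) {
    // use result.getMeasureName(), result.getDimensionValues(),
    // timeValue.getTimestamp(), timeValue.getValue()
  }
}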
Use of co.cask.cdap.api.dataset.lib.cube.DimensionValue in project cdap by caskdata.
The class DefaultCube, method getTimeSeries.
private Table<Map<String, String>, String, Map<Long, Long>> getTimeSeries(CubeQuery query, FactScanner scanner) {
  // {dimension values, measure} -> {time -> value}s
  Table<Map<String, String>, String, Map<Long, Long>> result = HashBasedTable.create();
  int count = 0;
  while (scanner.hasNext()) {
    FactScanResult next = scanner.next();
    incrementMetric("cube.query.scan.records.count", 1);
    boolean skip = false;
    // use a tree map, as we are using it as a key for a map
    Map<String, String> seriesDimensions = Maps.newTreeMap();
    for (String dimensionName : query.getGroupByDimensions()) {
      // todo: use Map<String, String> instead of List<DimensionValue> everywhere
      for (DimensionValue dimensionValue : next.getDimensionValues()) {
        if (dimensionName.equals(dimensionValue.getName())) {
          if (dimensionValue.getValue() == null) {
            // Currently, we do NOT return null as a grouped-by value.
            // Depending on whether the dimension is required, records with a null value in it may or may not be
            // part of an aggregation. At this moment, choosing the aggregation for a query doesn't take this
            // into account, so nulls may or may not be included in results depending on which aggregation is
            // selected. We don't want to produce inconsistent results that vary with the selected aggregation,
            // so we don't return nulls in any of those cases.
            skip = true;
            continue;
          }
          seriesDimensions.put(dimensionName, dimensionValue.getValue());
          break;
        }
      }
    }
    if (skip) {
      incrementMetric("cube.query.scan.skipped.count", 1);
      continue;
    }
    for (TimeValue timeValue : next) {
      Map<Long, Long> timeValues = result.get(seriesDimensions, next.getMeasureName());
      if (timeValues == null) {
        result.put(seriesDimensions, next.getMeasureName(), Maps.<Long, Long>newHashMap());
      }
      AggregationFunction function = query.getMeasurements().get(next.getMeasureName());
      if (AggregationFunction.SUM == function) {
        Long value = result.get(seriesDimensions, next.getMeasureName()).get(timeValue.getTimestamp());
        value = value == null ? 0 : value;
        value += timeValue.getValue();
        result.get(seriesDimensions, next.getMeasureName()).put(timeValue.getTimestamp(), value);
      } else if (AggregationFunction.MAX == function) {
        Long value = result.get(seriesDimensions, next.getMeasureName()).get(timeValue.getTimestamp());
        value = value != null && value > timeValue.getValue() ? value : timeValue.getValue();
        result.get(seriesDimensions, next.getMeasureName()).put(timeValue.getTimestamp(), value);
      } else if (AggregationFunction.MIN == function) {
        Long value = result.get(seriesDimensions, next.getMeasureName()).get(timeValue.getTimestamp());
        value = value != null && value < timeValue.getValue() ? value : timeValue.getValue();
        result.get(seriesDimensions, next.getMeasureName()).put(timeValue.getTimestamp(), value);
      } else if (AggregationFunction.LATEST == function) {
        result.get(seriesDimensions, next.getMeasureName()).put(timeValue.getTimestamp(), timeValue.getValue());
      } else {
        // should never happen: developer error
        throw new RuntimeException("Unknown aggregation function: " + function);
      }
    }
    if (++count >= MAX_RECORDS_TO_SCAN) {
      break;
    }
  }
  return result;
}
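A hedged sketch of consuming the returned Guava Table: each row key is one group-by dimension combination, each column is a measure name, and each cell maps timestamp to the aggregated value:

for (Table.Cell<Map<String, String>, String, Map<Long, Long>> cell : result.cellSet()) {
  Map<String, String> groupByValues = cell.getRowKey(); // e.g. {app=purchaseApp}
  String measureName = cell.getColumnKey();             // e.g. "reads"
  Map<Long, Long> points = cell.getValue();             // timestamp -> aggregated value
}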