use of co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class FactTable method getScanner.
/**
 * Creates a scanner over the time series table for the given {@link FactScan}.
 *
 * <p>The start row key is built from the scan's dimension values, its start timestamp and the first measure
 * name in the order returned by {@code getSortedMeasures}; the end row key is built from the same dimension
 * values, the end timestamp and the last measure name in that order. When the scan names no measures,
 * {@code null} is used as the measure name for both keys and the fuzzy row filter is derived from the start
 * row instead of from the measure names.
 *
 * @param scan the scan describing dimension values, measure names and time range
 * @return scanner over the rows between the computed start and stop keys that pass the fuzzy row filter
 */
private Scanner getScanner(FactScan scan) {
  // sort the measures based on their entity ids and based on that get the start and end row key metric names
  List<String> measureNames = getSortedMeasures(scan.getMeasureNames());
  String firstMeasure = measureNames.isEmpty() ? null : measureNames.get(0);
  String lastMeasure = measureNames.isEmpty() ? null : measureNames.get(measureNames.size() - 1);

  byte[] startRow = codec.createStartRowKey(scan.getDimensionValues(), firstMeasure, scan.getStartTs(), false);
  byte[] endRow = codec.createEndRowKey(scan.getDimensionValues(), lastMeasure, scan.getEndTs(), false);

  // The per-timebase column subset that used to be computed here was never handed to the table scan,
  // so it has been dropped: the scan below only takes row bounds and the fuzzy row filter.

  // turn the last matching row key prefix into the stop key for the scan
  endRow = Bytes.stopKeyForPrefix(endRow);

  FuzzyRowFilter fuzzyRowFilter = measureNames.isEmpty()
    ? createFuzzyRowFilter(scan, startRow)
    : createFuzzyRowFilter(scan, measureNames);

  // guard keeps the pretty-printing of the row keys off the hot path when tracing is disabled
  if (LOG.isTraceEnabled()) {
    LOG.trace("Scanning fact table {} with scan: {}; constructed startRow: {}, endRow: {}, fuzzyRowFilter: {}", timeSeriesTable, scan, toPrettyLog(startRow), toPrettyLog(endRow), fuzzyRowFilter);
  }
  return timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
}
use of co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class FactTable method findMeasureNames.
/**
 * Collects the measure names of all facts that match the given {@link DimensionValue}s within a time range.
 *
 * <p>At most {@code MAX_RECORDS_TO_SCAN_DURING_SEARCH} rows are inspected; once that many rows have been
 * scanned the search stops and whatever was collected so far is returned.
 *
 * @param allDimensionNames list of all dimension names to be present in the fact record
 * @param dimensionSlice dimension values to filter by, {@code null} means any non-null value.
 * @param startTs start timestamp, in sec
 * @param endTs end timestamp, in sec
 * @return {@link Set} of measure names
 */
// todo: pass a limit on number of measures returned
public Set<String> findMeasureNames(List<String> allDimensionNames, Map<String, String> dimensionSlice, long startTs, long endTs) {
  // one DimensionValue per requested dimension, valued from the slice (null when the slice has no entry)
  List<DimensionValue> dimensions = Lists.newArrayList();
  for (String name : allDimensionNames) {
    String sliceValue = dimensionSlice.get(name);
    dimensions.add(new DimensionValue(name, sliceValue));
  }

  // row bounds and fuzzy filter for the scan; measure name is left open (null)
  byte[] scanStart = codec.createStartRowKey(dimensions, null, startTs, false);
  byte[] lastRowPrefix = codec.createEndRowKey(dimensions, null, endTs, false);
  byte[] scanStop = Bytes.stopKeyForPrefix(lastRowPrefix);
  FactScan factScan = new FactScan(startTs, endTs, ImmutableList.<String>of(), dimensions);
  FuzzyRowFilter rowFilter = createFuzzyRowFilter(factScan, scanStart);

  Set<String> found = Sets.newHashSet();
  int scannedRecords = 0;
  try (Scanner scanner = timeSeriesTable.scan(scanStart, scanStop, rowFilter)) {
    for (Row row = scanner.next(); row != null; row = scanner.next()) {
      if (++scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
        break;
      }
      byte[] rowKey = row.getRow();
      // filter out columns by time range (scan configuration only filters whole rows)
      long tsAtStartColumn = codec.getTimestamp(rowKey, codec.createColumn(startTs));
      if (tsAtStartColumn < startTs) {
        continue;
      }
      long tsAtEndColumn = codec.getTimestamp(rowKey, codec.createColumn(endTs));
      if (tsAtEndColumn > endTs) {
        // past the requested range: nothing more to read from this scanner
        break;
      }
      found.add(codec.getMeasureName(row.getRow()));
    }
  }
  LOG.trace("search for measures completed, scanned records: {}", scannedRecords);
  return found;
}
use of co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class FactTable method findSingleDimensionValue.
/**
 * Searches for first non-null valued dimensions in records that contain given list of dimensions and match given
 * dimension values in given time range. Returned dimension values are those that are not defined in given
 * dimension values.
 *
 * <p>The search is bounded: it stops after {@code MAX_RECORDS_TO_SCAN_DURING_SEARCH} scanned rows or
 * {@code MAX_SCANS_DURING_SEARCH} scans, returning whatever has been found up to that point.
 *
 * @param allDimensionNames list of all dimension names to be present in the record
 * @param dimensionSlice dimension values to filter by, {@code null} means any non-null value.
 * @param startTs start of the time range, in seconds
 * @param endTs end of the time range, in seconds
 * @return {@link Set} of {@link DimensionValue}s; empty if every dimension name is already present in
 *         {@code dimensionSlice}
 */
// todo: pass a limit on number of dimensionValues returned
// todo: kinda not cool API when we expect null values in a map...
public Set<DimensionValue> findSingleDimensionValue(List<String> allDimensionNames, Map<String, String> dimensionSlice, long startTs, long endTs) {
  // Algorithm, briefly:
  // We scan in the records which have given allDimensionNames. We use dimensionSlice as a criteria for scan.
  // If record from the scan has non-null values in the dimensions which are not specified in dimensionSlice,
  // we use first of such dimension as a value to return.
  // When we find value to return, since we only fill a single dimension, we are not interested in drilling down
  // further and instead attempt to fast-forward (jump) to a record that has different value in that dimension.
  // Thus we find all results.
  List<DimensionValue> allDimensions = Lists.newArrayList();
  // positions (within allDimensionNames) of the dimensions the caller did not constrain
  List<Integer> dimToFillIndexes = Lists.newArrayList();
  for (int i = 0; i < allDimensionNames.size(); i++) {
    String dimensionName = allDimensionNames.get(i);
    if (!dimensionSlice.containsKey(dimensionName)) {
      dimToFillIndexes.add(i);
      allDimensions.add(new DimensionValue(dimensionName, null));
    } else {
      allDimensions.add(new DimensionValue(dimensionName, dimensionSlice.get(dimensionName)));
    }
  }
  // If provided dimensions contain all values filled in, there's nothing to look for
  if (dimToFillIndexes.isEmpty()) {
    return Collections.emptySet();
  }

  Set<DimensionValue> result = Sets.newHashSet();
  int scans = 0;
  int scannedRecords = 0;

  // build a scan
  byte[] startRow = codec.createStartRowKey(allDimensions, null, startTs, false);
  byte[] endRow = codec.createEndRowKey(allDimensions, null, endTs, false);
  endRow = Bytes.stopKeyForPrefix(endRow);
  FuzzyRowFilter fuzzyRowFilter = createFuzzyRowFilter(new FactScan(startTs, endTs, ImmutableList.<String>of(), allDimensions), startRow);

  // The scanner is replaced inside the loop when we jump ahead, so it cannot be a try-with-resources
  // variable; the finally block closes whichever scanner is current when the loop ends.
  Scanner scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
  scans++;
  try {
    Row rowResult;
    while ((rowResult = scanner.next()) != null) {
      scannedRecords++;
      // todo: make configurable
      if (scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
        break;
      }
      byte[] rowKey = rowResult.getRow();
      // filter out columns by time range (scan configuration only filters whole rows)
      if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) < startTs) {
        continue;
      }
      if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
        // we're done with scanner
        break;
      }
      List<DimensionValue> dimensionValues = codec.getDimensionValues(rowKey);
      // At this point, we know that the record is in right time range and its dimensions matches given.
      // We try find first non-null valued dimension in the record that was not in given dimensions: we use it to form
      // next drill down suggestion
      int filledIndex = -1;
      for (int index : dimToFillIndexes) {
        // todo: it may be not efficient, if dimensionValues is not array-backed list: i.e. if access by index is
        //       not fast
        DimensionValue dimensionValue = dimensionValues.get(index);
        if (dimensionValue.getValue() != null) {
          result.add(dimensionValue);
          filledIndex = index;
          break;
        }
      }
      // todo: fast-forwarding (jumping) should be done on server-side (CDAP-1421)
      if (filledIndex >= 0) {
        // close the current scanner and null it out first, so the finally block does not close it a
        // second time if we bail out on the scan limit or if opening the next scanner fails
        scanner.close();
        scanner = null;
        scans++;
        if (scans > MAX_SCANS_DURING_SEARCH) {
          break;
        }
        // restart the scan from the next row key computed by the codec for the dimension we just filled
        startRow = codec.getNextRowKey(rowKey, filledIndex);
        scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
      }
    }
  } finally {
    if (scanner != null) {
      scanner.close();
    }
  }
  LOG.trace("search for dimensions completed, scans performed: {}, scanned records: {}", scans, scannedRecords);
  return result;
}
use of co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class HBaseTable method setFilterIfNeeded.
/**
 * Applies the given filter to the scan being built, if a filter was supplied.
 *
 * <p>Only {@link FuzzyRowFilter} is handled: its key/mask pairs are copied into HBase {@code Pair}s and
 * wrapped in an {@code org.apache.hadoop.hbase.filter.FuzzyRowFilter}.
 *
 * @param scan the scan builder to set the filter on
 * @param filter the filter to apply; {@code null} leaves the scan untouched
 * @throws IllegalArgumentException if the filter is non-null and is not a {@link FuzzyRowFilter}
 */
private void setFilterIfNeeded(ScanBuilder scan, @Nullable Filter filter) {
  if (filter == null) {
    return;
  }
  if (!(filter instanceof FuzzyRowFilter)) {
    throw new IllegalArgumentException("Unsupported filter: " + filter);
  }

  FuzzyRowFilter sourceFilter = (FuzzyRowFilter) filter;
  List<Pair<byte[], byte[]>> hbasePairs = Lists.newArrayListWithExpectedSize(sourceFilter.getFuzzyKeysData().size());
  for (ImmutablePair<byte[], byte[]> entry : sourceFilter.getFuzzyKeysData()) {
    byte[] first = entry.getFirst();
    byte[] second = entry.getSecond();
    hbasePairs.add(Pair.newPair(first, second));
  }
  scan.setFilter(new org.apache.hadoop.hbase.filter.FuzzyRowFilter(hbasePairs));
}
use of co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class FactCodecTest method test.
/**
 * Round-trips row keys through {@code FactCodec} (create row key, then read back timestamp, dimension values
 * and measure name) for several dimension/measure combinations, and then checks that a {@code FuzzyRowFilter}
 * built from the codec's fuzzy row mask includes or rejects row keys as expected.
 */
@Test
public void test() {
InMemoryTableService.create("FactCodecTest");
MetricsTable table = new InMemoryMetricsTable("FactCodecTest");
int resolution = 10;
int rollTimebaseInterval = 2;
FactCodec codec = new FactCodec(new EntityTable(table), resolution, rollTimebaseInterval);
// testing encoding with multiple dimensions
List<DimensionValue> dimensionValues = ImmutableList.of(new DimensionValue("dimension1", "value1"), new DimensionValue("dimension2", "value2"), new DimensionValue("dimension3", "value3"));
// note: we use seconds everywhere and rely on this
long ts = 1422312915;
byte[] rowKey = codec.createRowKey(dimensionValues, "myMetric", ts);
// the same column (built from ts) is reused for every timestamp assertion below
byte[] column = codec.createColumn(ts);
// the decoded timestamp is expected to be ts rounded down to a multiple of the resolution
Assert.assertEquals((ts / resolution) * resolution, codec.getTimestamp(rowKey, column));
Assert.assertEquals(dimensionValues, codec.getDimensionValues(rowKey));
Assert.assertEquals("myMetric", codec.getMeasureName(rowKey));
// testing encoding with a single dimension
dimensionValues = ImmutableList.of(new DimensionValue("myTag", "myValue"));
rowKey = codec.createRowKey(dimensionValues, "mySingleTagMetric", ts);
Assert.assertEquals((ts / resolution) * resolution, codec.getTimestamp(rowKey, column));
Assert.assertEquals(dimensionValues, codec.getDimensionValues(rowKey));
Assert.assertEquals("mySingleTagMetric", codec.getMeasureName(rowKey));
// testing encoding with no dimensions at all (empty list)
rowKey = codec.createRowKey(new ArrayList<DimensionValue>(), "myNoTagsMetric", ts);
Assert.assertEquals((ts / resolution) * resolution, codec.getTimestamp(rowKey, column));
Assert.assertEquals(new ArrayList<DimensionValue>(), codec.getDimensionValues(rowKey));
Assert.assertEquals("myNoTagsMetric", codec.getMeasureName(rowKey));
// testing null metric
// NOTE(review): despite the heading, a non-null measure name is passed here, so this section repeats the
// single-dimension case above rather than exercising a null measure - TODO confirm what was intended
dimensionValues = ImmutableList.of(new DimensionValue("myTag", "myValue"));
rowKey = codec.createRowKey(dimensionValues, "mySingleTagMetric", ts);
Assert.assertEquals((ts / resolution) * resolution, codec.getTimestamp(rowKey, column));
Assert.assertEquals(dimensionValues, codec.getDimensionValues(rowKey));
Assert.assertEquals("mySingleTagMetric", codec.getMeasureName(rowKey));
// testing a null dimension value (dimension2) between two non-null ones
dimensionValues = ImmutableList.of(new DimensionValue("dimension1", "value1"), new DimensionValue("dimension2", null), new DimensionValue("dimension3", "value3"));
rowKey = codec.createRowKey(dimensionValues, "myNullTagMetric", ts);
Assert.assertEquals((ts / resolution) * resolution, codec.getTimestamp(rowKey, column));
Assert.assertEquals(dimensionValues, codec.getDimensionValues(rowKey));
Assert.assertEquals("myNullTagMetric", codec.getMeasureName(rowKey));
// testing fuzzy mask for fuzzy stuff in row key
dimensionValues = ImmutableList.of(new DimensionValue("dimension1", "value1"),
// null value for dimension2: any value is accepted in that position
new DimensionValue("dimension2", null), new DimensionValue("dimension3", "value3"));
byte[] mask = codec.createFuzzyRowMask(dimensionValues, "myMetric");
rowKey = codec.createRowKey(dimensionValues, "myMetric", ts);
FuzzyRowFilter filter = new FuzzyRowFilter(ImmutableList.of(new ImmutablePair<>(rowKey, mask)));
// an arbitrary value in the fuzzy dimension2 position must be included
dimensionValues = ImmutableList.of(new DimensionValue("dimension1", "value1"), new DimensionValue("dimension2", "annnnnnnnnny"), new DimensionValue("dimension3", "value3"));
rowKey = codec.createRowKey(dimensionValues, "myMetric", ts);
Assert.assertEquals(FuzzyRowFilter.ReturnCode.INCLUDE, filter.filterRow(rowKey));
// a different value for the fixed dimension1 must not be included
dimensionValues = ImmutableList.of(new DimensionValue("dimension1", "value12"), new DimensionValue("dimension2", "value2"), new DimensionValue("dimension3", "value3"));
rowKey = codec.createRowKey(dimensionValues, "myMetric", ts);
Assert.assertTrue(FuzzyRowFilter.ReturnCode.INCLUDE != filter.filterRow(rowKey));
// a different value for the fixed dimension3 must not be included
dimensionValues = ImmutableList.of(new DimensionValue("dimension1", "value1"), new DimensionValue("dimension2", "value2"), new DimensionValue("dimension3", "value13"));
rowKey = codec.createRowKey(dimensionValues, "myMetric", ts);
Assert.assertTrue(FuzzyRowFilter.ReturnCode.INCLUDE != filter.filterRow(rowKey));
// a row key that lacks dimension2 altogether must not be included
dimensionValues = ImmutableList.of(new DimensionValue("dimension1", "value1"), new DimensionValue("dimension3", "value3"));
rowKey = codec.createRowKey(dimensionValues, "myMetric", ts);
Assert.assertTrue(FuzzyRowFilter.ReturnCode.INCLUDE != filter.filterRow(rowKey));
// fuzzy in value should match the "null" value
dimensionValues = ImmutableList.of(new DimensionValue("dimension1", "value1"), new DimensionValue("dimension2", null), new DimensionValue("dimension3", "value3"));
rowKey = codec.createRowKey(dimensionValues, "myMetric", ts);
Assert.assertEquals(FuzzyRowFilter.ReturnCode.INCLUDE, filter.filterRow(rowKey));
// matching dimensions but a different measure name must not be included
dimensionValues = ImmutableList.of(new DimensionValue("dimension1", "value1"), new DimensionValue("dimension2", "value2"), new DimensionValue("dimension3", "value3"));
rowKey = codec.createRowKey(dimensionValues, "myMetric2", ts);
Assert.assertTrue(FuzzyRowFilter.ReturnCode.INCLUDE != filter.filterRow(rowKey));
// matching dimensions but a null measure name must not be included
rowKey = codec.createRowKey(dimensionValues, null, ts);
Assert.assertTrue(FuzzyRowFilter.ReturnCode.INCLUDE != filter.filterRow(rowKey));
// matching measure name but no dimensions must not be included
rowKey = codec.createRowKey(new ArrayList<DimensionValue>(), "myMetric", ts);
Assert.assertTrue(FuzzyRowFilter.ReturnCode.INCLUDE != filter.filterRow(rowKey));
// testing fuzzy mask for fuzzy metric
dimensionValues = ImmutableList.of(new DimensionValue("myTag", "myValue"));
rowKey = codec.createRowKey(dimensionValues, null, ts);
mask = codec.createFuzzyRowMask(dimensionValues, null);
filter = new FuzzyRowFilter(ImmutableList.of(new ImmutablePair<>(rowKey, mask)));
// with a null measure in the mask, any measure name is expected to be included
rowKey = codec.createRowKey(dimensionValues, "annyyy", ts);
Assert.assertEquals(FuzzyRowFilter.ReturnCode.INCLUDE, filter.filterRow(rowKey));
rowKey = codec.createRowKey(dimensionValues, "zzzzzzzzzzzz", ts);
Assert.assertEquals(FuzzyRowFilter.ReturnCode.INCLUDE, filter.filterRow(rowKey));
// a different dimension value must still be rejected, whatever the measure name
dimensionValues = ImmutableList.of(new DimensionValue("myTag", "myValue2"));
rowKey = codec.createRowKey(dimensionValues, "metric", ts);
Assert.assertTrue(FuzzyRowFilter.ReturnCode.INCLUDE != filter.filterRow(rowKey));
// todo: test prefix of multi dimension valued row key is not same one dimension valued row key
// todo: test that rollTimebaseInterval applies well
}
Aggregations