use of io.cdap.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class MetadataDataset method getMetadata.
/**
* Returns metadata for a given set of entities
*
* @param targetIds entities for which metadata is required
* @return set of {@link Metadata}, one per entity, containing that entity's properties and tags
*/
public Set<Metadata> getMetadata(Set<? extends NamespacedEntityId> targetIds) {
if (targetIds.isEmpty()) {
return Collections.emptySet();
}
List<ImmutablePair<byte[], byte[]>> fuzzyKeys = new ArrayList<>(targetIds.size());
for (NamespacedEntityId targetId : targetIds) {
fuzzyKeys.add(getFuzzyKeyFor(targetId));
}
// Sort fuzzy keys
Collections.sort(fuzzyKeys, FUZZY_KEY_COMPARATOR);
// Scan using fuzzy filter. Scan returns one row per property.
// Group the rows by namespacedId
Multimap<NamespacedEntityId, MetadataEntry> metadataMap = HashMultimap.create();
byte[] start = fuzzyKeys.get(0).getFirst();
byte[] end = Bytes.stopKeyForPrefix(fuzzyKeys.get(fuzzyKeys.size() - 1).getFirst());
try (Scanner scan = indexedTable.scan(new Scan(start, end, new FuzzyRowFilter(fuzzyKeys)))) {
Row next;
while ((next = scan.next()) != null) {
MetadataEntry metadataEntry = convertRow(next);
if (metadataEntry != null) {
metadataMap.put(metadataEntry.getTargetId(), metadataEntry);
}
}
}
// Create metadata objects for each entity from grouped rows
Set<Metadata> metadataSet = new HashSet<>();
for (Map.Entry<NamespacedEntityId, Collection<MetadataEntry>> entry : metadataMap.asMap().entrySet()) {
Map<String, String> properties = new HashMap<>();
Set<String> tags = Collections.emptySet();
for (MetadataEntry metadataEntry : entry.getValue()) {
if (TAGS_KEY.equals(metadataEntry.getKey())) {
tags = splitTags(metadataEntry.getValue());
} else {
properties.put(metadataEntry.getKey(), metadataEntry.getValue());
}
}
metadataSet.add(new Metadata(entry.getKey(), properties, tags));
}
return metadataSet;
}
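The getFuzzyKeyFor helper is not shown above. FuzzyRowFilter, like the HBase filter it mirrors, matches row keys against (key, mask) pairs: a zero mask byte pins the corresponding key byte, while a non-zero mask byte marks that position as fuzzy (any value matches). The following is a hypothetical sketch of how such a pair could be assembled for an entity prefix; the helper name, key layout and fixed row-key length are illustrative assumptions, not the actual MetadataDataset encoding.
// Hypothetical sketch only: pin the entity-prefix bytes of a fixed-width row key
// and mark the remaining bytes as fuzzy. ImmutablePair is the CDAP pair class used above.
private ImmutablePair<byte[], byte[]> fuzzyKeyForPrefix(byte[] entityPrefix, int rowKeyLength) {
  byte[] key = new byte[rowKeyLength];
  byte[] mask = new byte[rowKeyLength];
  System.arraycopy(entityPrefix, 0, key, 0, entityPrefix.length);
  java.util.Arrays.fill(mask, 0, entityPrefix.length, (byte) 0);            // 0 = byte must match exactly
  java.util.Arrays.fill(mask, entityPrefix.length, rowKeyLength, (byte) 1); // 1 = any byte matches here
  return new ImmutablePair<>(key, mask);
}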
use of io.cdap.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class LevelDBTable method scanPersisted.
@ReadOnly
@Override
protected Scanner scanPersisted(Scan scan) throws Exception {
FuzzyRowFilter filter = null;
if (scan.getFilter() != null) {
// todo: currently we support only FuzzyRowFilter as an experimental feature
if (scan.getFilter() instanceof FuzzyRowFilter) {
filter = (FuzzyRowFilter) scan.getFilter();
} else {
throw new DataSetException("Unknown filter type: " + scan.getFilter());
}
}
final Scanner scanner = core.scan(scan.getStartRow(), scan.getStopRow(), filter, null, tx);
return new Scanner() {
@Nullable
@Override
public Row next() {
return LevelDBTable.this.next(scanner);
}
@Override
public void close() {
scanner.close();
}
};
}
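For reference, here is a minimal caller-side sketch of issuing such a fuzzy scan through the Table API, mirroring the indexedTable.scan(...) usage in the MetadataDataset snippets; the table handle, key bytes and mask bytes are placeholder assumptions.
// Minimal sketch, assuming `table` is a Table and (fuzzyKey, fuzzyMask) is a prepared
// (row key, mask) pair; the Scanner is drained and closed via try-with-resources.
List<ImmutablePair<byte[], byte[]>> fuzzyKeys = new ArrayList<>();
fuzzyKeys.add(new ImmutablePair<>(fuzzyKey, fuzzyMask));
try (Scanner scanner = table.scan(new Scan(startRow, stopRow, new FuzzyRowFilter(fuzzyKeys)))) {
  Row row;
  while ((row = scanner.next()) != null) {
    // handle row.getRow() and row.getColumns() here
  }
}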
use of io.cdap.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class MetadataDataset method getMetadata.
/**
* Returns metadata for a given set of entities
*
* @param metadataEntitys entities for which metadata is required
* @return set of {@link Record}, one per entity, containing that entity's properties and tags
*/
public Set<Record> getMetadata(Set<? extends MetadataEntity> metadataEntitys) {
if (metadataEntitys.isEmpty()) {
return Collections.emptySet();
}
List<ImmutablePair<byte[], byte[]>> fuzzyKeys = new ArrayList<>(metadataEntitys.size());
for (MetadataEntity metadataEntity : metadataEntitys) {
fuzzyKeys.add(getFuzzyKeyFor(metadataEntity));
}
// Sort fuzzy keys
fuzzyKeys.sort(FUZZY_KEY_COMPARATOR);
// Scan using fuzzy filter. Scan returns one row per property.
// Group the rows by MetadataEntity
Multimap<MetadataEntity, MetadataEntry> metadataMap = HashMultimap.create();
byte[] start = fuzzyKeys.get(0).getFirst();
byte[] end = Bytes.stopKeyForPrefix(fuzzyKeys.get(fuzzyKeys.size() - 1).getFirst());
try (Scanner scan = indexedTable.scan(new Scan(start, end, new FuzzyRowFilter(fuzzyKeys)))) {
Row next;
while ((next = scan.next()) != null) {
MetadataEntry metadataEntry = convertRow(next);
if (metadataEntry != null) {
metadataMap.put(metadataEntry.getMetadataEntity(), metadataEntry);
}
}
}
// Create metadata objects for each entity from grouped rows
Set<Record> metadataSet = new HashSet<>();
for (Map.Entry<MetadataEntity, Collection<MetadataEntry>> entry : metadataMap.asMap().entrySet()) {
Map<String, String> properties = new HashMap<>();
Set<String> tags = Collections.emptySet();
for (MetadataEntry metadataEntry : entry.getValue()) {
if (MetadataConstants.TAGS_KEY.equals(metadataEntry.getKey())) {
tags = splitTags(metadataEntry.getValue());
} else {
properties.put(metadataEntry.getKey(), metadataEntry.getValue());
}
}
metadataSet.add(new Record(entry.getKey(), properties, tags));
}
return metadataSet;
}
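FUZZY_KEY_COMPARATOR is not shown in either snippet. Since the scan range is derived from the first and last sorted fuzzy keys, the comparator only needs to order the pairs by their row-key bytes; a plausible definition is sketched below as an assumption, not the actual CDAP source.
// Plausible sketch (not the actual CDAP definition): order fuzzy (key, mask) pairs
// lexicographically by the key bytes, so that after sorting, getFirst() of the first
// and last entries bound the scan range computed above.
private static final Comparator<ImmutablePair<byte[], byte[]>> FUZZY_KEY_COMPARATOR =
  (left, right) -> Bytes.compareTo(left.getFirst(), right.getFirst());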
use of io.cdap.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class FactTable method findSingleDimensionValue.
/**
* Searches for the first non-null dimension value in records that contain the given list of dimensions and match
* the given dimension values within the given time range. The returned dimension values are those that are not
* already defined in the given dimension values.
* @param allDimensionNames list of all dimension names to be present in the record
* @param dimensionSlice dimension values to filter by, {@code null} means any non-null value.
* @param startTs start of the time range, in seconds
* @param endTs end of the time range, in seconds
* @return {@link Set} of {@link DimensionValue}s
*/
// todo: pass a limit on number of dimensionValues returned
// todo: kinda not cool API when we expect null values in a map...
public Set<DimensionValue> findSingleDimensionValue(List<String> allDimensionNames, Map<String, String> dimensionSlice, long startTs, long endTs) {
// Algorithm, briefly:
// We scan the records that have the given allDimensionNames, using dimensionSlice as the scan criteria.
// If a record from the scan has non-null values in dimensions that are not specified in dimensionSlice,
// we use the first such dimension as a value to return.
// Once we find a value to return, since we only fill a single dimension, we are not interested in drilling down
// further; instead we attempt to fast-forward (jump) to a record that has a different value in that dimension.
// This way we find all results.
List<DimensionValue> allDimensions = Lists.newArrayList();
List<Integer> dimToFillIndexes = Lists.newArrayList();
for (int i = 0; i < allDimensionNames.size(); i++) {
String dimensionName = allDimensionNames.get(i);
if (!dimensionSlice.containsKey(dimensionName)) {
dimToFillIndexes.add(i);
allDimensions.add(new DimensionValue(dimensionName, null));
} else {
DimensionValue dimensionValue = new DimensionValue(dimensionName, dimensionSlice.get(dimensionName));
allDimensions.add(dimensionValue);
}
}
// If provided dimensions contain all values filled in, there's nothing to look for
if (dimToFillIndexes.isEmpty()) {
return Collections.emptySet();
}
Set<DimensionValue> result = Sets.newHashSet();
int scans = 0;
int scannedRecords = 0;
// build a scan
byte[] startRow = codec.createStartRowKey(allDimensions, null, startTs, false);
byte[] endRow = codec.createEndRowKey(allDimensions, null, endTs, false);
endRow = Bytes.stopKeyForPrefix(endRow);
FuzzyRowFilter fuzzyRowFilter = createFuzzyRowFilter(new FactScan(startTs, endTs, Collections.emptyList(), allDimensions), startRow);
Scanner scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
scans++;
try {
Row rowResult;
while ((rowResult = scanner.next()) != null) {
scannedRecords++;
// todo: make configurable
if (scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
break;
}
byte[] rowKey = rowResult.getRow();
// filter out columns by time range (scan configuration only filters whole rows)
if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) < startTs) {
continue;
}
if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
// we're done with scanner
break;
}
List<DimensionValue> dimensionValues = codec.getDimensionValues(rowResult.getRow());
// At this point, we know that the record is in the right time range and its dimensions match the given ones.
// We try to find the first non-null dimension value in the record that was not among the given dimensions: we
// use it to form the next drill-down suggestion.
int filledIndex = -1;
for (int index : dimToFillIndexes) {
// todo: this may not be efficient if dimensionValues is not an array-backed list, i.e. if access by index is
// not fast
DimensionValue dimensionValue = dimensionValues.get(index);
if (dimensionValue.getValue() != null) {
result.add(dimensionValue);
filledIndex = index;
break;
}
}
// todo: fast-forwarding (jumping) should be done on server-side (CDAP-1421)
if (filledIndex >= 0) {
scanner.close();
scanner = null;
scans++;
if (scans > MAX_SCANS_DURING_SEARCH) {
break;
}
startRow = codec.getNextRowKey(rowResult.getRow(), filledIndex);
scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
}
}
} finally {
if (scanner != null) {
scanner.close();
}
}
LOG.trace("search for dimensions completed, scans performed: {}, scanned records: {}", scans, scannedRecords);
return result;
}
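The getNextRowKey call above is what implements the fast-forward described in the comments: once a dimension value has been reported, the next scan should skip every remaining row that shares that value. The sketch below illustrates the idea using Bytes.stopKeyForPrefix; the prefix-length computation is codec-specific and assumed here, so this is not the actual FactTable/codec implementation.
// Hypothetical illustration of the fast-forward (jump): take the row-key prefix up to
// and including the dimension value that was just reported, and return the smallest key
// that sorts after every key sharing that prefix, so the next scan starts beyond them.
private byte[] nextRowKeyAfterPrefix(byte[] currentRowKey, int prefixLength) {
  byte[] prefix = java.util.Arrays.copyOf(currentRowKey, prefixLength);
  return Bytes.stopKeyForPrefix(prefix);
}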