use of io.cdap.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class MetadataDataset method getMetadata.
/**
* Returns metadata for a given set of entities
*
* @param targetIds entities for which metadata is required
* @return set of {@link Metadata}, one per entity, containing that entity's properties and tags
*/
public Set<Metadata> getMetadata(Set<? extends NamespacedEntityId> targetIds) {
if (targetIds.isEmpty()) {
return Collections.emptySet();
}
List<ImmutablePair<byte[], byte[]>> fuzzyKeys = new ArrayList<>(targetIds.size());
for (NamespacedEntityId targetId : targetIds) {
fuzzyKeys.add(getFuzzyKeyFor(targetId));
}
// Sort fuzzy keys
Collections.sort(fuzzyKeys, FUZZY_KEY_COMPARATOR);
// Scan using fuzzy filter. Scan returns one row per property.
// Group the rows by namespacedId
Multimap<NamespacedEntityId, MetadataEntry> metadataMap = HashMultimap.create();
byte[] start = fuzzyKeys.get(0).getFirst();
byte[] end = Bytes.stopKeyForPrefix(fuzzyKeys.get(fuzzyKeys.size() - 1).getFirst());
try (Scanner scan = indexedTable.scan(new Scan(start, end, new FuzzyRowFilter(fuzzyKeys)))) {
Row next;
while ((next = scan.next()) != null) {
MetadataEntry metadataEntry = convertRow(next);
if (metadataEntry != null) {
metadataMap.put(metadataEntry.getTargetId(), metadataEntry);
}
}
}
// Create metadata objects for each entity from grouped rows
Set<Metadata> metadataSet = new HashSet<>();
for (Map.Entry<NamespacedEntityId, Collection<MetadataEntry>> entry : metadataMap.asMap().entrySet()) {
Map<String, String> properties = new HashMap<>();
Set<String> tags = Collections.emptySet();
for (MetadataEntry metadataEntry : entry.getValue()) {
if (TAGS_KEY.equals(metadataEntry.getKey())) {
tags = splitTags(metadataEntry.getValue());
} else {
properties.put(metadataEntry.getKey(), metadataEntry.getValue());
}
}
metadataSet.add(new Metadata(entry.getKey(), properties, tags));
}
return metadataSet;
}
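The getFuzzyKeyFor helper is not shown above. FuzzyRowFilter, like the HBase filter it mirrors, matches row keys against (key, mask) pairs: a zero mask byte pins the corresponding key byte, while a non-zero mask byte marks that position as fuzzy (any value matches). The following is a hypothetical sketch of how such a pair could be assembled for an entity prefix; the helper name, key layout and fixed row-key length are illustrative assumptions, not the actual MetadataDataset encoding.
// Hypothetical sketch only: pin the entity-prefix bytes of a fixed-width row key
// and mark the remaining bytes as fuzzy. ImmutablePair is the CDAP pair class used above.
private ImmutablePair<byte[], byte[]> fuzzyKeyForPrefix(byte[] entityPrefix, int rowKeyLength) {
  byte[] key = new byte[rowKeyLength];
  byte[] mask = new byte[rowKeyLength];
  System.arraycopy(entityPrefix, 0, key, 0, entityPrefix.length);
  java.util.Arrays.fill(mask, 0, entityPrefix.length, (byte) 0);            // 0 = byte must match exactly
  java.util.Arrays.fill(mask, entityPrefix.length, rowKeyLength, (byte) 1); // 1 = any byte matches here
  return new ImmutablePair<>(key, mask);
}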
use of io.cdap.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class LevelDBTable method scanPersisted.
@ReadOnly
@Override
protected Scanner scanPersisted(Scan scan) throws Exception {
FuzzyRowFilter filter = null;
if (scan.getFilter() != null) {
// todo: currently we support only FuzzyRowFilter as an experimental feature
if (scan.getFilter() instanceof FuzzyRowFilter) {
filter = (FuzzyRowFilter) scan.getFilter();
} else {
throw new DataSetException("Unknown filter type: " + scan.getFilter());
}
}
final Scanner scanner = core.scan(scan.getStartRow(), scan.getStopRow(), filter, null, tx);
return new Scanner() {
@Nullable
@Override
public Row next() {
return LevelDBTable.this.next(scanner);
}
@Override
public void close() {
scanner.close();
}
};
}
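For reference, here is a minimal caller-side sketch of issuing such a fuzzy scan through the Table API, mirroring the indexedTable.scan(...) usage in the MetadataDataset snippets; the table handle, key bytes and mask bytes are placeholder assumptions.
// Minimal sketch, assuming `table` is a Table and (fuzzyKey, fuzzyMask) is a prepared
// (row key, mask) pair; the Scanner is drained and closed via try-with-resources.
List<ImmutablePair<byte[], byte[]>> fuzzyKeys = new ArrayList<>();
fuzzyKeys.add(new ImmutablePair<>(fuzzyKey, fuzzyMask));
try (Scanner scanner = table.scan(new Scan(startRow, stopRow, new FuzzyRowFilter(fuzzyKeys)))) {
  Row row;
  while ((row = scanner.next()) != null) {
    // handle row.getRow() and row.getColumns() here
  }
}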
use of io.cdap.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class MetadataDataset method getMetadata.
/**
* Returns metadata for a given set of entities
*
* @param metadataEntitys entities for which metadata is required
* @return set of {@link Record}, one per entity, containing that entity's properties and tags
*/
public Set<Record> getMetadata(Set<? extends MetadataEntity> metadataEntitys) {
if (metadataEntitys.isEmpty()) {
return Collections.emptySet();
}
List<ImmutablePair<byte[], byte[]>> fuzzyKeys = new ArrayList<>(metadataEntitys.size());
for (MetadataEntity metadataEntity : metadataEntitys) {
fuzzyKeys.add(getFuzzyKeyFor(metadataEntity));
}
// Sort fuzzy keys
fuzzyKeys.sort(FUZZY_KEY_COMPARATOR);
// Scan using fuzzy filter. Scan returns one row per property.
// Group the rows by MetadataEntity
Multimap<MetadataEntity, MetadataEntry> metadataMap = HashMultimap.create();
byte[] start = fuzzyKeys.get(0).getFirst();
byte[] end = Bytes.stopKeyForPrefix(fuzzyKeys.get(fuzzyKeys.size() - 1).getFirst());
try (Scanner scan = indexedTable.scan(new Scan(start, end, new FuzzyRowFilter(fuzzyKeys)))) {
Row next;
while ((next = scan.next()) != null) {
MetadataEntry metadataEntry = convertRow(next);
if (metadataEntry != null) {
metadataMap.put(metadataEntry.getMetadataEntity(), metadataEntry);
}
}
}
// Create metadata objects for each entity from grouped rows
Set<Record> metadataSet = new HashSet<>();
for (Map.Entry<MetadataEntity, Collection<MetadataEntry>> entry : metadataMap.asMap().entrySet()) {
Map<String, String> properties = new HashMap<>();
Set<String> tags = Collections.emptySet();
for (MetadataEntry metadataEntry : entry.getValue()) {
if (MetadataConstants.TAGS_KEY.equals(metadataEntry.getKey())) {
tags = splitTags(metadataEntry.getValue());
} else {
properties.put(metadataEntry.getKey(), metadataEntry.getValue());
}
}
metadataSet.add(new Record(entry.getKey(), properties, tags));
}
return metadataSet;
}
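FUZZY_KEY_COMPARATOR is not shown in either snippet. Since the scan range is derived from the first and last sorted fuzzy keys, the comparator only needs to order the pairs by their row-key bytes; a plausible definition is sketched below as an assumption, not the actual CDAP source.
// Plausible sketch (not the actual CDAP definition): order fuzzy (key, mask) pairs
// lexicographically by the key bytes, so that after sorting, getFirst() of the first
// and last entries bound the scan range computed above.
private static final Comparator<ImmutablePair<byte[], byte[]>> FUZZY_KEY_COMPARATOR =
  (left, right) -> Bytes.compareTo(left.getFirst(), right.getFirst());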
use of io.cdap.cdap.data2.dataset2.lib.table.FuzzyRowFilter in project cdap by caskdata.
the class FactTable method findSingleDimensionValue.
/**
* Searches for the first non-null dimension value in records that contain the given list of dimensions and match
* the given dimension values within the given time range. The returned dimension values are those that are not
* already defined in the given dimension values.
* @param allDimensionNames list of all dimension names to be present in the record
* @param dimensionSlice dimension values to filter by, {@code null} means any non-null value.
* @param startTs start of the time range, in seconds
* @param endTs end of the time range, in seconds
* @return {@link Set} of {@link DimensionValue}s
*/
// todo: pass a limit on number of dimensionValues returned
// todo: kinda not cool API when we expect null values in a map...
public Set<DimensionValue> findSingleDimensionValue(List<String> allDimensionNames, Map<String, String> dimensionSlice, long startTs, long endTs) {
// Algorithm, briefly:
// We scan the records that have the given allDimensionNames, using dimensionSlice as the scan criteria.
// If a record from the scan has non-null values in dimensions that are not specified in dimensionSlice,
// we use the first such dimension as a value to return.
// Once we find a value to return, since we only fill a single dimension, we are not interested in drilling down
// further; instead we attempt to fast-forward (jump) to a record that has a different value in that dimension.
// This way we find all results.
List<DimensionValue> allDimensions = Lists.newArrayList();
List<Integer> dimToFillIndexes = Lists.newArrayList();
for (int i = 0; i < allDimensionNames.size(); i++) {
String dimensionName = allDimensionNames.get(i);
if (!dimensionSlice.containsKey(dimensionName)) {
dimToFillIndexes.add(i);
allDimensions.add(new DimensionValue(dimensionName, null));
} else {
DimensionValue dimensionValue = new DimensionValue(dimensionName, dimensionSlice.get(dimensionName));
allDimensions.add(dimensionValue);
}
}
// If provided dimensions contain all values filled in, there's nothing to look for
if (dimToFillIndexes.isEmpty()) {
return Collections.emptySet();
}
Set<DimensionValue> result = Sets.newHashSet();
int scans = 0;
int scannedRecords = 0;
// build a scan
byte[] startRow = codec.createStartRowKey(allDimensions, null, startTs, false);
byte[] endRow = codec.createEndRowKey(allDimensions, null, endTs, false);
endRow = Bytes.stopKeyForPrefix(endRow);
FuzzyRowFilter fuzzyRowFilter = createFuzzyRowFilter(new FactScan(startTs, endTs, Collections.emptyList(), allDimensions), startRow);
Scanner scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
scans++;
try {
Row rowResult;
while ((rowResult = scanner.next()) != null) {
scannedRecords++;
// todo: make configurable
if (scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
break;
}
byte[] rowKey = rowResult.getRow();
// filter out columns by time range (scan configuration only filters whole rows)
if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) < startTs) {
continue;
}
if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
// we're done with scanner
break;
}
List<DimensionValue> dimensionValues = codec.getDimensionValues(rowResult.getRow());
// At this point, we know that the record is in the right time range and its dimensions match the given ones.
// We try to find the first non-null dimension value in the record that was not among the given dimensions: we
// use it to form the next drill-down suggestion.
int filledIndex = -1;
for (int index : dimToFillIndexes) {
// todo: this may not be efficient if dimensionValues is not an array-backed list, i.e. if access by index is
// not fast
DimensionValue dimensionValue = dimensionValues.get(index);
if (dimensionValue.getValue() != null) {
result.add(dimensionValue);
filledIndex = index;
break;
}
}
// todo: fast-forwarding (jumping) should be done on server-side (CDAP-1421)
if (filledIndex >= 0) {
scanner.close();
scanner = null;
scans++;
if (scans > MAX_SCANS_DURING_SEARCH) {
break;
}
startRow = codec.getNextRowKey(rowResult.getRow(), filledIndex);
scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
}
}
} finally {
if (scanner != null) {
scanner.close();
}
}
LOG.trace("search for dimensions completed, scans performed: {}, scanned records: {}", scans, scannedRecords);
return result;
}
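The getNextRowKey call above is what implements the fast-forward described in the comments: once a dimension value has been reported, the next scan should skip every remaining row that shares that value. The sketch below illustrates the idea using Bytes.stopKeyForPrefix; the prefix-length computation is codec-specific and assumed here, so this is not the actual FactTable/codec implementation.
// Hypothetical illustration of the fast-forward (jump): take the row-key prefix up to
// and including the dimension value that was just reported, and return the smallest key
// that sorts after every key sharing that prefix, so the next scan starts beyond them.
private byte[] nextRowKeyAfterPrefix(byte[] currentRowKey, int prefixLength) {
  byte[] prefix = java.util.Arrays.copyOf(currentRowKey, prefixLength);
  return Bytes.stopKeyForPrefix(prefix);
}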