Search in sources :

Example 21 with Row

use of co.cask.cdap.api.dataset.table.Row in project cdap by caskdata.

the class FactTable method findMeasureNames.

/**
 * Collects the names of the measures found in fact records that match the given {@link DimensionValue}s
 * within the given time range.
 * <p>
 * Scanning stops once more than {@code MAX_RECORDS_TO_SCAN_DURING_SEARCH} records have been read, so the
 * returned set may be incomplete.
 *
 * @param allDimensionNames names of all dimensions that the fact record must contain
 * @param dimensionSlice dimension values to filter by, {@code null} means any non-null value
 * @param startTs start timestamp, in sec
 * @param endTs end timestamp, in sec
 * @return {@link Set} of measure names
 */
// todo: pass a limit on number of measures returned
public Set<String> findMeasureNames(List<String> allDimensionNames, Map<String, String> dimensionSlice, long startTs, long endTs) {
    // Pair every requested dimension name with whatever value the slice holds for it (possibly null).
    List<DimensionValue> dimensions = Lists.newArrayList();
    for (String name : allDimensionNames) {
        String sliceValue = dimensionSlice.get(name);
        dimensions.add(new DimensionValue(name, sliceValue));
    }

    // Scan boundaries and row filter are all derived from the same dimension list.
    byte[] scanStart = codec.createStartRowKey(dimensions, null, startTs, false);
    byte[] scanStop = Bytes.stopKeyForPrefix(codec.createEndRowKey(dimensions, null, endTs, false));
    FactScan factScan = new FactScan(startTs, endTs, ImmutableList.<String>of(), dimensions);
    FuzzyRowFilter rowFilter = createFuzzyRowFilter(factScan, scanStart);

    Set<String> found = Sets.newHashSet();
    int scannedRecords = 0;
    try (Scanner scanner = timeSeriesTable.scan(scanStart, scanStop, rowFilter)) {
        for (Row current = scanner.next(); current != null; current = scanner.next()) {
            if (++scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
                break;
            }
            byte[] key = current.getRow();
            // The scan configuration only filters whole rows, so the time range is re-checked here.
            long tsAtStartColumn = codec.getTimestamp(key, codec.createColumn(startTs));
            if (tsAtStartColumn < startTs) {
                continue;
            }
            long tsAtEndColumn = codec.getTimestamp(key, codec.createColumn(endTs));
            if (tsAtEndColumn > endTs) {
                // past the requested range: nothing more to read from this scanner
                break;
            }
            found.add(codec.getMeasureName(current.getRow()));
        }
    }
    LOG.trace("search for measures completed, scanned records: {}", scannedRecords);
    return found;
}
Also used : Scanner(co.cask.cdap.api.dataset.table.Scanner) DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue) Row(co.cask.cdap.api.dataset.table.Row) FuzzyRowFilter(co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter)

Example 22 with Row

use of co.cask.cdap.api.dataset.table.Row in project cdap by caskdata.

the class FactTable method findSingleDimensionValue.

/**
   * Searches for first non-null valued dimensions in records that contain given list of dimensions and match given
   * dimension values in given time range. Returned dimension values are those that are not defined in given
   * dimension values.
   * <p>
   * At most one dimension value is taken from each examined record: the first dimension (in the order of
   * {@code allDimensionNames}) that is not a key of {@code dimensionSlice} and has a non-null value in the record.
   * The search stops early once more than {@code MAX_RECORDS_TO_SCAN_DURING_SEARCH} records have been read or more
   * than {@code MAX_SCANS_DURING_SEARCH} scans have been started, so the result may be incomplete.
   * @param allDimensionNames list of all dimension names to be present in the record
   * @param dimensionSlice dimension values to filter by, {@code null} means any non-null value. Dimensions whose
   *                       names are not keys of this map are the candidates to be returned.
   * @param startTs start of the time range, in seconds
   * @param endTs end of the time range, in seconds
   * @return {@link Set} of {@link DimensionValue}s; empty if every name in {@code allDimensionNames} is a key of
   *         {@code dimensionSlice}
   */
// todo: pass a limit on number of dimensionValues returned
// todo: kinda not cool API when we expect null values in a map...
public Set<DimensionValue> findSingleDimensionValue(List<String> allDimensionNames, Map<String, String> dimensionSlice, long startTs, long endTs) {
    // Algorithm, briefly:
    // We scan in the records which have given allDimensionNames. We use dimensionSlice as a criterion for scan.
    // If record from the scan has non-null values in the dimensions which are not specified in dimensionSlice,
    // we use first of such dimension as a value to return.
    // When we find value to return, since we only fill a single dimension, we are not interested in drilling down
    // further and instead attempt to fast-forward (jump) to a record that has different value in that dimension.
    // Thus we find all results.
    List<DimensionValue> allDimensions = Lists.newArrayList();
    // NOTE(review): filledDimension is populated below but never read within this method.
    List<DimensionValue> filledDimension = Lists.newArrayList();
    // positions (within allDimensionNames) of the dimensions that dimensionSlice does not mention
    List<Integer> dimToFillIndexes = Lists.newArrayList();
    for (int i = 0; i < allDimensionNames.size(); i++) {
        String dimensionName = allDimensionNames.get(i);
        if (!dimensionSlice.containsKey(dimensionName)) {
            // not in the slice: scan with a null value and remember the position as a candidate to fill
            dimToFillIndexes.add(i);
            allDimensions.add(new DimensionValue(dimensionName, null));
        } else {
            // in the slice: use the provided value (which itself may be null) as scan criterion
            DimensionValue dimensionValue = new DimensionValue(dimensionName, dimensionSlice.get(dimensionName));
            filledDimension.add(dimensionValue);
            allDimensions.add(dimensionValue);
        }
    }
    // If provided dimensions contain all values filled in, there's nothing to look for
    if (dimToFillIndexes.isEmpty()) {
        return Collections.emptySet();
    }
    Set<DimensionValue> result = Sets.newHashSet();
    // counters used both for enforcing the search limits and for the trace message at the end
    int scans = 0;
    int scannedRecords = 0;
    // build a scan
    byte[] startRow = codec.createStartRowKey(allDimensions, null, startTs, false);
    byte[] endRow = codec.createEndRowKey(allDimensions, null, endTs, false);
    endRow = Bytes.stopKeyForPrefix(endRow);
    FuzzyRowFilter fuzzyRowFilter = createFuzzyRowFilter(new FactScan(startTs, endTs, ImmutableList.<String>of(), allDimensions), startRow);
    // Not a try-with-resources: the scanner is closed and replaced inside the loop when fast-forwarding.
    Scanner scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
    scans++;
    try {
        Row rowResult;
        while ((rowResult = scanner.next()) != null) {
            scannedRecords++;
            // todo: make configurable
            if (scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
                break;
            }
            byte[] rowKey = rowResult.getRow();
            // filter out columns by time range (scan configuration only filters whole rows)
            if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) < startTs) {
                continue;
            }
            if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
                // we're done with scanner
                break;
            }
            List<DimensionValue> dimensionValues = codec.getDimensionValues(rowResult.getRow());
            // At this point, we know that the record is in right time range and its dimensions matches given.
            // We try to find first non-null valued dimension in the record that was not in given dimensions: we use it
            // to form next drill down suggestion
            int filledIndex = -1;
            for (int index : dimToFillIndexes) {
                // todo: it may be not efficient, if dimensionValues is not array-backed list: i.e. if access by index is
                //       not fast
                DimensionValue dimensionValue = dimensionValues.get(index);
                if (dimensionValue.getValue() != null) {
                    result.add(dimensionValue);
                    filledIndex = index;
                    break;
                }
            }
            // todo: fast-forwarding (jumping) should be done on server-side (CDAP-1421)
            if (filledIndex >= 0) {
                // Close the current scanner and null the reference first, so that the finally block does not close
                // it a second time if we break out on the scan limit or if opening the next scanner throws.
                scanner.close();
                scanner = null;
                scans++;
                if (scans > MAX_SCANS_DURING_SEARCH) {
                    break;
                }
                // restart the scan from the row key computed by the codec for the dimension we just filled
                startRow = codec.getNextRowKey(rowResult.getRow(), filledIndex);
                scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
            }
        }
    } finally {
        // scanner is null only when it was already closed inside the loop
        if (scanner != null) {
            scanner.close();
        }
    }
    LOG.trace("search for dimensions completed, scans performed: {}, scanned records: {}", scans, scannedRecords);
    return result;
}
Also used : Scanner(co.cask.cdap.api.dataset.table.Scanner) DimensionValue(co.cask.cdap.api.dataset.lib.cube.DimensionValue) Row(co.cask.cdap.api.dataset.table.Row) FuzzyRowFilter(co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter)

Example 23 with Row

use of co.cask.cdap.api.dataset.table.Row in project cdap by caskdata.

the class FactTable method delete.

/**
 * Deletes entries in fact table.
 * <p>
 * For every row returned by the scanner built from {@code scan}, the columns whose timestamp lies within
 * {@code [scan.getStartTs(), scan.getEndTs()]} are deleted. Rows that have no column in that range are left
 * untouched: no delete call is issued for them.
 *
 * @param scan specifies deletion criteria
 */
public void delete(FactScan scan) {
    // the time bounds do not change while scanning, so read them once
    long startTs = scan.getStartTs();
    long endTs = scan.getEndTs();
    try (Scanner scanner = getScanner(scan)) {
        Row row;
        while ((row = scanner.next()) != null) {
            byte[] rowKey = row.getRow();
            List<byte[]> columns = Lists.newArrayList();
            boolean exhausted = false;
            for (byte[] column : row.getColumns().keySet()) {
                long ts = codec.getTimestamp(rowKey, column);
                if (ts < startTs) {
                    continue;
                }
                if (ts > endTs) {
                    // NOTE(review): stopping here presumes columns (and rows) are iterated in ascending
                    // timestamp order - confirm against the codec / table implementation.
                    exhausted = true;
                    break;
                }
                columns.add(column);
            }
            // todo: do deletes efficiently, in batches, not one-by-one
            // Skip the call when nothing in this row is in range: it avoids a needless table operation and
            // does not rely on how the table treats an empty column array.
            if (!columns.isEmpty()) {
                timeSeriesTable.delete(rowKey, columns.toArray(new byte[columns.size()][]));
            }
            if (exhausted) {
                break;
            }
        }
    }
}
Also used : Scanner(co.cask.cdap.api.dataset.table.Scanner) Row(co.cask.cdap.api.dataset.table.Row)

Example 24 with Row

use of co.cask.cdap.api.dataset.table.Row in project cdap by caskdata.

the class MetadataDataset method deleteIndexes.

/**
 * Removes every index row stored under the index-key prefix of the given entity and metadata key.
 *
 * @param targetId the {@link NamespacedEntityId} whose index entries are to be removed
 * @param metadataKey the metadata key of the specified {@link NamespacedEntityId} whose indexes are removed
 */
private void deleteIndexes(NamespacedEntityId targetId, String metadataKey) {
    // The index key built without an index value serves as the scan prefix.
    byte[] prefix = MdsKey.getMDSIndexKey(targetId, metadataKey, null).getKey();
    try (Scanner indexScanner = indexedTable.scan(prefix, Bytes.stopKeyForPrefix(prefix))) {
        for (Row indexRow = indexScanner.next(); indexRow != null; indexRow = indexScanner.next()) {
            deleteIndexRow(indexRow);
        }
    }
}
Also used : Scanner(co.cask.cdap.api.dataset.table.Scanner) MDSKey(co.cask.cdap.data2.dataset2.lib.table.MDSKey) Row(co.cask.cdap.api.dataset.table.Row)

Example 25 with Row

use of co.cask.cdap.api.dataset.table.Row in project cdap by caskdata.

the class ReflectionTableTest method testStructuredRecordProjection.

/**
 * Writes a {@code User} row using the full schema, reads it back through a reader built with the
 * {@code User2} schema, and checks the resulting record against the expected projected user.
 */
@Test
public void testStructuredRecordProjection() throws Exception {
    DatasetProperties tableProps = DatasetProperties.builder().build();
    dsFrameworkUtil.createInstance("table", users, tableProps);
    try {
        final Table table = dsFrameworkUtil.getInstance(users);
        final byte[] key = Bytes.toBytes(123);
        final User2 expected = new User2("Samuel L.", 123L, ((Float) 50000000.02f).doubleValue(), Double.MAX_VALUE, ByteBuffer.wrap(new byte[] { 0, 1, 2 }));
        final Schema writeSchema = new ReflectionSchemaGenerator().generate(User.class);
        final Schema readSchema = new ReflectionSchemaGenerator().generate(User2.class);
        // TableDataset is not accessible here, but we know that's the underlying implementation...
        TransactionExecutor executor = dsFrameworkUtil.newTransactionExecutor((TransactionAware) table);
        TransactionExecutor.Subroutine writeThenReadBack = new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws Exception {
                // store the full user under the row key
                Put userPut = new Put(key);
                ReflectionPutWriter<User> writer = new ReflectionPutWriter<>(writeSchema);
                writer.write(SAMUEL, userPut);
                table.put(userPut);
                // read the same row back, projecting it onto the narrower schema
                Row stored = table.get(key);
                ReflectionRowRecordReader reader = new ReflectionRowRecordReader(readSchema, null);
                StructuredRecord projectedRecord = reader.read(stored, writeSchema);
                assertRecordEqualsUser(expected, projectedRecord);
            }
        };
        executor.execute(writeThenReadBack);
    } finally {
        dsFrameworkUtil.deleteInstance(users);
    }
}
Also used : Table(co.cask.cdap.api.dataset.table.Table) Schema(co.cask.cdap.api.data.schema.Schema) TransactionExecutor(org.apache.tephra.TransactionExecutor) ReflectionSchemaGenerator(co.cask.cdap.internal.io.ReflectionSchemaGenerator) Put(co.cask.cdap.api.dataset.table.Put) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) ReflectionPutWriter(co.cask.cdap.internal.io.ReflectionPutWriter) Row(co.cask.cdap.api.dataset.table.Row) ReflectionRowRecordReader(co.cask.cdap.internal.io.ReflectionRowRecordReader) Test(org.junit.Test)

Aggregations

Row (co.cask.cdap.api.dataset.table.Row): 111, Scanner (co.cask.cdap.api.dataset.table.Scanner): 60, Test (org.junit.Test): 23, Table (co.cask.cdap.api.dataset.table.Table): 20, Get (co.cask.cdap.api.dataset.table.Get): 16, ArrayList (java.util.ArrayList): 16, TransactionExecutor (org.apache.tephra.TransactionExecutor): 16, Map (java.util.Map): 15, Put (co.cask.cdap.api.dataset.table.Put): 14, HashMap (java.util.HashMap): 10, Scan (co.cask.cdap.api.dataset.table.Scan): 9, TransactionAware (org.apache.tephra.TransactionAware): 9, MDSKey (co.cask.cdap.data2.dataset2.lib.table.MDSKey): 8, QueueEntryRow (co.cask.cdap.data2.transaction.queue.QueueEntryRow): 8, DatasetId (co.cask.cdap.proto.id.DatasetId): 8, IOException (java.io.IOException): 8, ImmutableMap (com.google.common.collect.ImmutableMap): 7, Transaction (org.apache.tephra.Transaction): 7, WriteOnly (co.cask.cdap.api.annotation.WriteOnly): 6, Schema (co.cask.cdap.api.data.schema.Schema): 6