
Example 76 with Row

use of io.cdap.cdap.api.dataset.table.Row in project cdap by cdapio.

the class FactScanner method createIterator.

private Iterator<FactScanResult> createIterator() {
    return new AbstractIterator<FactScanResult>() {

        @Override
        protected FactScanResult computeNext() {
            Row rowResult;
            while ((rowResult = scanner.next()) != null) {
                rowScanned++;
                byte[] rowKey = rowResult.getRow();
                // Decode context and metric from key
                String measureName = codec.getMeasureName(rowKey);
                // if measureNames is empty we include all metrics
                if (!measureNames.isEmpty() && !measureNames.contains(measureName)) {
                    continue;
                }
                // todo: codec.getDimensionValues(rowKey) needs to un-encode dimension names, which may result in a read
                // from the entity table (depending on the cache and its state). To avoid that, we can pass the scanner
                // the list of dimension names, as we *always* know them (they are given) at the time of scanning.
                List<DimensionValue> dimensionValues = codec.getDimensionValues(rowKey);
                boolean exhausted = false;
                List<TimeValue> timeValues = Lists.newLinkedList();
                // todo: entry set is ordered by ts?
                for (Map.Entry<byte[], byte[]> columnValue : rowResult.getColumns().entrySet()) {
                    long ts = codec.getTimestamp(rowKey, columnValue.getKey());
                    if (ts < startTs) {
                        continue;
                    }
                    if (ts > endTs) {
                        exhausted = true;
                        break;
                    }
                    // todo: move Bytes.toLong into codec?
                    TimeValue timeValue = new TimeValue(ts, Bytes.toLong(columnValue.getValue()));
                    timeValues.add(timeValue);
                }
                if (timeValues.isEmpty() && exhausted) {
                    break;
                }
                // todo: can return empty list, if all data is < startTs or > endTs
                return new FactScanResult(measureName, dimensionValues, timeValues);
            }
            scanner.close();
            return endOfData();
        }
    };
}
Also used : DimensionValue(io.cdap.cdap.api.dataset.lib.cube.DimensionValue) AbstractIterator(com.google.common.collect.AbstractIterator) Row(io.cdap.cdap.api.dataset.table.Row) Map(java.util.Map) TimeValue(io.cdap.cdap.api.dataset.lib.cube.TimeValue)
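
The computeNext() above follows the usual CDAP scan idiom: pull Rows from a Scanner, decode the row key, then walk getColumns(). Below is a minimal stand-alone sketch of that idiom against a plain Table, with the metric codec and measure filtering left out. It assumes, purely for illustration, that each column qualifier and each value is an 8-byte long (a timestamp and a counter), and that the Bytes helper is io.cdap.cdap.api.common.Bytes.

import io.cdap.cdap.api.common.Bytes;
import io.cdap.cdap.api.dataset.table.Row;
import io.cdap.cdap.api.dataset.table.Scanner;
import io.cdap.cdap.api.dataset.table.Table;
import java.util.Map;

public final class ScanIdiomSketch {

    /** Prints every (timestamp, value) pair found in the given row-key range. */
    static void dump(Table table, byte[] startRow, byte[] stopRow) {
        Scanner scanner = table.scan(startRow, stopRow);
        try {
            Row row;
            while ((row = scanner.next()) != null) {
                byte[] rowKey = row.getRow();
                for (Map.Entry<byte[], byte[]> column : row.getColumns().entrySet()) {
                    // Assumption for illustration: qualifier = 8-byte timestamp, value = 8-byte counter.
                    long ts = Bytes.toLong(column.getKey());
                    long value = Bytes.toLong(column.getValue());
                    System.out.println(Bytes.toString(rowKey) + " @" + ts + " = " + value);
                }
            }
        } finally {
            // Always release the scanner, exactly as computeNext() does once the scan is exhausted.
            scanner.close();
        }
    }
}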

Example 77 with Row

use of io.cdap.cdap.api.dataset.table.Row in project cdap by cdapio.

the class FactTable method findSingleDimensionValue.

/**
 * Searches for the first non-null-valued dimension in records that contain the given list of dimensions and match
 * the given dimension values in the given time range. Returned dimension values are those that are not defined in
 * the given dimension values.
 * @param allDimensionNames list of all dimension names to be present in the record
 * @param dimensionSlice dimension values to filter by, {@code null} means any non-null value.
 * @param startTs start of the time range, in seconds
 * @param endTs end of the time range, in seconds
 * @return {@link Set} of {@link DimensionValue}s
 */
// todo: pass a limit on number of dimensionValues returned
// todo: kinda not cool API when we expect null values in a map...
public Set<DimensionValue> findSingleDimensionValue(List<String> allDimensionNames, Map<String, String> dimensionSlice, long startTs, long endTs) {
    // Algorithm, briefly:
    // We scan the records that have the given allDimensionNames, using dimensionSlice as the scan criteria.
    // If a record from the scan has non-null values in dimensions that are not specified in dimensionSlice,
    // we use the first such dimension as a value to return.
    // Once we find a value to return, since we only fill a single dimension, we are not interested in drilling down
    // further and instead attempt to fast-forward (jump) to a record that has a different value in that dimension.
    // Thus we find all results.
    List<DimensionValue> allDimensions = Lists.newArrayList();
    List<Integer> dimToFillIndexes = Lists.newArrayList();
    for (int i = 0; i < allDimensionNames.size(); i++) {
        String dimensionName = allDimensionNames.get(i);
        if (!dimensionSlice.containsKey(dimensionName)) {
            dimToFillIndexes.add(i);
            allDimensions.add(new DimensionValue(dimensionName, null));
        } else {
            DimensionValue dimensionValue = new DimensionValue(dimensionName, dimensionSlice.get(dimensionName));
            allDimensions.add(dimensionValue);
        }
    }
    // If provided dimensions contain all values filled in, there's nothing to look for
    if (dimToFillIndexes.isEmpty()) {
        return Collections.emptySet();
    }
    Set<DimensionValue> result = Sets.newHashSet();
    int scans = 0;
    int scannedRecords = 0;
    // build a scan
    byte[] startRow = codec.createStartRowKey(allDimensions, null, startTs, false);
    byte[] endRow = codec.createEndRowKey(allDimensions, null, endTs, false);
    endRow = Bytes.stopKeyForPrefix(endRow);
    FuzzyRowFilter fuzzyRowFilter = createFuzzyRowFilter(new FactScan(startTs, endTs, Collections.emptyList(), allDimensions), startRow);
    Scanner scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
    scans++;
    try {
        Row rowResult;
        while ((rowResult = scanner.next()) != null) {
            scannedRecords++;
            // todo: make configurable
            if (scannedRecords > MAX_RECORDS_TO_SCAN_DURING_SEARCH) {
                break;
            }
            byte[] rowKey = rowResult.getRow();
            // filter out columns by time range (scan configuration only filters whole rows)
            if (codec.getTimestamp(rowKey, codec.createColumn(startTs)) < startTs) {
                continue;
            }
            if (codec.getTimestamp(rowKey, codec.createColumn(endTs)) > endTs) {
                // we're done with scanner
                break;
            }
            List<DimensionValue> dimensionValues = codec.getDimensionValues(rowResult.getRow());
            // At this point, we know that the record is in the right time range and its dimensions match the given ones.
            // We try to find the first non-null-valued dimension in the record that was not in the given dimensions:
            // we use it to form the next drill-down suggestion.
            int filledIndex = -1;
            for (int index : dimToFillIndexes) {
                // todo: it may not be efficient if dimensionValues is not an array-backed list, i.e. if access
                // by index is not fast
                DimensionValue dimensionValue = dimensionValues.get(index);
                if (dimensionValue.getValue() != null) {
                    result.add(dimensionValue);
                    filledIndex = index;
                    break;
                }
            }
            // todo: fast-forwarding (jumping) should be done on server-side (CDAP-1421)
            if (filledIndex >= 0) {
                scanner.close();
                scanner = null;
                scans++;
                if (scans > MAX_SCANS_DURING_SEARCH) {
                    break;
                }
                startRow = codec.getNextRowKey(rowResult.getRow(), filledIndex);
                scanner = timeSeriesTable.scan(startRow, endRow, fuzzyRowFilter);
            }
        }
    } finally {
        if (scanner != null) {
            scanner.close();
        }
    }
    LOG.trace("search for dimensions completed, scans performed: {}, scanned records: {}", scans, scannedRecords);
    return result;
}
Also used : Scanner(io.cdap.cdap.api.dataset.table.Scanner) DimensionValue(io.cdap.cdap.api.dataset.lib.cube.DimensionValue) Row(io.cdap.cdap.api.dataset.table.Row) FuzzyRowFilter(io.cdap.cdap.data2.dataset2.lib.table.FuzzyRowFilter)
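
A caller-side sketch for findSingleDimensionValue follows, assuming a FactTable instance is already wired up (codec and underlying table construction are out of scope, so the snippet is written as if it lived in the same package as FactTable and needs no import for it). The dimension names and the one-hour window are made up for illustration. Note that dimensionSlice is a plain HashMap rather than an ImmutableMap because, per the javadoc above, a null value is meaningful: it matches any non-null value for that dimension.

import io.cdap.cdap.api.dataset.lib.cube.DimensionValue;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

final class DimensionSuggestionSketch {

    /** Asks the fact table which values the "dataset" dimension takes for one namespace, across any app. */
    static void printDatasetSuggestions(FactTable factTable) {
        List<String> allDimensionNames = Arrays.asList("namespace", "app", "dataset");
        Map<String, String> dimensionSlice = new HashMap<>();
        dimensionSlice.put("namespace", "default");
        // null means "any non-null app", per the javadoc; the key must still be present in the slice
        dimensionSlice.put("app", null);
        // "dataset" is intentionally absent from the slice: it is the dimension we want suggestions for

        long nowSec = System.currentTimeMillis() / 1000;
        Set<DimensionValue> suggestions =
            factTable.findSingleDimensionValue(allDimensionNames, dimensionSlice, nowSec - 3600, nowSec);
        for (DimensionValue suggestion : suggestions) {
            System.out.println(suggestion.getName() + " = " + suggestion.getValue());
        }
    }
}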

Example 78 with Row

use of io.cdap.cdap.api.dataset.table.Row in project cdap by cdapio.

the class TableAssert method assertScan.

public static void assertScan(byte[][] expectedRows, byte[][][] expectedRowMaps, Scanner scanner) {
    for (int i = 0; i < expectedRows.length; i++) {
        Row next = scanner.next();
        Assert.assertNotNull(next);
        Assert.assertArrayEquals(expectedRows[i], next.getRow());
        assertRow(expectedRowMaps[i], next.getColumns());
    }
    // nothing is left in scan
    Assert.assertNull(scanner.next());
}
Also used : Row(io.cdap.cdap.api.dataset.table.Row)
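
A small usage sketch, assuming a static import of assertScan and a Table plus row-key range in scope (the names are hypothetical). With zero expected rows the loop above never runs, so the call reduces to asserting that the scan returns nothing, which is handy for verifying that a delete or a time-range filter left the range empty. For non-empty expectations, expectedRowMaps carries one byte[][] per expected row, in whatever layout the (not shown) assertRow helper expects.

// Assert that nothing remains in the [startRow, stopRow) range after the cleanup under test:
// assertScan iterates zero expected rows and then checks that scanner.next() returns null.
assertScan(new byte[0][], new byte[0][][], table.scan(startRow, stopRow));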

Example 79 with Row

use of io.cdap.cdap.api.dataset.table.Row in project hydrator-plugins by cdapio.

the class GroupByTestRun method testGroupBy.

private void testGroupBy(Engine engine) throws Exception {
    /*
                                  |--> group by user, totalPurchases:count(*), totalSpent:sum(price) --> user table
        <ts, user, item, price> --|
                                  |--> group by item, totalPurchases:count(user), latestPurchase:max(ts) --> item table
                                  |
                                  |  test same name can be used in the output schema
                                  |--> group by user, price:max(price) --> max table
     */
    String purchasesDatasetName = "purchases-groupbytest-" + engine;
    String usersDatasetName = "users-groupbytest-" + engine;
    String itemsDatasetName = "items-groupbytest-" + engine;
    String maxDatasetName = "max-groupbytest-" + engine;
    Schema purchaseSchema = Schema.recordOf("purchase", Schema.Field.of("ts", Schema.of(Schema.Type.LONG)), Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("item", Schema.of(Schema.Type.STRING)), Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)));
    ETLStage purchaseStage = new ETLStage("purchases", new ETLPlugin("Table", BatchSource.PLUGIN_TYPE, ImmutableMap.of(Properties.BatchReadableWritable.NAME, purchasesDatasetName, Properties.Table.PROPERTY_SCHEMA, purchaseSchema.toString()), null));
    Schema userSchema = Schema.recordOf("user", Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("totalPurchases", Schema.of(Schema.Type.LONG)), Schema.Field.of("totalSpent", Schema.of(Schema.Type.DOUBLE)));
    ETLStage userSinkStage = new ETLStage("users", new ETLPlugin("Table", BatchSink.PLUGIN_TYPE, ImmutableMap.of(Properties.BatchReadableWritable.NAME, usersDatasetName, Properties.Table.PROPERTY_SCHEMA_ROW_FIELD, "user", Properties.Table.PROPERTY_SCHEMA, userSchema.toString()), null));
    Schema itemSchema = Schema.recordOf("item", Schema.Field.of("item", Schema.of(Schema.Type.STRING)), Schema.Field.of("totalPurchases", Schema.of(Schema.Type.LONG)), Schema.Field.of("latestPurchase", Schema.of(Schema.Type.LONG)));
    ETLStage itemSinkStage = new ETLStage("items", new ETLPlugin("Table", BatchSink.PLUGIN_TYPE, ImmutableMap.of(Properties.BatchReadableWritable.NAME, itemsDatasetName, Properties.Table.PROPERTY_SCHEMA_ROW_FIELD, "item", Properties.Table.PROPERTY_SCHEMA, itemSchema.toString()), null));
    ETLStage userGroupStage = new ETLStage("userGroup", new ETLPlugin("GroupByAggregate", BatchAggregator.PLUGIN_TYPE, ImmutableMap.of("groupByFields", "user", "aggregates", "totalPurchases:count(*), totalSpent:sum(price)"), null));
    ETLStage itemGroupStage = new ETLStage("itemGroup", new ETLPlugin("GroupByAggregate", BatchAggregator.PLUGIN_TYPE, ImmutableMap.of("groupByFields", "item", "aggregates", "totalPurchases:count(user), latestPurchase:max(ts)"), null));
    Schema maxSchema = Schema.recordOf("max", Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)));
    ETLStage maxSinkStage = new ETLStage("max", new ETLPlugin("Table", BatchSink.PLUGIN_TYPE, ImmutableMap.of(Properties.BatchReadableWritable.NAME, maxDatasetName, Properties.Table.PROPERTY_SCHEMA_ROW_FIELD, "user", Properties.Table.PROPERTY_SCHEMA, maxSchema.toString()), null));
    ETLStage maxGroupStage = new ETLStage("maxGroup", new ETLPlugin("GroupByAggregate", BatchAggregator.PLUGIN_TYPE, ImmutableMap.of("groupByFields", "user", "aggregates", "price:max(price)"), null));
    ETLBatchConfig config = ETLBatchConfig.builder("* * * * *").addStage(purchaseStage).addStage(userSinkStage).addStage(itemSinkStage).addStage(maxSinkStage).addStage(userGroupStage).addStage(itemGroupStage).addStage(maxGroupStage).addConnection(purchaseStage.getName(), userGroupStage.getName()).addConnection(purchaseStage.getName(), itemGroupStage.getName()).addConnection(purchaseStage.getName(), maxGroupStage.getName()).addConnection(userGroupStage.getName(), userSinkStage.getName()).addConnection(itemGroupStage.getName(), itemSinkStage.getName()).addConnection(maxGroupStage.getName(), maxSinkStage.getName()).setEngine(engine).build();
    ApplicationManager appManager = deployETL(config, "groupby-test-" + engine);
    // write input data
    // 1: 1234567890000, samuel, island, 1000000
    // 2: 1234567890001, samuel, shirt, 15.34
    // 3. 1234567890001, samuel, pie, 3.14
    // 4. 1234567890002, john, pie, 3.14
    // 5. 1234567890003, john, shirt, 20.53
    DataSetManager<Table> purchaseManager = getDataset(purchasesDatasetName);
    Table purchaseTable = purchaseManager.get();
    // 1: 1234567890000, samuel, island, 1000000
    Put put = new Put(Bytes.toBytes(1));
    put.add("ts", 1234567890000L);
    put.add("user", "samuel");
    put.add("item", "island");
    put.add("price", 1000000d);
    purchaseTable.put(put);
    put = new Put(Bytes.toBytes(2));
    put.add("ts", 1234567890001L);
    put.add("user", "samuel");
    put.add("item", "shirt");
    put.add("price", 15.34d);
    purchaseTable.put(put);
    put = new Put(Bytes.toBytes(3));
    put.add("ts", 1234567890001L);
    put.add("user", "samuel");
    put.add("item", "pie");
    put.add("price", 3.14d);
    purchaseTable.put(put);
    put = new Put(Bytes.toBytes(4));
    put.add("ts", 1234567890002L);
    put.add("user", "john");
    put.add("item", "pie");
    put.add("price", 3.14d);
    purchaseTable.put(put);
    put = new Put(Bytes.toBytes(5));
    put.add("ts", 1234567890003L);
    put.add("user", "john");
    put.add("item", "shirt");
    put.add("price", 20.53d);
    purchaseTable.put(put);
    purchaseManager.flush();
    // run the pipeline
    runETLOnce(appManager);
    DataSetManager<Table> usersManager = getDataset(usersDatasetName);
    Table usersTable = usersManager.get();
    // users table should have:
    // samuel: 3, 1000000 + 15.34 + 3.14
    Row row = usersTable.get(Bytes.toBytes("samuel"));
    Assert.assertEquals(Objects.requireNonNull(row.getLong("totalPurchases")).longValue(), 3L);
    Assert.assertTrue(Math.abs(Objects.requireNonNull(row.getDouble("totalSpent")) - 1000000d - 15.34d - 3.14d) < 0.0000001);
    // john: 2, 3.14 + 20.53
    row = usersTable.get(Bytes.toBytes("john"));
    Assert.assertEquals(Objects.requireNonNull(row.getLong("totalPurchases")).longValue(), 2L);
    Assert.assertTrue(Math.abs(Objects.requireNonNull(row.getDouble("totalSpent")) - 3.14d - 20.53d) < 0.0000001);
    DataSetManager<Table> itemsManager = getDataset(itemsDatasetName);
    Table itemsTable = itemsManager.get();
    // items table should have:
    // island: 1, 1234567890000
    row = itemsTable.get(Bytes.toBytes("island"));
    Assert.assertEquals(Objects.requireNonNull(row.getLong("totalPurchases")).longValue(), 1L);
    Assert.assertEquals(Objects.requireNonNull(row.getLong("latestPurchase")).longValue(), 1234567890000L);
    // pie: 2, 1234567890002
    row = itemsTable.get(Bytes.toBytes("pie"));
    Assert.assertEquals(Objects.requireNonNull(row.getLong("totalPurchases")).longValue(), 2L);
    Assert.assertEquals(Objects.requireNonNull(row.getLong("latestPurchase")).longValue(), 1234567890002L);
    // shirt: 2, 1234567890003
    row = itemsTable.get(Bytes.toBytes("shirt"));
    Assert.assertEquals(Objects.requireNonNull(row.getLong("totalPurchases")).longValue(), 2L);
    Assert.assertEquals(Objects.requireNonNull(row.getLong("latestPurchase")).longValue(), 1234567890003L);
    DataSetManager<Table> maxManager = getDataset(maxDatasetName);
    Table maxTable = maxManager.get();
    // max table should have:
    // samuel: 1000000
    row = maxTable.get(Bytes.toBytes("samuel"));
    Assert.assertTrue(Math.abs(row.getDouble("price") - 1000000d) < 0.0000001);
    // john: 20.53
    row = maxTable.get(Bytes.toBytes("john"));
    Assert.assertTrue(Math.abs(row.getDouble("price") - 20.53d) < 0.0000001);
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) Schema(io.cdap.cdap.api.data.schema.Schema) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) Row(io.cdap.cdap.api.dataset.table.Row) Put(io.cdap.cdap.api.dataset.table.Put)
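
The five write blocks in the test repeat the same four-column Put. If more records were needed, the pattern could be factored into a small helper along these lines; the class and method names here are hypothetical, but the Put calls are exactly the ones used above.

import io.cdap.cdap.api.common.Bytes;
import io.cdap.cdap.api.dataset.table.Put;
import io.cdap.cdap.api.dataset.table.Table;

final class PurchaseWrites {

    /** Writes one <ts, user, item, price> purchase record keyed by a numeric id. */
    static void writePurchase(Table purchases, int id, long ts, String user, String item, double price) {
        Put put = new Put(Bytes.toBytes(id));
        put.add("ts", ts);
        put.add("user", user);
        put.add("item", item);
        put.add("price", price);
        purchases.put(put);
    }
}

With such a helper, the first block above would collapse to writePurchase(purchaseTable, 1, 1234567890000L, "samuel", "island", 1000000d), and similarly for the other four records.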

Example 80 with Row

use of io.cdap.cdap.api.dataset.table.Row in project hydrator-plugins by cdapio.

the class ETLMapReduceTestRun method testTableToTableWithValidations.

@SuppressWarnings("ConstantConditions")
@Test
public void testTableToTableWithValidations() throws Exception {
    Schema schema = Schema.recordOf("purchase", Schema.Field.of("rowkey", Schema.of(Schema.Type.STRING)), Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("count", Schema.of(Schema.Type.INT)), Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)), Schema.Field.of("item", Schema.of(Schema.Type.STRING)));
    ETLStage source = new ETLStage("source", new ETLPlugin("Table", BatchSource.PLUGIN_TYPE, ImmutableMap.of(Properties.BatchReadableWritable.NAME, "inputTable", Properties.Table.PROPERTY_SCHEMA_ROW_FIELD, "rowkey", Properties.Table.PROPERTY_SCHEMA, schema.toString()), null));
    ETLStage sink1 = new ETLStage("sink1", new ETLPlugin("Table", BatchSink.PLUGIN_TYPE, ImmutableMap.of(Properties.BatchReadableWritable.NAME, "outputTable", Properties.Table.PROPERTY_SCHEMA_ROW_FIELD, "rowkey", Properties.Table.PROPERTY_SCHEMA, schema.toString())));
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *").addStage(source).addStage(sink1).addConnection(source.getName(), sink1.getName()).build();
    ApplicationManager appManager = deployETL(etlConfig, "TableToTable");
    // add some data to the input table
    DataSetManager<Table> inputManager = getDataset("inputTable");
    Table inputTable = inputManager.get();
    // valid record, user name "samuel" is 6 chars long
    Put put = new Put(Bytes.toBytes("row1"));
    put.add("user", "samuel");
    put.add("count", 5);
    put.add("price", 123.45);
    put.add("item", "scotch");
    inputTable.put(put);
    inputManager.flush();
    // valid record, user name "jackson" is > 6 characters
    put = new Put(Bytes.toBytes("row2"));
    put.add("user", "jackson");
    put.add("count", 10);
    put.add("price", 123456789d);
    put.add("item", "island");
    inputTable.put(put);
    inputManager.flush();
    runETLOnce(appManager);
    DataSetManager<Table> outputManager = getDataset("outputTable");
    Table outputTable = outputManager.get();
    Row row = outputTable.get(Bytes.toBytes("row1"));
    Assert.assertEquals("samuel", row.getString("user"));
    Assert.assertEquals(5, (int) row.getInt("count"));
    Assert.assertTrue(Math.abs(123.45 - row.getDouble("price")) < 0.000001);
    Assert.assertEquals("scotch", row.getString("item"));
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ApplicationManager(io.cdap.cdap.test.ApplicationManager) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Table(io.cdap.cdap.api.dataset.table.Table) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) Schema(io.cdap.cdap.api.data.schema.Schema) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) Row(io.cdap.cdap.api.dataset.table.Row) Put(io.cdap.cdap.api.dataset.table.Put) Test(org.junit.Test)
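
The assertions above only cover "row1". A hypothetical continuation of the same test method would read "row2" back with the same Row accessors, with the expected values taken from the second put earlier in the test:

    Row row2 = outputTable.get(Bytes.toBytes("row2"));
    Assert.assertEquals("jackson", row2.getString("user"));
    Assert.assertEquals(10, (int) row2.getInt("count"));
    Assert.assertTrue(Math.abs(123456789d - row2.getDouble("price")) < 0.000001);
    Assert.assertEquals("island", row2.getString("item"));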

Aggregations

Row (io.cdap.cdap.api.dataset.table.Row): 166
Scanner (io.cdap.cdap.api.dataset.table.Scanner): 81
Test (org.junit.Test): 50
Table (io.cdap.cdap.api.dataset.table.Table): 34
Put (io.cdap.cdap.api.dataset.table.Put): 29
ArrayList (java.util.ArrayList): 26
TransactionExecutor (org.apache.tephra.TransactionExecutor): 26
Get (io.cdap.cdap.api.dataset.table.Get): 24
Schema (io.cdap.cdap.api.data.schema.Schema): 21
HashMap (java.util.HashMap): 19
MDSKey (io.cdap.cdap.data2.dataset2.lib.table.MDSKey): 16
Transaction (org.apache.tephra.Transaction): 16
TransactionAware (org.apache.tephra.TransactionAware): 16
IOException (java.io.IOException): 14
Map (java.util.Map): 14
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 13
DatasetAdmin (io.cdap.cdap.api.dataset.DatasetAdmin): 12
WriteOnly (io.cdap.cdap.api.annotation.WriteOnly): 10
DimensionValue (io.cdap.cdap.api.dataset.lib.cube.DimensionValue): 10
HBaseTable (io.cdap.cdap.data2.dataset2.lib.table.hbase.HBaseTable): 10