Example 81 with Row

Use of io.cdap.cdap.api.dataset.table.Row in project hydrator-plugins by cdapio.

From the class SparkPluginTest, method testFileSource:

@Test
public void testFileSource() throws Exception {
    Schema schema = Schema.recordOf("user", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("first", Schema.of(Schema.Type.STRING)), Schema.Field.of("last", Schema.of(Schema.Type.STRING)));
    File folder = tmpFolder.newFolder("fileSourceTest");
    File input1 = new File(folder, "input1.txt");
    File input2 = new File(folder, "input2.csv");
    File ignore1 = new File(folder, "input1.txt.done");
    File ignore2 = new File(folder, "input1");
    CharStreams.write("1,samuel,jackson\n2,dwayne,johnson", Files.newWriterSupplier(input1, Charsets.UTF_8));
    CharStreams.write("3,christopher,walken", Files.newWriterSupplier(input2, Charsets.UTF_8));
    CharStreams.write("0,nicolas,cage", Files.newWriterSupplier(ignore1, Charsets.UTF_8));
    CharStreams.write("0,orlando,bloom", Files.newWriterSupplier(ignore2, Charsets.UTF_8));
    Map<String, String> properties = ImmutableMap.<String, String>builder().put("path", folder.getAbsolutePath()).put("format", "csv").put("schema", schema.toString()).put("referenceName", "fileSourceTestInput").put("ignoreThreshold", "300").put("extensions", "txt,csv").build();
    DataStreamsConfig pipelineCfg = DataStreamsConfig.builder().addStage(new ETLStage("source", new ETLPlugin("File", StreamingSource.PLUGIN_TYPE, properties, null))).addStage(new ETLStage("sink", MockSink.getPlugin("fileOutput"))).addConnection("source", "sink").setBatchInterval("1s").build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(DATASTREAMS_ARTIFACT, pipelineCfg);
    ApplicationId appId = NamespaceId.DEFAULT.app("FileSourceApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 1, TimeUnit.MINUTES);
    Map<Long, String> expected = ImmutableMap.of(1L, "samuel jackson", 2L, "dwayne johnson", 3L, "christopher walken");
    final DataSetManager<Table> outputManager = getDataset("fileOutput");
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        Map<Long, String> actual = new HashMap<>();
        for (StructuredRecord outputRecord : MockSink.readOutput(outputManager)) {
            actual.put(outputRecord.get("id"), outputRecord.get("first") + " " + outputRecord.get("last"));
        }
        return expected.equals(actual);
    }, 4, TimeUnit.MINUTES);
    // now write a new file to make sure new files are picked up.
    File input3 = new File(folder, "input3.txt");
    CharStreams.write("4,terry,crews\n5,rocky,balboa", Files.newWriterSupplier(input3, Charsets.UTF_8));
    Map<Long, String> expected2 = ImmutableMap.of(4L, "terry crews", 5L, "rocky balboa");
    Table outputTable = outputManager.get();
    Scanner scanner = outputTable.scan(null, null);
    Row row;
    while ((row = scanner.next()) != null) {
        outputTable.delete(row.getRow());
    }
    outputManager.flush();
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        Map<Long, String> actual = new HashMap<>();
        for (StructuredRecord outputRecord : MockSink.readOutput(outputManager)) {
            actual.put(outputRecord.get("id"), outputRecord.get("first") + " " + outputRecord.get("last"));
        }
        return expected2.equals(actual);
    }, 4, TimeUnit.MINUTES);
    sparkManager.stop();
}
Also used : Scanner(io.cdap.cdap.api.dataset.table.Scanner) ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) Table(io.cdap.cdap.api.dataset.table.Table) HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) Row(io.cdap.cdap.api.dataset.table.Row) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) File(java.io.File) Test(org.junit.Test)
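
The scan-and-delete loop in the middle of the test is the usual way to clear a CDAP Table between assertions. A minimal sketch of that pattern as a standalone helper (the class and method names are illustrative, not part of the test above; callers would still flush the enclosing DataSetManager afterwards):

import io.cdap.cdap.api.dataset.table.Row;
import io.cdap.cdap.api.dataset.table.Scanner;
import io.cdap.cdap.api.dataset.table.Table;

public final class TableTestUtil {
    private TableTestUtil() {
    }

    // Deletes every row by scanning the full key range (null start and stop keys)
    // and removing each returned row key.
    public static void clearTable(Table table) {
        try (Scanner scanner = table.scan(null, null)) {
            Row row;
            while ((row = scanner.next()) != null) {
                table.delete(row.getRow());
            }
        }
    }
}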

Example 82 with Row

Use of io.cdap.cdap.api.dataset.table.Row in project cdap by caskdata.

From the class MapReduceWithPartitionedTest, method testTimePartitionedWithMR:

@Test
public void testTimePartitionedWithMR() throws Exception {
    final ApplicationWithPrograms app = deployApp(AppWithTimePartitionedFileSet.class);
    // write a value to the input table
    final Table table = datasetCache.getDataset(AppWithTimePartitionedFileSet.INPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            table.put(Bytes.toBytes("x"), AppWithTimePartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("1"));
        }
    });
    final long time = DATE_FORMAT.parse("1/15/15 11:15 am").getTime();
    final long time5 = time + TimeUnit.MINUTES.toMillis(5);
    // run the partition writer m/r with this output partition time
    Map<String, String> runtimeArguments = Maps.newHashMap();
    Map<String, String> outputArgs = Maps.newHashMap();
    TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, time);
    final ImmutableMap<String, String> assignedMetadata = ImmutableMap.of("region", "13", "data.source.name", "input", "data.source.type", "table");
    TimePartitionedFileSetArguments.setOutputPartitionMetadata(outputArgs, assignedMetadata);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs));
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a partition in the tpfs
    final TimePartitionedFileSet tpfs = datasetCache.getDataset(TIME_PARTITIONED);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) tpfs).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            TimePartitionDetail partition = tpfs.getPartitionByTime(time);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertNotNull(path);
            Assert.assertTrue(path.contains("2015-01-15/11-15"));
            Assert.assertEquals(assignedMetadata, partition.getMetadata().asMap());
        }
    });
    // delete the data in the input table and write a new row
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            table.delete(Bytes.toBytes("x"));
            table.put(Bytes.toBytes("y"), AppWithTimePartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("2"));
        }
    });
    // now run the m/r again with a new partition time, say 5 minutes later
    TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, time5);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs));
    // make the mapreduce add the partition in destroy, to validate that this does not fail the job
    runtimeArguments.put(AppWithTimePartitionedFileSet.COMPAT_ADD_PARTITION, "true");
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a partition in the tpfs
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) tpfs).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Partition partition = tpfs.getPartitionByTime(time5);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertNotNull(path);
            Assert.assertTrue(path.contains("2015-01-15/11-20"));
        }
    });
    // now run a map/reduce that reads all the partitions
    runtimeArguments = Maps.newHashMap();
    Map<String, String> inputArgs = Maps.newHashMap();
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(5));
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time5 + TimeUnit.MINUTES.toMillis(5));
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "a");
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read both partitions - and written both x and y to row a
    final Table output = datasetCache.getDataset(AppWithTimePartitionedFileSet.OUTPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("a"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertEquals("2", row.getString("y"));
        }
    });
    // now run a map/reduce that reads a range of the partitions, namely the first one
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(5));
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time + TimeUnit.MINUTES.toMillis(2));
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "b");
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read the first partition only - and written only x to row b
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("b"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertNull(row.get("y"));
        }
    });
    // now run a map/reduce that reads no partitions (because the range matches nothing)
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(10));
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time - TimeUnit.MINUTES.toMillis(9));
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "n");
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read no partitions - and written nothing to row n
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("n"));
            Assert.assertTrue(row.isEmpty());
        }
    });
}
Also used : Partition(io.cdap.cdap.api.dataset.lib.Partition) Table(io.cdap.cdap.api.dataset.table.Table) TransactionExecutor(org.apache.tephra.TransactionExecutor) ApplicationWithPrograms(io.cdap.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) TransactionAware(org.apache.tephra.TransactionAware) BasicArguments(io.cdap.cdap.internal.app.runtime.BasicArguments) TimePartitionDetail(io.cdap.cdap.api.dataset.lib.TimePartitionDetail) Row(io.cdap.cdap.api.dataset.table.Row) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) Test(org.junit.Test)
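
The verification subroutines above follow a get-then-column-lookup pattern on Row. A small sketch of that read path (the helper name readColumn is illustrative, and the io.cdap.cdap.api.common.Bytes import is assumed since the example's import list does not show it):

import io.cdap.cdap.api.common.Bytes;
import io.cdap.cdap.api.dataset.table.Row;
import io.cdap.cdap.api.dataset.table.Table;

// Returns the value of the given column in the given row, or null when the row
// does not exist or lacks the column. Table.get returns an empty Row (not null)
// on a miss, so Row.isEmpty() is the existence check.
static String readColumn(Table table, String rowKey, String column) {
    Row row = table.get(Bytes.toBytes(rowKey));
    if (row.isEmpty()) {
        return null;
    }
    return row.getString(column);
}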

Example 83 with Row

Use of io.cdap.cdap.api.dataset.table.Row in project cdap by caskdata.

From the class FactTableTest, method testPreSplits:

@Test
public void testPreSplits() throws Exception {
    InMemoryTableService.create("presplitEntityTable");
    InMemoryTableService.create("presplitDataTable");
    int resolution = 10;
    int rollTimebaseInterval = 2;
    InMemoryMetricsTable metricsTable = new InMemoryMetricsTable("presplitDataTable");
    FactTable table = new FactTable(metricsTable, new EntityTable(new InMemoryMetricsTable("presplitEntityTable")), resolution, rollTimebaseInterval);
    byte[][] splits = FactTable.getSplits(3);
    long ts = System.currentTimeMillis() / 1000;
    DimensionValue dimVal1 = new DimensionValue("dim1", "value1");
    DimensionValue dimVal2 = new DimensionValue("dim2", "value2");
    DimensionValue dimVal3 = new DimensionValue("dim3", "value3");
    // first agg view: dim1
    table.add(ImmutableList.of(new Fact(ts, ImmutableList.of(dimVal1), new Measurement("metric1", MeasureType.COUNTER, 1))));
    // second agg view: dim1 & dim2
    table.add(ImmutableList.of(new Fact(ts, ImmutableList.of(dimVal1, dimVal2), new Measurement("metric1", MeasureType.COUNTER, 1))));
    // third agg view: dim3
    table.add(ImmutableList.of(new Fact(ts, ImmutableList.of(dimVal3), new Measurement("metric1", MeasureType.COUNTER, 1))));
    // Verify all written records are spread across splits
    Scanner scanner = metricsTable.scan(null, null, null);
    Row row;
    Set<Integer> splitsWithRows = Sets.newHashSet();
    while ((row = scanner.next()) != null) {
        boolean added = false;
        for (int i = 0; i < splits.length; i++) {
            if (Bytes.compareTo(row.getRow(), splits[i]) < 0) {
                splitsWithRows.add(i);
                added = true;
                break;
            }
        }
        if (!added) {
            // falls into last split
            splitsWithRows.add(splits.length);
        }
    }
    Assert.assertEquals(3, splitsWithRows.size());
}
Also used : Measurement(io.cdap.cdap.api.dataset.lib.cube.Measurement) Scanner(io.cdap.cdap.api.dataset.table.Scanner) DimensionValue(io.cdap.cdap.api.dataset.lib.cube.DimensionValue) InMemoryMetricsTable(io.cdap.cdap.data2.dataset2.lib.table.inmemory.InMemoryMetricsTable) Row(io.cdap.cdap.api.dataset.table.Row) Test(org.junit.Test)
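
The split-bucketing loop at the end of the test can be factored into a helper that maps a row key to its split index; a sketch under the same assumptions as the test (splits sorted ascending, keys past the last boundary fall into the trailing bucket, Bytes again assumed to be io.cdap.cdap.api.common.Bytes):

import io.cdap.cdap.api.common.Bytes;

// Returns the index of the split a row key falls into: keys that sort below
// splits[i] belong to bucket i, everything else to the trailing bucket.
static int splitIndexFor(byte[] rowKey, byte[][] splits) {
    for (int i = 0; i < splits.length; i++) {
        if (Bytes.compareTo(rowKey, splits[i]) < 0) {
            return i;
        }
    }
    return splits.length;
}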

Example 84 with Row

Use of io.cdap.cdap.api.dataset.table.Row in project cdap by caskdata.

From the class LevelDBTableCoreTest, method testScan:

@Test
public void testScan() throws Exception {
    String tableName = "testScanTable";
    TableId tableId = TableId.from("default", tableName);
    {
        Assert.assertNull(service.getTableStats().get(tableId));
        service.ensureTableExists(tableName);
        LevelDBTableCore table = new LevelDBTableCore(tableName, service);
        int numRows = 8;
        int numVersion = 8;
        // Write data to multiple rows and multiple versions per row and col.
        writeData(table, rowNamePrefix, numRows, colName, 1024, numVersion);
        // Scan only the first row and make sure no data from other rows are returned.
        try (Scanner scanner = table.scan(getRowName(rowNamePrefix, 0).getBytes(StandardCharsets.UTF_8), getRowName(rowNamePrefix, 1).getBytes(StandardCharsets.UTF_8), null, null, null)) {
            Row row;
            while ((row = scanner.next()) != null) {
                String rowName = new String(row.getRow(), StandardCharsets.UTF_8);
                Assert.assertTrue(rowName.equals(getRowName(rowNamePrefix, 0)));
            }
        }
        // Test a corner case by writing to row i + 1 at default version (i.e. max version) and scan row i,
        // because scan uses the max version at row i + 1 as scan end key (excluded) and we want to make sure
        // nothing from row i + 1 gets returned in such case.
        writeRowColDefaultVersion(table, getRowName(rowNamePrefix, 1), colName, "dummy-value");
        try (Scanner scanner = table.scan(getRowName(rowNamePrefix, 0).getBytes(StandardCharsets.UTF_8), getRowName(rowNamePrefix, 1).getBytes(StandardCharsets.UTF_8), null, null, null)) {
            Row row;
            while ((row = scanner.next()) != null) {
                String rowName = new String(row.getRow(), StandardCharsets.UTF_8);
                Assert.assertTrue(rowName.equals(getRowName(rowNamePrefix, 0)));
            }
        }
        service.dropTable(tableName);
    }
}
Also used : TableId(io.cdap.cdap.data2.util.TableId) Scanner(io.cdap.cdap.api.dataset.table.Scanner) Row(io.cdap.cdap.api.dataset.table.Row) Test(org.junit.Test)
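
Both scans above pass an inclusive start key and an exclusive stop key, followed by three nulls. A sketch that collects the row names returned by such a bounded scan (the LevelDBTableCore package path is an assumption, as the example's import list does not show it):

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import io.cdap.cdap.api.dataset.table.Row;
import io.cdap.cdap.api.dataset.table.Scanner;
import io.cdap.cdap.data2.dataset2.lib.table.leveldb.LevelDBTableCore;

// Collects the UTF-8 row names produced by a [startRow, stopRow) scan.
// The three trailing nulls are passed through unchanged from the test's scan calls.
static List<String> scanRowNames(LevelDBTableCore table, String startRow, String stopRow) throws Exception {
    List<String> names = new ArrayList<>();
    try (Scanner scanner = table.scan(startRow.getBytes(StandardCharsets.UTF_8),
                                      stopRow.getBytes(StandardCharsets.UTF_8),
                                      null, null, null)) {
        Row row;
        while ((row = scanner.next()) != null) {
            names.add(new String(row.getRow(), StandardCharsets.UTF_8));
        }
    }
    return names;
}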

Example 85 with Row

Use of io.cdap.cdap.api.dataset.table.Row in project cdap by caskdata.

From the class TableTest, method verifyScanWithFuzzyRowFilter:

private static void verifyScanWithFuzzyRowFilter(Table table) {
    FuzzyRowFilter filter = new FuzzyRowFilter(ImmutableList.of(ImmutablePair.of(new byte[] { '*', 'b', '*', 'b' }, new byte[] { 0x01, 0x00, 0x01, 0x00 })));
    Scanner scanner = table.scan(new Scan(null, null, filter));
    int count = 0;
    while (true) {
        Row entry = scanner.next();
        if (entry == null) {
            break;
        }
        Assert.assertTrue(entry.getRow()[1] == 'b' && entry.getRow()[3] == 'b');
        Assert.assertEquals(1, entry.getColumns().size());
        Assert.assertTrue(entry.getColumns().containsKey(C1));
        Assert.assertArrayEquals(V1, entry.get(C1));
        count++;
    }
    Assert.assertEquals(9, count);
}
Also used : Scanner(io.cdap.cdap.api.dataset.table.Scanner) Scan(io.cdap.cdap.api.dataset.table.Scan) Row(io.cdap.cdap.api.dataset.table.Row)
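
The fuzzy mask above pairs a key pattern with a per-byte mask: 0x00 positions must match the pattern exactly while 0x01 positions are wildcards, so { '*', 'b', '*', 'b' } with { 1, 0, 1, 0 } matches any four-byte key with 'b' at positions 1 and 3, which is exactly what the assertion checks. A minimal sketch that counts the rows a filtered scan returns, with the filter supplied by the caller (the FuzzyRowFilter package path is assumed; it is not listed in the example's imports):

import io.cdap.cdap.api.dataset.table.Row;
import io.cdap.cdap.api.dataset.table.Scan;
import io.cdap.cdap.api.dataset.table.Scanner;
import io.cdap.cdap.api.dataset.table.Table;
import io.cdap.cdap.data2.dataset2.lib.table.FuzzyRowFilter;

// Counts every row returned by a scan over the full key range with the
// supplied fuzzy-row filter applied.
static int countMatchingRows(Table table, FuzzyRowFilter filter) {
    int count = 0;
    try (Scanner scanner = table.scan(new Scan(null, null, filter))) {
        Row row;
        while ((row = scanner.next()) != null) {
            count++;
        }
    }
    return count;
}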

Aggregations

Row (io.cdap.cdap.api.dataset.table.Row): 166
Scanner (io.cdap.cdap.api.dataset.table.Scanner): 81
Test (org.junit.Test): 50
Table (io.cdap.cdap.api.dataset.table.Table): 34
Put (io.cdap.cdap.api.dataset.table.Put): 29
ArrayList (java.util.ArrayList): 26
TransactionExecutor (org.apache.tephra.TransactionExecutor): 26
Get (io.cdap.cdap.api.dataset.table.Get): 24
Schema (io.cdap.cdap.api.data.schema.Schema): 21
HashMap (java.util.HashMap): 19
MDSKey (io.cdap.cdap.data2.dataset2.lib.table.MDSKey): 16
Transaction (org.apache.tephra.Transaction): 16
TransactionAware (org.apache.tephra.TransactionAware): 16
IOException (java.io.IOException): 14
Map (java.util.Map): 14
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 13
DatasetAdmin (io.cdap.cdap.api.dataset.DatasetAdmin): 12
WriteOnly (io.cdap.cdap.api.annotation.WriteOnly): 10
DimensionValue (io.cdap.cdap.api.dataset.lib.cube.DimensionValue): 10
HBaseTable (io.cdap.cdap.data2.dataset2.lib.table.hbase.HBaseTable): 10
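
The classes aggregated most often alongside Row (Table, Put, Get, Scanner) cover the basic write/read/scan cycle. A minimal sketch of that cycle, assuming a Table instance already obtained from the dataset framework; the String-keyed Put/Get constructors, the fluent Put.add calls, and the Bytes package path are assumptions, since the examples above do not exercise them directly:

import io.cdap.cdap.api.common.Bytes;
import io.cdap.cdap.api.dataset.table.Get;
import io.cdap.cdap.api.dataset.table.Put;
import io.cdap.cdap.api.dataset.table.Row;
import io.cdap.cdap.api.dataset.table.Scanner;
import io.cdap.cdap.api.dataset.table.Table;

// Writes one row, reads it back by key, then scans the full key range.
static void roundTrip(Table users) {
    users.put(new Put("user1").add("first", "samuel").add("last", "jackson"));

    // Table.get returns an empty Row (never null) when the key is absent.
    Row row = users.get(new Get("user1"));
    String first = row.getString("first");

    try (Scanner scanner = users.scan(null, null)) {
        Row next;
        while ((next = scanner.next()) != null) {
            byte[] last = next.get(Bytes.toBytes("last"));
        }
    }
}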