Use of io.cdap.cdap.api.dataset.table.Row in project hydrator-plugins by cdapio.
The class SparkPluginTest, method testFileSource.
@Test
public void testFileSource() throws Exception {
  Schema schema = Schema.recordOf(
    "user",
    Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("first", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("last", Schema.of(Schema.Type.STRING)));
  File folder = tmpFolder.newFolder("fileSourceTest");
  File input1 = new File(folder, "input1.txt");
  File input2 = new File(folder, "input2.csv");
  File ignore1 = new File(folder, "input1.txt.done");
  File ignore2 = new File(folder, "input1");
  CharStreams.write("1,samuel,jackson\n2,dwayne,johnson", Files.newWriterSupplier(input1, Charsets.UTF_8));
  CharStreams.write("3,christopher,walken", Files.newWriterSupplier(input2, Charsets.UTF_8));
  CharStreams.write("0,nicolas,cage", Files.newWriterSupplier(ignore1, Charsets.UTF_8));
  CharStreams.write("0,orlando,bloom", Files.newWriterSupplier(ignore2, Charsets.UTF_8));
  Map<String, String> properties = ImmutableMap.<String, String>builder()
    .put("path", folder.getAbsolutePath())
    .put("format", "csv")
    .put("schema", schema.toString())
    .put("referenceName", "fileSourceTestInput")
    .put("ignoreThreshold", "300")
    .put("extensions", "txt,csv")
    .build();
  DataStreamsConfig pipelineCfg = DataStreamsConfig.builder()
    .addStage(new ETLStage("source", new ETLPlugin("File", StreamingSource.PLUGIN_TYPE, properties, null)))
    .addStage(new ETLStage("sink", MockSink.getPlugin("fileOutput")))
    .addConnection("source", "sink")
    .setBatchInterval("1s")
    .build();
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(DATASTREAMS_ARTIFACT, pipelineCfg);
  ApplicationId appId = NamespaceId.DEFAULT.app("FileSourceApp");
  ApplicationManager appManager = deployApplication(appId, appRequest);
  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkManager.start();
  sparkManager.waitForRun(ProgramRunStatus.RUNNING, 1, TimeUnit.MINUTES);
  Map<Long, String> expected = ImmutableMap.of(
    1L, "samuel jackson",
    2L, "dwayne johnson",
    3L, "christopher walken");
  final DataSetManager<Table> outputManager = getDataset("fileOutput");
  Tasks.waitFor(true, () -> {
    outputManager.flush();
    Map<Long, String> actual = new HashMap<>();
    for (StructuredRecord outputRecord : MockSink.readOutput(outputManager)) {
      actual.put(outputRecord.get("id"), outputRecord.get("first") + " " + outputRecord.get("last"));
    }
    return expected.equals(actual);
  }, 4, TimeUnit.MINUTES);
  // now write a new file to make sure new files are picked up.
  File input3 = new File(folder, "input3.txt");
  CharStreams.write("4,terry,crews\n5,rocky,balboa", Files.newWriterSupplier(input3, Charsets.UTF_8));
  Map<Long, String> expected2 = ImmutableMap.of(4L, "terry crews", 5L, "rocky balboa");
  Table outputTable = outputManager.get();
  Scanner scanner = outputTable.scan(null, null);
  Row row;
  while ((row = scanner.next()) != null) {
    outputTable.delete(row.getRow());
  }
  outputManager.flush();
  Tasks.waitFor(true, () -> {
    outputManager.flush();
    Map<Long, String> actual = new HashMap<>();
    for (StructuredRecord outputRecord : MockSink.readOutput(outputManager)) {
      actual.put(outputRecord.get("id"), outputRecord.get("first") + " " + outputRecord.get("last"));
    }
    return expected2.equals(actual);
  }, 4, TimeUnit.MINUTES);
  sparkManager.stop();
}
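The scan-and-delete step between the two waits is the only place this test touches the Row API directly: it walks the whole MockSink table and removes every row so that the second assertion only sees records produced from the newly added file. Below is a minimal sketch of that idiom pulled out into a standalone helper; the method name and the try-with-resources around the Scanner are mine, not the test's.

import io.cdap.cdap.api.dataset.table.Row;
import io.cdap.cdap.api.dataset.table.Scanner;
import io.cdap.cdap.api.dataset.table.Table;

// Hypothetical helper: delete every row in a Table by scanning the full key range.
private static void clearTable(Table table) {
  try (Scanner scanner = table.scan(null, null)) {  // null start/stop row = scan everything
    Row row;
    while ((row = scanner.next()) != null) {
      table.delete(row.getRow());  // getRow() returns the raw row key bytes
    }
  }
}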
Use of io.cdap.cdap.api.dataset.table.Row in project cdap by caskdata.
The class MapReduceWithPartitionedTest, method testTimePartitionedWithMR.
@Test
public void testTimePartitionedWithMR() throws Exception {
  final ApplicationWithPrograms app = deployApp(AppWithTimePartitionedFileSet.class);
  // write a value to the input table
  final Table table = datasetCache.getDataset(AppWithTimePartitionedFileSet.INPUT);
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        table.put(Bytes.toBytes("x"), AppWithTimePartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("1"));
      }
    });
  final long time = DATE_FORMAT.parse("1/15/15 11:15 am").getTime();
  final long time5 = time + TimeUnit.MINUTES.toMillis(5);
  // run the partition writer m/r with this output partition time
  Map<String, String> runtimeArguments = Maps.newHashMap();
  Map<String, String> outputArgs = Maps.newHashMap();
  TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, time);
  final ImmutableMap<String, String> assignedMetadata =
    ImmutableMap.of("region", "13", "data.source.name", "input", "data.source.type", "table");
  TimePartitionedFileSetArguments.setOutputPartitionMetadata(outputArgs, assignedMetadata);
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs));
  Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
  // this should have created a partition in the tpfs
  final TimePartitionedFileSet tpfs = datasetCache.getDataset(TIME_PARTITIONED);
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) tpfs)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        TimePartitionDetail partition = tpfs.getPartitionByTime(time);
        Assert.assertNotNull(partition);
        String path = partition.getRelativePath();
        Assert.assertNotNull(path);
        Assert.assertTrue(path.contains("2015-01-15/11-15"));
        Assert.assertEquals(assignedMetadata, partition.getMetadata().asMap());
      }
    });
  // delete the data in the input table and write a new row
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        table.delete(Bytes.toBytes("x"));
        table.put(Bytes.toBytes("y"), AppWithTimePartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("2"));
      }
    });
  // now run the m/r again with a new partition time, say 5 minutes later
  TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, time5);
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs));
  // make the mapreduce add the partition in destroy, to validate that this does not fail the job
  runtimeArguments.put(AppWithTimePartitionedFileSet.COMPAT_ADD_PARTITION, "true");
  Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
  // this should have created a partition in the tpfs
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) tpfs)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        Partition partition = tpfs.getPartitionByTime(time5);
        Assert.assertNotNull(partition);
        String path = partition.getRelativePath();
        Assert.assertNotNull(path);
        Assert.assertTrue(path.contains("2015-01-15/11-20"));
      }
    });
  // now run a map/reduce that reads all the partitions
  runtimeArguments = Maps.newHashMap();
  Map<String, String> inputArgs = Maps.newHashMap();
  TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(5));
  TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time5 + TimeUnit.MINUTES.toMillis(5));
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
  runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "a");
  Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
  // this should have read both partitions - and written both x and y to row a
  final Table output = datasetCache.getDataset(AppWithTimePartitionedFileSet.OUTPUT);
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        Row row = output.get(Bytes.toBytes("a"));
        Assert.assertEquals("1", row.getString("x"));
        Assert.assertEquals("2", row.getString("y"));
      }
    });
  // now run a map/reduce that reads a range of the partitions, namely the first one
  TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(5));
  TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time + TimeUnit.MINUTES.toMillis(2));
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
  runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "b");
  Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
  // this should have read the first partition only - and written only x to row b
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        Row row = output.get(Bytes.toBytes("b"));
        Assert.assertEquals("1", row.getString("x"));
        Assert.assertNull(row.get("y"));
      }
    });
  // now run a map/reduce that reads no partitions (because the range matches nothing)
  TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(10));
  TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time - TimeUnit.MINUTES.toMillis(9));
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
  runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "n");
  Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
  // this should have read no partitions - and written nothing to row n
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        Row row = output.get(Bytes.toBytes("n"));
        Assert.assertTrue(row.isEmpty());
      }
    });
}
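The three read-back blocks at the end all follow the same Row pattern: Table.get(rowKey) returns a Row, getString(column) yields the written value or null when the column is absent, and a row that was never written comes back as an empty Row rather than null, which is why the last block asserts isEmpty() for row "n". Below is a compact sketch of that verification pattern; the helper name and the expected-value map are illustrative and not part of the test.

import java.util.Map;
import io.cdap.cdap.api.common.Bytes;
import io.cdap.cdap.api.dataset.table.Row;
import io.cdap.cdap.api.dataset.table.Table;
import org.junit.Assert;

// Hypothetical helper: assert that a row contains exactly the expected column -> value strings.
private static void assertRowEquals(Table output, String rowKey, Map<String, String> expected) {
  Row row = output.get(Bytes.toBytes(rowKey));
  if (expected.isEmpty()) {
    // a row that was never written is returned as an empty Row, not as null
    Assert.assertTrue(row.isEmpty());
    return;
  }
  Assert.assertEquals(expected.size(), row.getColumns().size());
  for (Map.Entry<String, String> e : expected.entrySet()) {
    Assert.assertEquals(e.getValue(), row.getString(e.getKey()));
  }
}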
Use of io.cdap.cdap.api.dataset.table.Row in project cdap by caskdata.
The class FactTableTest, method testPreSplits.
@Test
public void testPreSplits() throws Exception {
  InMemoryTableService.create("presplitEntityTable");
  InMemoryTableService.create("presplitDataTable");
  int resolution = 10;
  int rollTimebaseInterval = 2;
  InMemoryMetricsTable metricsTable = new InMemoryMetricsTable("presplitDataTable");
  FactTable table = new FactTable(metricsTable,
                                  new EntityTable(new InMemoryMetricsTable("presplitEntityTable")),
                                  resolution, rollTimebaseInterval);
  byte[][] splits = FactTable.getSplits(3);
  long ts = System.currentTimeMillis() / 1000;
  DimensionValue dimVal1 = new DimensionValue("dim1", "value1");
  DimensionValue dimVal2 = new DimensionValue("dim2", "value2");
  DimensionValue dimVal3 = new DimensionValue("dim3", "value3");
  // first agg view: dim1
  table.add(ImmutableList.of(new Fact(ts, ImmutableList.of(dimVal1), new Measurement("metric1", MeasureType.COUNTER, 1))));
  // second agg view: dim1 & dim2
  table.add(ImmutableList.of(new Fact(ts, ImmutableList.of(dimVal1, dimVal2), new Measurement("metric1", MeasureType.COUNTER, 1))));
  // third agg view: dim3
  table.add(ImmutableList.of(new Fact(ts, ImmutableList.of(dimVal3), new Measurement("metric1", MeasureType.COUNTER, 1))));
  // Verify all written records are spread across splits
  Scanner scanner = metricsTable.scan(null, null, null);
  Row row;
  Set<Integer> splitsWithRows = Sets.newHashSet();
  while ((row = scanner.next()) != null) {
    boolean added = false;
    for (int i = 0; i < splits.length; i++) {
      if (Bytes.compareTo(row.getRow(), splits[i]) < 0) {
        splitsWithRows.add(i);
        added = true;
        break;
      }
    }
    if (!added) {
      // falls into last split
      splitsWithRows.add(splits.length);
    }
  }
  Assert.assertEquals(3, splitsWithRows.size());
}
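The split-assignment loop is plain lexicographic bucketing: a row belongs to split i if its key sorts before the i-th split boundary, and to the last split otherwise. The same logic as a small standalone helper is sketched below; the method name is mine, while Bytes.compareTo is the unsigned byte-array comparison already used by the test.

import io.cdap.cdap.api.common.Bytes;

// Hypothetical helper: index of the split range a row key falls into,
// given split boundaries sorted in ascending order (n boundaries => n + 1 ranges).
private static int splitIndex(byte[] rowKey, byte[][] splits) {
  for (int i = 0; i < splits.length; i++) {
    if (Bytes.compareTo(rowKey, splits[i]) < 0) {
      return i;
    }
  }
  return splits.length;  // key sorts at or after the last boundary
}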
Use of io.cdap.cdap.api.dataset.table.Row in project cdap by caskdata.
The class LevelDBTableCoreTest, method testScan.
@Test
public void testScan() throws Exception {
  String tableName = "testScanTable";
  TableId tableId = TableId.from("default", tableName);
  {
    Assert.assertNull(service.getTableStats().get(tableId));
    service.ensureTableExists(tableName);
    LevelDBTableCore table = new LevelDBTableCore(tableName, service);
    int numRows = 8;
    int numVersion = 8;
    // Write data to multiple rows and multiple versions per row and col.
    writeData(table, rowNamePrefix, numRows, colName, 1024, numVersion);
    // Scan only the first row and make sure no data from other rows is returned.
    try (Scanner scanner = table.scan(getRowName(rowNamePrefix, 0).getBytes(StandardCharsets.UTF_8),
                                      getRowName(rowNamePrefix, 1).getBytes(StandardCharsets.UTF_8),
                                      null, null, null)) {
      Row row;
      while ((row = scanner.next()) != null) {
        String rowName = new String(row.getRow(), StandardCharsets.UTF_8);
        Assert.assertTrue(rowName.equals(getRowName(rowNamePrefix, 0)));
      }
    }
    // Test a corner case by writing to row i + 1 at the default version (i.e. max version) and scanning row i:
    // the scan uses the max version at row i + 1 as its end key (excluded), and we want to make sure
    // nothing from row i + 1 gets returned in that case.
    writeRowColDefaultVersion(table, getRowName(rowNamePrefix, 1), colName, "dummy-value");
    try (Scanner scanner = table.scan(getRowName(rowNamePrefix, 0).getBytes(StandardCharsets.UTF_8),
                                      getRowName(rowNamePrefix, 1).getBytes(StandardCharsets.UTF_8),
                                      null, null, null)) {
      Row row;
      while ((row = scanner.next()) != null) {
        String rowName = new String(row.getRow(), StandardCharsets.UTF_8);
        Assert.assertTrue(rowName.equals(getRowName(rowNamePrefix, 0)));
      }
    }
    service.dropTable(tableName);
  }
}
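Both scans pass an inclusive start row and an exclusive stop row, so a scan from row 0 to row 1 should yield row 0 only, even after row 1 has been written at the default (max) version. A small sketch of a helper that collects the row names returned by such a scan follows; the helper itself is illustrative, and the five-argument scan call simply mirrors the one used in the test above with the remaining arguments left null.

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import io.cdap.cdap.api.dataset.table.Row;
import io.cdap.cdap.api.dataset.table.Scanner;

// Hypothetical helper: collect the row names seen between startRow (inclusive) and stopRow (exclusive).
private static List<String> scanRowNames(LevelDBTableCore table, String startRow, String stopRow) throws Exception {
  List<String> names = new ArrayList<>();
  try (Scanner scanner = table.scan(startRow.getBytes(StandardCharsets.UTF_8),
                                    stopRow.getBytes(StandardCharsets.UTF_8),
                                    null, null, null)) {
    Row row;
    while ((row = scanner.next()) != null) {
      names.add(new String(row.getRow(), StandardCharsets.UTF_8));
    }
  }
  return names;
}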
Use of io.cdap.cdap.api.dataset.table.Row in project cdap by caskdata.
The class TableTest, method verifyScanWithFuzzyRowFilter.
private static void verifyScanWithFuzzyRowFilter(Table table) {
  FuzzyRowFilter filter = new FuzzyRowFilter(ImmutableList.of(ImmutablePair.of(
    new byte[] { '*', 'b', '*', 'b' },
    new byte[] { 0x01, 0x00, 0x01, 0x00 })));
  Scanner scanner = table.scan(new Scan(null, null, filter));
  int count = 0;
  while (true) {
    Row entry = scanner.next();
    if (entry == null) {
      break;
    }
    Assert.assertTrue(entry.getRow()[1] == 'b' && entry.getRow()[3] == 'b');
    Assert.assertEquals(1, entry.getColumns().size());
    Assert.assertTrue(entry.getColumns().containsKey(C1));
    Assert.assertArrayEquals(V1, entry.get(C1));
    count++;
  }
  Assert.assertEquals(9, count);
}
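The filter takes pairs of a fuzzy row key and a mask of the same length: a mask byte of 0x00 marks a position that must match the key byte exactly, and 0x01 marks a position where any byte is accepted. The pair above therefore keeps rows whose second and fourth bytes are 'b', which the assertion on entry.getRow() re-checks. A minimal sketch of a different mask, assuming the same imports and the four-byte row keys used by the test (the helper name and the chosen pattern are illustrative):

// Hypothetical helper: scan only rows whose first byte is 'a', whatever the other three bytes are.
private static Scanner scanRowsStartingWithA(Table table) {
  FuzzyRowFilter filter = new FuzzyRowFilter(ImmutableList.of(ImmutablePair.of(
    new byte[] { 'a', '*', '*', '*' },          // fuzzy key: only position 0 is meaningful
    new byte[] { 0x00, 0x01, 0x01, 0x01 })));   // 0x00 = fixed byte, 0x01 = any byte
  return table.scan(new Scan(null, null, filter));
}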