use of org.apache.samza.operators.KV in project samza by apache.
the class JoinTranslator method getTable.
private Table getTable(JoinInputNode tableNode, TranslatorContext context) {
SqlIOConfig sourceTableConfig = resolveSQlIOForTable(tableNode.getRelNode(), context.getExecutionContext().getSamzaSqlApplicationConfig().getInputSystemStreamConfigBySource());
if (sourceTableConfig == null || !sourceTableConfig.getTableDescriptor().isPresent()) {
String errMsg = "Failed to resolve table source in join operation: node=" + tableNode.getRelNode();
log.error(errMsg);
throw new SamzaException(errMsg);
}
Table<KV<SamzaSqlRelRecord, SamzaSqlRelMessage>> table = context.getStreamAppDescriptor().getTable(sourceTableConfig.getTableDescriptor().get());
if (tableNode.isRemoteTable()) {
return table;
}
// If local table, load the table.
// Load the local table with the fields in the join condition as composite key and relational message as the value.
// Send the messages from the input stream denoted as 'table' to the created table store.
MessageStream<SamzaSqlRelMessage> relOutputStream = context.getMessageStream(tableNode.getRelNode().getId());
SamzaSqlRelRecordSerdeFactory.SamzaSqlRelRecordSerde keySerde = (SamzaSqlRelRecordSerdeFactory.SamzaSqlRelRecordSerde) new SamzaSqlRelRecordSerdeFactory().getSerde(null, null);
SamzaSqlRelMessageSerdeFactory.SamzaSqlRelMessageSerde valueSerde = (SamzaSqlRelMessageSerdeFactory.SamzaSqlRelMessageSerde) new SamzaSqlRelMessageSerdeFactory().getSerde(null, null);
List<Integer> tableKeyIds = tableNode.getKeyIds();
// Let's always repartition by the join fields as key before sending the key and value to the table.
// We need to repartition the stream denoted as table to ensure that both the stream and table that are joined
// have the same partitioning scheme with the same partition key and number. Please note that bootstrap semantic is
// not propagated to the intermediate streams. Please refer SAMZA-1613 for more details on this. Subsequently, the
// results are consistent only after the local table is caught up.
relOutputStream.partitionBy(m -> createSamzaSqlCompositeKey(m, tableKeyIds), m -> m, KVSerde.of(keySerde, valueSerde), intermediateStreamPrefix + "table_" + logicalOpId).sendTo(table);
return table;
}
use of org.apache.samza.operators.KV in project samza by apache.
the class TestAvroRelConversion method validateAvroSerializedData.
private void validateAvroSerializedData(byte[] serializedData, Object unionValue) throws IOException {
GenericRecord complexRecordValue = genericRecordFromBytes(serializedData, ComplexRecord.SCHEMA$);
SamzaSqlRelMessage message = complexRecordAvroRelConverter.convertToRelMessage(new KV<>("key", complexRecordValue));
Assert.assertEquals(message.getSamzaSqlRelRecord().getFieldNames().size(), ComplexRecord.SCHEMA$.getFields().size() + 1);
Assert.assertEquals(message.getSamzaSqlRelRecord().getField("id").get(), id);
Assert.assertEquals(message.getSamzaSqlRelRecord().getField("bool_value").get(), boolValue);
Assert.assertEquals(message.getSamzaSqlRelRecord().getField("double_value").get(), doubleValue);
Assert.assertEquals(message.getSamzaSqlRelRecord().getField("string_value").get(), new Utf8(testStrValue));
Assert.assertEquals(message.getSamzaSqlRelRecord().getField("float_value0").get(), floatValue);
Assert.assertEquals(message.getSamzaSqlRelRecord().getField("long_value").get(), longValue);
if (unionValue instanceof String) {
Assert.assertEquals(message.getSamzaSqlRelRecord().getField("union_value").get(), new Utf8((String) unionValue));
} else {
Assert.assertEquals(message.getSamzaSqlRelRecord().getField("union_value").get(), unionValue);
}
Assert.assertTrue(arrayValue.stream().map(Utf8::new).collect(Collectors.toList()).equals(message.getSamzaSqlRelRecord().getField("array_values").get()));
Assert.assertTrue(mapValue.entrySet().stream().collect(Collectors.toMap(x -> new Utf8(x.getKey()), y -> new Utf8(y.getValue()))).equals(message.getSamzaSqlRelRecord().getField("map_values").get()));
Assert.assertTrue(Arrays.equals(((ByteString) message.getSamzaSqlRelRecord().getField("bytes_value").get()).getBytes(), testBytes.array()));
Assert.assertTrue(Arrays.equals(((ByteString) message.getSamzaSqlRelRecord().getField("fixed_value").get()).getBytes(), DEFAULT_TRACKING_ID_BYTES));
LOG.info(message.toString());
KV<Object, Object> samzaMessage = complexRecordAvroRelConverter.convertToSamzaMessage(message);
GenericRecord record = (GenericRecord) samzaMessage.getValue();
for (Schema.Field field : ComplexRecord.SCHEMA$.getFields()) {
if (field.name().equals("array_values")) {
Assert.assertTrue(record.get(field.name()).equals(complexRecordValue.get(field.name())));
} else {
Object expected = complexRecordValue.get(field.name());
Assert.assertEquals(expected, record.get(field.name()));
}
}
}
use of org.apache.samza.operators.KV in project samza by apache.
the class TestRemoteTableWithBatchEndToEnd method doTestStreamTableJoinRemoteTablePartialUpdates.
private void doTestStreamTableJoinRemoteTablePartialUpdates(String testName, boolean isCompactBatch) throws Exception {
final InMemoryWriteFunction writer = new InMemoryWriteFunction(testName);
BATCH_READS.put(testName, new AtomicInteger());
BATCH_WRITES.put(testName, new AtomicInteger());
WRITTEN_RECORDS.put(testName, new HashMap<>());
int count = 16;
int batchSize = 4;
String profiles = Base64Serializer.serialize(generateProfiles(count));
final RateLimiter readRateLimiter = mock(RateLimiter.class, withSettings().serializable());
final RateLimiter writeRateLimiter = mock(RateLimiter.class, withSettings().serializable());
final TableRateLimiter.CreditFunction creditFunction = (k, v, args) -> 1;
final StreamApplication app = appDesc -> {
RemoteTableDescriptor<Integer, Profile, Void> inputTableDesc = new RemoteTableDescriptor<>("profile-table-1");
inputTableDesc.withReadFunction(InMemoryReadFunction.getInMemoryReadFunction(testName, profiles)).withRateLimiter(readRateLimiter, creditFunction, null);
// dummy reader
TableReadFunction<Integer, EnrichedPageView> readFn = new MyReadFunction();
RemoteTableDescriptor<Integer, EnrichedPageView, EnrichedPageView> outputTableDesc = new RemoteTableDescriptor<>("enriched-page-view-table-1");
outputTableDesc.withReadFunction(readFn).withWriteFunction(writer).withRateLimiter(writeRateLimiter, creditFunction, creditFunction);
if (isCompactBatch) {
outputTableDesc.withBatchProvider(new CompactBatchProvider<Integer, EnrichedPageView, EnrichedPageView>().withMaxBatchSize(batchSize).withMaxBatchDelay(Duration.ofHours(1)));
} else {
outputTableDesc.withBatchProvider(new CompleteBatchProvider<Integer, EnrichedPageView, EnrichedPageView>().withMaxBatchSize(batchSize).withMaxBatchDelay(Duration.ofHours(1)));
}
Table<KV<Integer, EnrichedPageView>> table = appDesc.getTable(outputTableDesc);
Table<KV<Integer, Profile>> inputTable = appDesc.getTable(inputTableDesc);
DelegatingSystemDescriptor ksd = new DelegatingSystemDescriptor("test");
GenericInputDescriptor<PageView> isd = ksd.getInputDescriptor("PageView", new NoOpSerde<>());
appDesc.getInputStream(isd).map(pv -> new KV<>(pv.getMemberId(), pv)).join(inputTable, new PageViewToProfileJoinFunction()).map(m -> new KV<>(m.getMemberId(), UpdateMessage.of(m, m))).sendTo(table, UpdateOptions.UPDATE_WITH_DEFAULTS);
};
InMemorySystemDescriptor isd = new InMemorySystemDescriptor("test");
InMemoryInputDescriptor<PageView> inputDescriptor = isd.getInputDescriptor("PageView", new NoOpSerde<>());
TestRunner.of(app).addInputStream(inputDescriptor, Arrays.asList(generatePageViewsWithDistinctKeys(count))).addConfig("task.max.concurrency", String.valueOf(count)).addConfig("task.async.commit", String.valueOf(true)).run(Duration.ofSeconds(10));
Assert.assertEquals(count, WRITTEN_RECORDS.get(testName).size());
Assert.assertNotNull(WRITTEN_RECORDS.get(testName).get(0));
Assert.assertEquals(count / batchSize, BATCH_WRITES.get(testName).get());
}
use of org.apache.samza.operators.KV in project samza by apache.
the class TestRemoteTableEndToEnd method getCachingTable.
private <K, V> Table<KV<K, V>> getCachingTable(TableDescriptor<K, V, ?> actualTableDesc, boolean defaultCache, StreamApplicationDescriptor appDesc) {
String id = actualTableDesc.getTableId();
CachingTableDescriptor<K, V> cachingDesc;
if (defaultCache) {
cachingDesc = new CachingTableDescriptor<>("caching-table-" + id, actualTableDesc);
cachingDesc.withReadTtl(Duration.ofMinutes(5));
cachingDesc.withWriteTtl(Duration.ofMinutes(5));
} else {
GuavaCacheTableDescriptor<K, V> guavaTableDesc = new GuavaCacheTableDescriptor<>("guava-table-" + id);
guavaTableDesc.withCache(CacheBuilder.newBuilder().expireAfterAccess(5, TimeUnit.MINUTES).build());
cachingDesc = new CachingTableDescriptor<>("caching-table-" + id, actualTableDesc, guavaTableDesc);
}
return appDesc.getTable(cachingDesc);
}
use of org.apache.samza.operators.KV in project samza by apache.
the class TestRemoteTableEndToEnd method doTestStreamTableJoinRemoteTable.
private void doTestStreamTableJoinRemoteTable(boolean withCache, boolean defaultCache, boolean withUpdate, String testName) throws Exception {
WRITTEN_RECORDS.put(testName, new ArrayList<>());
// max member id for page views is 10
final String profiles = Base64Serializer.serialize(generateProfiles(10));
final RateLimiter readRateLimiter = mock(RateLimiter.class, withSettings().serializable());
final TableRateLimiter.CreditFunction creditFunction = (k, v, args) -> 1;
final StreamApplication app = appDesc -> {
final RemoteTableDescriptor joinTableDesc = new RemoteTableDescriptor<Integer, TestTableData.Profile, Void>("profile-table-1").withReadFunction(InMemoryProfileReadFunction.getInMemoryReadFunction(profiles)).withRateLimiter(readRateLimiter, creditFunction, null);
final RemoteTableDescriptor outputTableDesc = new RemoteTableDescriptor<Integer, EnrichedPageView, EnrichedPageView>("enriched-page-view-table-1").withReadFunction(new NoOpTableReadFunction<>()).withReadRateLimiterDisabled().withWriteFunction(new InMemoryEnrichedPageViewWriteFunction(testName)).withWriteRateLimit(1000);
final Table<KV<Integer, Profile>> outputTable = withCache ? getCachingTable(outputTableDesc, defaultCache, appDesc) : appDesc.getTable(outputTableDesc);
final Table<KV<Integer, Profile>> joinTable = withCache ? getCachingTable(joinTableDesc, defaultCache, appDesc) : appDesc.getTable(joinTableDesc);
final DelegatingSystemDescriptor ksd = new DelegatingSystemDescriptor("test");
final GenericInputDescriptor<PageView> isd = ksd.getInputDescriptor("PageView", new NoOpSerde<>());
if (withUpdate) {
appDesc.getInputStream(isd).map(pv -> new KV<>(pv.getMemberId(), pv)).join(joinTable, new PageViewToProfileJoinFunction()).map(m -> new KV(m.getMemberId(), UpdateMessage.of(m, m))).sendTo(outputTable, UpdateOptions.UPDATE_WITH_DEFAULTS);
} else {
appDesc.getInputStream(isd).map(pv -> new KV<>(pv.getMemberId(), pv)).join(joinTable, new PageViewToProfileJoinFunction()).map(m -> KV.of(m.getMemberId(), m)).sendTo(outputTable);
}
};
int numPageViews = 40;
InMemorySystemDescriptor isd = new InMemorySystemDescriptor("test");
InMemoryInputDescriptor<PageView> inputDescriptor = isd.getInputDescriptor("PageView", new NoOpSerde<>());
TestRunner.of(app).addInputStream(inputDescriptor, TestTableData.generatePartitionedPageViews(numPageViews, 4)).run(Duration.ofSeconds(10));
Assert.assertEquals(numPageViews, WRITTEN_RECORDS.get(testName).size());
Assert.assertNotNull(WRITTEN_RECORDS.get(testName).get(0));
WRITTEN_RECORDS.get(testName).forEach(epv -> Assert.assertFalse(epv.company.contains("-")));
}
Aggregations