use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class UuidDeduplicationTransformTest method dedupesBasedOnReturnedUuid.
@Test
public void dedupesBasedOnReturnedUuid() {
  byte[] bytes = {(byte) 0x123, (byte) 0x456};
  // These messages have different uuids, so they would both appear in the output collection if
  // the extractor is not respected.
  SequencedMessage message1 = newMessage();
  SequencedMessage message2 = newMessage();
  TestStream<SequencedMessage> messageStream =
      TestStream.create(ProtoCoder.of(SequencedMessage.class))
          .advanceWatermarkTo(START)
          .addElements(message1, message2)
          .advanceWatermarkToInfinity();
  PCollection<SequencedMessage> results =
      pipeline
          .apply(messageStream)
          .apply(
              new UuidDeduplicationTransform(
                  UuidDeduplicationOptions.newBuilder()
                      .setUuidExtractor(message -> Uuid.of(ByteString.copyFrom(bytes)))
                      .build()));
  PAssert.that(results)
      .satisfies(
          messages -> {
            Preconditions.checkArgument(Iterables.size(messages) == 1);
            return null;
          });
  pipeline.run();
}
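The test forces both messages to the same uuid through a constant extractor, which is why exactly one element survives. In practice the extractor derives a distinct Uuid per message. A minimal sketch, assuming the Pub/Sub Lite SequencedMessage proto exposes its payload via getMessage().getData() and that setUuidExtractor accepts a SerializableFunction<SequencedMessage, Uuid> (both are assumptions, not taken from the test above):

// Hypothetical extractor: derive the Uuid from the message payload, so
// redeliveries of the same payload collapse to a single output element.
// getMessage().getData() is an assumption about the proto shape.
SerializableFunction<SequencedMessage, Uuid> payloadUuidExtractor =
    message -> Uuid.of(message.getMessage().getData());
UuidDeduplicationOptions options =
    UuidDeduplicationOptions.newBuilder()
        .setUuidExtractor(payloadUuidExtractor)
        .build();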
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class Neo4jIOIT method testLargeWriteUnwind.
@Test
public void testLargeWriteUnwind() throws Exception {
  final int startId = 5000;
  final int endId = 6000;
  // Create 1000 IDs
  List<Integer> idList = new ArrayList<>();
  for (int id = startId; id < endId; id++) {
    idList.add(id);
  }
  PCollection<Integer> idCollection = largeWriteUnwindPipeline.apply(Create.of(idList));
  // Every row is represented by a Map<String, Object> in the parameters map.
  // We accumulate the rows and 'unwind' those to Neo4j for performance reasons.
  SerializableFunction<Integer, Map<String, Object>> parametersFunction =
      id -> ImmutableMap.of("id", id, "name", "Casters", "firstName", "Matt");
  // 1000 rows with a batch size of 123 should trigger most scenarios we can think of.
  // We've put a unique constraint on Something.id.
  Neo4jIO.WriteUnwind<Integer> write =
      Neo4jIO.<Integer>writeUnwind()
          .withDriverConfiguration(
              Neo4jTestUtil.getDriverConfiguration(containerHostname, containerPort))
          .withSessionConfig(SessionConfig.forDatabase(Neo4jTestUtil.NEO4J_DATABASE))
          .withBatchSize(123)
          .withUnwindMapName("rows")
          .withCypher("UNWIND $rows AS row CREATE(n:Something { id : row.id })")
          .withParametersFunction(parametersFunction)
          .withCypherLogging();
  idCollection.apply(write);
  // Now run this pipeline
  PipelineResult pipelineResult = largeWriteUnwindPipeline.run();
  Assert.assertEquals(PipelineResult.State.DONE, pipelineResult.getState());
  try (Driver driver = Neo4jTestUtil.getDriver(containerHostname, containerPort)) {
    try (Session session = Neo4jTestUtil.getSession(driver, true)) {
      List<Integer> values =
          session.readTransaction(
              tx -> {
                List<Integer> v = null;
                int nrRows = 0;
                Result result =
                    tx.run("MATCH(n:Something) RETURN count(n), min(n.id), max(n.id)");
                while (result.hasNext()) {
                  Record record = result.next();
                  // count, min, max, plus the number of result rows seen (should be 1)
                  v =
                      Arrays.asList(
                          record.get(0).asInt(),
                          record.get(1).asInt(),
                          record.get(2).asInt(),
                          ++nrRows);
                }
                return v;
              });
      Assert.assertNotNull(values);
      assertThat(values, contains(endId - startId, startId, endId - 1, 1));
    }
  }
}
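The comments above explain why rows are accumulated and 'unwound': one UNWIND statement per batch is far cheaper than one round trip per row. As a rough illustration of what that batching amounts to, here is a sketch against the plain Neo4j Java driver (illustrative only, not Beam's internals; the bolt URI and credentials are placeholders):

// Batch-and-unwind by hand: buffer parameter maps, flush each full batch
// of 123 as a single UNWIND statement, then flush the final partial batch.
try (Driver driver =
        GraphDatabase.driver("bolt://localhost:7687", AuthTokens.basic("neo4j", "password"));
    Session session = driver.session()) {
  List<Map<String, Object>> buffer = new ArrayList<>();
  for (int id = 5000; id < 6000; id++) {
    buffer.add(ImmutableMap.of("id", id, "name", "Casters", "firstName", "Matt"));
    if (buffer.size() == 123) { // same batch size as the test
      List<Map<String, Object>> rows = new ArrayList<>(buffer);
      session.writeTransaction(
          tx ->
              tx.run(
                      "UNWIND $rows AS row CREATE(n:Something { id : row.id })",
                      Collections.singletonMap("rows", rows))
                  .consume());
      buffer.clear();
    }
  }
  if (!buffer.isEmpty()) { // final partial batch: 1000 % 123 = 16 rows
    List<Map<String, Object>> rows = new ArrayList<>(buffer);
    session.writeTransaction(
        tx ->
            tx.run(
                    "UNWIND $rows AS row CREATE(n:Something { id : row.id })",
                    Collections.singletonMap("rows", rows))
                .consume());
  }
}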
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class Neo4jIOIT method testWriteUnwind.
@Test
public void testWriteUnwind() throws Exception {
  PCollection<String> stringsCollections =
      writeUnwindPipeline.apply(Create.of(Arrays.asList("one", "two", "three")));
  // Every row is represented by a Map<String, Object> in the parameters map.
  // We accumulate the rows and 'unwind' those to Neo4j for performance reasons.
  SerializableFunction<String, Map<String, Object>> parametersMapper =
      name -> Collections.singletonMap("name", name);
  Neo4jIO.WriteUnwind<String> write =
      Neo4jIO.<String>writeUnwind()
          .withDriverConfiguration(
              Neo4jTestUtil.getDriverConfiguration(containerHostname, containerPort))
          .withSessionConfig(SessionConfig.forDatabase(Neo4jTestUtil.NEO4J_DATABASE))
          .withBatchSize(5000)
          .withUnwindMapName("rows")
          .withCypher("UNWIND $rows AS row MERGE(n:Num { name : row.name })")
          .withParametersFunction(parametersMapper)
          .withCypherLogging();
  stringsCollections.apply(write);
  // Now run this pipeline
  PipelineResult pipelineResult = writeUnwindPipeline.run();
  Assert.assertEquals(PipelineResult.State.DONE, pipelineResult.getState());
  try (Driver driver = Neo4jTestUtil.getDriver(containerHostname, containerPort)) {
    try (Session session = Neo4jTestUtil.getSession(driver, true)) {
      List<String> names =
          session.readTransaction(
              tx -> {
                List<String> list = new ArrayList<>();
                Result result = tx.run("MATCH(n:Num) RETURN n.name");
                while (result.hasNext()) {
                  Record record = result.next();
                  list.add(record.get(0).asString());
                }
                return list;
              });
      assertThat(names, containsInAnyOrder("one", "two", "three"));
    }
  }
}
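Unlike the CREATE in the previous test, the MERGE here makes the write idempotent: re-running the same batch produces no duplicate nodes. A small driver-level sketch of that property (the bolt URI and credentials are placeholders):

// Run the identical MERGE batch twice; MERGE matches existing nodes, so
// the node count stays at one per distinct name.
try (Driver driver =
        GraphDatabase.driver("bolt://localhost:7687", AuthTokens.basic("neo4j", "password"));
    Session session = driver.session()) {
  List<Map<String, Object>> rows =
      Arrays.asList(
          Collections.singletonMap("name", "one"),
          Collections.singletonMap("name", "two"));
  Map<String, Object> params = Collections.singletonMap("rows", rows);
  for (int i = 0; i < 2; i++) {
    session.writeTransaction(
        tx ->
            tx.run("UNWIND $rows AS row MERGE(n:Num { name : row.name })", params).consume());
  }
  long count =
      session.readTransaction(
          tx -> tx.run("MATCH(n:Num) RETURN count(n)").single().get(0).asLong());
  Assert.assertEquals(2L, count); // still one node per distinct name
}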
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class Neo4jIOIT method testParameterizedRead.
@Test
public void testParameterizedRead() throws Exception {
  PCollection<String> stringsCollections =
      parameterizedReadPipeline.apply(Create.of(Arrays.asList("one", "two", "three")));
  final Schema outputSchema =
      Schema.of(
          Schema.Field.of("One", Schema.FieldType.INT32),
          Schema.Field.of("Str", Schema.FieldType.STRING));
  SerializableFunction<String, Map<String, Object>> parametersFunction =
      string -> Collections.singletonMap("par1", string);
  Neo4jIO.RowMapper<Row> rowMapper =
      record -> {
        int one = record.get(0).asInt();
        String string = record.get(1).asString();
        return Row.withSchema(outputSchema).attachValues(one, string);
      };
  Neo4jIO.ReadAll<String, Row> read =
      Neo4jIO.<String, Row>readAll()
          .withCypher("RETURN 1, $par1")
          .withDriverConfiguration(
              Neo4jTestUtil.getDriverConfiguration(containerHostname, containerPort))
          .withSessionConfig(SessionConfig.forDatabase(Neo4jTestUtil.NEO4J_DATABASE))
          .withRowMapper(rowMapper)
          .withParametersFunction(parametersFunction)
          .withCoder(SerializableCoder.of(Row.class))
          .withCypherLogging();
  PCollection<Row> outputRows = stringsCollections.apply(read);
  PCollection<String> outputLines =
      outputRows.apply(ParDo.of(new ParameterizedReadRowToLineFn()));
  PAssert.that(outputLines).containsInAnyOrder("1,one", "1,two", "1,three");
  // Now run this pipeline
  PipelineResult pipelineResult = parameterizedReadPipeline.run();
  Assert.assertEquals(PipelineResult.State.DONE, pipelineResult.getState());
}
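ParameterizedReadRowToLineFn is not shown in this excerpt. Given the "1,one" style strings asserted by PAssert, it presumably just joins the two Row fields with a comma; the following is a reconstruction under that assumption (only the class name comes from the test above):

// Sketch of the DoFn used above: formats each Row from the read as
// "<int>,<string>", matching the PAssert expectations.
static class ParameterizedReadRowToLineFn extends DoFn<Row, String> {
  @ProcessElement
  public void processElement(@Element Row row, OutputReceiver<String> out) {
    out.output(row.getInt32(0) + "," + row.getString(1));
  }
}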
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class DataflowGroupByKeyTest method testGroupByKeyServiceUnbounded.
@Test
public void testGroupByKeyServiceUnbounded() {
  Pipeline p = createTestServiceRunner();
  PCollection<KV<String, Integer>> input =
      p.apply(
          new PTransform<PBegin, PCollection<KV<String, Integer>>>() {
            @Override
            public PCollection<KV<String, Integer>> expand(PBegin input) {
              return PCollection.createPrimitiveOutputInternal(
                  input.getPipeline(),
                  WindowingStrategy.globalDefault(),
                  PCollection.IsBounded.UNBOUNDED,
                  KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));
            }
          });
  thrown.expect(IllegalStateException.class);
  thrown.expectMessage(
      "GroupByKey cannot be applied to non-bounded PCollection in the GlobalWindow without "
          + "a trigger. Use a Window.into or Window.triggering transform prior to GroupByKey.");
  input.apply("GroupByKey", GroupByKey.create());
}
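The expected error message points at the fix: window or trigger the unbounded input before grouping. A minimal sketch of a compliant fragment (fixed one-minute windows are an arbitrary choice for illustration):

import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.joda.time.Duration;

// Windowing the unbounded input first satisfies the GroupByKey precondition.
PCollection<KV<String, Integer>> windowed =
    input.apply(
        Window.<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(1))));
PCollection<KV<String, Iterable<Integer>>> grouped =
    windowed.apply("GroupByKey", GroupByKey.create());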