use of org.apache.beam.examples.complete.datatokenization.utils.RowToCsv in project beam by apache.
the class DataTokenizationTest method testRowToCSVWithNull.
@Test
public void testRowToCSVWithNull() {
  final String nullableTestSchema =
      "{\"fields\":[{\"mode\":\"REQUIRED\",\"name\":\"FieldName1\",\"type\":\"STRING\"},{\"mode\":\"NULLABLE\",\"name\":\"FieldName2\",\"type\":\"STRING\"}]}";
  final String expectedCsv = "TestValueOne;null";
  List<Object> values = Lists.newArrayList("TestValueOne", null);

  Schema beamSchema = new SchemasUtils(nullableTestSchema).getBeamSchema();
  Row.Builder rowBuilder = Row.withSchema(beamSchema);
  Row row = rowBuilder.addValues(values).build();

  String csvResult = new RowToCsv(";").getCsvFromRow(row);
  Assert.assertEquals(expectedCsv, csvResult);
}
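RowToCsv itself is not shown in this listing. A minimal sketch of the idea, assuming the converter simply joins the row's values with the configured delimiter and renders missing values through String.valueOf (which would explain why the test above expects the literal string "null" for the NULLABLE field), could look like the following; it is an illustration, not necessarily the actual Beam implementation.

import java.util.stream.Collectors;
import org.apache.beam.sdk.values.Row;

/** Illustrative sketch only; the real class lives in the datatokenization utils package. */
public class RowToCsvSketch {

  private final String csvDelimiter;

  public RowToCsvSketch(String csvDelimiter) {
    this.csvDelimiter = csvDelimiter;
  }

  public String getCsvFromRow(Row row) {
    // String.valueOf(null) produces the literal "null", matching the
    // "TestValueOne;null" expectation for nullable fields in the test above.
    return row.getValues().stream()
        .map(String::valueOf)
        .collect(Collectors.joining(csvDelimiter));
  }
}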
use of org.apache.beam.examples.complete.datatokenization.utils.RowToCsv in project beam by apache.
the class DataTokenizationTest method testRowToCSV.
@Test
public void testRowToCSV() {
  Schema beamSchema = new SchemasUtils(testSchema).getBeamSchema();
  Row.Builder rowBuilder = Row.withSchema(beamSchema);
  Row row = rowBuilder.addValues(new ArrayList<>(Arrays.asList(fields))).build();

  String csvResult = new RowToCsv(";").getCsvFromRow(row);
  Assert.assertEquals(String.join(";", fields), csvResult);
}
use of org.apache.beam.examples.complete.datatokenization.utils.RowToCsv in project beam by apache.
the class DataTokenization method run.
/**
* Runs the pipeline to completion with the specified options.
*
* @param options The execution options.
* @return The pipeline result.
*/
@SuppressWarnings({ "dereference.of.nullable", "argument.type.incompatible" })
public static PipelineResult run(DataTokenizationOptions options) {
SchemasUtils schema = null;
try {
schema = new SchemasUtils(options.getDataSchemaPath(), StandardCharsets.UTF_8);
} catch (IOException e) {
LOG.error("Failed to retrieve schema for data.", e);
}
checkArgument(schema != null, "Data schema is mandatory.");
// Create the pipeline
Pipeline pipeline = Pipeline.create(options);
// Register the coder for pipeline
CoderRegistry coderRegistry = pipeline.getCoderRegistry();
coderRegistry.registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
coderRegistry.registerCoderForType(RowCoder.of(schema.getBeamSchema()).getEncodedTypeDescriptor(), RowCoder.of(schema.getBeamSchema()));
/*
* Row/Row Coder for FailsafeElement.
*/
FailsafeElementCoder<Row, Row> coder = FailsafeElementCoder.of(RowCoder.of(schema.getBeamSchema()), RowCoder.of(schema.getBeamSchema()));
coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
PCollection<Row> rows;
if (options.getInputFilePattern() != null) {
rows = new TokenizationFileSystemIO(options).read(pipeline, schema);
} else if (options.getPubsubTopic() != null) {
rows = pipeline.apply("ReadMessagesFromPubsub", PubsubIO.readStrings().fromTopic(options.getPubsubTopic())).apply("TransformToBeamRow", new JsonToBeamRow(options.getNonTokenizedDeadLetterPath(), schema));
if (options.getOutputDirectory() != null) {
rows = rows.apply(Window.into(FixedWindows.of(parseDuration(options.getWindowDuration()))));
}
} else {
throw new IllegalStateException("No source is provided, please configure File System or Pub/Sub");
}
/*
Tokenize data using remote API call
*/
PCollectionTuple tokenizedRows = rows.setRowSchema(schema.getBeamSchema()).apply(MapElements.into(TypeDescriptors.kvs(TypeDescriptors.integers(), TypeDescriptors.rows())).via((Row row) -> KV.of(0, row))).setCoder(KvCoder.of(VarIntCoder.of(), RowCoder.of(schema.getBeamSchema()))).apply("DsgTokenization", RowToTokenizedRow.newBuilder().setBatchSize(options.getBatchSize()).setRpcURI(options.getRpcUri()).setSchema(schema.getBeamSchema()).setSuccessTag(TOKENIZATION_OUT).setFailureTag(TOKENIZATION_DEADLETTER_OUT).build());
String csvDelimiter = options.getCsvDelimiter();
if (options.getNonTokenizedDeadLetterPath() != null) {
/*
Write tokenization errors to dead-letter sink
*/
tokenizedRows.get(TOKENIZATION_DEADLETTER_OUT).apply("ConvertToCSV", MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor()).via((FailsafeElement<Row, Row> fse) -> FailsafeElement.of(new RowToCsv(csvDelimiter).getCsvFromRow(fse.getOriginalPayload()), new RowToCsv(csvDelimiter).getCsvFromRow(fse.getPayload())))).apply("WriteTokenizationErrorsToFS", ErrorConverters.WriteErrorsToTextIO.<String, String>newBuilder().setErrorWritePath(options.getNonTokenizedDeadLetterPath()).setTranslateFunction(SerializableFunctions.getCsvErrorConverter()).build());
}
if (options.getOutputDirectory() != null) {
new TokenizationFileSystemIO(options).write(tokenizedRows.get(TOKENIZATION_OUT), schema.getBeamSchema());
} else if (options.getBigQueryTableName() != null) {
WriteResult writeResult = TokenizationBigQueryIO.write(tokenizedRows.get(TOKENIZATION_OUT), options.getBigQueryTableName(), schema.getBigQuerySchema());
writeResult.getFailedInsertsWithErr().apply("WrapInsertionErrors", MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor()).via(TokenizationBigQueryIO::wrapBigQueryInsertError)).setCoder(FAILSAFE_ELEMENT_CODER).apply("WriteInsertionFailedRecords", ErrorConverters.WriteStringMessageErrors.newBuilder().setErrorRecordsTable(options.getBigQueryTableName() + DEFAULT_DEADLETTER_TABLE_SUFFIX).setErrorRecordsTableSchema(DEADLETTER_SCHEMA).build());
} else if (options.getBigTableInstanceId() != null) {
new TokenizationBigTableIO(options).write(tokenizedRows.get(TOKENIZATION_OUT), schema.getBeamSchema());
} else {
throw new IllegalStateException("No sink is provided, please configure BigQuery or BigTable.");
}
return pipeline.run();
}
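The run method above is typically driven from a small entry point that builds DataTokenizationOptions from command-line arguments. A sketch of such a main method, using Beam's standard org.apache.beam.sdk.options.PipelineOptionsFactory pattern (the actual entry point is not included in this listing), would be:

public static void main(String[] args) {
  // Parse and validate the pipeline options, then hand them to run().
  DataTokenizationOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(DataTokenizationOptions.class);
  run(options);
}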
use of org.apache.beam.examples.complete.datatokenization.utils.RowToCsv in project beam by apache.
the class TokenizationFileSystemIO method writeCsv.
private PDone writeCsv(PCollection<Row> input, Schema schema) {
  String header = String.join(options.getCsvDelimiter(), schema.getFieldNames());
  String csvDelimiter = options.getCsvDelimiter();

  PCollection<String> csvs =
      input.apply(
          "ConvertToCSV",
          MapElements.into(TypeDescriptors.strings())
              .via((Row inputRow) -> new RowToCsv(csvDelimiter).getCsvFromRow(inputRow)));

  if (csvs.isBounded() == IsBounded.BOUNDED) {
    return csvs.apply("WriteToFS", TextIO.write().to(options.getOutputDirectory()).withHeader(header));
  } else {
    return csvs.apply(
        "WriteToFS",
        TextIO.write().withWindowedWrites().withNumShards(1).to(options.getOutputDirectory()).withHeader(header));
  }
}
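The unbounded branch above only succeeds if the incoming collection has already been windowed, which is why run() applies Window.into(FixedWindows.of(...)) on the Pub/Sub path before reaching this file sink. A standalone sketch of that pairing, with a hypothetical one-minute window and output prefix (Duration here is org.joda.time.Duration), might look like this:

// csvLines is assumed to be an unbounded PCollection<String> of already-converted rows.
PCollection<String> windowedCsv =
    csvLines.apply(Window.into(FixedWindows.of(Duration.standardMinutes(1))));

// Windowed writes need an explicit, finite shard count per window.
windowedCsv.apply(
    "WriteToFS",
    TextIO.write()
        .withWindowedWrites()
        .withNumShards(1)
        .to("gs://example-bucket/output/") // hypothetical output prefix
        .withHeader("FieldName1;FieldName2")); // header joined from the schema field names, as above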