Use of org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationFileSystemIO in project beam by apache.
The class DataTokenization, method run.
/**
* Runs the pipeline to completion with the specified options.
*
* @param options The execution options.
* @return The pipeline result.
*/
@SuppressWarnings({ "dereference.of.nullable", "argument.type.incompatible" })
public static PipelineResult run(DataTokenizationOptions options) {
SchemasUtils schema = null;
try {
schema = new SchemasUtils(options.getDataSchemaPath(), StandardCharsets.UTF_8);
} catch (IOException e) {
LOG.error("Failed to retrieve schema for data.", e);
}
checkArgument(schema != null, "Data schema is mandatory.");
// Create the pipeline
Pipeline pipeline = Pipeline.create(options);
// Register the coder for pipeline
CoderRegistry coderRegistry = pipeline.getCoderRegistry();
coderRegistry.registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
coderRegistry.registerCoderForType(RowCoder.of(schema.getBeamSchema()).getEncodedTypeDescriptor(), RowCoder.of(schema.getBeamSchema()));
/*
* Row/Row Coder for FailsafeElement.
*/
FailsafeElementCoder<Row, Row> coder = FailsafeElementCoder.of(RowCoder.of(schema.getBeamSchema()), RowCoder.of(schema.getBeamSchema()));
coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
PCollection<Row> rows;
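// Read input into Beam Rows from whichever source is configured: File System or Pub/Sub.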
if (options.getInputFilePattern() != null) {
rows = new TokenizationFileSystemIO(options).read(pipeline, schema);
} else if (options.getPubsubTopic() != null) {
rows =
    pipeline
        .apply("ReadMessagesFromPubsub", PubsubIO.readStrings().fromTopic(options.getPubsubTopic()))
        .apply("TransformToBeamRow", new JsonToBeamRow(options.getNonTokenizedDeadLetterPath(), schema));
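// For the streaming Pub/Sub source, window the rows when they will be written to files,
// so each output file covers one fixed window of the configured duration.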
if (options.getOutputDirectory() != null) {
rows = rows.apply(Window.into(FixedWindows.of(parseDuration(options.getWindowDuration()))));
}
} else {
throw new IllegalStateException("No source is provided, please configure File System or Pub/Sub");
}
/*
Tokenize data using remote API call
*/
PCollectionTuple tokenizedRows =
    rows.setRowSchema(schema.getBeamSchema())
        .apply(
            MapElements.into(TypeDescriptors.kvs(TypeDescriptors.integers(), TypeDescriptors.rows()))
                .via((Row row) -> KV.of(0, row)))
        .setCoder(KvCoder.of(VarIntCoder.of(), RowCoder.of(schema.getBeamSchema())))
        .apply(
            "DsgTokenization",
            RowToTokenizedRow.newBuilder()
                .setBatchSize(options.getBatchSize())
                .setRpcURI(options.getRpcUri())
                .setSchema(schema.getBeamSchema())
                .setSuccessTag(TOKENIZATION_OUT)
                .setFailureTag(TOKENIZATION_DEADLETTER_OUT)
                .build());
String csvDelimiter = options.getCsvDelimiter();
if (options.getNonTokenizedDeadLetterPath() != null) {
/*
Write tokenization errors to dead-letter sink
*/
tokenizedRows
    .get(TOKENIZATION_DEADLETTER_OUT)
    .apply(
        "ConvertToCSV",
        MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
            .via(
                (FailsafeElement<Row, Row> fse) ->
                    FailsafeElement.of(
                        new RowToCsv(csvDelimiter).getCsvFromRow(fse.getOriginalPayload()),
                        new RowToCsv(csvDelimiter).getCsvFromRow(fse.getPayload()))))
    .apply(
        "WriteTokenizationErrorsToFS",
        ErrorConverters.WriteErrorsToTextIO.<String, String>newBuilder()
            .setErrorWritePath(options.getNonTokenizedDeadLetterPath())
            .setTranslateFunction(SerializableFunctions.getCsvErrorConverter())
            .build());
}
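// Write successfully tokenized rows to whichever sink is configured: File System, BigQuery, or Bigtable.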
if (options.getOutputDirectory() != null) {
new TokenizationFileSystemIO(options).write(tokenizedRows.get(TOKENIZATION_OUT), schema.getBeamSchema());
} else if (options.getBigQueryTableName() != null) {
WriteResult writeResult =
    TokenizationBigQueryIO.write(
        tokenizedRows.get(TOKENIZATION_OUT),
        options.getBigQueryTableName(),
        schema.getBigQuerySchema());
writeResult
    .getFailedInsertsWithErr()
    .apply(
        "WrapInsertionErrors",
        MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
            .via(TokenizationBigQueryIO::wrapBigQueryInsertError))
    .setCoder(FAILSAFE_ELEMENT_CODER)
    .apply(
        "WriteInsertionFailedRecords",
        ErrorConverters.WriteStringMessageErrors.newBuilder()
            .setErrorRecordsTable(options.getBigQueryTableName() + DEFAULT_DEADLETTER_TABLE_SUFFIX)
            .setErrorRecordsTableSchema(DEADLETTER_SCHEMA)
            .build());
} else if (options.getBigTableInstanceId() != null) {
new TokenizationBigTableIO(options).write(tokenizedRows.get(TOKENIZATION_OUT), schema.getBeamSchema());
} else {
throw new IllegalStateException("No sink is provided, please configure BigQuery or BigTable.");
}
return pipeline.run();
}
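For context, a minimal sketch of how run might be launched from a main entry point; the main method and the flag names shown in the comment are assumptions drawn from the option getters above, not part of the snippet itself, and the code relies on the same imports as the class:
// Hedged launcher sketch for the run method above.
public static void main(String[] args) {
// Example flags (placeholders): --dataSchemaPath=..., --inputFilePattern=..., --outputDirectory=...
DataTokenizationOptions options =
    PipelineOptionsFactory.fromArgs(args).withValidation().as(DataTokenizationOptions.class);
run(options);
}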
Use of org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationFileSystemIO in project beam by apache.
The class DataTokenizationTest, method fileSystemIORead.
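/**
* Builds the read half of a test pipeline: configures DataTokenizationOptions for the given
* file pattern and format, registers the coders the pipeline needs, and reads rows through
* TokenizationFileSystemIO.
*/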
private PCollection<Row> fileSystemIORead(String inputGcsFilePattern, FORMAT inputGcsFileFormat) throws IOException {
DataTokenizationOptions options = PipelineOptionsFactory.create().as(DataTokenizationOptions.class);
options.setDataSchemaPath(SCHEMA_FILE_PATH);
options.setInputFilePattern(inputGcsFilePattern);
options.setInputFileFormat(inputGcsFileFormat);
if (inputGcsFileFormat == FORMAT.CSV) {
options.setCsvContainsHeaders(Boolean.FALSE);
}
SchemasUtils testSchemaUtils = new SchemasUtils(options.getDataSchemaPath(), StandardCharsets.UTF_8);
CoderRegistry coderRegistry = testPipeline.getCoderRegistry();
coderRegistry.registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
coderRegistry.registerCoderForType(RowCoder.of(testSchemaUtils.getBeamSchema()).getEncodedTypeDescriptor(), RowCoder.of(testSchemaUtils.getBeamSchema()));
/*
* Row/Row Coder for FailsafeElement.
*/
FailsafeElementCoder<Row, Row> coder = FailsafeElementCoder.of(RowCoder.of(testSchemaUtils.getBeamSchema()), RowCoder.of(testSchemaUtils.getBeamSchema()));
coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
return new TokenizationFileSystemIO(options).read(testPipeline, testSchemaUtils);
}
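For illustration, a hedged sketch of how this helper might be exercised from a JUnit test; CSV_FILE_PATH and the expected row count of 3 are placeholders, not values taken from the snippet above, and PAssert plus JUnit's Assert are assumed to be imported:
@Test
public void testFileSystemIOReadCsv() throws IOException {
// CSV_FILE_PATH is a hypothetical test resource pattern.
PCollection<Row> rows = fileSystemIORead(CSV_FILE_PATH, FORMAT.CSV);
PAssert.that(rows)
    .satisfies(
        collection -> {
          int count = 0;
          for (Row ignored : collection) {
            count++;
          }
          // Placeholder expected row count.
          Assert.assertEquals(3, count);
          return null;
        });
testPipeline.run();
}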