Search in sources :

Example 1 with TokenizationFileSystemIO

use of org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationFileSystemIO in project beam by apache.

the class DataTokenization method run.

/**
 * Builds the data tokenization pipeline from the specified options and runs it.
 *
 * <p>Rows are read from the file system when an input file pattern is set, otherwise from
 * Pub/Sub when a topic is set. They are tokenized by {@code RowToTokenizedRow} and the
 * successful output is written to the first configured sink, checked in this order: output
 * directory (file system), BigQuery table, Bigtable instance. Tokenization failures are
 * written as CSV to the non-tokenized dead-letter path when that option is set.
 *
 * @param options The execution options.
 * @return The pipeline result.
 * @throws IllegalArgumentException if the data schema could not be loaded (raised by
 *     {@code checkArgument}).
 * @throws IllegalStateException if no source or no sink is configured.
 */
@SuppressWarnings({ "dereference.of.nullable", "argument.type.incompatible" })
public static PipelineResult run(DataTokenizationOptions options) {
    SchemasUtils schema = null;
    try {
        schema = new SchemasUtils(options.getDataSchemaPath(), StandardCharsets.UTF_8);
    } catch (IOException e) {
        // The cause is logged here; the check below turns the missing schema into a failure.
        LOG.error("Failed to retrieve schema for data.", e);
    }
    checkArgument(schema != null, "Data schema is mandatory.");

    // Create the pipeline
    Pipeline pipeline = Pipeline.create(options);

    // One Row coder built from the data schema, reused wherever a Row coder is needed below.
    RowCoder rowCoder = RowCoder.of(schema.getBeamSchema());

    // Register the coders for the pipeline
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
    coderRegistry.registerCoderForType(rowCoder.getEncodedTypeDescriptor(), rowCoder);
    /*
     * Row/Row Coder for FailsafeElement.
     */
    FailsafeElementCoder<Row, Row> coder = FailsafeElementCoder.of(rowCoder, rowCoder);
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

    // Source selection: the file system takes precedence over Pub/Sub.
    PCollection<Row> rows;
    if (options.getInputFilePattern() != null) {
        rows = new TokenizationFileSystemIO(options).read(pipeline, schema);
    } else if (options.getPubsubTopic() != null) {
        rows = pipeline
            .apply("ReadMessagesFromPubsub", PubsubIO.readStrings().fromTopic(options.getPubsubTopic()))
            .apply("TransformToBeamRow", new JsonToBeamRow(options.getNonTokenizedDeadLetterPath(), schema));
        // Fixed windows are applied only when Pub/Sub input is written to the file system sink.
        if (options.getOutputDirectory() != null) {
            rows = rows.apply(Window.into(FixedWindows.of(parseDuration(options.getWindowDuration()))));
        }
    } else {
        throw new IllegalStateException("No source is provided, please configure File System or Pub/Sub");
    }

    /*
    Tokenize data using remote API call
     */
    PCollectionTuple tokenizedRows = rows
        .setRowSchema(schema.getBeamSchema())
        // Every row is keyed with the same constant key (0) before tokenization.
        .apply(MapElements.into(TypeDescriptors.kvs(TypeDescriptors.integers(), TypeDescriptors.rows())).via((Row row) -> KV.of(0, row)))
        .setCoder(KvCoder.of(VarIntCoder.of(), rowCoder))
        .apply("DsgTokenization", RowToTokenizedRow.newBuilder()
            .setBatchSize(options.getBatchSize())
            .setRpcURI(options.getRpcUri())
            .setSchema(schema.getBeamSchema())
            .setSuccessTag(TOKENIZATION_OUT)
            .setFailureTag(TOKENIZATION_DEADLETTER_OUT)
            .build());

    String csvDelimiter = options.getCsvDelimiter();
    if (options.getNonTokenizedDeadLetterPath() != null) {
        /*
      Write tokenization errors to dead-letter sink
       */
        tokenizedRows.get(TOKENIZATION_DEADLETTER_OUT)
            .apply("ConvertToCSV", MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor()).via((FailsafeElement<Row, Row> fse) -> {
                // Created inside the lambda so that only the delimiter string is captured;
                // a single converter serves both payloads of the element.
                RowToCsv rowToCsv = new RowToCsv(csvDelimiter);
                return FailsafeElement.of(rowToCsv.getCsvFromRow(fse.getOriginalPayload()), rowToCsv.getCsvFromRow(fse.getPayload()));
            }))
            .apply("WriteTokenizationErrorsToFS", ErrorConverters.WriteErrorsToTextIO.<String, String>newBuilder()
                .setErrorWritePath(options.getNonTokenizedDeadLetterPath())
                .setTranslateFunction(SerializableFunctions.getCsvErrorConverter())
                .build());
    }

    // Sink selection: file system first, then BigQuery, then Bigtable.
    if (options.getOutputDirectory() != null) {
        new TokenizationFileSystemIO(options).write(tokenizedRows.get(TOKENIZATION_OUT), schema.getBeamSchema());
    } else if (options.getBigQueryTableName() != null) {
        WriteResult writeResult = TokenizationBigQueryIO.write(tokenizedRows.get(TOKENIZATION_OUT), options.getBigQueryTableName(), schema.getBigQuerySchema());
        // Failed BigQuery inserts go to a dead-letter table named after the target table.
        writeResult.getFailedInsertsWithErr()
            .apply("WrapInsertionErrors", MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor()).via(TokenizationBigQueryIO::wrapBigQueryInsertError))
            .setCoder(FAILSAFE_ELEMENT_CODER)
            .apply("WriteInsertionFailedRecords", ErrorConverters.WriteStringMessageErrors.newBuilder()
                .setErrorRecordsTable(options.getBigQueryTableName() + DEFAULT_DEADLETTER_TABLE_SUFFIX)
                .setErrorRecordsTableSchema(DEADLETTER_SCHEMA)
                .build());
    } else if (options.getBigTableInstanceId() != null) {
        new TokenizationBigTableIO(options).write(tokenizedRows.get(TOKENIZATION_OUT), schema.getBeamSchema());
    } else {
        // The file system output directory is also a valid sink, so it is named here too.
        throw new IllegalStateException("No sink is provided, please configure File System, BigQuery or BigTable.");
    }
    return pipeline.run();
}
Also used : TokenizationBigTableIO(org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationBigTableIO) SchemasUtils(org.apache.beam.examples.complete.datatokenization.utils.SchemasUtils) TokenizationFileSystemIO(org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationFileSystemIO) TokenizationBigQueryIO(org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationBigQueryIO) RowToCsv(org.apache.beam.examples.complete.datatokenization.utils.RowToCsv) IOException(java.io.IOException) Pipeline(org.apache.beam.sdk.Pipeline) FailsafeElement(org.apache.beam.examples.complete.datatokenization.utils.FailsafeElement) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) JsonToBeamRow(org.apache.beam.examples.complete.datatokenization.transforms.JsonToBeamRow) WriteResult(org.apache.beam.sdk.io.gcp.bigquery.WriteResult) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Row(org.apache.beam.sdk.values.Row) RowToTokenizedRow(org.apache.beam.examples.complete.datatokenization.transforms.DataProtectors.RowToTokenizedRow)

Example 2 with TokenizationFileSystemIO

use of org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationFileSystemIO in project beam by apache.

the class DataTokenizationTest method fileSystemIORead.

/**
 * Reads rows through {@code TokenizationFileSystemIO} on the shared test pipeline.
 *
 * <p>Options are created for the given pattern and format with the schema at
 * {@code SCHEMA_FILE_PATH}; for CSV input the contains-headers option is set to false. The
 * failsafe, Row and Row/Row failsafe coders are registered on the test pipeline before
 * reading.
 *
 * @param inputGcsFilePattern value set as the input file pattern option.
 * @param inputGcsFileFormat value set as the input file format option.
 * @return the rows produced by the file system read.
 * @throws IOException declared for the {@code SchemasUtils} construction.
 */
private PCollection<Row> fileSystemIORead(String inputGcsFilePattern, FORMAT inputGcsFileFormat) throws IOException {
    // Pipeline options for a file system source.
    DataTokenizationOptions readOptions = PipelineOptionsFactory.create().as(DataTokenizationOptions.class);
    readOptions.setDataSchemaPath(SCHEMA_FILE_PATH);
    readOptions.setInputFilePattern(inputGcsFilePattern);
    readOptions.setInputFileFormat(inputGcsFileFormat);
    if (FORMAT.CSV == inputGcsFileFormat) {
        readOptions.setCsvContainsHeaders(Boolean.FALSE);
    }

    // Schema helper loaded from the path stored in the options.
    SchemasUtils schemaUtils = new SchemasUtils(readOptions.getDataSchemaPath(), StandardCharsets.UTF_8);

    // Coder registrations on the test pipeline.
    CoderRegistry registry = testPipeline.getCoderRegistry();
    registry.registerCoderForType(
        FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
        FAILSAFE_ELEMENT_CODER);
    registry.registerCoderForType(
        RowCoder.of(schemaUtils.getBeamSchema()).getEncodedTypeDescriptor(),
        RowCoder.of(schemaUtils.getBeamSchema()));

    // Row/Row coder for FailsafeElement.
    FailsafeElementCoder<Row, Row> rowFailsafeCoder = FailsafeElementCoder.of(
        RowCoder.of(schemaUtils.getBeamSchema()),
        RowCoder.of(schemaUtils.getBeamSchema()));
    registry.registerCoderForType(rowFailsafeCoder.getEncodedTypeDescriptor(), rowFailsafeCoder);

    TokenizationFileSystemIO fileSystemIO = new TokenizationFileSystemIO(readOptions);
    return fileSystemIO.read(testPipeline, schemaUtils);
}
Also used : SchemasUtils(org.apache.beam.examples.complete.datatokenization.utils.SchemasUtils) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) TokenizationFileSystemIO(org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationFileSystemIO) DataTokenizationOptions(org.apache.beam.examples.complete.datatokenization.options.DataTokenizationOptions) Row(org.apache.beam.sdk.values.Row)

Aggregations

TokenizationFileSystemIO (org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationFileSystemIO)2 SchemasUtils (org.apache.beam.examples.complete.datatokenization.utils.SchemasUtils)2 CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry)2 Row (org.apache.beam.sdk.values.Row)2 IOException (java.io.IOException)1 DataTokenizationOptions (org.apache.beam.examples.complete.datatokenization.options.DataTokenizationOptions)1 RowToTokenizedRow (org.apache.beam.examples.complete.datatokenization.transforms.DataProtectors.RowToTokenizedRow)1 JsonToBeamRow (org.apache.beam.examples.complete.datatokenization.transforms.JsonToBeamRow)1 TokenizationBigQueryIO (org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationBigQueryIO)1 TokenizationBigTableIO (org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationBigTableIO)1 FailsafeElement (org.apache.beam.examples.complete.datatokenization.utils.FailsafeElement)1 RowToCsv (org.apache.beam.examples.complete.datatokenization.utils.RowToCsv)1 Pipeline (org.apache.beam.sdk.Pipeline)1 WriteResult (org.apache.beam.sdk.io.gcp.bigquery.WriteResult)1 PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple)1